Merge branch 'akpm' (patches from Andrew)

Merge yet more updates from Andrew Morton:

 - a few MM remainders

 - misc things

 - autofs updates

 - signals

 - affs updates

 - ipc

 - nilfs2

 - spelling.txt updates

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (78 commits)
  mm, x86: fix HIGHMEM64 && PARAVIRT build config for native_pud_clear()
  mm: add arch-independent testcases for RODATA
  hfs: atomically read inode size
  mm: clarify mm_struct.mm_{users,count} documentation
  mm: use mmget_not_zero() helper
  mm: add new mmget() helper
  mm: add new mmgrab() helper
  checkpatch: warn when formats use %Z and suggest %z
  lib/vsprintf.c: remove %Z support
  scripts/spelling.txt: add some typo-words
  scripts/spelling.txt: add "followings" pattern and fix typo instances
  scripts/spelling.txt: add "therfore" pattern and fix typo instances
  scripts/spelling.txt: add "overwriten" pattern and fix typo instances
  scripts/spelling.txt: add "overwritting" pattern and fix typo instances
  scripts/spelling.txt: add "deintialize(d)" pattern and fix typo instances
  scripts/spelling.txt: add "disassocation" pattern and fix typo instances
  scripts/spelling.txt: add "omited" pattern and fix typo instances
  scripts/spelling.txt: add "explictely" pattern and fix typo instances
  scripts/spelling.txt: add "applys" pattern and fix typo instances
  scripts/spelling.txt: add "configuartion" pattern and fix typo instances
  ...
diff --git a/Documentation/ABI/testing/sysfs-bus-i2c-devices-bq32k b/Documentation/ABI/testing/sysfs-bus-i2c-devices-bq32k
new file mode 100644
index 0000000..398b258
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-i2c-devices-bq32k
@@ -0,0 +1,7 @@
+What:		/sys/bus/i2c/devices/.../trickle_charge_bypass
+Date:		Jan 2017
+KernelVersion:	4.11
+Contact:	Enric Balletbo i Serra <eballetbo@gmail.com>
+Description:	Attribute to enable or disable the trickle charge bypass.
+		The trickle_charge_bypass attribute allows userspace to
+		enable/disable the trickle charge FET bypass.
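
A minimal user-space sketch of driving this attribute (the concrete
sysfs path is an assumption for illustration; the real path depends on
the I2C bus number and device address):

/* Sketch: enable the trickle charge FET bypass by writing "1" to the
 * sysfs attribute documented above. The path is illustrative only. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *attr =
		"/sys/bus/i2c/devices/0-0068/trickle_charge_bypass";
	int fd = open(attr, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1", 1) != 1)
		perror("write");
	close(fd);
	return 0;
}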
diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt
new file mode 100644
index 0000000..af61817
--- /dev/null
+++ b/Documentation/cgroup-v1/rdma.txt
@@ -0,0 +1,109 @@
+				RDMA Controller
+				----------------
+
+Contents
+--------
+
+1. Overview
+  1-1. What is the RDMA controller?
+  1-2. Why is the RDMA controller needed?
+  1-3. How is the RDMA controller implemented?
+2. Usage Examples
+
+1. Overview
+
+1-1. What is the RDMA controller?
+---------------------------------
+
+The RDMA controller allows the user to limit the RDMA/IB-specific
+resources that a given set of processes can use; these processes are
+grouped using the RDMA controller.
+
+It defines two resources which can be limited for the processes of a cgroup.
+
+1-2. Why is the RDMA controller needed?
+---------------------------------------
+
+Currently, user space applications can easily take away all the RDMA
+verb-specific resources such as AHs, CQs, QPs, MRs, etc. As a result,
+other applications in other cgroups or kernel space ULPs may not even
+get a chance to allocate any RDMA resources, which can lead to service
+unavailability.
+
+Therefore an RDMA controller is needed, through which the resource
+consumption of processes can be limited and RDMA resources accounted.
+
+1-3. How is the RDMA controller implemented?
+--------------------------------------------
+
+The rdma cgroup allows the configuration of resource limits. The rdma
+cgroup maintains resource accounting per cgroup, per device, using a
+resource pool structure. Each such resource pool is limited to 64
+resources by the rdma cgroup, which can be extended later if required.
+
+This resource pool object is linked to the cgroup css. Typically there
+are 0 to 4 resource pool instances per cgroup, per device, in most use
+cases, but nothing prevents having more. At present, hundreds of RDMA
+devices per single cgroup may not be handled optimally; however, there
+is no known use case or requirement for such a configuration either.
+
+Since RDMA resources can be allocated by any process and freed by any of
+the child processes which share the address space, RDMA resources are
+always owned by the creator cgroup's css. This allows migrating a process
+from one cgroup to another without the complexity of transferring resource
+ownership, because such ownership is not really present given the shared
+nature of RDMA resources. Linking resources to the css also ensures that
+cgroups can be deleted after their processes have migrated. This permits
+migration even with active resources, though that is not a primary use case.
+
+Whenever RDMA resource charging occurs, the owning rdma cgroup is returned
+to the caller. The same rdma cgroup should be passed when uncharging the
+resource. This allows a process that migrated with an active RDMA resource
+to charge new resources to the new owner cgroup. It also allows uncharging
+a resource of a process against the previously charged cgroup after the
+process has migrated to a new cgroup, though that is not a primary use case.
+
+A resource pool object is created in the following situations:
+(a) The user sets a limit and no resource pool exists yet for the device
+of interest for the cgroup.
+(b) No resource limits were configured, but the IB/RDMA stack tries to
+charge the resource. This ensures resources are correctly uncharged when
+applications run without limits and limits are enforced later during
+uncharging; otherwise the usage count would drop below zero.
+
+A resource pool is destroyed when all of its resource limits are set to
+max and its last resource is deallocated.
+
+The user should set all limits to the max value when intending to
+remove/unconfigure the resource pool for a particular device.
+
+The IB stack honors the limits enforced by the rdma controller. When an
+application queries the maximum resource limits of an IB device, the
+minimum of what the user has configured for the given cgroup and what
+the IB device supports is returned.
+
+The following resources can be accounted by the rdma controller:
+  hca_handle	Maximum number of HCA Handles
+  hca_object	Maximum number of HCA Objects
+
+2. Usage Examples
+-----------------
+
+(a) Configure resource limit:
+echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
+echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
+
+(b) Query resource limit:
+cat /sys/fs/cgroup/rdma/2/rdma.max
+#Output:
+mlx4_0 hca_handle=2 hca_object=2000
+ocrdma1 hca_handle=3 hca_object=max
+
+(c) Query current usage:
+cat /sys/fs/cgroup/rdma/2/rdma.current
+#Output:
+mlx4_0 hca_handle=1 hca_object=20
+ocrdma1 hca_handle=1 hca_object=23
+
+(d) Delete resource limit:
+echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
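
The rdma.max interface is plain text, so the same configuration can be
driven programmatically. A minimal sketch (the cgroup path and device
name are taken from the examples above and are illustrative):

/* Sketch: apply an RDMA limit by writing the nested-keyed line to
 * rdma.max, mirroring the echo example in (a) above. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/rdma/1/rdma.max";
	const char *line = "mlx4_0 hca_handle=2 hca_object=2000\n";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, line, strlen(line)) < 0)
		perror("write");
	close(fd);
	return 0;
}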
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 4cc07ce..3b8449f 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -47,6 +47,12 @@
   5-3. IO
     5-3-1. IO Interface Files
     5-3-2. Writeback
+  5-4. PID
+    5-4-1. PID Interface Files
+  5-5. RDMA
+    5-5-1. RDMA Interface Files
+  5-6. Misc
+    5-6-1. perf_event
 6. Namespace
   6-1. Basics
   6-2. The Root and Views
@@ -328,14 +334,12 @@
 cgroup by writing its PID to the "cgroup.procs" file, the following
 conditions must be met.
 
-- The writer's euid must match either uid or suid of the target process.
-
 - The writer must have write access to the "cgroup.procs" file.
 
 - The writer must have write access to the "cgroup.procs" file of the
   common ancestor of the source and destination cgroups.
 
-The above three constraints ensure that while a delegatee may migrate
+The above two constraints ensure that while a delegatee may migrate
 processes around freely in the delegated sub-hierarchy it can't pull
 in from or push out to outside the sub-hierarchy.
 
@@ -350,10 +354,10 @@
 
 Let's also say U0 wants to write the PID of a process which is
 currently in C10 into "C00/cgroup.procs".  U0 has write access to the
-file and uid match on the process; however, the common ancestor of the
-source cgroup C10 and the destination cgroup C00 is above the points
-of delegation and U0 would not have write access to its "cgroup.procs"
-files and thus the write will be denied with -EACCES.
+file; however, the common ancestor of the source cgroup C10 and the
+destination cgroup C00 is above the points of delegation and U0 would
+not have write access to its "cgroup.procs" files and thus the write
+will be denied with -EACCES.
 
 
 2-6. Guidelines
@@ -1119,6 +1123,91 @@
 	vm.dirty[_background]_ratio.
 
 
+5-4. PID
+
+The process number controller is used to allow a cgroup to stop any
+new tasks from being fork()'d or clone()'d after a specified limit is
+reached.
+
+The number of tasks in a cgroup can be exhausted in ways which other
+controllers cannot prevent, thus warranting its own controller.  For
+example, a fork bomb is likely to exhaust the number of tasks before
+hitting memory restrictions.
+
+Note that PIDs used in this controller refer to TIDs, process IDs as
+used by the kernel.
+
+
+5-4-1. PID Interface Files
+
+  pids.max
+
+	A read-write single value file which exists on non-root cgroups.
+	The default is "max".
+
+	Hard limit of number of processes.
+
+  pids.current
+
+	A read-only single value file which exists on all cgroups.
+
+	The number of processes currently in the cgroup and its descendants.
+
+Organisational operations are not blocked by cgroup policies, so it is
+possible to have pids.current > pids.max.  This can be done by either
+setting the limit to be smaller than pids.current, or attaching enough
+processes to the cgroup such that pids.current is larger than
+pids.max.  However, it is not possible to violate a cgroup PID policy
+through fork() or clone(). These will return -EAGAIN if the creation
+of a new process would cause a cgroup policy to be violated.
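
A minimal sketch of what this looks like from user space (it assumes
the calling process already sits in a cgroup whose pids.max is set):

/* Sketch: keep forking until the cgroup PID limit is hit; fork()
 * then fails with EAGAIN as described above. Illustrative only. */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	for (;;) {
		pid_t pid = fork();

		if (pid < 0) {
			if (errno == EAGAIN)
				fprintf(stderr, "pids.max reached\n");
			else
				perror("fork");
			break;
		}
		if (pid == 0)
			pause();	/* children just wait */
	}
	return 0;
}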
+
+
+5-5. RDMA
+
+The "rdma" controller regulates the distribution and accounting of
+RDMA resources.
+
+5-5-1. RDMA Interface Files
+
+  rdma.max
+	A read-write nested-keyed file that exists on all cgroups except
+	the root and describes the currently configured resource limits
+	for RDMA/IB devices.
+
+	Lines are keyed by device name and are not ordered.
+	Each line contains a space-separated resource name and its
+	configured limit that can be distributed.
+
+	The following nested keys are defined.
+
+	  hca_handle	Maximum number of HCA Handles
+	  hca_object	Maximum number of HCA Objects
+
+	An example for the mlx4 and ocrdma devices follows.
+
+	  mlx4_0 hca_handle=2 hca_object=2000
+	  ocrdma1 hca_handle=3 hca_object=max
+
+  rdma.current
+	A read-only file that describes current resource usage.
+	It exists on all cgroups except the root.
+
+	An example for the mlx4 and ocrdma devices follows.
+
+	  mlx4_0 hca_handle=1 hca_object=20
+	  ocrdma1 hca_handle=1 hca_object=23
+
+
+5-6. Misc
+
+5-6-1. perf_event
+
+perf_event controller, if not mounted on a legacy hierarchy, is
+automatically enabled on the v2 hierarchy so that perf events can
+always be filtered by cgroup v2 path.  The controller can still be
+moved to a legacy hierarchy after v2 hierarchy is populated.
+
+
 6. Namespace
 
 6-1. Basics
diff --git a/Documentation/devicetree/bindings/rtc/armada-380-rtc.txt b/Documentation/devicetree/bindings/rtc/armada-380-rtc.txt
index 2eb9d4e..c3c9a12 100644
--- a/Documentation/devicetree/bindings/rtc/armada-380-rtc.txt
+++ b/Documentation/devicetree/bindings/rtc/armada-380-rtc.txt
@@ -1,9 +1,11 @@
-* Real Time Clock of the Armada 38x SoCs
+* Real Time Clock of the Armada 38x/7K/8K SoCs
 
-RTC controller for the Armada 38x SoCs
+RTC controller for the Armada 38x, 7K and 8K SoCs
 
 Required properties:
-- compatible : Should be "marvell,armada-380-rtc"
+- compatible : Should be one of the following:
+	"marvell,armada-380-rtc" for Armada 38x SoC
+	"marvell,armada-8k-rtc" for Armada 7K/8K SoCs
 - reg: a list of base address and size pairs, one for each entry in
   reg-names
 - reg names: should contain:
diff --git a/Documentation/devicetree/bindings/rtc/cortina,gemini.txt b/Documentation/devicetree/bindings/rtc/cortina,gemini.txt
new file mode 100644
index 0000000..4ce4e79
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/cortina,gemini.txt
@@ -0,0 +1,14 @@
+* Cortina Systems Gemini RTC
+
+Gemini SoC real-time clock.
+
+Required properties:
+- compatible : Should be "cortina,gemini-rtc"
+
+Examples:
+
+rtc@45000000 {
+	compatible = "cortina,gemini-rtc";
+	reg = <0x45000000 0x100>;
+	interrupts = <17 IRQ_TYPE_LEVEL_HIGH>;
+};
diff --git a/Documentation/devicetree/bindings/rtc/imxdi-rtc.txt b/Documentation/devicetree/bindings/rtc/imxdi-rtc.txt
index c9d80d7..323cf26 100644
--- a/Documentation/devicetree/bindings/rtc/imxdi-rtc.txt
+++ b/Documentation/devicetree/bindings/rtc/imxdi-rtc.txt
@@ -8,10 +8,13 @@
   region.
 - interrupts: rtc alarm interrupt
 
+Optional properties:
+- interrupts: dryice security violation interrupt
+
 Example:
 
 rtc@80056000 {
 	compatible = "fsl,imx53-rtc", "fsl,imx25-rtc";
 	reg = <0x80056000 2000>;
-	interrupts = <29>;
+	interrupts = <29 56>;
 };
diff --git a/Documentation/devicetree/bindings/rtc/maxim,ds3231.txt b/Documentation/devicetree/bindings/rtc/maxim,ds3231.txt
index 1ad4c1c..85be53a 100644
--- a/Documentation/devicetree/bindings/rtc/maxim,ds3231.txt
+++ b/Documentation/devicetree/bindings/rtc/maxim,ds3231.txt
@@ -1,7 +1,8 @@
 * Maxim DS3231 Real Time Clock
 
 Required properties:
-see: Documentation/devicetree/bindings/i2c/trivial-admin-guide/devices.rst
+- compatible: Should contain "maxim,ds3231".
+- reg: I2C address for chip.
 
 Optional property:
 - #clock-cells: Should be 1.
diff --git a/Documentation/devicetree/bindings/rtc/pcf8563.txt b/Documentation/devicetree/bindings/rtc/pcf8563.txt
index 086c998..36984ac 100644
--- a/Documentation/devicetree/bindings/rtc/pcf8563.txt
+++ b/Documentation/devicetree/bindings/rtc/pcf8563.txt
@@ -3,7 +3,8 @@
 Philips PCF8563/Epson RTC8564 Real Time Clock
 
 Required properties:
-see: Documentation/devicetree/bindings/i2c/trivial-admin-guide/devices.rst
+- compatible: Should contain "nxp,pcf8563".
+- reg: I2C address for chip.
 
 Optional property:
 - #clock-cells: Should be 0.
diff --git a/Documentation/devicetree/bindings/rtc/st,stm32-rtc.txt b/Documentation/devicetree/bindings/rtc/st,stm32-rtc.txt
new file mode 100644
index 0000000..e2837b9
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/st,stm32-rtc.txt
@@ -0,0 +1,27 @@
+STM32 Real Time Clock
+
+Required properties:
+- compatible: "st,stm32-rtc".
+- reg: address range of rtc register set.
+- clocks: reference to the clock entry ck_rtc.
+- interrupt-parent: phandle for the interrupt controller.
+- interrupts: rtc alarm interrupt.
+- st,syscfg: phandle for pwrcfg, mandatory to disable/enable backup domain
+  (RTC registers) write protection.
+
+Optional properties (to override default ck_rtc parent clock):
+- assigned-clocks: reference to the ck_rtc clock entry.
+- assigned-clock-parents: phandle of the new parent clock of ck_rtc.
+
+Example:
+
+	rtc: rtc@40002800 {
+		compatible = "st,stm32-rtc";
+		reg = <0x40002800 0x400>;
+		clocks = <&rcc 1 CLK_RTC>;
+		assigned-clocks = <&rcc 1 CLK_RTC>;
+		assigned-clock-parents = <&rcc 1 CLK_LSE>;
+		interrupt-parent = <&exti>;
+		interrupts = <17 1>;
+		st,syscfg = <&pwrcfg>;
+	};
diff --git a/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt b/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt
index f007e42..9459349 100644
--- a/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt
+++ b/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt
@@ -8,10 +8,20 @@
 		  memory mapped region.
 - interrupts	: IRQ lines for the RTC alarm 0 and alarm 1, in that order.
 
+Required properties for new device trees
+- clocks	: phandle to the 32kHz external oscillator
+- clock-output-names : name of the LOSC clock created
+- #clock-cells  : must be equal to 1. The RTC provides two clocks: the
+		  LOSC and its external output, with index 0 and 1
+		  respectively.
+
 Example:
 
 rtc: rtc@01f00000 {
 	compatible = "allwinner,sun6i-a31-rtc";
 	reg = <0x01f00000 0x54>;
 	interrupts = <0 40 4>, <0 41 4>;
+	clock-output-names = "osc32k";
+	clocks = <&ext_osc32k>;
+	#clock-cells = <1>;
 };
diff --git a/Documentation/static-keys.txt b/Documentation/static-keys.txt
index ea8d7b4..32a25fa 100644
--- a/Documentation/static-keys.txt
+++ b/Documentation/static-keys.txt
@@ -155,7 +155,9 @@
 
 There are a few functions and macros that architectures must implement in order
 to take advantage of this optimization. If there is no architecture support, we
-simply fall back to a traditional, load, test, and jump sequence.
+simply fall back to a traditional load, test, and jump sequence. Also, the
+struct jump_entry table must be at least 4-byte aligned because the
+static_key->entry field makes use of the two least significant bits.
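
A sketch of the pointer-packing trick this describes (the names below
are illustrative, not the kernel's actual jump_label internals): with a
4-byte-aligned table, the two low bits of every entry address are zero
and can carry per-key flags.

#include <stdint.h>

#define JUMP_FLAG_MASK 3UL	/* two spare bits from 4-byte alignment */

static inline void *entry_ptr(uintptr_t packed)
{
	return (void *)(packed & ~JUMP_FLAG_MASK);	/* strip flag bits */
}

static inline unsigned long entry_flags(uintptr_t packed)
{
	return packed & JUMP_FLAG_MASK;			/* read flag bits */
}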
 
 * select HAVE_ARCH_JUMP_LABEL, see: arch/x86/Kconfig
 
diff --git a/arch/arm/mach-ep93xx/ts72xx.c b/arch/arm/mach-ep93xx/ts72xx.c
index 3b39ea3..8a5b6f0 100644
--- a/arch/arm/mach-ep93xx/ts72xx.c
+++ b/arch/arm/mach-ep93xx/ts72xx.c
@@ -16,7 +16,6 @@
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/io.h>
-#include <linux/platform_data/rtc-m48t86.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/partitions.h>
 
@@ -45,16 +44,6 @@ static struct map_desc ts72xx_io_desc[] __initdata = {
 		.pfn		= __phys_to_pfn(TS72XX_OPTIONS2_PHYS_BASE),
 		.length		= TS72XX_OPTIONS2_SIZE,
 		.type		= MT_DEVICE,
-	}, {
-		.virtual	= (unsigned long)TS72XX_RTC_INDEX_VIRT_BASE,
-		.pfn		= __phys_to_pfn(TS72XX_RTC_INDEX_PHYS_BASE),
-		.length		= TS72XX_RTC_INDEX_SIZE,
-		.type		= MT_DEVICE,
-	}, {
-		.virtual	= (unsigned long)TS72XX_RTC_DATA_VIRT_BASE,
-		.pfn		= __phys_to_pfn(TS72XX_RTC_DATA_PHYS_BASE),
-		.length		= TS72XX_RTC_DATA_SIZE,
-		.type		= MT_DEVICE,
 	}
 };
 
@@ -179,31 +168,22 @@ static void __init ts72xx_register_flash(void)
 	}
 }
 
+/*************************************************************************
+ * RTC M48T86
+ *************************************************************************/
+#define TS72XX_RTC_INDEX_PHYS_BASE	(EP93XX_CS1_PHYS_BASE + 0x00800000)
+#define TS72XX_RTC_DATA_PHYS_BASE	(EP93XX_CS1_PHYS_BASE + 0x01700000)
 
-static unsigned char ts72xx_rtc_readbyte(unsigned long addr)
-{
-	__raw_writeb(addr, TS72XX_RTC_INDEX_VIRT_BASE);
-	return __raw_readb(TS72XX_RTC_DATA_VIRT_BASE);
-}
-
-static void ts72xx_rtc_writebyte(unsigned char value, unsigned long addr)
-{
-	__raw_writeb(addr, TS72XX_RTC_INDEX_VIRT_BASE);
-	__raw_writeb(value, TS72XX_RTC_DATA_VIRT_BASE);
-}
-
-static struct m48t86_ops ts72xx_rtc_ops = {
-	.readbyte	= ts72xx_rtc_readbyte,
-	.writebyte	= ts72xx_rtc_writebyte,
+static struct resource ts72xx_rtc_resources[] = {
+	DEFINE_RES_MEM(TS72XX_RTC_INDEX_PHYS_BASE, 0x01),
+	DEFINE_RES_MEM(TS72XX_RTC_DATA_PHYS_BASE, 0x01),
 };
 
 static struct platform_device ts72xx_rtc_device = {
 	.name		= "rtc-m48t86",
 	.id		= -1,
-	.dev		= {
-		.platform_data	= &ts72xx_rtc_ops,
-	},
-	.num_resources	= 0,
+	.resource	= ts72xx_rtc_resources,
+	.num_resources	= ARRAY_SIZE(ts72xx_rtc_resources),
 };
 
 static struct resource ts72xx_wdt_resources[] = {
diff --git a/arch/arm/mach-ep93xx/ts72xx.h b/arch/arm/mach-ep93xx/ts72xx.h
index 071feaa..2255ba2 100644
--- a/arch/arm/mach-ep93xx/ts72xx.h
+++ b/arch/arm/mach-ep93xx/ts72xx.h
@@ -9,8 +9,6 @@
  * febff000	22000000	4K	model number register (bits 0-2)
  * febfe000	22400000	4K	options register
  * febfd000	22800000	4K	options register #2
- * febf9000	10800000	4K	TS-5620 RTC index register
- * febf8000	11700000	4K	TS-5620 RTC data register
  */
 
 #define TS72XX_MODEL_PHYS_BASE		0x22000000
@@ -40,15 +38,6 @@
 #define TS72XX_OPTIONS2_TS9420		0x04
 #define TS72XX_OPTIONS2_TS9420_BOOT	0x02
 
-
-#define TS72XX_RTC_INDEX_VIRT_BASE	IOMEM(0xfebf9000)
-#define TS72XX_RTC_INDEX_PHYS_BASE	0x10800000
-#define TS72XX_RTC_INDEX_SIZE		0x00001000
-
-#define TS72XX_RTC_DATA_VIRT_BASE	IOMEM(0xfebf8000)
-#define TS72XX_RTC_DATA_PHYS_BASE	0x11700000
-#define TS72XX_RTC_DATA_SIZE		0x00001000
-
 #define TS72XX_WDT_CONTROL_PHYS_BASE	0x23800000
 #define TS72XX_WDT_FEED_PHYS_BASE	0x23c00000
 
diff --git a/arch/arm/mach-orion5x/ts78xx-setup.c b/arch/arm/mach-orion5x/ts78xx-setup.c
index 8d59726..7ef80a8 100644
--- a/arch/arm/mach-orion5x/ts78xx-setup.c
+++ b/arch/arm/mach-orion5x/ts78xx-setup.c
@@ -16,7 +16,6 @@
 #include <linux/platform_device.h>
 #include <linux/mv643xx_eth.h>
 #include <linux/ata_platform.h>
-#include <linux/platform_data/rtc-m48t86.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/partitions.h>
 #include <linux/timeriomem-rng.h>
@@ -80,79 +79,38 @@ static struct mv_sata_platform_data ts78xx_sata_data = {
 /*****************************************************************************
  * RTC M48T86 - nicked^Wborrowed from arch/arm/mach-ep93xx/ts72xx.c
  ****************************************************************************/
-#define TS_RTC_CTRL	(TS78XX_FPGA_REGS_VIRT_BASE + 0x808)
-#define TS_RTC_DATA	(TS78XX_FPGA_REGS_VIRT_BASE + 0x80c)
+#define TS_RTC_CTRL	(TS78XX_FPGA_REGS_PHYS_BASE + 0x808)
+#define TS_RTC_DATA	(TS78XX_FPGA_REGS_PHYS_BASE + 0x80c)
 
-static unsigned char ts78xx_ts_rtc_readbyte(unsigned long addr)
-{
-	writeb(addr, TS_RTC_CTRL);
-	return readb(TS_RTC_DATA);
-}
-
-static void ts78xx_ts_rtc_writebyte(unsigned char value, unsigned long addr)
-{
-	writeb(addr, TS_RTC_CTRL);
-	writeb(value, TS_RTC_DATA);
-}
-
-static struct m48t86_ops ts78xx_ts_rtc_ops = {
-	.readbyte	= ts78xx_ts_rtc_readbyte,
-	.writebyte	= ts78xx_ts_rtc_writebyte,
+static struct resource ts78xx_ts_rtc_resources[] = {
+	DEFINE_RES_MEM(TS_RTC_CTRL, 0x01),
+	DEFINE_RES_MEM(TS_RTC_DATA, 0x01),
 };
 
 static struct platform_device ts78xx_ts_rtc_device = {
 	.name		= "rtc-m48t86",
 	.id		= -1,
-	.dev		= {
-		.platform_data	= &ts78xx_ts_rtc_ops,
-	},
-	.num_resources	= 0,
+	.resource	= ts78xx_ts_rtc_resources,
+	.num_resources	= ARRAY_SIZE(ts78xx_ts_rtc_resources),
 };
 
-/*
- * TS uses some of the user storage space on the RTC chip so see if it is
- * present; as it's an optional feature at purchase time and not all boards
- * will have it present
- *
- * I've used the method TS use in their rtc7800.c example for the detection
- *
- * TODO: track down a guinea pig without an RTC to see if we can work out a
- *		better RTC detection routine
- */
 static int ts78xx_ts_rtc_load(void)
 {
 	int rc;
-	unsigned char tmp_rtc0, tmp_rtc1;
 
-	tmp_rtc0 = ts78xx_ts_rtc_readbyte(126);
-	tmp_rtc1 = ts78xx_ts_rtc_readbyte(127);
-
-	ts78xx_ts_rtc_writebyte(0x00, 126);
-	ts78xx_ts_rtc_writebyte(0x55, 127);
-	if (ts78xx_ts_rtc_readbyte(127) == 0x55) {
-		ts78xx_ts_rtc_writebyte(0xaa, 127);
-		if (ts78xx_ts_rtc_readbyte(127) == 0xaa
-				&& ts78xx_ts_rtc_readbyte(126) == 0x00) {
-			ts78xx_ts_rtc_writebyte(tmp_rtc0, 126);
-			ts78xx_ts_rtc_writebyte(tmp_rtc1, 127);
-
-			if (ts78xx_fpga.supports.ts_rtc.init == 0) {
-				rc = platform_device_register(&ts78xx_ts_rtc_device);
-				if (!rc)
-					ts78xx_fpga.supports.ts_rtc.init = 1;
-			} else
-				rc = platform_device_add(&ts78xx_ts_rtc_device);
-
-			if (rc)
-				pr_info("RTC could not be registered: %d\n",
-					rc);
-			return rc;
-		}
+	if (ts78xx_fpga.supports.ts_rtc.init == 0) {
+		rc = platform_device_register(&ts78xx_ts_rtc_device);
+		if (!rc)
+			ts78xx_fpga.supports.ts_rtc.init = 1;
+	} else {
+		rc = platform_device_add(&ts78xx_ts_rtc_device);
 	}
 
-	pr_info("RTC not found\n");
-	return -ENODEV;
-};
+	if (rc)
+		pr_info("RTC could not be registered: %d\n", rc);
+
+	return rc;
+}
 
 static void ts78xx_ts_rtc_unload(void)
 {
diff --git a/arch/m68k/configs/amcore_defconfig b/arch/m68k/configs/amcore_defconfig
index f108dd1..131b410 100644
--- a/arch/m68k/configs/amcore_defconfig
+++ b/arch/m68k/configs/amcore_defconfig
@@ -1,19 +1,20 @@
-CONFIG_LOCALVERSION="amcore-001"
+CONFIG_LOCALVERSION="amcore-002"
 CONFIG_DEFAULT_HOSTNAME="amcore"
 CONFIG_SYSVIPC=y
 # CONFIG_FHANDLE is not set
 # CONFIG_USELIB is not set
 CONFIG_LOG_BUF_SHIFT=14
-CONFIG_NAMESPACES=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 # CONFIG_AIO is not set
 # CONFIG_ADVISE_SYSCALLS is not set
 # CONFIG_MEMBARRIER is not set
 CONFIG_EMBEDDED=y
 # CONFIG_VM_EVENT_COUNTERS is not set
+# CONFIG_SLUB_DEBUG is not set
 # CONFIG_COMPAT_BRK is not set
 # CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
+# CONFIG_IOSCHED_CFQ is not set
 # CONFIG_MMU is not set
 CONFIG_M5307=y
 CONFIG_AMCORE=y
@@ -27,13 +28,14 @@
 CONFIG_PACKET=y
 CONFIG_UNIX=y
 CONFIG_INET=y
+CONFIG_SYN_COOKIES=y
 # CONFIG_INET_XFRM_MODE_TRANSPORT is not set
 # CONFIG_INET_XFRM_MODE_TUNNEL is not set
 # CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_IPV6 is not set
 # CONFIG_WIRELESS is not set
 # CONFIG_UEVENT_HELPER is not set
-CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y
+# CONFIG_FW_LOADER is not set
 # CONFIG_ALLOW_DEV_COREDUMP is not set
 CONFIG_CONNECTOR=y
 CONFIG_MTD=y
@@ -53,6 +55,7 @@
 CONFIG_MTD_PLATRAM=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_NETDEVICES=y
+# CONFIG_NET_VENDOR_AMAZON is not set
 # CONFIG_NET_VENDOR_ARC is not set
 # CONFIG_NET_CADENCE is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
@@ -89,14 +92,12 @@
 CONFIG_I2C_CHARDEV=y
 # CONFIG_I2C_HELPER_AUTO is not set
 CONFIG_I2C_IMX=y
-CONFIG_PPS=y
+CONFIG_GPIO_SYSFS=y
 # CONFIG_HWMON is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_RTC_CLASS=y
 # CONFIG_RTC_SYSTOHC is not set
 CONFIG_RTC_DRV_DS1307=y
-CONFIG_EXT2_FS=y
-CONFIG_EXT2_FS_XATTR=y
 # CONFIG_FILE_LOCKING is not set
 # CONFIG_DNOTIFY is not set
 # CONFIG_INOTIFY_USER is not set
@@ -108,6 +109,7 @@
 # CONFIG_NETWORK_FILESYSTEMS is not set
 CONFIG_PRINTK_TIME=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
 # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
 CONFIG_PANIC_ON_OOPS=y
 # CONFIG_SCHED_DEBUG is not set
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index d5c1073..a2dcef0 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -134,6 +134,7 @@
 	select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES
 	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_CMPXCHG_LOCAL
+	select HAVE_COPY_THREAD_TLS
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_CONTIGUOUS
diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig
index e0097536..143b1e0 100644
--- a/arch/s390/configs/default_defconfig
+++ b/arch/s390/configs/default_defconfig
@@ -678,6 +678,7 @@
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 CONFIG_ZCRYPT=m
+CONFIG_PKEY=m
 CONFIG_CRYPTO_SHA1_S390=m
 CONFIG_CRYPTO_SHA256_S390=m
 CONFIG_CRYPTO_SHA512_S390=m
diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig
index 2cf8734..2358bf3 100644
--- a/arch/s390/configs/performance_defconfig
+++ b/arch/s390/configs/performance_defconfig
@@ -628,6 +628,7 @@
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 CONFIG_ZCRYPT=m
+CONFIG_PKEY=m
 CONFIG_CRYPTO_SHA1_S390=m
 CONFIG_CRYPTO_SHA256_S390=m
 CONFIG_CRYPTO_SHA512_S390=m
diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile
index d1033de..402c530 100644
--- a/arch/s390/crypto/Makefile
+++ b/arch/s390/crypto/Makefile
@@ -6,7 +6,7 @@
 obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256_s390.o sha_common.o
 obj-$(CONFIG_CRYPTO_SHA512_S390) += sha512_s390.o sha_common.o
 obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o
-obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o
+obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o paes_s390.o
 obj-$(CONFIG_S390_PRNG) += prng.o
 obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o
 obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o
diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c
new file mode 100644
index 0000000..d69ea49
--- /dev/null
+++ b/arch/s390/crypto/paes_s390.c
@@ -0,0 +1,619 @@
+/*
+ * Cryptographic API.
+ *
+ * s390 implementation of the AES Cipher Algorithm with protected keys.
+ *
+ * s390 Version:
+ *   Copyright IBM Corp. 2017
+ *   Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ *		Harald Freudenberger <freude@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ */
+
+#define KMSG_COMPONENT "paes_s390"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <crypto/xts.h>
+#include <asm/cpacf.h>
+#include <asm/pkey.h>
+
+static u8 *ctrblk;
+static DEFINE_SPINLOCK(ctrblk_lock);
+
+static cpacf_mask_t km_functions, kmc_functions, kmctr_functions;
+
+struct s390_paes_ctx {
+	struct pkey_seckey sk;
+	struct pkey_protkey pk;
+	unsigned long fc;
+};
+
+struct s390_pxts_ctx {
+	struct pkey_seckey sk[2];
+	struct pkey_protkey pk[2];
+	unsigned long fc;
+};
+
+static inline int __paes_convert_key(struct pkey_seckey *sk,
+				     struct pkey_protkey *pk)
+{
+	int i, ret;
+
+	/* try three times in case of failure */
+	for (i = 0; i < 3; i++) {
+		ret = pkey_skey2pkey(sk, pk);
+		if (ret == 0)
+			break;
+	}
+
+	return ret;
+}
+
+static int __paes_set_key(struct s390_paes_ctx *ctx)
+{
+	unsigned long fc;
+
+	if (__paes_convert_key(&ctx->sk, &ctx->pk))
+		return -EINVAL;
+
+	/* Pick the correct function code based on the protected key type */
+	fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PAES_128 :
+		(ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KM_PAES_192 :
+		(ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KM_PAES_256 : 0;
+
+	/* Check if the function code is available */
+	ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0;
+
+	return ctx->fc ? 0 : -EINVAL;
+}
+
+static int ecb_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+			    unsigned int key_len)
+{
+	struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	if (key_len != SECKEYBLOBSIZE)
+		return -EINVAL;
+
+	memcpy(ctx->sk.seckey, in_key, SECKEYBLOBSIZE);
+	if (__paes_set_key(ctx)) {
+		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int ecb_paes_crypt(struct blkcipher_desc *desc,
+			  unsigned long modifier,
+			  struct blkcipher_walk *walk)
+{
+	struct s390_paes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int nbytes, n, k;
+	int ret;
+
+	ret = blkcipher_walk_virt(desc, walk);
+	while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) {
+		/* only use complete blocks */
+		n = nbytes & ~(AES_BLOCK_SIZE - 1);
+		k = cpacf_km(ctx->fc | modifier, ctx->pk.protkey,
+			     walk->dst.virt.addr, walk->src.virt.addr, n);
+		if (k)
+			ret = blkcipher_walk_done(desc, walk, nbytes - k);
+		if (k < n) {
+			if (__paes_set_key(ctx) != 0)
+				return blkcipher_walk_done(desc, walk, -EIO);
+		}
+	}
+	return ret;
+}
+
+static int ecb_paes_encrypt(struct blkcipher_desc *desc,
+			    struct scatterlist *dst, struct scatterlist *src,
+			    unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ecb_paes_crypt(desc, CPACF_ENCRYPT, &walk);
+}
+
+static int ecb_paes_decrypt(struct blkcipher_desc *desc,
+			    struct scatterlist *dst, struct scatterlist *src,
+			    unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ecb_paes_crypt(desc, CPACF_DECRYPT, &walk);
+}
+
+static struct crypto_alg ecb_paes_alg = {
+	.cra_name		=	"ecb(paes)",
+	.cra_driver_name	=	"ecb-paes-s390",
+	.cra_priority		=	400,	/* combo: aes + ecb */
+	.cra_flags		=	CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		=	AES_BLOCK_SIZE,
+	.cra_ctxsize		=	sizeof(struct s390_paes_ctx),
+	.cra_type		=	&crypto_blkcipher_type,
+	.cra_module		=	THIS_MODULE,
+	.cra_list		=	LIST_HEAD_INIT(ecb_paes_alg.cra_list),
+	.cra_u			=	{
+		.blkcipher = {
+			.min_keysize		=	SECKEYBLOBSIZE,
+			.max_keysize		=	SECKEYBLOBSIZE,
+			.setkey			=	ecb_paes_set_key,
+			.encrypt		=	ecb_paes_encrypt,
+			.decrypt		=	ecb_paes_decrypt,
+		}
+	}
+};
+
+static int __cbc_paes_set_key(struct s390_paes_ctx *ctx)
+{
+	unsigned long fc;
+
+	if (__paes_convert_key(&ctx->sk, &ctx->pk))
+		return -EINVAL;
+
+	/* Pick the correct function code based on the protected key type */
+	fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMC_PAES_128 :
+		(ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KMC_PAES_192 :
+		(ctx->pk.type == PKEY_KEYTYPE_AES_256) ? CPACF_KMC_PAES_256 : 0;
+
+	/* Check if the function code is available */
+	ctx->fc = (fc && cpacf_test_func(&kmc_functions, fc)) ? fc : 0;
+
+	return ctx->fc ? 0 : -EINVAL;
+}
+
+static int cbc_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+			    unsigned int key_len)
+{
+	struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	memcpy(ctx->sk.seckey, in_key, SECKEYBLOBSIZE);
+	if (__cbc_paes_set_key(ctx)) {
+		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int cbc_paes_crypt(struct blkcipher_desc *desc, unsigned long modifier,
+			  struct blkcipher_walk *walk)
+{
+	struct s390_paes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int nbytes, n, k;
+	int ret;
+	struct {
+		u8 iv[AES_BLOCK_SIZE];
+		u8 key[MAXPROTKEYSIZE];
+	} param;
+
+	ret = blkcipher_walk_virt(desc, walk);
+	memcpy(param.iv, walk->iv, AES_BLOCK_SIZE);
+	memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+	while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) {
+		/* only use complete blocks */
+		n = nbytes & ~(AES_BLOCK_SIZE - 1);
+		k = cpacf_kmc(ctx->fc | modifier, &param,
+			      walk->dst.virt.addr, walk->src.virt.addr, n);
+		if (k)
+			ret = blkcipher_walk_done(desc, walk, nbytes - k);
+		if (k < n) {
+			if (__cbc_paes_set_key(ctx) != 0)
+				return blkcipher_walk_done(desc, walk, -EIO);
+			memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+		}
+	}
+	memcpy(walk->iv, param.iv, AES_BLOCK_SIZE);
+	return ret;
+}
+
+static int cbc_paes_encrypt(struct blkcipher_desc *desc,
+			    struct scatterlist *dst, struct scatterlist *src,
+			    unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return cbc_paes_crypt(desc, 0, &walk);
+}
+
+static int cbc_paes_decrypt(struct blkcipher_desc *desc,
+			    struct scatterlist *dst, struct scatterlist *src,
+			    unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return cbc_paes_crypt(desc, CPACF_DECRYPT, &walk);
+}
+
+static struct crypto_alg cbc_paes_alg = {
+	.cra_name		=	"cbc(paes)",
+	.cra_driver_name	=	"cbc-paes-s390",
+	.cra_priority		=	400,	/* combo: aes + cbc */
+	.cra_flags		=	CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		=	AES_BLOCK_SIZE,
+	.cra_ctxsize		=	sizeof(struct s390_paes_ctx),
+	.cra_type		=	&crypto_blkcipher_type,
+	.cra_module		=	THIS_MODULE,
+	.cra_list		=	LIST_HEAD_INIT(cbc_paes_alg.cra_list),
+	.cra_u			=	{
+		.blkcipher = {
+			.min_keysize		=	SECKEYBLOBSIZE,
+			.max_keysize		=	SECKEYBLOBSIZE,
+			.ivsize			=	AES_BLOCK_SIZE,
+			.setkey			=	cbc_paes_set_key,
+			.encrypt		=	cbc_paes_encrypt,
+			.decrypt		=	cbc_paes_decrypt,
+		}
+	}
+};
+
+static int __xts_paes_set_key(struct s390_pxts_ctx *ctx)
+{
+	unsigned long fc;
+
+	if (__paes_convert_key(&ctx->sk[0], &ctx->pk[0]) ||
+	    __paes_convert_key(&ctx->sk[1], &ctx->pk[1]))
+		return -EINVAL;
+
+	if (ctx->pk[0].type != ctx->pk[1].type)
+		return -EINVAL;
+
+	/* Pick the correct function code based on the protected key type */
+	fc = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PXTS_128 :
+		(ctx->pk[0].type == PKEY_KEYTYPE_AES_256) ?
+		CPACF_KM_PXTS_256 : 0;
+
+	/* Check if the function code is available */
+	ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0;
+
+	return ctx->fc ? 0 : -EINVAL;
+}
+
+static int xts_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+			    unsigned int key_len)
+{
+	struct s390_pxts_ctx *ctx = crypto_tfm_ctx(tfm);
+	u8 ckey[2 * AES_MAX_KEY_SIZE];
+	unsigned int ckey_len;
+
+	memcpy(ctx->sk[0].seckey, in_key, SECKEYBLOBSIZE);
+	memcpy(ctx->sk[1].seckey, in_key + SECKEYBLOBSIZE, SECKEYBLOBSIZE);
+	if (__xts_paes_set_key(ctx)) {
+		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	/*
+	 * xts_check_key verifies the key length is not odd and makes
+	 * sure that the two keys are not the same. This can be done
+	 * on the two protected keys as well.
+	 */
+	ckey_len = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ?
+		AES_KEYSIZE_128 : AES_KEYSIZE_256;
+	memcpy(ckey, ctx->pk[0].protkey, ckey_len);
+	memcpy(ckey + ckey_len, ctx->pk[1].protkey, ckey_len);
+	return xts_check_key(tfm, ckey, 2*ckey_len);
+}
+
+static int xts_paes_crypt(struct blkcipher_desc *desc, unsigned long modifier,
+			  struct blkcipher_walk *walk)
+{
+	struct s390_pxts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int keylen, offset, nbytes, n, k;
+	int ret;
+	struct {
+		u8 key[MAXPROTKEYSIZE];	/* key + verification pattern */
+		u8 tweak[16];
+		u8 block[16];
+		u8 bit[16];
+		u8 xts[16];
+	} pcc_param;
+	struct {
+		u8 key[MAXPROTKEYSIZE];	/* key + verification pattern */
+		u8 init[16];
+	} xts_param;
+
+	ret = blkcipher_walk_virt(desc, walk);
+	keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 48 : 64;
+	offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 16 : 0;
+retry:
+	memset(&pcc_param, 0, sizeof(pcc_param));
+	memcpy(pcc_param.tweak, walk->iv, sizeof(pcc_param.tweak));
+	memcpy(pcc_param.key + offset, ctx->pk[1].protkey, keylen);
+	cpacf_pcc(ctx->fc, pcc_param.key + offset);
+
+	memcpy(xts_param.key + offset, ctx->pk[0].protkey, keylen);
+	memcpy(xts_param.init, pcc_param.xts, 16);
+
+	while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) {
+		/* only use complete blocks */
+		n = nbytes & ~(AES_BLOCK_SIZE - 1);
+		k = cpacf_km(ctx->fc | modifier, xts_param.key + offset,
+			     walk->dst.virt.addr, walk->src.virt.addr, n);
+		if (k)
+			ret = blkcipher_walk_done(desc, walk, nbytes - k);
+		if (k < n) {
+			if (__xts_paes_set_key(ctx) != 0)
+				return blkcipher_walk_done(desc, walk, -EIO);
+			goto retry;
+		}
+	}
+	return ret;
+}
+
+static int xts_paes_encrypt(struct blkcipher_desc *desc,
+			    struct scatterlist *dst, struct scatterlist *src,
+			    unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return xts_paes_crypt(desc, 0, &walk);
+}
+
+static int xts_paes_decrypt(struct blkcipher_desc *desc,
+			    struct scatterlist *dst, struct scatterlist *src,
+			    unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return xts_paes_crypt(desc, CPACF_DECRYPT, &walk);
+}
+
+static struct crypto_alg xts_paes_alg = {
+	.cra_name		=	"xts(paes)",
+	.cra_driver_name	=	"xts-paes-s390",
+	.cra_priority		=	400,	/* combo: aes + xts */
+	.cra_flags		=	CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		=	AES_BLOCK_SIZE,
+	.cra_ctxsize		=	sizeof(struct s390_pxts_ctx),
+	.cra_type		=	&crypto_blkcipher_type,
+	.cra_module		=	THIS_MODULE,
+	.cra_list		=	LIST_HEAD_INIT(xts_paes_alg.cra_list),
+	.cra_u			=	{
+		.blkcipher = {
+			.min_keysize		=	2 * SECKEYBLOBSIZE,
+			.max_keysize		=	2 * SECKEYBLOBSIZE,
+			.ivsize			=	AES_BLOCK_SIZE,
+			.setkey			=	xts_paes_set_key,
+			.encrypt		=	xts_paes_encrypt,
+			.decrypt		=	xts_paes_decrypt,
+		}
+	}
+};
+
+static int __ctr_paes_set_key(struct s390_paes_ctx *ctx)
+{
+	unsigned long fc;
+
+	if (__paes_convert_key(&ctx->sk, &ctx->pk))
+		return -EINVAL;
+
+	/* Pick the correct function code based on the protected key type */
+	fc = (ctx->pk.type == PKEY_KEYTYPE_AES_128) ? CPACF_KMCTR_PAES_128 :
+		(ctx->pk.type == PKEY_KEYTYPE_AES_192) ? CPACF_KMCTR_PAES_192 :
+		(ctx->pk.type == PKEY_KEYTYPE_AES_256) ?
+		CPACF_KMCTR_PAES_256 : 0;
+
+	/* Check if the function code is available */
+	ctx->fc = (fc && cpacf_test_func(&kmctr_functions, fc)) ? fc : 0;
+
+	return ctx->fc ? 0 : -EINVAL;
+}
+
+static int ctr_paes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+			    unsigned int key_len)
+{
+	struct s390_paes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	memcpy(ctx->sk.seckey, in_key, key_len);
+	if (__ctr_paes_set_key(ctx)) {
+		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static unsigned int __ctrblk_init(u8 *ctrptr, u8 *iv, unsigned int nbytes)
+{
+	unsigned int i, n;
+
+	/* only use complete blocks, max. PAGE_SIZE */
+	memcpy(ctrptr, iv, AES_BLOCK_SIZE);
+	n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(AES_BLOCK_SIZE - 1);
+	for (i = (n / AES_BLOCK_SIZE) - 1; i > 0; i--) {
+		memcpy(ctrptr + AES_BLOCK_SIZE, ctrptr, AES_BLOCK_SIZE);
+		crypto_inc(ctrptr + AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+		ctrptr += AES_BLOCK_SIZE;
+	}
+	return n;
+}
+
+static int ctr_paes_crypt(struct blkcipher_desc *desc, unsigned long modifier,
+			  struct blkcipher_walk *walk)
+{
+	struct s390_paes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	u8 buf[AES_BLOCK_SIZE], *ctrptr;
+	unsigned int nbytes, n, k;
+	int ret, locked;
+
+	locked = spin_trylock(&ctrblk_lock);
+
+	ret = blkcipher_walk_virt_block(desc, walk, AES_BLOCK_SIZE);
+	while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) {
+		n = AES_BLOCK_SIZE;
+		if (nbytes >= 2*AES_BLOCK_SIZE && locked)
+			n = __ctrblk_init(ctrblk, walk->iv, nbytes);
+		ctrptr = (n > AES_BLOCK_SIZE) ? ctrblk : walk->iv;
+		k = cpacf_kmctr(ctx->fc | modifier, ctx->pk.protkey,
+				walk->dst.virt.addr, walk->src.virt.addr,
+				n, ctrptr);
+		if (k) {
+			if (ctrptr == ctrblk)
+				memcpy(walk->iv, ctrptr + k - AES_BLOCK_SIZE,
+				       AES_BLOCK_SIZE);
+			crypto_inc(walk->iv, AES_BLOCK_SIZE);
+			ret = blkcipher_walk_done(desc, walk, nbytes - n);
+		}
+		if (k < n) {
+			if (__ctr_paes_set_key(ctx) != 0)
+				return blkcipher_walk_done(desc, walk, -EIO);
+		}
+	}
+	if (locked)
+		spin_unlock(&ctrblk_lock);
+	/*
+	 * final block may be < AES_BLOCK_SIZE, copy only nbytes
+	 */
+	if (nbytes) {
+		while (1) {
+			if (cpacf_kmctr(ctx->fc | modifier,
+					ctx->pk.protkey, buf,
+					walk->src.virt.addr, AES_BLOCK_SIZE,
+					walk->iv) == AES_BLOCK_SIZE)
+				break;
+			if (__ctr_paes_set_key(ctx) != 0)
+				return blkcipher_walk_done(desc, walk, -EIO);
+		}
+		memcpy(walk->dst.virt.addr, buf, nbytes);
+		crypto_inc(walk->iv, AES_BLOCK_SIZE);
+		ret = blkcipher_walk_done(desc, walk, 0);
+	}
+
+	return ret;
+}
+
+static int ctr_paes_encrypt(struct blkcipher_desc *desc,
+			    struct scatterlist *dst, struct scatterlist *src,
+			    unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ctr_paes_crypt(desc, 0, &walk);
+}
+
+static int ctr_paes_decrypt(struct blkcipher_desc *desc,
+			    struct scatterlist *dst, struct scatterlist *src,
+			    unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ctr_paes_crypt(desc, CPACF_DECRYPT, &walk);
+}
+
+static struct crypto_alg ctr_paes_alg = {
+	.cra_name		=	"ctr(paes)",
+	.cra_driver_name	=	"ctr-paes-s390",
+	.cra_priority		=	400,	/* combo: aes + ctr */
+	.cra_flags		=	CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		=	1,
+	.cra_ctxsize		=	sizeof(struct s390_paes_ctx),
+	.cra_type		=	&crypto_blkcipher_type,
+	.cra_module		=	THIS_MODULE,
+	.cra_list		=	LIST_HEAD_INIT(ctr_paes_alg.cra_list),
+	.cra_u			=	{
+		.blkcipher = {
+			.min_keysize		=	SECKEYBLOBSIZE,
+			.max_keysize		=	SECKEYBLOBSIZE,
+			.ivsize			=	AES_BLOCK_SIZE,
+			.setkey			=	ctr_paes_set_key,
+			.encrypt		=	ctr_paes_encrypt,
+			.decrypt		=	ctr_paes_decrypt,
+		}
+	}
+};
+
+static inline void __crypto_unregister_alg(struct crypto_alg *alg)
+{
+	if (!list_empty(&alg->cra_list))
+		crypto_unregister_alg(alg);
+}
+
+static void paes_s390_fini(void)
+{
+	if (ctrblk)
+		free_page((unsigned long) ctrblk);
+	__crypto_unregister_alg(&ctr_paes_alg);
+	__crypto_unregister_alg(&xts_paes_alg);
+	__crypto_unregister_alg(&cbc_paes_alg);
+	__crypto_unregister_alg(&ecb_paes_alg);
+}
+
+static int __init paes_s390_init(void)
+{
+	int ret;
+
+	/* Query available functions for KM, KMC and KMCTR */
+	cpacf_query(CPACF_KM, &km_functions);
+	cpacf_query(CPACF_KMC, &kmc_functions);
+	cpacf_query(CPACF_KMCTR, &kmctr_functions);
+
+	if (cpacf_test_func(&km_functions, CPACF_KM_PAES_128) ||
+	    cpacf_test_func(&km_functions, CPACF_KM_PAES_192) ||
+	    cpacf_test_func(&km_functions, CPACF_KM_PAES_256)) {
+		ret = crypto_register_alg(&ecb_paes_alg);
+		if (ret)
+			goto out_err;
+	}
+
+	if (cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_128) ||
+	    cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_192) ||
+	    cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_256)) {
+		ret = crypto_register_alg(&cbc_paes_alg);
+		if (ret)
+			goto out_err;
+	}
+
+	if (cpacf_test_func(&km_functions, CPACF_KM_PXTS_128) ||
+	    cpacf_test_func(&km_functions, CPACF_KM_PXTS_256)) {
+		ret = crypto_register_alg(&xts_paes_alg);
+		if (ret)
+			goto out_err;
+	}
+
+	if (cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_128) ||
+	    cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_192) ||
+	    cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_256)) {
+		ret = crypto_register_alg(&ctr_paes_alg);
+		if (ret)
+			goto out_err;
+		ctrblk = (u8 *) __get_free_page(GFP_KERNEL);
+		if (!ctrblk) {
+			ret = -ENOMEM;
+			goto out_err;
+		}
+	}
+
+	return 0;
+out_err:
+	paes_s390_fini();
+	return ret;
+}
+
+module_init(paes_s390_init);
+module_exit(paes_s390_fini);
+
+MODULE_ALIAS_CRYPTO("aes-all");
+
+MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm with protected keys");
+MODULE_LICENSE("GPL");
diff --git a/arch/s390/defconfig b/arch/s390/defconfig
index d00e368..68bfd09 100644
--- a/arch/s390/defconfig
+++ b/arch/s390/defconfig
@@ -229,6 +229,7 @@
 CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_ZCRYPT=m
+CONFIG_PKEY=m
 CONFIG_CRYPTO_SHA1_S390=m
 CONFIG_CRYPTO_SHA256_S390=m
 CONFIG_CRYPTO_SHA512_S390=m
diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index 2c680db..e2dfbf2 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -28,8 +28,9 @@
 #define CPACF_PPNO		0xb93c		/* MSA5 */
 
 /*
- * Decryption modifier bit
+ * En/decryption modifier bits
  */
+#define CPACF_ENCRYPT		0x00
 #define CPACF_DECRYPT		0x80
 
 /*
@@ -42,8 +43,13 @@
 #define CPACF_KM_AES_128	0x12
 #define CPACF_KM_AES_192	0x13
 #define CPACF_KM_AES_256	0x14
+#define CPACF_KM_PAES_128	0x1a
+#define CPACF_KM_PAES_192	0x1b
+#define CPACF_KM_PAES_256	0x1c
 #define CPACF_KM_XTS_128	0x32
 #define CPACF_KM_XTS_256	0x34
+#define CPACF_KM_PXTS_128	0x3a
+#define CPACF_KM_PXTS_256	0x3c
 
 /*
  * Function codes for the KMC (CIPHER MESSAGE WITH CHAINING)
@@ -56,6 +62,9 @@
 #define CPACF_KMC_AES_128	0x12
 #define CPACF_KMC_AES_192	0x13
 #define CPACF_KMC_AES_256	0x14
+#define CPACF_KMC_PAES_128	0x1a
+#define CPACF_KMC_PAES_192	0x1b
+#define CPACF_KMC_PAES_256	0x1c
 #define CPACF_KMC_PRNG		0x43
 
 /*
@@ -69,6 +78,9 @@
 #define CPACF_KMCTR_AES_128	0x12
 #define CPACF_KMCTR_AES_192	0x13
 #define CPACF_KMCTR_AES_256	0x14
+#define CPACF_KMCTR_PAES_128	0x1a
+#define CPACF_KMCTR_PAES_192	0x1b
+#define CPACF_KMCTR_PAES_256	0x1c
 
 /*
  * Function codes for the KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST)
@@ -99,6 +111,18 @@
 #define CPACF_KMAC_TDEA_192	0x03
 
 /*
+ * Function codes for the PCKMO (PERFORM CRYPTOGRAPHIC KEY MANAGEMENT)
+ * instruction
+ */
+#define CPACF_PCKMO_QUERY		0x00
+#define CPACF_PCKMO_ENC_DES_KEY		0x01
+#define CPACF_PCKMO_ENC_TDES_128_KEY	0x02
+#define CPACF_PCKMO_ENC_TDES_192_KEY	0x03
+#define CPACF_PCKMO_ENC_AES_128_KEY	0x12
+#define CPACF_PCKMO_ENC_AES_192_KEY	0x13
+#define CPACF_PCKMO_ENC_AES_256_KEY	0x14
+
+/*
  * Function codes for the PPNO (PERFORM PSEUDORANDOM NUMBER OPERATION)
  * instruction
  */
@@ -397,4 +421,24 @@ static inline void cpacf_pcc(unsigned long func, void *param)
 		: "cc", "memory");
 }
 
+/**
+ * cpacf_pckmo() - executes the PCKMO (PERFORM CRYPTOGRAPHIC KEY
+ *		  MANAGEMENT) instruction
+ * @func: the function code passed to PCKMO; see CPACF_PCKMO_xxx defines
+ * @param: address of parameter block; see POP for details on each func
+ *
+ * Does not return a value; results are placed in the parameter block.
+ */
+static inline void cpacf_pckmo(long func, void *param)
+{
+	register unsigned long r0 asm("0") = (unsigned long) func;
+	register unsigned long r1 asm("1") = (unsigned long) param;
+
+	asm volatile(
+		"       .insn   rre,%[opc] << 16,0,0\n" /* PCKMO opcode */
+		:
+		: [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (CPACF_PCKMO)
+		: "cc", "memory");
+}
+
 #endif	/* _ASM_S390_CPACF_H */
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 67f7a99..9b828c0 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -63,7 +63,7 @@ static inline void set_user_asce(struct mm_struct *mm)
 	S390_lowcore.user_asce = mm->context.asce;
 	if (current->thread.mm_segment.ar4)
 		__ctl_load(S390_lowcore.user_asce, 7, 7);
-	set_cpu_flag(CIF_ASCE);
+	set_cpu_flag(CIF_ASCE_PRIMARY);
 }
 
 static inline void clear_user_asce(void)
@@ -81,7 +81,7 @@ static inline void load_kernel_asce(void)
 	__ctl_store(asce, 1, 1);
 	if (asce != S390_lowcore.kernel_asce)
 		__ctl_load(S390_lowcore.kernel_asce, 1, 1);
-	set_cpu_flag(CIF_ASCE);
+	set_cpu_flag(CIF_ASCE_PRIMARY);
 }
 
 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 5251186..7ed1972 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -640,12 +640,12 @@ static inline int pud_bad(pud_t pud)
 
 static inline int pmd_present(pmd_t pmd)
 {
-	return pmd_val(pmd) != _SEGMENT_ENTRY_INVALID;
+	return pmd_val(pmd) != _SEGMENT_ENTRY_EMPTY;
 }
 
 static inline int pmd_none(pmd_t pmd)
 {
-	return pmd_val(pmd) == _SEGMENT_ENTRY_INVALID;
+	return pmd_val(pmd) == _SEGMENT_ENTRY_EMPTY;
 }
 
 static inline unsigned long pmd_pfn(pmd_t pmd)
@@ -803,7 +803,7 @@ static inline void pud_clear(pud_t *pud)
 
 static inline void pmd_clear(pmd_t *pmdp)
 {
-	pmd_val(*pmdp) = _SEGMENT_ENTRY_INVALID;
+	pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
 }
 
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -1357,7 +1357,7 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
 static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 					    unsigned long addr, pmd_t *pmdp)
 {
-	return pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_INVALID));
+	return pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
 }
 
 #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
@@ -1367,10 +1367,10 @@ static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
 {
 	if (full) {
 		pmd_t pmd = *pmdp;
-		*pmdp = __pmd(_SEGMENT_ENTRY_INVALID);
+		*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
 		return pmd;
 	}
-	return pmdp_xchg_lazy(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_INVALID));
+	return pmdp_xchg_lazy(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
 }
 
 #define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
@@ -1384,7 +1384,7 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
 static inline void pmdp_invalidate(struct vm_area_struct *vma,
 				   unsigned long addr, pmd_t *pmdp)
 {
-	pmdp_xchg_direct(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_INVALID));
+	pmdp_xchg_direct(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
 }
 
 #define __HAVE_ARCH_PMDP_SET_WRPROTECT
diff --git a/arch/s390/include/asm/pkey.h b/arch/s390/include/asm/pkey.h
new file mode 100644
index 0000000..b48aef4
--- /dev/null
+++ b/arch/s390/include/asm/pkey.h
@@ -0,0 +1,90 @@
+/*
+ * Kernelspace interface to the pkey device driver
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author: Harald Freudenberger <freude@de.ibm.com>
+ *
+ */
+
+#ifndef _KAPI_PKEY_H
+#define _KAPI_PKEY_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+#include <uapi/asm/pkey.h>
+
+/*
+ * Generate (AES) random secure key.
+ * @param cardnr may be -1 (use default card)
+ * @param domain may be -1 (use default domain)
+ * @param keytype one of the PKEY_KEYTYPE values
+ * @param seckey pointer to buffer receiving the secure key
+ * @return 0 on success, negative errno value on failure
+ */
+int pkey_genseckey(__u16 cardnr, __u16 domain,
+		   __u32 keytype, struct pkey_seckey *seckey);
+
+/*
+ * Generate (AES) secure key with given key value.
+ * @param cardnr may be -1 (use default card)
+ * @param domain may be -1 (use default domain)
+ * @param keytype one of the PKEY_KEYTYPE values
+ * @param clrkey pointer to buffer with clear key data
+ * @param seckey pointer to buffer receiving the secure key
+ * @return 0 on success, negative errno value on failure
+ */
+int pkey_clr2seckey(__u16 cardnr, __u16 domain, __u32 keytype,
+		    const struct pkey_clrkey *clrkey,
+		    struct pkey_seckey *seckey);
+
+/*
+ * Derive (AES) protected key from the (AES) secure key blob.
+ * @param cardnr may be -1 (use default card)
+ * @param domain may be -1 (use default domain)
+ * @param seckey pointer to buffer with the input secure key
+ * @param protkey pointer to buffer receiving the protected key and
+ *	  additional info (type, length)
+ * @return 0 on success, negative errno value on failure
+ */
+int pkey_sec2protkey(__u16 cardnr, __u16 domain,
+		     const struct pkey_seckey *seckey,
+		     struct pkey_protkey *protkey);
+
+/*
+ * Derive (AES) protected key from a given clear key value.
+ * @param keytype one of the PKEY_KEYTYPE values
+ * @param clrkey pointer to buffer with clear key data
+ * @param protkey pointer to buffer receiving the protected key and
+ *	  additional info (type, length)
+ * @return 0 on success, negative errno value on failure
+ */
+int pkey_clr2protkey(__u32 keytype,
+		     const struct pkey_clrkey *clrkey,
+		     struct pkey_protkey *protkey);
+
+/*
+ * Search for a matching crypto card based on the Master Key
+ * Verification Pattern provided inside a secure key.
+ * @param seckey pointer to buffer with the input secure key
+ * @param cardnr pointer to cardnr, receives the card number on success
+ * @param domain pointer to domain, receives the domain number on success
+ * @param verify if set, always verify by fetching verification pattern
+ *	  from card
+ * @return 0 on success, negative errno value on failure. If no card could be
+ *	   found, -ENODEV is returned.
+ */
+int pkey_findcard(const struct pkey_seckey *seckey,
+		  __u16 *cardnr, __u16 *domain, int verify);
+
+/*
+ * Find card and transform secure key to protected key.
+ * @param seckey pointer to buffer with the input secure key
+ * @param protkey pointer to buffer receiving the protected key and
+ *	  additional info (type, length)
+ * @return 0 on success, negative errno value on failure
+ */
+int pkey_skey2pkey(const struct pkey_seckey *seckey,
+		   struct pkey_protkey *protkey);
+
+#endif /* _KAPI_PKEY_H */
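
A sketch of how a kernel consumer might chain these calls (this demo
function is hypothetical, not part of the patch): generate a random
secure key, then derive the protected key that the paes algorithms use.

/* Sketch: -1 for cardnr/domain selects the defaults, as documented
 * above. Error handling is reduced to passing rc through. */
#include <asm/pkey.h>

static int pkey_demo(struct pkey_protkey *pk)
{
	struct pkey_seckey sk;
	int rc;

	rc = pkey_genseckey(-1, -1, PKEY_KEYTYPE_AES_256, &sk);
	if (rc)
		return rc;
	return pkey_skey2pkey(&sk, pk);	/* fills type, len, protkey */
}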
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index dacba34..e498871 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -14,14 +14,16 @@
 #include <linux/const.h>
 
 #define CIF_MCCK_PENDING	0	/* machine check handling is pending */
-#define CIF_ASCE		1	/* user asce needs fixup / uaccess */
-#define CIF_NOHZ_DELAY		2	/* delay HZ disable for a tick */
-#define CIF_FPU			3	/* restore FPU registers */
-#define CIF_IGNORE_IRQ		4	/* ignore interrupt (for udelay) */
-#define CIF_ENABLED_WAIT	5	/* in enabled wait state */
+#define CIF_ASCE_PRIMARY	1	/* primary asce needs fixup / uaccess */
+#define CIF_ASCE_SECONDARY	2	/* secondary asce needs fixup / uaccess */
+#define CIF_NOHZ_DELAY		3	/* delay HZ disable for a tick */
+#define CIF_FPU			4	/* restore FPU registers */
+#define CIF_IGNORE_IRQ		5	/* ignore interrupt (for udelay) */
+#define CIF_ENABLED_WAIT	6	/* in enabled wait state */
 
 #define _CIF_MCCK_PENDING	_BITUL(CIF_MCCK_PENDING)
-#define _CIF_ASCE		_BITUL(CIF_ASCE)
+#define _CIF_ASCE_PRIMARY	_BITUL(CIF_ASCE_PRIMARY)
+#define _CIF_ASCE_SECONDARY	_BITUL(CIF_ASCE_SECONDARY)
 #define _CIF_NOHZ_DELAY		_BITUL(CIF_NOHZ_DELAY)
 #define _CIF_FPU		_BITUL(CIF_FPU)
 #define _CIF_IGNORE_IRQ		_BITUL(CIF_IGNORE_IRQ)
@@ -89,7 +91,8 @@ extern void execve_tail(void);
  * User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit.
  */
 
-#define TASK_SIZE_OF(tsk)	((tsk)->mm->context.asce_limit)
+#define TASK_SIZE_OF(tsk)	((tsk)->mm ? \
+				 (tsk)->mm->context.asce_limit : TASK_MAX_SIZE)
 #define TASK_UNMAPPED_BASE	(test_thread_flag(TIF_31BIT) ? \
 					(1UL << 30) : (1UL << 41))
 #define TASK_SIZE		TASK_SIZE_OF(current)
@@ -200,10 +203,12 @@ struct stack_frame {
 struct task_struct;
 struct mm_struct;
 struct seq_file;
+struct pt_regs;
 
 typedef int (*dump_trace_func_t)(void *data, unsigned long address, int reliable);
 void dump_trace(dump_trace_func_t func, void *data,
 		struct task_struct *task, unsigned long sp);
+void show_registers(struct pt_regs *regs);
 
 void show_cacheinfo(struct seq_file *m);
 
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index b2988fc..136932f 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -14,6 +14,7 @@
  */
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <asm/processor.h>
 #include <asm/ctl_reg.h>
 
 #define VERIFY_READ     0
@@ -36,18 +37,20 @@
 
 #define get_ds()        (KERNEL_DS)
 #define get_fs()        (current->thread.mm_segment)
-
-#define set_fs(x)							\
-do {									\
-	unsigned long __pto;						\
-	current->thread.mm_segment = (x);				\
-	__pto = current->thread.mm_segment.ar4 ?			\
-		S390_lowcore.user_asce : S390_lowcore.kernel_asce;	\
-	__ctl_load(__pto, 7, 7);					\
-} while (0)
-
 #define segment_eq(a,b) ((a).ar4 == (b).ar4)
 
+static inline void set_fs(mm_segment_t fs)
+{
+	current->thread.mm_segment = fs;
+	if (segment_eq(fs, KERNEL_DS)) {
+		set_cpu_flag(CIF_ASCE_SECONDARY);
+		__ctl_load(S390_lowcore.kernel_asce, 7, 7);
+	} else {
+		clear_cpu_flag(CIF_ASCE_SECONDARY);
+		__ctl_load(S390_lowcore.user_asce, 7, 7);
+	}
+}
+
 static inline int __range_ok(unsigned long addr, unsigned long size)
 {
 	return 1;
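For context, this keeps the classic balanced set_fs() pattern working while moving the control-register reload into an inline function; a sketch (not part of this patch):

	mm_segment_t old_fs = get_fs();

	set_fs(KERNEL_DS);	/* sets CIF_ASCE_SECONDARY, loads the kernel asce into cr7 */
	/* ... uaccess helpers may now operate on kernel addresses ... */
	set_fs(old_fs);		/* clears the flag and restores the user asce */

If the restoring call is forgotten, the return-to-user path in entry.S still sees _CIF_ASCE_SECONDARY set and branches to set_fs_fixup(), which forces USER_DS and warns (see the process.c hunk below).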
diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild
index bf736e7..6848ba5 100644
--- a/arch/s390/include/uapi/asm/Kbuild
+++ b/arch/s390/include/uapi/asm/Kbuild
@@ -24,6 +24,7 @@
 header-y += monwriter.h
 header-y += msgbuf.h
 header-y += param.h
+header-y += pkey.h
 header-y += poll.h
 header-y += posix_types.h
 header-y += ptrace.h
diff --git a/arch/s390/include/uapi/asm/pkey.h b/arch/s390/include/uapi/asm/pkey.h
new file mode 100644
index 0000000..ed7f19c
--- /dev/null
+++ b/arch/s390/include/uapi/asm/pkey.h
@@ -0,0 +1,112 @@
+/*
+ * Userspace interface to the pkey device driver
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * Author: Harald Freudenberger <freude@de.ibm.com>
+ *
+ */
+
+#ifndef _UAPI_PKEY_H
+#define _UAPI_PKEY_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/*
+ * Ioctl calls supported by the pkey device driver
+ */
+
+#define PKEY_IOCTL_MAGIC 'p'
+
+#define SECKEYBLOBSIZE	64     /* secure key blob size is always 64 bytes */
+#define MAXPROTKEYSIZE	64  /* a protected key blob may be up to 64 bytes */
+#define MAXCLRKEYSIZE	32     /* a clear key value may be up to 32 bytes */
+
+/* defines for the type field within the pkey_protkey struct */
+#define PKEY_KEYTYPE_AES_128  1
+#define PKEY_KEYTYPE_AES_192  2
+#define PKEY_KEYTYPE_AES_256  3
+
+/* Struct to hold a secure key blob */
+struct pkey_seckey {
+	__u8  seckey[SECKEYBLOBSIZE];		  /* the secure key blob */
+};
+
+/* Struct to hold protected key and length info */
+struct pkey_protkey {
+	__u32 type;	     /* key type, one of the PKEY_KEYTYPE values */
+	__u32 len;		/* bytes actually stored in protkey[]	 */
+	__u8  protkey[MAXPROTKEYSIZE];	       /* the protected key blob */
+};
+
+/* Struct to hold a clear key value */
+struct pkey_clrkey {
+	__u8  clrkey[MAXCLRKEYSIZE]; /* 16, 24, or 32 byte clear key value */
+};
+
+/*
+ * Generate secure key
+ */
+struct pkey_genseck {
+	__u16 cardnr;		    /* in: card to use or FFFF for any	 */
+	__u16 domain;		    /* in: domain or FFFF for any	 */
+	__u32 keytype;		    /* in: key type to generate		 */
+	struct pkey_seckey seckey;  /* out: the secure key blob		 */
+};
+#define PKEY_GENSECK _IOWR(PKEY_IOCTL_MAGIC, 0x01, struct pkey_genseck)
+
+/*
+ * Construct secure key from clear key value
+ */
+struct pkey_clr2seck {
+	__u16 cardnr;		    /* in: card to use or FFFF for any	 */
+	__u16 domain;		    /* in: domain or FFFF for any	 */
+	__u32 keytype;		    /* in: key type to generate		 */
+	struct pkey_clrkey clrkey;  /* in: the clear key value		 */
+	struct pkey_seckey seckey;  /* out: the secure key blob		 */
+};
+#define PKEY_CLR2SECK _IOWR(PKEY_IOCTL_MAGIC, 0x02, struct pkey_clr2seck)
+
+/*
+ * Fabricate protected key from a secure key
+ */
+struct pkey_sec2protk {
+	__u16 cardnr;		     /* in: card to use or FFFF for any   */
+	__u16 domain;		     /* in: domain or FFFF for any	  */
+	struct pkey_seckey seckey;   /* in: the secure key blob		  */
+	struct pkey_protkey protkey; /* out: the protected key		  */
+};
+#define PKEY_SEC2PROTK _IOWR(PKEY_IOCTL_MAGIC, 0x03, struct pkey_sec2protk)
+
+/*
+ * Fabricate protected key from a clear key value
+ */
+struct pkey_clr2protk {
+	__u32 keytype;		     /* in: key type to generate	  */
+	struct pkey_clrkey clrkey;   /* in: the clear key value		  */
+	struct pkey_protkey protkey; /* out: the protected key		  */
+};
+#define PKEY_CLR2PROTK _IOWR(PKEY_IOCTL_MAGIC, 0x04, struct pkey_clr2protk)
+
+/*
+ * Search for matching crypto card based on the Master Key
+ * Verification Pattern provided inside a secure key.
+ */
+struct pkey_findcard {
+	struct pkey_seckey seckey;	       /* in: the secure key blob */
+	__u16  cardnr;			       /* out: card number	  */
+	__u16  domain;			       /* out: domain number	  */
+};
+#define PKEY_FINDCARD _IOWR(PKEY_IOCTL_MAGIC, 0x05, struct pkey_findcard)
+
+/*
+ * Combined together: findcard + sec2prot
+ */
+struct pkey_skey2pkey {
+	struct pkey_seckey seckey;   /* in: the secure key blob		  */
+	struct pkey_protkey protkey; /* out: the protected key		  */
+};
+#define PKEY_SKEY2PKEY _IOWR(PKEY_IOCTL_MAGIC, 0x06, struct pkey_skey2pkey)
+
+#endif /* _UAPI_PKEY_H */
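A hedged userspace sketch of the ioctl interface above: generate a secure key, then turn it into a protected key. It assumes the driver exposes a /dev/pkey device node and that this header is installed as <asm/pkey.h>; neither is shown in this excerpt:

	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <asm/pkey.h>

	int main(void)
	{
		struct pkey_genseck gs;
		struct pkey_sec2protk sp;
		int fd = open("/dev/pkey", O_RDWR);	/* assumed node */
		int ret = 1;

		if (fd < 0)
			return ret;

		memset(&gs, 0, sizeof(gs));
		gs.cardnr = 0xFFFF;			/* any card */
		gs.domain = 0xFFFF;			/* any domain */
		gs.keytype = PKEY_KEYTYPE_AES_128;
		if (ioctl(fd, PKEY_GENSECK, &gs) < 0)
			goto out;

		memset(&sp, 0, sizeof(sp));
		sp.cardnr = 0xFFFF;
		sp.domain = 0xFFFF;
		sp.seckey = gs.seckey;
		if (ioctl(fd, PKEY_SEC2PROTK, &sp) < 0)
			goto out;

		printf("protected key: type %u, %u bytes\n",
		       sp.protkey.type, sp.protkey.len);
		ret = 0;
	out:
		close(fd);
		return ret;
	}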
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index db469fa..dff2152 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -50,7 +50,8 @@
 		   _TIF_UPROBE)
 _TIF_TRACE	= (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
 		   _TIF_SYSCALL_TRACEPOINT)
-_CIF_WORK	= (_CIF_MCCK_PENDING | _CIF_ASCE | _CIF_FPU)
+_CIF_WORK	= (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \
+		   _CIF_ASCE_SECONDARY | _CIF_FPU)
 _PIF_WORK	= (_PIF_PER_TRAP)
 
 #define BASED(name) name-cleanup_critical(%r13)
@@ -339,8 +340,8 @@
 	jo	.Lsysc_notify_resume
 	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
 	jo	.Lsysc_vxrs
-	TSTMSK	__LC_CPU_FLAGS,_CIF_ASCE
-	jo	.Lsysc_uaccess
+	TSTMSK	__LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY)
+	jnz	.Lsysc_asce
 	j	.Lsysc_return		# beware of critical section cleanup
 
 #
@@ -358,12 +359,15 @@
 	jg	s390_handle_mcck	# TIF bit will be cleared by handler
 
 #
-# _CIF_ASCE is set, load user space asce
+# _CIF_ASCE_PRIMARY and/or _CIF_ASCE_SECONDARY set, load user space asce
 #
-.Lsysc_uaccess:
-	ni	__LC_CPU_FLAGS+7,255-_CIF_ASCE
+.Lsysc_asce:
+	ni	__LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY
 	lctlg	%c1,%c1,__LC_USER_ASCE		# load primary asce
-	j	.Lsysc_return
+	TSTMSK	__LC_CPU_FLAGS,_CIF_ASCE_SECONDARY
+	jz	.Lsysc_return
+	larl	%r14,.Lsysc_return
+	jg	set_fs_fixup
 
 #
 # CIF_FPU is set, restore floating-point controls and floating-point registers.
@@ -661,8 +665,8 @@
 	jo	.Lio_notify_resume
 	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
 	jo	.Lio_vxrs
-	TSTMSK	__LC_CPU_FLAGS,_CIF_ASCE
-	jo	.Lio_uaccess
+	TSTMSK	__LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY)
+	jnz	.Lio_asce
 	j	.Lio_return		# beware of critical section cleanup
 
 #
@@ -675,12 +679,15 @@
 	j	.Lio_return
 
 #
-# _CIF_ASCE is set, load user space asce
+# _CIF_ASCE_PRIMARY and/or _CIF_ASCE_SECONDARY set, load user space asce
 #
-.Lio_uaccess:
-	ni	__LC_CPU_FLAGS+7,255-_CIF_ASCE
+.Lio_asce:
+	ni	__LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY
 	lctlg	%c1,%c1,__LC_USER_ASCE		# load primary asce
-	j	.Lio_return
+	TSTMSK	__LC_CPU_FLAGS,_CIF_ASCE_SECONDARY
+	jz	.Lio_return
+	larl	%r14,.Lio_return
+	jg	set_fs_fixup
 
 #
 # CIF_FPU is set, restore floating-point controls and floating-point registers.
diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h
index e79f030..33f9018 100644
--- a/arch/s390/kernel/entry.h
+++ b/arch/s390/kernel/entry.h
@@ -80,5 +80,6 @@ long sys_s390_pci_mmio_read(unsigned long, void __user *, size_t);
 DECLARE_PER_CPU(u64, mt_cycles[8]);
 
 void verify_facilities(void);
+void set_fs_fixup(void);
 
 #endif /* _ENTRY_H */
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 56e14d0..80c093e 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -116,6 +116,19 @@ static int notrace s390_validate_registers(union mci mci, int umode)
 			s390_handle_damage();
 		kill_task = 1;
 	}
+	/* Validate control registers */
+	if (!mci.cr) {
+		/*
+		 * Control registers have unknown contents.
+		 * Can't recover, therefore stop the machine.
+		 */
+		s390_handle_damage();
+	} else {
+		asm volatile(
+			"	lctlg	0,15,0(%0)\n"
+			"	ptlb\n"
+			: : "a" (&S390_lowcore.cregs_save_area) : "memory");
+	}
 	if (!mci.fp) {
 		/*
 		 * Floating point registers can't be restored. If the
@@ -208,18 +221,6 @@ static int notrace s390_validate_registers(union mci mci, int umode)
 		 */
 		kill_task = 1;
 	}
-	/* Validate control registers */
-	if (!mci.cr) {
-		/*
-		 * Control registers have unknown contents.
-		 * Can't recover and therefore stopping machine.
-		 */
-		s390_handle_damage();
-	} else {
-		asm volatile(
-			"	lctlg	0,15,0(%0)"
-			: : "a" (&S390_lowcore.cregs_save_area) : "memory");
-	}
 	/*
 	 * We don't even try to validate the TOD register, since we simply
 	 * can't write something sensible into that register.
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index c5b86b4..5428166 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -100,8 +100,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 	return 0;
 }
 
-int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
-		unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
+		    unsigned long arg, struct task_struct *p, unsigned long tls)
 {
 	struct fake_frame
 	{
@@ -156,7 +156,6 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
 
 	/* Set a new TLS ?  */
 	if (clone_flags & CLONE_SETTLS) {
-		unsigned long tls = frame->childregs.gprs[6];
 		if (is_compat_task()) {
 			p->thread.acrs[0] = (unsigned int)tls;
 		} else {
@@ -234,3 +233,16 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
 	ret = PAGE_ALIGN(mm->brk + brk_rnd());
 	return (ret > mm->brk) ? ret : mm->brk;
 }
+
+void set_fs_fixup(void)
+{
+	struct pt_regs *regs = current_pt_regs();
+	static bool warned;
+
+	set_fs(USER_DS);
+	if (warned)
+		return;
+	WARN(1, "Unbalanced set_fs - int code: 0x%x\n", regs->int_code);
+	show_registers(regs);
+	warned = true;
+}
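For reference, the generic side of the copy_thread_tls() switch: when an architecture provides this hook (and selects HAVE_COPY_THREAD_TLS, which happens outside this excerpt), kernel/fork.c passes the clone() tls argument straight through, so the arch no longer has to fish it out of the child's register frame. The pre-existing generic fallback is roughly:

	#ifndef CONFIG_HAVE_COPY_THREAD_TLS
	/* legacy path: the architecture digs the tls value out of the
	 * child's saved registers itself (gprs[6] on s390, as the old
	 * copy_thread() above did) */
	static inline int copy_thread_tls(unsigned long clone_flags,
		unsigned long sp, unsigned long arg,
		struct task_struct *p, unsigned long tls)
	{
		return copy_thread(clone_flags, sp, arg, p);
	}
	#endif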
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 59ac937..a07b1ec 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -359,8 +359,8 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
 	spin_lock(&gmap->guest_table_lock);
 	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
 	if (entry) {
-		flush = (*entry != _SEGMENT_ENTRY_INVALID);
-		*entry = _SEGMENT_ENTRY_INVALID;
+		flush = (*entry != _SEGMENT_ENTRY_EMPTY);
+		*entry = _SEGMENT_ENTRY_EMPTY;
 	}
 	spin_unlock(&gmap->guest_table_lock);
 	return flush;
@@ -589,7 +589,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
 		return rc;
 	ptl = pmd_lock(mm, pmd);
 	spin_lock(&gmap->guest_table_lock);
-	if (*table == _SEGMENT_ENTRY_INVALID) {
+	if (*table == _SEGMENT_ENTRY_EMPTY) {
 		rc = radix_tree_insert(&gmap->host_to_guest,
 				       vmaddr >> PMD_SHIFT, table);
 		if (!rc)
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index a038162..9b4050c 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -62,7 +62,7 @@ static inline unsigned long __pte_to_rste(pte_t pte)
 		rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC,
 				     _SEGMENT_ENTRY_NOEXEC);
 	} else
-		rste = _SEGMENT_ENTRY_INVALID;
+		rste = _SEGMENT_ENTRY_EMPTY;
 	return rste;
 }
 
diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 2cac445..0b49dbc 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -62,19 +62,32 @@
 	  will be called geode-aes.
 
 config ZCRYPT
-	tristate "Support for PCI-attached cryptographic adapters"
+	tristate "Support for s390 cryptographic adapters"
 	depends on S390
 	select HW_RANDOM
 	help
-	  Select this option if you want to use a PCI-attached cryptographic
-	  adapter like:
-	  + PCI Cryptographic Accelerator (PCICA)
-	  + PCI Cryptographic Coprocessor (PCICC)
+	  Select this option if you want to enable support for
+	  s390 cryptographic adapters like:
 	  + PCI-X Cryptographic Coprocessor (PCIXCC)
-	  + Crypto Express2 Coprocessor (CEX2C)
-	  + Crypto Express2 Accelerator (CEX2A)
-	  + Crypto Express3 Coprocessor (CEX3C)
-	  + Crypto Express3 Accelerator (CEX3A)
+	  + Crypto Express 2, 3, 4 or 5 Coprocessor (CEXxC)
+	  + Crypto Express 2, 3, 4 or 5 Accelerator (CEXxA)
+	  + Crypto Express 4 or 5 EP11 Coprocessor (CEXxP)
+
+config PKEY
+	tristate "Kernel API for protected key handling"
+	depends on S390
+	depends on ZCRYPT
+	help
+	  With this option enabled, the pkey kernel module provides an API
+	  for creation and handling of protected keys. Other parts of the
+	  kernel or userspace applications may use these functions.
+
+	  Select this option if you want to enable the kernel and userspace
+	  API for protected key handling.
+
+	  Please note that creating protected keys from secure keys
+	  requires at least one CEX card in coprocessor mode to be
+	  available at runtime.
 
 config CRYPTO_SHA1_S390
 	tristate "SHA1 digest algorithm"
@@ -124,6 +137,7 @@
 	depends on S390
 	select CRYPTO_ALGAPI
 	select CRYPTO_BLKCIPHER
+	select PKEY
 	help
 	  This is the s390 hardware accelerated implementation of the
 	  AES cipher algorithms (FIPS-197).
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index edaae9f..e426ac8 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -13,6 +13,7 @@
 				multicast.o mad.o smi.o agent.o mad_rmpp.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
+ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
 
 ib_cm-y :=			cm.o
 
diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c
new file mode 100644
index 0000000..126ac5f
--- /dev/null
+++ b/drivers/infiniband/core/cgroup.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include "core_priv.h"
+
+/**
+ * ib_device_register_rdmacg - register with rdma cgroup.
+ * @device: device to register to participate in resource
+ *          accounting by rdma cgroup.
+ *
+ * Register with the rdma cgroup. Should be called before
+ * exposing the rdma device to user space applications, to
+ * avoid a resource accounting leak.
+ * Returns 0 on success, otherwise a failure code.
+ */
+int ib_device_register_rdmacg(struct ib_device *device)
+{
+	device->cg_device.name = device->name;
+	return rdmacg_register_device(&device->cg_device);
+}
+
+/**
+ * ib_device_unregister_rdmacg - unregister with rdma cgroup.
+ * @device: device to unregister.
+ *
+ * Unregister with the rdma cgroup. Should be called after
+ * all resources are deallocated, at a stage when no further
+ * resource allocation by user applications can occur for this
+ * device, so that no accounting is leaked.
+ */
+void ib_device_unregister_rdmacg(struct ib_device *device)
+{
+	rdmacg_unregister_device(&device->cg_device);
+}
+
+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+			 struct ib_device *device,
+			 enum rdmacg_resource_type resource_index)
+{
+	return rdmacg_try_charge(&cg_obj->cg, &device->cg_device,
+				 resource_index);
+}
+EXPORT_SYMBOL(ib_rdmacg_try_charge);
+
+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+			struct ib_device *device,
+			enum rdmacg_resource_type resource_index)
+{
+	rdmacg_uncharge(cg_obj->cg, &device->cg_device,
+			resource_index);
+}
+EXPORT_SYMBOL(ib_rdmacg_uncharge);
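These two calls are used bracket-style around every uverbs object allocation later in this series; a minimal sketch of the pattern, with the allocation call being hypothetical:

	ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
				   RDMACG_RESOURCE_HCA_OBJECT);
	if (ret)
		return ret;		/* cgroup limit reached, fail the verb */

	obj = ib_dev->alloc_some_object(...);	/* hypothetical verb call */
	if (IS_ERR(obj)) {
		/* every successful charge needs a matching uncharge */
		ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev,
				   RDMACG_RESOURCE_HCA_OBJECT);
		return PTR_ERR(obj);
	}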
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 912ab4c..cb7d372 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -35,6 +35,7 @@
 
 #include <linux/list.h>
 #include <linux/spinlock.h>
+#include <linux/cgroup_rdma.h>
 
 #include <rdma/ib_verbs.h>
 
@@ -124,6 +125,35 @@ int ib_cache_setup_one(struct ib_device *device);
 void ib_cache_cleanup_one(struct ib_device *device);
 void ib_cache_release_one(struct ib_device *device);
 
+#ifdef CONFIG_CGROUP_RDMA
+int ib_device_register_rdmacg(struct ib_device *device);
+void ib_device_unregister_rdmacg(struct ib_device *device);
+
+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+			 struct ib_device *device,
+			 enum rdmacg_resource_type resource_index);
+
+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+			struct ib_device *device,
+			enum rdmacg_resource_type resource_index);
+#else
+static inline int ib_device_register_rdmacg(struct ib_device *device)
+{ return 0; }
+
+static inline void ib_device_unregister_rdmacg(struct ib_device *device)
+{ }
+
+static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+				       struct ib_device *device,
+				       enum rdmacg_resource_type resource_index)
+{ return 0; }
+
+static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+				      struct ib_device *device,
+				      enum rdmacg_resource_type resource_index)
+{ }
+#endif
+
 static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
 					 struct net_device *upper)
 {
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index a63e840..593d2ce 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -369,10 +369,18 @@ int ib_register_device(struct ib_device *device,
 		goto out;
 	}
 
+	ret = ib_device_register_rdmacg(device);
+	if (ret) {
+		pr_warn("Couldn't register device with rdma cgroup\n");
+		ib_cache_cleanup_one(device);
+		goto out;
+	}
+
 	memset(&device->attrs, 0, sizeof(device->attrs));
 	ret = device->query_device(device, &device->attrs, &uhw);
 	if (ret) {
 		pr_warn("Couldn't query the device attributes\n");
+		ib_device_unregister_rdmacg(device);
 		ib_cache_cleanup_one(device);
 		goto out;
 	}
@@ -381,6 +389,7 @@ int ib_register_device(struct ib_device *device,
 	if (ret) {
 		pr_warn("Couldn't register device %s with driver model\n",
 			device->name);
+		ib_device_unregister_rdmacg(device);
 		ib_cache_cleanup_one(device);
 		goto out;
 	}
@@ -430,6 +439,7 @@ void ib_unregister_device(struct ib_device *device)
 
 	mutex_unlock(&device_mutex);
 
+	ib_device_unregister_rdmacg(device);
 	ib_device_unregister_sysfs(device);
 	ib_cache_cleanup_one(device);
 
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index b4b395a..7b7a76e 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -316,6 +316,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 	struct ib_udata                   udata;
 	struct ib_ucontext		 *ucontext;
 	struct file			 *filp;
+	struct ib_rdmacg_object		 cg_obj;
 	int ret;
 
 	if (out_len < sizeof resp)
@@ -335,13 +336,18 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 		   (unsigned long) cmd.response + sizeof resp,
 		   in_len - sizeof cmd, out_len - sizeof resp);
 
+	ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
+	if (ret)
+		goto err;
+
 	ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
 	if (IS_ERR(ucontext)) {
 		ret = PTR_ERR(ucontext);
-		goto err;
+		goto err_alloc;
 	}
 
 	ucontext->device = ib_dev;
+	ucontext->cg_obj = cg_obj;
 	INIT_LIST_HEAD(&ucontext->pd_list);
 	INIT_LIST_HEAD(&ucontext->mr_list);
 	INIT_LIST_HEAD(&ucontext->mw_list);
@@ -407,6 +413,9 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 	put_pid(ucontext->tgid);
 	ib_dev->dealloc_ucontext(ucontext);
 
+err_alloc:
+	ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
+
 err:
 	mutex_unlock(&file->mutex);
 	return ret;
@@ -561,6 +570,13 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
 		return -ENOMEM;
 
 	init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
+	ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (ret) {
+		kfree(uobj);
+		return ret;
+	}
+
 	down_write(&uobj->mutex);
 
 	pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
@@ -605,6 +621,7 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
 	ib_dealloc_pd(pd);
 
 err:
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
 	put_uobj_write(uobj);
 	return ret;
 }
@@ -637,6 +654,8 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
 	if (ret)
 		goto err_put;
 
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
 	uobj->live = 0;
 	put_uobj_write(uobj);
 
@@ -1006,6 +1025,10 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
 			goto err_put;
 		}
 	}
+	ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (ret)
+		goto err_charge;
 
 	mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
 				     cmd.access_flags, &udata);
@@ -1054,6 +1077,9 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
 	ib_dereg_mr(mr);
 
 err_put:
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
 	put_pd_read(pd);
 
 err_free:
@@ -1178,6 +1204,8 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
 	if (ret)
 		return ret;
 
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
 	idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
 
 	mutex_lock(&file->mutex);
@@ -1226,6 +1254,11 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
 		   in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
 		   out_len - sizeof(resp));
 
+	ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (ret)
+		goto err_charge;
+
 	mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata);
 	if (IS_ERR(mw)) {
 		ret = PTR_ERR(mw);
@@ -1271,6 +1304,9 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
 	uverbs_dealloc_mw(mw);
 
 err_put:
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
 	put_pd_read(pd);
 
 err_free:
@@ -1306,6 +1342,8 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
 	if (ret)
 		return ret;
 
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
 	idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
 
 	mutex_lock(&file->mutex);
@@ -1405,6 +1443,11 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
 	if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
 		attr.flags = cmd->flags;
 
+	ret = ib_rdmacg_try_charge(&obj->uobject.cg_obj, ib_dev,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (ret)
+		goto err_charge;
+
 	cq = ib_dev->create_cq(ib_dev, &attr,
 					     file->ucontext, uhw);
 	if (IS_ERR(cq)) {
@@ -1452,6 +1495,10 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
 	ib_destroy_cq(cq);
 
 err_file:
+	ib_rdmacg_uncharge(&obj->uobject.cg_obj, ib_dev,
+			   RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
 	if (ev_file)
 		ib_uverbs_release_ucq(file, ev_file, obj);
 
@@ -1732,6 +1779,8 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
 	if (ret)
 		return ret;
 
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
 	idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
 
 	mutex_lock(&file->mutex);
@@ -1905,6 +1954,11 @@ static int create_qp(struct ib_uverbs_file *file,
 			goto err_put;
 		}
 
+	ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (ret)
+		goto err_put;
+
 	if (cmd->qp_type == IB_QPT_XRC_TGT)
 		qp = ib_create_qp(pd, &attr);
 	else
@@ -1912,7 +1966,7 @@ static int create_qp(struct ib_uverbs_file *file,
 
 	if (IS_ERR(qp)) {
 		ret = PTR_ERR(qp);
-		goto err_put;
+		goto err_create;
 	}
 
 	if (cmd->qp_type != IB_QPT_XRC_TGT) {
@@ -1993,6 +2047,10 @@ static int create_qp(struct ib_uverbs_file *file,
 err_destroy:
 	ib_destroy_qp(qp);
 
+err_create:
+	ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, device,
+			   RDMACG_RESOURCE_HCA_OBJECT);
+
 err_put:
 	if (xrcd)
 		put_xrcd_read(xrcd_uobj);
@@ -2519,6 +2577,8 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
 	if (ret)
 		return ret;
 
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
 	if (obj->uxrcd)
 		atomic_dec(&obj->uxrcd->refcnt);
 
@@ -2970,11 +3030,16 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
 	memset(&attr.dmac, 0, sizeof(attr.dmac));
 	memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
 
+	ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (ret)
+		goto err_charge;
+
 	ah = pd->device->create_ah(pd, &attr, &udata);
 
 	if (IS_ERR(ah)) {
 		ret = PTR_ERR(ah);
-		goto err_put;
+		goto err_create;
 	}
 
 	ah->device  = pd->device;
@@ -3013,7 +3078,10 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
 err_destroy:
 	ib_destroy_ah(ah);
 
-err_put:
+err_create:
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
+err_charge:
 	put_pd_read(pd);
 
 err:
@@ -3047,6 +3115,8 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
 	if (ret)
 		return ret;
 
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
 	idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
 
 	mutex_lock(&file->mutex);
@@ -3861,10 +3931,16 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
 		err = -EINVAL;
 		goto err_free;
 	}
+
+	err = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (err)
+		goto err_free;
+
 	flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
 	if (IS_ERR(flow_id)) {
 		err = PTR_ERR(flow_id);
-		goto err_free;
+		goto err_create;
 	}
 	flow_id->uobject = uobj;
 	uobj->object = flow_id;
@@ -3897,6 +3973,8 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
 	idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
 destroy_flow:
 	ib_destroy_flow(flow_id);
+err_create:
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
 err_free:
 	kfree(flow_attr);
 err_put:
@@ -3936,8 +4014,11 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
 	flow_id = uobj->object;
 
 	ret = ib_destroy_flow(flow_id);
-	if (!ret)
+	if (!ret) {
+		ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		uobj->live = 0;
+	}
 
 	put_uobj_write(uobj);
 
@@ -4005,6 +4086,11 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
 	obj->uevent.events_reported = 0;
 	INIT_LIST_HEAD(&obj->uevent.event_list);
 
+	ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, ib_dev,
+				   RDMACG_RESOURCE_HCA_OBJECT);
+	if (ret)
+		goto err_put_cq;
+
 	srq = pd->device->create_srq(pd, &attr, udata);
 	if (IS_ERR(srq)) {
 		ret = PTR_ERR(srq);
@@ -4069,6 +4155,8 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
 	ib_destroy_srq(srq);
 
 err_put:
+	ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, ib_dev,
+			   RDMACG_RESOURCE_HCA_OBJECT);
 	put_pd_read(pd);
 
 err_put_cq:
@@ -4255,6 +4343,8 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
 	if (ret)
 		return ret;
 
+	ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
+
 	if (srq_type == IB_SRQT_XRC) {
 		us = container_of(obj, struct ib_usrq_object, uevent);
 		atomic_dec(&us->uxrcd->refcnt);
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index e3fb4b1..35c788a 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -51,6 +51,7 @@
 #include <rdma/ib.h>
 
 #include "uverbs.h"
+#include "core_priv.h"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand userspace verbs access");
@@ -237,6 +238,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 		idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
 		ib_destroy_ah(ah);
+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		kfree(uobj);
 	}
 
@@ -246,6 +249,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 		idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
 		uverbs_dealloc_mw(mw);
+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		kfree(uobj);
 	}
 
@@ -254,6 +259,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 		idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
 		ib_destroy_flow(flow_id);
+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		kfree(uobj);
 	}
 
@@ -266,6 +273,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 		if (qp == qp->real_qp)
 			ib_uverbs_detach_umcast(qp, uqp);
 		ib_destroy_qp(qp);
+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		ib_uverbs_release_uevent(file, &uqp->uevent);
 		kfree(uqp);
 	}
@@ -298,6 +307,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 		idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
 		ib_destroy_srq(srq);
+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		ib_uverbs_release_uevent(file, uevent);
 		kfree(uevent);
 	}
@@ -310,6 +321,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 		idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
 		ib_destroy_cq(cq);
+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		ib_uverbs_release_ucq(file, ev_file, ucq);
 		kfree(ucq);
 	}
@@ -319,6 +332,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 		idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
 		ib_dereg_mr(mr);
+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		kfree(uobj);
 	}
 
@@ -339,11 +354,16 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 		idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
 		ib_dealloc_pd(pd);
+		ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
+				   RDMACG_RESOURCE_HCA_OBJECT);
 		kfree(uobj);
 	}
 
 	put_pid(context->tgid);
 
+	ib_rdmacg_uncharge(&context->cg_obj, context->device,
+			   RDMACG_RESOURCE_HCA_HANDLE);
+
 	return context->device->dealloc_ucontext(context);
 }
 
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 5dc673d..ee1b0e9 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -1434,9 +1434,10 @@
 	  based RTC on SUN4V systems.
 
 config RTC_DRV_SUN6I
-	tristate "Allwinner A31 RTC"
-	default MACH_SUN6I || MACH_SUN8I || COMPILE_TEST
-	depends on ARCH_SUNXI
+	bool "Allwinner A31 RTC"
+	default MACH_SUN6I || MACH_SUN8I
+	depends on COMMON_CLK
+	depends on ARCH_SUNXI || COMPILE_TEST
 	help
 	  If you say Y here you will get support for the RTC found in
 	  some Allwinner SoCs like the A31 or the A64.
@@ -1719,6 +1720,17 @@
 	   This driver can also be built as a module. If so, the module
 	   will be called rtc-r7301.
 
+config RTC_DRV_STM32
+	tristate "STM32 RTC"
+	select REGMAP_MMIO
+	depends on ARCH_STM32 || COMPILE_TEST
+	help
+	   If you say yes here you get support for the STM32 On-Chip
+	   Real Time Clock.
+
+	   This driver can also be built as a module. If so, the module
+	   will be called "rtc-stm32".
+
 comment "HID Sensor RTC drivers"
 
 config RTC_DRV_HID_SENSOR_TIME
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index f13ab1c..f07297b 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -145,6 +145,7 @@
 obj-$(CONFIG_RTC_DRV_SPEAR)	+= rtc-spear.o
 obj-$(CONFIG_RTC_DRV_STARFIRE)	+= rtc-starfire.o
 obj-$(CONFIG_RTC_DRV_STK17TA8)	+= rtc-stk17ta8.o
+obj-$(CONFIG_RTC_DRV_STM32) 	+= rtc-stm32.o
 obj-$(CONFIG_RTC_DRV_STMP)	+= rtc-stmp3xxx.o
 obj-$(CONFIG_RTC_DRV_ST_LPC)	+= rtc-st-lpc.o
 obj-$(CONFIG_RTC_DRV_SUN4V)	+= rtc-sun4v.o
diff --git a/drivers/rtc/rtc-armada38x.c b/drivers/rtc/rtc-armada38x.c
index 9a3f2a6..21f355c 100644
--- a/drivers/rtc/rtc-armada38x.c
+++ b/drivers/rtc/rtc-armada38x.c
@@ -16,6 +16,7 @@
 #include <linux/io.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/rtc.h>
 
@@ -23,17 +24,48 @@
 #define RTC_STATUS_ALARM1	    BIT(0)
 #define RTC_STATUS_ALARM2	    BIT(1)
 #define RTC_IRQ1_CONF	    0x4
-#define RTC_IRQ1_AL_EN		    BIT(0)
-#define RTC_IRQ1_FREQ_EN	    BIT(1)
-#define RTC_IRQ1_FREQ_1HZ	    BIT(2)
+#define RTC_IRQ2_CONF	    0x8
+#define RTC_IRQ_AL_EN		    BIT(0)
+#define RTC_IRQ_FREQ_EN		    BIT(1)
+#define RTC_IRQ_FREQ_1HZ	    BIT(2)
+
 #define RTC_TIME	    0xC
 #define RTC_ALARM1	    0x10
+#define RTC_ALARM2	    0x14
 
-#define SOC_RTC_INTERRUPT   0x8
-#define SOC_RTC_ALARM1		BIT(0)
-#define SOC_RTC_ALARM2		BIT(1)
-#define SOC_RTC_ALARM1_MASK	BIT(2)
-#define SOC_RTC_ALARM2_MASK	BIT(3)
+/* Armada38x SoC registers  */
+#define RTC_38X_BRIDGE_TIMING_CTL   0x0
+#define RTC_38X_PERIOD_OFFS		0
+#define RTC_38X_PERIOD_MASK		(0x3FF << RTC_38X_PERIOD_OFFS)
+#define RTC_38X_READ_DELAY_OFFS		26
+#define RTC_38X_READ_DELAY_MASK		(0x1F << RTC_38X_READ_DELAY_OFFS)
+
+/* Armada 7K/8K registers  */
+#define RTC_8K_BRIDGE_TIMING_CTL0    0x0
+#define RTC_8K_WRCLK_PERIOD_OFFS	0
+#define RTC_8K_WRCLK_PERIOD_MASK	(0xFFFF << RTC_8K_WRCLK_PERIOD_OFFS)
+#define RTC_8K_WRCLK_SETUP_OFFS		16
+#define RTC_8K_WRCLK_SETUP_MASK		(0xFFFF << RTC_8K_WRCLK_SETUP_OFFS)
+#define RTC_8K_BRIDGE_TIMING_CTL1   0x4
+#define RTC_8K_READ_DELAY_OFFS		0
+#define RTC_8K_READ_DELAY_MASK		(0xFFFF << RTC_8K_READ_DELAY_OFFS)
+
+#define RTC_8K_ISR		    0x10
+#define RTC_8K_IMR		    0x14
+#define RTC_8K_ALARM2			BIT(0)
+
+#define SOC_RTC_INTERRUPT	    0x8
+#define SOC_RTC_ALARM1			BIT(0)
+#define SOC_RTC_ALARM2			BIT(1)
+#define SOC_RTC_ALARM1_MASK		BIT(2)
+#define SOC_RTC_ALARM2_MASK		BIT(3)
+
+#define SAMPLE_NR 100
+
+struct value_to_freq {
+	u32 value;
+	u8 freq;
+};
 
 struct armada38x_rtc {
 	struct rtc_device   *rtc_dev;
@@ -41,38 +73,153 @@ struct armada38x_rtc {
 	void __iomem	    *regs_soc;
 	spinlock_t	    lock;
 	int		    irq;
+	struct value_to_freq *val_to_freq;
+	struct armada38x_rtc_data *data;
+};
+
+#define ALARM1	0
+#define ALARM2	1
+
+#define ALARM_REG(base, alarm)	 ((base) + (alarm) * sizeof(u32))
+
+struct armada38x_rtc_data {
+	/* Initialize the RTC-MBUS bridge timing */
+	void (*update_mbus_timing)(struct armada38x_rtc *rtc);
+	u32 (*read_rtc_reg)(struct armada38x_rtc *rtc, u8 rtc_reg);
+	void (*clear_isr)(struct armada38x_rtc *rtc);
+	void (*unmask_interrupt)(struct armada38x_rtc *rtc);
+	u32 alarm;
 };
 
 /*
  * According to the datasheet, the OS should wait 5us after every
  * register write to the RTC hard macro so that the required update
  * can occur without holding off the system bus
+ * According to errata RES-3124064, a write to any RTC
+ * register may fail. As a workaround, before writing to an
+ * RTC register, issue a dummy write of 0x0 twice to the
+ * RTC Status register.
  */
+
 static void rtc_delayed_write(u32 val, struct armada38x_rtc *rtc, int offset)
 {
+	writel(0, rtc->regs + RTC_STATUS);
+	writel(0, rtc->regs + RTC_STATUS);
 	writel(val, rtc->regs + offset);
 	udelay(5);
 }
 
+/* Update RTC-MBUS bridge timing parameters */
+static void rtc_update_38x_mbus_timing_params(struct armada38x_rtc *rtc)
+{
+	u32 reg;
+
+	reg = readl(rtc->regs_soc + RTC_38X_BRIDGE_TIMING_CTL);
+	reg &= ~RTC_38X_PERIOD_MASK;
+	reg |= 0x3FF << RTC_38X_PERIOD_OFFS; /* Maximum value */
+	reg &= ~RTC_38X_READ_DELAY_MASK;
+	reg |= 0x1F << RTC_38X_READ_DELAY_OFFS; /* Maximum value */
+	writel(reg, rtc->regs_soc + RTC_38X_BRIDGE_TIMING_CTL);
+}
+
+static void rtc_update_8k_mbus_timing_params(struct armada38x_rtc *rtc)
+{
+	u32 reg;
+
+	reg = readl(rtc->regs_soc + RTC_8K_BRIDGE_TIMING_CTL0);
+	reg &= ~RTC_8K_WRCLK_PERIOD_MASK;
+	reg |= 0x3FF << RTC_8K_WRCLK_PERIOD_OFFS;
+	reg &= ~RTC_8K_WRCLK_SETUP_MASK;
+	reg |= 0x29 << RTC_8K_WRCLK_SETUP_OFFS;
+	writel(reg, rtc->regs_soc + RTC_8K_BRIDGE_TIMING_CTL0);
+
+	reg = readl(rtc->regs_soc + RTC_8K_BRIDGE_TIMING_CTL1);
+	reg &= ~RTC_8K_READ_DELAY_MASK;
+	reg |= 0x3F << RTC_8K_READ_DELAY_OFFS;
+	writel(reg, rtc->regs_soc + RTC_8K_BRIDGE_TIMING_CTL1);
+}
+
+static u32 read_rtc_register(struct armada38x_rtc *rtc, u8 rtc_reg)
+{
+	return readl(rtc->regs + rtc_reg);
+}
+
+static u32 read_rtc_register_38x_wa(struct armada38x_rtc *rtc, u8 rtc_reg)
+{
+	int i, index_max = 0, max = 0;
+
+	for (i = 0; i < SAMPLE_NR; i++) {
+		rtc->val_to_freq[i].value = readl(rtc->regs + rtc_reg);
+		rtc->val_to_freq[i].freq = 0;
+	}
+
+	for (i = 0; i < SAMPLE_NR; i++) {
+		int j = 0;
+		u32 value = rtc->val_to_freq[i].value;
+
+		while (rtc->val_to_freq[j].freq) {
+			if (rtc->val_to_freq[j].value == value) {
+				rtc->val_to_freq[j].freq++;
+				break;
+			}
+			j++;
+		}
+
+		if (!rtc->val_to_freq[j].freq) {
+			rtc->val_to_freq[j].value = value;
+			rtc->val_to_freq[j].freq = 1;
+		}
+
+		if (rtc->val_to_freq[j].freq > max) {
+			index_max = j;
+			max = rtc->val_to_freq[j].freq;
+		}
+
+		/*
+		 * If a value already accounts for more than half of the
+		 * samples, it is the most frequent one: stop the search.
+		 */
+		if (max > SAMPLE_NR / 2)
+			break;
+	}
+
+	return rtc->val_to_freq[index_max].value;
+}
+
+static void armada38x_clear_isr(struct armada38x_rtc *rtc)
+{
+	u32 val = readl(rtc->regs_soc + SOC_RTC_INTERRUPT);
+
+	writel(val & ~SOC_RTC_ALARM1, rtc->regs_soc + SOC_RTC_INTERRUPT);
+}
+
+static void armada38x_unmask_interrupt(struct armada38x_rtc *rtc)
+{
+	u32 val = readl(rtc->regs_soc + SOC_RTC_INTERRUPT);
+
+	writel(val | SOC_RTC_ALARM1_MASK, rtc->regs_soc + SOC_RTC_INTERRUPT);
+}
+
+static void armada8k_clear_isr(struct armada38x_rtc *rtc)
+{
+	writel(RTC_8K_ALARM2, rtc->regs_soc + RTC_8K_ISR);
+}
+
+static void armada8k_unmask_interrupt(struct armada38x_rtc *rtc)
+{
+	writel(RTC_8K_ALARM2, rtc->regs_soc + RTC_8K_IMR);
+}
+
 static int armada38x_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
 	struct armada38x_rtc *rtc = dev_get_drvdata(dev);
-	unsigned long time, time_check, flags;
+	unsigned long time, flags;
 
 	spin_lock_irqsave(&rtc->lock, flags);
-	time = readl(rtc->regs + RTC_TIME);
-	/*
-	 * WA for failing time set attempts. As stated in HW ERRATA if
-	 * more than one second between two time reads is detected
-	 * then read once again.
-	 */
-	time_check = readl(rtc->regs + RTC_TIME);
-	if ((time_check - time) > 1)
-		time_check = readl(rtc->regs + RTC_TIME);
-
+	time = rtc->data->read_rtc_reg(rtc, RTC_TIME);
 	spin_unlock_irqrestore(&rtc->lock, flags);
 
-	rtc_time_to_tm(time_check, tm);
+	rtc_time_to_tm(time, tm);
 
 	return 0;
 }
@@ -87,16 +234,9 @@ static int armada38x_rtc_set_time(struct device *dev, struct rtc_time *tm)
 
 	if (ret)
 		goto out;
-	/*
-	 * According to errata FE-3124064, Write to RTC TIME register
-	 * may fail. As a workaround, after writing to RTC TIME
-	 * register, issue a dummy write of 0x0 twice to RTC Status
-	 * register.
-	 */
+
 	spin_lock_irqsave(&rtc->lock, flags);
 	rtc_delayed_write(time, rtc, RTC_TIME);
-	rtc_delayed_write(0, rtc, RTC_STATUS);
-	rtc_delayed_write(0, rtc, RTC_STATUS);
 	spin_unlock_irqrestore(&rtc->lock, flags);
 
 out:
@@ -107,12 +247,14 @@ static int armada38x_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 {
 	struct armada38x_rtc *rtc = dev_get_drvdata(dev);
 	unsigned long time, flags;
+	u32 reg = ALARM_REG(RTC_ALARM1, rtc->data->alarm);
+	u32 reg_irq = ALARM_REG(RTC_IRQ1_CONF, rtc->data->alarm);
 	u32 val;
 
 	spin_lock_irqsave(&rtc->lock, flags);
 
-	time = readl(rtc->regs + RTC_ALARM1);
-	val = readl(rtc->regs + RTC_IRQ1_CONF) & RTC_IRQ1_AL_EN;
+	time = rtc->data->read_rtc_reg(rtc, reg);
+	val = rtc->data->read_rtc_reg(rtc, reg_irq) & RTC_IRQ_AL_EN;
 
 	spin_unlock_irqrestore(&rtc->lock, flags);
 
@@ -125,9 +267,10 @@ static int armada38x_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 static int armada38x_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 {
 	struct armada38x_rtc *rtc = dev_get_drvdata(dev);
+	u32 reg = ALARM_REG(RTC_ALARM1, rtc->data->alarm);
+	u32 reg_irq = ALARM_REG(RTC_IRQ1_CONF, rtc->data->alarm);
 	unsigned long time, flags;
 	int ret = 0;
-	u32 val;
 
 	ret = rtc_tm_to_time(&alrm->time, &time);
 
@@ -136,13 +279,11 @@ static int armada38x_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 
 	spin_lock_irqsave(&rtc->lock, flags);
 
-	rtc_delayed_write(time, rtc, RTC_ALARM1);
+	rtc_delayed_write(time, rtc, reg);
 
 	if (alrm->enabled) {
-			rtc_delayed_write(RTC_IRQ1_AL_EN, rtc, RTC_IRQ1_CONF);
-			val = readl(rtc->regs_soc + SOC_RTC_INTERRUPT);
-			writel(val | SOC_RTC_ALARM1_MASK,
-			       rtc->regs_soc + SOC_RTC_INTERRUPT);
+		rtc_delayed_write(RTC_IRQ_AL_EN, rtc, reg_irq);
+		rtc->data->unmask_interrupt(rtc);
 	}
 
 	spin_unlock_irqrestore(&rtc->lock, flags);
@@ -155,14 +296,15 @@ static int armada38x_rtc_alarm_irq_enable(struct device *dev,
 					 unsigned int enabled)
 {
 	struct armada38x_rtc *rtc = dev_get_drvdata(dev);
+	u32 reg_irq = ALARM_REG(RTC_IRQ1_CONF, rtc->data->alarm);
 	unsigned long flags;
 
 	spin_lock_irqsave(&rtc->lock, flags);
 
 	if (enabled)
-		rtc_delayed_write(RTC_IRQ1_AL_EN, rtc, RTC_IRQ1_CONF);
+		rtc_delayed_write(RTC_IRQ_AL_EN, rtc, reg_irq);
 	else
-		rtc_delayed_write(0, rtc, RTC_IRQ1_CONF);
+		rtc_delayed_write(0, rtc, reg_irq);
 
 	spin_unlock_irqrestore(&rtc->lock, flags);
 
@@ -174,24 +316,23 @@ static irqreturn_t armada38x_rtc_alarm_irq(int irq, void *data)
 	struct armada38x_rtc *rtc = data;
 	u32 val;
 	int event = RTC_IRQF | RTC_AF;
+	u32 reg_irq = ALARM_REG(RTC_IRQ1_CONF, rtc->data->alarm);
 
 	dev_dbg(&rtc->rtc_dev->dev, "%s:irq(%d)\n", __func__, irq);
 
 	spin_lock(&rtc->lock);
 
-	val = readl(rtc->regs_soc + SOC_RTC_INTERRUPT);
-
-	writel(val & ~SOC_RTC_ALARM1, rtc->regs_soc + SOC_RTC_INTERRUPT);
-	val = readl(rtc->regs + RTC_IRQ1_CONF);
-	/* disable all the interrupts for alarm 1 */
-	rtc_delayed_write(0, rtc, RTC_IRQ1_CONF);
+	rtc->data->clear_isr(rtc);
+	val = rtc->data->read_rtc_reg(rtc, reg_irq);
+	/* disable all the interrupts for the alarm */
+	rtc_delayed_write(0, rtc, reg_irq);
 	/* Ack the event */
-	rtc_delayed_write(RTC_STATUS_ALARM1, rtc, RTC_STATUS);
+	rtc_delayed_write(1 << rtc->data->alarm, rtc, RTC_STATUS);
 
 	spin_unlock(&rtc->lock);
 
-	if (val & RTC_IRQ1_FREQ_EN) {
-		if (val & RTC_IRQ1_FREQ_1HZ)
+	if (val & RTC_IRQ_FREQ_EN) {
+		if (val & RTC_IRQ_FREQ_1HZ)
 			event |= RTC_UF;
 		else
 			event |= RTC_PF;
@@ -202,7 +343,7 @@ static irqreturn_t armada38x_rtc_alarm_irq(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static struct rtc_class_ops armada38x_rtc_ops = {
+static const struct rtc_class_ops armada38x_rtc_ops = {
 	.read_time = armada38x_rtc_read_time,
 	.set_time = armada38x_rtc_set_time,
 	.read_alarm = armada38x_rtc_read_alarm,
@@ -210,17 +351,65 @@ static struct rtc_class_ops armada38x_rtc_ops = {
 	.alarm_irq_enable = armada38x_rtc_alarm_irq_enable,
 };
 
+static const struct rtc_class_ops armada38x_rtc_ops_noirq = {
+	.read_time = armada38x_rtc_read_time,
+	.set_time = armada38x_rtc_set_time,
+	.read_alarm = armada38x_rtc_read_alarm,
+};
+
+static const struct armada38x_rtc_data armada38x_data = {
+	.update_mbus_timing = rtc_update_38x_mbus_timing_params,
+	.read_rtc_reg = read_rtc_register_38x_wa,
+	.clear_isr = armada38x_clear_isr,
+	.unmask_interrupt = armada38x_unmask_interrupt,
+	.alarm = ALARM1,
+};
+
+static const struct armada38x_rtc_data armada8k_data = {
+	.update_mbus_timing = rtc_update_8k_mbus_timing_params,
+	.read_rtc_reg = read_rtc_register,
+	.clear_isr = armada8k_clear_isr,
+	.unmask_interrupt = armada8k_unmask_interrupt,
+	.alarm = ALARM2,
+};
+
+#ifdef CONFIG_OF
+static const struct of_device_id armada38x_rtc_of_match_table[] = {
+	{
+		.compatible = "marvell,armada-380-rtc",
+		.data = &armada38x_data,
+	},
+	{
+		.compatible = "marvell,armada-8k-rtc",
+		.data = &armada8k_data,
+	},
+	{}
+};
+MODULE_DEVICE_TABLE(of, armada38x_rtc_of_match_table);
+#endif
+
 static __init int armada38x_rtc_probe(struct platform_device *pdev)
 {
+	const struct rtc_class_ops *ops;
 	struct resource *res;
 	struct armada38x_rtc *rtc;
+	const struct of_device_id *match;
 	int ret;
 
+	match = of_match_device(armada38x_rtc_of_match_table, &pdev->dev);
+	if (!match)
+		return -ENODEV;
+
 	rtc = devm_kzalloc(&pdev->dev, sizeof(struct armada38x_rtc),
 			    GFP_KERNEL);
 	if (!rtc)
 		return -ENOMEM;
 
+	rtc->val_to_freq = devm_kcalloc(&pdev->dev, SAMPLE_NR,
+				sizeof(struct value_to_freq), GFP_KERNEL);
+	if (!rtc->val_to_freq)
+		return -ENOMEM;
+
 	spin_lock_init(&rtc->lock);
 
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "rtc");
@@ -242,19 +431,27 @@ static __init int armada38x_rtc_probe(struct platform_device *pdev)
 				0, pdev->name, rtc) < 0) {
 		dev_warn(&pdev->dev, "Interrupt not available.\n");
 		rtc->irq = -1;
+	}
+	platform_set_drvdata(pdev, rtc);
+
+	if (rtc->irq != -1) {
+		device_init_wakeup(&pdev->dev, 1);
+		ops = &armada38x_rtc_ops;
+	} else {
 		/*
 		 * If there is no interrupt available then we can't
 		 * use the alarm
 		 */
-		armada38x_rtc_ops.set_alarm = NULL;
-		armada38x_rtc_ops.alarm_irq_enable = NULL;
+		ops = &armada38x_rtc_ops_noirq;
 	}
-	platform_set_drvdata(pdev, rtc);
-	if (rtc->irq != -1)
-		device_init_wakeup(&pdev->dev, 1);
+	rtc->data = (struct armada38x_rtc_data *)match->data;
+
+	/* Update RTC-MBUS bridge timing parameters */
+	rtc->data->update_mbus_timing(rtc);
 
 	rtc->rtc_dev = devm_rtc_device_register(&pdev->dev, pdev->name,
-					&armada38x_rtc_ops, THIS_MODULE);
+						ops, THIS_MODULE);
 	if (IS_ERR(rtc->rtc_dev)) {
 		ret = PTR_ERR(rtc->rtc_dev);
 		dev_err(&pdev->dev, "Failed to register RTC device: %d\n", ret);
@@ -280,6 +477,9 @@ static int armada38x_rtc_resume(struct device *dev)
 	if (device_may_wakeup(dev)) {
 		struct armada38x_rtc *rtc = dev_get_drvdata(dev);
 
+		/* Update RTC-MBUS bridge timing parameters */
+		rtc->data->update_mbus_timing(rtc);
+
 		return disable_irq_wake(rtc->irq);
 	}
 
@@ -290,14 +490,6 @@ static int armada38x_rtc_resume(struct device *dev)
 static SIMPLE_DEV_PM_OPS(armada38x_rtc_pm_ops,
 			 armada38x_rtc_suspend, armada38x_rtc_resume);
 
-#ifdef CONFIG_OF
-static const struct of_device_id armada38x_rtc_of_match_table[] = {
-	{ .compatible = "marvell,armada-380-rtc", },
-	{}
-};
-MODULE_DEVICE_TABLE(of, armada38x_rtc_of_match_table);
-#endif
-
 static struct platform_driver armada38x_rtc_driver = {
 	.driver		= {
 		.name	= "armada38x-rtc",
diff --git a/drivers/rtc/rtc-au1xxx.c b/drivers/rtc/rtc-au1xxx.c
index 84d6e02..2ba44cc 100644
--- a/drivers/rtc/rtc-au1xxx.c
+++ b/drivers/rtc/rtc-au1xxx.c
@@ -56,7 +56,7 @@ static int au1xtoy_rtc_set_time(struct device *dev, struct rtc_time *tm)
 	return 0;
 }
 
-static struct rtc_class_ops au1xtoy_rtc_ops = {
+static const struct rtc_class_ops au1xtoy_rtc_ops = {
 	.read_time	= au1xtoy_rtc_read_time,
 	.set_time	= au1xtoy_rtc_set_time,
 };
diff --git a/drivers/rtc/rtc-bfin.c b/drivers/rtc/rtc-bfin.c
index 535a5f9..15344b7 100644
--- a/drivers/rtc/rtc-bfin.c
+++ b/drivers/rtc/rtc-bfin.c
@@ -333,7 +333,7 @@ static int bfin_rtc_proc(struct device *dev, struct seq_file *seq)
 #undef yesno
 }
 
-static struct rtc_class_ops bfin_rtc_ops = {
+static const struct rtc_class_ops bfin_rtc_ops = {
 	.read_time     = bfin_rtc_read_time,
 	.set_time      = bfin_rtc_set_time,
 	.read_alarm    = bfin_rtc_read_alarm,
diff --git a/drivers/rtc/rtc-bq32k.c b/drivers/rtc/rtc-bq32k.c
index 3977424..2b22393 100644
--- a/drivers/rtc/rtc-bq32k.c
+++ b/drivers/rtc/rtc-bq32k.c
@@ -34,6 +34,7 @@
 #define BQ32K_CALIBRATION	0x07	/* CAL_CFG1, calibration and control */
 #define BQ32K_TCH2		0x08	/* Trickle charge enable */
 #define BQ32K_CFG2		0x09	/* Trickle charger control */
+#define BQ32K_TCFE		BIT(6)	/* Trickle charge FET bypass */
 
 struct bq32k_regs {
 	uint8_t		seconds;
@@ -188,6 +189,65 @@ static int trickle_charger_of_init(struct device *dev, struct device_node *node)
 	return 0;
 }
 
+static ssize_t bq32k_sysfs_show_tricklecharge_bypass(struct device *dev,
+					       struct device_attribute *attr,
+					       char *buf)
+{
+	int reg, error;
+
+	error = bq32k_read(dev, &reg, BQ32K_CFG2, 1);
+	if (error)
+		return error;
+
+	return sprintf(buf, "%d\n", (reg & BQ32K_TCFE) ? 1 : 0);
+}
+
+static ssize_t bq32k_sysfs_store_tricklecharge_bypass(struct device *dev,
+						struct device_attribute *attr,
+						const char *buf, size_t count)
+{
+	int reg, enable, error;
+
+	if (kstrtoint(buf, 0, &enable))
+		return -EINVAL;
+
+	error = bq32k_read(dev, &reg, BQ32K_CFG2, 1);
+	if (error)
+		return error;
+
+	if (enable) {
+		reg |= BQ32K_TCFE;
+		error = bq32k_write(dev, &reg, BQ32K_CFG2, 1);
+		if (error)
+			return error;
+
+		dev_info(dev, "Enabled trickle charge FET bypass.\n");
+	} else {
+		reg &= ~BQ32K_TCFE;
+		error = bq32k_write(dev, &reg, BQ32K_CFG2, 1);
+		if (error)
+			return error;
+
+		dev_info(dev, "Disabled trickle charge FET bypass.\n");
+	}
+
+	return count;
+}
+
+static DEVICE_ATTR(trickle_charge_bypass, 0644,
+		   bq32k_sysfs_show_tricklecharge_bypass,
+		   bq32k_sysfs_store_tricklecharge_bypass);
+
+static int bq32k_sysfs_register(struct device *dev)
+{
+	return device_create_file(dev, &dev_attr_trickle_charge_bypass);
+}
+
+static void bq32k_sysfs_unregister(struct device *dev)
+{
+	device_remove_file(dev, &dev_attr_trickle_charge_bypass);
+}
+
 static int bq32k_probe(struct i2c_client *client,
 				const struct i2c_device_id *id)
 {
@@ -224,11 +284,26 @@ static int bq32k_probe(struct i2c_client *client,
 	if (IS_ERR(rtc))
 		return PTR_ERR(rtc);
 
+	error = bq32k_sysfs_register(&client->dev);
+	if (error) {
+		dev_err(&client->dev,
+			"Unable to create sysfs entries for rtc bq32000\n");
+		return error;
+	}
+
 	i2c_set_clientdata(client, rtc);
 
 	return 0;
 }
 
+static int bq32k_remove(struct i2c_client *client)
+{
+	bq32k_sysfs_unregister(&client->dev);
+
+	return 0;
+}
+
 static const struct i2c_device_id bq32k_id[] = {
 	{ "bq32000", 0 },
 	{ }
@@ -240,6 +315,7 @@ static struct i2c_driver bq32k_driver = {
 		.name	= "bq32k",
 	},
 	.probe		= bq32k_probe,
+	.remove		= bq32k_remove,
 	.id_table	= bq32k_id,
 };
 
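A small userspace sketch of toggling the new attribute; the exact sysfs path depends on the I2C bus number and device address, so the one used here is illustrative:

	#include <stdio.h>

	int main(void)
	{
		/* illustrative path: bus 1, slave address 0x68 */
		FILE *f = fopen("/sys/bus/i2c/devices/1-0068/trickle_charge_bypass", "w");
		int ret = 1;

		if (!f)
			return ret;
		if (fputs("1\n", f) >= 0)	/* 1 enables the FET bypass, 0 disables it */
			ret = 0;
		return fclose(f) ? 1 : ret;
	}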
diff --git a/drivers/rtc/rtc-dm355evm.c b/drivers/rtc/rtc-dm355evm.c
index 94067f8..f225cd8 100644
--- a/drivers/rtc/rtc-dm355evm.c
+++ b/drivers/rtc/rtc-dm355evm.c
@@ -116,7 +116,7 @@ static int dm355evm_rtc_set_time(struct device *dev, struct rtc_time *tm)
 	return 0;
 }
 
-static struct rtc_class_ops dm355evm_rtc_ops = {
+static const struct rtc_class_ops dm355evm_rtc_ops = {
 	.read_time	= dm355evm_rtc_read_time,
 	.set_time	= dm355evm_rtc_set_time,
 };
diff --git a/drivers/rtc/rtc-ds3232.c b/drivers/rtc/rtc-ds3232.c
index b1f20d8..9bb39a0 100644
--- a/drivers/rtc/rtc-ds3232.c
+++ b/drivers/rtc/rtc-ds3232.c
@@ -23,28 +23,28 @@
 #include <linux/slab.h>
 #include <linux/regmap.h>
 
-#define DS3232_REG_SECONDS	0x00
-#define DS3232_REG_MINUTES	0x01
-#define DS3232_REG_HOURS	0x02
-#define DS3232_REG_AMPM		0x02
-#define DS3232_REG_DAY		0x03
-#define DS3232_REG_DATE		0x04
-#define DS3232_REG_MONTH	0x05
-#define DS3232_REG_CENTURY	0x05
-#define DS3232_REG_YEAR		0x06
-#define DS3232_REG_ALARM1         0x07	/* Alarm 1 BASE */
-#define DS3232_REG_ALARM2         0x0B	/* Alarm 2 BASE */
-#define DS3232_REG_CR		0x0E	/* Control register */
-#	define DS3232_REG_CR_nEOSC        0x80
-#       define DS3232_REG_CR_INTCN        0x04
-#       define DS3232_REG_CR_A2IE        0x02
-#       define DS3232_REG_CR_A1IE        0x01
+#define DS3232_REG_SECONDS      0x00
+#define DS3232_REG_MINUTES      0x01
+#define DS3232_REG_HOURS        0x02
+#define DS3232_REG_AMPM         0x02
+#define DS3232_REG_DAY          0x03
+#define DS3232_REG_DATE         0x04
+#define DS3232_REG_MONTH        0x05
+#define DS3232_REG_CENTURY      0x05
+#define DS3232_REG_YEAR         0x06
+#define DS3232_REG_ALARM1       0x07       /* Alarm 1 BASE */
+#define DS3232_REG_ALARM2       0x0B       /* Alarm 2 BASE */
+#define DS3232_REG_CR           0x0E       /* Control register */
+#       define DS3232_REG_CR_nEOSC   0x80
+#       define DS3232_REG_CR_INTCN   0x04
+#       define DS3232_REG_CR_A2IE    0x02
+#       define DS3232_REG_CR_A1IE    0x01
 
-#define DS3232_REG_SR	0x0F	/* control/status register */
-#	define DS3232_REG_SR_OSF   0x80
-#       define DS3232_REG_SR_BSY   0x04
-#       define DS3232_REG_SR_A2F   0x02
-#       define DS3232_REG_SR_A1F   0x01
+#define DS3232_REG_SR           0x0F       /* control/status register */
+#       define DS3232_REG_SR_OSF     0x80
+#       define DS3232_REG_SR_BSY     0x04
+#       define DS3232_REG_SR_A2F     0x02
+#       define DS3232_REG_SR_A1F     0x01
 
 struct ds3232 {
 	struct device *dev;
@@ -363,6 +363,9 @@ static int ds3232_probe(struct device *dev, struct regmap *regmap, int irq,
 	if (ret)
 		return ret;
 
+	if (ds3232->irq > 0)
+		device_init_wakeup(dev, 1);
+
 	ds3232->rtc = devm_rtc_device_register(dev, name, &ds3232_rtc_ops,
 						THIS_MODULE);
 	if (IS_ERR(ds3232->rtc))
@@ -374,10 +377,10 @@ static int ds3232_probe(struct device *dev, struct regmap *regmap, int irq,
 						IRQF_SHARED | IRQF_ONESHOT,
 						name, dev);
 		if (ret) {
+			device_set_wakeup_capable(dev, 0);
 			ds3232->irq = 0;
 			dev_err(dev, "unable to request IRQ\n");
-		} else
-			device_init_wakeup(dev, 1);
+		}
 	}
 
 	return 0;
@@ -420,6 +423,7 @@ static int ds3232_i2c_probe(struct i2c_client *client,
 	static const struct regmap_config config = {
 		.reg_bits = 8,
 		.val_bits = 8,
+		.max_register = 0x13,
 	};
 
 	regmap = devm_regmap_init_i2c(client, &config);
@@ -479,6 +483,7 @@ static int ds3234_probe(struct spi_device *spi)
 	static const struct regmap_config config = {
 		.reg_bits = 8,
 		.val_bits = 8,
+		.max_register = 0x13,
 		.write_flag_mask = 0x80,
 	};
 	struct regmap *regmap;
diff --git a/drivers/rtc/rtc-gemini.c b/drivers/rtc/rtc-gemini.c
index 688debc..ccf0dba 100644
--- a/drivers/rtc/rtc-gemini.c
+++ b/drivers/rtc/rtc-gemini.c
@@ -159,9 +159,16 @@ static int gemini_rtc_remove(struct platform_device *pdev)
 	return 0;
 }
 
+static const struct of_device_id gemini_rtc_dt_match[] = {
+	{ .compatible = "cortina,gemini-rtc" },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, gemini_rtc_dt_match);
+
 static struct platform_driver gemini_rtc_driver = {
 	.driver		= {
 		.name	= DRV_NAME,
+		.of_match_table = gemini_rtc_dt_match,
 	},
 	.probe		= gemini_rtc_probe,
 	.remove		= gemini_rtc_remove,
diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c
index 67b56b8..6b54f6c 100644
--- a/drivers/rtc/rtc-imxdi.c
+++ b/drivers/rtc/rtc-imxdi.c
@@ -108,7 +108,6 @@
  * @pdev: pionter to platform dev
  * @rtc: pointer to rtc struct
  * @ioaddr: IO registers pointer
- * @irq: dryice normal interrupt
  * @clk: input reference clock
  * @dsr: copy of the DSR register
  * @irq_lock: interrupt enable register (DIER) lock
@@ -120,7 +119,6 @@ struct imxdi_dev {
 	struct platform_device *pdev;
 	struct rtc_device *rtc;
 	void __iomem *ioaddr;
-	int irq;
 	struct clk *clk;
 	u32 dsr;
 	spinlock_t irq_lock;
@@ -668,7 +666,7 @@ static int dryice_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm)
 	return 0;
 }
 
-static struct rtc_class_ops dryice_rtc_ops = {
+static const struct rtc_class_ops dryice_rtc_ops = {
 	.read_time		= dryice_rtc_read_time,
 	.set_mmss		= dryice_rtc_set_mmss,
 	.alarm_irq_enable	= dryice_rtc_alarm_irq_enable,
@@ -677,9 +675,9 @@ static struct rtc_class_ops dryice_rtc_ops = {
 };
 
 /*
- * dryice "normal" interrupt handler
+ * interrupt handler for dryice "normal" and security violation interrupt
  */
-static irqreturn_t dryice_norm_irq(int irq, void *dev_id)
+static irqreturn_t dryice_irq(int irq, void *dev_id)
 {
 	struct imxdi_dev *imxdi = dev_id;
 	u32 dsr, dier;
@@ -765,6 +763,7 @@ static int __init dryice_rtc_probe(struct platform_device *pdev)
 {
 	struct resource *res;
 	struct imxdi_dev *imxdi;
+	int norm_irq, sec_irq;
 	int rc;
 
 	imxdi = devm_kzalloc(&pdev->dev, sizeof(*imxdi), GFP_KERNEL);
@@ -780,9 +779,16 @@ static int __init dryice_rtc_probe(struct platform_device *pdev)
 
 	spin_lock_init(&imxdi->irq_lock);
 
-	imxdi->irq = platform_get_irq(pdev, 0);
-	if (imxdi->irq < 0)
-		return imxdi->irq;
+	norm_irq = platform_get_irq(pdev, 0);
+	if (norm_irq < 0)
+		return norm_irq;
+
+	/* the 2nd irq is the security violation irq;
+	 * keep it optional so we don't break the device tree ABI
+	 */
+	sec_irq = platform_get_irq(pdev, 1);
+	if (sec_irq <= 0)
+		sec_irq = IRQ_NOTCONNECTED;
 
 	init_waitqueue_head(&imxdi->write_wait);
 
@@ -808,13 +814,20 @@ static int __init dryice_rtc_probe(struct platform_device *pdev)
 	if (rc != 0)
 		goto err;
 
-	rc = devm_request_irq(&pdev->dev, imxdi->irq, dryice_norm_irq,
-			IRQF_SHARED, pdev->name, imxdi);
+	rc = devm_request_irq(&pdev->dev, norm_irq, dryice_irq,
+			      IRQF_SHARED, pdev->name, imxdi);
 	if (rc) {
 		dev_warn(&pdev->dev, "interrupt not available.\n");
 		goto err;
 	}
 
+	rc = devm_request_irq(&pdev->dev, sec_irq, dryice_irq,
+			      IRQF_SHARED, pdev->name, imxdi);
+	if (rc) {
+		dev_warn(&pdev->dev, "security violation interrupt not available.\n");
+		/* this is not an error, see above */
+	}
+
 	platform_set_drvdata(pdev, imxdi);
 	imxdi->rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
 				  &dryice_rtc_ops, THIS_MODULE);
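
The IRQ_NOTCONNECTED fallback is what keeps the new, optional security violation interrupt from breaking old device trees: request_threaded_irq() (and therefore devm_request_irq()) returns -ENOTCONN for that value, so probe logs a warning and carries on instead of failing hard. A minimal sketch of the pattern, with hypothetical names:

#include <linux/interrupt.h>
#include <linux/platform_device.h>

static irqreturn_t foo_irq(int irq, void *dev_id)
{
	return IRQ_HANDLED;	/* stub */
}

static int foo_request_optional_irq(struct platform_device *pdev, void *priv)
{
	int opt_irq = platform_get_irq(pdev, 1);

	/* old DTs describe no second interrupt: degrade gracefully */
	if (opt_irq <= 0)
		opt_irq = IRQ_NOTCONNECTED;

	/* yields -ENOTCONN for IRQ_NOTCONNECTED rather than an oops */
	if (devm_request_irq(&pdev->dev, opt_irq, foo_irq,
			     IRQF_SHARED, pdev->name, priv))
		dev_warn(&pdev->dev, "optional irq not available\n");

	return 0;
}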
diff --git a/drivers/rtc/rtc-ls1x.c b/drivers/rtc/rtc-ls1x.c
index 22a9ec4..e04ca54 100644
--- a/drivers/rtc/rtc-ls1x.c
+++ b/drivers/rtc/rtc-ls1x.c
@@ -138,7 +138,7 @@ static int ls1x_rtc_set_time(struct device *dev, struct  rtc_time *rtm)
 	return ret;
 }
 
-static struct rtc_class_ops  ls1x_rtc_ops = {
+static const struct rtc_class_ops ls1x_rtc_ops = {
 	.read_time	= ls1x_rtc_read_time,
 	.set_time	= ls1x_rtc_set_time,
 };
diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c
index 0eeb571..02af045 100644
--- a/drivers/rtc/rtc-m48t86.c
+++ b/drivers/rtc/rtc-m48t86.c
@@ -16,62 +16,88 @@
 #include <linux/module.h>
 #include <linux/rtc.h>
 #include <linux/platform_device.h>
-#include <linux/platform_data/rtc-m48t86.h>
 #include <linux/bcd.h>
+#include <linux/io.h>
 
-#define M48T86_REG_SEC		0x00
-#define M48T86_REG_SECALRM	0x01
-#define M48T86_REG_MIN		0x02
-#define M48T86_REG_MINALRM	0x03
-#define M48T86_REG_HOUR		0x04
-#define M48T86_REG_HOURALRM	0x05
-#define M48T86_REG_DOW		0x06 /* 1 = sunday */
-#define M48T86_REG_DOM		0x07
-#define M48T86_REG_MONTH	0x08 /* 1 - 12 */
-#define M48T86_REG_YEAR		0x09 /* 0 - 99 */
-#define M48T86_REG_A		0x0A
-#define M48T86_REG_B		0x0B
-#define M48T86_REG_C		0x0C
-#define M48T86_REG_D		0x0D
+#define M48T86_SEC		0x00
+#define M48T86_SECALRM		0x01
+#define M48T86_MIN		0x02
+#define M48T86_MINALRM		0x03
+#define M48T86_HOUR		0x04
+#define M48T86_HOURALRM		0x05
+#define M48T86_DOW		0x06 /* 1 = sunday */
+#define M48T86_DOM		0x07
+#define M48T86_MONTH		0x08 /* 1 - 12 */
+#define M48T86_YEAR		0x09 /* 0 - 99 */
+#define M48T86_A		0x0a
+#define M48T86_B		0x0b
+#define M48T86_B_SET		BIT(7)
+#define M48T86_B_DM		BIT(2)
+#define M48T86_B_H24		BIT(1)
+#define M48T86_C		0x0c
+#define M48T86_D		0x0d
+#define M48T86_D_VRT		BIT(7)
+#define M48T86_NVRAM(x)		(0x0e + (x))
+#define M48T86_NVRAM_LEN	114
 
-#define M48T86_REG_B_H24	(1 << 1)
-#define M48T86_REG_B_DM		(1 << 2)
-#define M48T86_REG_B_SET	(1 << 7)
-#define M48T86_REG_D_VRT	(1 << 7)
+struct m48t86_rtc_info {
+	void __iomem *index_reg;
+	void __iomem *data_reg;
+	struct rtc_device *rtc;
+};
+
+static unsigned char m48t86_readb(struct device *dev, unsigned long addr)
+{
+	struct m48t86_rtc_info *info = dev_get_drvdata(dev);
+	unsigned char value;
+
+	writeb(addr, info->index_reg);
+	value = readb(info->data_reg);
+
+	return value;
+}
+
+static void m48t86_writeb(struct device *dev,
+			  unsigned char value, unsigned long addr)
+{
+	struct m48t86_rtc_info *info = dev_get_drvdata(dev);
+
+	writeb(addr, info->index_reg);
+	writeb(value, info->data_reg);
+}
 
 static int m48t86_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
 	unsigned char reg;
-	struct platform_device *pdev = to_platform_device(dev);
-	struct m48t86_ops *ops = dev_get_platdata(&pdev->dev);
 
-	reg = ops->readbyte(M48T86_REG_B);
+	reg = m48t86_readb(dev, M48T86_B);
 
-	if (reg & M48T86_REG_B_DM) {
+	if (reg & M48T86_B_DM) {
 		/* data (binary) mode */
-		tm->tm_sec	= ops->readbyte(M48T86_REG_SEC);
-		tm->tm_min	= ops->readbyte(M48T86_REG_MIN);
-		tm->tm_hour	= ops->readbyte(M48T86_REG_HOUR) & 0x3F;
-		tm->tm_mday	= ops->readbyte(M48T86_REG_DOM);
+		tm->tm_sec	= m48t86_readb(dev, M48T86_SEC);
+		tm->tm_min	= m48t86_readb(dev, M48T86_MIN);
+		tm->tm_hour	= m48t86_readb(dev, M48T86_HOUR) & 0x3f;
+		tm->tm_mday	= m48t86_readb(dev, M48T86_DOM);
 		/* tm_mon is 0-11 */
-		tm->tm_mon	= ops->readbyte(M48T86_REG_MONTH) - 1;
-		tm->tm_year	= ops->readbyte(M48T86_REG_YEAR) + 100;
-		tm->tm_wday	= ops->readbyte(M48T86_REG_DOW);
+		tm->tm_mon	= m48t86_readb(dev, M48T86_MONTH) - 1;
+		tm->tm_year	= m48t86_readb(dev, M48T86_YEAR) + 100;
+		tm->tm_wday	= m48t86_readb(dev, M48T86_DOW);
 	} else {
 		/* bcd mode */
-		tm->tm_sec	= bcd2bin(ops->readbyte(M48T86_REG_SEC));
-		tm->tm_min	= bcd2bin(ops->readbyte(M48T86_REG_MIN));
-		tm->tm_hour	= bcd2bin(ops->readbyte(M48T86_REG_HOUR) & 0x3F);
-		tm->tm_mday	= bcd2bin(ops->readbyte(M48T86_REG_DOM));
+		tm->tm_sec	= bcd2bin(m48t86_readb(dev, M48T86_SEC));
+		tm->tm_min	= bcd2bin(m48t86_readb(dev, M48T86_MIN));
+		tm->tm_hour	= bcd2bin(m48t86_readb(dev, M48T86_HOUR) &
+					  0x3f);
+		tm->tm_mday	= bcd2bin(m48t86_readb(dev, M48T86_DOM));
 		/* tm_mon is 0-11 */
-		tm->tm_mon	= bcd2bin(ops->readbyte(M48T86_REG_MONTH)) - 1;
-		tm->tm_year	= bcd2bin(ops->readbyte(M48T86_REG_YEAR)) + 100;
-		tm->tm_wday	= bcd2bin(ops->readbyte(M48T86_REG_DOW));
+		tm->tm_mon	= bcd2bin(m48t86_readb(dev, M48T86_MONTH)) - 1;
+		tm->tm_year	= bcd2bin(m48t86_readb(dev, M48T86_YEAR)) + 100;
+		tm->tm_wday	= bcd2bin(m48t86_readb(dev, M48T86_DOW));
 	}
 
 	/* correct the hour if the clock is in 12h mode */
-	if (!(reg & M48T86_REG_B_H24))
-		if (ops->readbyte(M48T86_REG_HOUR) & 0x80)
+	if (!(reg & M48T86_B_H24))
+		if (m48t86_readb(dev, M48T86_HOUR) & 0x80)
 			tm->tm_hour += 12;
 
 	return rtc_valid_tm(tm);
@@ -80,38 +106,36 @@ static int m48t86_rtc_read_time(struct device *dev, struct rtc_time *tm)
 static int m48t86_rtc_set_time(struct device *dev, struct rtc_time *tm)
 {
 	unsigned char reg;
-	struct platform_device *pdev = to_platform_device(dev);
-	struct m48t86_ops *ops = dev_get_platdata(&pdev->dev);
 
-	reg = ops->readbyte(M48T86_REG_B);
+	reg = m48t86_readb(dev, M48T86_B);
 
 	/* update flag and 24h mode */
-	reg |= M48T86_REG_B_SET | M48T86_REG_B_H24;
-	ops->writebyte(reg, M48T86_REG_B);
+	reg |= M48T86_B_SET | M48T86_B_H24;
+	m48t86_writeb(dev, reg, M48T86_B);
 
-	if (reg & M48T86_REG_B_DM) {
+	if (reg & M48T86_B_DM) {
 		/* data (binary) mode */
-		ops->writebyte(tm->tm_sec, M48T86_REG_SEC);
-		ops->writebyte(tm->tm_min, M48T86_REG_MIN);
-		ops->writebyte(tm->tm_hour, M48T86_REG_HOUR);
-		ops->writebyte(tm->tm_mday, M48T86_REG_DOM);
-		ops->writebyte(tm->tm_mon + 1, M48T86_REG_MONTH);
-		ops->writebyte(tm->tm_year % 100, M48T86_REG_YEAR);
-		ops->writebyte(tm->tm_wday, M48T86_REG_DOW);
+		m48t86_writeb(dev, tm->tm_sec, M48T86_SEC);
+		m48t86_writeb(dev, tm->tm_min, M48T86_MIN);
+		m48t86_writeb(dev, tm->tm_hour, M48T86_HOUR);
+		m48t86_writeb(dev, tm->tm_mday, M48T86_DOM);
+		m48t86_writeb(dev, tm->tm_mon + 1, M48T86_MONTH);
+		m48t86_writeb(dev, tm->tm_year % 100, M48T86_YEAR);
+		m48t86_writeb(dev, tm->tm_wday, M48T86_DOW);
 	} else {
 		/* bcd mode */
-		ops->writebyte(bin2bcd(tm->tm_sec), M48T86_REG_SEC);
-		ops->writebyte(bin2bcd(tm->tm_min), M48T86_REG_MIN);
-		ops->writebyte(bin2bcd(tm->tm_hour), M48T86_REG_HOUR);
-		ops->writebyte(bin2bcd(tm->tm_mday), M48T86_REG_DOM);
-		ops->writebyte(bin2bcd(tm->tm_mon + 1), M48T86_REG_MONTH);
-		ops->writebyte(bin2bcd(tm->tm_year % 100), M48T86_REG_YEAR);
-		ops->writebyte(bin2bcd(tm->tm_wday), M48T86_REG_DOW);
+		m48t86_writeb(dev, bin2bcd(tm->tm_sec), M48T86_SEC);
+		m48t86_writeb(dev, bin2bcd(tm->tm_min), M48T86_MIN);
+		m48t86_writeb(dev, bin2bcd(tm->tm_hour), M48T86_HOUR);
+		m48t86_writeb(dev, bin2bcd(tm->tm_mday), M48T86_DOM);
+		m48t86_writeb(dev, bin2bcd(tm->tm_mon + 1), M48T86_MONTH);
+		m48t86_writeb(dev, bin2bcd(tm->tm_year % 100), M48T86_YEAR);
+		m48t86_writeb(dev, bin2bcd(tm->tm_wday), M48T86_DOW);
 	}
 
 	/* update ended */
-	reg &= ~M48T86_REG_B_SET;
-	ops->writebyte(reg, M48T86_REG_B);
+	reg &= ~M48T86_B_SET;
+	m48t86_writeb(dev, reg, M48T86_B);
 
 	return 0;
 }
@@ -119,18 +143,16 @@ static int m48t86_rtc_set_time(struct device *dev, struct rtc_time *tm)
 static int m48t86_rtc_proc(struct device *dev, struct seq_file *seq)
 {
 	unsigned char reg;
-	struct platform_device *pdev = to_platform_device(dev);
-	struct m48t86_ops *ops = dev_get_platdata(&pdev->dev);
 
-	reg = ops->readbyte(M48T86_REG_B);
+	reg = m48t86_readb(dev, M48T86_B);
 
 	seq_printf(seq, "mode\t\t: %s\n",
-		 (reg & M48T86_REG_B_DM) ? "binary" : "bcd");
+		   (reg & M48T86_B_DM) ? "binary" : "bcd");
 
-	reg = ops->readbyte(M48T86_REG_D);
+	reg = m48t86_readb(dev, M48T86_D);
 
 	seq_printf(seq, "battery\t\t: %s\n",
-		 (reg & M48T86_REG_D_VRT) ? "ok" : "exhausted");
+		   (reg & M48T86_D_VRT) ? "ok" : "exhausted");
 
 	return 0;
 }
@@ -141,25 +163,116 @@ static const struct rtc_class_ops m48t86_rtc_ops = {
 	.proc		= m48t86_rtc_proc,
 };
 
-static int m48t86_rtc_probe(struct platform_device *dev)
+static ssize_t m48t86_nvram_read(struct file *filp, struct kobject *kobj,
+				 struct bin_attribute *attr,
+				 char *buf, loff_t off, size_t count)
 {
+	struct device *dev = kobj_to_dev(kobj);
+	unsigned int i;
+
+	for (i = 0; i < count; i++)
+		buf[i] = m48t86_readb(dev, M48T86_NVRAM(off + i));
+
+	return count;
+}
+
+static ssize_t m48t86_nvram_write(struct file *filp, struct kobject *kobj,
+				  struct bin_attribute *attr,
+				  char *buf, loff_t off, size_t count)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	unsigned int i;
+
+	for (i = 0; i < count; i++)
+		m48t86_writeb(dev, buf[i], M48T86_NVRAM(off + i));
+
+	return count;
+}
+
+static BIN_ATTR(nvram, 0644, m48t86_nvram_read, m48t86_nvram_write,
+		M48T86_NVRAM_LEN);
+
+/*
+ * The RTC is an optional feature at purchase time on some Technologic Systems
+ * boards. Verify that it actually exists by checking if the last two bytes
+ * of the NVRAM can be changed.
+ *
+ * This is based on the method used in their rtc7800.c example.
+ */
+static bool m48t86_verify_chip(struct platform_device *pdev)
+{
+	unsigned int offset0 = M48T86_NVRAM(M48T86_NVRAM_LEN - 2);
+	unsigned int offset1 = M48T86_NVRAM(M48T86_NVRAM_LEN - 1);
+	unsigned char tmp0, tmp1;
+
+	tmp0 = m48t86_readb(&pdev->dev, offset0);
+	tmp1 = m48t86_readb(&pdev->dev, offset1);
+
+	m48t86_writeb(&pdev->dev, 0x00, offset0);
+	m48t86_writeb(&pdev->dev, 0x55, offset1);
+	if (m48t86_readb(&pdev->dev, offset1) == 0x55) {
+		m48t86_writeb(&pdev->dev, 0xaa, offset1);
+		if (m48t86_readb(&pdev->dev, offset1) == 0xaa &&
+		    m48t86_readb(&pdev->dev, offset0) == 0x00) {
+			m48t86_writeb(&pdev->dev, tmp0, offset0);
+			m48t86_writeb(&pdev->dev, tmp1, offset1);
+
+			return true;
+		}
+	}
+	return false;
+}
+
+static int m48t86_rtc_probe(struct platform_device *pdev)
+{
+	struct m48t86_rtc_info *info;
+	struct resource *res;
 	unsigned char reg;
-	struct m48t86_ops *ops = dev_get_platdata(&dev->dev);
-	struct rtc_device *rtc;
 
-	rtc = devm_rtc_device_register(&dev->dev, "m48t86",
-				&m48t86_rtc_ops, THIS_MODULE);
+	info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
 
-	if (IS_ERR(rtc))
-		return PTR_ERR(rtc);
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENODEV;
+	info->index_reg = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(info->index_reg))
+		return PTR_ERR(info->index_reg);
 
-	platform_set_drvdata(dev, rtc);
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+	if (!res)
+		return -ENODEV;
+	info->data_reg = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(info->data_reg))
+		return PTR_ERR(info->data_reg);
+
+	dev_set_drvdata(&pdev->dev, info);
+
+	if (!m48t86_verify_chip(pdev)) {
+		dev_info(&pdev->dev, "RTC not present\n");
+		return -ENODEV;
+	}
+
+	info->rtc = devm_rtc_device_register(&pdev->dev, "m48t86",
+					     &m48t86_rtc_ops, THIS_MODULE);
+	if (IS_ERR(info->rtc))
+		return PTR_ERR(info->rtc);
 
 	/* read battery status */
-	reg = ops->readbyte(M48T86_REG_D);
-	dev_info(&dev->dev, "battery %s\n",
-		(reg & M48T86_REG_D_VRT) ? "ok" : "exhausted");
+	reg = m48t86_readb(&pdev->dev, M48T86_D);
+	dev_info(&pdev->dev, "battery %s\n",
+		 (reg & M48T86_D_VRT) ? "ok" : "exhausted");
 
+	if (device_create_bin_file(&pdev->dev, &bin_attr_nvram))
+		dev_err(&pdev->dev, "failed to create nvram sysfs entry\n");
+
+	return 0;
+}
+
+static int m48t86_rtc_remove(struct platform_device *pdev)
+{
+	device_remove_bin_file(&pdev->dev, &bin_attr_nvram);
 	return 0;
 }
 
@@ -168,6 +281,7 @@ static struct platform_driver m48t86_rtc_platform_driver = {
 		.name	= "rtc-m48t86",
 	},
 	.probe		= m48t86_rtc_probe,
+	.remove		= m48t86_rtc_remove,
 };
 
 module_platform_driver(m48t86_rtc_platform_driver);
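
Once the bin_attr is created, the 114 NVRAM bytes are exposed through sysfs and can be read or written from userspace like a small file. A hedged userspace sketch; the sysfs path depends on how the platform device is instantiated on a given board, so treat it as an assumption:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned char nvram[114];
	ssize_t n;
	/* path is board-dependent; this is one plausible location */
	int fd = open("/sys/devices/platform/rtc-m48t86/nvram", O_RDONLY);

	if (fd < 0)
		return 1;
	n = read(fd, nvram, sizeof(nvram));
	if (n > 0)
		printf("read %zd bytes, first byte 0x%02x\n", n, nvram[0]);
	close(fd);
	return 0;
}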
diff --git a/drivers/rtc/rtc-mcp795.c b/drivers/rtc/rtc-mcp795.c
index ce75e42..77f2133 100644
--- a/drivers/rtc/rtc-mcp795.c
+++ b/drivers/rtc/rtc-mcp795.c
@@ -44,12 +44,22 @@
 #define MCP795_REG_DAY		0x04
 #define MCP795_REG_MONTH	0x06
 #define MCP795_REG_CONTROL	0x08
+#define MCP795_REG_ALM0_SECONDS	0x0C
+#define MCP795_REG_ALM0_DAY	0x0F
 
 #define MCP795_ST_BIT		BIT(7)
 #define MCP795_24_BIT		BIT(6)
 #define MCP795_LP_BIT		BIT(5)
 #define MCP795_EXTOSC_BIT	BIT(3)
 #define MCP795_OSCON_BIT	BIT(5)
+#define MCP795_ALM0_BIT		BIT(4)
+#define MCP795_ALM1_BIT		BIT(5)
+#define MCP795_ALM0IF_BIT	BIT(3)
+#define MCP795_ALM0C0_BIT	BIT(4)
+#define MCP795_ALM0C1_BIT	BIT(5)
+#define MCP795_ALM0C2_BIT	BIT(6)
+
+#define SEC_PER_DAY		(24 * 60 * 60)
 
 static int mcp795_rtcc_read(struct device *dev, u8 addr, u8 *buf, u8 count)
 {
@@ -150,6 +160,30 @@ static int mcp795_start_oscillator(struct device *dev, bool *extosc)
 			dev, MCP795_REG_SECONDS, MCP795_ST_BIT, MCP795_ST_BIT);
 }
 
+/* Enable or disable Alarm 0 in RTC */
+static int mcp795_update_alarm(struct device *dev, bool enable)
+{
+	int ret;
+
+	dev_dbg(dev, "%s alarm\n", enable ? "Enable" : "Disable");
+
+	if (enable) {
+		/* clear ALM0IF (Alarm 0 Interrupt Flag) bit */
+		ret = mcp795_rtcc_set_bits(dev, MCP795_REG_ALM0_DAY,
+					MCP795_ALM0IF_BIT, 0);
+		if (ret)
+			return ret;
+		/* enable alarm 0 */
+		ret = mcp795_rtcc_set_bits(dev, MCP795_REG_CONTROL,
+					MCP795_ALM0_BIT, MCP795_ALM0_BIT);
+	} else {
+		/* disable alarm 0 and alarm 1 */
+		ret = mcp795_rtcc_set_bits(dev, MCP795_REG_CONTROL,
+					MCP795_ALM0_BIT | MCP795_ALM1_BIT, 0);
+	}
+	return ret;
+}
+
 static int mcp795_set_time(struct device *dev, struct rtc_time *tim)
 {
 	int ret;
@@ -170,6 +204,7 @@ static int mcp795_set_time(struct device *dev, struct rtc_time *tim)
 	data[0] = (data[0] & 0x80) | bin2bcd(tim->tm_sec);
 	data[1] = (data[1] & 0x80) | bin2bcd(tim->tm_min);
 	data[2] = bin2bcd(tim->tm_hour);
+	data[3] = (data[3] & 0xF8) | bin2bcd(tim->tm_wday + 1);
 	data[4] = bin2bcd(tim->tm_mday);
 	data[5] = (data[5] & MCP795_LP_BIT) | bin2bcd(tim->tm_mon + 1);
 
@@ -198,9 +233,9 @@ static int mcp795_set_time(struct device *dev, struct rtc_time *tim)
 	if (ret)
 		return ret;
 
-	dev_dbg(dev, "Set mcp795: %04d-%02d-%02d %02d:%02d:%02d\n",
+	dev_dbg(dev, "Set mcp795: %04d-%02d-%02d(%d) %02d:%02d:%02d\n",
 			tim->tm_year + 1900, tim->tm_mon, tim->tm_mday,
-			tim->tm_hour, tim->tm_min, tim->tm_sec);
+			tim->tm_wday, tim->tm_hour, tim->tm_min, tim->tm_sec);
 
 	return 0;
 }
@@ -218,20 +253,139 @@ static int mcp795_read_time(struct device *dev, struct rtc_time *tim)
 	tim->tm_sec	= bcd2bin(data[0] & 0x7F);
 	tim->tm_min	= bcd2bin(data[1] & 0x7F);
 	tim->tm_hour	= bcd2bin(data[2] & 0x3F);
+	tim->tm_wday	= bcd2bin(data[3] & 0x07) - 1;
 	tim->tm_mday	= bcd2bin(data[4] & 0x3F);
 	tim->tm_mon	= bcd2bin(data[5] & 0x1F) - 1;
 	tim->tm_year	= bcd2bin(data[6]) + 100; /* Assume we are in 20xx */
 
-	dev_dbg(dev, "Read from mcp795: %04d-%02d-%02d %02d:%02d:%02d\n",
-				tim->tm_year + 1900, tim->tm_mon, tim->tm_mday,
-				tim->tm_hour, tim->tm_min, tim->tm_sec);
+	dev_dbg(dev, "Read from mcp795: %04d-%02d-%02d(%d) %02d:%02d:%02d\n",
+			tim->tm_year + 1900, tim->tm_mon, tim->tm_mday,
+			tim->tm_wday, tim->tm_hour, tim->tm_min, tim->tm_sec);
 
 	return rtc_valid_tm(tim);
 }
 
+static int mcp795_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
+{
+	struct rtc_time now_tm;
+	time64_t now;
+	time64_t later;
+	u8 tmp[6];
+	int ret;
+
+	/* Read current time from RTC hardware */
+	ret = mcp795_read_time(dev, &now_tm);
+	if (ret)
+		return ret;
+	/* Get the number of seconds since 1970 */
+	now = rtc_tm_to_time64(&now_tm);
+	later = rtc_tm_to_time64(&alm->time);
+	if (later <= now)
+		return -EINVAL;
+	/* make sure alarm fires within the next one year */
+	if ((later - now) >=
+		(SEC_PER_DAY * (365 + is_leap_year(alm->time.tm_year + 1900))))
+		return -EDOM;
+	/* disable alarm */
+	ret = mcp795_update_alarm(dev, false);
+	if (ret)
+		return ret;
+	/* Read registers, so we can leave configuration bits untouched */
+	ret = mcp795_rtcc_read(dev, MCP795_REG_ALM0_SECONDS, tmp, sizeof(tmp));
+	if (ret)
+		return ret;
+
+	alm->time.tm_year	= -1;
+	alm->time.tm_isdst	= -1;
+	alm->time.tm_yday	= -1;
+
+	tmp[0] = (tmp[0] & 0x80) | bin2bcd(alm->time.tm_sec);
+	tmp[1] = (tmp[1] & 0x80) | bin2bcd(alm->time.tm_min);
+	tmp[2] = (tmp[2] & 0xE0) | bin2bcd(alm->time.tm_hour);
+	tmp[3] = (tmp[3] & 0x80) | bin2bcd(alm->time.tm_wday + 1);
+	/* set alarm match: seconds, minutes, hour, day, date and month */
+	tmp[3] |= (MCP795_ALM0C2_BIT | MCP795_ALM0C1_BIT | MCP795_ALM0C0_BIT);
+	tmp[4] = (tmp[4] & 0xC0) | bin2bcd(alm->time.tm_mday);
+	tmp[5] = (tmp[5] & 0xE0) | bin2bcd(alm->time.tm_mon + 1);
+
+	ret = mcp795_rtcc_write(dev, MCP795_REG_ALM0_SECONDS, tmp, sizeof(tmp));
+	if (ret)
+		return ret;
+
+	/* enable alarm if requested */
+	if (alm->enabled) {
+		ret = mcp795_update_alarm(dev, true);
+		if (ret)
+			return ret;
+		dev_dbg(dev, "Alarm IRQ armed\n");
+	}
+	dev_dbg(dev, "Set alarm: %02d-%02d(%d) %02d:%02d:%02d\n",
+			alm->time.tm_mon, alm->time.tm_mday, alm->time.tm_wday,
+			alm->time.tm_hour, alm->time.tm_min, alm->time.tm_sec);
+	return 0;
+}
+
+static int mcp795_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
+{
+	u8 data[6];
+	int ret;
+
+	ret = mcp795_rtcc_read(
+			dev, MCP795_REG_ALM0_SECONDS, data, sizeof(data));
+	if (ret)
+		return ret;
+
+	alm->time.tm_sec	= bcd2bin(data[0] & 0x7F);
+	alm->time.tm_min	= bcd2bin(data[1] & 0x7F);
+	alm->time.tm_hour	= bcd2bin(data[2] & 0x1F);
+	alm->time.tm_wday	= bcd2bin(data[3] & 0x07) - 1;
+	alm->time.tm_mday	= bcd2bin(data[4] & 0x3F);
+	alm->time.tm_mon	= bcd2bin(data[5] & 0x1F) - 1;
+	alm->time.tm_year	= -1;
+	alm->time.tm_isdst	= -1;
+	alm->time.tm_yday	= -1;
+
+	dev_dbg(dev, "Read alarm: %02d-%02d(%d) %02d:%02d:%02d\n",
+			alm->time.tm_mon, alm->time.tm_mday, alm->time.tm_wday,
+			alm->time.tm_hour, alm->time.tm_min, alm->time.tm_sec);
+	return 0;
+}
+
+static int mcp795_alarm_irq_enable(struct device *dev, unsigned int enabled)
+{
+	return mcp795_update_alarm(dev, !!enabled);
+}
+
+static irqreturn_t mcp795_irq(int irq, void *data)
+{
+	struct spi_device *spi = data;
+	struct rtc_device *rtc = spi_get_drvdata(spi);
+	struct mutex *lock = &rtc->ops_lock;
+	int ret;
+
+	mutex_lock(lock);
+
+	/* Disable the alarm. There is no need to clear the ALM0IF
+	 * (Alarm 0 Interrupt Flag) bit, because that is done every
+	 * time the alarm is enabled.
+	 */
+	ret = mcp795_update_alarm(&spi->dev, false);
+	if (ret)
+		dev_err(&spi->dev,
+			"Failed to disable alarm in IRQ (ret=%d)\n", ret);
+	rtc_update_irq(rtc, 1, RTC_AF | RTC_IRQF);
+
+	mutex_unlock(lock);
+
+	return IRQ_HANDLED;
+}
+
 static const struct rtc_class_ops mcp795_rtc_ops = {
 		.read_time = mcp795_read_time,
-		.set_time = mcp795_set_time
+		.set_time = mcp795_set_time,
+		.read_alarm = mcp795_read_alarm,
+		.set_alarm = mcp795_set_alarm,
+		.alarm_irq_enable = mcp795_alarm_irq_enable
 };
 
 static int mcp795_probe(struct spi_device *spi)
@@ -259,6 +413,23 @@ static int mcp795_probe(struct spi_device *spi)
 
 	spi_set_drvdata(spi, rtc);
 
+	if (spi->irq > 0) {
+		dev_dbg(&spi->dev, "Alarm support enabled\n");
+
+		/* Clear any pending alarm (ALM0IF bit) before requesting
+		 * the interrupt.
+		 */
+		mcp795_rtcc_set_bits(&spi->dev, MCP795_REG_ALM0_DAY,
+					MCP795_ALM0IF_BIT, 0);
+		ret = devm_request_threaded_irq(&spi->dev, spi->irq, NULL,
+				mcp795_irq, IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
+				dev_name(&rtc->dev), spi);
+		if (ret)
+			dev_err(&spi->dev, "Failed to request IRQ: %d: %d\n",
+						spi->irq, ret);
+		else
+			device_init_wakeup(&spi->dev, true);
+	}
 	return 0;
 }
 
diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c
index 359876a..7731912 100644
--- a/drivers/rtc/rtc-mxc.c
+++ b/drivers/rtc/rtc-mxc.c
@@ -353,7 +353,7 @@ static int mxc_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 }
 
 /* RTC layer */
-static struct rtc_class_ops mxc_rtc_ops = {
+static const struct rtc_class_ops mxc_rtc_ops = {
 	.release		= mxc_rtc_release,
 	.read_time		= mxc_rtc_read_time,
 	.set_mmss64		= mxc_rtc_set_mmss,
diff --git a/drivers/rtc/rtc-pcf2127.c b/drivers/rtc/rtc-pcf2127.c
index 2bfdf63..f33447c 100644
--- a/drivers/rtc/rtc-pcf2127.c
+++ b/drivers/rtc/rtc-pcf2127.c
@@ -52,9 +52,22 @@ static int pcf2127_rtc_read_time(struct device *dev, struct rtc_time *tm)
 	struct pcf2127 *pcf2127 = dev_get_drvdata(dev);
 	unsigned char buf[10];
 	int ret;
+	unsigned int tmp;
+	int i;
 
-	ret = regmap_bulk_read(pcf2127->regmap, PCF2127_REG_CTRL1, buf,
-				sizeof(buf));
+	for (i = 0; i <= PCF2127_REG_CTRL3; i++) {
+		ret = regmap_read(pcf2127->regmap, PCF2127_REG_CTRL1 + i,
+				  &tmp);
+		if (ret) {
+			dev_err(dev, "%s: read error\n", __func__);
+			return ret;
+		}
+		buf[i] = tmp;
+	}
+
+	ret = regmap_bulk_read(pcf2127->regmap, PCF2127_REG_SC,
+			       (buf + PCF2127_REG_SC),
+			       ARRAY_SIZE(buf) - PCF2127_REG_SC);
 	if (ret) {
 		dev_err(dev, "%s: read error\n", __func__);
 		return ret;
diff --git a/drivers/rtc/rtc-rx8010.c b/drivers/rtc/rtc-rx8010.c
index 7163b91..d08da37 100644
--- a/drivers/rtc/rtc-rx8010.c
+++ b/drivers/rtc/rtc-rx8010.c
@@ -63,7 +63,6 @@ struct rx8010_data {
 	struct i2c_client *client;
 	struct rtc_device *rtc;
 	u8 ctrlreg;
-	spinlock_t flags_lock;
 };
 
 static irqreturn_t rx8010_irq_1_handler(int irq, void *dev_id)
@@ -72,12 +71,12 @@ static irqreturn_t rx8010_irq_1_handler(int irq, void *dev_id)
 	struct rx8010_data *rx8010 = i2c_get_clientdata(client);
 	int flagreg;
 
-	spin_lock(&rx8010->flags_lock);
+	mutex_lock(&rx8010->rtc->ops_lock);
 
 	flagreg = i2c_smbus_read_byte_data(client, RX8010_FLAG);
 
 	if (flagreg <= 0) {
-		spin_unlock(&rx8010->flags_lock);
+		mutex_unlock(&rx8010->rtc->ops_lock);
 		return IRQ_NONE;
 	}
 
@@ -101,7 +100,7 @@ static irqreturn_t rx8010_irq_1_handler(int irq, void *dev_id)
 
 	i2c_smbus_write_byte_data(client, RX8010_FLAG, flagreg);
 
-	spin_unlock(&rx8010->flags_lock);
+	mutex_unlock(&rx8010->rtc->ops_lock);
 	return IRQ_HANDLED;
 }
 
@@ -143,7 +142,6 @@ static int rx8010_set_time(struct device *dev, struct rtc_time *dt)
 	u8 date[7];
 	int ctrl, flagreg;
 	int ret;
-	unsigned long irqflags;
 
 	if ((dt->tm_year < 100) || (dt->tm_year > 199))
 		return -EINVAL;
@@ -181,11 +179,8 @@ static int rx8010_set_time(struct device *dev, struct rtc_time *dt)
 	if (ret < 0)
 		return ret;
 
-	spin_lock_irqsave(&rx8010->flags_lock, irqflags);
-
 	flagreg = i2c_smbus_read_byte_data(rx8010->client, RX8010_FLAG);
 	if (flagreg < 0) {
-		spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
 		return flagreg;
 	}
 
@@ -193,8 +188,6 @@ static int rx8010_set_time(struct device *dev, struct rtc_time *dt)
 		ret = i2c_smbus_write_byte_data(rx8010->client, RX8010_FLAG,
 						flagreg & ~RX8010_FLAG_VLF);
 
-	spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
-
 	return 0;
 }
 
@@ -288,12 +281,9 @@ static int rx8010_set_alarm(struct device *dev, struct rtc_wkalrm *t)
 	u8 alarmvals[3];
 	int extreg, flagreg;
 	int err;
-	unsigned long irqflags;
 
-	spin_lock_irqsave(&rx8010->flags_lock, irqflags);
 	flagreg = i2c_smbus_read_byte_data(client, RX8010_FLAG);
 	if (flagreg < 0) {
-		spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
 		return flagreg;
 	}
 
@@ -302,14 +292,12 @@ static int rx8010_set_alarm(struct device *dev, struct rtc_wkalrm *t)
 		err = i2c_smbus_write_byte_data(rx8010->client, RX8010_CTRL,
 						rx8010->ctrlreg);
 		if (err < 0) {
-			spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
 			return err;
 		}
 	}
 
 	flagreg &= ~RX8010_FLAG_AF;
 	err = i2c_smbus_write_byte_data(rx8010->client, RX8010_FLAG, flagreg);
-	spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
 	if (err < 0)
 		return err;
 
@@ -404,7 +392,6 @@ static int rx8010_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
 	struct rx8010_data *rx8010 = dev_get_drvdata(dev);
 	int ret, tmp;
 	int flagreg;
-	unsigned long irqflags;
 
 	switch (cmd) {
 	case RTC_VL_READ:
@@ -419,16 +406,13 @@ static int rx8010_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
 		return 0;
 
 	case RTC_VL_CLR:
-		spin_lock_irqsave(&rx8010->flags_lock, irqflags);
 		flagreg = i2c_smbus_read_byte_data(rx8010->client, RX8010_FLAG);
 		if (flagreg < 0) {
-			spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
 			return flagreg;
 		}
 
 		flagreg &= ~RX8010_FLAG_VLF;
 		ret = i2c_smbus_write_byte_data(client, RX8010_FLAG, flagreg);
-		spin_unlock_irqrestore(&rx8010->flags_lock, irqflags);
 		if (ret < 0)
 			return ret;
 
@@ -466,8 +450,6 @@ static int rx8010_probe(struct i2c_client *client,
 	rx8010->client = client;
 	i2c_set_clientdata(client, rx8010);
 
-	spin_lock_init(&rx8010->flags_lock);
-
 	err = rx8010_init_client(client);
 	if (err)
 		return err;
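
Dropping the private spinlock works because the RTC core already serializes every rtc_class_ops callback under rtc->ops_lock; the threaded interrupt handler just has to take the same mutex so its read-modify-write of the flag register cannot interleave with set_time/set_alarm/ioctl. This is only safe because the handler runs in thread context, where sleeping on a mutex is allowed. A hedged sketch of the shape, with hypothetical names:

#define FOO_FLAG_REG	0x0e	/* hypothetical status register */

struct foo_data {
	struct i2c_client *client;
	struct rtc_device *rtc;
};

static irqreturn_t foo_irq_thread(int irq, void *dev_id)
{
	struct foo_data *foo = dev_id;
	int flags;

	mutex_lock(&foo->rtc->ops_lock);	/* may sleep: thread context */

	flags = i2c_smbus_read_byte_data(foo->client, FOO_FLAG_REG);
	if (flags <= 0) {
		mutex_unlock(&foo->rtc->ops_lock);
		return IRQ_NONE;
	}

	/* ...decode flags, call rtc_update_irq(), then acknowledge... */
	i2c_smbus_write_byte_data(foo->client, FOO_FLAG_REG, 0);

	mutex_unlock(&foo->rtc->ops_lock);
	return IRQ_HANDLED;
}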
diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c
index 17b6235..c626e43 100644
--- a/drivers/rtc/rtc-sh.c
+++ b/drivers/rtc/rtc-sh.c
@@ -535,7 +535,7 @@ static int sh_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
 	return 0;
 }
 
-static struct rtc_class_ops sh_rtc_ops = {
+static const struct rtc_class_ops sh_rtc_ops = {
 	.read_time	= sh_rtc_read_time,
 	.set_time	= sh_rtc_set_time,
 	.read_alarm	= sh_rtc_read_alarm,
diff --git a/drivers/rtc/rtc-snvs.c b/drivers/rtc/rtc-snvs.c
index 0f11c2a..d51b07d 100644
--- a/drivers/rtc/rtc-snvs.c
+++ b/drivers/rtc/rtc-snvs.c
@@ -184,6 +184,7 @@ static int snvs_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 	rtc_tm_to_time(alrm_tm, &time);
 
 	regmap_update_bits(data->regmap, data->offset + SNVS_LPCR, SNVS_LPCR_LPTA_EN, 0);
+	rtc_write_sync_lp(data);
 	regmap_write(data->regmap, data->offset + SNVS_LPTAR, time);
 
 	/* Clear alarm interrupt status bit */
diff --git a/drivers/rtc/rtc-stm32.c b/drivers/rtc/rtc-stm32.c
new file mode 100644
index 0000000..bd57eb1
--- /dev/null
+++ b/drivers/rtc/rtc-stm32.c
@@ -0,0 +1,725 @@
+/*
+ * Copyright (C) Amelie Delaunay 2016
+ * Author:  Amelie Delaunay <amelie.delaunay@st.com>
+ * License terms:  GNU General Public License (GPL), version 2
+ */
+
+#include <linux/bcd.h>
+#include <linux/clk.h>
+#include <linux/iopoll.h>
+#include <linux/ioport.h>
+#include <linux/mfd/syscon.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/regmap.h>
+#include <linux/rtc.h>
+
+#define DRIVER_NAME "stm32_rtc"
+
+/* STM32 RTC registers */
+#define STM32_RTC_TR		0x00
+#define STM32_RTC_DR		0x04
+#define STM32_RTC_CR		0x08
+#define STM32_RTC_ISR		0x0C
+#define STM32_RTC_PRER		0x10
+#define STM32_RTC_ALRMAR	0x1C
+#define STM32_RTC_WPR		0x24
+
+/* STM32_RTC_TR bit fields  */
+#define STM32_RTC_TR_SEC_SHIFT		0
+#define STM32_RTC_TR_SEC		GENMASK(6, 0)
+#define STM32_RTC_TR_MIN_SHIFT		8
+#define STM32_RTC_TR_MIN		GENMASK(14, 8)
+#define STM32_RTC_TR_HOUR_SHIFT		16
+#define STM32_RTC_TR_HOUR		GENMASK(21, 16)
+
+/* STM32_RTC_DR bit fields */
+#define STM32_RTC_DR_DATE_SHIFT		0
+#define STM32_RTC_DR_DATE		GENMASK(5, 0)
+#define STM32_RTC_DR_MONTH_SHIFT	8
+#define STM32_RTC_DR_MONTH		GENMASK(12, 8)
+#define STM32_RTC_DR_WDAY_SHIFT		13
+#define STM32_RTC_DR_WDAY		GENMASK(15, 13)
+#define STM32_RTC_DR_YEAR_SHIFT		16
+#define STM32_RTC_DR_YEAR		GENMASK(23, 16)
+
+/* STM32_RTC_CR bit fields */
+#define STM32_RTC_CR_FMT		BIT(6)
+#define STM32_RTC_CR_ALRAE		BIT(8)
+#define STM32_RTC_CR_ALRAIE		BIT(12)
+
+/* STM32_RTC_ISR bit fields */
+#define STM32_RTC_ISR_ALRAWF		BIT(0)
+#define STM32_RTC_ISR_INITS		BIT(4)
+#define STM32_RTC_ISR_RSF		BIT(5)
+#define STM32_RTC_ISR_INITF		BIT(6)
+#define STM32_RTC_ISR_INIT		BIT(7)
+#define STM32_RTC_ISR_ALRAF		BIT(8)
+
+/* STM32_RTC_PRER bit fields */
+#define STM32_RTC_PRER_PRED_S_SHIFT	0
+#define STM32_RTC_PRER_PRED_S		GENMASK(14, 0)
+#define STM32_RTC_PRER_PRED_A_SHIFT	16
+#define STM32_RTC_PRER_PRED_A		GENMASK(22, 16)
+
+/* STM32_RTC_ALRMAR and STM32_RTC_ALRMBR bit fields */
+#define STM32_RTC_ALRMXR_SEC_SHIFT	0
+#define STM32_RTC_ALRMXR_SEC		GENMASK(6, 0)
+#define STM32_RTC_ALRMXR_SEC_MASK	BIT(7)
+#define STM32_RTC_ALRMXR_MIN_SHIFT	8
+#define STM32_RTC_ALRMXR_MIN		GENMASK(14, 8)
+#define STM32_RTC_ALRMXR_MIN_MASK	BIT(15)
+#define STM32_RTC_ALRMXR_HOUR_SHIFT	16
+#define STM32_RTC_ALRMXR_HOUR		GENMASK(21, 16)
+#define STM32_RTC_ALRMXR_PM		BIT(22)
+#define STM32_RTC_ALRMXR_HOUR_MASK	BIT(23)
+#define STM32_RTC_ALRMXR_DATE_SHIFT	24
+#define STM32_RTC_ALRMXR_DATE		GENMASK(29, 24)
+#define STM32_RTC_ALRMXR_WDSEL		BIT(30)
+#define STM32_RTC_ALRMXR_WDAY_SHIFT	24
+#define STM32_RTC_ALRMXR_WDAY		GENMASK(27, 24)
+#define STM32_RTC_ALRMXR_DATE_MASK	BIT(31)
+
+/* STM32_RTC_WPR key constants */
+#define RTC_WPR_1ST_KEY			0xCA
+#define RTC_WPR_2ND_KEY			0x53
+#define RTC_WPR_WRONG_KEY		0xFF
+
+/*
+ * RTC registers are protected against parasitic write access.
+ * PWR_CR_DBP bit must be set to enable write access to RTC registers.
+ */
+/* STM32_PWR_CR */
+#define PWR_CR				0x00
+/* STM32_PWR_CR bit field */
+#define PWR_CR_DBP			BIT(8)
+
+struct stm32_rtc {
+	struct rtc_device *rtc_dev;
+	void __iomem *base;
+	struct regmap *dbp;
+	struct clk *ck_rtc;
+	int irq_alarm;
+};
+
+static void stm32_rtc_wpr_unlock(struct stm32_rtc *rtc)
+{
+	writel_relaxed(RTC_WPR_1ST_KEY, rtc->base + STM32_RTC_WPR);
+	writel_relaxed(RTC_WPR_2ND_KEY, rtc->base + STM32_RTC_WPR);
+}
+
+static void stm32_rtc_wpr_lock(struct stm32_rtc *rtc)
+{
+	writel_relaxed(RTC_WPR_WRONG_KEY, rtc->base + STM32_RTC_WPR);
+}
+
+static int stm32_rtc_enter_init_mode(struct stm32_rtc *rtc)
+{
+	unsigned int isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+
+	if (!(isr & STM32_RTC_ISR_INITF)) {
+		isr |= STM32_RTC_ISR_INIT;
+		writel_relaxed(isr, rtc->base + STM32_RTC_ISR);
+
+		/*
+		 * It takes around 2 ck_rtc clock cycles to enter
+		 * initialization phase mode (and have the INITF flag set).
+		 * As the slowest ck_rtc frequency may be 32kHz and the
+		 * highest 1MHz, we poll every 10 us with a timeout of 100ms.
+		 */
+		return readl_relaxed_poll_timeout_atomic(
+					rtc->base + STM32_RTC_ISR,
+					isr, (isr & STM32_RTC_ISR_INITF),
+					10, 100000);
+	}
+
+	return 0;
+}
+
+static void stm32_rtc_exit_init_mode(struct stm32_rtc *rtc)
+{
+	unsigned int isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+
+	isr &= ~STM32_RTC_ISR_INIT;
+	writel_relaxed(isr, rtc->base + STM32_RTC_ISR);
+}
+
+static int stm32_rtc_wait_sync(struct stm32_rtc *rtc)
+{
+	unsigned int isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+
+	isr &= ~STM32_RTC_ISR_RSF;
+	writel_relaxed(isr, rtc->base + STM32_RTC_ISR);
+
+	/*
+	 * Wait for RSF to be set to ensure the calendar registers are
+	 * synchronised, it takes around 2 ck_rtc clock cycles
+	 */
+	return readl_relaxed_poll_timeout_atomic(rtc->base + STM32_RTC_ISR,
+						 isr,
+						 (isr & STM32_RTC_ISR_RSF),
+						 10, 100000);
+}
+
+static irqreturn_t stm32_rtc_alarm_irq(int irq, void *dev_id)
+{
+	struct stm32_rtc *rtc = (struct stm32_rtc *)dev_id;
+	unsigned int isr, cr;
+
+	mutex_lock(&rtc->rtc_dev->ops_lock);
+
+	isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+	cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+
+	if ((isr & STM32_RTC_ISR_ALRAF) &&
+	    (cr & STM32_RTC_CR_ALRAIE)) {
+		/* Alarm A flag - Alarm interrupt */
+		dev_dbg(&rtc->rtc_dev->dev, "Alarm occurred\n");
+
+		/* Pass event to the kernel */
+		rtc_update_irq(rtc->rtc_dev, 1, RTC_IRQF | RTC_AF);
+
+		/* Clear event flag, otherwise new events won't be received */
+		writel_relaxed(isr & ~STM32_RTC_ISR_ALRAF,
+			       rtc->base + STM32_RTC_ISR);
+	}
+
+	mutex_unlock(&rtc->rtc_dev->ops_lock);
+
+	return IRQ_HANDLED;
+}
+
+/* Convert rtc_time structure from bin to bcd format */
+static void tm2bcd(struct rtc_time *tm)
+{
+	tm->tm_sec = bin2bcd(tm->tm_sec);
+	tm->tm_min = bin2bcd(tm->tm_min);
+	tm->tm_hour = bin2bcd(tm->tm_hour);
+
+	tm->tm_mday = bin2bcd(tm->tm_mday);
+	tm->tm_mon = bin2bcd(tm->tm_mon + 1);
+	tm->tm_year = bin2bcd(tm->tm_year - 100);
+	/*
+	 * Number of days since Sunday
+	 * - on kernel side, 0=Sunday...6=Saturday
+	 * - on rtc side, 0=invalid,1=Monday...7=Sunday
+	 */
+	tm->tm_wday = (!tm->tm_wday) ? 7 : tm->tm_wday;
+}
+
+/* Convert rtc_time structure from bcd to bin format */
+static void bcd2tm(struct rtc_time *tm)
+{
+	tm->tm_sec = bcd2bin(tm->tm_sec);
+	tm->tm_min = bcd2bin(tm->tm_min);
+	tm->tm_hour = bcd2bin(tm->tm_hour);
+
+	tm->tm_mday = bcd2bin(tm->tm_mday);
+	tm->tm_mon = bcd2bin(tm->tm_mon) - 1;
+	tm->tm_year = bcd2bin(tm->tm_year) + 100;
+	/*
+	 * Number of days since Sunday
+	 * - on kernel side, 0=Sunday...6=Saturday
+	 * - on rtc side, 0=invalid,1=Monday...7=Sunday
+	 */
+	tm->tm_wday %= 7;
+}
+
+static int stm32_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+	struct stm32_rtc *rtc = dev_get_drvdata(dev);
+	unsigned int tr, dr;
+
+	/* Time and Date in BCD format */
+	tr = readl_relaxed(rtc->base + STM32_RTC_TR);
+	dr = readl_relaxed(rtc->base + STM32_RTC_DR);
+
+	tm->tm_sec = (tr & STM32_RTC_TR_SEC) >> STM32_RTC_TR_SEC_SHIFT;
+	tm->tm_min = (tr & STM32_RTC_TR_MIN) >> STM32_RTC_TR_MIN_SHIFT;
+	tm->tm_hour = (tr & STM32_RTC_TR_HOUR) >> STM32_RTC_TR_HOUR_SHIFT;
+
+	tm->tm_mday = (dr & STM32_RTC_DR_DATE) >> STM32_RTC_DR_DATE_SHIFT;
+	tm->tm_mon = (dr & STM32_RTC_DR_MONTH) >> STM32_RTC_DR_MONTH_SHIFT;
+	tm->tm_year = (dr & STM32_RTC_DR_YEAR) >> STM32_RTC_DR_YEAR_SHIFT;
+	tm->tm_wday = (dr & STM32_RTC_DR_WDAY) >> STM32_RTC_DR_WDAY_SHIFT;
+
+	/* We don't report tm_yday and tm_isdst */
+
+	bcd2tm(tm);
+
+	return 0;
+}
+
+static int stm32_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+	struct stm32_rtc *rtc = dev_get_drvdata(dev);
+	unsigned int tr, dr;
+	int ret = 0;
+
+	tm2bcd(tm);
+
+	/* Time in BCD format */
+	tr = ((tm->tm_sec << STM32_RTC_TR_SEC_SHIFT) & STM32_RTC_TR_SEC) |
+	     ((tm->tm_min << STM32_RTC_TR_MIN_SHIFT) & STM32_RTC_TR_MIN) |
+	     ((tm->tm_hour << STM32_RTC_TR_HOUR_SHIFT) & STM32_RTC_TR_HOUR);
+
+	/* Date in BCD format */
+	dr = ((tm->tm_mday << STM32_RTC_DR_DATE_SHIFT) & STM32_RTC_DR_DATE) |
+	     ((tm->tm_mon << STM32_RTC_DR_MONTH_SHIFT) & STM32_RTC_DR_MONTH) |
+	     ((tm->tm_year << STM32_RTC_DR_YEAR_SHIFT) & STM32_RTC_DR_YEAR) |
+	     ((tm->tm_wday << STM32_RTC_DR_WDAY_SHIFT) & STM32_RTC_DR_WDAY);
+
+	stm32_rtc_wpr_unlock(rtc);
+
+	ret = stm32_rtc_enter_init_mode(rtc);
+	if (ret) {
+		dev_err(dev, "Can't enter init mode. Set time aborted.\n");
+		goto end;
+	}
+
+	writel_relaxed(tr, rtc->base + STM32_RTC_TR);
+	writel_relaxed(dr, rtc->base + STM32_RTC_DR);
+
+	stm32_rtc_exit_init_mode(rtc);
+
+	ret = stm32_rtc_wait_sync(rtc);
+end:
+	stm32_rtc_wpr_lock(rtc);
+
+	return ret;
+}
+
+static int stm32_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+	struct stm32_rtc *rtc = dev_get_drvdata(dev);
+	struct rtc_time *tm = &alrm->time;
+	unsigned int alrmar, cr, isr;
+
+	alrmar = readl_relaxed(rtc->base + STM32_RTC_ALRMAR);
+	cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+	isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+
+	if (alrmar & STM32_RTC_ALRMXR_DATE_MASK) {
+		/*
+		 * Date/day doesn't matter in Alarm comparison so alarm
+		 * triggers every day
+		 */
+		tm->tm_mday = -1;
+		tm->tm_wday = -1;
+	} else {
+		if (alrmar & STM32_RTC_ALRMXR_WDSEL) {
+			/* Alarm is set to a day of week */
+			tm->tm_mday = -1;
+			tm->tm_wday = (alrmar & STM32_RTC_ALRMXR_WDAY) >>
+				      STM32_RTC_ALRMXR_WDAY_SHIFT;
+			tm->tm_wday %= 7;
+		} else {
+			/* Alarm is set to a day of month */
+			tm->tm_wday = -1;
+			tm->tm_mday = (alrmar & STM32_RTC_ALRMXR_DATE) >>
+				       STM32_RTC_ALRMXR_DATE_SHIFT;
+		}
+	}
+
+	if (alrmar & STM32_RTC_ALRMXR_HOUR_MASK) {
+		/* Hours don't matter in Alarm comparison */
+		tm->tm_hour = -1;
+	} else {
+		tm->tm_hour = (alrmar & STM32_RTC_ALRMXR_HOUR) >>
+			       STM32_RTC_ALRMXR_HOUR_SHIFT;
+		if (alrmar & STM32_RTC_ALRMXR_PM)
+			tm->tm_hour += 12;
+	}
+
+	if (alrmar & STM32_RTC_ALRMXR_MIN_MASK) {
+		/* Minutes don't matter in Alarm comparison */
+		tm->tm_min = -1;
+	} else {
+		tm->tm_min = (alrmar & STM32_RTC_ALRMXR_MIN) >>
+			      STM32_RTC_ALRMXR_MIN_SHIFT;
+	}
+
+	if (alrmar & STM32_RTC_ALRMXR_SEC_MASK) {
+		/* Seconds don't matter in Alarm comparison */
+		tm->tm_sec = -1;
+	} else {
+		tm->tm_sec = (alrmar & STM32_RTC_ALRMXR_SEC) >>
+			      STM32_RTC_ALRMXR_SEC_SHIFT;
+	}
+
+	bcd2tm(tm);
+
+	alrm->enabled = (cr & STM32_RTC_CR_ALRAE) ? 1 : 0;
+	alrm->pending = (isr & STM32_RTC_ISR_ALRAF) ? 1 : 0;
+
+	return 0;
+}
+
+static int stm32_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
+{
+	struct stm32_rtc *rtc = dev_get_drvdata(dev);
+	unsigned int isr, cr;
+
+	cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+
+	stm32_rtc_wpr_unlock(rtc);
+
+	/* We expose Alarm A to the kernel */
+	if (enabled)
+		cr |= (STM32_RTC_CR_ALRAIE | STM32_RTC_CR_ALRAE);
+	else
+		cr &= ~(STM32_RTC_CR_ALRAIE | STM32_RTC_CR_ALRAE);
+	writel_relaxed(cr, rtc->base + STM32_RTC_CR);
+
+	/* Clear event flag, otherwise new events won't be received */
+	isr = readl_relaxed(rtc->base + STM32_RTC_ISR);
+	isr &= ~STM32_RTC_ISR_ALRAF;
+	writel_relaxed(isr, rtc->base + STM32_RTC_ISR);
+
+	stm32_rtc_wpr_lock(rtc);
+
+	return 0;
+}
+
+static int stm32_rtc_valid_alrm(struct stm32_rtc *rtc, struct rtc_time *tm)
+{
+	int cur_day, cur_mon, cur_year, cur_hour, cur_min, cur_sec;
+	unsigned int dr = readl_relaxed(rtc->base + STM32_RTC_DR);
+	unsigned int tr = readl_relaxed(rtc->base + STM32_RTC_TR);
+
+	cur_day = (dr & STM32_RTC_DR_DATE) >> STM32_RTC_DR_DATE_SHIFT;
+	cur_mon = (dr & STM32_RTC_DR_MONTH) >> STM32_RTC_DR_MONTH_SHIFT;
+	cur_year = (dr & STM32_RTC_DR_YEAR) >> STM32_RTC_DR_YEAR_SHIFT;
+	cur_sec = (tr & STM32_RTC_TR_SEC) >> STM32_RTC_TR_SEC_SHIFT;
+	cur_min = (tr & STM32_RTC_TR_MIN) >> STM32_RTC_TR_MIN_SHIFT;
+	cur_hour = (tr & STM32_RTC_TR_HOUR) >> STM32_RTC_TR_HOUR_SHIFT;
+
+	/*
+	 * Assuming current date is M-D-Y H:M:S.
+	 * RTC alarm can't be set on a specific month and year.
+	 * So the valid alarm range is:
+	 *	M-D-Y H:M:S < alarm <= (M+1)-D-Y H:M:S
+	 * with a specific case for December...
+	 */
+	if ((((tm->tm_year > cur_year) &&
+	      (tm->tm_mon == 0x1) && (cur_mon == 0x12)) ||
+	     ((tm->tm_year == cur_year) &&
+	      (tm->tm_mon <= cur_mon + 1))) &&
+	    ((tm->tm_mday > cur_day) ||
+	     ((tm->tm_mday == cur_day) &&
+	     ((tm->tm_hour > cur_hour) ||
+	      ((tm->tm_hour == cur_hour) && (tm->tm_min > cur_min)) ||
+	      ((tm->tm_hour == cur_hour) && (tm->tm_min == cur_min) &&
+	       (tm->tm_sec >= cur_sec))))))
+		return 0;
+
+	return -EINVAL;
+}
+
+static int stm32_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+	struct stm32_rtc *rtc = dev_get_drvdata(dev);
+	struct rtc_time *tm = &alrm->time;
+	unsigned int cr, isr, alrmar;
+	int ret = 0;
+
+	tm2bcd(tm);
+
+	/*
+	 * RTC alarm can't be set on a specific date, unless this date is
+	 * up to the same day of month next month.
+	 */
+	if (stm32_rtc_valid_alrm(rtc, tm) < 0) {
+		dev_err(dev, "Alarm can only be set within the upcoming month.\n");
+		return -EINVAL;
+	}
+
+	alrmar = 0;
+	/* tm_year and tm_mon are unused: the RTC alarm cannot match them */
+	alrmar |= (tm->tm_mday << STM32_RTC_ALRMXR_DATE_SHIFT) &
+		  STM32_RTC_ALRMXR_DATE;
+	/* 24-hour format */
+	alrmar &= ~STM32_RTC_ALRMXR_PM;
+	alrmar |= (tm->tm_hour << STM32_RTC_ALRMXR_HOUR_SHIFT) &
+		  STM32_RTC_ALRMXR_HOUR;
+	alrmar |= (tm->tm_min << STM32_RTC_ALRMXR_MIN_SHIFT) &
+		  STM32_RTC_ALRMXR_MIN;
+	alrmar |= (tm->tm_sec << STM32_RTC_ALRMXR_SEC_SHIFT) &
+		  STM32_RTC_ALRMXR_SEC;
+
+	stm32_rtc_wpr_unlock(rtc);
+
+	/* Disable Alarm */
+	cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+	cr &= ~STM32_RTC_CR_ALRAE;
+	writel_relaxed(cr, rtc->base + STM32_RTC_CR);
+
+	/*
+	 * Poll Alarm write flag to be sure that Alarm update is allowed: it
+	 * takes around 2 ck_rtc clock cycles
+	 */
+	ret = readl_relaxed_poll_timeout_atomic(rtc->base + STM32_RTC_ISR,
+						isr,
+						(isr & STM32_RTC_ISR_ALRAWF),
+						10, 100000);
+
+	if (ret) {
+		dev_err(dev, "Alarm update not allowed\n");
+		goto end;
+	}
+
+	/* Write to Alarm register */
+	writel_relaxed(alrmar, rtc->base + STM32_RTC_ALRMAR);
+
+	if (alrm->enabled)
+		stm32_rtc_alarm_irq_enable(dev, 1);
+	else
+		stm32_rtc_alarm_irq_enable(dev, 0);
+
+end:
+	stm32_rtc_wpr_lock(rtc);
+
+	return ret;
+}
+
+static const struct rtc_class_ops stm32_rtc_ops = {
+	.read_time	= stm32_rtc_read_time,
+	.set_time	= stm32_rtc_set_time,
+	.read_alarm	= stm32_rtc_read_alarm,
+	.set_alarm	= stm32_rtc_set_alarm,
+	.alarm_irq_enable = stm32_rtc_alarm_irq_enable,
+};
+
+static const struct of_device_id stm32_rtc_of_match[] = {
+	{ .compatible = "st,stm32-rtc" },
+	{}
+};
+MODULE_DEVICE_TABLE(of, stm32_rtc_of_match);
+
+static int stm32_rtc_init(struct platform_device *pdev,
+			  struct stm32_rtc *rtc)
+{
+	unsigned int prer, pred_a, pred_s, pred_a_max, pred_s_max, cr;
+	unsigned int rate;
+	int ret = 0;
+
+	rate = clk_get_rate(rtc->ck_rtc);
+
+	/* Find prediv_a and prediv_s to obtain the 1Hz calendar clock */
+	pred_a_max = STM32_RTC_PRER_PRED_A >> STM32_RTC_PRER_PRED_A_SHIFT;
+	pred_s_max = STM32_RTC_PRER_PRED_S >> STM32_RTC_PRER_PRED_S_SHIFT;
+
+	for (pred_a = pred_a_max; pred_a + 1 > 0; pred_a--) {
+		pred_s = (rate / (pred_a + 1)) - 1;
+
+		if (((pred_s + 1) * (pred_a + 1)) == rate)
+			break;
+	}
+
+	/*
+	 * Can't find an exact 1Hz divider, so give priority to RTC power
+	 * consumption by choosing the highest possible value for prediv_a
+	 */
+	if ((pred_s > pred_s_max) || (pred_a > pred_a_max)) {
+		pred_a = pred_a_max;
+		pred_s = (rate / (pred_a + 1)) - 1;
+
+		dev_warn(&pdev->dev, "ck_rtc is %s\n",
+			 (rate < ((pred_a + 1) * (pred_s + 1))) ?
+			 "fast" : "slow");
+	}
+
+	stm32_rtc_wpr_unlock(rtc);
+
+	ret = stm32_rtc_enter_init_mode(rtc);
+	if (ret) {
+		dev_err(&pdev->dev,
+			"Can't enter init mode. Prescaler config failed.\n");
+		goto end;
+	}
+
+	prer = (pred_s << STM32_RTC_PRER_PRED_S_SHIFT) & STM32_RTC_PRER_PRED_S;
+	writel_relaxed(prer, rtc->base + STM32_RTC_PRER);
+	prer |= (pred_a << STM32_RTC_PRER_PRED_A_SHIFT) & STM32_RTC_PRER_PRED_A;
+	writel_relaxed(prer, rtc->base + STM32_RTC_PRER);
+
+	/* Force 24h time format */
+	cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+	cr &= ~STM32_RTC_CR_FMT;
+	writel_relaxed(cr, rtc->base + STM32_RTC_CR);
+
+	stm32_rtc_exit_init_mode(rtc);
+
+	ret = stm32_rtc_wait_sync(rtc);
+end:
+	stm32_rtc_wpr_lock(rtc);
+
+	return ret;
+}
+
+static int stm32_rtc_probe(struct platform_device *pdev)
+{
+	struct stm32_rtc *rtc;
+	struct resource *res;
+	int ret;
+
+	rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL);
+	if (!rtc)
+		return -ENOMEM;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	rtc->base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(rtc->base))
+		return PTR_ERR(rtc->base);
+
+	rtc->dbp = syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
+						   "st,syscfg");
+	if (IS_ERR(rtc->dbp)) {
+		dev_err(&pdev->dev, "no st,syscfg\n");
+		return PTR_ERR(rtc->dbp);
+	}
+
+	rtc->ck_rtc = devm_clk_get(&pdev->dev, NULL);
+	if (IS_ERR(rtc->ck_rtc)) {
+		dev_err(&pdev->dev, "no ck_rtc clock\n");
+		return PTR_ERR(rtc->ck_rtc);
+	}
+
+	ret = clk_prepare_enable(rtc->ck_rtc);
+	if (ret)
+		return ret;
+
+	regmap_update_bits(rtc->dbp, PWR_CR, PWR_CR_DBP, PWR_CR_DBP);
+
+	/*
+	 * After a system reset, RTC_ISR.INITS flag can be read to check if
+	 * the calendar has been initialized or not. INITS flag is reset by a
+	 * power-on reset (no vbat, no power-supply). It is not reset if
+	 * ck_rtc parent clock has changed (so RTC prescalers need to be
+	 * changed). That's why we cannot rely on this flag to know if RTC
+	 * init has to be done.
+	 */
+	ret = stm32_rtc_init(pdev, rtc);
+	if (ret)
+		goto err;
+
+	rtc->irq_alarm = platform_get_irq(pdev, 0);
+	if (rtc->irq_alarm <= 0) {
+		dev_err(&pdev->dev, "no alarm irq\n");
+		ret = rtc->irq_alarm;
+		goto err;
+	}
+
+	platform_set_drvdata(pdev, rtc);
+
+	ret = device_init_wakeup(&pdev->dev, true);
+	if (ret)
+		dev_warn(&pdev->dev,
+			 "alarm won't be able to wake up the system\n");
+
+	rtc->rtc_dev = devm_rtc_device_register(&pdev->dev, pdev->name,
+			&stm32_rtc_ops, THIS_MODULE);
+	if (IS_ERR(rtc->rtc_dev)) {
+		ret = PTR_ERR(rtc->rtc_dev);
+		dev_err(&pdev->dev, "rtc device registration failed, err=%d\n",
+			ret);
+		goto err;
+	}
+
+	/* Handle RTC alarm interrupts */
+	ret = devm_request_threaded_irq(&pdev->dev, rtc->irq_alarm, NULL,
+					stm32_rtc_alarm_irq,
+					IRQF_TRIGGER_RISING | IRQF_ONESHOT,
+					pdev->name, rtc);
+	if (ret) {
+		dev_err(&pdev->dev, "IRQ%d (alarm interrupt) already claimed\n",
+			rtc->irq_alarm);
+		goto err;
+	}
+
+	/*
+	 * If INITS flag is reset (calendar year field set to 0x00), calendar
+	 * must be initialized
+	 */
+	if (!(readl_relaxed(rtc->base + STM32_RTC_ISR) & STM32_RTC_ISR_INITS))
+		dev_warn(&pdev->dev, "Date/Time must be initialized\n");
+
+	return 0;
+err:
+	clk_disable_unprepare(rtc->ck_rtc);
+
+	regmap_update_bits(rtc->dbp, PWR_CR, PWR_CR_DBP, 0);
+
+	device_init_wakeup(&pdev->dev, false);
+
+	return ret;
+}
+
+static int stm32_rtc_remove(struct platform_device *pdev)
+{
+	struct stm32_rtc *rtc = platform_get_drvdata(pdev);
+	unsigned int cr;
+
+	/* Disable interrupts */
+	stm32_rtc_wpr_unlock(rtc);
+	cr = readl_relaxed(rtc->base + STM32_RTC_CR);
+	cr &= ~STM32_RTC_CR_ALRAIE;
+	writel_relaxed(cr, rtc->base + STM32_RTC_CR);
+	stm32_rtc_wpr_lock(rtc);
+
+	clk_disable_unprepare(rtc->ck_rtc);
+
+	/* Enable backup domain write protection */
+	regmap_update_bits(rtc->dbp, PWR_CR, PWR_CR_DBP, 0);
+
+	device_init_wakeup(&pdev->dev, false);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int stm32_rtc_suspend(struct device *dev)
+{
+	struct stm32_rtc *rtc = dev_get_drvdata(dev);
+
+	if (device_may_wakeup(dev))
+		return enable_irq_wake(rtc->irq_alarm);
+
+	return 0;
+}
+
+static int stm32_rtc_resume(struct device *dev)
+{
+	struct stm32_rtc *rtc = dev_get_drvdata(dev);
+	int ret = 0;
+
+	ret = stm32_rtc_wait_sync(rtc);
+	if (ret < 0)
+		return ret;
+
+	if (device_may_wakeup(dev))
+		return disable_irq_wake(rtc->irq_alarm);
+
+	return ret;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(stm32_rtc_pm_ops,
+			 stm32_rtc_suspend, stm32_rtc_resume);
+
+static struct platform_driver stm32_rtc_driver = {
+	.probe		= stm32_rtc_probe,
+	.remove		= stm32_rtc_remove,
+	.driver		= {
+		.name	= DRIVER_NAME,
+		.pm	= &stm32_rtc_pm_ops,
+		.of_match_table = stm32_rtc_of_match,
+	},
+};
+
+module_platform_driver(stm32_rtc_driver);
+
+MODULE_ALIAS("platform:" DRIVER_NAME);
+MODULE_AUTHOR("Amelie Delaunay <amelie.delaunay@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics STM32 Real Time Clock driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/rtc/rtc-sun6i.c b/drivers/rtc/rtc-sun6i.c
index c169a2c..39cbc12 100644
--- a/drivers/rtc/rtc-sun6i.c
+++ b/drivers/rtc/rtc-sun6i.c
@@ -20,6 +20,8 @@
  * more details.
  */
 
+#include <linux/clk.h>
+#include <linux/clk-provider.h>
 #include <linux/delay.h>
 #include <linux/err.h>
 #include <linux/fs.h>
@@ -33,15 +35,20 @@
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/rtc.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 
 /* Control register */
 #define SUN6I_LOSC_CTRL				0x0000
+#define SUN6I_LOSC_CTRL_KEY			(0x16aa << 16)
 #define SUN6I_LOSC_CTRL_ALM_DHMS_ACC		BIT(9)
 #define SUN6I_LOSC_CTRL_RTC_HMS_ACC		BIT(8)
 #define SUN6I_LOSC_CTRL_RTC_YMD_ACC		BIT(7)
+#define SUN6I_LOSC_CTRL_EXT_OSC			BIT(0)
 #define SUN6I_LOSC_CTRL_ACC_MASK		GENMASK(9, 7)
 
+#define SUN6I_LOSC_CLK_PRESCAL			0x0008
+
 /* RTC */
 #define SUN6I_RTC_YMD				0x0010
 #define SUN6I_RTC_HMS				0x0014
@@ -114,13 +121,141 @@ struct sun6i_rtc_dev {
 	void __iomem *base;
 	int irq;
 	unsigned long alarm;
+
+	struct clk_hw hw;
+	struct clk_hw *int_osc;
+	struct clk *losc;
+
+	spinlock_t lock;
 };
 
+static struct sun6i_rtc_dev *sun6i_rtc;
+
+static unsigned long sun6i_rtc_osc_recalc_rate(struct clk_hw *hw,
+					       unsigned long parent_rate)
+{
+	struct sun6i_rtc_dev *rtc = container_of(hw, struct sun6i_rtc_dev, hw);
+	u32 val;
+
+	val = readl(rtc->base + SUN6I_LOSC_CTRL);
+	if (val & SUN6I_LOSC_CTRL_EXT_OSC)
+		return parent_rate;
+
+	val = readl(rtc->base + SUN6I_LOSC_CLK_PRESCAL);
+	val &= GENMASK(4, 0);
+
+	return parent_rate / (val + 1);
+}
+
+static u8 sun6i_rtc_osc_get_parent(struct clk_hw *hw)
+{
+	struct sun6i_rtc_dev *rtc = container_of(hw, struct sun6i_rtc_dev, hw);
+
+	return readl(rtc->base + SUN6I_LOSC_CTRL) & SUN6I_LOSC_CTRL_EXT_OSC;
+}
+
+static int sun6i_rtc_osc_set_parent(struct clk_hw *hw, u8 index)
+{
+	struct sun6i_rtc_dev *rtc = container_of(hw, struct sun6i_rtc_dev, hw);
+	unsigned long flags;
+	u32 val;
+
+	if (index > 1)
+		return -EINVAL;
+
+	spin_lock_irqsave(&rtc->lock, flags);
+	val = readl(rtc->base + SUN6I_LOSC_CTRL);
+	val &= ~SUN6I_LOSC_CTRL_EXT_OSC;
+	val |= SUN6I_LOSC_CTRL_KEY;
+	val |= index ? SUN6I_LOSC_CTRL_EXT_OSC : 0;
+	writel(val, rtc->base + SUN6I_LOSC_CTRL);
+	spin_unlock_irqrestore(&rtc->lock, flags);
+
+	return 0;
+}
+
+static const struct clk_ops sun6i_rtc_osc_ops = {
+	.recalc_rate	= sun6i_rtc_osc_recalc_rate,
+
+	.get_parent	= sun6i_rtc_osc_get_parent,
+	.set_parent	= sun6i_rtc_osc_set_parent,
+};
+
+static void __init sun6i_rtc_clk_init(struct device_node *node)
+{
+	struct clk_hw_onecell_data *clk_data;
+	struct sun6i_rtc_dev *rtc;
+	struct clk_init_data init = {
+		.ops		= &sun6i_rtc_osc_ops,
+	};
+	const char *parents[2];
+
+	rtc = kzalloc(sizeof(*rtc), GFP_KERNEL);
+	if (!rtc)
+		return;
+	spin_lock_init(&rtc->lock);
+
+	clk_data = kzalloc(sizeof(*clk_data) + sizeof(*clk_data->hws),
+			   GFP_KERNEL);
+	if (!clk_data)
+		return;
+
+	rtc->base = of_io_request_and_map(node, 0, of_node_full_name(node));
+	if (IS_ERR(rtc->base)) {
+		pr_crit("Can't map RTC registers\n");
+		return;
+	}
+
+	/* Switch to the external, more precise, oscillator */
+	writel(SUN6I_LOSC_CTRL_KEY | SUN6I_LOSC_CTRL_EXT_OSC,
+	       rtc->base + SUN6I_LOSC_CTRL);
+
+	/* Yes, I know, this is ugly. */
+	sun6i_rtc = rtc;
+
+	/* Deal with old DTs */
+	if (!of_get_property(node, "clocks", NULL))
+		return;
+
+	rtc->int_osc = clk_hw_register_fixed_rate_with_accuracy(NULL,
+								"rtc-int-osc",
+								NULL, 0,
+								667000,
+								300000000);
+	if (IS_ERR(rtc->int_osc)) {
+		pr_crit("Couldn't register the internal oscillator\n");
+		return;
+	}
+
+	parents[0] = clk_hw_get_name(rtc->int_osc);
+	parents[1] = of_clk_get_parent_name(node, 0);
+
+	rtc->hw.init = &init;
+
+	init.parent_names = parents;
+	init.num_parents = of_clk_get_parent_count(node) + 1;
+	of_property_read_string(node, "clock-output-names", &init.name);
+
+	rtc->losc = clk_register(NULL, &rtc->hw);
+	if (IS_ERR(rtc->losc)) {
+		pr_crit("Couldn't register the LOSC clock\n");
+		return;
+	}
+
+	clk_data->num = 1;
+	clk_data->hws[0] = &rtc->hw;
+	of_clk_add_hw_provider(node, of_clk_hw_onecell_get, clk_data);
+}
+CLK_OF_DECLARE_DRIVER(sun6i_rtc_clk, "allwinner,sun6i-a31-rtc",
+		      sun6i_rtc_clk_init);
+
 static irqreturn_t sun6i_rtc_alarmirq(int irq, void *id)
 {
 	struct sun6i_rtc_dev *chip = (struct sun6i_rtc_dev *) id;
+	irqreturn_t ret = IRQ_NONE;
 	u32 val;
 
+	spin_lock(&chip->lock);
 	val = readl(chip->base + SUN6I_ALRM_IRQ_STA);
 
 	if (val & SUN6I_ALRM_IRQ_STA_CNT_IRQ_PEND) {
@@ -129,10 +265,11 @@ static irqreturn_t sun6i_rtc_alarmirq(int irq, void *id)
 
 		rtc_update_irq(chip->rtc, 1, RTC_AF | RTC_IRQF);
 
-		return IRQ_HANDLED;
+		ret = IRQ_HANDLED;
 	}
+	spin_unlock(&chip->lock);
 
-	return IRQ_NONE;
+	return ret;
 }
 
 static void sun6i_rtc_setaie(int to, struct sun6i_rtc_dev *chip)
@@ -140,6 +277,7 @@ static void sun6i_rtc_setaie(int to, struct sun6i_rtc_dev *chip)
 	u32 alrm_val = 0;
 	u32 alrm_irq_val = 0;
 	u32 alrm_wake_val = 0;
+	unsigned long flags;
 
 	if (to) {
 		alrm_val = SUN6I_ALRM_EN_CNT_EN;
@@ -150,9 +288,11 @@ static void sun6i_rtc_setaie(int to, struct sun6i_rtc_dev *chip)
 		       chip->base + SUN6I_ALRM_IRQ_STA);
 	}
 
+	spin_lock_irqsave(&chip->lock, flags);
 	writel(alrm_val, chip->base + SUN6I_ALRM_EN);
 	writel(alrm_irq_val, chip->base + SUN6I_ALRM_IRQ_EN);
 	writel(alrm_wake_val, chip->base + SUN6I_ALARM_CONFIG);
+	spin_unlock_irqrestore(&chip->lock, flags);
 }
 
 static int sun6i_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm)
@@ -191,11 +331,15 @@ static int sun6i_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm)
 static int sun6i_rtc_getalarm(struct device *dev, struct rtc_wkalrm *wkalrm)
 {
 	struct sun6i_rtc_dev *chip = dev_get_drvdata(dev);
+	unsigned long flags;
 	u32 alrm_st;
 	u32 alrm_en;
 
+	spin_lock_irqsave(&chip->lock, flags);
 	alrm_en = readl(chip->base + SUN6I_ALRM_IRQ_EN);
 	alrm_st = readl(chip->base + SUN6I_ALRM_IRQ_STA);
+	spin_unlock_irqrestore(&chip->lock, flags);
+
 	wkalrm->enabled = !!(alrm_en & SUN6I_ALRM_EN_CNT_EN);
 	wkalrm->pending = !!(alrm_st & SUN6I_ALRM_EN_CNT_EN);
 	rtc_time_to_tm(chip->alarm, &wkalrm->time);
@@ -349,22 +493,15 @@ static const struct rtc_class_ops sun6i_rtc_ops = {
 
 static int sun6i_rtc_probe(struct platform_device *pdev)
 {
-	struct sun6i_rtc_dev *chip;
-	struct resource *res;
+	struct sun6i_rtc_dev *chip = sun6i_rtc;
 	int ret;
 
-	chip = devm_kzalloc(&pdev->dev, sizeof(*chip), GFP_KERNEL);
 	if (!chip)
-		return -ENOMEM;
+		return -ENODEV;
 
 	platform_set_drvdata(pdev, chip);
 	chip->dev = &pdev->dev;
 
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	chip->base = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(chip->base))
-		return PTR_ERR(chip->base);
-
 	chip->irq = platform_get_irq(pdev, 0);
 	if (chip->irq < 0) {
 		dev_err(&pdev->dev, "No IRQ resource\n");
@@ -404,8 +541,10 @@ static int sun6i_rtc_probe(struct platform_device *pdev)
 	/* disable alarm wakeup */
 	writel(0, chip->base + SUN6I_ALARM_CONFIG);
 
-	chip->rtc = rtc_device_register("rtc-sun6i", &pdev->dev,
-					&sun6i_rtc_ops, THIS_MODULE);
+	clk_prepare_enable(chip->losc);
+
+	chip->rtc = devm_rtc_device_register(&pdev->dev, "rtc-sun6i",
+					     &sun6i_rtc_ops, THIS_MODULE);
 	if (IS_ERR(chip->rtc)) {
 		dev_err(&pdev->dev, "unable to register device\n");
 		return PTR_ERR(chip->rtc);
@@ -416,15 +555,6 @@ static int sun6i_rtc_probe(struct platform_device *pdev)
 	return 0;
 }
 
-static int sun6i_rtc_remove(struct platform_device *pdev)
-{
-	struct sun6i_rtc_dev *chip = platform_get_drvdata(pdev);
-
-	rtc_device_unregister(chip->rtc);
-
-	return 0;
-}
-
 static const struct of_device_id sun6i_rtc_dt_ids[] = {
 	{ .compatible = "allwinner,sun6i-a31-rtc" },
 	{ /* sentinel */ },
@@ -433,15 +563,9 @@ MODULE_DEVICE_TABLE(of, sun6i_rtc_dt_ids);
 
 static struct platform_driver sun6i_rtc_driver = {
 	.probe		= sun6i_rtc_probe,
-	.remove		= sun6i_rtc_remove,
 	.driver		= {
 		.name		= "sun6i-rtc",
 		.of_match_table = sun6i_rtc_dt_ids,
 	},
 };
-
-module_platform_driver(sun6i_rtc_driver);
-
-MODULE_DESCRIPTION("sun6i RTC driver");
-MODULE_AUTHOR("Chen-Yu Tsai <wens@csie.org>");
-MODULE_LICENSE("GPL");
+builtin_platform_driver(sun6i_rtc_driver);
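
The sun6i rework splits the driver in two stages: CLK_OF_DECLARE_DRIVER registers the oscillator mux at early boot, when other clock providers need it, while (unlike plain CLK_OF_DECLARE) still leaving the node eligible for platform device creation, so the RTC half can bind later and pick up the early state via the file-scope sun6i_rtc pointer. An early clock provider can never be unloaded, hence the switch to builtin_platform_driver(). A hedged sketch of the handoff, with hypothetical names:

#include <linux/clk-provider.h>
#include <linux/of.h>
#include <linux/of_address.h>
#include <linux/platform_device.h>
#include <linux/slab.h>

struct foo_state {
	void __iomem *base;
};

static struct foo_state *foo_early_state;

static void __init foo_clk_init(struct device_node *node)
{
	struct foo_state *st = kzalloc(sizeof(*st), GFP_KERNEL);

	if (!st)
		return;
	st->base = of_iomap(node, 0);
	/* ...register clk_hw providers against st->base here... */
	foo_early_state = st;	/* hand off to the platform driver */
}
CLK_OF_DECLARE_DRIVER(foo_clk, "vendor,foo-rtc", foo_clk_init);

static int foo_probe(struct platform_device *pdev)
{
	struct foo_state *st = foo_early_state;

	if (!st)
		return -ENODEV;	/* early init never ran */
	platform_set_drvdata(pdev, st);
	return 0;
}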
diff --git a/drivers/rtc/rtc-tegra.c b/drivers/rtc/rtc-tegra.c
index 3853ba9..d30d57b 100644
--- a/drivers/rtc/rtc-tegra.c
+++ b/drivers/rtc/rtc-tegra.c
@@ -17,16 +17,18 @@
  * with this program; if not, write to the Free Software Foundation, Inc.,
  * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/irq.h>
-#include <linux/io.h>
+
+#include <linux/clk.h>
 #include <linux/delay.h>
-#include <linux/rtc.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/pm.h>
+#include <linux/rtc.h>
+#include <linux/slab.h>
 
 /* set to 1 = busy every eight 32kHz clocks during copy of sec+msec to AHB */
 #define TEGRA_RTC_REG_BUSY			0x004
@@ -59,6 +61,7 @@ struct tegra_rtc_info {
 	struct platform_device	*pdev;
 	struct rtc_device	*rtc_dev;
 	void __iomem		*rtc_base; /* NULL if not initialized. */
+	struct clk		*clk;
 	int			tegra_rtc_irq; /* alarm and periodic irq */
 	spinlock_t		tegra_rtc_lock;
 };
@@ -326,6 +329,14 @@ static int __init tegra_rtc_probe(struct platform_device *pdev)
 	if (info->tegra_rtc_irq <= 0)
 		return -EBUSY;
 
+	info->clk = devm_clk_get(&pdev->dev, NULL);
+	if (IS_ERR(info->clk))
+		return PTR_ERR(info->clk);
+
+	ret = clk_prepare_enable(info->clk);
+	if (ret < 0)
+		return ret;
+
 	/* set context info. */
 	info->pdev = pdev;
 	spin_lock_init(&info->tegra_rtc_lock);
@@ -346,7 +357,7 @@ static int __init tegra_rtc_probe(struct platform_device *pdev)
 		ret = PTR_ERR(info->rtc_dev);
 		dev_err(&pdev->dev, "Unable to register device (err=%d).\n",
 			ret);
-		return ret;
+		goto disable_clk;
 	}
 
 	ret = devm_request_irq(&pdev->dev, info->tegra_rtc_irq,
@@ -356,12 +367,25 @@ static int __init tegra_rtc_probe(struct platform_device *pdev)
 		dev_err(&pdev->dev,
 			"Unable to request interrupt for device (err=%d).\n",
 			ret);
-		return ret;
+		goto disable_clk;
 	}
 
 	dev_notice(&pdev->dev, "Tegra internal Real Time Clock\n");
 
 	return 0;
+
+disable_clk:
+	clk_disable_unprepare(info->clk);
+	return ret;
+}
+
+static int tegra_rtc_remove(struct platform_device *pdev)
+{
+	struct tegra_rtc_info *info = platform_get_drvdata(pdev);
+
+	clk_disable_unprepare(info->clk);
+
+	return 0;
 }
 
 #ifdef CONFIG_PM_SLEEP
@@ -413,6 +437,7 @@ static void tegra_rtc_shutdown(struct platform_device *pdev)
 
 MODULE_ALIAS("platform:tegra_rtc");
 static struct platform_driver tegra_rtc_driver = {
+	.remove		= tegra_rtc_remove,
 	.shutdown	= tegra_rtc_shutdown,
 	.driver		= {
 		.name	= "tegra_rtc",
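
The clock handling added to the Tegra probe follows the common
acquire/enable/unwind pattern: devm_clk_get() needs no explicit put, but
clk_prepare_enable() must be undone on every later failure path and in
remove. A minimal sketch of the pattern under those assumptions
(do_more_probing() is a hypothetical stand-in for the rest of probe):

	static int example_probe(struct platform_device *pdev)
	{
		struct clk *clk;
		int ret;

		clk = devm_clk_get(&pdev->dev, NULL);	/* managed; no explicit put */
		if (IS_ERR(clk))
			return PTR_ERR(clk);

		ret = clk_prepare_enable(clk);		/* prepare + enable in one call */
		if (ret < 0)
			return ret;

		ret = do_more_probing(pdev);	/* hypothetical remainder of probe */
		if (ret)
			goto disable_clk;	/* enables must be unwound by hand */

		return 0;

	disable_clk:
		clk_disable_unprepare(clk);
		return ret;
	}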
diff --git a/drivers/rtc/rtc-tps65910.c b/drivers/rtc/rtc-tps65910.c
index 5a3d53c..d0244d7 100644
--- a/drivers/rtc/rtc-tps65910.c
+++ b/drivers/rtc/rtc-tps65910.c
@@ -21,6 +21,7 @@
 #include <linux/types.h>
 #include <linux/rtc.h>
 #include <linux/bcd.h>
+#include <linux/math64.h>
 #include <linux/platform_device.h>
 #include <linux/interrupt.h>
 #include <linux/mfd/tps65910.h>
@@ -33,7 +34,21 @@ struct tps65910_rtc {
 /* Total number of RTC registers needed to set time*/
 #define NUM_TIME_REGS	(TPS65910_YEARS - TPS65910_SECONDS + 1)
 
-static int tps65910_rtc_alarm_irq_enable(struct device *dev, unsigned enabled)
+/* Total number of RTC registers needed to set compensation registers */
+#define NUM_COMP_REGS	(TPS65910_RTC_COMP_MSB - TPS65910_RTC_COMP_LSB + 1)
+
+/* Min and max values supported with 'offset' interface (swapped sign) */
+#define MIN_OFFSET	(-277761)
+#define MAX_OFFSET	(277778)
+
+/* Number of ticks per hour */
+#define TICKS_PER_HOUR	(32768 * 3600)
+
+/* Multiplier for ppb conversions */
+#define PPB_MULT	(1000000000LL)
+
+static int tps65910_rtc_alarm_irq_enable(struct device *dev,
+					 unsigned int enabled)
 {
 	struct tps65910 *tps = dev_get_drvdata(dev->parent);
 	u8 val = 0;
@@ -187,6 +202,133 @@ static int tps65910_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
 	return ret;
 }
 
+static int tps65910_rtc_set_calibration(struct device *dev, int calibration)
+{
+	unsigned char comp_data[NUM_COMP_REGS];
+	struct tps65910 *tps = dev_get_drvdata(dev->parent);
+	s16 value;
+	int ret;
+
+	/*
+	 * TPS65910 uses a two's complement 16 bit value to compensate for
+	 * RTC crystal inaccuracies. Once every hour, when the seconds
+	 * counter increments from 0 to 1, the compensation value is added
+	 * to the internal RTC counter value.
+	 *
+	 * The compensation value 0x7FFF is prohibited.
+	 *
+	 * Valid range for the compensation value: [-32768 .. 32766]
+	 */
+	if ((calibration < -32768) || (calibration > 32766)) {
+		dev_err(dev, "RTC calibration value out of range: %d\n",
+			calibration);
+		return -EINVAL;
+	}
+
+	value = (s16)calibration;
+
+	comp_data[0] = (u16)value & 0xFF;
+	comp_data[1] = ((u16)value >> 8) & 0xFF;
+
+	/* Update all the compensation registers in one shot */
+	ret = regmap_bulk_write(tps->regmap, TPS65910_RTC_COMP_LSB,
+		comp_data, NUM_COMP_REGS);
+	if (ret < 0) {
+		dev_err(dev, "rtc_set_calibration error: %d\n", ret);
+		return ret;
+	}
+
+	/* Enable automatic compensation */
+	ret = regmap_update_bits(tps->regmap, TPS65910_RTC_CTRL,
+		TPS65910_RTC_CTRL_AUTO_COMP, TPS65910_RTC_CTRL_AUTO_COMP);
+	if (ret < 0)
+		dev_err(dev, "auto_comp enable failed with error: %d\n", ret);
+
+	return ret;
+}
+
+static int tps65910_rtc_get_calibration(struct device *dev, int *calibration)
+{
+	unsigned char comp_data[NUM_COMP_REGS];
+	struct tps65910 *tps = dev_get_drvdata(dev->parent);
+	unsigned int ctrl;
+	u16 value;
+	int ret;
+
+	ret = regmap_read(tps->regmap, TPS65910_RTC_CTRL, &ctrl);
+	if (ret < 0)
+		return ret;
+
+	/* If automatic compensation is not enabled, report back zero */
+	if (!(ctrl & TPS65910_RTC_CTRL_AUTO_COMP)) {
+		*calibration = 0;
+		return 0;
+	}
+
+	ret = regmap_bulk_read(tps->regmap, TPS65910_RTC_COMP_LSB, comp_data,
+		NUM_COMP_REGS);
+	if (ret < 0) {
+		dev_err(dev, "rtc_get_calibration error: %d\n", ret);
+		return ret;
+	}
+
+	value = (u16)comp_data[0] | ((u16)comp_data[1] << 8);
+
+	*calibration = (s16)value;
+
+	return 0;
+}
+
+static int tps65910_read_offset(struct device *dev, long *offset)
+{
+	int calibration;
+	s64 tmp;
+	int ret;
+
+	ret = tps65910_rtc_get_calibration(dev, &calibration);
+	if (ret < 0)
+		return ret;
+
+	/* Convert from RTC calibration register format to ppb format */
+	tmp = calibration * (s64)PPB_MULT;
+	if (tmp < 0)
+		tmp -= TICKS_PER_HOUR / 2LL;
+	else
+		tmp += TICKS_PER_HOUR / 2LL;
+	tmp = div_s64(tmp, TICKS_PER_HOUR);
+
+	/* The offset works against the calibration value, so swap the sign */
+	*offset = (long)-tmp;
+
+	return 0;
+}
+
+static int tps65910_set_offset(struct device *dev, long offset)
+{
+	int calibration;
+	s64 tmp;
+	int ret;
+
+	/* Make sure offset value is within supported range */
+	if (offset < MIN_OFFSET || offset > MAX_OFFSET)
+		return -ERANGE;
+
+	/* Convert from ppb format to RTC calibration register format */
+	tmp = offset * (s64)TICKS_PER_HOUR;
+	if (tmp < 0)
+		tmp -= PPB_MULT / 2LL;
+	else
+		tmp += PPB_MULT / 2LL;
+	tmp = div_s64(tmp, PPB_MULT);
+
+	/* The offset works against the calibration value, so swap the sign */
+	calibration = (int)-tmp;
+
+	ret = tps65910_rtc_set_calibration(dev, calibration);
+
+	return ret;
+}
+
 static irqreturn_t tps65910_rtc_interrupt(int irq, void *rtc)
 {
 	struct device *dev = rtc;
@@ -219,6 +361,8 @@ static const struct rtc_class_ops tps65910_rtc_ops = {
 	.read_alarm	= tps65910_rtc_read_alarm,
 	.set_alarm	= tps65910_rtc_set_alarm,
 	.alarm_irq_enable = tps65910_rtc_alarm_irq_enable,
+	.read_offset	= tps65910_read_offset,
+	.set_offset	= tps65910_set_offset,
 };
 
 static int tps65910_rtc_probe(struct platform_device *pdev)
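
For reference on the arithmetic above: one compensation tick per hour
corresponds to PPB_MULT / TICKS_PER_HOUR = 10^9 / 117964800, roughly 8.477
ppb, which is also where the MIN_OFFSET/MAX_OFFSET bounds come from (the
sign-swapped ppb equivalents of calibration values 32766 and -32768). A
standalone sketch of the same round-to-nearest conversion (hypothetical
userspace code, not part of the driver):

	#include <stdio.h>

	#define TICKS_PER_HOUR	(32768LL * 3600)	/* 117964800 crystal ticks */
	#define PPB_MULT	1000000000LL

	/* requested offset in ppb -> RTC calibration register value */
	static long long offset_to_calibration(long long offset)
	{
		long long tmp = offset * TICKS_PER_HOUR;

		/* round to nearest rather than truncating toward zero */
		tmp += (tmp < 0) ? -PPB_MULT / 2 : PPB_MULT / 2;

		/* calibration works against the offset, so swap the sign */
		return -(tmp / PPB_MULT);
	}

	int main(void)
	{
		/* one tick per hour is ~8.477 ppb, so -8 ppb maps to 1 tick */
		printf("%lld\n", offset_to_calibration(-8));		/* 1 */
		printf("%lld\n", offset_to_calibration(277778));	/* -32768 */
		return 0;
	}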
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 0f17137..0b38217 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -4864,7 +4864,7 @@ static void dasd_eckd_dump_sense_tcw(struct dasd_device *device,
 			break;
 		case 3: /* tsa_intrg */
 			len += sprintf(page + len, PRINTK_HEADER
-				      " tsb->tsa.intrg.: not supportet yet\n");
+				      " tsb->tsa.intrg.: not supported yet\n");
 			break;
 		}
 
diff --git a/drivers/s390/cio/ioasm.c b/drivers/s390/cio/ioasm.c
index 8225da6..4182f60 100644
--- a/drivers/s390/cio/ioasm.c
+++ b/drivers/s390/cio/ioasm.c
@@ -165,13 +165,15 @@ int tpi(struct tpi_info *addr)
 int chsc(void *chsc_area)
 {
 	typedef struct { char _[4096]; } addr_type;
-	int cc;
+	int cc = -EIO;
 
 	asm volatile(
 		"	.insn	rre,0xb25f0000,%2,0\n"
-		"	ipm	%0\n"
+		"0:	ipm	%0\n"
 		"	srl	%0,28\n"
-		: "=d" (cc), "=m" (*(addr_type *) chsc_area)
+		"1:\n"
+		EX_TABLE(0b, 1b)
+		: "+d" (cc), "=m" (*(addr_type *) chsc_area)
 		: "d" (chsc_area), "m" (*(addr_type *) chsc_area)
 		: "cc");
 	trace_s390_cio_chsc(chsc_area, cc);
diff --git a/drivers/s390/crypto/Makefile b/drivers/s390/crypto/Makefile
index 0a7fb83..be36f10 100644
--- a/drivers/s390/crypto/Makefile
+++ b/drivers/s390/crypto/Makefile
@@ -10,3 +10,7 @@
 obj-$(CONFIG_ZCRYPT) += zcrypt.o
 # adapter drivers depend on ap.o and zcrypt.o
 obj-$(CONFIG_ZCRYPT) += zcrypt_pcixcc.o zcrypt_cex2a.o zcrypt_cex4.o
+
+# pkey kernel module
+pkey-objs := pkey_api.o
+obj-$(CONFIG_PKEY) += pkey.o
diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index 56db76c..9be4596 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -1107,16 +1107,6 @@ static void ap_config_timeout(unsigned long ptr)
 	queue_work(system_long_wq, &ap_scan_work);
 }
 
-static void ap_reset_domain(void)
-{
-	int i;
-
-	if (ap_domain_index == -1 || !ap_test_config_domain(ap_domain_index))
-		return;
-	for (i = 0; i < AP_DEVICES; i++)
-		ap_rapq(AP_MKQID(i, ap_domain_index));
-}
-
 static void ap_reset_all(void)
 {
 	int i, j;
diff --git a/drivers/s390/crypto/ap_card.c b/drivers/s390/crypto/ap_card.c
index 1cd9128..cfa161c 100644
--- a/drivers/s390/crypto/ap_card.c
+++ b/drivers/s390/crypto/ap_card.c
@@ -58,9 +58,9 @@ static ssize_t ap_functions_show(struct device *dev,
 
 static DEVICE_ATTR(ap_functions, 0444, ap_functions_show, NULL);
 
-static ssize_t ap_request_count_show(struct device *dev,
-				     struct device_attribute *attr,
-				     char *buf)
+static ssize_t ap_req_count_show(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
 {
 	struct ap_card *ac = to_ap_card(dev);
 	unsigned int req_cnt;
@@ -72,7 +72,23 @@ static ssize_t ap_request_count_show(struct device *dev,
 	return snprintf(buf, PAGE_SIZE, "%d\n", req_cnt);
 }
 
-static DEVICE_ATTR(request_count, 0444, ap_request_count_show, NULL);
+static ssize_t ap_req_count_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct ap_card *ac = to_ap_card(dev);
+	struct ap_queue *aq;
+
+	spin_lock_bh(&ap_list_lock);
+	for_each_ap_queue(aq, ac)
+		aq->total_request_count = 0;
+	spin_unlock_bh(&ap_list_lock);
+	atomic_set(&ac->total_request_count, 0);
+
+	return count;
+}
+
+static DEVICE_ATTR(request_count, 0644, ap_req_count_show, ap_req_count_store);
 
 static ssize_t ap_requestq_count_show(struct device *dev,
 				      struct device_attribute *attr, char *buf)
diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c
index 7be67fa..480c58a 100644
--- a/drivers/s390/crypto/ap_queue.c
+++ b/drivers/s390/crypto/ap_queue.c
@@ -459,9 +459,9 @@ EXPORT_SYMBOL(ap_queue_resume);
 /*
  * AP queue related attributes.
  */
-static ssize_t ap_request_count_show(struct device *dev,
-				     struct device_attribute *attr,
-				     char *buf)
+static ssize_t ap_req_count_show(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
 {
 	struct ap_queue *aq = to_ap_queue(dev);
 	unsigned int req_cnt;
@@ -472,7 +472,20 @@ static ssize_t ap_request_count_show(struct device *dev,
 	return snprintf(buf, PAGE_SIZE, "%d\n", req_cnt);
 }
 
-static DEVICE_ATTR(request_count, 0444, ap_request_count_show, NULL);
+static ssize_t ap_req_count_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct ap_queue *aq = to_ap_queue(dev);
+
+	spin_lock_bh(&aq->lock);
+	aq->total_request_count = 0;
+	spin_unlock_bh(&aq->lock);
+
+	return count;
+}
+
+static DEVICE_ATTR(request_count, 0644, ap_req_count_show, ap_req_count_store);
 
 static ssize_t ap_requestq_count_show(struct device *dev,
 				      struct device_attribute *attr, char *buf)
diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c
new file mode 100644
index 0000000..40f1136
--- /dev/null
+++ b/drivers/s390/crypto/pkey_api.c
@@ -0,0 +1,1148 @@
+/*
+ *  pkey device driver
+ *
+ *  Copyright IBM Corp. 2017
+ *  Author(s): Harald Freudenberger
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ */
+
+#define KMSG_COMPONENT "pkey"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kallsyms.h>
+#include <linux/debugfs.h>
+#include <asm/zcrypt.h>
+#include <asm/cpacf.h>
+#include <asm/pkey.h>
+
+#include "zcrypt_api.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("IBM Corporation");
+MODULE_DESCRIPTION("s390 protected key interface");
+
+/* Size of parameter block used for all cca requests/replies */
+#define PARMBSIZE 512
+
+/* Size of vardata block used for some of the cca requests/replies */
+#define VARDATASIZE 4096
+
+/*
+ * debug feature data and functions
+ */
+
+static debug_info_t *debug_info;
+
+#define DEBUG_DBG(...)	debug_sprintf_event(debug_info, 6, ##__VA_ARGS__)
+#define DEBUG_INFO(...) debug_sprintf_event(debug_info, 5, ##__VA_ARGS__)
+#define DEBUG_WARN(...) debug_sprintf_event(debug_info, 4, ##__VA_ARGS__)
+#define DEBUG_ERR(...)	debug_sprintf_event(debug_info, 3, ##__VA_ARGS__)
+
+static void __init pkey_debug_init(void)
+{
+	debug_info = debug_register("pkey", 1, 1, 4 * sizeof(long));
+	debug_register_view(debug_info, &debug_sprintf_view);
+	debug_set_level(debug_info, 3);
+}
+
+static void __exit pkey_debug_exit(void)
+{
+	debug_unregister(debug_info);
+}
+
+/* inside view of a secure key token (only type 0x01 version 0x04) */
+struct secaeskeytoken {
+	u8  type;     /* 0x01 for internal key token */
+	u8  res0[3];
+	u8  version;  /* should be 0x04 */
+	u8  res1[1];
+	u8  flag;     /* key flags */
+	u8  res2[1];
+	u64 mkvp;     /* master key verification pattern */
+	u8  key[32];  /* key value (encrypted) */
+	u8  cv[8];    /* control vector */
+	u16 bitsize;  /* key bit size */
+	u16 keysize;  /* key byte size */
+	u8  tvv[4];   /* token validation value */
+} __packed;
+
+/*
+ * Simple check if the token is a valid CCA secure AES key
+ * token. If keybitsize is given, the bitsize of the key is
+ * also checked. Returns 0 on success or errno value on failure.
+ */
+static int check_secaeskeytoken(u8 *token, int keybitsize)
+{
+	struct secaeskeytoken *t = (struct secaeskeytoken *) token;
+
+	if (t->type != 0x01) {
+		DEBUG_ERR(
+			"check_secaeskeytoken secure token check failed, type mismatch 0x%02x != 0x01\n",
+			(int) t->type);
+		return -EINVAL;
+	}
+	if (t->version != 0x04) {
+		DEBUG_ERR(
+			"check_secaeskeytoken secure token check failed, version mismatch 0x%02x != 0x04\n",
+			(int) t->version);
+		return -EINVAL;
+	}
+	if (keybitsize > 0 && t->bitsize != keybitsize) {
+		DEBUG_ERR(
+			"check_secaeskeytoken secure token check failed, bitsize mismatch %d != %d\n",
+			(int) t->bitsize, keybitsize);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate consecutive memory for request CPRB, request param
+ * block, reply CPRB and reply param block and fill in values
+ * for the common fields. Returns 0 on success or errno value
+ * on failure.
+ */
+static int alloc_and_prep_cprbmem(size_t paramblen,
+				  u8 **pcprbmem,
+				  struct CPRBX **preqCPRB,
+				  struct CPRBX **prepCPRB)
+{
+	u8 *cprbmem;
+	size_t cprbplusparamblen = sizeof(struct CPRBX) + paramblen;
+	struct CPRBX *preqcblk, *prepcblk;
+
+	/*
+	 * allocate consecutive memory for request CPRB, request param
+	 * block, reply CPRB and reply param block
+	 */
+	cprbmem = kmalloc(2 * cprbplusparamblen, GFP_KERNEL);
+	if (!cprbmem)
+		return -ENOMEM;
+	memset(cprbmem, 0, 2 * cprbplusparamblen);
+
+	preqcblk = (struct CPRBX *) cprbmem;
+	prepcblk = (struct CPRBX *) (cprbmem + cprbplusparamblen);
+
+	/* fill request cprb struct */
+	preqcblk->cprb_len = sizeof(struct CPRBX);
+	preqcblk->cprb_ver_id = 0x02;
+	memcpy(preqcblk->func_id, "T2", 2);
+	preqcblk->rpl_msgbl = cprbplusparamblen;
+	if (paramblen) {
+		preqcblk->req_parmb =
+			((u8 *) preqcblk) + sizeof(struct CPRBX);
+		preqcblk->rpl_parmb =
+			((u8 *) prepcblk) + sizeof(struct CPRBX);
+	}
+
+	*pcprbmem = cprbmem;
+	*preqCPRB = preqcblk;
+	*prepCPRB = prepcblk;
+
+	return 0;
+}
+
+/*
+ * Free the cprb memory allocated with the function above.
+ * If the scrub value is not zero, the memory is filled
+ * with zeros before freeing (useful if there was some
+ * clear key material in there).
+ */
+static void free_cprbmem(void *mem, size_t paramblen, int scrub)
+{
+	if (scrub)
+		memzero_explicit(mem, 2 * (sizeof(struct CPRBX) + paramblen));
+	kfree(mem);
+}
+
+/*
+ * Helper function to prepare the xcrb struct
+ */
+static inline void prep_xcrb(struct ica_xcRB *pxcrb,
+			     u16 cardnr,
+			     struct CPRBX *preqcblk,
+			     struct CPRBX *prepcblk)
+{
+	memset(pxcrb, 0, sizeof(*pxcrb));
+	pxcrb->agent_ID = 0x4341; /* 'CA' */
+	pxcrb->user_defined = (cardnr == 0xFFFF ? AUTOSELECT : cardnr);
+	pxcrb->request_control_blk_length =
+		preqcblk->cprb_len + preqcblk->req_parml;
+	pxcrb->request_control_blk_addr = (void *) preqcblk;
+	pxcrb->reply_control_blk_length = preqcblk->rpl_msgbl;
+	pxcrb->reply_control_blk_addr = (void *) prepcblk;
+}
+
+/*
+ * Helper function which calls zcrypt_send_cprb with the
+ * memory management segment adjusted to kernel space
+ * so that the copy_from_user calls within that
+ * function in fact copy from kernel space.
+ */
+static inline int _zcrypt_send_cprb(struct ica_xcRB *xcrb)
+{
+	int rc;
+	mm_segment_t old_fs = get_fs();
+
+	set_fs(KERNEL_DS);
+	rc = zcrypt_send_cprb(xcrb);
+	set_fs(old_fs);
+
+	return rc;
+}
+
+/*
+ * Generate (random) AES secure key.
+ */
+int pkey_genseckey(u16 cardnr, u16 domain,
+		   u32 keytype, struct pkey_seckey *seckey)
+{
+	int i, rc, keysize;
+	int seckeysize;
+	u8 *mem;
+	struct CPRBX *preqcblk, *prepcblk;
+	struct ica_xcRB xcrb;
+	struct kgreqparm {
+		u8  subfunc_code[2];
+		u16 rule_array_len;
+		struct lv1 {
+			u16 len;
+			char  key_form[8];
+			char  key_length[8];
+			char  key_type1[8];
+			char  key_type2[8];
+		} lv1;
+		struct lv2 {
+			u16 len;
+			struct keyid {
+				u16 len;
+				u16 attr;
+				u8  data[SECKEYBLOBSIZE];
+			} keyid[6];
+		} lv2;
+	} *preqparm;
+	struct kgrepparm {
+		u8  subfunc_code[2];
+		u16 rule_array_len;
+		struct lv3 {
+			u16 len;
+			u16 keyblocklen;
+			struct {
+				u16 toklen;
+				u16 tokattr;
+				u8  tok[0];
+				/* ... some more data ... */
+			} keyblock;
+		} lv3;
+	} *prepparm;
+
+	/* get already prepared memory for 2 cprbs with param block each */
+	rc = alloc_and_prep_cprbmem(PARMBSIZE, &mem, &preqcblk, &prepcblk);
+	if (rc)
+		return rc;
+
+	/* fill request cprb struct */
+	preqcblk->domain = domain;
+
+	/* fill request cprb param block with KG request */
+	preqparm = (struct kgreqparm *) preqcblk->req_parmb;
+	memcpy(preqparm->subfunc_code, "KG", 2);
+	preqparm->rule_array_len = sizeof(preqparm->rule_array_len);
+	preqparm->lv1.len = sizeof(struct lv1);
+	memcpy(preqparm->lv1.key_form,	 "OP      ", 8);
+	switch (keytype) {
+	case PKEY_KEYTYPE_AES_128:
+		keysize = 16;
+		memcpy(preqparm->lv1.key_length, "KEYLN16 ", 8);
+		break;
+	case PKEY_KEYTYPE_AES_192:
+		keysize = 24;
+		memcpy(preqparm->lv1.key_length, "KEYLN24 ", 8);
+		break;
+	case PKEY_KEYTYPE_AES_256:
+		keysize = 32;
+		memcpy(preqparm->lv1.key_length, "KEYLN32 ", 8);
+		break;
+	default:
+		DEBUG_ERR(
+			"pkey_genseckey unknown/unsupported keytype %d\n",
+			keytype);
+		rc = -EINVAL;
+		goto out;
+	}
+	memcpy(preqparm->lv1.key_type1,  "AESDATA ", 8);
+	preqparm->lv2.len = sizeof(struct lv2);
+	for (i = 0; i < 6; i++) {
+		preqparm->lv2.keyid[i].len = sizeof(struct keyid);
+		preqparm->lv2.keyid[i].attr = (i == 2 ? 0x30 : 0x10);
+	}
+	preqcblk->req_parml = sizeof(struct kgreqparm);
+
+	/* fill xcrb struct */
+	prep_xcrb(&xcrb, cardnr, preqcblk, prepcblk);
+
+	/* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
+	rc = _zcrypt_send_cprb(&xcrb);
+	if (rc) {
+		DEBUG_ERR(
+			"pkey_genseckey zcrypt_send_cprb (cardnr=%d domain=%d) failed with errno %d\n",
+			(int) cardnr, (int) domain, rc);
+		goto out;
+	}
+
+	/* check response returncode and reasoncode */
+	if (prepcblk->ccp_rtcode != 0) {
+		DEBUG_ERR(
+			"pkey_genseckey secure key generate failure, card response %d/%d\n",
+			(int) prepcblk->ccp_rtcode,
+			(int) prepcblk->ccp_rscode);
+		rc = -EIO;
+		goto out;
+	}
+
+	/* process response cprb param block */
+	prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+	prepparm = (struct kgrepparm *) prepcblk->rpl_parmb;
+
+	/* check length of the returned secure key token */
+	seckeysize = prepparm->lv3.keyblock.toklen
+		- sizeof(prepparm->lv3.keyblock.toklen)
+		- sizeof(prepparm->lv3.keyblock.tokattr);
+	if (seckeysize != SECKEYBLOBSIZE) {
+		DEBUG_ERR(
+			"pkey_genseckey secure token size mismatch %d != %d bytes\n",
+			seckeysize, SECKEYBLOBSIZE);
+		rc = -EIO;
+		goto out;
+	}
+
+	/* check secure key token */
+	rc = check_secaeskeytoken(prepparm->lv3.keyblock.tok, 8*keysize);
+	if (rc) {
+		rc = -EIO;
+		goto out;
+	}
+
+	/* copy the generated secure key token */
+	memcpy(seckey->seckey, prepparm->lv3.keyblock.tok, SECKEYBLOBSIZE);
+
+out:
+	free_cprbmem(mem, PARMBSIZE, 0);
+	return rc;
+}
+EXPORT_SYMBOL(pkey_genseckey);
+
+/*
+ * Generate an AES secure key from a given clear key value.
+ */
+int pkey_clr2seckey(u16 cardnr, u16 domain, u32 keytype,
+		    const struct pkey_clrkey *clrkey,
+		    struct pkey_seckey *seckey)
+{
+	int rc, keysize, seckeysize;
+	u8 *mem;
+	struct CPRBX *preqcblk, *prepcblk;
+	struct ica_xcRB xcrb;
+	struct cmreqparm {
+		u8  subfunc_code[2];
+		u16 rule_array_len;
+		char  rule_array[8];
+		struct lv1 {
+			u16 len;
+			u8  clrkey[0];
+		} lv1;
+		struct lv2 {
+			u16 len;
+			struct keyid {
+				u16 len;
+				u16 attr;
+				u8  data[SECKEYBLOBSIZE];
+			} keyid;
+		} lv2;
+	} *preqparm;
+	struct lv2 *plv2;
+	struct cmrepparm {
+		u8  subfunc_code[2];
+		u16 rule_array_len;
+		struct lv3 {
+			u16 len;
+			u16 keyblocklen;
+			struct {
+				u16 toklen;
+				u16 tokattr;
+				u8  tok[0];
+				/* ... some more data ... */
+			} keyblock;
+		} lv3;
+	} *prepparm;
+
+	/* get already prepared memory for 2 cprbs with param block each */
+	rc = alloc_and_prep_cprbmem(PARMBSIZE, &mem, &preqcblk, &prepcblk);
+	if (rc)
+		return rc;
+
+	/* fill request cprb struct */
+	preqcblk->domain = domain;
+
+	/* fill request cprb param block with CM request */
+	preqparm = (struct cmreqparm *) preqcblk->req_parmb;
+	memcpy(preqparm->subfunc_code, "CM", 2);
+	memcpy(preqparm->rule_array, "AES     ", 8);
+	preqparm->rule_array_len =
+		sizeof(preqparm->rule_array_len) + sizeof(preqparm->rule_array);
+	switch (keytype) {
+	case PKEY_KEYTYPE_AES_128:
+		keysize = 16;
+		break;
+	case PKEY_KEYTYPE_AES_192:
+		keysize = 24;
+		break;
+	case PKEY_KEYTYPE_AES_256:
+		keysize = 32;
+		break;
+	default:
+		DEBUG_ERR(
+			"pkey_clr2seckey unknown/unsupported keytype %d\n",
+			keytype);
+		rc = -EINVAL;
+		goto out;
+	}
+	preqparm->lv1.len = sizeof(struct lv1) + keysize;
+	memcpy(preqparm->lv1.clrkey, clrkey->clrkey, keysize);
+	plv2 = (struct lv2 *) (((u8 *) &preqparm->lv2) + keysize);
+	plv2->len = sizeof(struct lv2);
+	plv2->keyid.len = sizeof(struct keyid);
+	plv2->keyid.attr = 0x30;
+	preqcblk->req_parml = sizeof(struct cmreqparm) + keysize;
+
+	/* fill xcrb struct */
+	prep_xcrb(&xcrb, cardnr, preqcblk, prepcblk);
+
+	/* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
+	rc = _zcrypt_send_cprb(&xcrb);
+	if (rc) {
+		DEBUG_ERR(
+			"pkey_clr2seckey zcrypt_send_cprb (cardnr=%d domain=%d) failed with errno %d\n",
+			(int) cardnr, (int) domain, rc);
+		goto out;
+	}
+
+	/* check response returncode and reasoncode */
+	if (prepcblk->ccp_rtcode != 0) {
+		DEBUG_ERR(
+			"pkey_clr2seckey clear key import failure, card response %d/%d\n",
+			(int) prepcblk->ccp_rtcode,
+			(int) prepcblk->ccp_rscode);
+		rc = -EIO;
+		goto out;
+	}
+
+	/* process response cprb param block */
+	prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+	prepparm = (struct cmrepparm *) prepcblk->rpl_parmb;
+
+	/* check length of the returned secure key token */
+	seckeysize = prepparm->lv3.keyblock.toklen
+		- sizeof(prepparm->lv3.keyblock.toklen)
+		- sizeof(prepparm->lv3.keyblock.tokattr);
+	if (seckeysize != SECKEYBLOBSIZE) {
+		DEBUG_ERR(
+			"pkey_clr2seckey secure token size mismatch %d != %d bytes\n",
+			seckeysize, SECKEYBLOBSIZE);
+		rc = -EIO;
+		goto out;
+	}
+
+	/* check secure key token */
+	rc = check_secaeskeytoken(prepparm->lv3.keyblock.tok, 8*keysize);
+	if (rc) {
+		rc = -EIO;
+		goto out;
+	}
+
+	/* copy the generated secure key token */
+	memcpy(seckey->seckey, prepparm->lv3.keyblock.tok, SECKEYBLOBSIZE);
+
+out:
+	free_cprbmem(mem, PARMBSIZE, 1);
+	return rc;
+}
+EXPORT_SYMBOL(pkey_clr2seckey);
+
+/*
+ * Derive a protected key from the secure key blob.
+ */
+int pkey_sec2protkey(u16 cardnr, u16 domain,
+		     const struct pkey_seckey *seckey,
+		     struct pkey_protkey *protkey)
+{
+	int rc;
+	u8 *mem;
+	struct CPRBX *preqcblk, *prepcblk;
+	struct ica_xcRB xcrb;
+	struct uskreqparm {
+		u8  subfunc_code[2];
+		u16 rule_array_len;
+		struct lv1 {
+			u16 len;
+			u16 attr_len;
+			u16 attr_flags;
+		} lv1;
+		struct lv2 {
+			u16 len;
+			u16 attr_len;
+			u16 attr_flags;
+			u8  token[0];	      /* cca secure key token */
+		} lv2 __packed;
+	} *preqparm;
+	struct uskrepparm {
+		u8  subfunc_code[2];
+		u16 rule_array_len;
+		struct lv3 {
+			u16 len;
+			u16 attr_len;
+			u16 attr_flags;
+			struct cpacfkeyblock {
+				u8  version;  /* version of this struct */
+				u8  flags[2];
+				u8  algo;
+				u8  form;
+				u8  pad1[3];
+				u16 keylen;
+				u8  key[64];  /* the key (keylen bytes) */
+				u16 keyattrlen;
+				u8  keyattr[32];
+				u8  pad2[1];
+				u8  vptype;
+				u8  vp[32];  /* verification pattern */
+			} keyblock;
+		} lv3 __packed;
+	} *prepparm;
+
+	/* get already prepared memory for 2 cprbs with param block each */
+	rc = alloc_and_prep_cprbmem(PARMBSIZE, &mem, &preqcblk, &prepcblk);
+	if (rc)
+		return rc;
+
+	/* fill request cprb struct */
+	preqcblk->domain = domain;
+
+	/* fill request cprb param block with USK request */
+	preqparm = (struct uskreqparm *) preqcblk->req_parmb;
+	memcpy(preqparm->subfunc_code, "US", 2);
+	preqparm->rule_array_len = sizeof(preqparm->rule_array_len);
+	preqparm->lv1.len = sizeof(struct lv1);
+	preqparm->lv1.attr_len = sizeof(struct lv1) - sizeof(preqparm->lv1.len);
+	preqparm->lv1.attr_flags = 0x0001;
+	preqparm->lv2.len = sizeof(struct lv2) + SECKEYBLOBSIZE;
+	preqparm->lv2.attr_len = sizeof(struct lv2)
+		- sizeof(preqparm->lv2.len) + SECKEYBLOBSIZE;
+	preqparm->lv2.attr_flags = 0x0000;
+	memcpy(preqparm->lv2.token, seckey->seckey, SECKEYBLOBSIZE);
+	preqcblk->req_parml = sizeof(struct uskreqparm) + SECKEYBLOBSIZE;
+
+	/* fill xcrb struct */
+	prep_xcrb(&xcrb, cardnr, preqcblk, prepcblk);
+
+	/* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
+	rc = _zcrypt_send_cprb(&xcrb);
+	if (rc) {
+		DEBUG_ERR(
+			"pkey_sec2protkey zcrypt_send_cprb (cardnr=%d domain=%d) failed with errno %d\n",
+			(int) cardnr, (int) domain, rc);
+		goto out;
+	}
+
+	/* check response returncode and reasoncode */
+	if (prepcblk->ccp_rtcode != 0) {
+		DEBUG_ERR(
+			"pkey_sec2protkey unwrap secure key failure, card response %d/%d\n",
+			(int) prepcblk->ccp_rtcode,
+			(int) prepcblk->ccp_rscode);
+		rc = -EIO;
+		goto out;
+	}
+
+	/* process response cprb param block */
+	prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+	prepparm = (struct uskrepparm *) prepcblk->rpl_parmb;
+
+	/* check the returned keyblock */
+	if (prepparm->lv3.keyblock.version != 0x01) {
+		DEBUG_ERR(
+			"pkey_sec2protkey reply param keyblock version mismatch 0x%02x != 0x01\n",
+			(int) prepparm->lv3.keyblock.version);
+		rc = -EIO;
+		goto out;
+	}
+
+	/* copy the translated protected key */
+	switch (prepparm->lv3.keyblock.keylen) {
+	case 16+32:
+		protkey->type = PKEY_KEYTYPE_AES_128;
+		break;
+	case 24+32:
+		protkey->type = PKEY_KEYTYPE_AES_192;
+		break;
+	case 32+32:
+		protkey->type = PKEY_KEYTYPE_AES_256;
+		break;
+	default:
+		DEBUG_ERR("pkey_sec2protkey unknown/unsupported keytype %d\n",
+			  prepparm->lv3.keyblock.keylen);
+		rc = -EIO;
+		goto out;
+	}
+	protkey->len = prepparm->lv3.keyblock.keylen;
+	memcpy(protkey->protkey, prepparm->lv3.keyblock.key, protkey->len);
+
+out:
+	free_cprbmem(mem, PARMBSIZE, 0);
+	return rc;
+}
+EXPORT_SYMBOL(pkey_sec2protkey);
+
+/*
+ * Create a protected key from a clear key value.
+ */
+int pkey_clr2protkey(u32 keytype,
+		     const struct pkey_clrkey *clrkey,
+		     struct pkey_protkey *protkey)
+{
+	long fc;
+	int keysize;
+	u8 paramblock[64];
+
+	switch (keytype) {
+	case PKEY_KEYTYPE_AES_128:
+		keysize = 16;
+		fc = CPACF_PCKMO_ENC_AES_128_KEY;
+		break;
+	case PKEY_KEYTYPE_AES_192:
+		keysize = 24;
+		fc = CPACF_PCKMO_ENC_AES_192_KEY;
+		break;
+	case PKEY_KEYTYPE_AES_256:
+		keysize = 32;
+		fc = CPACF_PCKMO_ENC_AES_256_KEY;
+		break;
+	default:
+		DEBUG_ERR("pkey_clr2protkey unknown/unsupported keytype %d\n",
+			  keytype);
+		return -EINVAL;
+	}
+
+	/* prepare param block */
+	memset(paramblock, 0, sizeof(paramblock));
+	memcpy(paramblock, clrkey->clrkey, keysize);
+
+	/* call the pckmo instruction */
+	cpacf_pckmo(fc, paramblock);
+
+	/* copy created protected key */
+	protkey->type = keytype;
+	protkey->len = keysize + 32;
+	memcpy(protkey->protkey, paramblock, keysize + 32);
+
+	return 0;
+}
+EXPORT_SYMBOL(pkey_clr2protkey);
+
+/*
+ * Query the cryptographic facility of the adapter.
+ */
+static int query_crypto_facility(u16 cardnr, u16 domain,
+				 const char *keyword,
+				 u8 *rarray, size_t *rarraylen,
+				 u8 *varray, size_t *varraylen)
+{
+	int rc;
+	u16 len;
+	u8 *mem, *ptr;
+	struct CPRBX *preqcblk, *prepcblk;
+	struct ica_xcRB xcrb;
+	struct fqreqparm {
+		u8  subfunc_code[2];
+		u16 rule_array_len;
+		char  rule_array[8];
+		struct lv1 {
+			u16 len;
+			u8  data[VARDATASIZE];
+		} lv1;
+		u16 dummylen;
+	} *preqparm;
+	size_t parmbsize = sizeof(struct fqreqparm);
+	struct fqrepparm {
+		u8  subfunc_code[2];
+		u8  lvdata[0];
+	} *prepparm;
+
+	/* get already prepared memory for 2 cprbs with param block each */
+	rc = alloc_and_prep_cprbmem(parmbsize, &mem, &preqcblk, &prepcblk);
+	if (rc)
+		return rc;
+
+	/* fill request cprb struct */
+	preqcblk->domain = domain;
+
+	/* fill request cprb param block with FQ request */
+	preqparm = (struct fqreqparm *) preqcblk->req_parmb;
+	memcpy(preqparm->subfunc_code, "FQ", 2);
+	strncpy(preqparm->rule_array, keyword, sizeof(preqparm->rule_array));
+	preqparm->rule_array_len =
+		sizeof(preqparm->rule_array_len) + sizeof(preqparm->rule_array);
+	preqparm->lv1.len = sizeof(preqparm->lv1);
+	preqparm->dummylen = sizeof(preqparm->dummylen);
+	preqcblk->req_parml = parmbsize;
+
+	/* fill xcrb struct */
+	prep_xcrb(&xcrb, cardnr, preqcblk, prepcblk);
+
+	/* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
+	rc = _zcrypt_send_cprb(&xcrb);
+	if (rc) {
+		DEBUG_ERR(
+			"query_crypto_facility zcrypt_send_cprb (cardnr=%d domain=%d) failed with errno %d\n",
+			(int) cardnr, (int) domain, rc);
+		goto out;
+	}
+
+	/* check response returncode and reasoncode */
+	if (prepcblk->ccp_rtcode != 0) {
+		DEBUG_ERR(
+			"query_crypto_facility unwrap secure key failure, card response %d/%d\n",
+			(int) prepcblk->ccp_rtcode,
+			(int) prepcblk->ccp_rscode);
+		rc = -EIO;
+		goto out;
+	}
+
+	/* process response cprb param block */
+	prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+	prepparm = (struct fqrepparm *) prepcblk->rpl_parmb;
+	ptr = prepparm->lvdata;
+
+	/* check and possibly copy reply rule array */
+	len = *((u16 *) ptr);
+	if (len > sizeof(u16)) {
+		ptr += sizeof(u16);
+		len -= sizeof(u16);
+		if (rarray && rarraylen && *rarraylen > 0) {
+			*rarraylen = (len > *rarraylen ? *rarraylen : len);
+			memcpy(rarray, ptr, *rarraylen);
+		}
+		ptr += len;
+	}
+	/* check and possibly copy reply var array */
+	len = *((u16 *) ptr);
+	if (len > sizeof(u16)) {
+		ptr += sizeof(u16);
+		len -= sizeof(u16);
+		if (varray && varraylen && *varraylen > 0) {
+			*varraylen = (len > *varraylen ? *varraylen : len);
+			memcpy(varray, ptr, *varraylen);
+		}
+		ptr += len;
+	}
+
+out:
+	free_cprbmem(mem, parmbsize, 0);
+	return rc;
+}
+
+/*
+ * Fetch just the mkvp value via query_crypto_facility from the adapter.
+ */
+static int fetch_mkvp(u16 cardnr, u16 domain, u64 *mkvp)
+{
+	int rc, found = 0;
+	size_t rlen, vlen;
+	u8 *rarray, *varray, *pg;
+
+	pg = (u8 *) __get_free_page(GFP_KERNEL);
+	if (!pg)
+		return -ENOMEM;
+	rarray = pg;
+	varray = pg + PAGE_SIZE/2;
+	rlen = vlen = PAGE_SIZE/2;
+
+	rc = query_crypto_facility(cardnr, domain, "STATICSA",
+				   rarray, &rlen, varray, &vlen);
+	if (rc == 0 && rlen > 8*8 && vlen > 184+8) {
+		if (rarray[64] == '2') {
+			/* current master key state is valid */
+			*mkvp = *((u64 *)(varray + 184));
+			found = 1;
+		}
+	}
+
+	free_page((unsigned long) pg);
+
+	return found ? 0 : -ENOENT;
+}
+
+/* struct to hold cached mkvp info for each card/domain */
+struct mkvp_info {
+	struct list_head list;
+	u16 cardnr;
+	u16 domain;
+	u64 mkvp;
+};
+
+/* a list with mkvp_info entries */
+static LIST_HEAD(mkvp_list);
+static DEFINE_SPINLOCK(mkvp_list_lock);
+
+static int mkvp_cache_fetch(u16 cardnr, u16 domain, u64 *mkvp)
+{
+	int rc = -ENOENT;
+	struct mkvp_info *ptr;
+
+	spin_lock_bh(&mkvp_list_lock);
+	list_for_each_entry(ptr, &mkvp_list, list) {
+		if (ptr->cardnr == cardnr &&
+		    ptr->domain == domain) {
+			*mkvp = ptr->mkvp;
+			rc = 0;
+			break;
+		}
+	}
+	spin_unlock_bh(&mkvp_list_lock);
+
+	return rc;
+}
+
+static void mkvp_cache_update(u16 cardnr, u16 domain, u64 mkvp)
+{
+	int found = 0;
+	struct mkvp_info *ptr;
+
+	spin_lock_bh(&mkvp_list_lock);
+	list_for_each_entry(ptr, &mkvp_list, list) {
+		if (ptr->cardnr == cardnr &&
+		    ptr->domain == domain) {
+			ptr->mkvp = mkvp;
+			found = 1;
+			break;
+		}
+	}
+	if (!found) {
+		ptr = kmalloc(sizeof(*ptr), GFP_ATOMIC);
+		if (!ptr) {
+			spin_unlock_bh(&mkvp_list_lock);
+			return;
+		}
+		ptr->cardnr = cardnr;
+		ptr->domain = domain;
+		ptr->mkvp = mkvp;
+		list_add(&ptr->list, &mkvp_list);
+	}
+	spin_unlock_bh(&mkvp_list_lock);
+}
+
+static void mkvp_cache_scrub(u16 cardnr, u16 domain)
+{
+	struct mkvp_info *ptr;
+
+	spin_lock_bh(&mkvp_list_lock);
+	list_for_each_entry(ptr, &mkvp_list, list) {
+		if (ptr->cardnr == cardnr &&
+		    ptr->domain == domain) {
+			list_del(&ptr->list);
+			kfree(ptr);
+			break;
+		}
+	}
+	spin_unlock_bh(&mkvp_list_lock);
+}
+
+static void __exit mkvp_cache_free(void)
+{
+	struct mkvp_info *ptr, *pnext;
+
+	spin_lock_bh(&mkvp_list_lock);
+	list_for_each_entry_safe(ptr, pnext, &mkvp_list, list) {
+		list_del(&ptr->list);
+		kfree(ptr);
+	}
+	spin_unlock_bh(&mkvp_list_lock);
+}
+
+/*
+ * Search for a matching crypto card based on the Master Key
+ * Verification Pattern provided inside a secure key.
+ */
+int pkey_findcard(const struct pkey_seckey *seckey,
+		  u16 *pcardnr, u16 *pdomain, int verify)
+{
+	struct secaeskeytoken *t = (struct secaeskeytoken *) seckey;
+	struct zcrypt_device_matrix *device_matrix;
+	u16 card, dom;
+	u64 mkvp;
+	int i, rc;
+
+	/* mkvp must not be zero */
+	if (t->mkvp == 0)
+		return -EINVAL;
+
+	/* fetch status of all crypto cards */
+	device_matrix = kmalloc(sizeof(struct zcrypt_device_matrix),
+				GFP_KERNEL);
+	if (!device_matrix)
+		return -ENOMEM;
+	zcrypt_device_status_mask(device_matrix);
+
+	/* walk through all crypto cards */
+	for (i = 0; i < MAX_ZDEV_ENTRIES; i++) {
+		card = AP_QID_CARD(device_matrix->device[i].qid);
+		dom = AP_QID_QUEUE(device_matrix->device[i].qid);
+		if (device_matrix->device[i].online &&
+		    device_matrix->device[i].functions & 0x04) {
+			/* an enabled CCA Coprocessor card */
+			/* try cached mkvp */
+			if (mkvp_cache_fetch(card, dom, &mkvp) == 0 &&
+			    t->mkvp == mkvp) {
+				if (!verify)
+					break;
+				/* verify: fetch mkvp from adapter */
+				if (fetch_mkvp(card, dom, &mkvp) == 0) {
+					mkvp_cache_update(card, dom, mkvp);
+					if (t->mkvp == mkvp)
+						break;
+				}
+			}
+		} else {
+			/* Card is offline and/or not a CCA card. */
+			/* del mkvp entry from cache if it exists */
+			mkvp_cache_scrub(card, dom);
+		}
+	}
+	if (i >= MAX_ZDEV_ENTRIES) {
+		/* nothing found, so this time without cache */
+		for (i = 0; i < MAX_ZDEV_ENTRIES; i++) {
+			if (!(device_matrix->device[i].online &&
+			      device_matrix->device[i].functions & 0x04))
+				continue;
+			card = AP_QID_CARD(device_matrix->device[i].qid);
+			dom = AP_QID_QUEUE(device_matrix->device[i].qid);
+			/* fresh fetch mkvp from adapter */
+			if (fetch_mkvp(card, dom, &mkvp) == 0) {
+				mkvp_cache_update(card, dom, mkvp);
+				if (t->mkvp == mkvp)
+					break;
+			}
+		}
+	}
+	if (i < MAX_ZDEV_ENTRIES) {
+		if (pcardnr)
+			*pcardnr = card;
+		if (pdomain)
+			*pdomain = dom;
+		rc = 0;
+	} else
+		rc = -ENODEV;
+
+	kfree(device_matrix);
+	return rc;
+}
+EXPORT_SYMBOL(pkey_findcard);
+
+/*
+ * Find card and transform secure key into protected key.
+ */
+int pkey_skey2pkey(const struct pkey_seckey *seckey,
+		   struct pkey_protkey *protkey)
+{
+	u16 cardnr, domain;
+	int rc, verify;
+
+	/*
+	 * The pkey_sec2protkey call may fail when a card has been
+	 * addressed whose master key changed after the last fetch of
+	 * the mkvp into the cache. So first try without verify, then
+	 * with verify enabled (thus refreshing the mkvp for each card).
+	 */
+	for (verify = 0; verify < 2; verify++) {
+		rc = pkey_findcard(seckey, &cardnr, &domain, verify);
+		if (rc)
+			continue;
+		rc = pkey_sec2protkey(cardnr, domain, seckey, protkey);
+		if (rc == 0)
+			break;
+	}
+
+	if (rc)
+		DEBUG_DBG("pkey_skey2pkey failed rc=%d\n", rc);
+
+	return rc;
+}
+EXPORT_SYMBOL(pkey_skey2pkey);
+
+/*
+ * File io functions
+ */
+
+static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
+				unsigned long arg)
+{
+	int rc;
+
+	switch (cmd) {
+	case PKEY_GENSECK: {
+		struct pkey_genseck __user *ugs = (void __user *) arg;
+		struct pkey_genseck kgs;
+
+		if (copy_from_user(&kgs, ugs, sizeof(kgs)))
+			return -EFAULT;
+		rc = pkey_genseckey(kgs.cardnr, kgs.domain,
+				    kgs.keytype, &kgs.seckey);
+		DEBUG_DBG("pkey_ioctl pkey_genseckey()=%d\n", rc);
+		if (rc)
+			break;
+		if (copy_to_user(ugs, &kgs, sizeof(kgs)))
+			return -EFAULT;
+		break;
+	}
+	case PKEY_CLR2SECK: {
+		struct pkey_clr2seck __user *ucs = (void __user *) arg;
+		struct pkey_clr2seck kcs;
+
+		if (copy_from_user(&kcs, ucs, sizeof(kcs)))
+			return -EFAULT;
+		rc = pkey_clr2seckey(kcs.cardnr, kcs.domain, kcs.keytype,
+				     &kcs.clrkey, &kcs.seckey);
+		DEBUG_DBG("pkey_ioctl pkey_clr2seckey()=%d\n", rc);
+		if (rc)
+			break;
+		if (copy_to_user(ucs, &kcs, sizeof(kcs)))
+			return -EFAULT;
+		memzero_explicit(&kcs, sizeof(kcs));
+		break;
+	}
+	case PKEY_SEC2PROTK: {
+		struct pkey_sec2protk __user *usp = (void __user *) arg;
+		struct pkey_sec2protk ksp;
+
+		if (copy_from_user(&ksp, usp, sizeof(ksp)))
+			return -EFAULT;
+		rc = pkey_sec2protkey(ksp.cardnr, ksp.domain,
+				      &ksp.seckey, &ksp.protkey);
+		DEBUG_DBG("pkey_ioctl pkey_sec2protkey()=%d\n", rc);
+		if (rc)
+			break;
+		if (copy_to_user(usp, &ksp, sizeof(ksp)))
+			return -EFAULT;
+		break;
+	}
+	case PKEY_CLR2PROTK: {
+		struct pkey_clr2protk __user *ucp = (void __user *) arg;
+		struct pkey_clr2protk kcp;
+
+		if (copy_from_user(&kcp, ucp, sizeof(kcp)))
+			return -EFAULT;
+		rc = pkey_clr2protkey(kcp.keytype,
+				      &kcp.clrkey, &kcp.protkey);
+		DEBUG_DBG("pkey_ioctl pkey_clr2protkey()=%d\n", rc);
+		if (rc)
+			break;
+		if (copy_to_user(ucp, &kcp, sizeof(kcp)))
+			return -EFAULT;
+		memzero_explicit(&kcp, sizeof(kcp));
+		break;
+	}
+	case PKEY_FINDCARD: {
+		struct pkey_findcard __user *ufc = (void __user *) arg;
+		struct pkey_findcard kfc;
+
+		if (copy_from_user(&kfc, ufc, sizeof(kfc)))
+			return -EFAULT;
+		rc = pkey_findcard(&kfc.seckey,
+				   &kfc.cardnr, &kfc.domain, 1);
+		DEBUG_DBG("pkey_ioctl pkey_findcard()=%d\n", rc);
+		if (rc)
+			break;
+		if (copy_to_user(ufc, &kfc, sizeof(kfc)))
+			return -EFAULT;
+		break;
+	}
+	case PKEY_SKEY2PKEY: {
+		struct pkey_skey2pkey __user *usp = (void __user *) arg;
+		struct pkey_skey2pkey ksp;
+
+		if (copy_from_user(&ksp, usp, sizeof(ksp)))
+			return -EFAULT;
+		rc = pkey_skey2pkey(&ksp.seckey, &ksp.protkey);
+		DEBUG_DBG("pkey_ioctl pkey_skey2pkey()=%d\n", rc);
+		if (rc)
+			break;
+		if (copy_to_user(usp, &ksp, sizeof(ksp)))
+			return -EFAULT;
+		break;
+	}
+	default:
+		/* unknown/unsupported ioctl cmd */
+		return -ENOTTY;
+	}
+
+	return rc;
+}
+
+/*
+ * Sysfs and file io operations
+ */
+static const struct file_operations pkey_fops = {
+	.owner		= THIS_MODULE,
+	.open		= nonseekable_open,
+	.llseek		= no_llseek,
+	.unlocked_ioctl = pkey_unlocked_ioctl,
+};
+
+static struct miscdevice pkey_dev = {
+	.name	= "pkey",
+	.minor	= MISC_DYNAMIC_MINOR,
+	.mode	= 0666,
+	.fops	= &pkey_fops,
+};
+
+/*
+ * Module init
+ */
+int __init pkey_init(void)
+{
+	cpacf_mask_t pckmo_functions;
+
+	/* check for pckmo instructions available */
+	if (!cpacf_query(CPACF_PCKMO, &pckmo_functions))
+		return -EOPNOTSUPP;
+	if (!cpacf_test_func(&pckmo_functions, CPACF_PCKMO_ENC_AES_128_KEY) ||
+	    !cpacf_test_func(&pckmo_functions, CPACF_PCKMO_ENC_AES_192_KEY) ||
+	    !cpacf_test_func(&pckmo_functions, CPACF_PCKMO_ENC_AES_256_KEY))
+		return -EOPNOTSUPP;
+
+	pkey_debug_init();
+
+	return misc_register(&pkey_dev);
+}
+
+/*
+ * Module exit
+ */
+static void __exit pkey_exit(void)
+{
+	misc_deregister(&pkey_dev);
+	mkvp_cache_free();
+	pkey_debug_exit();
+}
+
+module_init(pkey_init);
+module_exit(pkey_exit);
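
The misc device registered above is intended to be driven from userspace via
ioctls on /dev/pkey. A hedged sketch of a caller, assuming the uapi
structures and ioctl numbers come from asm/pkey.h (the 0xFFFF card/domain
values for autoselection are an assumption here, not taken from this patch):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <asm/pkey.h>

	int main(void)
	{
		struct pkey_genseck gs = {
			.cardnr  = 0xFFFF,	/* assumed: let the driver pick a card */
			.domain  = 0xFFFF,	/* assumed: let the driver pick a domain */
			.keytype = PKEY_KEYTYPE_AES_256,
		};
		struct pkey_skey2pkey sp;
		int fd, rc;

		fd = open("/dev/pkey", O_RDWR);
		if (fd < 0)
			return 1;

		/* generate a random AES-256 secure key on a CCA card */
		rc = ioctl(fd, PKEY_GENSECK, &gs);
		if (!rc) {
			/* transform the secure key into a protected key */
			sp.seckey = gs.seckey;
			rc = ioctl(fd, PKEY_SKEY2PKEY, &sp);
		}

		close(fd);
		return rc ? 1 : 0;
	}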
diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index 144a179..93015f8 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -374,7 +374,7 @@ static long zcrypt_rsa_crt(struct ica_rsa_modexpo_crt *crt)
 	return rc;
 }
 
-static long zcrypt_send_cprb(struct ica_xcRB *xcRB)
+long zcrypt_send_cprb(struct ica_xcRB *xcRB)
 {
 	struct zcrypt_card *zc, *pref_zc;
 	struct zcrypt_queue *zq, *pref_zq;
@@ -444,6 +444,7 @@ static long zcrypt_send_cprb(struct ica_xcRB *xcRB)
 			      AP_QID_CARD(qid), AP_QID_QUEUE(qid));
 	return rc;
 }
+EXPORT_SYMBOL(zcrypt_send_cprb);
 
 static bool is_desired_ep11_card(unsigned int dev_id,
 				 unsigned short target_num,
@@ -619,7 +620,7 @@ static long zcrypt_rng(char *buffer)
 	return rc;
 }
 
-static void zcrypt_device_status_mask(struct zcrypt_device_matrix *matrix)
+void zcrypt_device_status_mask(struct zcrypt_device_matrix *matrix)
 {
 	struct zcrypt_card *zc;
 	struct zcrypt_queue *zq;
diff --git a/drivers/s390/crypto/zcrypt_api.h b/drivers/s390/crypto/zcrypt_api.h
index 274a590..6c94efd 100644
--- a/drivers/s390/crypto/zcrypt_api.h
+++ b/drivers/s390/crypto/zcrypt_api.h
@@ -190,5 +190,7 @@ void zcrypt_msgtype_unregister(struct zcrypt_ops *);
 struct zcrypt_ops *zcrypt_msgtype(unsigned char *, int);
 int zcrypt_api_init(void);
 void zcrypt_api_exit(void);
+long zcrypt_send_cprb(struct ica_xcRB *xcRB);
+void zcrypt_device_status_mask(struct zcrypt_device_matrix *devstatus);
 
 #endif /* _ZCRYPT_API_H_ */
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 439b946..db5900aaa 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -478,7 +478,7 @@ static void kernfs_drain(struct kernfs_node *kn)
 		rwsem_release(&kn->dep_map, 1, _RET_IP_);
 	}
 
-	kernfs_unmap_bin_file(kn);
+	kernfs_drain_open_files(kn);
 
 	mutex_lock(&kernfs_mutex);
 }
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 4f05358..35043a8 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -515,7 +515,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
 		goto out_put;
 
 	rc = 0;
-	of->mmapped = 1;
+	of->mmapped = true;
 	of->vm_ops = vma->vm_ops;
 	vma->vm_ops = &kernfs_vm_ops;
 out_put:
@@ -707,7 +707,8 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
 	if (error)
 		goto err_free;
 
-	((struct seq_file *)file->private_data)->private = of;
+	of->seq_file = file->private_data;
+	of->seq_file->private = of;
 
 	/* seq_file clears PWRITE unconditionally, restore it if WRITE */
 	if (file->f_mode & FMODE_WRITE)
@@ -716,13 +717,22 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
 	/* make sure we have open node struct */
 	error = kernfs_get_open_node(kn, of);
 	if (error)
-		goto err_close;
+		goto err_seq_release;
+
+	if (ops->open) {
+		/* nobody has access to @of yet, skip @of->mutex */
+		error = ops->open(of);
+		if (error)
+			goto err_put_node;
+	}
 
 	/* open succeeded, put active references */
 	kernfs_put_active(kn);
 	return 0;
 
-err_close:
+err_put_node:
+	kernfs_put_open_node(kn, of);
+err_seq_release:
 	seq_release(inode, file);
 err_free:
 	kfree(of->prealloc_buf);
@@ -732,11 +742,41 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
 	return error;
 }
 
+/* used from release/drain to ensure that ->release() is called exactly once */
+static void kernfs_release_file(struct kernfs_node *kn,
+				struct kernfs_open_file *of)
+{
+	/*
+	 * @of is guaranteed to have no other file operations in flight and
+	 * we just want to synchronize release and drain paths.
+	 * @kernfs_open_file_mutex is enough.  @of->mutex can't be used
+	 * here because the drain path may be called from places which
+	 * can cause a circular dependency.
+	 */
+	lockdep_assert_held(&kernfs_open_file_mutex);
+
+	if (!of->released) {
+		/*
+		 * A file is never detached without being released and we
+		 * need to be able to release files which are deactivated
+		 * and being drained.  Don't use kernfs_ops().
+		 */
+		kn->attr.ops->release(of);
+		of->released = true;
+	}
+}
+
 static int kernfs_fop_release(struct inode *inode, struct file *filp)
 {
 	struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
 	struct kernfs_open_file *of = kernfs_of(filp);
 
+	if (kn->flags & KERNFS_HAS_RELEASE) {
+		mutex_lock(&kernfs_open_file_mutex);
+		kernfs_release_file(kn, of);
+		mutex_unlock(&kernfs_open_file_mutex);
+	}
+
 	kernfs_put_open_node(kn, of);
 	seq_release(inode, filp);
 	kfree(of->prealloc_buf);
@@ -745,12 +785,12 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-void kernfs_unmap_bin_file(struct kernfs_node *kn)
+void kernfs_drain_open_files(struct kernfs_node *kn)
 {
 	struct kernfs_open_node *on;
 	struct kernfs_open_file *of;
 
-	if (!(kn->flags & KERNFS_HAS_MMAP))
+	if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
 		return;
 
 	spin_lock_irq(&kernfs_open_node_lock);
@@ -762,10 +802,16 @@ void kernfs_unmap_bin_file(struct kernfs_node *kn)
 		return;
 
 	mutex_lock(&kernfs_open_file_mutex);
+
 	list_for_each_entry(of, &on->files, list) {
 		struct inode *inode = file_inode(of->file);
-		unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+
+		if (kn->flags & KERNFS_HAS_MMAP)
+			unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+
+		kernfs_release_file(kn, of);
 	}
+
 	mutex_unlock(&kernfs_open_file_mutex);
 
 	kernfs_put_open_node(kn, NULL);
@@ -964,6 +1010,8 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
 		kn->flags |= KERNFS_HAS_SEQ_SHOW;
 	if (ops->mmap)
 		kn->flags |= KERNFS_HAS_MMAP;
+	if (ops->release)
+		kn->flags |= KERNFS_HAS_RELEASE;
 
 	rc = kernfs_add_one(kn);
 	if (rc) {
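
With the new pair of hooks, a kernfs user can set up per-open state in
->open() and rely on ->release() being called exactly once, either from
close or from kernfs_drain_open_files() when the node is removed. A hedged
sketch of such an ops table (the example_* names and struct example_state
are hypothetical):

	struct example_state {
		int refs;	/* hypothetical per-open bookkeeping */
	};

	static int example_open(struct kernfs_open_file *of)
	{
		of->priv = kzalloc(sizeof(struct example_state), GFP_KERNEL);
		return of->priv ? 0 : -ENOMEM;
	}

	static void example_release(struct kernfs_open_file *of)
	{
		/* guaranteed to run exactly once, from close or from drain */
		kfree(of->priv);
	}

	static struct kernfs_ops example_ops = {
		.open		= example_open,
		.release	= example_release,
		.seq_show	= example_seq_show,	/* assumed defined elsewhere */
	};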
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index bfd551b..3100987 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -104,7 +104,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
  */
 extern const struct file_operations kernfs_file_fops;
 
-void kernfs_unmap_bin_file(struct kernfs_node *kn);
+void kernfs_drain_open_files(struct kernfs_node *kn);
 
 /*
  * symlink.c
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 861b467..3c02404 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -148,14 +148,18 @@ struct cgroup_subsys_state {
  * set for a task.
  */
 struct css_set {
-	/* Reference count */
+	/*
+	 * Set of subsystem states, one for each subsystem. This array is
+	 * immutable after creation apart from the init_css_set during
+	 * subsystem registration (at boot time).
+	 */
+	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
+	/* reference count */
 	atomic_t refcount;
 
-	/*
-	 * List running through all cgroup groups in the same hash
-	 * slot. Protected by css_set_lock
-	 */
-	struct hlist_node hlist;
+	/* the default cgroup associated with this css_set */
+	struct cgroup *dfl_cgrp;
 
 	/*
 	 * Lists running through all tasks using this cgroup group.
@@ -167,22 +171,30 @@ struct css_set {
 	struct list_head tasks;
 	struct list_head mg_tasks;
 
+	/* all css_task_iters currently walking this cset */
+	struct list_head task_iters;
+
+	/*
+	 * On the default hierarchy, ->subsys[ssid] may point to a css
+	 * attached to an ancestor instead of the cgroup this css_set is
+	 * associated with.  The following node is anchored at
+	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
+	 * iterate through all css's attached to a given cgroup.
+	 */
+	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
+
+	/*
+	 * List running through all cgroup groups in the same hash
+	 * slot. Protected by css_set_lock
+	 */
+	struct hlist_node hlist;
+
 	/*
 	 * List of cgrp_cset_links pointing at cgroups referenced from this
 	 * css_set.  Protected by css_set_lock.
 	 */
 	struct list_head cgrp_links;
 
-	/* the default cgroup associated with this css_set */
-	struct cgroup *dfl_cgrp;
-
-	/*
-	 * Set of subsystem states, one for each subsystem. This array is
-	 * immutable after creation apart from the init_css_set during
-	 * subsystem registration (at boot time).
-	 */
-	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
-
 	/*
 	 * List of csets participating in the on-going migration either as
 	 * source or destination.  Protected by cgroup_mutex.
@@ -201,18 +213,6 @@ struct css_set {
 	struct cgroup *mg_dst_cgrp;
 	struct css_set *mg_dst_cset;
 
-	/*
-	 * On the default hierarhcy, ->subsys[ssid] may point to a css
-	 * attached to an ancestor instead of the cgroup this css_set is
-	 * associated with.  The following node is anchored at
-	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
-	 * iterate through all css's attached to a given cgroup.
-	 */
-	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
-
-	/* all css_task_iters currently walking this cset */
-	struct list_head task_iters;
-
 	/* dead and being drained, ignore for migration */
 	bool dead;
 
@@ -388,6 +388,9 @@ struct cftype {
 	struct list_head node;		/* anchored at ss->cfts */
 	struct kernfs_ops *kf_ops;
 
+	int (*open)(struct kernfs_open_file *of);
+	void (*release)(struct kernfs_open_file *of);
+
 	/*
 	 * read_u64() is a shortcut for the common case of returning a
 	 * single integer. Use it in place of read()
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c83c23f..f6b43fb 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -266,7 +266,7 @@ void css_task_iter_end(struct css_task_iter *it);
  * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
  * @leader: the loop cursor
  * @dst_css: the destination css
- * @tset: takset to iterate
+ * @tset: taskset to iterate
  *
  * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
  * may not contain any.
diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h
new file mode 100644
index 0000000..e94290b
--- /dev/null
+++ b/include/linux/cgroup_rdma.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#ifndef _CGROUP_RDMA_H
+#define _CGROUP_RDMA_H
+
+#include <linux/cgroup.h>
+
+enum rdmacg_resource_type {
+	RDMACG_RESOURCE_HCA_HANDLE,
+	RDMACG_RESOURCE_HCA_OBJECT,
+	RDMACG_RESOURCE_MAX,
+};
+
+#ifdef CONFIG_CGROUP_RDMA
+
+struct rdma_cgroup {
+	struct cgroup_subsys_state	css;
+
+	/*
+	 * head to keep track of all resource pools
+	 * that belong to this cgroup.
+	 */
+	struct list_head		rpools;
+};
+
+struct rdmacg_device {
+	struct list_head	dev_node;
+	struct list_head	rpools;
+	char			*name;
+};
+
+/*
+ * APIs for RDMA/IB stack to publish when a device wants to
+ * participate in resource accounting
+ */
+int rdmacg_register_device(struct rdmacg_device *device);
+void rdmacg_unregister_device(struct rdmacg_device *device);
+
+/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+		      struct rdmacg_device *device,
+		      enum rdmacg_resource_type index);
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+		     struct rdmacg_device *device,
+		     enum rdmacg_resource_type index);
+#endif	/* CONFIG_CGROUP_RDMA */
+#endif	/* _CGROUP_RDMA_H */
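
A driver taking part in rdma cgroup accounting would register its
rdmacg_device once and then bracket each verb resource allocation with a
try_charge/uncharge pair. A minimal sketch under those assumptions (the
example_* helpers and device name are hypothetical):

	static struct rdmacg_device example_rcg_dev = {
		.name = "example_hca0",		/* hypothetical device name */
	};

	static int example_stack_init(void)
	{
		return rdmacg_register_device(&example_rcg_dev);
	}

	static int example_create_object(struct rdma_cgroup **cg)
	{
		int ret;

		/* charge the caller's cgroup; fails once its limit is reached */
		ret = rdmacg_try_charge(cg, &example_rcg_dev,
					RDMACG_RESOURCE_HCA_OBJECT);
		if (ret)
			return ret;

		/* ... allocate the actual object, keep *cg for teardown ... */
		return 0;
	}

	static void example_destroy_object(struct rdma_cgroup *cg)
	{
		rdmacg_uncharge(cg, &example_rcg_dev, RDMACG_RESOURCE_HCA_OBJECT);
	}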
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0df0336a..d0e597c 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -56,6 +56,10 @@ SUBSYS(hugetlb)
 SUBSYS(pids)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_RDMA)
+SUBSYS(rdma)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index b2eb9c0a..f811005 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -105,29 +105,36 @@ struct ftrace_branch_data {
 	};
 };
 
+struct ftrace_likely_data {
+	struct ftrace_branch_data	data;
+	unsigned long			constant;
+};
+
 /*
  * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code
  * to disable branch tracing on a per file basis.
  */
 #if defined(CONFIG_TRACE_BRANCH_PROFILING) \
     && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__)
-void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
+void ftrace_likely_update(struct ftrace_likely_data *f, int val,
+			  int expect, int is_constant);
 
 #define likely_notrace(x)	__builtin_expect(!!(x), 1)
 #define unlikely_notrace(x)	__builtin_expect(!!(x), 0)
 
-#define __branch_check__(x, expect) ({					\
+#define __branch_check__(x, expect, is_constant) ({			\
 			int ______r;					\
-			static struct ftrace_branch_data		\
+			static struct ftrace_likely_data		\
 				__attribute__((__aligned__(4)))		\
 				__attribute__((section("_ftrace_annotated_branch"))) \
 				______f = {				\
-				.func = __func__,			\
-				.file = __FILE__,			\
-				.line = __LINE__,			\
+				.data.func = __func__,			\
+				.data.file = __FILE__,			\
+				.data.line = __LINE__,			\
 			};						\
-			______r = likely_notrace(x);			\
-			ftrace_likely_update(&______f, ______r, expect); \
+			______r = __builtin_expect(!!(x), expect);	\
+			ftrace_likely_update(&______f, ______r,		\
+					     expect, is_constant);	\
 			______r;					\
 		})
 
@@ -137,10 +144,10 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
  * written by Daniel Walker.
  */
 # ifndef likely
-#  define likely(x)	(__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1))
+#  define likely(x)	(__branch_check__(x, 1, __builtin_constant_p(x)))
 # endif
 # ifndef unlikely
-#  define unlikely(x)	(__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0))
+#  define unlikely(x)	(__branch_check__(x, 0, __builtin_constant_p(x)))
 # endif
 
 #ifdef CONFIG_PROFILE_ALL_BRANCHES
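
The effect of this change is that constant conditions are no longer filtered
out before the profiler runs; __builtin_constant_p() is recorded alongside
the branch outcome, so always-true/always-false uses of likely()/unlikely()
can be reported separately. A tiny standalone illustration of the two
builtins involved (plain C, outside the tracing machinery):

	#include <stdio.h>

	#define my_likely(x)	__builtin_expect(!!(x), 1)

	int main(void)
	{
		volatile int n = 5;	/* volatile so the compiler can't fold it */

		/* 1 when the expression is provably a compile-time constant */
		printf("%d\n", __builtin_constant_p(3 > 2));	/* 1 */
		printf("%d\n", __builtin_constant_p(n > 2));	/* 0 */

		if (my_likely(n > 2))	/* hint: branch expected to be taken */
			puts("hot path");
		return 0;
	}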
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index b63d6b7..8e06d75 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -89,11 +89,17 @@ extern bool static_key_initialized;
 
 struct static_key {
 	atomic_t enabled;
-/* Set lsb bit to 1 if branch is default true, 0 ot */
-	struct jump_entry *entries;
-#ifdef CONFIG_MODULES
-	struct static_key_mod *next;
-#endif
+/*
+ * bit 0 => 1 if key is initially true
+ *	    0 if initially false
+ * bit 1 => 1 if points to struct static_key_mod
+ *	    0 if points to struct jump_entry
+ */
+	union {
+		unsigned long type;
+		struct jump_entry *entries;
+		struct static_key_mod *next;
+	};
 };
 
 #else
@@ -118,9 +124,10 @@ struct module;
 
 #ifdef HAVE_JUMP_LABEL
 
-#define JUMP_TYPE_FALSE	0UL
-#define JUMP_TYPE_TRUE	1UL
-#define JUMP_TYPE_MASK	1UL
+#define JUMP_TYPE_FALSE		0UL
+#define JUMP_TYPE_TRUE		1UL
+#define JUMP_TYPE_LINKED	2UL
+#define JUMP_TYPE_MASK		3UL
 
 static __always_inline bool static_key_false(struct static_key *key)
 {
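
Since both jump_entry and static_key_mod are word-aligned, the low two bits
of the unionized pointer are free to carry the flags described above. A
hedged sketch of decoding such a tagged pointer (illustrative helpers, not
the kernel's actual accessors):

	/* illustrative helpers mirroring the JUMP_TYPE_* encoding above */
	static inline bool key_initially_true(const struct static_key *key)
	{
		return key->type & JUMP_TYPE_TRUE;
	}

	static inline bool key_points_to_mod(const struct static_key *key)
	{
		return key->type & JUMP_TYPE_LINKED;
	}

	static inline struct jump_entry *key_entries(const struct static_key *key)
	{
		/* mask off the flag bits to recover the real pointer */
		return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK);
	}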
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 7056238..a9b11b8 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -46,6 +46,7 @@ enum kernfs_node_flag {
 	KERNFS_SUICIDAL		= 0x0400,
 	KERNFS_SUICIDED		= 0x0800,
 	KERNFS_EMPTY_DIR	= 0x1000,
+	KERNFS_HAS_RELEASE	= 0x2000,
 };
 
 /* @flags for kernfs_create_root() */
@@ -175,6 +176,7 @@ struct kernfs_open_file {
 	/* published fields */
 	struct kernfs_node	*kn;
 	struct file		*file;
+	struct seq_file		*seq_file;
 	void			*priv;
 
 	/* private fields, do not use outside kernfs proper */
@@ -185,12 +187,20 @@ struct kernfs_open_file {
 	char			*prealloc_buf;
 
 	size_t			atomic_write_len;
-	bool			mmapped;
+	bool			mmapped:1;
+	bool			released:1;
 	const struct vm_operations_struct *vm_ops;
 };
 
 struct kernfs_ops {
 	/*
+	 * Optional open/release methods.  Both are called with
+	 * @of->seq_file populated.
+	 */
+	int (*open)(struct kernfs_open_file *of);
+	void (*release)(struct kernfs_open_file *of);
+
+	/*
 	 * Read is handled by either seq_file or raw_read().
 	 *
 	 * If seq_show() is present, seq_file path is active.  Other seq
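
A sketch of how a kernfs user might wire up the new hooks (hypothetical
foo_* callbacks; cgroup is the intended first in-tree user):

	static int foo_seq_show(struct seq_file *sf, void *v)
	{
		seq_puts(sf, "hello\n");
		return 0;
	}

	static int foo_open(struct kernfs_open_file *of)
	{
		/* of->seq_file is already populated when this runs */
		of->priv = NULL;	/* per-open state, if any */
		return 0;
	}

	static void foo_release(struct kernfs_open_file *of)
	{
		/* runs once, on last close or when the node is removed */
		kfree(of->priv);
	}

	static struct kernfs_ops foo_ops = {
		.open		= foo_open,
		.release	= foo_release,
		.seq_show	= foo_seq_show,
	};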
diff --git a/include/linux/mfd/tps65910.h b/include/linux/mfd/tps65910.h
index 6483a6f..ffb21e7 100644
--- a/include/linux/mfd/tps65910.h
+++ b/include/linux/mfd/tps65910.h
@@ -134,6 +134,7 @@
 
 /* RTC_CTRL_REG bitfields */
 #define TPS65910_RTC_CTRL_STOP_RTC			0x01 /*0=stop, 1=run */
+#define TPS65910_RTC_CTRL_AUTO_COMP			0x04
 #define TPS65910_RTC_CTRL_GET_TIME			0x40
 
 /* RTC_STATUS_REG bitfields */
diff --git a/include/linux/platform_data/rtc-m48t86.h b/include/linux/platform_data/rtc-m48t86.h
deleted file mode 100644
index 915d6b4..0000000
--- a/include/linux/platform_data/rtc-m48t86.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * ST M48T86 / Dallas DS12887 RTC driver
- * Copyright (c) 2006 Tower Technologies
- *
- * Author: Alessandro Zummo <a.zummo@towertech.it>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-struct m48t86_ops
-{
-	void (*writebyte)(unsigned char value, unsigned long addr);
-	unsigned char (*readbyte)(unsigned long addr);
-};
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 5a209b8..c7bdf89 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -61,6 +61,8 @@ struct timer_list {
 #define TIMER_ARRAYSHIFT	22
 #define TIMER_ARRAYMASK		0xFFC00000
 
+#define TIMER_TRACE_FLAGMASK	(TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE)
+
 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
 		.entry = { .next = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index a26cc437..bde063c 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -106,9 +106,9 @@ struct work_struct {
 #endif
 };
 
-#define WORK_DATA_INIT()	ATOMIC_LONG_INIT(WORK_STRUCT_NO_POOL)
+#define WORK_DATA_INIT()	ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL)
 #define WORK_DATA_STATIC_INIT()	\
-	ATOMIC_LONG_INIT(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)
+	ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC))
 
 struct delayed_work {
 	struct work_struct work;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index d84849c..0f1813c 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -60,6 +60,7 @@
 #include <linux/atomic.h>
 #include <linux/mmu_notifier.h>
 #include <linux/uaccess.h>
+#include <linux/cgroup_rdma.h>
 
 extern struct workqueue_struct *ib_wq;
 extern struct workqueue_struct *ib_comp_wq;
@@ -1356,6 +1357,12 @@ struct ib_fmr_attr {
 
 struct ib_umem;
 
+struct ib_rdmacg_object {
+#ifdef CONFIG_CGROUP_RDMA
+	struct rdma_cgroup	*cg;		/* owner rdma cgroup */
+#endif
+};
+
 struct ib_ucontext {
 	struct ib_device       *device;
 	struct list_head	pd_list;
@@ -1388,6 +1395,8 @@ struct ib_ucontext {
 	struct list_head	no_private_counters;
 	int                     odp_mrs_count;
 #endif
+
+	struct ib_rdmacg_object	cg_obj;
 };
 
 struct ib_uobject {
@@ -1395,6 +1404,7 @@ struct ib_uobject {
 	struct ib_ucontext     *context;	/* associated user context */
 	void		       *object;		/* containing object */
 	struct list_head	list;		/* link to context's list */
+	struct ib_rdmacg_object	cg_obj;		/* rdmacg object */
 	int			id;		/* index into kernel idr */
 	struct kref		ref;
 	struct rw_semaphore	mutex;		/* protects .live */
@@ -2128,6 +2138,10 @@ struct ib_device {
 	struct attribute_group	     *hw_stats_ag;
 	struct rdma_hw_stats         *hw_stats;
 
+#ifdef CONFIG_CGROUP_RDMA
+	struct rdmacg_device         cg_device;
+#endif
+
 	/**
 	 * The following mandatory functions are used only at device
 	 * registration.  Keep functions such as these at the end of this
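
A sketch of how the new cg_obj fields are meant to be used by the IB core.
rdmacg_try_charge()/rdmacg_uncharge() and the resource index come from the
rdma cgroup patchset (include/linux/cgroup_rdma.h); treat the exact
signatures as assumptions, and the foo_* helpers as hypothetical:

	#ifdef CONFIG_CGROUP_RDMA
	static int foo_create_uobj(struct ib_device *dev, struct ib_uobject *uobj)
	{
		int ret;

		/* charge the caller's rdma cgroup before allocating */
		ret = rdmacg_try_charge(&uobj->cg_obj.cg, &dev->cg_device,
					RDMACG_RESOURCE_HCA_OBJECT);
		if (ret)
			return ret;	/* over the configured limit */

		/* ... allocate the verb object ... */
		return 0;
	}

	static void foo_destroy_uobj(struct ib_device *dev, struct ib_uobject *uobj)
	{
		/* ... free the verb object ... */
		rdmacg_uncharge(uobj->cg_obj.cg, &dev->cg_device,
				RDMACG_RESOURCE_HCA_OBJECT);
	}
	#endif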
diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index 1bca99d..80787ea 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -36,6 +36,13 @@ DEFINE_EVENT(timer_class, timer_init,
 	TP_ARGS(timer)
 );
 
+#define decode_timer_flags(flags)			\
+	__print_flags(flags, "|",			\
+		{  TIMER_MIGRATING,	"M" },		\
+		{  TIMER_DEFERRABLE,	"D" },		\
+		{  TIMER_PINNED,	"P" },		\
+		{  TIMER_IRQSAFE,	"I" })
+
 /**
  * timer_start - called when the timer is started
  * @timer:	pointer to struct timer_list
@@ -65,9 +72,12 @@ TRACE_EVENT(timer_start,
 		__entry->flags		= flags;
 	),
 
-	TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] flags=0x%08x",
+	TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] cpu=%u idx=%u flags=%s",
 		  __entry->timer, __entry->function, __entry->expires,
-		  (long)__entry->expires - __entry->now, __entry->flags)
+		  (long)__entry->expires - __entry->now,
+		  __entry->flags & TIMER_CPUMASK,
+		  __entry->flags >> TIMER_ARRAYSHIFT,
+		  decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK))
 );
 
 /**
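
A worked decode of the new trace fields (flag values as defined in
include/linux/timer.h; the example flags word is made up):

	With TIMER_CPUMASK == 0x0003ffff, TIMER_DEFERRABLE == 0x00080000 and
	TIMER_ARRAYSHIFT == 22, flags == 0x00c80005 decodes to:

		cpu = flags & TIMER_CPUMASK;		/* 0x5 -> cpu=5 */
		idx = flags >> TIMER_ARRAYSHIFT;	/* 0xc00000 >> 22 -> idx=3 */
		/* decode_timer_flags() prints "D": only TIMER_DEFERRABLE set */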
diff --git a/init/Kconfig b/init/Kconfig
index 8c39615..a92f27d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1078,6 +1078,16 @@
 	  since the PIDs limit only affects a process's ability to fork, not to
 	  attach to a cgroup.
 
+config CGROUP_RDMA
+	bool "RDMA controller"
+	help
+	  Provides enforcement of RDMA resources defined by the IB stack.
+	  It is fairly easy for consumers to exhaust RDMA resources, which
+	  can result in resource unavailability to other consumers.
+	  The RDMA controller is designed to stop this from happening.
+	  Attaching processes with active RDMA resources to the cgroup
+	  hierarchy is allowed even if doing so crosses the hierarchy's limit.
+
 config CGROUP_FREEZER
 	bool "Freezer controller"
 	help
diff --git a/kernel/Makefile b/kernel/Makefile
index 12c679f..b302b47 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -64,10 +64,7 @@
 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
-obj-$(CONFIG_CGROUPS) += cgroup.o
-obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
-obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
-obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUPS) += cgroup/
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
new file mode 100644
index 0000000..387348a
--- /dev/null
+++ b/kernel/cgroup/Makefile
@@ -0,0 +1,6 @@
+obj-y := cgroup.o namespace.o cgroup-v1.o
+
+obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += pids.o
+obj-$(CONFIG_CGROUP_RDMA) += rdma.o
+obj-$(CONFIG_CPUSETS) += cpuset.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
new file mode 100644
index 0000000..9203bfb
--- /dev/null
+++ b/kernel/cgroup/cgroup-internal.h
@@ -0,0 +1,214 @@
+#ifndef __CGROUP_INTERNAL_H
+#define __CGROUP_INTERNAL_H
+
+#include <linux/cgroup.h>
+#include <linux/kernfs.h>
+#include <linux/workqueue.h>
+#include <linux/list.h>
+
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies.  In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+	/* the cgroup and css_set this link associates */
+	struct cgroup		*cgrp;
+	struct css_set		*cset;
+
+	/* list of cgrp_cset_links anchored at cgrp->cset_links */
+	struct list_head	cset_link;
+
+	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
+	struct list_head	cgrp_link;
+};
+
+/* used to track tasks and csets during migration */
+struct cgroup_taskset {
+	/* the src and dst cset list running through cset->mg_node */
+	struct list_head	src_csets;
+	struct list_head	dst_csets;
+
+	/* the subsys currently being processed */
+	int			ssid;
+
+	/*
+	 * Fields for cgroup_taskset_*() iteration.
+	 *
+	 * Before migration is committed, the target migration tasks are on
+	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
+	 * the csets on ->dst_csets.  ->csets point to either ->src_csets
+	 * or ->dst_csets depending on whether migration is committed.
+	 *
+	 * ->cur_csets and ->cur_task point to the current task position
+	 * during iteration.
+	 */
+	struct list_head	*csets;
+	struct css_set		*cur_cset;
+	struct task_struct	*cur_task;
+};
+
+/* migration context also tracks preloading */
+struct cgroup_mgctx {
+	/*
+	 * Preloaded source and destination csets.  Used to guarantee
+	 * atomic success or failure on actual migration.
+	 */
+	struct list_head	preloaded_src_csets;
+	struct list_head	preloaded_dst_csets;
+
+	/* tasks and csets to migrate */
+	struct cgroup_taskset	tset;
+
+	/* subsystems affected by migration */
+	u16			ss_mask;
+};
+
+#define CGROUP_TASKSET_INIT(tset)						\
+{										\
+	.src_csets		= LIST_HEAD_INIT(tset.src_csets),		\
+	.dst_csets		= LIST_HEAD_INIT(tset.dst_csets),		\
+	.csets			= &tset.src_csets,				\
+}
+
+#define CGROUP_MGCTX_INIT(name)							\
+{										\
+	LIST_HEAD_INIT(name.preloaded_src_csets),				\
+	LIST_HEAD_INIT(name.preloaded_dst_csets),				\
+	CGROUP_TASKSET_INIT(name.tset),						\
+}
+
+#define DEFINE_CGROUP_MGCTX(name)						\
+	struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
+
+struct cgroup_sb_opts {
+	u16 subsys_mask;
+	unsigned int flags;
+	char *release_agent;
+	bool cpuset_clone_children;
+	char *name;
+	/* User explicitly requested empty subsystem */
+	bool none;
+};
+
+extern struct mutex cgroup_mutex;
+extern spinlock_t css_set_lock;
+extern struct cgroup_subsys *cgroup_subsys[];
+extern struct list_head cgroup_roots;
+extern struct file_system_type cgroup_fs_type;
+
+/* iterate across the hierarchies */
+#define for_each_root(root)						\
+	list_for_each_entry((root), &cgroup_roots, root_list)
+
+/**
+ * for_each_subsys - iterate all enabled cgroup subsystems
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ */
+#define for_each_subsys(ss, ssid)					\
+	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
+	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
+{
+	return !(cgrp->self.flags & CSS_ONLINE);
+}
+
+static inline bool notify_on_release(const struct cgroup *cgrp)
+{
+	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+}
+
+void put_css_set_locked(struct css_set *cset);
+
+static inline void put_css_set(struct css_set *cset)
+{
+	unsigned long flags;
+
+	/*
+	 * Ensure that the refcount doesn't hit zero while any readers
+	 * can see it. Similar to atomic_dec_and_lock(), but for an
+	 * rwlock
+	 */
+	if (atomic_add_unless(&cset->refcount, -1, 1))
+		return;
+
+	spin_lock_irqsave(&css_set_lock, flags);
+	put_css_set_locked(cset);
+	spin_unlock_irqrestore(&css_set_lock, flags);
+}
+
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cset)
+{
+	atomic_inc(&cset->refcount);
+}
+
+bool cgroup_ssid_enabled(int ssid);
+bool cgroup_on_dfl(const struct cgroup *cgrp);
+
+struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
+struct cgroup *task_cgroup_from_root(struct task_struct *task,
+				     struct cgroup_root *root);
+struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline);
+void cgroup_kn_unlock(struct kernfs_node *kn);
+int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+			  struct cgroup_namespace *ns);
+
+void cgroup_free_root(struct cgroup_root *root);
+void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
+int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
+			       struct cgroup_root *root, unsigned long magic,
+			       struct cgroup_namespace *ns);
+
+bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
+void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
+void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
+			    struct cgroup_mgctx *mgctx);
+int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx);
+int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+		   struct cgroup_mgctx *mgctx);
+
+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+		       bool threadgroup);
+ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+			     size_t nbytes, loff_t off, bool threadgroup);
+ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+			   loff_t off);
+
+void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
+
+int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode);
+int cgroup_rmdir(struct kernfs_node *kn);
+int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+		     struct kernfs_root *kf_root);
+
+/*
+ * namespace.c
+ */
+extern const struct proc_ns_operations cgroupns_operations;
+
+/*
+ * cgroup-v1.c
+ */
+extern struct cftype cgroup1_base_files[];
+extern const struct file_operations proc_cgroupstats_operations;
+extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
+
+bool cgroup1_ssid_disabled(int ssid);
+void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
+void cgroup1_release_agent(struct work_struct *work);
+void cgroup1_check_for_release(struct cgroup *cgrp);
+struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
+			     void *data, unsigned long magic,
+			     struct cgroup_namespace *ns);
+
+#endif /* __CGROUP_INTERNAL_H */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
new file mode 100644
index 0000000..fc34bcf
--- /dev/null
+++ b/kernel/cgroup/cgroup-v1.c
@@ -0,0 +1,1395 @@
+#include "cgroup-internal.h"
+
+#include <linux/ctype.h>
+#include <linux/kmod.h>
+#include <linux/sort.h>
+#include <linux/delay.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/delayacct.h>
+#include <linux/pid_namespace.h>
+#include <linux/cgroupstats.h>
+
+#include <trace/events/cgroup.h>
+
+/*
+ * pidlists linger the following amount before being destroyed.  The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY	HZ
+
+/* Controllers blocked by the commandline in v1 */
+static u16 cgroup_no_v1_mask;
+
+/*
+ * pidlist destructions need to be flushed on cgroup destruction.  Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
+/*
+ * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
+ * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
+
+bool cgroup1_ssid_disabled(int ssid)
+{
+	return cgroup_no_v1_mask & (1 << ssid);
+}
+
+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
+{
+	struct cgroup_root *root;
+	int retval = 0;
+
+	mutex_lock(&cgroup_mutex);
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+	for_each_root(root) {
+		struct cgroup *from_cgrp;
+
+		if (root == &cgrp_dfl_root)
+			continue;
+
+		spin_lock_irq(&css_set_lock);
+		from_cgrp = task_cgroup_from_root(from, root);
+		spin_unlock_irq(&css_set_lock);
+
+		retval = cgroup_attach_task(from_cgrp, tsk, false);
+		if (retval)
+			break;
+	}
+	percpu_up_write(&cgroup_threadgroup_rwsem);
+	mutex_unlock(&cgroup_mutex);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
+/**
+ * cgroup_transfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
+ *
+ * Locking rules between cgroup_post_fork() and the migration path
+ * guarantee that, if a task is forking while being migrated, the new child
+ * is guaranteed to be either visible in the source cgroup after the
+ * parent's migration is complete or put into the target cgroup.  No task
+ * can slip out of migration through forking.
+ */
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+{
+	DEFINE_CGROUP_MGCTX(mgctx);
+	struct cgrp_cset_link *link;
+	struct css_task_iter it;
+	struct task_struct *task;
+	int ret;
+
+	if (cgroup_on_dfl(to))
+		return -EINVAL;
+
+	if (!cgroup_may_migrate_to(to))
+		return -EBUSY;
+
+	mutex_lock(&cgroup_mutex);
+
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+
+	/* all tasks in @from are being moved, all csets are source */
+	spin_lock_irq(&css_set_lock);
+	list_for_each_entry(link, &from->cset_links, cset_link)
+		cgroup_migrate_add_src(link->cset, to, &mgctx);
+	spin_unlock_irq(&css_set_lock);
+
+	ret = cgroup_migrate_prepare_dst(&mgctx);
+	if (ret)
+		goto out_err;
+
+	/*
+	 * Migrate tasks one-by-one until @from is empty.  This fails iff
+	 * ->can_attach() fails.
+	 */
+	do {
+		css_task_iter_start(&from->self, &it);
+		task = css_task_iter_next(&it);
+		if (task)
+			get_task_struct(task);
+		css_task_iter_end(&it);
+
+		if (task) {
+			ret = cgroup_migrate(task, false, &mgctx);
+			if (!ret)
+				trace_cgroup_transfer_tasks(to, task, false);
+			put_task_struct(task);
+		}
+	} while (task && !ret);
+out_err:
+	cgroup_migrate_finish(&mgctx);
+	percpu_up_write(&cgroup_threadgroup_rwsem);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+
+/*
+ * Stuff for reading the 'tasks'/'procs' files.
+ *
+ * Reading this file can return large amounts of data if a cgroup has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ */
+
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+	CGROUP_FILE_PROCS,
+	CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+	/*
+	 * used to find which pidlist is wanted. doesn't change as long as
+	 * this particular list stays in the list.
+	 */
+	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+	/* array of xids */
+	pid_t *list;
+	/* how many elements the above list has */
+	int length;
+	/* each of these stored in a list by its cgroup */
+	struct list_head links;
+	/* pointer to the cgroup we belong to, for list removal purposes */
+	struct cgroup *owner;
+	/* for delayed destruction */
+	struct delayed_work destroy_dwork;
+};
+
+/*
+ * The following two functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static void *pidlist_allocate(int count)
+{
+	if (PIDLIST_TOO_LARGE(count))
+		return vmalloc(count * sizeof(pid_t));
+	else
+		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+}
+
+static void pidlist_free(void *p)
+{
+	kvfree(p);
+}
+
+/*
+ * Used to destroy all pidlists lingering waiting for destroy timer.  None
+ * should be left afterwards.
+ */
+void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
+{
+	struct cgroup_pidlist *l, *tmp_l;
+
+	mutex_lock(&cgrp->pidlist_mutex);
+	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+	mutex_unlock(&cgrp->pidlist_mutex);
+
+	flush_workqueue(cgroup_pidlist_destroy_wq);
+	BUG_ON(!list_empty(&cgrp->pidlists));
+}
+
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+						destroy_dwork);
+	struct cgroup_pidlist *tofree = NULL;
+
+	mutex_lock(&l->owner->pidlist_mutex);
+
+	/*
+	 * Destroy iff we didn't get queued again.  The state won't change
+	 * as destroy_dwork can only be queued while locked.
+	 */
+	if (!delayed_work_pending(dwork)) {
+		list_del(&l->links);
+		pidlist_free(l->list);
+		put_pid_ns(l->key.ns);
+		tofree = l;
+	}
+
+	mutex_unlock(&l->owner->pidlist_mutex);
+	kfree(tofree);
+}
+
+/*
+ * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
+ * Returns the number of unique elements.
+ */
+static int pidlist_uniq(pid_t *list, int length)
+{
+	int src, dest = 1;
+
+	/*
+	 * we presume the 0th element is unique, so src starts at 1. trivial
+	 * edge cases first; no work needs to be done for either
+	 */
+	if (length == 0 || length == 1)
+		return length;
+	/* src and dest walk down the list; dest counts unique elements */
+	for (src = 1; src < length; src++) {
+		/* find next unique element */
+		while (list[src] == list[src-1]) {
+			src++;
+			if (src == length)
+				goto after;
+		}
+		/* dest always points to where the next unique element goes */
+		list[dest] = list[src];
+		dest++;
+	}
+after:
+	return dest;
+}
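+
+/*
+ * Example: pidlist_uniq() on {1, 1, 2, 3, 3} compacts the array to
+ * {1, 2, 3, ...} and returns 3.
+ */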
+
+/*
+ * The two pid files - tasks and cgroup.procs - guarantee that the result
+ * is sorted, which forced this whole pidlist fiasco.  As pid order is
+ * different per namespace, each namespace needs a differently sorted list,
+ * making it impossible to use, for example, a single rbtree of member tasks
+ * sorted by task pointer.  As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement a shared pool of
+ * pidlists keyed by cgroup and namespace.
+ */
+static int cmppid(const void *a, const void *b)
+{
+	return *(pid_t *)a - *(pid_t *)b;
+}
+
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+						  enum cgroup_filetype type)
+{
+	struct cgroup_pidlist *l;
+	/* don't need task_nsproxy() if we're looking at ourself */
+	struct pid_namespace *ns = task_active_pid_ns(current);
+
+	lockdep_assert_held(&cgrp->pidlist_mutex);
+
+	list_for_each_entry(l, &cgrp->pidlists, links)
+		if (l->key.type == type && l->key.ns == ns)
+			return l;
+	return NULL;
+}
+
+/*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
+ */
+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+						enum cgroup_filetype type)
+{
+	struct cgroup_pidlist *l;
+
+	lockdep_assert_held(&cgrp->pidlist_mutex);
+
+	l = cgroup_pidlist_find(cgrp, type);
+	if (l)
+		return l;
+
+	/* entry not found; create a new one */
+	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+	if (!l)
+		return l;
+
+	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
+	l->key.type = type;
+	/* don't need task_nsproxy() if we're looking at ourself */
+	l->key.ns = get_pid_ns(task_active_pid_ns(current));
+	l->owner = cgrp;
+	list_add(&l->links, &cgrp->pidlists);
+	return l;
+}
+
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ *
+ * Return the number of tasks in the cgroup.  The returned number can be
+ * higher than the actual number of tasks due to css_set references from
+ * namespace roots and temporary usages.
+ */
+static int cgroup_task_count(const struct cgroup *cgrp)
+{
+	int count = 0;
+	struct cgrp_cset_link *link;
+
+	spin_lock_irq(&css_set_lock);
+	list_for_each_entry(link, &cgrp->cset_links, cset_link)
+		count += atomic_read(&link->cset->refcount);
+	spin_unlock_irq(&css_set_lock);
+	return count;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
+ */
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+			      struct cgroup_pidlist **lp)
+{
+	pid_t *array;
+	int length;
+	int pid, n = 0; /* used for populating the array */
+	struct css_task_iter it;
+	struct task_struct *tsk;
+	struct cgroup_pidlist *l;
+
+	lockdep_assert_held(&cgrp->pidlist_mutex);
+
+	/*
+	 * If cgroup gets more users after we read count, we won't have
+	 * enough space - tough.  This race is indistinguishable to the
+	 * caller from the case that the additional cgroup users didn't
+	 * show up until sometime later on.
+	 */
+	length = cgroup_task_count(cgrp);
+	array = pidlist_allocate(length);
+	if (!array)
+		return -ENOMEM;
+	/* now, populate the array */
+	css_task_iter_start(&cgrp->self, &it);
+	while ((tsk = css_task_iter_next(&it))) {
+		if (unlikely(n == length))
+			break;
+		/* get tgid or pid for procs or tasks file respectively */
+		if (type == CGROUP_FILE_PROCS)
+			pid = task_tgid_vnr(tsk);
+		else
+			pid = task_pid_vnr(tsk);
+		if (pid > 0) /* make sure to only use valid results */
+			array[n++] = pid;
+	}
+	css_task_iter_end(&it);
+	length = n;
+	/* now sort & (if procs) strip out duplicates */
+	sort(array, length, sizeof(pid_t), cmppid, NULL);
+	if (type == CGROUP_FILE_PROCS)
+		length = pidlist_uniq(array, length);
+
+	l = cgroup_pidlist_find_create(cgrp, type);
+	if (!l) {
+		pidlist_free(array);
+		return -ENOMEM;
+	}
+
+	/* store array, freeing old if necessary */
+	pidlist_free(l->list);
+	l->list = array;
+	l->length = length;
+	*lp = l;
+	return 0;
+}
+
+/*
+ * seq_file methods for the tasks/procs files. The seq_file position is the
+ * next pid to display; the seq_file iterator is a pointer to the pid
+ * in the cgroup->l->list array.
+ */
+
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
+{
+	/*
+	 * Initially we receive a position value that corresponds to
+	 * one more than the last pid shown (or 0 on the first call or
+	 * after a seek to the start). Use a binary search to find the
+	 * next pid to display, if any.
+	 */
+	struct kernfs_open_file *of = s->private;
+	struct cgroup *cgrp = seq_css(s)->cgroup;
+	struct cgroup_pidlist *l;
+	enum cgroup_filetype type = seq_cft(s)->private;
+	int index = 0, pid = *pos;
+	int *iter, ret;
+
+	mutex_lock(&cgrp->pidlist_mutex);
+
+	/*
+	 * !NULL @of->priv indicates that this isn't the first start()
+	 * after open.  If the matching pidlist is around, we can use that.
+	 * Look for it.  Note that @of->priv can't be used directly.  It
+	 * could already have been destroyed.
+	 */
+	if (of->priv)
+		of->priv = cgroup_pidlist_find(cgrp, type);
+
+	/*
+	 * Either this is the first start() after open or the matching
+	 * pidlist has been destroyed in between.  Create a new one.
+	 */
+	if (!of->priv) {
+		ret = pidlist_array_load(cgrp, type,
+					 (struct cgroup_pidlist **)&of->priv);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+	l = of->priv;
+
+	if (pid) {
+		int end = l->length;
+
+		while (index < end) {
+			int mid = (index + end) / 2;
+			if (l->list[mid] == pid) {
+				index = mid;
+				break;
+			} else if (l->list[mid] <= pid)
+				index = mid + 1;
+			else
+				end = mid;
+		}
+	}
+	/* If we're off the end of the array, we're done */
+	if (index >= l->length)
+		return NULL;
+	/* Update the abstract position to be the actual pid that we found */
+	iter = l->list + index;
+	*pos = *iter;
+	return iter;
+}
+
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
+{
+	struct kernfs_open_file *of = s->private;
+	struct cgroup_pidlist *l = of->priv;
+
+	if (l)
+		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+				 CGROUP_PIDLIST_DESTROY_DELAY);
+	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
+}
+
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct kernfs_open_file *of = s->private;
+	struct cgroup_pidlist *l = of->priv;
+	pid_t *p = v;
+	pid_t *end = l->list + l->length;
+	/*
+	 * Advance to the next pid in the array. If this goes off the
+	 * end, we're done
+	 */
+	p++;
+	if (p >= end) {
+		return NULL;
+	} else {
+		*pos = *p;
+		return p;
+	}
+}
+
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
+{
+	seq_printf(s, "%d\n", *(int *)v);
+
+	return 0;
+}
+
+static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
+				  char *buf, size_t nbytes, loff_t off)
+{
+	return __cgroup_procs_write(of, buf, nbytes, off, false);
+}
+
+static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
+					  char *buf, size_t nbytes, loff_t off)
+{
+	struct cgroup *cgrp;
+
+	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENODEV;
+	spin_lock(&release_agent_path_lock);
+	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+		sizeof(cgrp->root->release_agent_path));
+	spin_unlock(&release_agent_path_lock);
+	cgroup_kn_unlock(of->kn);
+	return nbytes;
+}
+
+static int cgroup_release_agent_show(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+	spin_lock(&release_agent_path_lock);
+	seq_puts(seq, cgrp->root->release_agent_path);
+	spin_unlock(&release_agent_path_lock);
+	seq_putc(seq, '\n');
+	return 0;
+}
+
+static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
+{
+	seq_puts(seq, "0\n");
+	return 0;
+}
+
+static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
+					 struct cftype *cft)
+{
+	return notify_on_release(css->cgroup);
+}
+
+static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
+					  struct cftype *cft, u64 val)
+{
+	if (val)
+		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+	else
+		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+	return 0;
+}
+
+static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
+				      struct cftype *cft)
+{
+	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+}
+
+static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
+				       struct cftype *cft, u64 val)
+{
+	if (val)
+		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+	else
+		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+	return 0;
+}
+
+/* cgroup core interface files for the legacy hierarchies */
+struct cftype cgroup1_base_files[] = {
+	{
+		.name = "cgroup.procs",
+		.seq_start = cgroup_pidlist_start,
+		.seq_next = cgroup_pidlist_next,
+		.seq_stop = cgroup_pidlist_stop,
+		.seq_show = cgroup_pidlist_show,
+		.private = CGROUP_FILE_PROCS,
+		.write = cgroup_procs_write,
+	},
+	{
+		.name = "cgroup.clone_children",
+		.read_u64 = cgroup_clone_children_read,
+		.write_u64 = cgroup_clone_children_write,
+	},
+	{
+		.name = "cgroup.sane_behavior",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.seq_show = cgroup_sane_behavior_show,
+	},
+	{
+		.name = "tasks",
+		.seq_start = cgroup_pidlist_start,
+		.seq_next = cgroup_pidlist_next,
+		.seq_stop = cgroup_pidlist_stop,
+		.seq_show = cgroup_pidlist_show,
+		.private = CGROUP_FILE_TASKS,
+		.write = cgroup_tasks_write,
+	},
+	{
+		.name = "notify_on_release",
+		.read_u64 = cgroup_read_notify_on_release,
+		.write_u64 = cgroup_write_notify_on_release,
+	},
+	{
+		.name = "release_agent",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.seq_show = cgroup_release_agent_show,
+		.write = cgroup_release_agent_write,
+		.max_write_len = PATH_MAX - 1,
+	},
+	{ }	/* terminate */
+};
+
+/* Display information about each subsystem and each hierarchy */
+static int proc_cgroupstats_show(struct seq_file *m, void *v)
+{
+	struct cgroup_subsys *ss;
+	int i;
+
+	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+	/*
+	 * ideally we don't want subsystems moving around while we do this.
+	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+	 * subsys/hierarchy state.
+	 */
+	mutex_lock(&cgroup_mutex);
+
+	for_each_subsys(ss, i)
+		seq_printf(m, "%s\t%d\t%d\t%d\n",
+			   ss->legacy_name, ss->root->hierarchy_id,
+			   atomic_read(&ss->root->nr_cgrps),
+			   cgroup_ssid_enabled(i));
+
+	mutex_unlock(&cgroup_mutex);
+	return 0;
+}
+
+static int cgroupstats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_cgroupstats_show, NULL);
+}
+
+const struct file_operations proc_cgroupstats_operations = {
+	.open = cgroupstats_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+/**
+ * cgroupstats_build - build and fill cgroupstats
+ * @stats: cgroupstats to fill information into
+ * @dentry: A dentry belonging to the cgroup for which stats have
+ * been requested.
+ *
+ * Build and fill cgroupstats so that taskstats can export it to user
+ * space.
+ */
+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
+{
+	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+	struct cgroup *cgrp;
+	struct css_task_iter it;
+	struct task_struct *tsk;
+
+	/* it should be kernfs_node belonging to cgroupfs and is a directory */
+	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+	    kernfs_type(kn) != KERNFS_DIR)
+		return -EINVAL;
+
+	mutex_lock(&cgroup_mutex);
+
+	/*
+	 * We aren't being called from kernfs and there's no guarantee on
+	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
+	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
+	 */
+	rcu_read_lock();
+	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+	if (!cgrp || cgroup_is_dead(cgrp)) {
+		rcu_read_unlock();
+		mutex_unlock(&cgroup_mutex);
+		return -ENOENT;
+	}
+	rcu_read_unlock();
+
+	css_task_iter_start(&cgrp->self, &it);
+	while ((tsk = css_task_iter_next(&it))) {
+		switch (tsk->state) {
+		case TASK_RUNNING:
+			stats->nr_running++;
+			break;
+		case TASK_INTERRUPTIBLE:
+			stats->nr_sleeping++;
+			break;
+		case TASK_UNINTERRUPTIBLE:
+			stats->nr_uninterruptible++;
+			break;
+		case TASK_STOPPED:
+			stats->nr_stopped++;
+			break;
+		default:
+			if (delayacct_is_task_waiting_on_io(tsk))
+				stats->nr_io_wait++;
+			break;
+		}
+	}
+	css_task_iter_end(&it);
+
+	mutex_unlock(&cgroup_mutex);
+	return 0;
+}
+
+void cgroup1_check_for_release(struct cgroup *cgrp)
+{
+	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
+	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
+		schedule_work(&cgrp->release_agent_work);
+}
+
+/*
+ * Notify userspace when a cgroup is released, by running the
+ * configured release agent with the name of the cgroup (path
+ * relative to the root of the cgroup file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this cgroup.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this cgroup before it is removed, or that some other
+ * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
+ * The presumed 'rmdir' will fail quietly if this cgroup is no longer
+ * unused, and this cgroup will be reprieved from its death sentence,
+ * to continue to serve a useful existence.  Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
+ * means only wait until the task is successfully execve()'d.  The
+ * separate release agent task is forked by call_usermodehelper(),
+ * then control in this thread returns here, without waiting for the
+ * release agent task.  We don't bother to wait because the caller of
+ * this routine has no use for the exit status of the release agent
+ * task, so no sense holding our caller up for that.
+ */
+void cgroup1_release_agent(struct work_struct *work)
+{
+	struct cgroup *cgrp =
+		container_of(work, struct cgroup, release_agent_work);
+	char *pathbuf = NULL, *agentbuf = NULL;
+	char *argv[3], *envp[3];
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+
+	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+	if (!pathbuf || !agentbuf)
+		goto out;
+
+	spin_lock_irq(&css_set_lock);
+	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+	spin_unlock_irq(&css_set_lock);
+	if (ret < 0 || ret >= PATH_MAX)
+		goto out;
+
+	argv[0] = agentbuf;
+	argv[1] = pathbuf;
+	argv[2] = NULL;
+
+	/* minimal command environment */
+	envp[0] = "HOME=/";
+	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+	envp[2] = NULL;
+
+	mutex_unlock(&cgroup_mutex);
+	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+	goto out_free;
+out:
+	mutex_unlock(&cgroup_mutex);
+out_free:
+	kfree(agentbuf);
+	kfree(pathbuf);
+}
+
+/*
+ * cgroup1_rename - Only allow simple rename of directories in place.
+ */
+static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
+			  const char *new_name_str)
+{
+	struct cgroup *cgrp = kn->priv;
+	int ret;
+
+	if (kernfs_type(kn) != KERNFS_DIR)
+		return -ENOTDIR;
+	if (kn->parent != new_parent)
+		return -EIO;
+
+	/*
+	 * We're gonna grab cgroup_mutex which nests outside kernfs
+	 * active_ref.  kernfs_rename() doesn't require active_ref
+	 * protection.  Break them before grabbing cgroup_mutex.
+	 */
+	kernfs_break_active_protection(new_parent);
+	kernfs_break_active_protection(kn);
+
+	mutex_lock(&cgroup_mutex);
+
+	ret = kernfs_rename(kn, new_parent, new_name_str);
+	if (!ret)
+		trace_cgroup_rename(cgrp);
+
+	mutex_unlock(&cgroup_mutex);
+
+	kernfs_unbreak_active_protection(kn);
+	kernfs_unbreak_active_protection(new_parent);
+	return ret;
+}
+
+static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
+{
+	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+	struct cgroup_subsys *ss;
+	int ssid;
+
+	for_each_subsys(ss, ssid)
+		if (root->subsys_mask & (1 << ssid))
+			seq_show_option(seq, ss->legacy_name, NULL);
+	if (root->flags & CGRP_ROOT_NOPREFIX)
+		seq_puts(seq, ",noprefix");
+	if (root->flags & CGRP_ROOT_XATTR)
+		seq_puts(seq, ",xattr");
+
+	spin_lock(&release_agent_path_lock);
+	if (strlen(root->release_agent_path))
+		seq_show_option(seq, "release_agent",
+				root->release_agent_path);
+	spin_unlock(&release_agent_path_lock);
+
+	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
+		seq_puts(seq, ",clone_children");
+	if (strlen(root->name))
+		seq_show_option(seq, "name", root->name);
+	return 0;
+}
+
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
+{
+	char *token, *o = data;
+	bool all_ss = false, one_ss = false;
+	u16 mask = U16_MAX;
+	struct cgroup_subsys *ss;
+	int nr_opts = 0;
+	int i;
+
+#ifdef CONFIG_CPUSETS
+	mask = ~((u16)1 << cpuset_cgrp_id);
+#endif
+
+	memset(opts, 0, sizeof(*opts));
+
+	while ((token = strsep(&o, ",")) != NULL) {
+		nr_opts++;
+
+		if (!*token)
+			return -EINVAL;
+		if (!strcmp(token, "none")) {
+			/* Explicitly have no subsystems */
+			opts->none = true;
+			continue;
+		}
+		if (!strcmp(token, "all")) {
+			/* Mutually exclusive option 'all' + subsystem name */
+			if (one_ss)
+				return -EINVAL;
+			all_ss = true;
+			continue;
+		}
+		if (!strcmp(token, "noprefix")) {
+			opts->flags |= CGRP_ROOT_NOPREFIX;
+			continue;
+		}
+		if (!strcmp(token, "clone_children")) {
+			opts->cpuset_clone_children = true;
+			continue;
+		}
+		if (!strcmp(token, "xattr")) {
+			opts->flags |= CGRP_ROOT_XATTR;
+			continue;
+		}
+		if (!strncmp(token, "release_agent=", 14)) {
+			/* Specifying two release agents is forbidden */
+			if (opts->release_agent)
+				return -EINVAL;
+			opts->release_agent =
+				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
+			if (!opts->release_agent)
+				return -ENOMEM;
+			continue;
+		}
+		if (!strncmp(token, "name=", 5)) {
+			const char *name = token + 5;
+			/* Can't specify an empty name */
+			if (!strlen(name))
+				return -EINVAL;
+			/* Must match [\w.-]+ */
+			for (i = 0; i < strlen(name); i++) {
+				char c = name[i];
+				if (isalnum(c))
+					continue;
+				if ((c == '.') || (c == '-') || (c == '_'))
+					continue;
+				return -EINVAL;
+			}
+			/* Specifying two names is forbidden */
+			if (opts->name)
+				return -EINVAL;
+			opts->name = kstrndup(name,
+					      MAX_CGROUP_ROOT_NAMELEN - 1,
+					      GFP_KERNEL);
+			if (!opts->name)
+				return -ENOMEM;
+
+			continue;
+		}
+
+		for_each_subsys(ss, i) {
+			if (strcmp(token, ss->legacy_name))
+				continue;
+			if (!cgroup_ssid_enabled(i))
+				continue;
+			if (cgroup1_ssid_disabled(i))
+				continue;
+
+			/* Mutually exclusive option 'all' + subsystem name */
+			if (all_ss)
+				return -EINVAL;
+			opts->subsys_mask |= (1 << i);
+			one_ss = true;
+
+			break;
+		}
+		if (i == CGROUP_SUBSYS_COUNT)
+			return -ENOENT;
+	}
+
+	/*
+	 * If the 'all' option was specified, select all the subsystems;
+	 * otherwise, if none of 'none', 'name=' or a subsystem name was
+	 * specified, default to 'all'.
+	 */
+	if (all_ss || (!one_ss && !opts->none && !opts->name))
+		for_each_subsys(ss, i)
+			if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
+				opts->subsys_mask |= (1 << i);
+
+	/*
+	 * We either have to specify by name or by subsystems. (So all
+	 * empty hierarchies must have a name).
+	 */
+	if (!opts->subsys_mask && !opts->name)
+		return -EINVAL;
+
+	/*
+	 * Option noprefix was introduced just for backward compatibility
+	 * with the old cpuset, so we allow noprefix only if mounting just
+	 * the cpuset subsystem.
+	 */
+	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
+		return -EINVAL;
+
+	/* Can't specify "none" and some subsystems */
+	if (opts->subsys_mask && opts->none)
+		return -EINVAL;
+
+	return 0;
+}
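+
+/*
+ * Illustrative option strings the parser above accepts (standard
+ * cgroup-v1 usage, e.g. per Documentation/cgroup-v1/cgroups.txt):
+ *
+ *	mount -t cgroup -o cpuset none /sys/fs/cgroup/cpuset
+ *	mount -t cgroup -o none,name=systemd none /sys/fs/cgroup/systemd
+ */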
+
+static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
+{
+	int ret = 0;
+	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+	struct cgroup_sb_opts opts;
+	u16 added_mask, removed_mask;
+
+	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+	/* See what subsystems are wanted */
+	ret = parse_cgroupfs_options(data, &opts);
+	if (ret)
+		goto out_unlock;
+
+	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
+			task_tgid_nr(current), current->comm);
+
+	added_mask = opts.subsys_mask & ~root->subsys_mask;
+	removed_mask = root->subsys_mask & ~opts.subsys_mask;
+
+	/* Don't allow flags or name to change at remount */
+	if ((opts.flags ^ root->flags) ||
+	    (opts.name && strcmp(opts.name, root->name))) {
+		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
+		       opts.flags, opts.name ?: "", root->flags, root->name);
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* remounting is not allowed for populated hierarchies */
+	if (!list_empty(&root->cgrp.self.children)) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	ret = rebind_subsystems(root, added_mask);
+	if (ret)
+		goto out_unlock;
+
+	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
+
+	if (opts.release_agent) {
+		spin_lock(&release_agent_path_lock);
+		strcpy(root->release_agent_path, opts.release_agent);
+		spin_unlock(&release_agent_path_lock);
+	}
+
+	trace_cgroup_remount(root);
+
+ out_unlock:
+	kfree(opts.release_agent);
+	kfree(opts.name);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+
+struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
+	.rename			= cgroup1_rename,
+	.show_options		= cgroup1_show_options,
+	.remount_fs		= cgroup1_remount,
+	.mkdir			= cgroup_mkdir,
+	.rmdir			= cgroup_rmdir,
+	.show_path		= cgroup_show_path,
+};
+
+struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
+			     void *data, unsigned long magic,
+			     struct cgroup_namespace *ns)
+{
+	struct super_block *pinned_sb = NULL;
+	struct cgroup_sb_opts opts;
+	struct cgroup_root *root;
+	struct cgroup_subsys *ss;
+	struct dentry *dentry;
+	int i, ret;
+
+	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+	/* First find the desired set of subsystems */
+	ret = parse_cgroupfs_options(data, &opts);
+	if (ret)
+		goto out_unlock;
+
+	/*
+	 * Destruction of cgroup root is asynchronous, so subsystems may
+	 * still be dying after the previous unmount.  Let's drain the
+	 * dying subsystems.  We just need to ensure that the ones
+	 * unmounted previously finish dying and don't care about new ones
+	 * starting.  Testing ref liveness is good enough.
+	 */
+	for_each_subsys(ss, i) {
+		if (!(opts.subsys_mask & (1 << i)) ||
+		    ss->root == &cgrp_dfl_root)
+			continue;
+
+		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+			mutex_unlock(&cgroup_mutex);
+			msleep(10);
+			ret = restart_syscall();
+			goto out_free;
+		}
+		cgroup_put(&ss->root->cgrp);
+	}
+
+	for_each_root(root) {
+		bool name_match = false;
+
+		if (root == &cgrp_dfl_root)
+			continue;
+
+		/*
+		 * If we asked for a name then it must match.  Also, if
+		 * name matches but subsys_mask doesn't, we should fail.
+		 * Remember whether name matched.
+		 */
+		if (opts.name) {
+			if (strcmp(opts.name, root->name))
+				continue;
+			name_match = true;
+		}
+
+		/*
+		 * If we asked for subsystems (or explicitly for no
+		 * subsystems) then they must match.
+		 */
+		if ((opts.subsys_mask || opts.none) &&
+		    (opts.subsys_mask != root->subsys_mask)) {
+			if (!name_match)
+				continue;
+			ret = -EBUSY;
+			goto out_unlock;
+		}
+
+		if (root->flags ^ opts.flags)
+			pr_warn("new mount options do not match the existing superblock, will be ignored\n");
+
+		/*
+		 * We want to reuse @root whose lifetime is governed by its
+		 * ->cgrp.  Let's check whether @root is alive and keep it
+		 * that way.  As cgroup_kill_sb() can happen anytime, we
+		 * want to block it by pinning the sb so that @root doesn't
+		 * get killed before mount is complete.
+		 *
+		 * With the sb pinned, tryget_live can reliably indicate
+		 * whether @root can be reused.  If it's being killed,
+		 * drain it.  We could use a wait_queue for the wait, but this
+		 * path is super cold.  Let's just sleep a bit and retry.
+		 */
+		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+		if (IS_ERR(pinned_sb) ||
+		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+			mutex_unlock(&cgroup_mutex);
+			if (!IS_ERR_OR_NULL(pinned_sb))
+				deactivate_super(pinned_sb);
+			msleep(10);
+			ret = restart_syscall();
+			goto out_free;
+		}
+
+		ret = 0;
+		goto out_unlock;
+	}
+
+	/*
+	 * No such thing, create a new one.  name= matching without subsys
+	 * specification is allowed for already existing hierarchies but we
+	 * can't create a new one without subsys specification.
+	 */
+	if (!opts.subsys_mask && !opts.none) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* Hierarchies may only be created in the initial cgroup namespace. */
+	if (ns != &init_cgroup_ns) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+
+	root = kzalloc(sizeof(*root), GFP_KERNEL);
+	if (!root) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	init_cgroup_root(root, &opts);
+
+	ret = cgroup_setup_root(root, opts.subsys_mask);
+	if (ret)
+		cgroup_free_root(root);
+
+out_unlock:
+	mutex_unlock(&cgroup_mutex);
+out_free:
+	kfree(opts.release_agent);
+	kfree(opts.name);
+
+	if (ret)
+		return ERR_PTR(ret);
+
+	dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
+				 CGROUP_SUPER_MAGIC, ns);
+
+	/*
+	 * If @pinned_sb, we're reusing an existing root and holding an
+	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
+	 */
+	if (pinned_sb)
+		deactivate_super(pinned_sb);
+
+	return dentry;
+}
+
+static int __init cgroup1_wq_init(void)
+{
+	/*
+	 * Used to destroy pidlists; kept separate so it can serve as the
+	 * flush domain.  Cap @max_active at 1 too.
+	 */
+	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+						    0, 1);
+	BUG_ON(!cgroup_pidlist_destroy_wq);
+	return 0;
+}
+core_initcall(cgroup1_wq_init);
+
+static int __init cgroup_no_v1(char *str)
+{
+	struct cgroup_subsys *ss;
+	char *token;
+	int i;
+
+	while ((token = strsep(&str, ",")) != NULL) {
+		if (!*token)
+			continue;
+
+		if (!strcmp(token, "all")) {
+			cgroup_no_v1_mask = U16_MAX;
+			break;
+		}
+
+		for_each_subsys(ss, i) {
+			if (strcmp(token, ss->name) &&
+			    strcmp(token, ss->legacy_name))
+				continue;
+
+			cgroup_no_v1_mask |= 1 << i;
+		}
+	}
+	return 1;
+}
+__setup("cgroup_no_v1=", cgroup_no_v1);
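+
+/*
+ * Example (boot command line): "cgroup_no_v1=memory,cpu" keeps those
+ * controllers off all v1 hierarchies, while "cgroup_no_v1=all" blocks
+ * every controller from v1 mounts; v2 usage is unaffected.
+ */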
+
+
+#ifdef CONFIG_CGROUP_DEBUG
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+	if (!css)
+		return ERR_PTR(-ENOMEM);
+
+	return css;
+}
+
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+	kfree(css);
+}
+
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+				struct cftype *cft)
+{
+	return cgroup_task_count(css->cgroup);
+}
+
+static u64 current_css_set_read(struct cgroup_subsys_state *css,
+				struct cftype *cft)
+{
+	return (u64)(unsigned long)current->cgroups;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+					 struct cftype *cft)
+{
+	u64 count;
+
+	rcu_read_lock();
+	count = atomic_read(&task_css_set(current)->refcount);
+	rcu_read_unlock();
+	return count;
+}
+
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+	struct cgrp_cset_link *link;
+	struct css_set *cset;
+	char *name_buf;
+
+	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!name_buf)
+		return -ENOMEM;
+
+	spin_lock_irq(&css_set_lock);
+	rcu_read_lock();
+	cset = rcu_dereference(current->cgroups);
+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+		struct cgroup *c = link->cgrp;
+
+		cgroup_name(c, name_buf, NAME_MAX + 1);
+		seq_printf(seq, "Root %d group %s\n",
+			   c->root->hierarchy_id, name_buf);
+	}
+	rcu_read_unlock();
+	spin_unlock_irq(&css_set_lock);
+	kfree(name_buf);
+	return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
+{
+	struct cgroup_subsys_state *css = seq_css(seq);
+	struct cgrp_cset_link *link;
+
+	spin_lock_irq(&css_set_lock);
+	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+		struct css_set *cset = link->cset;
+		struct task_struct *task;
+		int count = 0;
+
+		seq_printf(seq, "css_set %p\n", cset);
+
+		list_for_each_entry(task, &cset->tasks, cg_list) {
+			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+				goto overflow;
+			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
+		}
+
+		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+				goto overflow;
+			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
+		}
+		continue;
+	overflow:
+		seq_puts(seq, "  ...\n");
+	}
+	spin_unlock_irq(&css_set_lock);
+	return 0;
+}
+
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	return (!cgroup_is_populated(css->cgroup) &&
+		!css_has_online_children(&css->cgroup->self));
+}
+
+static struct cftype debug_files[] =  {
+	{
+		.name = "taskcount",
+		.read_u64 = debug_taskcount_read,
+	},
+
+	{
+		.name = "current_css_set",
+		.read_u64 = current_css_set_read,
+	},
+
+	{
+		.name = "current_css_set_refcount",
+		.read_u64 = current_css_set_refcount_read,
+	},
+
+	{
+		.name = "current_css_set_cg_links",
+		.seq_show = current_css_set_cg_links_read,
+	},
+
+	{
+		.name = "cgroup_css_links",
+		.seq_show = cgroup_css_links_read,
+	},
+
+	{
+		.name = "releasable",
+		.read_u64 = releasable_read,
+	},
+
+	{ }	/* terminate */
+};
+
+struct cgroup_subsys debug_cgrp_subsys = {
+	.css_alloc = debug_css_alloc,
+	.css_free = debug_css_free,
+	.legacy_cftypes = debug_files,
+};
+#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup.c b/kernel/cgroup/cgroup.c
similarity index 72%
rename from kernel/cgroup.c
rename to kernel/cgroup/cgroup.c
index 53bbca7..e8f87bf 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -28,15 +28,13 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/cgroup.h>
+#include "cgroup-internal.h"
+
 #include <linux/cred.h>
-#include <linux/ctype.h>
 #include <linux/errno.h>
 #include <linux/init_task.h>
 #include <linux/kernel.h>
-#include <linux/list.h>
 #include <linux/magic.h>
-#include <linux/mm.h>
 #include <linux/mutex.h>
 #include <linux/mount.h>
 #include <linux/pagemap.h>
@@ -47,16 +45,9 @@
 #include <linux/spinlock.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/string.h>
-#include <linux/sort.h>
-#include <linux/kmod.h>
-#include <linux/delayacct.h>
-#include <linux/cgroupstats.h>
 #include <linux/hashtable.h>
-#include <linux/pid_namespace.h>
 #include <linux/idr.h>
-#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/kthread.h>
-#include <linux/delay.h>
 #include <linux/atomic.h>
 #include <linux/cpuset.h>
 #include <linux/proc_ns.h>
@@ -67,14 +58,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/cgroup.h>
 
-/*
- * pidlists linger the following amount before being destroyed.  The goal
- * is avoiding frequent destruction in the middle of consecutive read calls
- * Expiring in the middle is a performance problem not a correctness one.
- * 1 sec should be enough.
- */
-#define CGROUP_PIDLIST_DESTROY_DELAY	HZ
-
 #define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
 					 MAX_CFTYPE_NAME + 2)
 
@@ -88,14 +71,12 @@
  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
  * cgroup.h can use them for lockdep annotations.
  */
-#ifdef CONFIG_PROVE_RCU
 DEFINE_MUTEX(cgroup_mutex);
 DEFINE_SPINLOCK(css_set_lock);
+
+#ifdef CONFIG_PROVE_RCU
 EXPORT_SYMBOL_GPL(cgroup_mutex);
 EXPORT_SYMBOL_GPL(css_set_lock);
-#else
-static DEFINE_MUTEX(cgroup_mutex);
-static DEFINE_SPINLOCK(css_set_lock);
 #endif
 
 /*
@@ -110,12 +91,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(cgroup_file_kn_lock);
 
-/*
- * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
- * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
- */
-static DEFINE_SPINLOCK(release_agent_path_lock);
-
 struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
 
 #define cgroup_assert_mutex_or_rcu_locked()				\
@@ -131,15 +106,9 @@ struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
  */
 static struct workqueue_struct *cgroup_destroy_wq;
 
-/*
- * pidlist destructions need to be flushed on cgroup destruction.  Use a
- * separate workqueue as flush domain.
- */
-static struct workqueue_struct *cgroup_pidlist_destroy_wq;
-
 /* generate an array of cgroup subsystem pointers */
 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
-static struct cgroup_subsys *cgroup_subsys[] = {
+struct cgroup_subsys *cgroup_subsys[] = {
 #include <linux/cgroup_subsys.h>
 };
 #undef SUBSYS
@@ -186,18 +155,14 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
  */
 static bool cgrp_dfl_visible;
 
-/* Controllers blocked by the commandline in v1 */
-static u16 cgroup_no_v1_mask;
-
 /* some controllers are not supported in the default hierarchy */
 static u16 cgrp_dfl_inhibit_ss_mask;
 
 /* some controllers are implicitly enabled on the default hierarchy */
-static unsigned long cgrp_dfl_implicit_ss_mask;
+static u16 cgrp_dfl_implicit_ss_mask;
 
 /* The list of hierarchy roots */
-
-static LIST_HEAD(cgroup_roots);
+LIST_HEAD(cgroup_roots);
 static int cgroup_root_count;
 
 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */
@@ -213,13 +178,13 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
 static u64 css_serial_nr_next = 1;
 
 /*
- * These bitmask flags indicate whether tasks in the fork and exit paths have
- * fork/exit handlers to call. This avoids us having to do extra work in the
- * fork/exit path to check which subsystems have fork/exit callbacks.
+ * These bitmasks identify subsystems with specific features to avoid
+ * having to do iterative checks repeatedly.
  */
 static u16 have_fork_callback __read_mostly;
 static u16 have_exit_callback __read_mostly;
 static u16 have_free_callback __read_mostly;
+static u16 have_canfork_callback __read_mostly;
 
 /* cgroup namespace for init task */
 struct cgroup_namespace init_cgroup_ns = {
@@ -230,15 +195,9 @@ struct cgroup_namespace init_cgroup_ns = {
 	.root_cset	= &init_css_set,
 };
 
-/* Ditto for the can_fork callback. */
-static u16 have_canfork_callback __read_mostly;
-
 static struct file_system_type cgroup2_fs_type;
-static struct cftype cgroup_dfl_base_files[];
-static struct cftype cgroup_legacy_base_files[];
+static struct cftype cgroup_base_files[];
 
-static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
-static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
 static int cgroup_apply_control(struct cgroup *cgrp);
 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 static void css_task_iter_advance(struct css_task_iter *it);
@@ -259,7 +218,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
  * is fine for individual subsystems but unsuitable for cgroup core.  This
  * is a slower static_key_enabled() based test indexed by @ssid.
  */
-static bool cgroup_ssid_enabled(int ssid)
+bool cgroup_ssid_enabled(int ssid)
 {
 	if (CGROUP_SUBSYS_COUNT == 0)
 		return false;
@@ -267,11 +226,6 @@ static bool cgroup_ssid_enabled(int ssid)
 	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
 }
 
-static bool cgroup_ssid_no_v1(int ssid)
-{
-	return cgroup_no_v1_mask & (1 << ssid);
-}
-
 /**
  * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
  * @cgrp: the cgroup of interest
@@ -325,7 +279,7 @@ static bool cgroup_ssid_no_v1(int ssid)
  *
  * - debug: disallowed on the default hierarchy.
  */
-static bool cgroup_on_dfl(const struct cgroup *cgrp)
+bool cgroup_on_dfl(const struct cgroup *cgrp)
 {
 	return cgrp->root == &cgrp_dfl_root;
 }
@@ -481,12 +435,6 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
 	return css;
 }
 
-/* convenient tests for these bits */
-static inline bool cgroup_is_dead(const struct cgroup *cgrp)
-{
-	return !(cgrp->self.flags & CSS_ONLINE);
-}
-
 static void cgroup_get(struct cgroup *cgrp)
 {
 	WARN_ON_ONCE(cgroup_is_dead(cgrp));
@@ -518,11 +466,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 }
 EXPORT_SYMBOL_GPL(of_css);
 
-static int notify_on_release(const struct cgroup *cgrp)
-{
-	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-}
-
 /**
  * for_each_css - iterate all css's of a cgroup
  * @css: the iteration cursor
@@ -553,15 +496,6 @@ static int notify_on_release(const struct cgroup *cgrp)
 		else
 
 /**
- * for_each_subsys - iterate all enabled cgroup subsystems
- * @ss: the iteration cursor
- * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- */
-#define for_each_subsys(ss, ssid)					\
-	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
-	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
-
-/**
  * do_each_subsys_mask - filter for_each_subsys with a bitmask
  * @ss: the iteration cursor
  * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -585,10 +519,6 @@ static int notify_on_release(const struct cgroup *cgrp)
 	}								\
 } while (false)
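
A minimal usage sketch for the mask-filtered iterator, paired with its
while_each_subsys_mask() terminator (the function and log text are invented
for illustration):

	static void example_log_subsys_mask(u16 ss_mask)
	{
		struct cgroup_subsys *ss;
		int ssid;

		do_each_subsys_mask(ss, ssid, ss_mask) {
			pr_info("subsys %s (id %d) is in the mask\n", ss->name, ssid);
		} while_each_subsys_mask();
	}
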
 
-/* iterate across the hierarchies */
-#define for_each_root(root)						\
-	list_for_each_entry((root), &cgroup_roots, root_list)
-
 /* iterate over child cgrps, lock should be held throughout iteration */
 #define cgroup_for_each_live_child(child, cgrp)				\
 	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
@@ -615,29 +545,6 @@ static int notify_on_release(const struct cgroup *cgrp)
 			;						\
 		else
 
-static void cgroup_release_agent(struct work_struct *work);
-static void check_for_release(struct cgroup *cgrp);
-
-/*
- * A cgroup can be associated with multiple css_sets as different tasks may
- * belong to different cgroups on different hierarchies.  In the other
- * direction, a css_set is naturally associated with multiple cgroups.
- * This M:N relationship is represented by the following link structure
- * which exists for each association and allows traversing the associations
- * from both sides.
- */
-struct cgrp_cset_link {
-	/* the cgroup and css_set this link associates */
-	struct cgroup		*cgrp;
-	struct css_set		*cset;
-
-	/* list of cgrp_cset_links anchored at cgrp->cset_links */
-	struct list_head	cset_link;
-
-	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
-	struct list_head	cgrp_link;
-};
-
 /*
  * The default css_set - used by init and its children prior to any
  * hierarchies being mounted. It contains a pointer to the root state
@@ -647,12 +554,12 @@ struct cgrp_cset_link {
  */
 struct css_set init_css_set = {
 	.refcount		= ATOMIC_INIT(1),
-	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
 	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
 	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
+	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
+	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
 	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
 	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
-	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
 };
 
 static int css_set_count	= 1;	/* 1 for init_css_set */
@@ -699,7 +606,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 		if (!trigger)
 			break;
 
-		check_for_release(cgrp);
+		cgroup1_check_for_release(cgrp);
 		cgroup_file_notify(&cgrp->events_file);
 
 		cgrp = cgroup_parent(cgrp);
@@ -808,7 +715,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 	return key;
 }
 
-static void put_css_set_locked(struct css_set *cset)
+void put_css_set_locked(struct css_set *cset)
 {
 	struct cgrp_cset_link *link, *tmp_link;
 	struct cgroup_subsys *ss;
@@ -838,31 +745,6 @@ static void put_css_set_locked(struct css_set *cset)
 	kfree_rcu(cset, rcu_head);
 }
 
-static void put_css_set(struct css_set *cset)
-{
-	unsigned long flags;
-
-	/*
-	 * Ensure that the refcount doesn't hit zero while any readers
-	 * can see it. Similar to atomic_dec_and_lock(), but for an
-	 * rwlock
-	 */
-	if (atomic_add_unless(&cset->refcount, -1, 1))
-		return;
-
-	spin_lock_irqsave(&css_set_lock, flags);
-	put_css_set_locked(cset);
-	spin_unlock_irqrestore(&css_set_lock, flags);
-}
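
The comment in the removed put_css_set() describes a lock-avoidance idiom
worth spelling out; a generic sketch under hypothetical names (example_ref,
example_lock, and the kfree() stand-in for the real teardown):

	struct example_ref {
		atomic_t count;
	};

	static DEFINE_SPINLOCK(example_lock);

	static void example_put(struct example_ref *ref)
	{
		/* fast path: count was > 1, decrement without the lock */
		if (atomic_add_unless(&ref->count, -1, 1))
			return;

		/* slow path: count may hit zero, serialize against readers */
		spin_lock(&example_lock);
		if (atomic_dec_and_test(&ref->count))
			kfree(ref);	/* stand-in for the real teardown */
		spin_unlock(&example_lock);
	}
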
-
-/*
- * refcounted get/put for css_set objects
- */
-static inline void get_css_set(struct css_set *cset)
-{
-	atomic_inc(&cset->refcount);
-}
-
 /**
  * compare_css_sets - helper function for find_existing_css_set().
  * @cset: candidate css_set being tested
@@ -1095,13 +977,13 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	}
 
 	atomic_set(&cset->refcount, 1);
-	INIT_LIST_HEAD(&cset->cgrp_links);
 	INIT_LIST_HEAD(&cset->tasks);
 	INIT_LIST_HEAD(&cset->mg_tasks);
-	INIT_LIST_HEAD(&cset->mg_preload_node);
-	INIT_LIST_HEAD(&cset->mg_node);
 	INIT_LIST_HEAD(&cset->task_iters);
 	INIT_HLIST_NODE(&cset->hlist);
+	INIT_LIST_HEAD(&cset->cgrp_links);
+	INIT_LIST_HEAD(&cset->mg_preload_node);
+	INIT_LIST_HEAD(&cset->mg_node);
 
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
@@ -1138,7 +1020,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	return cset;
 }
 
-static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
+struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
 	struct cgroup *root_cgrp = kf_root->kn->priv;
 
@@ -1166,7 +1048,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
 	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
 }
 
-static void cgroup_free_root(struct cgroup_root *root)
+void cgroup_free_root(struct cgroup_root *root)
 {
 	if (root) {
 		idr_destroy(&root->cgroup_idr);
@@ -1283,8 +1165,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
  * Return the cgroup for "task" from the given hierarchy. Must be
  * called with cgroup_mutex and css_set_lock held.
  */
-static struct cgroup *task_cgroup_from_root(struct task_struct *task,
-					    struct cgroup_root *root)
+struct cgroup *task_cgroup_from_root(struct task_struct *task,
+				     struct cgroup_root *root)
 {
 	/*
 	 * No need to lock the task - since we hold cgroup_mutex the
@@ -1321,7 +1203,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  */
 
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
-static const struct file_operations proc_cgroupstats_operations;
 
 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 			      char *buf)
@@ -1415,7 +1296,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
  * inaccessible any time.  If the caller intends to continue to access the
  * cgroup, it should pin it before invoking this function.
  */
-static void cgroup_kn_unlock(struct kernfs_node *kn)
+void cgroup_kn_unlock(struct kernfs_node *kn)
 {
 	struct cgroup *cgrp;
 
@@ -1447,8 +1328,7 @@ static void cgroup_kn_unlock(struct kernfs_node *kn)
  * locking under kernfs active protection and allows all kernfs operations
  * including self-removal.
  */
-static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
-					  bool drain_offline)
+struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
 {
 	struct cgroup *cgrp;
 
@@ -1532,9 +1412,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
 
 	if (!css->ss) {
 		if (cgroup_on_dfl(cgrp))
-			cfts = cgroup_dfl_base_files;
+			cfts = cgroup_base_files;
 		else
-			cfts = cgroup_legacy_base_files;
+			cfts = cgroup1_base_files;
 
 		return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
 	}
@@ -1559,7 +1439,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
 	return ret;
 }
 
-static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
+int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 {
 	struct cgroup *dcgrp = &dst_root->cgrp;
 	struct cgroup_subsys *ss;
@@ -1629,8 +1509,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 	return 0;
 }
 
-static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
-			    struct kernfs_root *kf_root)
+int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+		     struct kernfs_root *kf_root)
 {
 	int len = 0;
 	char *buf = NULL;
@@ -1656,237 +1536,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 	return len;
 }
 
-static int cgroup_show_options(struct seq_file *seq,
-			       struct kernfs_root *kf_root)
-{
-	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
-	struct cgroup_subsys *ss;
-	int ssid;
-
-	if (root != &cgrp_dfl_root)
-		for_each_subsys(ss, ssid)
-			if (root->subsys_mask & (1 << ssid))
-				seq_show_option(seq, ss->legacy_name, NULL);
-	if (root->flags & CGRP_ROOT_NOPREFIX)
-		seq_puts(seq, ",noprefix");
-	if (root->flags & CGRP_ROOT_XATTR)
-		seq_puts(seq, ",xattr");
-
-	spin_lock(&release_agent_path_lock);
-	if (strlen(root->release_agent_path))
-		seq_show_option(seq, "release_agent",
-				root->release_agent_path);
-	spin_unlock(&release_agent_path_lock);
-
-	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
-		seq_puts(seq, ",clone_children");
-	if (strlen(root->name))
-		seq_show_option(seq, "name", root->name);
-	return 0;
-}
-
-struct cgroup_sb_opts {
-	u16 subsys_mask;
-	unsigned int flags;
-	char *release_agent;
-	bool cpuset_clone_children;
-	char *name;
-	/* User explicitly requested empty subsystem */
-	bool none;
-};
-
-static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
-{
-	char *token, *o = data;
-	bool all_ss = false, one_ss = false;
-	u16 mask = U16_MAX;
-	struct cgroup_subsys *ss;
-	int nr_opts = 0;
-	int i;
-
-#ifdef CONFIG_CPUSETS
-	mask = ~((u16)1 << cpuset_cgrp_id);
-#endif
-
-	memset(opts, 0, sizeof(*opts));
-
-	while ((token = strsep(&o, ",")) != NULL) {
-		nr_opts++;
-
-		if (!*token)
-			return -EINVAL;
-		if (!strcmp(token, "none")) {
-			/* Explicitly have no subsystems */
-			opts->none = true;
-			continue;
-		}
-		if (!strcmp(token, "all")) {
-			/* Mutually exclusive option 'all' + subsystem name */
-			if (one_ss)
-				return -EINVAL;
-			all_ss = true;
-			continue;
-		}
-		if (!strcmp(token, "noprefix")) {
-			opts->flags |= CGRP_ROOT_NOPREFIX;
-			continue;
-		}
-		if (!strcmp(token, "clone_children")) {
-			opts->cpuset_clone_children = true;
-			continue;
-		}
-		if (!strcmp(token, "xattr")) {
-			opts->flags |= CGRP_ROOT_XATTR;
-			continue;
-		}
-		if (!strncmp(token, "release_agent=", 14)) {
-			/* Specifying two release agents is forbidden */
-			if (opts->release_agent)
-				return -EINVAL;
-			opts->release_agent =
-				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
-			if (!opts->release_agent)
-				return -ENOMEM;
-			continue;
-		}
-		if (!strncmp(token, "name=", 5)) {
-			const char *name = token + 5;
-			/* Can't specify an empty name */
-			if (!strlen(name))
-				return -EINVAL;
-			/* Must match [\w.-]+ */
-			for (i = 0; i < strlen(name); i++) {
-				char c = name[i];
-				if (isalnum(c))
-					continue;
-				if ((c == '.') || (c == '-') || (c == '_'))
-					continue;
-				return -EINVAL;
-			}
-			/* Specifying two names is forbidden */
-			if (opts->name)
-				return -EINVAL;
-			opts->name = kstrndup(name,
-					      MAX_CGROUP_ROOT_NAMELEN - 1,
-					      GFP_KERNEL);
-			if (!opts->name)
-				return -ENOMEM;
-
-			continue;
-		}
-
-		for_each_subsys(ss, i) {
-			if (strcmp(token, ss->legacy_name))
-				continue;
-			if (!cgroup_ssid_enabled(i))
-				continue;
-			if (cgroup_ssid_no_v1(i))
-				continue;
-
-			/* Mutually exclusive option 'all' + subsystem name */
-			if (all_ss)
-				return -EINVAL;
-			opts->subsys_mask |= (1 << i);
-			one_ss = true;
-
-			break;
-		}
-		if (i == CGROUP_SUBSYS_COUNT)
-			return -ENOENT;
-	}
-
-	/*
-	 * If the 'all' option was specified, select all the subsystems;
-	 * otherwise, if 'none', 'name=' and subsystem name options were
-	 * not specified, default to 'all'.
-	 */
-	if (all_ss || (!one_ss && !opts->none && !opts->name))
-		for_each_subsys(ss, i)
-			if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
-				opts->subsys_mask |= (1 << i);
-
-	/*
-	 * We either have to specify by name or by subsystems. (So all
-	 * empty hierarchies must have a name).
-	 */
-	if (!opts->subsys_mask && !opts->name)
-		return -EINVAL;
-
-	/*
-	 * Option noprefix was introduced just for backward compatibility
-	 * with the old cpuset, so we allow noprefix only if mounting just
-	 * the cpuset subsystem.
-	 */
-	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
-		return -EINVAL;
-
-	/* Can't specify "none" and some subsystems */
-	if (opts->subsys_mask && opts->none)
-		return -EINVAL;
-
-	return 0;
-}
-
 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
 {
-	int ret = 0;
-	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
-	struct cgroup_sb_opts opts;
-	u16 added_mask, removed_mask;
-
-	if (root == &cgrp_dfl_root) {
-		pr_err("remount is not allowed\n");
-		return -EINVAL;
-	}
-
-	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
-
-	/* See what subsystems are wanted */
-	ret = parse_cgroupfs_options(data, &opts);
-	if (ret)
-		goto out_unlock;
-
-	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
-		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
-			task_tgid_nr(current), current->comm);
-
-	added_mask = opts.subsys_mask & ~root->subsys_mask;
-	removed_mask = root->subsys_mask & ~opts.subsys_mask;
-
-	/* Don't allow flags or name to change at remount */
-	if ((opts.flags ^ root->flags) ||
-	    (opts.name && strcmp(opts.name, root->name))) {
-		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
-		       opts.flags, opts.name ?: "", root->flags, root->name);
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
-	/* remounting is not allowed for populated hierarchies */
-	if (!list_empty(&root->cgrp.self.children)) {
-		ret = -EBUSY;
-		goto out_unlock;
-	}
-
-	ret = rebind_subsystems(root, added_mask);
-	if (ret)
-		goto out_unlock;
-
-	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
-
-	if (opts.release_agent) {
-		spin_lock(&release_agent_path_lock);
-		strcpy(root->release_agent_path, opts.release_agent);
-		spin_unlock(&release_agent_path_lock);
-	}
-
-	trace_cgroup_remount(root);
-
- out_unlock:
-	kfree(opts.release_agent);
-	kfree(opts.name);
-	mutex_unlock(&cgroup_mutex);
-	return ret;
+	pr_err("remount is not allowed\n");
+	return -EINVAL;
 }
 
 /*
@@ -1964,11 +1617,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
 
 	init_waitqueue_head(&cgrp->offline_waitq);
-	INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
+	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
 }
 
-static void init_cgroup_root(struct cgroup_root *root,
-			     struct cgroup_sb_opts *opts)
+void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
 {
 	struct cgroup *cgrp = &root->cgrp;
 
@@ -1987,10 +1639,11 @@ static void init_cgroup_root(struct cgroup_root *root,
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }
 
-static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 {
 	LIST_HEAD(tmp_links);
 	struct cgroup *root_cgrp = &root->cgrp;
+	struct kernfs_syscall_ops *kf_sops;
 	struct css_set *cset;
 	int i, ret;
 
@@ -2022,7 +1675,10 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	if (ret)
 		goto cancel_ref;
 
-	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
+	kf_sops = root == &cgrp_dfl_root ?
+		&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
+
+	root->kf_root = kernfs_create_root(kf_sops,
 					   KERNFS_ROOT_CREATE_DEACTIVATED,
 					   root_cgrp);
 	if (IS_ERR(root->kf_root)) {
@@ -2080,182 +1736,18 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	return ret;
 }
 
-static struct dentry *cgroup_mount(struct file_system_type *fs_type,
-			 int flags, const char *unused_dev_name,
-			 void *data)
+struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
+			       struct cgroup_root *root, unsigned long magic,
+			       struct cgroup_namespace *ns)
 {
-	bool is_v2 = fs_type == &cgroup2_fs_type;
-	struct super_block *pinned_sb = NULL;
-	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
-	struct cgroup_subsys *ss;
-	struct cgroup_root *root;
-	struct cgroup_sb_opts opts;
 	struct dentry *dentry;
-	int ret;
-	int i;
 	bool new_sb;
 
-	get_cgroup_ns(ns);
-
-	/* Check if the caller has permission to mount. */
-	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
-		put_cgroup_ns(ns);
-		return ERR_PTR(-EPERM);
-	}
+	dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
 
 	/*
-	 * The first time anyone tries to mount a cgroup, enable the list
-	 * linking each css_set to its tasks and fix up all existing tasks.
-	 */
-	if (!use_task_css_set_links)
-		cgroup_enable_task_cg_lists();
-
-	if (is_v2) {
-		if (data) {
-			pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
-			put_cgroup_ns(ns);
-			return ERR_PTR(-EINVAL);
-		}
-		cgrp_dfl_visible = true;
-		root = &cgrp_dfl_root;
-		cgroup_get(&root->cgrp);
-		goto out_mount;
-	}
-
-	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
-
-	/* First find the desired set of subsystems */
-	ret = parse_cgroupfs_options(data, &opts);
-	if (ret)
-		goto out_unlock;
-
-	/*
-	 * Destruction of cgroup root is asynchronous, so subsystems may
-	 * still be dying after the previous unmount.  Let's drain the
-	 * dying subsystems.  We just need to ensure that the ones
-	 * unmounted previously finish dying and don't care about new ones
-	 * starting.  Testing ref liveliness is good enough.
-	 */
-	for_each_subsys(ss, i) {
-		if (!(opts.subsys_mask & (1 << i)) ||
-		    ss->root == &cgrp_dfl_root)
-			continue;
-
-		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
-			mutex_unlock(&cgroup_mutex);
-			msleep(10);
-			ret = restart_syscall();
-			goto out_free;
-		}
-		cgroup_put(&ss->root->cgrp);
-	}
-
-	for_each_root(root) {
-		bool name_match = false;
-
-		if (root == &cgrp_dfl_root)
-			continue;
-
-		/*
-		 * If we asked for a name then it must match.  Also, if
-		 * name matches but subsys_mask doesn't, we should fail.
-		 * Remember whether name matched.
-		 */
-		if (opts.name) {
-			if (strcmp(opts.name, root->name))
-				continue;
-			name_match = true;
-		}
-
-		/*
-		 * If we asked for subsystems (or explicitly for no
-		 * subsystems) then they must match.
-		 */
-		if ((opts.subsys_mask || opts.none) &&
-		    (opts.subsys_mask != root->subsys_mask)) {
-			if (!name_match)
-				continue;
-			ret = -EBUSY;
-			goto out_unlock;
-		}
-
-		if (root->flags ^ opts.flags)
-			pr_warn("new mount options do not match the existing superblock, will be ignored\n");
-
-		/*
-		 * We want to reuse @root whose lifetime is governed by its
-		 * ->cgrp.  Let's check whether @root is alive and keep it
-		 * that way.  As cgroup_kill_sb() can happen anytime, we
-		 * want to block it by pinning the sb so that @root doesn't
-		 * get killed before mount is complete.
-		 *
-		 * With the sb pinned, tryget_live can reliably indicate
-		 * whether @root can be reused.  If it's being killed,
-		 * drain it.  We can use wait_queue for the wait but this
-		 * path is super cold.  Let's just sleep a bit and retry.
-		 */
-		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
-		if (IS_ERR(pinned_sb) ||
-		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
-			mutex_unlock(&cgroup_mutex);
-			if (!IS_ERR_OR_NULL(pinned_sb))
-				deactivate_super(pinned_sb);
-			msleep(10);
-			ret = restart_syscall();
-			goto out_free;
-		}
-
-		ret = 0;
-		goto out_unlock;
-	}
-
-	/*
-	 * No such thing, create a new one.  name= matching without subsys
-	 * specification is allowed for already existing hierarchies but we
-	 * can't create new one without subsys specification.
-	 */
-	if (!opts.subsys_mask && !opts.none) {
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
-	/* Hierarchies may only be created in the initial cgroup namespace. */
-	if (ns != &init_cgroup_ns) {
-		ret = -EPERM;
-		goto out_unlock;
-	}
-
-	root = kzalloc(sizeof(*root), GFP_KERNEL);
-	if (!root) {
-		ret = -ENOMEM;
-		goto out_unlock;
-	}
-
-	init_cgroup_root(root, &opts);
-
-	ret = cgroup_setup_root(root, opts.subsys_mask);
-	if (ret)
-		cgroup_free_root(root);
-
-out_unlock:
-	mutex_unlock(&cgroup_mutex);
-out_free:
-	kfree(opts.release_agent);
-	kfree(opts.name);
-
-	if (ret) {
-		put_cgroup_ns(ns);
-		return ERR_PTR(ret);
-	}
-out_mount:
-	dentry = kernfs_mount(fs_type, flags, root->kf_root,
-			      is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
-			      &new_sb);
-
-	/*
-	 * In non-init cgroup namespace, instead of root cgroup's
-	 * dentry, we return the dentry corresponding to the
-	 * cgroupns->root_cgrp.
+	 * In a non-init cgroup namespace, instead of the root cgroup's dentry,
+	 * we return the dentry corresponding to the cgroupns->root_cgrp.
 	 */
 	if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
 		struct dentry *nsdentry;
@@ -2277,13 +1769,45 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	if (IS_ERR(dentry) || !new_sb)
 		cgroup_put(&root->cgrp);
 
+	return dentry;
+}
+
+static struct dentry *cgroup_mount(struct file_system_type *fs_type,
+			 int flags, const char *unused_dev_name,
+			 void *data)
+{
+	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+	struct dentry *dentry;
+
+	get_cgroup_ns(ns);
+
+	/* Check if the caller has permission to mount. */
+	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
+		put_cgroup_ns(ns);
+		return ERR_PTR(-EPERM);
+	}
+
 	/*
-	 * If @pinned_sb, we're reusing an existing root and holding an
-	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
+	 * The first time anyone tries to mount a cgroup, enable the list
+	 * linking each css_set to its tasks and fix up all existing tasks.
 	 */
-	if (pinned_sb) {
-		WARN_ON(new_sb);
-		deactivate_super(pinned_sb);
+	if (!use_task_css_set_links)
+		cgroup_enable_task_cg_lists();
+
+	if (fs_type == &cgroup2_fs_type) {
+		if (data) {
+			pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+			put_cgroup_ns(ns);
+			return ERR_PTR(-EINVAL);
+		}
+		cgrp_dfl_visible = true;
+		cgroup_get(&cgrp_dfl_root.cgrp);
+
+		dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
+					 CGROUP2_SUPER_MAGIC, ns);
+	} else {
+		dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
+				       CGROUP_SUPER_MAGIC, ns);
 	}
 
 	put_cgroup_ns(ns);
@@ -2311,7 +1835,7 @@ static void cgroup_kill_sb(struct super_block *sb)
 	kernfs_kill_sb(sb);
 }
 
-static struct file_system_type cgroup_fs_type = {
+struct file_system_type cgroup_fs_type = {
 	.name = "cgroup",
 	.mount = cgroup_mount,
 	.kill_sb = cgroup_kill_sb,
@@ -2325,8 +1849,8 @@ static struct file_system_type cgroup2_fs_type = {
 	.fs_flags = FS_USERNS_MOUNT,
 };
 
-static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
-				 struct cgroup_namespace *ns)
+int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+			  struct cgroup_namespace *ns)
 {
 	struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
 
@@ -2389,49 +1913,18 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 }
 EXPORT_SYMBOL_GPL(task_cgroup_path);
 
-/* used to track tasks and other necessary states during migration */
-struct cgroup_taskset {
-	/* the src and dst cset list running through cset->mg_node */
-	struct list_head	src_csets;
-	struct list_head	dst_csets;
-
-	/* the subsys currently being processed */
-	int			ssid;
-
-	/*
-	 * Fields for cgroup_taskset_*() iteration.
-	 *
-	 * Before migration is committed, the target migration tasks are on
-	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
-	 * the csets on ->dst_csets.  ->csets point to either ->src_csets
-	 * or ->dst_csets depending on whether migration is committed.
-	 *
-	 * ->cur_cset and ->cur_task point to the current task position
-	 * during iteration.
-	 */
-	struct list_head	*csets;
-	struct css_set		*cur_cset;
-	struct task_struct	*cur_task;
-};
-
-#define CGROUP_TASKSET_INIT(tset)	(struct cgroup_taskset){	\
-	.src_csets		= LIST_HEAD_INIT(tset.src_csets),	\
-	.dst_csets		= LIST_HEAD_INIT(tset.dst_csets),	\
-	.csets			= &tset.src_csets,			\
-}
-
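
The CGROUP_TASKSET_INIT() initializer above is superseded by
DEFINE_CGROUP_MGCTX(), whose definition lives in an internal header outside
this excerpt. Reconstructed from the usages below (preloaded_src_csets,
preloaded_dst_csets, tset, ss_mask), the migration context roughly bundles:

	/* Sketch inferred from usage; not the verbatim definition. */
	struct cgroup_mgctx {
		/* csets pinned by cgroup_migrate_add_src()/_prepare_dst() */
		struct list_head	preloaded_src_csets;
		struct list_head	preloaded_dst_csets;

		/* tasks and csets actually being migrated */
		struct cgroup_taskset	tset;

		/* subsystems affected by the migration */
		u16			ss_mask;
	};
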
 /**
- * cgroup_taskset_add - try to add a migration target task to a taskset
+ * cgroup_migrate_add_task - add a migration target task to a migration context
  * @task: target task
- * @tset: target taskset
+ * @mgctx: target migration context
  *
- * Add @task, which is a migration target, to @tset.  This function becomes
- * noop if @task doesn't need to be migrated.  @task's css_set should have
- * been added as a migration source and @task->cg_list will be moved from
- * the css_set's tasks list to mg_tasks one.
+ * Add @task, which is a migration target, to @mgctx->tset.  This function
+ * becomes a noop if @task doesn't need to be migrated.  @task's css_set
+ * should have been added as a migration source and @task->cg_list will be
+ * moved from the css_set's tasks list to the mg_tasks one.
  */
-static void cgroup_taskset_add(struct task_struct *task,
-			       struct cgroup_taskset *tset)
+static void cgroup_migrate_add_task(struct task_struct *task,
+				    struct cgroup_mgctx *mgctx)
 {
 	struct css_set *cset;
 
@@ -2451,10 +1944,11 @@ static void cgroup_taskset_add(struct task_struct *task,
 
 	list_move_tail(&task->cg_list, &cset->mg_tasks);
 	if (list_empty(&cset->mg_node))
-		list_add_tail(&cset->mg_node, &tset->src_csets);
+		list_add_tail(&cset->mg_node,
+			      &mgctx->tset.src_csets);
 	if (list_empty(&cset->mg_dst_cset->mg_node))
-		list_move_tail(&cset->mg_dst_cset->mg_node,
-			       &tset->dst_csets);
+		list_add_tail(&cset->mg_dst_cset->mg_node,
+			      &mgctx->tset.dst_csets);
 }
 
 /**
@@ -2521,17 +2015,16 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
 
 /**
  * cgroup_taskset_migrate - migrate a taskset
- * @tset: target taskset
- * @root: cgroup root the migration is taking place on
+ * @mgctx: migration context
  *
- * Migrate tasks in @tset as setup by migration preparation functions.
+ * Migrate tasks in @mgctx as setup by migration preparation functions.
  * This function fails iff one of the ->can_attach callbacks fails and
- * guarantees that either all or none of the tasks in @tset are migrated.
- * @tset is consumed regardless of success.
+ * guarantees that either all or none of the tasks in @mgctx are migrated.
+ * @mgctx is consumed regardless of success.
  */
-static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
-				  struct cgroup_root *root)
+static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 {
+	struct cgroup_taskset *tset = &mgctx->tset;
 	struct cgroup_subsys *ss;
 	struct task_struct *task, *tmp_task;
 	struct css_set *cset, *tmp_cset;
@@ -2542,7 +2035,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 		return 0;
 
 	/* check that we can legitimately attach to the cgroup */
-	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 		if (ss->can_attach) {
 			tset->ssid = ssid;
 			ret = ss->can_attach(tset);
@@ -2578,7 +2071,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 	 */
 	tset->csets = &tset->dst_csets;
 
-	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 		if (ss->attach) {
 			tset->ssid = ssid;
 			ss->attach(tset);
@@ -2589,7 +2082,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 	goto out_release_tset;
 
 out_cancel_attach:
-	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 		if (ssid == failed_ssid)
 			break;
 		if (ss->cancel_attach) {
@@ -2616,7 +2109,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
  * zero for migration destination cgroups with tasks so that child cgroups
  * don't compete against tasks.
  */
-static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
 {
 	return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
 		!dst_cgrp->subtree_control;
@@ -2624,25 +2117,31 @@ static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
 
 /**
  * cgroup_migrate_finish - cleanup after attach
- * @preloaded_csets: list of preloaded css_sets
+ * @mgctx: migration context
  *
  * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
  * those functions for details.
  */
-static void cgroup_migrate_finish(struct list_head *preloaded_csets)
+void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
 {
+	LIST_HEAD(preloaded);
 	struct css_set *cset, *tmp_cset;
 
 	lockdep_assert_held(&cgroup_mutex);
 
 	spin_lock_irq(&css_set_lock);
-	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
+
+	list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
+	list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
+
+	list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
 		cset->mg_src_cgrp = NULL;
 		cset->mg_dst_cgrp = NULL;
 		cset->mg_dst_cset = NULL;
 		list_del_init(&cset->mg_preload_node);
 		put_css_set_locked(cset);
 	}
+
 	spin_unlock_irq(&css_set_lock);
 }
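
The splice-then-walk shape above is the usual way to drain several lists in
one pass; a standalone sketch with invented names:

	struct example_item {
		struct list_head node;
	};

	static void example_drain(struct list_head *a, struct list_head *b)
	{
		LIST_HEAD(all);
		struct example_item *it, *tmp;

		/* both source lists are left empty and immediately reusable */
		list_splice_tail_init(a, &all);
		list_splice_tail_init(b, &all);

		list_for_each_entry_safe(it, tmp, &all, node) {
			list_del_init(&it->node);
			kfree(it);	/* stand-in for the real release */
		}
	}
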
 
@@ -2650,10 +2149,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * cgroup_migrate_add_src - add a migration source css_set
  * @src_cset: the source css_set to add
  * @dst_cgrp: the destination cgroup
- * @preloaded_csets: list of preloaded css_sets
+ * @mgctx: migration context
  *
  * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
- * @src_cset and add it to @preloaded_csets, which should later be cleaned
+ * @src_cset and add it to @mgctx->preloaded_src_csets, which should later be cleaned
  * up by cgroup_migrate_finish().
  *
  * This function may be called without holding cgroup_threadgroup_rwsem
@@ -2662,9 +2161,9 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * into play and the preloaded css_sets are guaranteed to cover all
  * migrations.
  */
-static void cgroup_migrate_add_src(struct css_set *src_cset,
-				   struct cgroup *dst_cgrp,
-				   struct list_head *preloaded_csets)
+void cgroup_migrate_add_src(struct css_set *src_cset,
+			    struct cgroup *dst_cgrp,
+			    struct cgroup_mgctx *mgctx)
 {
 	struct cgroup *src_cgrp;
 
@@ -2692,33 +2191,35 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
 	src_cset->mg_src_cgrp = src_cgrp;
 	src_cset->mg_dst_cgrp = dst_cgrp;
 	get_css_set(src_cset);
-	list_add(&src_cset->mg_preload_node, preloaded_csets);
+	list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
 }
 
 /**
  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- * @preloaded_csets: list of preloaded source css_sets
+ * @mgctx: migration context
  *
  * Tasks are about to be moved and all the source css_sets have been
- * preloaded to @preloaded_csets.  This function looks up and pins all
- * destination css_sets, links each to its source, and append them to
- * @preloaded_csets.
+ * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
+ * pins all destination css_sets, links each to its source, and appends them
+ * to @mgctx->preloaded_dst_csets.
  *
  * This function must be called after cgroup_migrate_add_src() has been
  * called on each migration source css_set.  After migration is performed
  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
- * @preloaded_csets.
+ * @mgctx.
  */
-static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
+int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
 {
-	LIST_HEAD(csets);
 	struct css_set *src_cset, *tmp_cset;
 
 	lockdep_assert_held(&cgroup_mutex);
 
 	/* look up the dst cset for each src cset and link it to src */
-	list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
+	list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
+				 mg_preload_node) {
 		struct css_set *dst_cset;
+		struct cgroup_subsys *ss;
+		int ssid;
 
 		dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
 		if (!dst_cset)
@@ -2743,15 +2244,19 @@ static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
 		src_cset->mg_dst_cset = dst_cset;
 
 		if (list_empty(&dst_cset->mg_preload_node))
-			list_add(&dst_cset->mg_preload_node, &csets);
+			list_add_tail(&dst_cset->mg_preload_node,
+				      &mgctx->preloaded_dst_csets);
 		else
 			put_css_set(dst_cset);
+
+		for_each_subsys(ss, ssid)
+			if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
+				mgctx->ss_mask |= 1 << ssid;
 	}
 
-	list_splice_tail(&csets, preloaded_csets);
 	return 0;
 err:
-	cgroup_migrate_finish(&csets);
+	cgroup_migrate_finish(mgctx);
 	return -ENOMEM;
 }
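
A worked example of the ss_mask accumulation above: if, for some
source/destination cset pair, only the css at ssid 3 differs, mgctx->ss_mask
gains just bit 3, so cgroup_migrate_execute() later invokes
->can_attach()/->attach() for that one subsystem instead of for every
subsystem enabled on the root.
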
 
@@ -2759,7 +2264,7 @@ static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
  * cgroup_migrate - migrate a process or task to a cgroup
  * @leader: the leader of the process or the task to migrate
  * @threadgroup: whether @leader points to the whole process or a single task
- * @root: cgroup root migration is taking place on
+ * @mgctx: migration context
  *
  * Migrate a process or task denoted by @leader.  If migrating a process,
  * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
@@ -2773,10 +2278,9 @@ static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
  * decided for all targets by invoking cgroup_migrate_prepare_dst() before
  * actually starting migrating.
  */
-static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
-			  struct cgroup_root *root)
+int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+		   struct cgroup_mgctx *mgctx)
 {
-	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
 	struct task_struct *task;
 
 	/*
@@ -2788,14 +2292,14 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 	rcu_read_lock();
 	task = leader;
 	do {
-		cgroup_taskset_add(task, &tset);
+		cgroup_migrate_add_task(task, mgctx);
 		if (!threadgroup)
 			break;
 	} while_each_thread(leader, task);
 	rcu_read_unlock();
 	spin_unlock_irq(&css_set_lock);
 
-	return cgroup_taskset_migrate(&tset, root);
+	return cgroup_migrate_execute(mgctx);
 }
 
 /**
@@ -2806,10 +2310,10 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
  *
  * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
  */
-static int cgroup_attach_task(struct cgroup *dst_cgrp,
-			      struct task_struct *leader, bool threadgroup)
+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+		       bool threadgroup)
 {
-	LIST_HEAD(preloaded_csets);
+	DEFINE_CGROUP_MGCTX(mgctx);
 	struct task_struct *task;
 	int ret;
 
@@ -2821,8 +2325,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 	rcu_read_lock();
 	task = leader;
 	do {
-		cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
-				       &preloaded_csets);
+		cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
 		if (!threadgroup)
 			break;
 	} while_each_thread(leader, task);
@@ -2830,11 +2333,11 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 	spin_unlock_irq(&css_set_lock);
 
 	/* prepare dst csets and commit */
-	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
+	ret = cgroup_migrate_prepare_dst(&mgctx);
 	if (!ret)
-		ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
+		ret = cgroup_migrate(leader, threadgroup, &mgctx);
 
-	cgroup_migrate_finish(&preloaded_csets);
+	cgroup_migrate_finish(&mgctx);
 
 	if (!ret)
 		trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
@@ -2846,20 +2349,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 					 struct cgroup *dst_cgrp,
 					 struct kernfs_open_file *of)
 {
-	const struct cred *cred = current_cred();
-	const struct cred *tcred = get_task_cred(task);
 	int ret = 0;
 
-	/*
-	 * even if we're attaching all tasks in the thread group, we only
-	 * need to check permissions on one of them.
-	 */
-	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
-	    !uid_eq(cred->euid, tcred->uid) &&
-	    !uid_eq(cred->euid, tcred->suid))
-		ret = -EACCES;
-
-	if (!ret && cgroup_on_dfl(dst_cgrp)) {
+	if (cgroup_on_dfl(dst_cgrp)) {
 		struct super_block *sb = of->file->f_path.dentry->d_sb;
 		struct cgroup *cgrp;
 		struct inode *inode;
@@ -2877,9 +2369,21 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 			ret = inode_permission(inode, MAY_WRITE);
 			iput(inode);
 		}
+	} else {
+		const struct cred *cred = current_cred();
+		const struct cred *tcred = get_task_cred(task);
+
+		/*
+		 * even if we're attaching all tasks in the thread group,
+		 * we only need to check permissions on one of them.
+		 */
+		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+		    !uid_eq(cred->euid, tcred->uid) &&
+		    !uid_eq(cred->euid, tcred->suid))
+			ret = -EACCES;
+		put_cred(tcred);
 	}
 
-	put_cred(tcred);
 	return ret;
 }
 
@@ -2888,8 +2392,8 @@ static int cgroup_procs_write_permission(struct task_struct *task,
  * function to attach either it or all tasks in its threadgroup. Will lock
  * cgroup_mutex and threadgroup.
  */
-static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
-				    size_t nbytes, loff_t off, bool threadgroup)
+ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+			     size_t nbytes, loff_t off, bool threadgroup)
 {
 	struct task_struct *tsk;
 	struct cgroup_subsys *ss;
@@ -2950,86 +2454,12 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 	return ret ?: nbytes;
 }
 
-/**
- * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
- * @from: attach to all cgroups of a given task
- * @tsk: the task to be attached
- */
-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
-{
-	struct cgroup_root *root;
-	int retval = 0;
-
-	mutex_lock(&cgroup_mutex);
-	percpu_down_write(&cgroup_threadgroup_rwsem);
-	for_each_root(root) {
-		struct cgroup *from_cgrp;
-
-		if (root == &cgrp_dfl_root)
-			continue;
-
-		spin_lock_irq(&css_set_lock);
-		from_cgrp = task_cgroup_from_root(from, root);
-		spin_unlock_irq(&css_set_lock);
-
-		retval = cgroup_attach_task(from_cgrp, tsk, false);
-		if (retval)
-			break;
-	}
-	percpu_up_write(&cgroup_threadgroup_rwsem);
-	mutex_unlock(&cgroup_mutex);
-
-	return retval;
-}
-EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-
-static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
-				  char *buf, size_t nbytes, loff_t off)
-{
-	return __cgroup_procs_write(of, buf, nbytes, off, false);
-}
-
-static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
-				  char *buf, size_t nbytes, loff_t off)
+ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+			   loff_t off)
 {
 	return __cgroup_procs_write(of, buf, nbytes, off, true);
 }
 
-static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
-					  char *buf, size_t nbytes, loff_t off)
-{
-	struct cgroup *cgrp;
-
-	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
-
-	cgrp = cgroup_kn_lock_live(of->kn, false);
-	if (!cgrp)
-		return -ENODEV;
-	spin_lock(&release_agent_path_lock);
-	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
-		sizeof(cgrp->root->release_agent_path));
-	spin_unlock(&release_agent_path_lock);
-	cgroup_kn_unlock(of->kn);
-	return nbytes;
-}
-
-static int cgroup_release_agent_show(struct seq_file *seq, void *v)
-{
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
-
-	spin_lock(&release_agent_path_lock);
-	seq_puts(seq, cgrp->root->release_agent_path);
-	spin_unlock(&release_agent_path_lock);
-	seq_putc(seq, '\n');
-	return 0;
-}
-
-static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
-{
-	seq_puts(seq, "0\n");
-	return 0;
-}
-
 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
 {
 	struct cgroup_subsys *ss;
@@ -3075,8 +2505,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
  */
 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 {
-	LIST_HEAD(preloaded_csets);
-	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
+	DEFINE_CGROUP_MGCTX(mgctx);
 	struct cgroup_subsys_state *d_css;
 	struct cgroup *dsct;
 	struct css_set *src_cset;
@@ -3092,33 +2521,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 		struct cgrp_cset_link *link;
 
 		list_for_each_entry(link, &dsct->cset_links, cset_link)
-			cgroup_migrate_add_src(link->cset, dsct,
-					       &preloaded_csets);
+			cgroup_migrate_add_src(link->cset, dsct, &mgctx);
 	}
 	spin_unlock_irq(&css_set_lock);
 
 	/* NULL dst indicates self on default hierarchy */
-	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
+	ret = cgroup_migrate_prepare_dst(&mgctx);
 	if (ret)
 		goto out_finish;
 
 	spin_lock_irq(&css_set_lock);
-	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
+	list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
 		struct task_struct *task, *ntask;
 
-		/* src_csets precede dst_csets, break on the first dst_cset */
-		if (!src_cset->mg_src_cgrp)
-			break;
-
 		/* all tasks in src_csets need to be migrated */
 		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
-			cgroup_taskset_add(task, &tset);
+			cgroup_migrate_add_task(task, &mgctx);
 	}
 	spin_unlock_irq(&css_set_lock);
 
-	ret = cgroup_taskset_migrate(&tset, cgrp->root);
+	ret = cgroup_migrate_execute(&mgctx);
 out_finish:
-	cgroup_migrate_finish(&preloaded_csets);
+	cgroup_migrate_finish(&mgctx);
 	percpu_up_write(&cgroup_threadgroup_rwsem);
 	return ret;
 }
@@ -3131,7 +2555,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
  * controller while the previous css is still around.  This function grabs
  * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
  */
-static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
+void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
 	__acquires(&cgroup_mutex)
 {
 	struct cgroup *dsct;
@@ -3503,6 +2927,23 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
+static int cgroup_file_open(struct kernfs_open_file *of)
+{
+	struct cftype *cft = of->kn->priv;
+
+	if (cft->open)
+		return cft->open(of);
+	return 0;
+}
+
+static void cgroup_file_release(struct kernfs_open_file *of)
+{
+	struct cftype *cft = of->kn->priv;
+
+	if (cft->release)
+		cft->release(of);
+}
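
These shims give each cftype optional per-open state via of->priv; a hedged
consumer sketch (all my_* names are invented):

	struct my_iter {
		loff_t pos;
	};

	static int my_open(struct kernfs_open_file *of)
	{
		of->priv = kzalloc(sizeof(struct my_iter), GFP_KERNEL);
		return of->priv ? 0 : -ENOMEM;
	}

	static void my_release(struct kernfs_open_file *of)
	{
		kfree(of->priv);
	}

	static struct cftype my_files[] = {
		{
			.name		= "example",
			.open		= my_open,
			.release	= my_release,
		},
		{ }	/* terminator */
	};
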
+
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
 				 size_t nbytes, loff_t off)
 {
@@ -3553,7 +2994,8 @@ static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
 
 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
 {
-	seq_cft(seq)->seq_stop(seq, v);
+	if (seq_cft(seq)->seq_stop)
+		seq_cft(seq)->seq_stop(seq, v);
 }
 
 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -3575,12 +3017,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
 
 static struct kernfs_ops cgroup_kf_single_ops = {
 	.atomic_write_len	= PAGE_SIZE,
+	.open			= cgroup_file_open,
+	.release		= cgroup_file_release,
 	.write			= cgroup_file_write,
 	.seq_show		= cgroup_seqfile_show,
 };
 
 static struct kernfs_ops cgroup_kf_ops = {
 	.atomic_write_len	= PAGE_SIZE,
+	.open			= cgroup_file_open,
+	.release		= cgroup_file_release,
 	.write			= cgroup_file_write,
 	.seq_start		= cgroup_seqfile_start,
 	.seq_next		= cgroup_seqfile_next,
@@ -3588,48 +3034,6 @@ static struct kernfs_ops cgroup_kf_ops = {
 	.seq_show		= cgroup_seqfile_show,
 };
 
-/*
- * cgroup_rename - Only allow simple rename of directories in place.
- */
-static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
-			 const char *new_name_str)
-{
-	struct cgroup *cgrp = kn->priv;
-	int ret;
-
-	if (kernfs_type(kn) != KERNFS_DIR)
-		return -ENOTDIR;
-	if (kn->parent != new_parent)
-		return -EIO;
-
-	/*
-	 * This isn't a proper migration and its usefulness is very
-	 * limited.  Disallow on the default hierarchy.
-	 */
-	if (cgroup_on_dfl(cgrp))
-		return -EPERM;
-
-	/*
-	 * We're gonna grab cgroup_mutex which nests outside kernfs
-	 * active_ref.  kernfs_rename() doesn't require active_ref
-	 * protection.  Break them before grabbing cgroup_mutex.
-	 */
-	kernfs_break_active_protection(new_parent);
-	kernfs_break_active_protection(kn);
-
-	mutex_lock(&cgroup_mutex);
-
-	ret = kernfs_rename(kn, new_parent, new_name_str);
-	if (!ret)
-		trace_cgroup_rename(cgrp);
-
-	mutex_unlock(&cgroup_mutex);
-
-	kernfs_unbreak_active_protection(kn);
-	kernfs_unbreak_active_protection(new_parent);
-	return ret;
-}
-
 /* set uid and gid of cgroup dirs and files to that of the creator */
 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
 {
@@ -3926,26 +3330,6 @@ void cgroup_file_notify(struct cgroup_file *cfile)
 }
 
 /**
- * cgroup_task_count - count the number of tasks in a cgroup.
- * @cgrp: the cgroup in question
- *
- * Return the number of tasks in the cgroup.  The returned number can be
- * higher than the actual number of tasks due to css_set references from
- * namespace roots and temporary usages.
- */
-static int cgroup_task_count(const struct cgroup *cgrp)
-{
-	int count = 0;
-	struct cgrp_cset_link *link;
-
-	spin_lock_irq(&css_set_lock);
-	list_for_each_entry(link, &cgrp->cset_links, cset_link)
-		count += atomic_read(&link->cset->refcount);
-	spin_unlock_irq(&css_set_lock);
-	return count;
-}
-
-/**
  * css_next_child - find the next child of a given css
  * @pos: the current position (%NULL to initiate traversal)
  * @parent: css whose children to walk
@@ -4343,560 +3727,69 @@ void css_task_iter_end(struct css_task_iter *it)
 		put_task_struct(it->cur_task);
 }
 
-/**
- * cgroup_transfer_tasks - move tasks from one cgroup to another
- * @to: cgroup to which the tasks will be moved
- * @from: cgroup in which the tasks currently reside
- *
- * Locking rules between cgroup_post_fork() and the migration path
- * guarantee that, if a task is forking while being migrated, the new child
- * is guaranteed to be either visible in the source cgroup after the
- * parent's migration is complete or put into the target cgroup.  No task
- * can slip out of migration through forking.
- */
-int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+static void cgroup_procs_release(struct kernfs_open_file *of)
 {
-	LIST_HEAD(preloaded_csets);
-	struct cgrp_cset_link *link;
-	struct css_task_iter it;
+	if (of->priv) {
+		css_task_iter_end(of->priv);
+		kfree(of->priv);
+	}
+}
+
+static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct kernfs_open_file *of = s->private;
+	struct css_task_iter *it = of->priv;
 	struct task_struct *task;
-	int ret;
 
-	if (!cgroup_may_migrate_to(to))
-		return -EBUSY;
-
-	mutex_lock(&cgroup_mutex);
-
-	percpu_down_write(&cgroup_threadgroup_rwsem);
-
-	/* all tasks in @from are being moved, all csets are source */
-	spin_lock_irq(&css_set_lock);
-	list_for_each_entry(link, &from->cset_links, cset_link)
-		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
-	spin_unlock_irq(&css_set_lock);
-
-	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
-	if (ret)
-		goto out_err;
-
-	/*
-	 * Migrate tasks one-by-one until @from is empty.  This fails iff
-	 * ->can_attach() fails.
-	 */
 	do {
-		css_task_iter_start(&from->self, &it);
-		task = css_task_iter_next(&it);
-		if (task)
-			get_task_struct(task);
-		css_task_iter_end(&it);
+		task = css_task_iter_next(it);
+	} while (task && !thread_group_leader(task));
 
-		if (task) {
-			ret = cgroup_migrate(task, false, to->root);
-			if (!ret)
-				trace_cgroup_transfer_tasks(to, task, false);
-			put_task_struct(task);
-		}
-	} while (task && !ret);
-out_err:
-	cgroup_migrate_finish(&preloaded_csets);
-	percpu_up_write(&cgroup_threadgroup_rwsem);
-	mutex_unlock(&cgroup_mutex);
-	return ret;
+	return task;
 }
 
-/*
- * Stuff for reading the 'tasks'/'procs' files.
- *
- * Reading this file can return large amounts of data if a cgroup has
- * *lots* of attached tasks. So it may need several calls to read(),
- * but we cannot guarantee that the information we produce is correct
- * unless we produce it entirely atomically.
- *
- */
-
-/* which pidlist file are we talking about? */
-enum cgroup_filetype {
-	CGROUP_FILE_PROCS,
-	CGROUP_FILE_TASKS,
-};
-
-/*
- * A pidlist is a list of pids that virtually represents the contents of one
- * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
- * a pair (one each for procs, tasks) for each pid namespace that's relevant
- * to the cgroup.
- */
-struct cgroup_pidlist {
-	/*
-	 * used to find which pidlist is wanted. doesn't change as long as
-	 * this particular list stays in the list.
-	*/
-	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
-	/* array of xids */
-	pid_t *list;
-	/* how many elements the above list has */
-	int length;
-	/* each of these stored in a list by its cgroup */
-	struct list_head links;
-	/* pointer to the cgroup we belong to, for list removal purposes */
-	struct cgroup *owner;
-	/* for delayed destruction */
-	struct delayed_work destroy_dwork;
-};
-
-/*
- * The following two functions "fix" the issue where there are more pids
- * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
- * TODO: replace with a kernel-wide solution to this problem
- */
-#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
-static void *pidlist_allocate(int count)
+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
 {
-	if (PIDLIST_TOO_LARGE(count))
-		return vmalloc(count * sizeof(pid_t));
-	else
-		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
-}
-
-static void pidlist_free(void *p)
-{
-	kvfree(p);
-}
-
-/*
- * Used to destroy all pidlists lingering waiting for destroy timer.  None
- * should be left afterwards.
- */
-static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
-{
-	struct cgroup_pidlist *l, *tmp_l;
-
-	mutex_lock(&cgrp->pidlist_mutex);
-	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
-		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
-	mutex_unlock(&cgrp->pidlist_mutex);
-
-	flush_workqueue(cgroup_pidlist_destroy_wq);
-	BUG_ON(!list_empty(&cgrp->pidlists));
-}
-
-static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
-{
-	struct delayed_work *dwork = to_delayed_work(work);
-	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
-						destroy_dwork);
-	struct cgroup_pidlist *tofree = NULL;
-
-	mutex_lock(&l->owner->pidlist_mutex);
-
-	/*
-	 * Destroy iff we didn't get queued again.  The state won't change
-	 * as destroy_dwork can only be queued while locked.
-	 */
-	if (!delayed_work_pending(dwork)) {
-		list_del(&l->links);
-		pidlist_free(l->list);
-		put_pid_ns(l->key.ns);
-		tofree = l;
-	}
-
-	mutex_unlock(&l->owner->pidlist_mutex);
-	kfree(tofree);
-}
-
-/*
- * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
- * Returns the number of unique elements.
- */
-static int pidlist_uniq(pid_t *list, int length)
-{
-	int src, dest = 1;
-
-	/*
-	 * we presume the 0th element is unique, so src starts at 1. trivial
-	 * edge cases first; no work needs to be done for either
-	 */
-	if (length == 0 || length == 1)
-		return length;
-	/* src and dest walk down the list; dest counts unique elements */
-	for (src = 1; src < length; src++) {
-		/* find next unique element */
-		while (list[src] == list[src-1]) {
-			src++;
-			if (src == length)
-				goto after;
-		}
-		/* dest always points to where the next unique element goes */
-		list[dest] = list[src];
-		dest++;
-	}
-after:
-	return dest;
-}
-
-/*
- * The two pid files - tasks and cgroup.procs - guaranteed that the result
- * is sorted, which forced this whole pidlist fiasco.  As pid order is
- * different per namespace, each namespace needs a differently sorted list,
- * making it impossible to use, for example, a single rbtree of member tasks
- * sorted by task pointer.  As pidlists can be fairly large, allocating one
- * per open file is dangerous, so cgroup had to implement shared pool of
- * pidlists keyed by cgroup and namespace.
- *
- * All this extra complexity was caused by the original implementation
- * committing to an entirely unnecessary property.  In the long term, we
- * want to do away with it.  Explicitly scramble sort order if on the
- * default hierarchy so that no such expectation exists in the new
- * interface.
- *
- * Scrambling is done by swapping every two consecutive bits, which is
- * a non-identity one-to-one mapping which disturbs sort order sufficiently.
- */
-static pid_t pid_fry(pid_t pid)
-{
-	unsigned a = pid & 0x55555555;
-	unsigned b = pid & 0xAAAAAAAA;
-
-	return (a << 1) | (b >> 1);
-}
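
A worked example of the scramble (pid_fry() is its own inverse, so the
mapping stays one-to-one):

	pid 6 = 0b0110:
	  a = 0b0110 & 0x55555555 = 0b0100   (bits at positions 0,2,4,...)
	  b = 0b0110 & 0xAAAAAAAA = 0b0010   (bits at positions 1,3,5,...)
	  (a << 1) | (b >> 1)     = 0b1001 = 9
	and pid_fry(9) yields 6 again.
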
-
-static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
-{
-	if (cgroup_on_dfl(cgrp))
-		return pid_fry(pid);
-	else
-		return pid;
-}
-
-static int cmppid(const void *a, const void *b)
-{
-	return *(pid_t *)a - *(pid_t *)b;
-}
-
-static int fried_cmppid(const void *a, const void *b)
-{
-	return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
-}
-
-static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
-						  enum cgroup_filetype type)
-{
-	struct cgroup_pidlist *l;
-	/* don't need task_nsproxy() if we're looking at ourself */
-	struct pid_namespace *ns = task_active_pid_ns(current);
-
-	lockdep_assert_held(&cgrp->pidlist_mutex);
-
-	list_for_each_entry(l, &cgrp->pidlists, links)
-		if (l->key.type == type && l->key.ns == ns)
-			return l;
-	return NULL;
-}
-
-/*
- * Find the appropriate pidlist for our purpose (given procs vs tasks).
- * Returns with the lock on that pidlist already held, and takes care
- * of the use count, or returns NULL with no locks held if we're out of
- * memory.
- */
-static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
-						enum cgroup_filetype type)
-{
-	struct cgroup_pidlist *l;
-
-	lockdep_assert_held(&cgrp->pidlist_mutex);
-
-	l = cgroup_pidlist_find(cgrp, type);
-	if (l)
-		return l;
-
-	/* entry not found; create a new one */
-	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
-	if (!l)
-		return l;
-
-	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
-	l->key.type = type;
-	/* don't need task_nsproxy() if we're looking at ourself */
-	l->key.ns = get_pid_ns(task_active_pid_ns(current));
-	l->owner = cgrp;
-	list_add(&l->links, &cgrp->pidlists);
-	return l;
-}
-
-/*
- * Load a cgroup's pidarray with either procs' tgids or tasks' pids
- */
-static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
-			      struct cgroup_pidlist **lp)
-{
-	pid_t *array;
-	int length;
-	int pid, n = 0; /* used for populating the array */
-	struct css_task_iter it;
-	struct task_struct *tsk;
-	struct cgroup_pidlist *l;
-
-	lockdep_assert_held(&cgrp->pidlist_mutex);
-
-	/*
-	 * If cgroup gets more users after we read count, we won't have
-	 * enough space - tough.  This race is indistinguishable to the
-	 * caller from the case that the additional cgroup users didn't
-	 * show up until sometime later on.
-	 */
-	length = cgroup_task_count(cgrp);
-	array = pidlist_allocate(length);
-	if (!array)
-		return -ENOMEM;
-	/* now, populate the array */
-	css_task_iter_start(&cgrp->self, &it);
-	while ((tsk = css_task_iter_next(&it))) {
-		if (unlikely(n == length))
-			break;
-		/* get tgid or pid for procs or tasks file respectively */
-		if (type == CGROUP_FILE_PROCS)
-			pid = task_tgid_vnr(tsk);
-		else
-			pid = task_pid_vnr(tsk);
-		if (pid > 0) /* make sure to only use valid results */
-			array[n++] = pid;
-	}
-	css_task_iter_end(&it);
-	length = n;
-	/* now sort & (if procs) strip out duplicates */
-	if (cgroup_on_dfl(cgrp))
-		sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
-	else
-		sort(array, length, sizeof(pid_t), cmppid, NULL);
-	if (type == CGROUP_FILE_PROCS)
-		length = pidlist_uniq(array, length);
-
-	l = cgroup_pidlist_find_create(cgrp, type);
-	if (!l) {
-		pidlist_free(array);
-		return -ENOMEM;
-	}
-
-	/* store array, freeing old if necessary */
-	pidlist_free(l->list);
-	l->list = array;
-	l->length = length;
-	*lp = l;
-	return 0;
-}
-
-/**
- * cgroupstats_build - build and fill cgroupstats
- * @stats: cgroupstats to fill information into
- * @dentry: A dentry entry belonging to the cgroup for which stats have
- * been requested.
- *
- * Build and fill cgroupstats so that taskstats can export it to user
- * space.
- */
-int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
-{
-	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
-	struct cgroup *cgrp;
-	struct css_task_iter it;
-	struct task_struct *tsk;
-
-	/* it should be kernfs_node belonging to cgroupfs and is a directory */
-	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
-	    kernfs_type(kn) != KERNFS_DIR)
-		return -EINVAL;
-
-	mutex_lock(&cgroup_mutex);
-
-	/*
-	 * We aren't being called from kernfs and there's no guarantee on
-	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
-	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
-	 */
-	rcu_read_lock();
-	cgrp = rcu_dereference(kn->priv);
-	if (!cgrp || cgroup_is_dead(cgrp)) {
-		rcu_read_unlock();
-		mutex_unlock(&cgroup_mutex);
-		return -ENOENT;
-	}
-	rcu_read_unlock();
-
-	css_task_iter_start(&cgrp->self, &it);
-	while ((tsk = css_task_iter_next(&it))) {
-		switch (tsk->state) {
-		case TASK_RUNNING:
-			stats->nr_running++;
-			break;
-		case TASK_INTERRUPTIBLE:
-			stats->nr_sleeping++;
-			break;
-		case TASK_UNINTERRUPTIBLE:
-			stats->nr_uninterruptible++;
-			break;
-		case TASK_STOPPED:
-			stats->nr_stopped++;
-			break;
-		default:
-			if (delayacct_is_task_waiting_on_io(tsk))
-				stats->nr_io_wait++;
-			break;
-		}
-	}
-	css_task_iter_end(&it);
-
-	mutex_unlock(&cgroup_mutex);
-	return 0;
-}
-
-
-/*
- * seq_file methods for the tasks/procs files. The seq_file position is the
- * next pid to display; the seq_file iterator is a pointer to the pid
- * in the cgroup->l->list array.
- */
-
-static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
-{
-	/*
-	 * Initially we receive a position value that corresponds to
-	 * one more than the last pid shown (or 0 on the first call or
-	 * after a seek to the start). Use a binary-search to find the
-	 * next pid to display, if any
-	 */
 	struct kernfs_open_file *of = s->private;
 	struct cgroup *cgrp = seq_css(s)->cgroup;
-	struct cgroup_pidlist *l;
-	enum cgroup_filetype type = seq_cft(s)->private;
-	int index = 0, pid = *pos;
-	int *iter, ret;
-
-	mutex_lock(&cgrp->pidlist_mutex);
+	struct css_task_iter *it = of->priv;
 
 	/*
-	 * !NULL @of->priv indicates that this isn't the first start()
-	 * after open.  If the matching pidlist is around, we can use that.
-	 * Look for it.  Note that @of->priv can't be used directly.  It
-	 * could already have been destroyed.
+	 * When a seq_file is seeked, it's always traversed sequentially
+	 * from position 0, so we can simply keep iterating on !0 *pos.
 	 */
-	if (of->priv)
-		of->priv = cgroup_pidlist_find(cgrp, type);
+	if (!it) {
+		if (WARN_ON_ONCE((*pos)++))
+			return ERR_PTR(-EINVAL);
 
-	/*
-	 * Either this is the first start() after open or the matching
-	 * pidlist has been destroyed inbetween.  Create a new one.
-	 */
-	if (!of->priv) {
-		ret = pidlist_array_load(cgrp, type,
-					 (struct cgroup_pidlist **)&of->priv);
-		if (ret)
-			return ERR_PTR(ret);
+		it = kzalloc(sizeof(*it), GFP_KERNEL);
+		if (!it)
+			return ERR_PTR(-ENOMEM);
+		of->priv = it;
+		css_task_iter_start(&cgrp->self, it);
+	} else if (!(*pos)++) {
+		css_task_iter_end(it);
+		css_task_iter_start(&cgrp->self, it);
 	}
-	l = of->priv;
 
-	if (pid) {
-		int end = l->length;
-
-		while (index < end) {
-			int mid = (index + end) / 2;
-			if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
-				index = mid;
-				break;
-			} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
-				index = mid + 1;
-			else
-				end = mid;
-		}
-	}
-	/* If we're off the end of the array, we're done */
-	if (index >= l->length)
-		return NULL;
-	/* Update the abstract position to be the actual pid that we found */
-	iter = l->list + index;
-	*pos = cgroup_pid_fry(cgrp, *iter);
-	return iter;
+	return cgroup_procs_next(s, NULL, NULL);
 }
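/*
 * A minimal user-space sketch of the seq_file contract the new
 * cgroup_procs_start() exploits: after a seek, traversal always
 * restarts at *pos == 0, so a cached iterator is rewound on a zero
 * position and simply continued otherwise.  All names below are
 * illustrative; the array cursor stands in for the css_task_iter.
 */
#include <stdio.h>

static int data[] = { 11, 22, 33 };
static int cursor;			/* stands in for css_task_iter */

static int *start(long *pos)
{
	if (!(*pos)++)			/* first call or a rewind */
		cursor = 0;
	return cursor < 3 ? &data[cursor++] : NULL;
}

static int *next(long *pos)
{
	(*pos)++;
	return cursor < 3 ? &data[cursor++] : NULL;
}

int main(void)
{
	long pos = 0;
	int *v;

	for (v = start(&pos); v; v = next(&pos))
		printf("%d\n", *v);	/* prints 11 22 33 */

	pos = 0;			/* a seek back to the beginning */
	for (v = start(&pos); v; v = next(&pos))
		printf("%d\n", *v);	/* rewinds and prints again */
	return 0;
}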
 
-static void cgroup_pidlist_stop(struct seq_file *s, void *v)
+static int cgroup_procs_show(struct seq_file *s, void *v)
 {
-	struct kernfs_open_file *of = s->private;
-	struct cgroup_pidlist *l = of->priv;
-
-	if (l)
-		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
-				 CGROUP_PIDLIST_DESTROY_DELAY);
-	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
-}
-
-static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
-{
-	struct kernfs_open_file *of = s->private;
-	struct cgroup_pidlist *l = of->priv;
-	pid_t *p = v;
-	pid_t *end = l->list + l->length;
-	/*
-	 * Advance to the next pid in the array. If this goes off the
-	 * end, we're done
-	 */
-	p++;
-	if (p >= end) {
-		return NULL;
-	} else {
-		*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
-		return p;
-	}
-}
-
-static int cgroup_pidlist_show(struct seq_file *s, void *v)
-{
-	seq_printf(s, "%d\n", *(int *)v);
-
-	return 0;
-}
-
-static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
-					 struct cftype *cft)
-{
-	return notify_on_release(css->cgroup);
-}
-
-static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
-					  struct cftype *cft, u64 val)
-{
-	if (val)
-		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
-	else
-		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
-	return 0;
-}
-
-static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
-				      struct cftype *cft)
-{
-	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
-}
-
-static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
-				       struct cftype *cft, u64 val)
-{
-	if (val)
-		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
-	else
-		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+	seq_printf(s, "%d\n", task_tgid_vnr(v));
 	return 0;
 }
 
 /* cgroup core interface files for the default hierarchy */
-static struct cftype cgroup_dfl_base_files[] = {
+static struct cftype cgroup_base_files[] = {
 	{
 		.name = "cgroup.procs",
 		.file_offset = offsetof(struct cgroup, procs_file),
-		.seq_start = cgroup_pidlist_start,
-		.seq_next = cgroup_pidlist_next,
-		.seq_stop = cgroup_pidlist_stop,
-		.seq_show = cgroup_pidlist_show,
-		.private = CGROUP_FILE_PROCS,
+		.release = cgroup_procs_release,
+		.seq_start = cgroup_procs_start,
+		.seq_next = cgroup_procs_next,
+		.seq_show = cgroup_procs_show,
 		.write = cgroup_procs_write,
 	},
 	{
@@ -4917,51 +3810,6 @@ static struct cftype cgroup_dfl_base_files[] = {
 	{ }	/* terminate */
 };
 
-/* cgroup core interface files for the legacy hierarchies */
-static struct cftype cgroup_legacy_base_files[] = {
-	{
-		.name = "cgroup.procs",
-		.seq_start = cgroup_pidlist_start,
-		.seq_next = cgroup_pidlist_next,
-		.seq_stop = cgroup_pidlist_stop,
-		.seq_show = cgroup_pidlist_show,
-		.private = CGROUP_FILE_PROCS,
-		.write = cgroup_procs_write,
-	},
-	{
-		.name = "cgroup.clone_children",
-		.read_u64 = cgroup_clone_children_read,
-		.write_u64 = cgroup_clone_children_write,
-	},
-	{
-		.name = "cgroup.sane_behavior",
-		.flags = CFTYPE_ONLY_ON_ROOT,
-		.seq_show = cgroup_sane_behavior_show,
-	},
-	{
-		.name = "tasks",
-		.seq_start = cgroup_pidlist_start,
-		.seq_next = cgroup_pidlist_next,
-		.seq_stop = cgroup_pidlist_stop,
-		.seq_show = cgroup_pidlist_show,
-		.private = CGROUP_FILE_TASKS,
-		.write = cgroup_tasks_write,
-	},
-	{
-		.name = "notify_on_release",
-		.read_u64 = cgroup_read_notify_on_release,
-		.write_u64 = cgroup_write_notify_on_release,
-	},
-	{
-		.name = "release_agent",
-		.flags = CFTYPE_ONLY_ON_ROOT,
-		.seq_show = cgroup_release_agent_show,
-		.write = cgroup_release_agent_write,
-		.max_write_len = PATH_MAX - 1,
-	},
-	{ }	/* terminate */
-};
-
 /*
  * css destruction is four-stage process.
  *
@@ -5007,7 +3855,7 @@ static void css_free_work_fn(struct work_struct *work)
 	} else {
 		/* cgroup free path */
 		atomic_dec(&cgrp->root->nr_cgrps);
-		cgroup_pidlist_destroy_all(cgrp);
+		cgroup1_pidlist_destroy_all(cgrp);
 		cancel_work_sync(&cgrp->release_agent_work);
 
 		if (cgroup_parent(cgrp)) {
@@ -5302,8 +4150,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	return ERR_PTR(ret);
 }
 
-static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
-			umode_t mode)
+int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
 {
 	struct cgroup *parent, *cgrp;
 	struct kernfs_node *kn;
@@ -5507,7 +4354,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	 */
 	kernfs_remove(cgrp->kn);
 
-	check_for_release(cgroup_parent(cgrp));
+	cgroup1_check_for_release(cgroup_parent(cgrp));
 
 	/* put the base reference */
 	percpu_ref_kill(&cgrp->self.refcnt);
@@ -5515,7 +4362,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	return 0;
 };
 
-static int cgroup_rmdir(struct kernfs_node *kn)
+int cgroup_rmdir(struct kernfs_node *kn)
 {
 	struct cgroup *cgrp;
 	int ret = 0;
@@ -5535,10 +4382,8 @@ static int cgroup_rmdir(struct kernfs_node *kn)
 
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
 	.remount_fs		= cgroup_remount,
-	.show_options		= cgroup_show_options,
 	.mkdir			= cgroup_mkdir,
 	.rmdir			= cgroup_rmdir,
-	.rename			= cgroup_rename,
 	.show_path		= cgroup_show_path,
 };
 
@@ -5646,8 +4491,8 @@ int __init cgroup_init(void)
 
 	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
 	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
-	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
-	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
+	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
 
 	/*
 	 * The latency of the synchronize_sched() is too high for cgroups,
@@ -5697,7 +4542,7 @@ int __init cgroup_init(void)
 			continue;
 		}
 
-		if (cgroup_ssid_no_v1(ssid))
+		if (cgroup1_ssid_disabled(ssid))
 			printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
 			       ss->name);
 
@@ -5744,15 +4589,6 @@ static int __init cgroup_wq_init(void)
 	 */
 	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
 	BUG_ON(!cgroup_destroy_wq);
-
-	/*
-	 * Used to destroy pidlists and separate to serve as flush domain.
-	 * Cap @max_active to 1 too.
-	 */
-	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
-						    0, 1);
-	BUG_ON(!cgroup_pidlist_destroy_wq);
-
 	return 0;
 }
 core_initcall(cgroup_wq_init);
@@ -5835,42 +4671,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 	return retval;
 }
 
-/* Display information about each subsystem and each hierarchy */
-static int proc_cgroupstats_show(struct seq_file *m, void *v)
-{
-	struct cgroup_subsys *ss;
-	int i;
-
-	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
-	/*
-	 * ideally we don't want subsystems moving around while we do this.
-	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
-	 * subsys/hierarchy state.
-	 */
-	mutex_lock(&cgroup_mutex);
-
-	for_each_subsys(ss, i)
-		seq_printf(m, "%s\t%d\t%d\t%d\n",
-			   ss->legacy_name, ss->root->hierarchy_id,
-			   atomic_read(&ss->root->nr_cgrps),
-			   cgroup_ssid_enabled(i));
-
-	mutex_unlock(&cgroup_mutex);
-	return 0;
-}
-
-static int cgroupstats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_cgroupstats_show, NULL);
-}
-
-static const struct file_operations proc_cgroupstats_operations = {
-	.open = cgroupstats_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
 /**
  * cgroup_fork - initialize cgroup related fields during copy_process()
  * @child: pointer to task_struct of forking parent process.
@@ -6050,76 +4850,6 @@ void cgroup_free(struct task_struct *task)
 	put_css_set(cset);
 }
 
-static void check_for_release(struct cgroup *cgrp)
-{
-	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
-	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
-		schedule_work(&cgrp->release_agent_work);
-}
-
-/*
- * Notify userspace when a cgroup is released, by running the
- * configured release agent with the name of the cgroup (path
- * relative to the root of cgroup file system) as the argument.
- *
- * Most likely, this user command will try to rmdir this cgroup.
- *
- * This races with the possibility that some other task will be
- * attached to this cgroup before it is removed, or that some other
- * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
- * The presumed 'rmdir' will fail quietly if this cgroup is no longer
- * unused, and this cgroup will be reprieved from its death sentence,
- * to continue to serve a useful existence.  Next time it's released,
- * we will get notified again, if it still has 'notify_on_release' set.
- *
- * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
- * means only wait until the task is successfully execve()'d.  The
- * separate release agent task is forked by call_usermodehelper(),
- * then control in this thread returns here, without waiting for the
- * release agent task.  We don't bother to wait because the caller of
- * this routine has no use for the exit status of the release agent
- * task, so no sense holding our caller up for that.
- */
-static void cgroup_release_agent(struct work_struct *work)
-{
-	struct cgroup *cgrp =
-		container_of(work, struct cgroup, release_agent_work);
-	char *pathbuf = NULL, *agentbuf = NULL;
-	char *argv[3], *envp[3];
-	int ret;
-
-	mutex_lock(&cgroup_mutex);
-
-	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
-	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
-	if (!pathbuf || !agentbuf)
-		goto out;
-
-	spin_lock_irq(&css_set_lock);
-	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
-	spin_unlock_irq(&css_set_lock);
-	if (ret < 0 || ret >= PATH_MAX)
-		goto out;
-
-	argv[0] = agentbuf;
-	argv[1] = pathbuf;
-	argv[2] = NULL;
-
-	/* minimal command environment */
-	envp[0] = "HOME=/";
-	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
-	envp[2] = NULL;
-
-	mutex_unlock(&cgroup_mutex);
-	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
-	goto out_free;
-out:
-	mutex_unlock(&cgroup_mutex);
-out_free:
-	kfree(agentbuf);
-	kfree(pathbuf);
-}
-
 static int __init cgroup_disable(char *str)
 {
 	struct cgroup_subsys *ss;
@@ -6141,33 +4871,6 @@ static int __init cgroup_disable(char *str)
 }
 __setup("cgroup_disable=", cgroup_disable);
 
-static int __init cgroup_no_v1(char *str)
-{
-	struct cgroup_subsys *ss;
-	char *token;
-	int i;
-
-	while ((token = strsep(&str, ",")) != NULL) {
-		if (!*token)
-			continue;
-
-		if (!strcmp(token, "all")) {
-			cgroup_no_v1_mask = U16_MAX;
-			break;
-		}
-
-		for_each_subsys(ss, i) {
-			if (strcmp(token, ss->name) &&
-			    strcmp(token, ss->legacy_name))
-				continue;
-
-			cgroup_no_v1_mask |= 1 << i;
-		}
-	}
-	return 1;
-}
-__setup("cgroup_no_v1=", cgroup_no_v1);
-
 /**
  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
  * @dentry: directory dentry of interest
@@ -6197,7 +4900,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
 	 * have been or be removed at any point.  @kn->priv is RCU
 	 * protected for this access.  See css_release_work_fn() for details.
 	 */
-	cgrp = rcu_dereference(kn->priv);
+	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
 	if (cgrp)
 		css = cgroup_css(cgrp, ss);
 
@@ -6349,154 +5052,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
 
 #endif	/* CONFIG_SOCK_CGROUP_DATA */
 
-/* cgroup namespaces */
-
-static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
-{
-	return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
-}
-
-static void dec_cgroup_namespaces(struct ucounts *ucounts)
-{
-	dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
-}
-
-static struct cgroup_namespace *alloc_cgroup_ns(void)
-{
-	struct cgroup_namespace *new_ns;
-	int ret;
-
-	new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
-	if (!new_ns)
-		return ERR_PTR(-ENOMEM);
-	ret = ns_alloc_inum(&new_ns->ns);
-	if (ret) {
-		kfree(new_ns);
-		return ERR_PTR(ret);
-	}
-	atomic_set(&new_ns->count, 1);
-	new_ns->ns.ops = &cgroupns_operations;
-	return new_ns;
-}
-
-void free_cgroup_ns(struct cgroup_namespace *ns)
-{
-	put_css_set(ns->root_cset);
-	dec_cgroup_namespaces(ns->ucounts);
-	put_user_ns(ns->user_ns);
-	ns_free_inum(&ns->ns);
-	kfree(ns);
-}
-EXPORT_SYMBOL(free_cgroup_ns);
-
-struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
-					struct user_namespace *user_ns,
-					struct cgroup_namespace *old_ns)
-{
-	struct cgroup_namespace *new_ns;
-	struct ucounts *ucounts;
-	struct css_set *cset;
-
-	BUG_ON(!old_ns);
-
-	if (!(flags & CLONE_NEWCGROUP)) {
-		get_cgroup_ns(old_ns);
-		return old_ns;
-	}
-
-	/* Allow only sysadmin to create cgroup namespace. */
-	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
-		return ERR_PTR(-EPERM);
-
-	ucounts = inc_cgroup_namespaces(user_ns);
-	if (!ucounts)
-		return ERR_PTR(-ENOSPC);
-
-	/* It is not safe to take cgroup_mutex here */
-	spin_lock_irq(&css_set_lock);
-	cset = task_css_set(current);
-	get_css_set(cset);
-	spin_unlock_irq(&css_set_lock);
-
-	new_ns = alloc_cgroup_ns();
-	if (IS_ERR(new_ns)) {
-		put_css_set(cset);
-		dec_cgroup_namespaces(ucounts);
-		return new_ns;
-	}
-
-	new_ns->user_ns = get_user_ns(user_ns);
-	new_ns->ucounts = ucounts;
-	new_ns->root_cset = cset;
-
-	return new_ns;
-}
-
-static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
-{
-	return container_of(ns, struct cgroup_namespace, ns);
-}
-
-static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
-{
-	struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
-
-	if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
-	    !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
-		return -EPERM;
-
-	/* Don't need to do anything if we are attaching to our own cgroupns. */
-	if (cgroup_ns == nsproxy->cgroup_ns)
-		return 0;
-
-	get_cgroup_ns(cgroup_ns);
-	put_cgroup_ns(nsproxy->cgroup_ns);
-	nsproxy->cgroup_ns = cgroup_ns;
-
-	return 0;
-}
-
-static struct ns_common *cgroupns_get(struct task_struct *task)
-{
-	struct cgroup_namespace *ns = NULL;
-	struct nsproxy *nsproxy;
-
-	task_lock(task);
-	nsproxy = task->nsproxy;
-	if (nsproxy) {
-		ns = nsproxy->cgroup_ns;
-		get_cgroup_ns(ns);
-	}
-	task_unlock(task);
-
-	return ns ? &ns->ns : NULL;
-}
-
-static void cgroupns_put(struct ns_common *ns)
-{
-	put_cgroup_ns(to_cg_ns(ns));
-}
-
-static struct user_namespace *cgroupns_owner(struct ns_common *ns)
-{
-	return to_cg_ns(ns)->user_ns;
-}
-
-const struct proc_ns_operations cgroupns_operations = {
-	.name		= "cgroup",
-	.type		= CLONE_NEWCGROUP,
-	.get		= cgroupns_get,
-	.put		= cgroupns_put,
-	.install	= cgroupns_install,
-	.owner		= cgroupns_owner,
-};
-
-static __init int cgroup_namespaces_init(void)
-{
-	return 0;
-}
-subsys_initcall(cgroup_namespaces_init);
-
 #ifdef CONFIG_CGROUP_BPF
 int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
 		      enum bpf_attach_type type, bool overridable)
@@ -6510,149 +5065,3 @@ int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
 	return ret;
 }
 #endif /* CONFIG_CGROUP_BPF */
-
-#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *
-debug_css_alloc(struct cgroup_subsys_state *parent_css)
-{
-	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
-	if (!css)
-		return ERR_PTR(-ENOMEM);
-
-	return css;
-}
-
-static void debug_css_free(struct cgroup_subsys_state *css)
-{
-	kfree(css);
-}
-
-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
-				struct cftype *cft)
-{
-	return cgroup_task_count(css->cgroup);
-}
-
-static u64 current_css_set_read(struct cgroup_subsys_state *css,
-				struct cftype *cft)
-{
-	return (u64)(unsigned long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
-					 struct cftype *cft)
-{
-	u64 count;
-
-	rcu_read_lock();
-	count = atomic_read(&task_css_set(current)->refcount);
-	rcu_read_unlock();
-	return count;
-}
-
-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
-{
-	struct cgrp_cset_link *link;
-	struct css_set *cset;
-	char *name_buf;
-
-	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
-	if (!name_buf)
-		return -ENOMEM;
-
-	spin_lock_irq(&css_set_lock);
-	rcu_read_lock();
-	cset = rcu_dereference(current->cgroups);
-	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
-		struct cgroup *c = link->cgrp;
-
-		cgroup_name(c, name_buf, NAME_MAX + 1);
-		seq_printf(seq, "Root %d group %s\n",
-			   c->root->hierarchy_id, name_buf);
-	}
-	rcu_read_unlock();
-	spin_unlock_irq(&css_set_lock);
-	kfree(name_buf);
-	return 0;
-}
-
-#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct seq_file *seq, void *v)
-{
-	struct cgroup_subsys_state *css = seq_css(seq);
-	struct cgrp_cset_link *link;
-
-	spin_lock_irq(&css_set_lock);
-	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
-		struct css_set *cset = link->cset;
-		struct task_struct *task;
-		int count = 0;
-
-		seq_printf(seq, "css_set %p\n", cset);
-
-		list_for_each_entry(task, &cset->tasks, cg_list) {
-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-				goto overflow;
-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
-		}
-
-		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-				goto overflow;
-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
-		}
-		continue;
-	overflow:
-		seq_puts(seq, "  ...\n");
-	}
-	spin_unlock_irq(&css_set_lock);
-	return 0;
-}
-
-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
-	return (!cgroup_is_populated(css->cgroup) &&
-		!css_has_online_children(&css->cgroup->self));
-}
-
-static struct cftype debug_files[] =  {
-	{
-		.name = "taskcount",
-		.read_u64 = debug_taskcount_read,
-	},
-
-	{
-		.name = "current_css_set",
-		.read_u64 = current_css_set_read,
-	},
-
-	{
-		.name = "current_css_set_refcount",
-		.read_u64 = current_css_set_refcount_read,
-	},
-
-	{
-		.name = "current_css_set_cg_links",
-		.seq_show = current_css_set_cg_links_read,
-	},
-
-	{
-		.name = "cgroup_css_links",
-		.seq_show = cgroup_css_links_read,
-	},
-
-	{
-		.name = "releasable",
-		.read_u64 = releasable_read,
-	},
-
-	{ }	/* terminate */
-};
-
-struct cgroup_subsys debug_cgrp_subsys = {
-	.css_alloc = debug_css_alloc,
-	.css_free = debug_css_free,
-	.legacy_cftypes = debug_files,
-};
-#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cpuset.c b/kernel/cgroup/cpuset.c
similarity index 100%
rename from kernel/cpuset.c
rename to kernel/cgroup/cpuset.c
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup/freezer.c
similarity index 100%
rename from kernel/cgroup_freezer.c
rename to kernel/cgroup/freezer.c
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
new file mode 100644
index 0000000..cff7ea6
--- /dev/null
+++ b/kernel/cgroup/namespace.c
@@ -0,0 +1,155 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
+#include <linux/proc_ns.h>
+
+
+/* cgroup namespaces */
+
+static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
+{
+	return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
+}
+
+static void dec_cgroup_namespaces(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
+}
+
+static struct cgroup_namespace *alloc_cgroup_ns(void)
+{
+	struct cgroup_namespace *new_ns;
+	int ret;
+
+	new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+	if (!new_ns)
+		return ERR_PTR(-ENOMEM);
+	ret = ns_alloc_inum(&new_ns->ns);
+	if (ret) {
+		kfree(new_ns);
+		return ERR_PTR(ret);
+	}
+	atomic_set(&new_ns->count, 1);
+	new_ns->ns.ops = &cgroupns_operations;
+	return new_ns;
+}
+
+void free_cgroup_ns(struct cgroup_namespace *ns)
+{
+	put_css_set(ns->root_cset);
+	dec_cgroup_namespaces(ns->ucounts);
+	put_user_ns(ns->user_ns);
+	ns_free_inum(&ns->ns);
+	kfree(ns);
+}
+EXPORT_SYMBOL(free_cgroup_ns);
+
+struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+					struct user_namespace *user_ns,
+					struct cgroup_namespace *old_ns)
+{
+	struct cgroup_namespace *new_ns;
+	struct ucounts *ucounts;
+	struct css_set *cset;
+
+	BUG_ON(!old_ns);
+
+	if (!(flags & CLONE_NEWCGROUP)) {
+		get_cgroup_ns(old_ns);
+		return old_ns;
+	}
+
+	/* Allow only sysadmin to create cgroup namespace. */
+	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	ucounts = inc_cgroup_namespaces(user_ns);
+	if (!ucounts)
+		return ERR_PTR(-ENOSPC);
+
+	/* It is not safe to take cgroup_mutex here */
+	spin_lock_irq(&css_set_lock);
+	cset = task_css_set(current);
+	get_css_set(cset);
+	spin_unlock_irq(&css_set_lock);
+
+	new_ns = alloc_cgroup_ns();
+	if (IS_ERR(new_ns)) {
+		put_css_set(cset);
+		dec_cgroup_namespaces(ucounts);
+		return new_ns;
+	}
+
+	new_ns->user_ns = get_user_ns(user_ns);
+	new_ns->ucounts = ucounts;
+	new_ns->root_cset = cset;
+
+	return new_ns;
+}
+
+static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
+{
+	return container_of(ns, struct cgroup_namespace, ns);
+}
+
+static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+	struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
+
+	if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+	    !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* Don't need to do anything if we are attaching to our own cgroupns. */
+	if (cgroup_ns == nsproxy->cgroup_ns)
+		return 0;
+
+	get_cgroup_ns(cgroup_ns);
+	put_cgroup_ns(nsproxy->cgroup_ns);
+	nsproxy->cgroup_ns = cgroup_ns;
+
+	return 0;
+}
+
+static struct ns_common *cgroupns_get(struct task_struct *task)
+{
+	struct cgroup_namespace *ns = NULL;
+	struct nsproxy *nsproxy;
+
+	task_lock(task);
+	nsproxy = task->nsproxy;
+	if (nsproxy) {
+		ns = nsproxy->cgroup_ns;
+		get_cgroup_ns(ns);
+	}
+	task_unlock(task);
+
+	return ns ? &ns->ns : NULL;
+}
+
+static void cgroupns_put(struct ns_common *ns)
+{
+	put_cgroup_ns(to_cg_ns(ns));
+}
+
+static struct user_namespace *cgroupns_owner(struct ns_common *ns)
+{
+	return to_cg_ns(ns)->user_ns;
+}
+
+const struct proc_ns_operations cgroupns_operations = {
+	.name		= "cgroup",
+	.type		= CLONE_NEWCGROUP,
+	.get		= cgroupns_get,
+	.put		= cgroupns_put,
+	.install	= cgroupns_install,
+	.owner		= cgroupns_owner,
+};
+
+static __init int cgroup_namespaces_init(void)
+{
+	return 0;
+}
+subsys_initcall(cgroup_namespaces_init);
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup/pids.c
similarity index 100%
rename from kernel/cgroup_pids.c
rename to kernel/cgroup/pids.c
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
new file mode 100644
index 0000000..defad3c
--- /dev/null
+++ b/kernel/cgroup/rdma.c
@@ -0,0 +1,619 @@
+/*
+ * RDMA resource limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop processes from consuming
+ * additional RDMA resources after a certain limit is reached.
+ *
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/cgroup.h>
+#include <linux/parser.h>
+#include <linux/cgroup_rdma.h>
+
+#define RDMACG_MAX_STR "max"
+
+/*
+ * Protects list of resource pools maintained on per cgroup basis
+ * and rdma device list.
+ */
+static DEFINE_MUTEX(rdmacg_mutex);
+static LIST_HEAD(rdmacg_devices);
+
+enum rdmacg_file_type {
+	RDMACG_RESOURCE_TYPE_MAX,
+	RDMACG_RESOURCE_TYPE_STAT,
+};
+
+/*
+ * Resource table definition as seen by the user.  Entries need to be
+ * added here when more resources are defined at the IB verb/core layer.
+ */
+static char const *rdmacg_resource_names[] = {
+	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
+	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
+};
+
+/* resource tracker for each resource of rdma cgroup */
+struct rdmacg_resource {
+	int max;
+	int usage;
+};
+
+/*
+ * Resource pool object which represents per cgroup, per device
+ * resources. There are multiple instances of this object per cgroup,
+ * so it cannot be embedded within the rdma_cgroup structure; it is
+ * maintained as a list.
+ */
+struct rdmacg_resource_pool {
+	struct rdmacg_device	*device;
+	struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];
+
+	struct list_head	cg_node;
+	struct list_head	dev_node;
+
+	/* count active user tasks of this pool */
+	u64			usage_sum;
+	/* number of resources whose limit is set to max */
+	int			num_max_cnt;
+};
+
+static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
+{
+	return container_of(css, struct rdma_cgroup, css);
+}
+
+static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
+{
+	return css_rdmacg(cg->css.parent);
+}
+
+static inline struct rdma_cgroup *get_current_rdmacg(void)
+{
+	return css_rdmacg(task_get_css(current, rdma_cgrp_id));
+}
+
+static void set_resource_limit(struct rdmacg_resource_pool *rpool,
+			       int index, int new_max)
+{
+	if (new_max == S32_MAX) {
+		if (rpool->resources[index].max != S32_MAX)
+			rpool->num_max_cnt++;
+	} else {
+		if (rpool->resources[index].max == S32_MAX)
+			rpool->num_max_cnt--;
+	}
+	rpool->resources[index].max = new_max;
+}
+
+static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
+{
+	int i;
+
+	for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
+		set_resource_limit(rpool, i, S32_MAX);
+}
+
+static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
+{
+	lockdep_assert_held(&rdmacg_mutex);
+
+	list_del(&rpool->cg_node);
+	list_del(&rpool->dev_node);
+	kfree(rpool);
+}
+
+static struct rdmacg_resource_pool *
+find_cg_rpool_locked(struct rdma_cgroup *cg,
+		     struct rdmacg_device *device)
+
+{
+	struct rdmacg_resource_pool *pool;
+
+	lockdep_assert_held(&rdmacg_mutex);
+
+	list_for_each_entry(pool, &cg->rpools, cg_node)
+		if (pool->device == device)
+			return pool;
+
+	return NULL;
+}
+
+static struct rdmacg_resource_pool *
+get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
+{
+	struct rdmacg_resource_pool *rpool;
+
+	rpool = find_cg_rpool_locked(cg, device);
+	if (rpool)
+		return rpool;
+
+	rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
+	if (!rpool)
+		return ERR_PTR(-ENOMEM);
+
+	rpool->device = device;
+	set_all_resource_max_limit(rpool);
+
+	INIT_LIST_HEAD(&rpool->cg_node);
+	INIT_LIST_HEAD(&rpool->dev_node);
+	list_add_tail(&rpool->cg_node, &cg->rpools);
+	list_add_tail(&rpool->dev_node, &device->rpools);
+	return rpool;
+}
+
+/**
+ * uncharge_cg_locked - uncharge resource for rdma cgroup
+ * @cg: pointer to the cgroup to uncharge
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cg (resource pool)
+ *
+ * It also frees the resource pool, which was created as part of the
+ * charging operation, once no resources remain attached to it.
+ */
+static void
+uncharge_cg_locked(struct rdma_cgroup *cg,
+		   struct rdmacg_device *device,
+		   enum rdmacg_resource_type index)
+{
+	struct rdmacg_resource_pool *rpool;
+
+	rpool = find_cg_rpool_locked(cg, device);
+
+	/*
+	 * rpool cannot be NULL at this stage. Let the kernel keep operating
+	 * in case there is a bug in the IB stack or the rdma controller,
+	 * instead of crashing the system.
+	 */
+	if (unlikely(!rpool)) {
+		pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
+		return;
+	}
+
+	rpool->resources[index].usage--;
+
+	/*
+	 * A negative count (or overflow) is invalid,
+	 * it indicates a bug in the rdma controller.
+	 */
+	WARN_ON_ONCE(rpool->resources[index].usage < 0);
+	rpool->usage_sum--;
+	if (rpool->usage_sum == 0 &&
+	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+		/*
+		 * No user of the rpool and all entries are set to max, so
+		 * safe to delete this rpool.
+		 */
+		free_cg_rpool_locked(rpool);
+	}
+}
+
+/**
+ * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
+ * @cg: rdma cgroup at which to start uncharging
+ * @device: pointer to rdmacg device
+ * @stop_cg: cgroup at which to stop uncharging while traversing the
+ *           hierarchy (itself left uncharged)
+ * @index: index of the resource to uncharge in cg in the given resource pool
+ */
+static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
+				     struct rdmacg_device *device,
+				     struct rdma_cgroup *stop_cg,
+				     enum rdmacg_resource_type index)
+{
+	struct rdma_cgroup *p;
+
+	mutex_lock(&rdmacg_mutex);
+
+	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
+		uncharge_cg_locked(p, device, index);
+
+	mutex_unlock(&rdmacg_mutex);
+
+	css_put(&cg->css);
+}
+
+/**
+ * rdmacg_uncharge - hierarchically uncharge rdma resource count
+ * @cg: rdma cgroup at which to start uncharging
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in the cgroup's resource pool
+ */
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+		     struct rdmacg_device *device,
+		     enum rdmacg_resource_type index)
+{
+	if (index >= RDMACG_RESOURCE_MAX)
+		return;
+
+	rdmacg_uncharge_hierarchy(cg, device, NULL, index);
+}
+EXPORT_SYMBOL(rdmacg_uncharge);
+
+/**
+ * rdmacg_try_charge - hierarchically try to charge the rdma resource
+ * @rdmacg: pointer to rdma cgroup which will own this resource
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to charge in cgroup (resource pool)
+ *
+ * This function charges the resource hierarchically: it fails if the
+ * charge would cause the new value to exceed the hierarchical limit at
+ * any level.
+ * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
+ * On success, *rdmacg is set to the cgroup that owns the charge.
+ *
+ * The charger accounts resources on two criteria: (a) per cgroup and
+ * (b) per device resource usage.  Per cgroup accounting ensures that
+ * the tasks of a cgroup do not cross the configured limits; per device
+ * accounting provides granular configuration when multiple devices are
+ * in use.  A resource pool is allocated in the hierarchy for each
+ * parent encountered on the first charge; subsequent charges and
+ * uncharges find the pool already in place and are therefore much
+ * faster.
+ */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+		      struct rdmacg_device *device,
+		      enum rdmacg_resource_type index)
+{
+	struct rdma_cgroup *cg, *p;
+	struct rdmacg_resource_pool *rpool;
+	s64 new;
+	int ret = 0;
+
+	if (index >= RDMACG_RESOURCE_MAX)
+		return -EINVAL;
+
+	/*
+	 * hold on to css, as cgroup can be removed but resource
+	 * accounting happens on css.
+	 */
+	cg = get_current_rdmacg();
+
+	mutex_lock(&rdmacg_mutex);
+	for (p = cg; p; p = parent_rdmacg(p)) {
+		rpool = get_cg_rpool_locked(p, device);
+		if (IS_ERR(rpool)) {
+			ret = PTR_ERR(rpool);
+			goto err;
+		} else {
+			new = rpool->resources[index].usage + 1;
+			if (new > rpool->resources[index].max) {
+				ret = -EAGAIN;
+				goto err;
+			} else {
+				rpool->resources[index].usage = new;
+				rpool->usage_sum++;
+			}
+		}
+	}
+	mutex_unlock(&rdmacg_mutex);
+
+	*rdmacg = cg;
+	return 0;
+
+err:
+	mutex_unlock(&rdmacg_mutex);
+	rdmacg_uncharge_hierarchy(cg, device, p, index);
+	return ret;
+}
+EXPORT_SYMBOL(rdmacg_try_charge);
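/*
 * A user-space sketch (assumed names, not the kernel API) of the
 * charge-with-rollback pattern rdmacg_try_charge() uses: charge every
 * level from the leaf up, and on failure uncharge only the levels
 * already charged, stopping at the one that rejected the charge.
 */
#include <stdio.h>

#define LEVELS 3

static int usage[LEVELS];
static int max[LEVELS] = { 10, 5, 2 };	/* root .. leaf limits */

static int try_charge(void)
{
	int i, j;

	for (i = LEVELS - 1; i >= 0; i--) {	/* leaf up to root */
		if (usage[i] + 1 > max[i]) {
			for (j = LEVELS - 1; j > i; j--)
				usage[j]--;	/* roll back this charge */
			return -1;		/* -EAGAIN in the kernel */
		}
		usage[i]++;
	}
	return 0;
}

int main(void)
{
	int granted = 0;

	while (try_charge() == 0)
		granted++;
	printf("granted %d charges (leaf limit %d)\n",
	       granted, max[LEVELS - 1]);
	return 0;
}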
+
+/**
+ * rdmacg_register_device - register rdmacg device to rdma controller.
+ * @device: pointer to rdmacg device whose resources need to be accounted.
+ *
+ * If the IB stack wishes a device to participate in rdma cgroup resource
+ * tracking, it must invoke this API to register the device with the rdma
+ * cgroup before any user space application can start using its RDMA
+ * resources.
+ * Returns 0 on success.
+ */
+int rdmacg_register_device(struct rdmacg_device *device)
+{
+	INIT_LIST_HEAD(&device->dev_node);
+	INIT_LIST_HEAD(&device->rpools);
+
+	mutex_lock(&rdmacg_mutex);
+	list_add_tail(&device->dev_node, &rdmacg_devices);
+	mutex_unlock(&rdmacg_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(rdmacg_register_device);
+
+/**
+ * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
+ * @device: pointer to rdmacg device which was previously registered with rdma
+ *          controller using rdmacg_register_device().
+ *
+ * The IB stack must invoke this after all resources of the IB device
+ * have been destroyed, and after ensuring that no more resources will
+ * be created while this API runs.
+ */
+void rdmacg_unregister_device(struct rdmacg_device *device)
+{
+	struct rdmacg_resource_pool *rpool, *tmp;
+
+	/*
+	 * Synchronize with any active resource setting or usage query
+	 * happening via the cgroup filesystem.
+	 */
+	mutex_lock(&rdmacg_mutex);
+	list_del_init(&device->dev_node);
+
+	/*
+	 * Now that this device is off the cgroup list, it's safe to free
+	 * all the rpool resources.
+	 */
+	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
+		free_cg_rpool_locked(rpool);
+
+	mutex_unlock(&rdmacg_mutex);
+}
+EXPORT_SYMBOL(rdmacg_unregister_device);
+
+static int parse_resource(char *c, int *intval)
+{
+	substring_t argstr;
+	const char **table = &rdmacg_resource_names[0];
+	char *name, *value = c;
+	size_t len;
+	int ret, i = 0;
+
+	name = strsep(&value, "=");
+	if (!name || !value)
+		return -EINVAL;
+
+	len = strlen(value);
+
+	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+		if (strcmp(table[i], name))
+			continue;
+
+		argstr.from = value;
+		argstr.to = value + len;
+
+		ret = match_int(&argstr, intval);
+		if (ret >= 0) {
+			if (*intval < 0)
+				break;
+			return i;
+		}
+		if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
+			*intval = S32_MAX;
+			return i;
+		}
+		break;
+	}
+	return -EINVAL;
+}
+
+static int rdmacg_parse_limits(char *options,
+			       int *new_limits, unsigned long *enables)
+{
+	char *c;
+	int err = -EINVAL;
+
+	/* parse resource options */
+	while ((c = strsep(&options, " ")) != NULL) {
+		int index, intval;
+
+		index = parse_resource(c, &intval);
+		if (index < 0)
+			goto err;
+
+		new_limits[index] = intval;
+		*enables |= BIT(index);
+	}
+	return 0;
+
+err:
+	return err;
+}
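/*
 * A user-space sketch of the "name=value [name=value]..." syntax the
 * two parsers above accept, e.g. "hca_handle=2 hca_object=max".
 * match_int()/substring_t are kernel-only, so strtol() stands in; the
 * resource names are the two defined above, everything else here is
 * illustrative.
 */
#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const char * const names[] = { "hca_handle", "hca_object" };

int main(void)
{
	char buf[] = "hca_handle=2 hca_object=max";
	char *options = buf, *c;

	while ((c = strsep(&options, " ")) != NULL) {
		char *value = c;
		char *name = strsep(&value, "=");
		size_t i;

		if (!name || !value)
			return 1;
		for (i = 0; i < 2; i++) {
			if (strcmp(names[i], name))
				continue;
			if (!strcmp(value, "max"))
				printf("%s -> S32_MAX\n", name);
			else
				printf("%s -> %ld\n", name,
				       strtol(value, NULL, 10));
		}
	}
	return 0;
}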
+
+static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
+{
+	struct rdmacg_device *device;
+
+	lockdep_assert_held(&rdmacg_mutex);
+
+	list_for_each_entry(device, &rdmacg_devices, dev_node)
+		if (!strcmp(name, device->name))
+			return device;
+
+	return NULL;
+}
+
+static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
+				       char *buf, size_t nbytes, loff_t off)
+{
+	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
+	const char *dev_name;
+	struct rdmacg_resource_pool *rpool;
+	struct rdmacg_device *device;
+	char *options = strstrip(buf);
+	int *new_limits;
+	unsigned long enables = 0;
+	int i = 0, ret = 0;
+
+	/* extract the device name first */
+	dev_name = strsep(&options, " ");
+	if (!dev_name) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
+	if (!new_limits) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ret = rdmacg_parse_limits(options, new_limits, &enables);
+	if (ret)
+		goto parse_err;
+
+	/* acquire lock to synchronize with hot plug devices */
+	mutex_lock(&rdmacg_mutex);
+
+	device = rdmacg_get_device_locked(dev_name);
+	if (!device) {
+		ret = -ENODEV;
+		goto dev_err;
+	}
+
+	rpool = get_cg_rpool_locked(cg, device);
+	if (IS_ERR(rpool)) {
+		ret = PTR_ERR(rpool);
+		goto dev_err;
+	}
+
+	/* now set the new limits of the rpool */
+	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
+		set_resource_limit(rpool, i, new_limits[i]);
+
+	if (rpool->usage_sum == 0 &&
+	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+		/*
+		 * No user of the rpool and all entries are set to max, so
+		 * safe to delete this rpool.
+		 */
+		free_cg_rpool_locked(rpool);
+	}
+
+dev_err:
+	mutex_unlock(&rdmacg_mutex);
+
+parse_err:
+	kfree(new_limits);
+
+err:
+	return ret ?: nbytes;
+}
+
+static void print_rpool_values(struct seq_file *sf,
+			       struct rdmacg_resource_pool *rpool)
+{
+	enum rdmacg_file_type sf_type;
+	int i;
+	u32 value;
+
+	sf_type = seq_cft(sf)->private;
+
+	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+		seq_puts(sf, rdmacg_resource_names[i]);
+		seq_putc(sf, '=');
+		if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
+			if (rpool)
+				value = rpool->resources[i].max;
+			else
+				value = S32_MAX;
+		} else {
+			if (rpool)
+				value = rpool->resources[i].usage;
+			else
+				value = 0;
+		}
+
+		if (value == S32_MAX)
+			seq_puts(sf, RDMACG_MAX_STR);
+		else
+			seq_printf(sf, "%d", value);
+		seq_putc(sf, ' ');
+	}
+}
+
+static int rdmacg_resource_read(struct seq_file *sf, void *v)
+{
+	struct rdmacg_device *device;
+	struct rdmacg_resource_pool *rpool;
+	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
+
+	mutex_lock(&rdmacg_mutex);
+
+	list_for_each_entry(device, &rdmacg_devices, dev_node) {
+		seq_printf(sf, "%s ", device->name);
+
+		rpool = find_cg_rpool_locked(cg, device);
+		print_rpool_values(sf, rpool);
+
+		seq_putc(sf, '\n');
+	}
+
+	mutex_unlock(&rdmacg_mutex);
+	return 0;
+}
+
+static struct cftype rdmacg_files[] = {
+	{
+		.name = "max",
+		.write = rdmacg_resource_set_max,
+		.seq_show = rdmacg_resource_read,
+		.private = RDMACG_RESOURCE_TYPE_MAX,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "current",
+		.seq_show = rdmacg_resource_read,
+		.private = RDMACG_RESOURCE_TYPE_STAT,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{ }	/* terminate */
+};
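/*
 * For reference (a sketch, not captured output): with the cftypes
 * above, each cgroup gets rdma.max and rdma.current files whose
 * contents are rendered by print_rpool_values() earlier in this file,
 * one line per registered device, e.g. (device names illustrative):
 *
 *	mlx4_0 hca_handle=2 hca_object=2000
 *	ocrdma1 hca_handle=3 hca_object=max
 */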
+
+static struct cgroup_subsys_state *
+rdmacg_css_alloc(struct cgroup_subsys_state *parent)
+{
+	struct rdma_cgroup *cg;
+
+	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+	if (!cg)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&cg->rpools);
+	return &cg->css;
+}
+
+static void rdmacg_css_free(struct cgroup_subsys_state *css)
+{
+	struct rdma_cgroup *cg = css_rdmacg(css);
+
+	kfree(cg);
+}
+
+/**
+ * rdmacg_css_offline - cgroup css_offline callback
+ * @css: css of interest
+ *
+ * This function is called when @css is about to go away and is responsible
+ * for shooting down all rdmacg state associated with @css. As part of that
+ * it sets all resource pool limits back to max, so that once the remaining
+ * resources are uncharged, the associated resource pools can be freed as
+ * well.
+ */
+static void rdmacg_css_offline(struct cgroup_subsys_state *css)
+{
+	struct rdma_cgroup *cg = css_rdmacg(css);
+	struct rdmacg_resource_pool *rpool;
+
+	mutex_lock(&rdmacg_mutex);
+
+	list_for_each_entry(rpool, &cg->rpools, cg_node)
+		set_all_resource_max_limit(rpool);
+
+	mutex_unlock(&rdmacg_mutex);
+}
+
+struct cgroup_subsys rdma_cgrp_subsys = {
+	.css_alloc	= rdmacg_css_alloc,
+	.css_free	= rdmacg_css_free,
+	.css_offline	= rdmacg_css_offline,
+	.legacy_cftypes	= rdmacg_files,
+	.dfl_cftypes	= rdmacg_files,
+};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b2eb354..5b4e0b9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10959,5 +10959,11 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
 	.css_alloc	= perf_cgroup_css_alloc,
 	.css_free	= perf_cgroup_css_free,
 	.attach		= perf_cgroup_attach,
+	/*
+	 * Implicitly enable on dfl hierarchy so that perf events can
+	 * always be filtered by cgroup2 path as long as perf_event
+	 * controller is not mounted on a legacy hierarchy.
+	 */
+	.implicit_on_dfl = true,
 };
 #endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index a9b8cf5..6c9cb20 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -236,12 +236,28 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry
 
 static inline struct jump_entry *static_key_entries(struct static_key *key)
 {
-	return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK);
+	WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED);
+	return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK);
 }
 
 static inline bool static_key_type(struct static_key *key)
 {
-	return (unsigned long)key->entries & JUMP_TYPE_MASK;
+	return key->type & JUMP_TYPE_TRUE;
+}
+
+static inline bool static_key_linked(struct static_key *key)
+{
+	return key->type & JUMP_TYPE_LINKED;
+}
+
+static inline void static_key_clear_linked(struct static_key *key)
+{
+	key->type &= ~JUMP_TYPE_LINKED;
+}
+
+static inline void static_key_set_linked(struct static_key *key)
+{
+	key->type |= JUMP_TYPE_LINKED;
 }
 
 static inline struct static_key *jump_entry_key(struct jump_entry *entry)
@@ -254,6 +270,26 @@ static bool jump_entry_branch(struct jump_entry *entry)
 	return (unsigned long)entry->key & 1UL;
 }
 
+/***
+ * A 'struct static_key' uses a union such that it either points directly
+ * to a table of 'struct jump_entry' or to a linked list of modules which in
+ * turn point to 'struct jump_entry' tables.
+ *
+ * The two lower bits of the pointer are used to keep track of which pointer
+ * type is in use and to store the initial branch direction; the access
+ * functions below preserve these bits.
+ */
+static void static_key_set_entries(struct static_key *key,
+				   struct jump_entry *entries)
+{
+	unsigned long type;
+
+	WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK);
+	type = key->type & JUMP_TYPE_MASK;
+	key->entries = entries;
+	key->type |= type;
+}
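/*
 * A self-contained illustration (plain user-space C, assumed names) of
 * the low-bit tagging scheme described above: jump_entry tables are at
 * least 4-byte aligned, so the two low bits of the pointer word are
 * free to hold the JUMP_TYPE_* flags, and accessors mask them in and
 * out.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TYPE_MASK	3UL	/* analogous to JUMP_TYPE_MASK */
#define TYPE_TRUE	1UL	/* initial branch direction */
#define TYPE_LINKED	2UL	/* word points to a module list */

struct entry { int insn; };

static uintptr_t set_entries(uintptr_t word, const struct entry *e)
{
	assert(((uintptr_t)e & TYPE_MASK) == 0);	/* must be aligned */
	return (uintptr_t)e | (word & TYPE_MASK);	/* preserve flags */
}

static const struct entry *get_entries(uintptr_t word)
{
	return (const struct entry *)(word & ~TYPE_MASK);
}

int main(void)
{
	static const struct entry tbl[4];
	uintptr_t key = TYPE_TRUE;	/* flag bits only, no table yet */

	key = set_entries(key, tbl);
	printf("table=%p true=%d linked=%d\n",
	       (void *)get_entries(key),
	       (int)(key & TYPE_TRUE), (int)((key & TYPE_LINKED) != 0));
	return 0;
}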
+
 static enum jump_label_type jump_label_type(struct jump_entry *entry)
 {
 	struct static_key *key = jump_entry_key(entry);
@@ -313,13 +349,7 @@ void __init jump_label_init(void)
 			continue;
 
 		key = iterk;
-		/*
-		 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
-		 */
-		*((unsigned long *)&key->entries) += (unsigned long)iter;
-#ifdef CONFIG_MODULES
-		key->next = NULL;
-#endif
+		static_key_set_entries(key, iter);
 	}
 	static_key_initialized = true;
 	jump_label_unlock();
@@ -343,6 +373,29 @@ struct static_key_mod {
 	struct module *mod;
 };
 
+static inline struct static_key_mod *static_key_mod(struct static_key *key)
+{
+	WARN_ON_ONCE(!(key->type & JUMP_TYPE_LINKED));
+	return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK);
+}
+
+/***
+ * key->type and key->next are the same via union.
+ * This sets key->next and preserves the type bits.
+ *
+ * See additional comments above static_key_set_entries().
+ */
+static void static_key_set_mod(struct static_key *key,
+			       struct static_key_mod *mod)
+{
+	unsigned long type;
+
+	WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK);
+	type = key->type & JUMP_TYPE_MASK;
+	key->next = mod;
+	key->type |= type;
+}
+
 static int __jump_label_mod_text_reserved(void *start, void *end)
 {
 	struct module *mod;
@@ -365,11 +418,23 @@ static void __jump_label_mod_update(struct static_key *key)
 {
 	struct static_key_mod *mod;
 
-	for (mod = key->next; mod; mod = mod->next) {
-		struct module *m = mod->mod;
+	for (mod = static_key_mod(key); mod; mod = mod->next) {
+		struct jump_entry *stop;
+		struct module *m;
 
-		__jump_label_update(key, mod->entries,
-				    m->jump_entries + m->num_jump_entries);
+		/*
+		 * NULL if the static_key is defined in a module
+		 * that does not use it
+		 */
+		if (!mod->entries)
+			continue;
+
+		m = mod->mod;
+		if (!m)
+			stop = __stop___jump_table;
+		else
+			stop = m->jump_entries + m->num_jump_entries;
+		__jump_label_update(key, mod->entries, stop);
 	}
 }
 
@@ -404,7 +469,7 @@ static int jump_label_add_module(struct module *mod)
 	struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
 	struct jump_entry *iter;
 	struct static_key *key = NULL;
-	struct static_key_mod *jlm;
+	struct static_key_mod *jlm, *jlm2;
 
 	/* if the module doesn't have jump label entries, just return */
 	if (iter_start == iter_stop)
@@ -421,20 +486,32 @@ static int jump_label_add_module(struct module *mod)
 
 		key = iterk;
 		if (within_module(iter->key, mod)) {
-			/*
-			 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
-			 */
-			*((unsigned long *)&key->entries) += (unsigned long)iter;
-			key->next = NULL;
+			static_key_set_entries(key, iter);
 			continue;
 		}
 		jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
 		if (!jlm)
 			return -ENOMEM;
+		if (!static_key_linked(key)) {
+			jlm2 = kzalloc(sizeof(struct static_key_mod),
+				       GFP_KERNEL);
+			if (!jlm2) {
+				kfree(jlm);
+				return -ENOMEM;
+			}
+			preempt_disable();
+			jlm2->mod = __module_address((unsigned long)key);
+			preempt_enable();
+			jlm2->entries = static_key_entries(key);
+			jlm2->next = NULL;
+			static_key_set_mod(key, jlm2);
+			static_key_set_linked(key);
+		}
 		jlm->mod = mod;
 		jlm->entries = iter;
-		jlm->next = key->next;
-		key->next = jlm;
+		jlm->next = static_key_mod(key);
+		static_key_set_mod(key, jlm);
+		static_key_set_linked(key);
 
 		/* Only update if we've changed from our initial state */
 		if (jump_label_type(iter) != jump_label_init_type(iter))
@@ -461,16 +538,34 @@ static void jump_label_del_module(struct module *mod)
 		if (within_module(iter->key, mod))
 			continue;
 
+		/* No memory during module load */
+		if (WARN_ON(!static_key_linked(key)))
+			continue;
+
 		prev = &key->next;
-		jlm = key->next;
+		jlm = static_key_mod(key);
 
 		while (jlm && jlm->mod != mod) {
 			prev = &jlm->next;
 			jlm = jlm->next;
 		}
 
-		if (jlm) {
+		/* No memory during module load */
+		if (WARN_ON(!jlm))
+			continue;
+
+		if (prev == &key->next)
+			static_key_set_mod(key, jlm->next);
+		else
 			*prev = jlm->next;
+
+		kfree(jlm);
+
+		jlm = static_key_mod(key);
+		/* if only one entry is left, fold it back into the static_key */
+		if (jlm->next == NULL) {
+			static_key_set_entries(key, jlm->entries);
+			static_key_clear_linked(key);
 			kfree(jlm);
 		}
 	}
@@ -499,8 +594,10 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
 	case MODULE_STATE_COMING:
 		jump_label_lock();
 		ret = jump_label_add_module(mod);
-		if (ret)
+		if (ret) {
+			WARN(1, "Failed to allocate memory: jump_label may not work properly.\n");
 			jump_label_del_module(mod);
+		}
 		jump_label_unlock();
 		break;
 	case MODULE_STATE_GOING:
@@ -561,11 +658,14 @@ int jump_label_text_reserved(void *start, void *end)
 static void jump_label_update(struct static_key *key)
 {
 	struct jump_entry *stop = __stop___jump_table;
-	struct jump_entry *entry = static_key_entries(key);
+	struct jump_entry *entry;
 #ifdef CONFIG_MODULES
 	struct module *mod;
 
-	__jump_label_mod_update(key);
+	if (static_key_linked(key)) {
+		__jump_label_mod_update(key);
+		return;
+	}
 
 	preempt_disable();
 	mod = __module_address((unsigned long)key);
@@ -573,6 +673,7 @@ static void jump_label_update(struct static_key *key)
 		stop = mod->jump_entries + mod->num_jump_entries;
 	preempt_enable();
 #endif
+	entry = static_key_entries(key);
 	/* if there are no users, entry can be NULL */
 	if (entry)
 		__jump_label_update(key, entry, stop);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eb230f0..0c06093 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1110,13 +1110,6 @@ struct ftrace_func_entry {
 	unsigned long ip;
 };
 
-struct ftrace_hash {
-	unsigned long		size_bits;
-	struct hlist_head	*buckets;
-	unsigned long		count;
-	struct rcu_head		rcu;
-};
-
 /*
  * We make these constant because no one should touch them,
  * but they are used as the default "empty hash", to avoid allocating
@@ -1192,26 +1185,24 @@ struct ftrace_page {
 static struct ftrace_page	*ftrace_pages_start;
 static struct ftrace_page	*ftrace_pages;
 
-static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash)
+static __always_inline unsigned long
+ftrace_hash_key(struct ftrace_hash *hash, unsigned long ip)
 {
-	return !hash || !hash->count;
+	if (hash->size_bits > 0)
+		return hash_long(ip, hash->size_bits);
+
+	return 0;
 }
 
-static struct ftrace_func_entry *
-ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
+/* Only use this function if ftrace_hash_empty() has already been tested */
+static __always_inline struct ftrace_func_entry *
+__ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
 {
 	unsigned long key;
 	struct ftrace_func_entry *entry;
 	struct hlist_head *hhd;
 
-	if (ftrace_hash_empty(hash))
-		return NULL;
-
-	if (hash->size_bits > 0)
-		key = hash_long(ip, hash->size_bits);
-	else
-		key = 0;
-
+	key = ftrace_hash_key(hash, ip);
 	hhd = &hash->buckets[key];
 
 	hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) {
@@ -1221,17 +1212,32 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
 	return NULL;
 }
 
+/**
+ * ftrace_lookup_ip - Test to see if an ip exists in an ftrace_hash
+ * @hash: The hash to look at
+ * @ip: The instruction pointer to test
+ *
+ * Search a given @hash to see if a given instruction pointer (@ip)
+ * exists in it.
+ *
+ * Returns the entry that holds the @ip if found. NULL otherwise.
+ */
+struct ftrace_func_entry *
+ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
+{
+	if (ftrace_hash_empty(hash))
+		return NULL;
+
+	return __ftrace_lookup_ip(hash, ip);
+}
+
 static void __add_hash_entry(struct ftrace_hash *hash,
 			     struct ftrace_func_entry *entry)
 {
 	struct hlist_head *hhd;
 	unsigned long key;
 
-	if (hash->size_bits)
-		key = hash_long(entry->ip, hash->size_bits);
-	else
-		key = 0;
-
+	key = ftrace_hash_key(hash, entry->ip);
 	hhd = &hash->buckets[key];
 	hlist_add_head(&entry->hlist, hhd);
 	hash->count++;
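/*
 * A user-space sketch of the bucket selection now centralized in
 * ftrace_hash_key(): a multiplicative hash of the ip truncated to
 * size_bits, with key 0 for the degenerate single-bucket hash.  The
 * multiplier mirrors the kernel's 64-bit golden-ratio constant; treat
 * it and all names here as illustrative.
 */
#include <stdint.h>
#include <stdio.h>

static unsigned long hash_key(unsigned long ip, unsigned int size_bits)
{
	if (size_bits == 0)
		return 0;		/* one bucket only */
	return (unsigned long)(((uint64_t)ip * 0x61C8864680B583EBULL)
			       >> (64 - size_bits));
}

int main(void)
{
	unsigned long ip = 0xffffffff81000000UL;

	printf("bucket(10 bits) = %lu\n", hash_key(ip, 10));
	printf("bucket(0 bits)  = %lu\n", hash_key(ip, 0));
	return 0;
}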
@@ -1383,9 +1389,8 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
 static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
 				       struct ftrace_hash *new_hash);
 
-static int
-ftrace_hash_move(struct ftrace_ops *ops, int enable,
-		 struct ftrace_hash **dst, struct ftrace_hash *src)
+static struct ftrace_hash *
+__ftrace_hash_move(struct ftrace_hash *src)
 {
 	struct ftrace_func_entry *entry;
 	struct hlist_node *tn;
@@ -1393,21 +1398,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
 	struct ftrace_hash *new_hash;
 	int size = src->count;
 	int bits = 0;
-	int ret;
 	int i;
 
-	/* Reject setting notrace hash on IPMODIFY ftrace_ops */
-	if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
-		return -EINVAL;
-
 	/*
-	 * If the new source is empty, just free dst and assign it
-	 * the empty_hash.
+	 * If the new source is empty, just return the empty_hash.
 	 */
-	if (!src->count) {
-		new_hash = EMPTY_HASH;
-		goto update;
-	}
+	if (!src->count)
+		return EMPTY_HASH;
 
 	/*
 	 * Make the hash size about 1/2 the # found
@@ -1421,7 +1418,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
 
 	new_hash = alloc_ftrace_hash(bits);
 	if (!new_hash)
-		return -ENOMEM;
+		return NULL;
 
 	size = 1 << src->size_bits;
 	for (i = 0; i < size; i++) {
@@ -1432,7 +1429,24 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
 		}
 	}
 
-update:
+	return new_hash;
+}
+
+static int
+ftrace_hash_move(struct ftrace_ops *ops, int enable,
+		 struct ftrace_hash **dst, struct ftrace_hash *src)
+{
+	struct ftrace_hash *new_hash;
+	int ret;
+
+	/* Reject setting notrace hash on IPMODIFY ftrace_ops */
+	if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
+		return -EINVAL;
+
+	new_hash = __ftrace_hash_move(src);
+	if (!new_hash)
+		return -ENOMEM;
+
 	/* Make sure this can be applied if it is IPMODIFY ftrace_ops */
 	if (enable) {
 		/* IPMODIFY should be updated only when filter_hash updating */
@@ -1466,9 +1480,9 @@ static bool hash_contains_ip(unsigned long ip,
 	 * notrace hash is considered not in the notrace hash.
 	 */
 	return (ftrace_hash_empty(hash->filter_hash) ||
-		ftrace_lookup_ip(hash->filter_hash, ip)) &&
+		__ftrace_lookup_ip(hash->filter_hash, ip)) &&
 		(ftrace_hash_empty(hash->notrace_hash) ||
-		 !ftrace_lookup_ip(hash->notrace_hash, ip));
+		 !__ftrace_lookup_ip(hash->notrace_hash, ip));
 }
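
hash_contains_ip() packs the filter semantics into one predicate: an ip is traced when the filter hash is empty or contains it, and the ip is not in the notrace hash. A self-contained sketch of the same rule over two toy sets (hypothetical helpers, not the kernel API):

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdio.h>

	struct toy_set { const unsigned long *ips; size_t count; };

	static bool toy_set_has(const struct toy_set *s, unsigned long ip)
	{
		for (size_t i = 0; s && i < s->count; i++)
			if (s->ips[i] == ip)
				return true;
		return false;
	}

	/* Traced iff (filter empty OR ip in filter) AND ip NOT in notrace.
	 * An empty notrace set contains nothing anyway, so the kernel's
	 * extra ftrace_hash_empty() test there is just a fast path. */
	static bool toy_contains_ip(unsigned long ip, const struct toy_set *filter,
				    const struct toy_set *notrace)
	{
		return (!filter->count || toy_set_has(filter, ip)) &&
		       !toy_set_has(notrace, ip);
	}

	int main(void)
	{
		unsigned long f[] = { 0x1000 }, n[] = { 0x2000 };
		struct toy_set filter = { f, 1 }, notrace = { n, 1 };

		printf("%d %d\n", toy_contains_ip(0x1000, &filter, &notrace),
				  toy_contains_ip(0x2000, &filter, &notrace));
		return 0;
	}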
 
 /*
@@ -2880,7 +2894,7 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
 
 	/* The function must be in the filter */
 	if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
-	    !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
+	    !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
 		return 0;
 
 	/* If in notrace hash, we ignore it too */
@@ -4382,7 +4396,7 @@ __setup("ftrace_filter=", set_ftrace_filter);
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
 static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
-static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
+static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
 
 static unsigned long save_global_trampoline;
 static unsigned long save_global_flags;
@@ -4405,18 +4419,17 @@ static void __init set_ftrace_early_graph(char *buf, int enable)
 {
 	int ret;
 	char *func;
-	unsigned long *table = ftrace_graph_funcs;
-	int *count = &ftrace_graph_count;
+	struct ftrace_hash *hash;
 
-	if (!enable) {
-		table = ftrace_graph_notrace_funcs;
-		count = &ftrace_graph_notrace_count;
-	}
+	if (enable)
+		hash = ftrace_graph_hash;
+	else
+		hash = ftrace_graph_notrace_hash;
 
 	while (buf) {
 		func = strsep(&buf, ",");
 		/* we allow only one expression at a time */
-		ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func);
+		ret = ftrace_graph_set_hash(hash, func);
 		if (ret)
 			printk(KERN_DEBUG "ftrace: function %s not "
 					  "traceable\n", func);
@@ -4540,26 +4553,55 @@ static const struct file_operations ftrace_notrace_fops = {
 
 static DEFINE_MUTEX(graph_lock);
 
-int ftrace_graph_count;
-int ftrace_graph_notrace_count;
-unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
-unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
+struct ftrace_hash *ftrace_graph_hash = EMPTY_HASH;
+struct ftrace_hash *ftrace_graph_notrace_hash = EMPTY_HASH;
+
+enum graph_filter_type {
+	GRAPH_FILTER_NOTRACE	= 0,
+	GRAPH_FILTER_FUNCTION,
+};
+
+#define FTRACE_GRAPH_EMPTY	((void *)1)
 
 struct ftrace_graph_data {
-	unsigned long *table;
-	size_t size;
-	int *count;
-	const struct seq_operations *seq_ops;
+	struct ftrace_hash		*hash;
+	struct ftrace_func_entry	*entry;
+	int				idx;   /* for hash table iteration */
+	enum graph_filter_type		type;
+	struct ftrace_hash		*new_hash;
+	const struct seq_operations	*seq_ops;
+	struct trace_parser		parser;
 };
 
 static void *
 __g_next(struct seq_file *m, loff_t *pos)
 {
 	struct ftrace_graph_data *fgd = m->private;
+	struct ftrace_func_entry *entry = fgd->entry;
+	struct hlist_head *head;
+	int i, idx = fgd->idx;
 
-	if (*pos >= *fgd->count)
+	if (*pos >= fgd->hash->count)
 		return NULL;
-	return &fgd->table[*pos];
+
+	if (entry) {
+		hlist_for_each_entry_continue(entry, hlist) {
+			fgd->entry = entry;
+			return entry;
+		}
+
+		idx++;
+	}
+
+	for (i = idx; i < 1 << fgd->hash->size_bits; i++) {
+		head = &fgd->hash->buckets[i];
+		hlist_for_each_entry(entry, head, hlist) {
+			fgd->entry = entry;
+			fgd->idx = i;
+			return entry;
+		}
+	}
+	return NULL;
 }
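
__g_next() resumes hash iteration from a saved (bucket index, entry) cursor: it first tries to continue along the current chain, then scans forward through the remaining buckets. A compact userspace analogue of that cursor walk, with toy singly linked chains standing in for hlist:

	#include <stddef.h>
	#include <stdio.h>

	struct node { struct node *next; unsigned long ip; };
	struct cursor { int idx; struct node *entry; };

	/* Resume within the current chain first, then advance across buckets. */
	static struct node *toy_next(struct node **buckets, int nr_buckets,
				     struct cursor *c)
	{
		if (c->entry && c->entry->next) {
			c->entry = c->entry->next;
			return c->entry;
		}
		for (int i = c->entry ? c->idx + 1 : c->idx; i < nr_buckets; i++) {
			if (buckets[i]) {
				c->idx = i;
				c->entry = buckets[i];
				return c->entry;
			}
		}
		return NULL;
	}

	int main(void)
	{
		struct node a = { NULL, 1 }, b = { &a, 2 };	/* chain: 2 -> 1 */
		struct node lone = { NULL, 3 };
		struct node *buckets[4] = { NULL, &b, NULL, &lone };
		struct cursor cur = { 0, NULL };

		for (struct node *n = toy_next(buckets, 4, &cur); n;
		     n = toy_next(buckets, 4, &cur))
			printf("%lu\n", n->ip);
		return 0;
	}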
 
 static void *
@@ -4575,10 +4617,19 @@ static void *g_start(struct seq_file *m, loff_t *pos)
 
 	mutex_lock(&graph_lock);
 
-	/* Nothing, tell g_show to print all functions are enabled */
-	if (!*fgd->count && !*pos)
-		return (void *)1;
+	if (fgd->type == GRAPH_FILTER_FUNCTION)
+		fgd->hash = rcu_dereference_protected(ftrace_graph_hash,
+					lockdep_is_held(&graph_lock));
+	else
+		fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
+					lockdep_is_held(&graph_lock));
 
+	/* Nothing, tell g_show to print that all functions are enabled */
+	if (ftrace_hash_empty(fgd->hash) && !*pos)
+		return FTRACE_GRAPH_EMPTY;
+
+	fgd->idx = 0;
+	fgd->entry = NULL;
 	return __g_next(m, pos);
 }
 
@@ -4589,22 +4640,22 @@ static void g_stop(struct seq_file *m, void *p)
 
 static int g_show(struct seq_file *m, void *v)
 {
-	unsigned long *ptr = v;
+	struct ftrace_func_entry *entry = v;
 
-	if (!ptr)
+	if (!entry)
 		return 0;
 
-	if (ptr == (unsigned long *)1) {
+	if (entry == FTRACE_GRAPH_EMPTY) {
 		struct ftrace_graph_data *fgd = m->private;
 
-		if (fgd->table == ftrace_graph_funcs)
+		if (fgd->type == GRAPH_FILTER_FUNCTION)
 			seq_puts(m, "#### all functions enabled ####\n");
 		else
 			seq_puts(m, "#### no functions disabled ####\n");
 		return 0;
 	}
 
-	seq_printf(m, "%ps\n", (void *)*ptr);
+	seq_printf(m, "%ps\n", (void *)entry->ip);
 
 	return 0;
 }
@@ -4621,24 +4672,51 @@ __ftrace_graph_open(struct inode *inode, struct file *file,
 		    struct ftrace_graph_data *fgd)
 {
 	int ret = 0;
+	struct ftrace_hash *new_hash = NULL;
 
-	mutex_lock(&graph_lock);
-	if ((file->f_mode & FMODE_WRITE) &&
-	    (file->f_flags & O_TRUNC)) {
-		*fgd->count = 0;
-		memset(fgd->table, 0, fgd->size * sizeof(*fgd->table));
+	if (file->f_mode & FMODE_WRITE) {
+		const int size_bits = FTRACE_HASH_DEFAULT_BITS;
+
+		if (trace_parser_get_init(&fgd->parser, FTRACE_BUFF_MAX))
+			return -ENOMEM;
+
+		if (file->f_flags & O_TRUNC)
+			new_hash = alloc_ftrace_hash(size_bits);
+		else
+			new_hash = alloc_and_copy_ftrace_hash(size_bits,
+							      fgd->hash);
+		if (!new_hash) {
+			ret = -ENOMEM;
+			goto out;
+		}
 	}
-	mutex_unlock(&graph_lock);
 
 	if (file->f_mode & FMODE_READ) {
-		ret = seq_open(file, fgd->seq_ops);
+		ret = seq_open(file, &ftrace_graph_seq_ops);
 		if (!ret) {
 			struct seq_file *m = file->private_data;
 			m->private = fgd;
+		} else {
+			/* Failed */
+			free_ftrace_hash(new_hash);
+			new_hash = NULL;
 		}
 	} else
 		file->private_data = fgd;
 
+out:
+	if (ret < 0 && file->f_mode & FMODE_WRITE)
+		trace_parser_put(&fgd->parser);
+
+	fgd->new_hash = new_hash;
+
+	/*
+	 * All accesses to fgd->hash must be made with the graph_lock
+	 * held. The graph_lock is going to be released, so force
+	 * fgd->hash to be reinitialized when the lock is taken again.
+	 */
+	fgd->hash = NULL;
+
 	return ret;
 }
 
@@ -4646,6 +4724,7 @@ static int
 ftrace_graph_open(struct inode *inode, struct file *file)
 {
 	struct ftrace_graph_data *fgd;
+	int ret;
 
 	if (unlikely(ftrace_disabled))
 		return -ENODEV;
@@ -4654,18 +4733,26 @@ ftrace_graph_open(struct inode *inode, struct file *file)
 	if (fgd == NULL)
 		return -ENOMEM;
 
-	fgd->table = ftrace_graph_funcs;
-	fgd->size = FTRACE_GRAPH_MAX_FUNCS;
-	fgd->count = &ftrace_graph_count;
+	mutex_lock(&graph_lock);
+
+	fgd->hash = rcu_dereference_protected(ftrace_graph_hash,
+					lockdep_is_held(&graph_lock));
+	fgd->type = GRAPH_FILTER_FUNCTION;
 	fgd->seq_ops = &ftrace_graph_seq_ops;
 
-	return __ftrace_graph_open(inode, file, fgd);
+	ret = __ftrace_graph_open(inode, file, fgd);
+	if (ret < 0)
+		kfree(fgd);
+
+	mutex_unlock(&graph_lock);
+	return ret;
 }
 
 static int
 ftrace_graph_notrace_open(struct inode *inode, struct file *file)
 {
 	struct ftrace_graph_data *fgd;
+	int ret;
 
 	if (unlikely(ftrace_disabled))
 		return -ENODEV;
@@ -4674,45 +4761,97 @@ ftrace_graph_notrace_open(struct inode *inode, struct file *file)
 	if (fgd == NULL)
 		return -ENOMEM;
 
-	fgd->table = ftrace_graph_notrace_funcs;
-	fgd->size = FTRACE_GRAPH_MAX_FUNCS;
-	fgd->count = &ftrace_graph_notrace_count;
+	mutex_lock(&graph_lock);
+
+	fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
+					lockdep_is_held(&graph_lock));
+	fgd->type = GRAPH_FILTER_NOTRACE;
 	fgd->seq_ops = &ftrace_graph_seq_ops;
 
-	return __ftrace_graph_open(inode, file, fgd);
+	ret = __ftrace_graph_open(inode, file, fgd);
+	if (ret < 0)
+		kfree(fgd);
+
+	mutex_unlock(&graph_lock);
+	return ret;
 }
 
 static int
 ftrace_graph_release(struct inode *inode, struct file *file)
 {
+	struct ftrace_graph_data *fgd;
+	struct ftrace_hash *old_hash, *new_hash;
+	struct trace_parser *parser;
+	int ret = 0;
+
 	if (file->f_mode & FMODE_READ) {
 		struct seq_file *m = file->private_data;
 
-		kfree(m->private);
+		fgd = m->private;
 		seq_release(inode, file);
 	} else {
-		kfree(file->private_data);
+		fgd = file->private_data;
 	}
 
-	return 0;
+	if (file->f_mode & FMODE_WRITE) {
+		parser = &fgd->parser;
+
+		if (trace_parser_loaded(parser)) {
+			parser->buffer[parser->idx] = 0;
+			ret = ftrace_graph_set_hash(fgd->new_hash,
+						    parser->buffer);
+		}
+
+		trace_parser_put(parser);
+
+		new_hash = __ftrace_hash_move(fgd->new_hash);
+		if (!new_hash) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		mutex_lock(&graph_lock);
+
+		if (fgd->type == GRAPH_FILTER_FUNCTION) {
+			old_hash = rcu_dereference_protected(ftrace_graph_hash,
+					lockdep_is_held(&graph_lock));
+			rcu_assign_pointer(ftrace_graph_hash, new_hash);
+		} else {
+			old_hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
+					lockdep_is_held(&graph_lock));
+			rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash);
+		}
+
+		mutex_unlock(&graph_lock);
+
+		/* Wait until all users are done with the old hash */
+		synchronize_sched();
+
+		free_ftrace_hash(old_hash);
+	}
+
+ out:
+	kfree(fgd->new_hash);
+	kfree(fgd);
+
+	return ret;
 }
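
The release path above is the classic RCU replace pattern: build the new hash off to the side, publish it with rcu_assign_pointer() under graph_lock, wait out pre-existing readers with synchronize_sched(), and only then free the old hash. Below is a hedged userspace analogue that uses C11 release/acquire atomics in place of the RCU primitives; it mirrors only the publish ordering, not grace-period tracking, so the free would need liburcu or similar to be safe against concurrent readers:

	#include <stdatomic.h>
	#include <stdlib.h>

	struct cfg { int value; };

	static _Atomic(struct cfg *) live_cfg;

	/* Reader side: the acquire load stands in for rcu_dereference(). */
	static int read_value(void)
	{
		struct cfg *c = atomic_load_explicit(&live_cfg, memory_order_acquire);
		return c ? c->value : 0;
	}

	/* Writer side: publish the new copy, then reclaim the old one. */
	static void replace_cfg(int value)
	{
		struct cfg *new_cfg = malloc(sizeof(*new_cfg));

		if (!new_cfg)
			return;
		new_cfg->value = value;
		/* release store, like rcu_assign_pointer() */
		struct cfg *old = atomic_exchange_explicit(&live_cfg, new_cfg,
							   memory_order_acq_rel);
		/*
		 * The kernel waits here with synchronize_sched() before
		 * freeing; real userspace code needs liburcu or another
		 * grace-period mechanism at this point.
		 */
		free(old);
	}

	int main(void)
	{
		replace_cfg(1);
		replace_cfg(2);
		return read_value() == 2 ? 0 : 1;
	}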
 
 static int
-ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
+ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
 {
 	struct ftrace_glob func_g;
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
+	struct ftrace_func_entry *entry;
 	int fail = 1;
 	int not;
-	bool exists;
-	int i;
 
 	/* decode regex */
 	func_g.type = filter_parse_regex(buffer, strlen(buffer),
 					 &func_g.search, &not);
-	if (!not && *idx >= size)
-		return -EBUSY;
 
 	func_g.len = strlen(func_g.search);
 
@@ -4729,26 +4868,18 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
 			continue;
 
 		if (ftrace_match_record(rec, &func_g, NULL, 0)) {
-			/* if it is in the array */
-			exists = false;
-			for (i = 0; i < *idx; i++) {
-				if (array[i] == rec->ip) {
-					exists = true;
-					break;
-				}
-			}
+			entry = ftrace_lookup_ip(hash, rec->ip);
 
 			if (!not) {
 				fail = 0;
-				if (!exists) {
-					array[(*idx)++] = rec->ip;
-					if (*idx >= size)
-						goto out;
-				}
+
+				if (entry)
+					continue;
+				if (add_hash_entry(hash, rec->ip) < 0)
+					goto out;
 			} else {
-				if (exists) {
-					array[i] = array[--(*idx)];
-					array[*idx] = 0;
+				if (entry) {
+					free_hash_entry(hash, entry);
 					fail = 0;
 				}
 			}
@@ -4767,35 +4898,34 @@ static ssize_t
 ftrace_graph_write(struct file *file, const char __user *ubuf,
 		   size_t cnt, loff_t *ppos)
 {
-	struct trace_parser parser;
 	ssize_t read, ret = 0;
 	struct ftrace_graph_data *fgd = file->private_data;
+	struct trace_parser *parser;
 
 	if (!cnt)
 		return 0;
 
-	if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX))
-		return -ENOMEM;
+	/* Read mode uses seq functions */
+	if (file->f_mode & FMODE_READ) {
+		struct seq_file *m = file->private_data;
+		fgd = m->private;
+	}
 
-	read = trace_get_user(&parser, ubuf, cnt, ppos);
+	parser = &fgd->parser;
 
-	if (read >= 0 && trace_parser_loaded((&parser))) {
-		parser.buffer[parser.idx] = 0;
+	read = trace_get_user(parser, ubuf, cnt, ppos);
 
-		mutex_lock(&graph_lock);
+	if (read >= 0 && trace_parser_loaded(parser) &&
+	    !trace_parser_cont(parser)) {
 
-		/* we allow only one expression at a time */
-		ret = ftrace_set_func(fgd->table, fgd->count, fgd->size,
-				      parser.buffer);
-
-		mutex_unlock(&graph_lock);
+		ret = ftrace_graph_set_hash(fgd->new_hash,
+					    parser->buffer);
+		trace_parser_clear(parser);
 	}
 
 	if (!ret)
 		ret = read;
 
-	trace_parser_put(&parser);
-
 	return ret;
 }
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 310f0ea..707445c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -260,16 +260,8 @@ unsigned long long ns2usecs(u64 nsec)
 	TRACE_ITER_EVENT_FORK
 
 /*
- * The global_trace is the descriptor that holds the tracing
- * buffers for the live tracing. For each CPU, it contains
- * a link list of pages that will store trace entries. The
- * page descriptor of the pages in the memory is used to hold
- * the link list by linking the lru item in the page descriptor
- * to each of the pages in the buffer per CPU.
- *
- * For each active CPU there is a data field that holds the
- * pages for the buffer for that CPU. Each CPU has the same number
- * of pages allocated for its buffer.
+ * The global_trace is the descriptor that holds the top-level tracing
+ * buffers for the live tracing.
  */
 static struct trace_array global_trace = {
 	.trace_flags = TRACE_DEFAULT_FLAGS,
@@ -1193,6 +1185,7 @@ int trace_parser_get_init(struct trace_parser *parser, int size)
 void trace_parser_put(struct trace_parser *parser)
 {
 	kfree(parser->buffer);
+	parser->buffer = NULL;
 }
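
Clearing parser->buffer after kfree() makes trace_parser_put() idempotent, so the parser embedded in ftrace_graph_data can be put defensively from more than one place without risking a double kfree(); kfree(NULL) is a no-op. The idiom in miniature:

	#include <stdlib.h>

	struct parser { char *buffer; };

	/* free() and kfree() both ignore NULL, so NULLing the pointer
	 * turns a repeated put into a harmless no-op. */
	static void parser_put(struct parser *p)
	{
		free(p->buffer);
		p->buffer = NULL;
	}

	int main(void)
	{
		struct parser p = { malloc(32) };

		parser_put(&p);
		parser_put(&p);	/* safe: buffer is already NULL */
		return 0;
	}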
 
 /*
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1ea51ab..ae1cce9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -753,6 +753,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
 
 extern char trace_find_mark(unsigned long long duration);
 
+struct ftrace_hash {
+	unsigned long		size_bits;
+	struct hlist_head	*buckets;
+	unsigned long		count;
+	struct rcu_head		rcu;
+};
+
+struct ftrace_func_entry *
+ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip);
+
+static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
+{
+	return !hash || !hash->count;
+}
+
 /* Standard output formatting function used for function return traces */
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
@@ -787,53 +802,50 @@ extern void __trace_graph_return(struct trace_array *tr,
 				 struct ftrace_graph_ret *trace,
 				 unsigned long flags, int pc);
 
-
 #ifdef CONFIG_DYNAMIC_FTRACE
-/* TODO: make this variable */
-#define FTRACE_GRAPH_MAX_FUNCS		32
-extern int ftrace_graph_count;
-extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
-extern int ftrace_graph_notrace_count;
-extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS];
+extern struct ftrace_hash *ftrace_graph_hash;
+extern struct ftrace_hash *ftrace_graph_notrace_hash;
 
 static inline int ftrace_graph_addr(unsigned long addr)
 {
-	int i;
+	int ret = 0;
 
-	if (!ftrace_graph_count)
-		return 1;
+	preempt_disable_notrace();
 
-	for (i = 0; i < ftrace_graph_count; i++) {
-		if (addr == ftrace_graph_funcs[i]) {
-			/*
-			 * If no irqs are to be traced, but a set_graph_function
-			 * is set, and called by an interrupt handler, we still
-			 * want to trace it.
-			 */
-			if (in_irq())
-				trace_recursion_set(TRACE_IRQ_BIT);
-			else
-				trace_recursion_clear(TRACE_IRQ_BIT);
-			return 1;
-		}
+	if (ftrace_hash_empty(ftrace_graph_hash)) {
+		ret = 1;
+		goto out;
 	}
 
-	return 0;
+	if (ftrace_lookup_ip(ftrace_graph_hash, addr)) {
+		/*
+		 * If no irqs are to be traced, but a set_graph_function
+		 * is set, and called by an interrupt handler, we still
+		 * want to trace it.
+		 */
+		if (in_irq())
+			trace_recursion_set(TRACE_IRQ_BIT);
+		else
+			trace_recursion_clear(TRACE_IRQ_BIT);
+		ret = 1;
+	}
+
+out:
+	preempt_enable_notrace();
+	return ret;
 }
 
 static inline int ftrace_graph_notrace_addr(unsigned long addr)
 {
-	int i;
+	int ret = 0;
 
-	if (!ftrace_graph_notrace_count)
-		return 0;
+	preempt_disable_notrace();
 
-	for (i = 0; i < ftrace_graph_notrace_count; i++) {
-		if (addr == ftrace_graph_notrace_funcs[i])
-			return 1;
-	}
+	if (ftrace_lookup_ip(ftrace_graph_notrace_hash, addr))
+		ret = 1;
 
-	return 0;
+	preempt_enable_notrace();
+	return ret;
 }
 #else
 static inline int ftrace_graph_addr(unsigned long addr)
@@ -1300,7 +1312,8 @@ static inline bool is_string_field(struct ftrace_event_field *field)
 {
 	return field->filter_type == FILTER_DYN_STRING ||
 	       field->filter_type == FILTER_STATIC_STRING ||
-	       field->filter_type == FILTER_PTR_STRING;
+	       field->filter_type == FILTER_PTR_STRING ||
+	       field->filter_type == FILTER_COMM;
 }
 
 static inline bool is_function_field(struct ftrace_event_field *field)
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index e3b4888..e49fbe9 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -175,9 +175,9 @@ int trace_benchmark_reg(void)
 
 	bm_event_thread = kthread_run(benchmark_event_kthread,
 				      NULL, "event_benchmark");
-	if (!bm_event_thread) {
+	if (IS_ERR(bm_event_thread)) {
 		pr_warning("trace benchmark failed to create kernel thread\n");
-		return -ENOMEM;
+		return PTR_ERR(bm_event_thread);
 	}
 
 	return 0;
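
This fix works because kthread_run() never returns NULL on failure; it returns an ERR_PTR()-encoded errno, so the old NULL check could never fire and the first dereference of the bogus pointer would crash. The kernel encodes small negative errnos into the top of the pointer range; a userspace illustration using simplified versions of the <linux/err.h> helpers:

	#include <stdio.h>

	#define MAX_ERRNO	4095

	/* Simplified forms of the <linux/err.h> helpers. */
	static inline void *ERR_PTR(long error) { return (void *)error; }
	static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
	static inline int IS_ERR(const void *ptr)
	{
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	int main(void)
	{
		void *thread = ERR_PTR(-12);	/* as if kthread_run() hit -ENOMEM */

		if (!thread)
			printf("NULL check: never taken\n");
		if (IS_ERR(thread))
			printf("IS_ERR check: failed with %ld\n", PTR_ERR(thread));
		return 0;
	}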
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 75489de..4d8fdf3 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -27,7 +27,7 @@ static DEFINE_MUTEX(branch_tracing_mutex);
 static struct trace_array *branch_tracer;
 
 static void
-probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
 {
 	struct trace_event_call *call = &event_branch;
 	struct trace_array *tr = branch_tracer;
@@ -68,16 +68,17 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	entry	= ring_buffer_event_data(event);
 
 	/* Strip off the path, only save the file */
-	p = f->file + strlen(f->file);
-	while (p >= f->file && *p != '/')
+	p = f->data.file + strlen(f->data.file);
+	while (p >= f->data.file && *p != '/')
 		p--;
 	p++;
 
-	strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
+	strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE);
 	strncpy(entry->file, p, TRACE_FILE_SIZE);
 	entry->func[TRACE_FUNC_SIZE] = 0;
 	entry->file[TRACE_FILE_SIZE] = 0;
-	entry->line = f->line;
+	entry->constant = f->constant;
+	entry->line = f->data.line;
 	entry->correct = val == expect;
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
@@ -89,7 +90,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 }
 
 static inline
-void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
 {
 	if (!branch_tracing_enabled)
 		return;
@@ -195,13 +196,19 @@ core_initcall(init_branch_tracer);
 
 #else
 static inline
-void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
 {
 }
 #endif /* CONFIG_BRANCH_TRACER */
 
-void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
+void ftrace_likely_update(struct ftrace_likely_data *f, int val,
+			  int expect, int is_constant)
 {
+	/* A constant is always correct */
+	if (is_constant) {
+		f->constant++;
+		val = expect;
+	}
 	/*
 	 * I would love to have a trace point here instead, but the
 	 * trace point code is so inundated with unlikely and likely
@@ -212,9 +219,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
 
 	/* FIXME: Make this atomic! */
 	if (val == expect)
-		f->correct++;
+		f->data.correct++;
 	else
-		f->incorrect++;
+		f->data.incorrect++;
 }
 EXPORT_SYMBOL(ftrace_likely_update);
 
@@ -245,29 +252,60 @@ static inline long get_incorrect_percent(struct ftrace_branch_data *p)
 	return percent;
 }
 
-static int branch_stat_show(struct seq_file *m, void *v)
+static const char *branch_stat_process_file(struct ftrace_branch_data *p)
 {
-	struct ftrace_branch_data *p = v;
 	const char *f;
-	long percent;
 
 	/* Only print the file, not the path */
 	f = p->file + strlen(p->file);
 	while (f >= p->file && *f != '/')
 		f--;
-	f++;
+	return ++f;
+}
+
+static void branch_stat_show(struct seq_file *m,
+			     struct ftrace_branch_data *p, const char *f)
+{
+	long percent;
 
 	/*
 	 * The miss is overlaid on correct, and hit on incorrect.
 	 */
 	percent = get_incorrect_percent(p);
 
-	seq_printf(m, "%8lu %8lu ",  p->correct, p->incorrect);
 	if (percent < 0)
 		seq_puts(m, "  X ");
 	else
 		seq_printf(m, "%3ld ", percent);
+
 	seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
+}
+
+static int branch_stat_show_normal(struct seq_file *m,
+				   struct ftrace_branch_data *p, const char *f)
+{
+	seq_printf(m, "%8lu %8lu ",  p->correct, p->incorrect);
+	branch_stat_show(m, p, f);
+	return 0;
+}
+
+static int annotate_branch_stat_show(struct seq_file *m, void *v)
+{
+	struct ftrace_likely_data *p = v;
+	const char *f;
+	int l;
+
+	f = branch_stat_process_file(&p->data);
+
+	if (!p->constant)
+		return branch_stat_show_normal(m, &p->data, f);
+
+	l = snprintf(NULL, 0, "/%lu", p->constant);
+	l = l > 8 ? 0 : 8 - l;
+
+	seq_printf(m, "%8lu/%lu %*lu ",
+		   p->data.correct, p->constant, l, p->data.incorrect);
+	branch_stat_show(m, &p->data, f);
 	return 0;
 }
 
@@ -279,7 +317,7 @@ static void *annotated_branch_stat_start(struct tracer_stat *trace)
 static void *
 annotated_branch_stat_next(void *v, int idx)
 {
-	struct ftrace_branch_data *p = v;
+	struct ftrace_likely_data *p = v;
 
 	++p;
 
@@ -328,7 +366,7 @@ static struct tracer_stat annotated_branch_stats = {
 	.stat_next = annotated_branch_stat_next,
 	.stat_cmp = annotated_branch_stat_cmp,
 	.stat_headers = annotated_branch_stat_headers,
-	.stat_show = branch_stat_show
+	.stat_show = annotate_branch_stat_show
 };
 
 __init static int init_annotated_branch_stats(void)
@@ -379,12 +417,21 @@ all_branch_stat_next(void *v, int idx)
 	return p;
 }
 
+static int all_branch_stat_show(struct seq_file *m, void *v)
+{
+	struct ftrace_branch_data *p = v;
+	const char *f;
+
+	f = branch_stat_process_file(p);
+	return branch_stat_show_normal(m, p, f);
+}
+
 static struct tracer_stat all_branch_stats = {
 	.name = "branch_all",
 	.stat_start = all_branch_stat_start,
 	.stat_next = all_branch_stat_next,
 	.stat_headers = all_branch_stat_headers,
-	.stat_show = branch_stat_show
+	.stat_show = all_branch_stat_show
 };
 
 __init static int all_annotated_branch_stats(void)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index eb7396b..c203ac4 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -328,11 +328,13 @@ FTRACE_ENTRY(branch, trace_branch,
 		__array(	char,		func,	TRACE_FUNC_SIZE+1	)
 		__array(	char,		file,	TRACE_FILE_SIZE+1	)
 		__field(	char,		correct				)
+		__field(	char,		constant			)
 	),
 
-	F_printk("%u:%s:%s (%u)",
+	F_printk("%u:%s:%s (%u)%s",
 		 __entry->line,
-		 __entry->func, __entry->file, __entry->correct),
+		 __entry->func, __entry->file, __entry->correct,
+		 __entry->constant ? " CONSTANT" : ""),
 
 	FILTER_OTHER
 );
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index af344a1b..edfacd9 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -266,24 +266,13 @@ static int get_sample(void)
 static struct cpumask save_cpumask;
 static bool disable_migrate;
 
-static void move_to_next_cpu(bool initmask)
+static void move_to_next_cpu(void)
 {
-	static struct cpumask *current_mask;
+	struct cpumask *current_mask = &save_cpumask;
 	int next_cpu;
 
 	if (disable_migrate)
 		return;
-
-	/* Just pick the first CPU on first iteration */
-	if (initmask) {
-		current_mask = &save_cpumask;
-		get_online_cpus();
-		cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
-		put_online_cpus();
-		next_cpu = cpumask_first(current_mask);
-		goto set_affinity;
-	}
-
 	/*
 	 * If for some reason the user modifies the CPU affinity
 	 * of this thread, then stop migrating for the duration
@@ -300,7 +289,6 @@ static void move_to_next_cpu(bool initmask)
 	if (next_cpu >= nr_cpu_ids)
 		next_cpu = cpumask_first(current_mask);
 
- set_affinity:
 	if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
 		goto disable;
 
@@ -322,20 +310,15 @@ static void move_to_next_cpu(bool initmask)
  * need to ensure nothing else might be running (and thus preempting).
  * Obviously this should never be used in production environments.
  *
- * Currently this runs on which ever CPU it was scheduled on, but most
- * real-world hardware latency situations occur across several CPUs,
- * but we might later generalize this if we find there are any actualy
- * systems with alternate SMI delivery or other hardware latencies.
+ * Executes one loop iteration on each CPU listed in the tracing_cpumask sysfs file.
  */
 static int kthread_fn(void *data)
 {
 	u64 interval;
-	bool initmask = true;
 
 	while (!kthread_should_stop()) {
 
-		move_to_next_cpu(initmask);
-		initmask = false;
+		move_to_next_cpu();
 
 		local_irq_disable();
 		get_sample();
@@ -366,13 +349,27 @@ static int kthread_fn(void *data)
  */
 static int start_kthread(struct trace_array *tr)
 {
+	struct cpumask *current_mask = &save_cpumask;
 	struct task_struct *kthread;
+	int next_cpu;
+
+	/* Just pick the first CPU on first iteration */
+	get_online_cpus();
+	cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+	put_online_cpus();
+	next_cpu = cpumask_first(current_mask);
 
 	kthread = kthread_create(kthread_fn, NULL, "hwlatd");
 	if (IS_ERR(kthread)) {
 		pr_err(BANNER "could not start sampling thread\n");
 		return -ENOMEM;
 	}
+
+	cpumask_clear(current_mask);
+	cpumask_set_cpu(next_cpu, current_mask);
+	sched_setaffinity(kthread->pid, current_mask);
+
 	hwlat_kthread = kthread;
 	wake_up_process(kthread);
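
start_kthread() now computes the first CPU in the tracing mask and pins the thread before waking it, so even the first sample runs on a deliberately chosen CPU rather than wherever the scheduler happened to place the kthread. A hedged userspace equivalent of "pin to the first allowed CPU" via sched_setaffinity(2):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t mask;

		/* Start from the CPUs this task may currently run on. */
		if (sched_getaffinity(0, sizeof(mask), &mask))
			return 1;

		/* Pick the first set CPU and pin exclusively to it. */
		for (int cpu = 0; cpu < CPU_SETSIZE; cpu++) {
			if (CPU_ISSET(cpu, &mask)) {
				CPU_ZERO(&mask);
				CPU_SET(cpu, &mask);
				if (sched_setaffinity(0, sizeof(mask), &mask))
					return 1;
				printf("pinned to CPU %d\n", cpu);
				break;
			}
		}
		return 0;
	}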
 
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 7ad9e53..eadd96e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -16,6 +16,7 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
+#define pr_fmt(fmt)	"trace_kprobe: " fmt
 
 #include <linux/module.h>
 #include <linux/uaccess.h>
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 8c0553d..52478f0 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -21,6 +21,7 @@
  * Copyright (C) IBM Corporation, 2010-2011
  * Author:     Srikar Dronamraju
  */
+#define pr_fmt(fmt)	"trace_probe: " fmt
 
 #include "trace_probe.h"
 
@@ -647,7 +648,7 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
 				size_t count, loff_t *ppos,
 				int (*createfn)(int, char **))
 {
-	char *kbuf, *tmp;
+	char *kbuf, *buf, *tmp;
 	int ret = 0;
 	size_t done = 0;
 	size_t size;
@@ -667,27 +668,38 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
 			goto out;
 		}
 		kbuf[size] = '\0';
-		tmp = strchr(kbuf, '\n');
+		buf = kbuf;
+		do {
+			tmp = strchr(buf, '\n');
+			if (tmp) {
+				*tmp = '\0';
+				size = tmp - buf + 1;
+			} else {
+				size = strlen(buf);
+				if (done + size < count) {
+					if (buf != kbuf)
+						break;
+					/* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
+					pr_warn("Line length is too long: Should be less than %d\n",
+						WRITE_BUFSIZE - 2);
+					ret = -EINVAL;
+					goto out;
+				}
+			}
+			done += size;
 
-		if (tmp) {
-			*tmp = '\0';
-			size = tmp - kbuf + 1;
-		} else if (done + size < count) {
-			pr_warn("Line length is too long: Should be less than %d\n",
-				WRITE_BUFSIZE);
-			ret = -EINVAL;
-			goto out;
-		}
-		done += size;
-		/* Remove comments */
-		tmp = strchr(kbuf, '#');
+			/* Remove comments */
+			tmp = strchr(buf, '#');
 
-		if (tmp)
-			*tmp = '\0';
+			if (tmp)
+				*tmp = '\0';
 
-		ret = traceprobe_command(kbuf, createfn);
-		if (ret)
-			goto out;
+			ret = traceprobe_command(buf, createfn);
+			if (ret)
+				goto out;
+			buf += size;
+
+		} while (done < count);
 	}
 	ret = done;
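
The rewritten loop splits each chunk read from userspace on '\n' and dispatches every complete command in it, instead of handling only the first line per read; a missing trailing newline is an error only when the whole remaining input already fits in the buffer. A simplified userspace sketch of the split-strip-dispatch shape, where handle_line() is a hypothetical stand-in for traceprobe_command() and error handling is trimmed:

	#include <stdio.h>
	#include <string.h>

	static void handle_line(const char *line)
	{
		printf("cmd: %s\n", line);
	}

	/* Split a buffer on '\n', strip '#' comments, dispatch each piece. */
	static void dispatch(char *kbuf)
	{
		char *buf = kbuf;

		while (*buf) {
			char *tmp = strchr(buf, '\n');
			size_t size = tmp ? (size_t)(tmp - buf) + 1 : strlen(buf);

			if (tmp)
				*tmp = '\0';

			tmp = strchr(buf, '#');	/* remove comments */
			if (tmp)
				*tmp = '\0';

			if (*buf)
				handle_line(buf);
			buf += size;
		}
	}

	int main(void)
	{
		char input[] = "p:myprobe do_sys_open\n# a comment\np:other vfs_read\n";

		dispatch(input);
		return 0;
	}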
 
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 0913693..f4379e7 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -17,6 +17,7 @@
  * Copyright (C) IBM Corporation, 2010-2012
  * Author:	Srikar Dronamraju <srikar@linux.vnet.ibm.com>
  */
+#define pr_fmt(fmt)	"trace_uprobe: " fmt
 
 #include <linux/module.h>
 #include <linux/uaccess.h>
@@ -431,7 +432,8 @@ static int create_trace_uprobe(int argc, char **argv)
 		pr_info("Probe point is not specified.\n");
 		return -EINVAL;
 	}
-	arg = strchr(argv[1], ':');
+	/* Find the last occurrence, in case the path contains ':' too. */
+	arg = strrchr(argv[1], ':');
 	if (!arg) {
 		ret = -EINVAL;
 		goto fail_address_parse;
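
strrchr() matters here because a uprobe definition carries a file path followed by ":<offset>", and the path itself may legally contain ':'; only the last colon can be the separator. A tiny illustration with a hypothetical path:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char spec[] = "/opt/app:v2/bin/demo:0x4710";

		/* the path may contain ':', so split on the LAST one */
		char *arg = strrchr(spec, ':');

		if (!arg)
			return 1;
		*arg++ = '\0';
		printf("path = %s, offset = %s\n", spec, arg);
		return 0;
	}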
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index c8cebb1..9c21000 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -176,13 +176,12 @@ static int percpu_counter_cpu_dead(unsigned int cpu)
 	spin_lock_irq(&percpu_counters_lock);
 	list_for_each_entry(fbc, &percpu_counters, list) {
 		s32 *pcount;
-		unsigned long flags;
 
-		raw_spin_lock_irqsave(&fbc->lock, flags);
+		raw_spin_lock(&fbc->lock);
 		pcount = per_cpu_ptr(fbc->counters, cpu);
 		fbc->count += *pcount;
 		*pcount = 0;
-		raw_spin_unlock_irqrestore(&fbc->lock, flags);
+		raw_spin_unlock(&fbc->lock);
 	}
 	spin_unlock_irq(&percpu_counters_lock);
 #endif
diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
index 8fdee24..eafbf114 100644
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -12,8 +12,8 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
 {
 	FILE *fp;
 	char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1];
+	char path_v1[PATH_MAX + 1], path_v2[PATH_MAX + 1], *path;
 	char *token, *saved_ptr = NULL;
-	int found = 0;
 
 	fp = fopen("/proc/mounts", "r");
 	if (!fp)
@@ -24,31 +24,43 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
 	 * and inspect every cgroupfs mount point to find one that has
 	 * perf_event subsystem
 	 */
+	path_v1[0] = '\0';
+	path_v2[0] = '\0';
+
 	while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %"
 				STR(PATH_MAX)"s %*d %*d\n",
 				mountpoint, type, tokens) == 3) {
 
-		if (!strcmp(type, "cgroup")) {
+		if (!path_v1[0] && !strcmp(type, "cgroup")) {
 
 			token = strtok_r(tokens, ",", &saved_ptr);
 
 			while (token != NULL) {
 				if (!strcmp(token, "perf_event")) {
-					found = 1;
+					strcpy(path_v1, mountpoint);
 					break;
 				}
 				token = strtok_r(NULL, ",", &saved_ptr);
 			}
 		}
-		if (found)
+
+		if (!path_v2[0] && !strcmp(type, "cgroup2"))
+			strcpy(path_v2, mountpoint);
+
+		if (path_v1[0] && path_v2[0])
 			break;
 	}
 	fclose(fp);
-	if (!found)
+
+	if (path_v1[0])
+		path = path_v1;
+	else if (path_v2[0])
+		path = path_v2;
+	else
 		return -1;
 
-	if (strlen(mountpoint) < maxlen) {
-		strcpy(buf, mountpoint);
+	if (strlen(path) < maxlen) {
+		strcpy(buf, path);
 		return 0;
 	}
 	return -1;
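
The perf change records the first cgroup v1 mount that carries the perf_event option and the first cgroup2 mount, then prefers v1 (where the perf_event controller actually lives) and falls back to v2. A hedged sketch of the same scan using <mntent.h> instead of the hand-rolled fscanf() loop above:

	#include <mntent.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char path_v1[4096] = "", path_v2[4096] = "";
		FILE *fp = setmntent("/proc/mounts", "r");
		struct mntent *m;

		if (!fp)
			return 1;

		while ((m = getmntent(fp))) {
			if (!path_v1[0] && !strcmp(m->mnt_type, "cgroup") &&
			    hasmntopt(m, "perf_event"))
				snprintf(path_v1, sizeof(path_v1), "%s", m->mnt_dir);
			if (!path_v2[0] && !strcmp(m->mnt_type, "cgroup2"))
				snprintf(path_v2, sizeof(path_v2), "%s", m->mnt_dir);
			if (path_v1[0] && path_v2[0])
				break;
		}
		endmntent(fp);

		/* prefer the v1 hierarchy, fall back to v2 */
		const char *path = path_v1[0] ? path_v1 :
				   path_v2[0] ? path_v2 : NULL;
		printf("%s\n", path ? path : "no cgroup mount found");
		return 0;
	}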
diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index be93ab0..6e4eb2f 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -179,6 +179,7 @@
 my $iteration = 0;
 my $successes = 0;
 my $stty_orig;
+my $run_command_status = 0;
 
 my $bisect_good;
 my $bisect_bad;
@@ -1325,26 +1326,44 @@
 
 sub reboot {
     my ($time) = @_;
+    my $powercycle = 0;
 
-    # Make sure everything has been written to disk
-    run_ssh("sync");
+    # test whether the machine can be reached over ssh within 5 seconds
+    my $stat = run_ssh("echo check machine status", 5);
+    if (!$stat) {
+	doprint("power cycle\n");
+	$powercycle = 1;
+    }
 
-    if (defined($time)) {
+    if ($powercycle) {
+	run_command "$power_cycle";
+
 	start_monitor;
 	# flush out current monitor
 	# May contain the reboot success line
 	wait_for_monitor 1;
-    }
 
-    # try to reboot normally
-    if (run_command $reboot) {
-	if (defined($powercycle_after_reboot)) {
-	    sleep $powercycle_after_reboot;
+    } else {
+	# Make sure everything has been written to disk
+	run_ssh("sync");
+
+	if (defined($time)) {
+	    start_monitor;
+	    # flush out current monitor
+	    # May contain the reboot success line
+	    wait_for_monitor 1;
+	}
+
+	# try to reboot normally
+	if (run_command $reboot) {
+	    if (defined($powercycle_after_reboot)) {
+		sleep $powercycle_after_reboot;
+		run_command "$power_cycle";
+	    }
+	} else {
+	    # nope? power cycle it.
 	    run_command "$power_cycle";
 	}
-    } else {
-	# nope? power cycle it.
-	run_command "$power_cycle";
     }
 
     if (defined($time)) {
@@ -1412,6 +1431,10 @@
 	    system("stty $stty_orig");
     }
 
+    if (defined($post_test)) {
+	run_command $post_test;
+    }
+
     die @_, "\n";
 }
 
@@ -1624,10 +1647,6 @@
 
 sub fail {
 
-	if (defined($post_test)) {
-		run_command $post_test;
-	}
-
 	if ($die_on_failure) {
 		dodie @_;
 	}
@@ -1660,23 +1679,26 @@
 	    save_logs "fail", $store_failures;
         }
 
+	if (defined($post_test)) {
+		run_command $post_test;
+	}
+
 	return 1;
 }
 
 sub run_command {
-    my ($command, $redirect) = @_;
+    my ($command, $redirect, $timeout) = @_;
     my $start_time;
     my $end_time;
     my $dolog = 0;
     my $dord = 0;
     my $pid;
 
-    $start_time = time;
-
     $command =~ s/\$SSH_USER/$ssh_user/g;
     $command =~ s/\$MACHINE/$machine/g;
 
     doprint("$command ... ");
+    $start_time = time;
 
     $pid = open(CMD, "$command 2>&1 |") or
 	(fail "unable to exec $command" and return 0);
@@ -1693,13 +1715,30 @@
 	$dord = 1;
     }
 
-    while (<CMD>) {
-	print LOG if ($dolog);
-	print RD  if ($dord);
+    my $hit_timeout = 0;
+
+    while (1) {
+	my $fp = \*CMD;
+	if (defined($timeout)) {
+	    doprint "timeout = $timeout\n";
+	}
+	my $line = wait_for_input($fp, $timeout);
+	if (!defined($line)) {
+	    my $now = time;
+	    if (defined($timeout) && (($now - $start_time) >= $timeout)) {
+		doprint "Hit timeout of $timeout, killing process\n";
+		$hit_timeout = 1;
+		kill 9, $pid;
+	    }
+	    last;
+	}
+	print LOG $line if ($dolog);
+	print RD $line if ($dord);
     }
 
     waitpid($pid, 0);
-    my $failed = $?;
+    # shift 8 for real exit status
+    $run_command_status = $? >> 8;
 
     close(CMD);
     close(LOG) if ($dolog);
@@ -1714,21 +1753,25 @@
 	doprint "[$delta seconds] ";
     }
 
-    if ($failed) {
+    if ($hit_timeout) {
+	$run_command_status = 1;
+    }
+
+    if ($run_command_status) {
 	doprint "FAILED!\n";
     } else {
 	doprint "SUCCESS\n";
     }
 
-    return !$failed;
+    return !$run_command_status;
 }
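
run_command() now pulls the child's output through wait_for_input() so a wedged command can be killed once the caller's timeout elapses, and the real exit code is recovered with $? >> 8. A rough C analogue of the same shape, using popen() plus select(2); note this sketch applies the timeout per read rather than against the wall clock as ktest does, and select() paired with stdio buffering is an approximation, not a production pattern:

	#include <stdio.h>
	#include <sys/select.h>
	#include <sys/wait.h>

	/* Run a shell command, echo its output, abandon a read that stalls
	 * for more than timeout_s seconds. */
	static int run_command(const char *cmd, int timeout_s)
	{
		FILE *fp = popen(cmd, "r");
		char line[4096];
		int fd, status;

		if (!fp)
			return -1;
		fd = fileno(fp);

		for (;;) {
			fd_set rfds;
			struct timeval tv = { .tv_sec = timeout_s, .tv_usec = 0 };

			FD_ZERO(&rfds);
			FD_SET(fd, &rfds);
			if (select(fd + 1, &rfds, NULL, NULL, &tv) <= 0)
				break;		/* timeout or error */
			if (!fgets(line, sizeof(line), fp))
				break;		/* EOF */
			fputs(line, stdout);
		}

		status = pclose(fp);
		/* like Perl's $? >> 8: keep only the real exit code */
		return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
	}

	int main(void)
	{
		return run_command("echo hello", 5);
	}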
 
 sub run_ssh {
-    my ($cmd) = @_;
+    my ($cmd, $timeout) = @_;
     my $cp_exec = $ssh_exec;
 
     $cp_exec =~ s/\$SSH_COMMAND/$cmd/g;
-    return run_command "$cp_exec";
+    return run_command "$cp_exec", undef, $timeout;
 }
 
 sub run_scp {
@@ -2489,10 +2532,6 @@
 sub success {
     my ($i) = @_;
 
-    if (defined($post_test)) {
-	run_command $post_test;
-    }
-
     $successes++;
 
     my $name = "";
@@ -2517,6 +2556,10 @@
 	doprint "Reboot and wait $sleep_time seconds\n";
 	reboot_to_good $sleep_time;
     }
+
+    if (defined($post_test)) {
+	run_command $post_test;
+    }
 }
 
 sub answer_bisect {
@@ -2537,16 +2580,15 @@
 }
 
 sub child_run_test {
-    my $failed = 0;
 
     # child should have no power
     $reboot_on_error = 0;
     $poweroff_on_error = 0;
     $die_on_failure = 1;
 
-    run_command $run_test, $testlog or $failed = 1;
+    run_command $run_test, $testlog;
 
-    exit $failed;
+    exit $run_command_status;
 }
 
 my $child_done;
@@ -2629,7 +2671,7 @@
     }
 
     waitpid $child_pid, 0;
-    $child_exit = $?;
+    $child_exit = $? >> 8;
 
     my $end_time = time;
     $test_time = $end_time - $start_time;
@@ -3330,7 +3372,6 @@
     save_config \%good_configs, $good_config;
     save_config \%bad_configs, $bad_config;
 
-
     if (defined($config_bisect_check) && $config_bisect_check ne "0") {
 	if ($config_bisect_check ne "good") {
 	    doprint "Testing bad config\n";