Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (46 commits)
  hwrng: via_rng - Fix memory scribbling on some CPUs
  crypto: padlock - Move padlock.h into include/crypto
  hwrng: via_rng - Fix asm constraints
  crypto: n2 - use __devexit not __exit in n2_unregister_algs
  crypto: mark crypto workqueues CPU_INTENSIVE
  crypto: mv_cesa - dont return PTR_ERR() of wrong pointer
  crypto: ripemd - Set module author and update email address
  crypto: omap-sham - backlog handling fix
  crypto: gf128mul - Remove experimental tag
  crypto: af_alg - fix af_alg memory_allocated data type
  crypto: aesni-intel - Fixed build with binutils 2.16
  crypto: af_alg - Make sure sk_security is initialized on accept()ed sockets
  net: Add missing lockdep class names for af_alg
  include: Install linux/if_alg.h for user-space crypto API
  crypto: omap-aes - checkpatch --file warning fixes
  crypto: omap-aes - initialize aes module once per request
  crypto: omap-aes - unnecessary code removed
  crypto: omap-aes - error handling implementation improved
  crypto: omap-aes - redundant locking is removed
  crypto: omap-aes - DMA initialization fixes for OMAP off mode
  ...
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756..8fe2a49 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -9,6 +9,20 @@
  *            Vinodh Gopal <vinodh.gopal@intel.com>
  *            Kahraman Akdemir
  *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *             Adrian Hoban <adrian.hoban@intel.com>
+ *             James Guilford (james.guilford@intel.com)
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Wajdi Feghali (wajdi.k.feghali@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
+ *
+ * Ported x86_64 version to x86:
+ *    Author: Mathias Krause <minipli@googlemail.com>
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -18,8 +32,62 @@
 #include <linux/linkage.h>
 #include <asm/inst.h>
 
+#ifdef __x86_64__
+.data
+POLY:   .octa 0xC2000000000000000000000000000001
+TWOONE: .octa 0x00000001000000000000000000000001
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK,
+# and ZERO should follow ALL_F
+
+SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
+MASK1:      .octa 0x0000000000000000ffffffffffffffff
+MASK2:      .octa 0xffffffffffffffff0000000000000000
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
+ZERO:       .octa 0x00000000000000000000000000000000
+ONE:        .octa 0x00000000000000000000000000000001
+F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
+dec:        .octa 0x1
+enc:        .octa 0x2
+
+
 .text
 
+
+#define	STACK_OFFSET    8*3
+#define	HashKey		16*0	// store HashKey <<1 mod poly here
+#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
+#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
+#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
+#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
+				// bits of  HashKey <<1 mod poly here
+				//(for Karatsuba purposes)
+#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
+				// bits of  HashKey^2 <<1 mod poly here
+				// (for Karatsuba purposes)
+#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
+				// bits of  HashKey^3 <<1 mod poly here
+				// (for Karatsuba purposes)
+#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
+				// bits of  HashKey^4 <<1 mod poly here
+				// (for Karatsuba purposes)
+#define	VARIABLE_OFFSET	16*8
+
+#define arg1 rdi
+#define arg2 rsi
+#define arg3 rdx
+#define arg4 rcx
+#define arg5 r8
+#define arg6 r9
+#define arg7 STACK_OFFSET+8(%r14)
+#define arg8 STACK_OFFSET+16(%r14)
+#define arg9 STACK_OFFSET+24(%r14)
+#define arg10 STACK_OFFSET+32(%r14)
+#endif
+
+
 #define STATE1	%xmm0
 #define STATE2	%xmm4
 #define STATE3	%xmm5
@@ -32,12 +100,16 @@
 #define IN	IN1
 #define KEY	%xmm2
 #define IV	%xmm3
+
 #define BSWAP_MASK %xmm10
 #define CTR	%xmm11
 #define INC	%xmm12
 
+#ifdef __x86_64__
+#define AREG	%rax
 #define KEYP	%rdi
 #define OUTP	%rsi
+#define UKEYP	OUTP
 #define INP	%rdx
 #define LEN	%rcx
 #define IVP	%r8
@@ -46,6 +118,1588 @@
 #define TKEYP	T1
 #define T2	%r11
 #define TCTR_LOW T2
+#else
+#define AREG	%eax
+#define KEYP	%edi
+#define OUTP	AREG
+#define UKEYP	OUTP
+#define INP	%edx
+#define LEN	%esi
+#define IVP	%ebp
+#define KLEN	%ebx
+#define T1	%ecx
+#define TKEYP	T1
+#endif
+
+
+#ifdef __x86_64__
+/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+*
+*
+* Input: A and B (128-bits each, bit-reflected)
+* Output: C = A*B*x mod poly, (i.e. >>1 )
+* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+*
+*/
+.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
+	movdqa	  \GH, \TMP1
+	pshufd	  $78, \GH, \TMP2
+	pshufd	  $78, \HK, \TMP3
+	pxor	  \GH, \TMP2            # TMP2 = a1+a0
+	pxor	  \HK, \TMP3            # TMP3 = b1+b0
+	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
+	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
+	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
+	pxor	  \GH, \TMP2
+	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b0)+(a1*b0)
+	movdqa	  \TMP2, \TMP3
+	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
+	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
+	pxor	  \TMP3, \GH
+	pxor	  \TMP2, \TMP1          # TMP2:GH holds the result of GH*HK
+
+        # first phase of the reduction
+
+	movdqa    \GH, \TMP2
+	movdqa    \GH, \TMP3
+	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
+					# in in order to perform
+					# independent shifts
+	pslld     $31, \TMP2            # packed right shift <<31
+	pslld     $30, \TMP3            # packed right shift <<30
+	pslld     $25, \TMP4            # packed right shift <<25
+	pxor      \TMP3, \TMP2          # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	movdqa    \TMP2, \TMP5
+	psrldq    $4, \TMP5             # right shift TMP5 1 DW
+	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
+	pxor      \TMP2, \GH
+
+        # second phase of the reduction
+
+	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
+					# in in order to perform
+					# independent shifts
+	movdqa    \GH,\TMP3
+	movdqa    \GH,\TMP4
+	psrld     $1,\TMP2              # packed left shift >>1
+	psrld     $2,\TMP3              # packed left shift >>2
+	psrld     $7,\TMP4              # packed left shift >>7
+	pxor      \TMP3,\TMP2		# xor the shifted versions
+	pxor      \TMP4,\TMP2
+	pxor      \TMP5, \TMP2
+	pxor      \TMP2, \GH
+	pxor      \TMP1, \GH            # result is in TMP1
+.endm
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on
+* the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
+* are clobbered
+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
+*/
+
+
+.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+	mov	   arg7, %r10           # %r10 = AAD
+	mov	   arg8, %r12           # %r12 = aadLen
+	mov	   %r12, %r11
+	pxor	   %xmm\i, %xmm\i
+_get_AAD_loop\num_initial_blocks\operation:
+	movd	   (%r10), \TMP1
+	pslldq	   $12, \TMP1
+	psrldq	   $4, %xmm\i
+	pxor	   \TMP1, %xmm\i
+	add	   $4, %r10
+	sub	   $4, %r12
+	jne	   _get_AAD_loop\num_initial_blocks\operation
+	cmp	   $16, %r11
+	je	   _get_AAD_loop2_done\num_initial_blocks\operation
+	mov	   $16, %r12
+_get_AAD_loop2\num_initial_blocks\operation:
+	psrldq	   $4, %xmm\i
+	sub	   $4, %r12
+	cmp	   %r11, %r12
+	jne	   _get_AAD_loop2\num_initial_blocks\operation
+_get_AAD_loop2_done\num_initial_blocks\operation:
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
+
+	xor	   %r11, %r11 # initialise the data pointer offset as zero
+
+        # start AES for num_initial_blocks blocks
+
+	mov	   %arg5, %rax                      # %rax = *Y0
+	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM   %xmm14, \XMM0
+
+.if (\i == 5) || (\i == 6) || (\i == 7)
+.irpc index, \i_seq
+	paddd	   ONE(%rip), \XMM0                 # INCR Y0
+	movdqa	   \XMM0, %xmm\index
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
+
+.endr
+.irpc index, \i_seq
+	pxor	   16*0(%arg1), %xmm\index
+.endr
+.irpc index, \i_seq
+	movaps 0x10(%rdi), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 1
+.endr
+.irpc index, \i_seq
+	movaps 0x20(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x30(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x40(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x50(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x60(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x70(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x80(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x90(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0xa0(%arg1), \TMP1
+	AESENCLAST \TMP1, %xmm\index         # Round 10
+.endr
+.irpc index, \i_seq
+	movdqu	   (%arg3 , %r11, 1), \TMP1
+	pxor	   \TMP1, %xmm\index
+	movdqu	   %xmm\index, (%arg2 , %r11, 1)
+	# write back plaintext/ciphertext for num_initial_blocks
+	add	   $16, %r11
+
+	movdqa     \TMP1, %xmm\index
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM	   %xmm14, %xmm\index
+
+		# prepare plaintext/ciphertext for GHASH computation
+.endr
+.endif
+	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+        pxor       %xmm5, %xmm6
+	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm6, %xmm7
+	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+        pxor       %xmm6, %xmm7
+	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+	cmp	   $64, %r13
+	jl	_initial_blocks_done\num_initial_blocks\operation
+	# no need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+*/
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM1
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
+
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM2
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
+
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM3
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
+
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM4
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
+
+	pxor	   16*0(%arg1), \XMM1
+	pxor	   16*0(%arg1), \XMM2
+	pxor	   16*0(%arg1), \XMM3
+	pxor	   16*0(%arg1), \XMM4
+	movdqa	   \TMP3, \TMP5
+	pshufd	   $78, \TMP3, \TMP1
+	pxor	   \TMP3, \TMP1
+	movdqa	   \TMP1, HashKey_k(%rsp)
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^2<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_2(%rsp)
+# HashKey_2 = HashKey^2<<1 (mod poly)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_2_k(%rsp)
+.irpc index, 1234 # do 4 rounds
+	movaps 0x10*\index(%arg1), \TMP1
+	AESENC	   \TMP1, \XMM1
+	AESENC	   \TMP1, \XMM2
+	AESENC	   \TMP1, \XMM3
+	AESENC	   \TMP1, \XMM4
+.endr
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_3(%rsp)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_3_k(%rsp)
+.irpc index, 56789 # do next 5 rounds
+	movaps 0x10*\index(%arg1), \TMP1
+	AESENC	   \TMP1, \XMM1
+	AESENC	   \TMP1, \XMM2
+	AESENC	   \TMP1, \XMM3
+	AESENC	   \TMP1, \XMM4
+.endr
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_4(%rsp)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_4_k(%rsp)
+	movaps 0xa0(%arg1), \TMP2
+	AESENCLAST \TMP2, \XMM1
+	AESENCLAST \TMP2, \XMM2
+	AESENCLAST \TMP2, \XMM3
+	AESENCLAST \TMP2, \XMM4
+	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM1
+	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM1
+	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM2
+	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM2
+	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM3
+	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM3
+	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM4
+	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM4
+	add	   $64, %r11
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+	pxor	   \XMMDst, \XMM1
+# combine GHASHed value with the corresponding ciphertext
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+_initial_blocks_done\num_initial_blocks\operation:
+
+.endm
+
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on
+* the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
+* are clobbered
+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
+*/
+
+
+.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+	mov	   arg7, %r10           # %r10 = AAD
+	mov	   arg8, %r12           # %r12 = aadLen
+	mov	   %r12, %r11
+	pxor	   %xmm\i, %xmm\i
+_get_AAD_loop\num_initial_blocks\operation:
+	movd	   (%r10), \TMP1
+	pslldq	   $12, \TMP1
+	psrldq	   $4, %xmm\i
+	pxor	   \TMP1, %xmm\i
+	add	   $4, %r10
+	sub	   $4, %r12
+	jne	   _get_AAD_loop\num_initial_blocks\operation
+	cmp	   $16, %r11
+	je	   _get_AAD_loop2_done\num_initial_blocks\operation
+	mov	   $16, %r12
+_get_AAD_loop2\num_initial_blocks\operation:
+	psrldq	   $4, %xmm\i
+	sub	   $4, %r12
+	cmp	   %r11, %r12
+	jne	   _get_AAD_loop2\num_initial_blocks\operation
+_get_AAD_loop2_done\num_initial_blocks\operation:
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
+
+	xor	   %r11, %r11 # initialise the data pointer offset as zero
+
+        # start AES for num_initial_blocks blocks
+
+	mov	   %arg5, %rax                      # %rax = *Y0
+	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM   %xmm14, \XMM0
+
+.if (\i == 5) || (\i == 6) || (\i == 7)
+.irpc index, \i_seq
+	paddd	   ONE(%rip), \XMM0                 # INCR Y0
+	movdqa	   \XMM0, %xmm\index
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
+
+.endr
+.irpc index, \i_seq
+	pxor	   16*0(%arg1), %xmm\index
+.endr
+.irpc index, \i_seq
+	movaps 0x10(%rdi), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 1
+.endr
+.irpc index, \i_seq
+	movaps 0x20(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x30(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x40(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x50(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x60(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x70(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x80(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x90(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0xa0(%arg1), \TMP1
+	AESENCLAST \TMP1, %xmm\index         # Round 10
+.endr
+.irpc index, \i_seq
+	movdqu	   (%arg3 , %r11, 1), \TMP1
+	pxor	   \TMP1, %xmm\index
+	movdqu	   %xmm\index, (%arg2 , %r11, 1)
+	# write back plaintext/ciphertext for num_initial_blocks
+	add	   $16, %r11
+
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM	   %xmm14, %xmm\index
+
+		# prepare plaintext/ciphertext for GHASH computation
+.endr
+.endif
+	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+        pxor       %xmm5, %xmm6
+	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm6, %xmm7
+	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+        pxor       %xmm6, %xmm7
+	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+	cmp	   $64, %r13
+	jl	_initial_blocks_done\num_initial_blocks\operation
+	# no need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+*/
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM1
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
+
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM2
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
+
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM3
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
+
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM4
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
+
+	pxor	   16*0(%arg1), \XMM1
+	pxor	   16*0(%arg1), \XMM2
+	pxor	   16*0(%arg1), \XMM3
+	pxor	   16*0(%arg1), \XMM4
+	movdqa	   \TMP3, \TMP5
+	pshufd	   $78, \TMP3, \TMP1
+	pxor	   \TMP3, \TMP1
+	movdqa	   \TMP1, HashKey_k(%rsp)
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^2<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_2(%rsp)
+# HashKey_2 = HashKey^2<<1 (mod poly)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_2_k(%rsp)
+.irpc index, 1234 # do 4 rounds
+	movaps 0x10*\index(%arg1), \TMP1
+	AESENC	   \TMP1, \XMM1
+	AESENC	   \TMP1, \XMM2
+	AESENC	   \TMP1, \XMM3
+	AESENC	   \TMP1, \XMM4
+.endr
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_3(%rsp)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_3_k(%rsp)
+.irpc index, 56789 # do next 5 rounds
+	movaps 0x10*\index(%arg1), \TMP1
+	AESENC	   \TMP1, \XMM1
+	AESENC	   \TMP1, \XMM2
+	AESENC	   \TMP1, \XMM3
+	AESENC	   \TMP1, \XMM4
+.endr
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_4(%rsp)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_4_k(%rsp)
+	movaps 0xa0(%arg1), \TMP2
+	AESENCLAST \TMP2, \XMM1
+	AESENCLAST \TMP2, \XMM2
+	AESENCLAST \TMP2, \XMM3
+	AESENCLAST \TMP2, \XMM4
+	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM1
+	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM2
+	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM3
+	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM4
+	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
+	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
+	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
+	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
+
+	add	   $64, %r11
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+	pxor	   \XMMDst, \XMM1
+# combine GHASHed value with the corresponding ciphertext
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+	PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+_initial_blocks_done\num_initial_blocks\operation:
+
+.endm
+
+/*
+* encrypt 4 blocks at a time
+* ghash the 4 previously encrypted ciphertext blocks
+* arg1, %arg2, %arg3 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+	movdqa	  \XMM1, \XMM5
+	movdqa	  \XMM2, \XMM6
+	movdqa	  \XMM3, \XMM7
+	movdqa	  \XMM4, \XMM8
+
+        movdqa    SHUF_MASK(%rip), %xmm15
+        # multiply TMP5 * HashKey using karatsuba
+
+	movdqa	  \XMM5, \TMP4
+	pshufd	  $78, \XMM5, \TMP6
+	pxor	  \XMM5, \TMP6
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa	  HashKey_4(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
+	movdqa    \XMM0, \XMM1
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM2
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM3
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM4
+	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
+	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
+	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
+	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
+	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
+
+	pxor	  (%arg1), \XMM1
+	pxor	  (%arg1), \XMM2
+	pxor	  (%arg1), \XMM3
+	pxor	  (%arg1), \XMM4
+	movdqa	  HashKey_4_k(%rsp), \TMP5
+	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
+	movaps 0x10(%arg1), \TMP1
+	AESENC	  \TMP1, \XMM1              # Round 1
+	AESENC	  \TMP1, \XMM2
+	AESENC	  \TMP1, \XMM3
+	AESENC	  \TMP1, \XMM4
+	movaps 0x20(%arg1), \TMP1
+	AESENC	  \TMP1, \XMM1              # Round 2
+	AESENC	  \TMP1, \XMM2
+	AESENC	  \TMP1, \XMM3
+	AESENC	  \TMP1, \XMM4
+	movdqa	  \XMM6, \TMP1
+	pshufd	  $78, \XMM6, \TMP2
+	pxor	  \XMM6, \TMP2
+	movdqa	  HashKey_3(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
+	movaps 0x30(%arg1), \TMP3
+	AESENC    \TMP3, \XMM1              # Round 3
+	AESENC    \TMP3, \XMM2
+	AESENC    \TMP3, \XMM3
+	AESENC    \TMP3, \XMM4
+	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
+	movaps 0x40(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1              # Round 4
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	movdqa	  HashKey_3_k(%rsp), \TMP5
+	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+	movaps 0x50(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1              # Round 5
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	pxor	  \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+	pxor	  \XMM6, \XMM5
+	pxor	  \TMP2, \TMP6
+	movdqa	  \XMM7, \TMP1
+	pshufd	  $78, \XMM7, \TMP2
+	pxor	  \XMM7, \TMP2
+	movdqa	  HashKey_2(%rsp ), \TMP5
+
+        # Multiply TMP5 * HashKey using karatsuba
+
+	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
+	movaps 0x60(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1              # Round 6
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
+	movaps 0x70(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1             # Round 7
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	movdqa	  HashKey_2_k(%rsp), \TMP5
+	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+	movaps 0x80(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1             # Round 8
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	pxor	  \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+	pxor	  \XMM7, \XMM5
+	pxor	  \TMP2, \TMP6
+
+        # Multiply XMM8 * HashKey
+        # XMM8 and TMP5 hold the values for the two operands
+
+	movdqa	  \XMM8, \TMP1
+	pshufd	  $78, \XMM8, \TMP2
+	pxor	  \XMM8, \TMP2
+	movdqa	  HashKey(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
+	movaps 0x90(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1            # Round 9
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
+	movaps 0xa0(%arg1), \TMP3
+	AESENCLAST \TMP3, \XMM1           # Round 10
+	AESENCLAST \TMP3, \XMM2
+	AESENCLAST \TMP3, \XMM3
+	AESENCLAST \TMP3, \XMM4
+	movdqa    HashKey_k(%rsp), \TMP5
+	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
+	movdqu	  (%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
+	movdqu	  16(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
+	movdqu	  32(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
+	movdqu	  48(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
+        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
+        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
+	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
+	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
+	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
+	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
+
+	pxor	  \TMP4, \TMP1
+	pxor	  \XMM8, \XMM5
+	pxor	  \TMP6, \TMP2
+	pxor	  \TMP1, \TMP2
+	pxor	  \XMM5, \TMP2
+	movdqa	  \TMP2, \TMP3
+	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
+	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
+	pxor	  \TMP3, \XMM5
+	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
+
+        # first phase of reduction
+
+	movdqa    \XMM5, \TMP2
+	movdqa    \XMM5, \TMP3
+	movdqa    \XMM5, \TMP4
+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+	pslld     $31, \TMP2                   # packed right shift << 31
+	pslld     $30, \TMP3                   # packed right shift << 30
+	pslld     $25, \TMP4                   # packed right shift << 25
+	pxor      \TMP3, \TMP2	               # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	movdqa    \TMP2, \TMP5
+	psrldq    $4, \TMP5                    # right shift T5 1 DW
+	pslldq    $12, \TMP2                   # left shift T2 3 DWs
+	pxor      \TMP2, \XMM5
+
+        # second phase of reduction
+
+	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+	movdqa    \XMM5,\TMP3
+	movdqa    \XMM5,\TMP4
+	psrld     $1, \TMP2                    # packed left shift >>1
+	psrld     $2, \TMP3                    # packed left shift >>2
+	psrld     $7, \TMP4                    # packed left shift >>7
+	pxor      \TMP3,\TMP2		       # xor the shifted versions
+	pxor      \TMP4,\TMP2
+	pxor      \TMP5, \TMP2
+	pxor      \TMP2, \XMM5
+	pxor      \TMP1, \XMM5                 # result is in TMP1
+
+	pxor	  \XMM5, \XMM1
+.endm
+
+/*
+* decrypt 4 blocks at a time
+* ghash the 4 previously decrypted ciphertext blocks
+* arg1, %arg2, %arg3 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+	movdqa	  \XMM1, \XMM5
+	movdqa	  \XMM2, \XMM6
+	movdqa	  \XMM3, \XMM7
+	movdqa	  \XMM4, \XMM8
+
+        movdqa    SHUF_MASK(%rip), %xmm15
+        # multiply TMP5 * HashKey using karatsuba
+
+	movdqa	  \XMM5, \TMP4
+	pshufd	  $78, \XMM5, \TMP6
+	pxor	  \XMM5, \TMP6
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa	  HashKey_4(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
+	movdqa    \XMM0, \XMM1
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM2
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM3
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM4
+	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
+	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
+	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
+	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
+	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
+
+	pxor	  (%arg1), \XMM1
+	pxor	  (%arg1), \XMM2
+	pxor	  (%arg1), \XMM3
+	pxor	  (%arg1), \XMM4
+	movdqa	  HashKey_4_k(%rsp), \TMP5
+	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
+	movaps 0x10(%arg1), \TMP1
+	AESENC	  \TMP1, \XMM1              # Round 1
+	AESENC	  \TMP1, \XMM2
+	AESENC	  \TMP1, \XMM3
+	AESENC	  \TMP1, \XMM4
+	movaps 0x20(%arg1), \TMP1
+	AESENC	  \TMP1, \XMM1              # Round 2
+	AESENC	  \TMP1, \XMM2
+	AESENC	  \TMP1, \XMM3
+	AESENC	  \TMP1, \XMM4
+	movdqa	  \XMM6, \TMP1
+	pshufd	  $78, \XMM6, \TMP2
+	pxor	  \XMM6, \TMP2
+	movdqa	  HashKey_3(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
+	movaps 0x30(%arg1), \TMP3
+	AESENC    \TMP3, \XMM1              # Round 3
+	AESENC    \TMP3, \XMM2
+	AESENC    \TMP3, \XMM3
+	AESENC    \TMP3, \XMM4
+	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
+	movaps 0x40(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1              # Round 4
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	movdqa	  HashKey_3_k(%rsp), \TMP5
+	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+	movaps 0x50(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1              # Round 5
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	pxor	  \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+	pxor	  \XMM6, \XMM5
+	pxor	  \TMP2, \TMP6
+	movdqa	  \XMM7, \TMP1
+	pshufd	  $78, \XMM7, \TMP2
+	pxor	  \XMM7, \TMP2
+	movdqa	  HashKey_2(%rsp ), \TMP5
+
+        # Multiply TMP5 * HashKey using karatsuba
+
+	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
+	movaps 0x60(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1              # Round 6
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
+	movaps 0x70(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1             # Round 7
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	movdqa	  HashKey_2_k(%rsp), \TMP5
+	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+	movaps 0x80(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1             # Round 8
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	pxor	  \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+	pxor	  \XMM7, \XMM5
+	pxor	  \TMP2, \TMP6
+
+        # Multiply XMM8 * HashKey
+        # XMM8 and TMP5 hold the values for the two operands
+
+	movdqa	  \XMM8, \TMP1
+	pshufd	  $78, \XMM8, \TMP2
+	pxor	  \XMM8, \TMP2
+	movdqa	  HashKey(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
+	movaps 0x90(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1            # Round 9
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
+	movaps 0xa0(%arg1), \TMP3
+	AESENCLAST \TMP3, \XMM1           # Round 10
+	AESENCLAST \TMP3, \XMM2
+	AESENCLAST \TMP3, \XMM3
+	AESENCLAST \TMP3, \XMM4
+	movdqa    HashKey_k(%rsp), \TMP5
+	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
+	movdqu	  (%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
+	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
+	movdqa    \TMP3, \XMM1
+	movdqu	  16(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
+	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
+	movdqa    \TMP3, \XMM2
+	movdqu	  32(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
+	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
+	movdqa    \TMP3, \XMM3
+	movdqu	  48(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
+	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
+	movdqa    \TMP3, \XMM4
+	PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
+	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
+	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
+	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
+
+	pxor	  \TMP4, \TMP1
+	pxor	  \XMM8, \XMM5
+	pxor	  \TMP6, \TMP2
+	pxor	  \TMP1, \TMP2
+	pxor	  \XMM5, \TMP2
+	movdqa	  \TMP2, \TMP3
+	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
+	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
+	pxor	  \TMP3, \XMM5
+	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
+
+        # first phase of reduction
+
+	movdqa    \XMM5, \TMP2
+	movdqa    \XMM5, \TMP3
+	movdqa    \XMM5, \TMP4
+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+	pslld     $31, \TMP2                   # packed right shift << 31
+	pslld     $30, \TMP3                   # packed right shift << 30
+	pslld     $25, \TMP4                   # packed right shift << 25
+	pxor      \TMP3, \TMP2	               # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	movdqa    \TMP2, \TMP5
+	psrldq    $4, \TMP5                    # right shift T5 1 DW
+	pslldq    $12, \TMP2                   # left shift T2 3 DWs
+	pxor      \TMP2, \XMM5
+
+        # second phase of reduction
+
+	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+	movdqa    \XMM5,\TMP3
+	movdqa    \XMM5,\TMP4
+	psrld     $1, \TMP2                    # packed left shift >>1
+	psrld     $2, \TMP3                    # packed left shift >>2
+	psrld     $7, \TMP4                    # packed left shift >>7
+	pxor      \TMP3,\TMP2		       # xor the shifted versions
+	pxor      \TMP4,\TMP2
+	pxor      \TMP5, \TMP2
+	pxor      \TMP2, \XMM5
+	pxor      \TMP1, \XMM5                 # result is in TMP1
+
+	pxor	  \XMM5, \XMM1
+.endm
+
+/* GHASH the last 4 ciphertext blocks. */
+.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
+TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
+
+        # Multiply TMP6 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM1, \TMP6
+	pshufd	  $78, \XMM1, \TMP2
+	pxor	  \XMM1, \TMP2
+	movdqa	  HashKey_4(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
+	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
+	movdqa	  HashKey_4_k(%rsp), \TMP4
+	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	movdqa	  \XMM1, \XMMDst
+	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM2, \TMP1
+	pshufd	  $78, \XMM2, \TMP2
+	pxor	  \XMM2, \TMP2
+	movdqa	  HashKey_3(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
+	movdqa	  HashKey_3_k(%rsp), \TMP4
+	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	pxor	  \TMP1, \TMP6
+	pxor	  \XMM2, \XMMDst
+	pxor	  \TMP2, \XMM1
+# results accumulated in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM3, \TMP1
+	pshufd	  $78, \XMM3, \TMP2
+	pxor	  \XMM3, \TMP2
+	movdqa	  HashKey_2(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
+	movdqa	  HashKey_2_k(%rsp), \TMP4
+	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	pxor	  \TMP1, \TMP6
+	pxor	  \XMM3, \XMMDst
+	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+	movdqa	  \XMM4, \TMP1
+	pshufd	  $78, \XMM4, \TMP2
+	pxor	  \XMM4, \TMP2
+	movdqa	  HashKey(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
+	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
+	movdqa	  HashKey_k(%rsp), \TMP4
+	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	pxor	  \TMP1, \TMP6
+	pxor	  \XMM4, \XMMDst
+	pxor	  \XMM1, \TMP2
+	pxor	  \TMP6, \TMP2
+	pxor	  \XMMDst, \TMP2
+	# middle section of the temp results combined as in karatsuba algorithm
+	movdqa	  \TMP2, \TMP4
+	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
+	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
+	pxor	  \TMP4, \XMMDst
+	pxor	  \TMP2, \TMP6
+# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
+	# first phase of the reduction
+	movdqa    \XMMDst, \TMP2
+	movdqa    \XMMDst, \TMP3
+	movdqa    \XMMDst, \TMP4
+# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
+	pslld     $31, \TMP2                # packed right shifting << 31
+	pslld     $30, \TMP3                # packed right shifting << 30
+	pslld     $25, \TMP4                # packed right shifting << 25
+	pxor      \TMP3, \TMP2              # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	movdqa    \TMP2, \TMP7
+	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
+	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
+	pxor      \TMP2, \XMMDst
+
+        # second phase of the reduction
+	movdqa    \XMMDst, \TMP2
+	# make 3 copies of XMMDst for doing 3 shift operations
+	movdqa    \XMMDst, \TMP3
+	movdqa    \XMMDst, \TMP4
+	psrld     $1, \TMP2                 # packed left shift >> 1
+	psrld     $2, \TMP3                 # packed left shift >> 2
+	psrld     $7, \TMP4                 # packed left shift >> 7
+	pxor      \TMP3, \TMP2              # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	pxor      \TMP7, \TMP2
+	pxor      \TMP2, \XMMDst
+	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
+.endm
+
+/* Encryption of a single block done*/
+.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
+
+	pxor	(%arg1), \XMM0
+        movaps 16(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 32(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 48(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 64(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 80(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 96(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 112(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 128(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 144(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 160(%arg1), \TMP1
+	AESENCLAST	\TMP1, \XMM0
+.endm
+
+
+/*****************************************************************************
+* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
+*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
+*                   const u8 *in,      // Ciphertext input
+*                   u64 plaintext_len, // Length of data in bytes for decryption.
+*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
+*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
+*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
+*                   const u8 *aad,     // Additional Authentication Data (AAD)
+*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
+*                                      // given authentication tag and only return the plaintext if they match.
+*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
+*                                      // (most likely), 12 or 8.
+*
+* Assumptions:
+*
+* keys:
+*       keys are pre-expanded and aligned to 16 bytes. we are using the first
+*       set of 11 keys in the data structure void *aes_ctx
+*
+* iv:
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                             Salt  (From the SA)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     Initialization Vector                     |
+*       |         (This is the sequence number from IPSec header)       |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x1                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+*       AAD padded to 128 bits with 0
+*       for example, assume AAD is a u32 vector
+*
+*       if AAD is 8 bytes:
+*       AAD[3] = {A0, A1};
+*       padded AAD in xmm register = {A1 A0 0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A1)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     32-bit Sequence Number (A0)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 32-bit Sequence Number
+*
+*       if AAD is 12 bytes:
+*       AAD[3] = {A0, A1, A2};
+*       padded AAD in xmm register = {A2 A1 A0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A2)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                 64-bit Extended Sequence Number {A1,A0}       |
+*       |                                                               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                        AAD Format with 64-bit Extended Sequence Number
+*
+* aadLen:
+*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
+*       The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+*       For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+*
+*****************************************************************************/
+
+ENTRY(aesni_gcm_dec)
+	push	%r12
+	push	%r13
+	push	%r14
+	mov	%rsp, %r14
+/*
+* states of %xmm registers %xmm6:%xmm15 not saved
+* all %xmm registers are clobbered
+*/
+	sub	$VARIABLE_OFFSET, %rsp
+	and	$~63, %rsp                        # align rsp to 64 bytes
+	mov	%arg6, %r12
+	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
+        movdqa  SHUF_MASK(%rip), %xmm2
+	PSHUFB_XMM %xmm2, %xmm13
+
+
+# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
+
+	movdqa	%xmm13, %xmm2
+	psllq	$1, %xmm13
+	psrlq	$63, %xmm2
+	movdqa	%xmm2, %xmm1
+	pslldq	$8, %xmm2
+	psrldq	$8, %xmm1
+	por	%xmm2, %xmm13
+
+        # Reduction
+
+	pshufd	$0x24, %xmm1, %xmm2
+	pcmpeqd TWOONE(%rip), %xmm2
+	pand	POLY(%rip), %xmm2
+	pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)
+
+
+        # Decrypt first few blocks
+
+	movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
+	mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
+	and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
+	mov %r13, %r12
+	and $(3<<4), %r12
+	jz _initial_num_blocks_is_0_decrypt
+	cmp $(2<<4), %r12
+	jb _initial_num_blocks_is_1_decrypt
+	je _initial_num_blocks_is_2_decrypt
+_initial_num_blocks_is_3_decrypt:
+	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
+	sub	$48, %r13
+	jmp	_initial_blocks_decrypted
+_initial_num_blocks_is_2_decrypt:
+	INITIAL_BLOCKS_DEC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
+	sub	$32, %r13
+	jmp	_initial_blocks_decrypted
+_initial_num_blocks_is_1_decrypt:
+	INITIAL_BLOCKS_DEC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
+	sub	$16, %r13
+	jmp	_initial_blocks_decrypted
+_initial_num_blocks_is_0_decrypt:
+	INITIAL_BLOCKS_DEC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
+_initial_blocks_decrypted:
+	cmp	$0, %r13
+	je	_zero_cipher_left_decrypt
+	sub	$64, %r13
+	je	_four_cipher_left_decrypt
+_decrypt_by_4:
+	GHASH_4_ENCRYPT_4_PARALLEL_DEC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
+%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
+	add	$64, %r11
+	sub	$64, %r13
+	jne	_decrypt_by_4
+_four_cipher_left_decrypt:
+	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_decrypt:
+	mov	%arg4, %r13
+	and	$15, %r13				# %r13 = arg4 (mod 16)
+	je	_multiple_of_16_bytes_decrypt
+
+        # Handle the last <16 byte block seperately
+
+	paddd ONE(%rip), %xmm0         # increment CNT to get Yn
+        movdqa SHUF_MASK(%rip), %xmm10
+	PSHUFB_XMM %xmm10, %xmm0
+
+	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
+	sub $16, %r11
+	add %r13, %r11
+	movdqu (%arg3,%r11,1), %xmm1   # recieve the last <16 byte block
+	lea SHIFT_MASK+16(%rip), %r12
+	sub %r13, %r12
+# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
+# (%r13 is the number of bytes in plaintext mod 16)
+	movdqu (%r12), %xmm2           # get the appropriate shuffle mask
+	PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 butes
+
+	movdqa  %xmm1, %xmm2
+	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
+	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
+	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
+	pand    %xmm1, %xmm2
+        movdqa SHUF_MASK(%rip), %xmm10
+	PSHUFB_XMM %xmm10 ,%xmm2
+
+	pxor %xmm2, %xmm8
+	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+	          # GHASH computation for the last <16 byte block
+	sub %r13, %r11
+	add $16, %r11
+
+        # output %r13 bytes
+	MOVQ_R64_XMM	%xmm0, %rax
+	cmp	$8, %r13
+	jle	_less_than_8_bytes_left_decrypt
+	mov	%rax, (%arg2 , %r11, 1)
+	add	$8, %r11
+	psrldq	$8, %xmm0
+	MOVQ_R64_XMM	%xmm0, %rax
+	sub	$8, %r13
+_less_than_8_bytes_left_decrypt:
+	mov	%al,  (%arg2, %r11, 1)
+	add	$1, %r11
+	shr	$8, %rax
+	sub	$1, %r13
+	jne	_less_than_8_bytes_left_decrypt
+_multiple_of_16_bytes_decrypt:
+	mov	arg8, %r12		  # %r13 = aadLen (number of bytes)
+	shl	$3, %r12		  # convert into number of bits
+	movd	%r12d, %xmm15		  # len(A) in %xmm15
+	shl	$3, %arg4		  # len(C) in bits (*128)
+	MOVQ_R64_XMM	%arg4, %xmm1
+	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
+	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
+	pxor	%xmm15, %xmm8
+	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+	         # final GHASH computation
+        movdqa SHUF_MASK(%rip), %xmm10
+	PSHUFB_XMM %xmm10, %xmm8
+
+	mov	%arg5, %rax		  # %rax = *Y0
+	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
+	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
+	pxor	%xmm8, %xmm0
+_return_T_decrypt:
+	mov	arg9, %r10                # %r10 = authTag
+	mov	arg10, %r11               # %r11 = auth_tag_len
+	cmp	$16, %r11
+	je	_T_16_decrypt
+	cmp	$12, %r11
+	je	_T_12_decrypt
+_T_8_decrypt:
+	MOVQ_R64_XMM	%xmm0, %rax
+	mov	%rax, (%r10)
+	jmp	_return_T_done_decrypt
+_T_12_decrypt:
+	MOVQ_R64_XMM	%xmm0, %rax
+	mov	%rax, (%r10)
+	psrldq	$8, %xmm0
+	movd	%xmm0, %eax
+	mov	%eax, 8(%r10)
+	jmp	_return_T_done_decrypt
+_T_16_decrypt:
+	movdqu	%xmm0, (%r10)
+_return_T_done_decrypt:
+	mov	%r14, %rsp
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	ret
+
+
+/*****************************************************************************
+* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
+*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
+*                    const u8 *in,       // Plaintext input
+*                    u64 plaintext_len,  // Length of data in bytes for encryption.
+*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
+*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
+*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
+*                    const u8 *aad,      // Additional Authentication Data (AAD)
+*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+*                    u8 *auth_tag,       // Authenticated Tag output.
+*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
+*                                        // 12 or 8.
+*
+* Assumptions:
+*
+* keys:
+*       keys are pre-expanded and aligned to 16 bytes. we are using the
+*       first set of 11 keys in the data structure void *aes_ctx
+*
+*
+* iv:
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                             Salt  (From the SA)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     Initialization Vector                     |
+*       |         (This is the sequence number from IPSec header)       |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x1                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+*       AAD padded to 128 bits with 0
+*       for example, assume AAD is a u32 vector
+*
+*       if AAD is 8 bytes:
+*       AAD[3] = {A0, A1};
+*       padded AAD in xmm register = {A1 A0 0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A1)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     32-bit Sequence Number (A0)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                 AAD Format with 32-bit Sequence Number
+*
+*       if AAD is 12 bytes:
+*       AAD[3] = {A0, A1, A2};
+*       padded AAD in xmm register = {A2 A1 A0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A2)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                 64-bit Extended Sequence Number {A1,A0}       |
+*       |                                                               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                         AAD Format with 64-bit Extended Sequence Number
+*
+* aadLen:
+*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
+*       The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+*       For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+***************************************************************************/
+ENTRY(aesni_gcm_enc)
+	push	%r12
+	push	%r13
+	push	%r14
+	mov	%rsp, %r14
+#
+# states of %xmm registers %xmm6:%xmm15 not saved
+# all %xmm registers are clobbered
+#
+	sub	$VARIABLE_OFFSET, %rsp
+	and	$~63, %rsp
+	mov	%arg6, %r12
+	movdqu	(%r12), %xmm13
+        movdqa  SHUF_MASK(%rip), %xmm2
+	PSHUFB_XMM %xmm2, %xmm13
+
+
+# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
+
+	movdqa	%xmm13, %xmm2
+	psllq	$1, %xmm13
+	psrlq	$63, %xmm2
+	movdqa	%xmm2, %xmm1
+	pslldq	$8, %xmm2
+	psrldq	$8, %xmm1
+	por	%xmm2, %xmm13
+
+        # reduce HashKey<<1
+
+	pshufd	$0x24, %xmm1, %xmm2
+	pcmpeqd TWOONE(%rip), %xmm2
+	pand	POLY(%rip), %xmm2
+	pxor	%xmm2, %xmm13
+	movdqa	%xmm13, HashKey(%rsp)
+	mov	%arg4, %r13            # %xmm13 holds HashKey<<1 (mod poly)
+	and	$-16, %r13
+	mov	%r13, %r12
+
+        # Encrypt first few blocks
+
+	and	$(3<<4), %r12
+	jz	_initial_num_blocks_is_0_encrypt
+	cmp	$(2<<4), %r12
+	jb	_initial_num_blocks_is_1_encrypt
+	je	_initial_num_blocks_is_2_encrypt
+_initial_num_blocks_is_3_encrypt:
+	INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
+	sub	$48, %r13
+	jmp	_initial_blocks_encrypted
+_initial_num_blocks_is_2_encrypt:
+	INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
+	sub	$32, %r13
+	jmp	_initial_blocks_encrypted
+_initial_num_blocks_is_1_encrypt:
+	INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
+	sub	$16, %r13
+	jmp	_initial_blocks_encrypted
+_initial_num_blocks_is_0_encrypt:
+	INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
+_initial_blocks_encrypted:
+
+        # Main loop - Encrypt remaining blocks
+
+	cmp	$0, %r13
+	je	_zero_cipher_left_encrypt
+	sub	$64, %r13
+	je	_four_cipher_left_encrypt
+_encrypt_by_4_encrypt:
+	GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
+%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
+	add	$64, %r11
+	sub	$64, %r13
+	jne	_encrypt_by_4_encrypt
+_four_cipher_left_encrypt:
+	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_encrypt:
+	mov	%arg4, %r13
+	and	$15, %r13			# %r13 = arg4 (mod 16)
+	je	_multiple_of_16_bytes_encrypt
+
+         # Handle the last <16 Byte block seperately
+	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
+        movdqa SHUF_MASK(%rip), %xmm10
+	PSHUFB_XMM %xmm10, %xmm0
+
+	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
+	sub $16, %r11
+	add %r13, %r11
+	movdqu (%arg3,%r11,1), %xmm1     # receive the last <16 byte blocks
+	lea SHIFT_MASK+16(%rip), %r12
+	sub %r13, %r12
+	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+	# (%r13 is the number of bytes in plaintext mod 16)
+	movdqu	(%r12), %xmm2           # get the appropriate shuffle mask
+	PSHUFB_XMM	%xmm2, %xmm1            # shift right 16-r13 byte
+	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
+	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
+	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
+	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
+        movdqa SHUF_MASK(%rip), %xmm10
+	PSHUFB_XMM %xmm10,%xmm0
+
+	pxor	%xmm0, %xmm8
+	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+	# GHASH computation for the last <16 byte block
+	sub	%r13, %r11
+	add	$16, %r11
+	PSHUFB_XMM %xmm10, %xmm1
+
+	# shuffle xmm0 back to output as ciphertext
+
+        # Output %r13 bytes
+	MOVQ_R64_XMM %xmm0, %rax
+	cmp $8, %r13
+	jle _less_than_8_bytes_left_encrypt
+	mov %rax, (%arg2 , %r11, 1)
+	add $8, %r11
+	psrldq $8, %xmm0
+	MOVQ_R64_XMM %xmm0, %rax
+	sub $8, %r13
+_less_than_8_bytes_left_encrypt:
+	mov %al,  (%arg2, %r11, 1)
+	add $1, %r11
+	shr $8, %rax
+	sub $1, %r13
+	jne _less_than_8_bytes_left_encrypt
+_multiple_of_16_bytes_encrypt:
+	mov	arg8, %r12    # %r12 = addLen (number of bytes)
+	shl	$3, %r12
+	movd	%r12d, %xmm15       # len(A) in %xmm15
+	shl	$3, %arg4               # len(C) in bits (*128)
+	MOVQ_R64_XMM	%arg4, %xmm1
+	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
+	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
+	pxor	%xmm15, %xmm8
+	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+	# final GHASH computation
+        movdqa SHUF_MASK(%rip), %xmm10
+	PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap
+
+	mov	%arg5, %rax		       # %rax  = *Y0
+	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
+	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
+	pxor	%xmm8, %xmm0
+_return_T_encrypt:
+	mov	arg9, %r10                     # %r10 = authTag
+	mov	arg10, %r11                    # %r11 = auth_tag_len
+	cmp	$16, %r11
+	je	_T_16_encrypt
+	cmp	$12, %r11
+	je	_T_12_encrypt
+_T_8_encrypt:
+	MOVQ_R64_XMM	%xmm0, %rax
+	mov	%rax, (%r10)
+	jmp	_return_T_done_encrypt
+_T_12_encrypt:
+	MOVQ_R64_XMM	%xmm0, %rax
+	mov	%rax, (%r10)
+	psrldq	$8, %xmm0
+	movd	%xmm0, %eax
+	mov	%eax, 8(%r10)
+	jmp	_return_T_done_encrypt
+_T_16_encrypt:
+	movdqu	%xmm0, (%r10)
+_return_T_done_encrypt:
+	mov	%r14, %rsp
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	ret
+
+#endif
+
 
 _key_expansion_128:
 _key_expansion_256a:
@@ -55,10 +1709,11 @@
 	shufps $0b10001100, %xmm0, %xmm4
 	pxor %xmm4, %xmm0
 	pxor %xmm1, %xmm0
-	movaps %xmm0, (%rcx)
-	add $0x10, %rcx
+	movaps %xmm0, (TKEYP)
+	add $0x10, TKEYP
 	ret
 
+.align 4
 _key_expansion_192a:
 	pshufd $0b01010101, %xmm1, %xmm1
 	shufps $0b00010000, %xmm0, %xmm4
@@ -76,12 +1731,13 @@
 
 	movaps %xmm0, %xmm1
 	shufps $0b01000100, %xmm0, %xmm6
-	movaps %xmm6, (%rcx)
+	movaps %xmm6, (TKEYP)
 	shufps $0b01001110, %xmm2, %xmm1
-	movaps %xmm1, 16(%rcx)
-	add $0x20, %rcx
+	movaps %xmm1, 0x10(TKEYP)
+	add $0x20, TKEYP
 	ret
 
+.align 4
 _key_expansion_192b:
 	pshufd $0b01010101, %xmm1, %xmm1
 	shufps $0b00010000, %xmm0, %xmm4
@@ -96,10 +1752,11 @@
 	pxor %xmm3, %xmm2
 	pxor %xmm5, %xmm2
 
-	movaps %xmm0, (%rcx)
-	add $0x10, %rcx
+	movaps %xmm0, (TKEYP)
+	add $0x10, TKEYP
 	ret
 
+.align 4
 _key_expansion_256b:
 	pshufd $0b10101010, %xmm1, %xmm1
 	shufps $0b00010000, %xmm2, %xmm4
@@ -107,8 +1764,8 @@
 	shufps $0b10001100, %xmm2, %xmm4
 	pxor %xmm4, %xmm2
 	pxor %xmm1, %xmm2
-	movaps %xmm2, (%rcx)
-	add $0x10, %rcx
+	movaps %xmm2, (TKEYP)
+	add $0x10, TKEYP
 	ret
 
 /*
@@ -116,17 +1773,23 @@
  *                   unsigned int key_len)
  */
 ENTRY(aesni_set_key)
-	movups (%rsi), %xmm0		# user key (first 16 bytes)
-	movaps %xmm0, (%rdi)
-	lea 0x10(%rdi), %rcx		# key addr
-	movl %edx, 480(%rdi)
+#ifndef __x86_64__
+	pushl KEYP
+	movl 8(%esp), KEYP		# ctx
+	movl 12(%esp), UKEYP		# in_key
+	movl 16(%esp), %edx		# key_len
+#endif
+	movups (UKEYP), %xmm0		# user key (first 16 bytes)
+	movaps %xmm0, (KEYP)
+	lea 0x10(KEYP), TKEYP		# key addr
+	movl %edx, 480(KEYP)
 	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
 	cmp $24, %dl
 	jb .Lenc_key128
 	je .Lenc_key192
-	movups 0x10(%rsi), %xmm2	# other user key
-	movaps %xmm2, (%rcx)
-	add $0x10, %rcx
+	movups 0x10(UKEYP), %xmm2	# other user key
+	movaps %xmm2, (TKEYP)
+	add $0x10, TKEYP
 	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
 	call _key_expansion_256a
 	AESKEYGENASSIST 0x1 %xmm0 %xmm1
@@ -155,7 +1818,7 @@
 	call _key_expansion_256a
 	jmp .Ldec_key
 .Lenc_key192:
-	movq 0x10(%rsi), %xmm2		# other user key
+	movq 0x10(UKEYP), %xmm2		# other user key
 	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
 	call _key_expansion_192a
 	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
@@ -195,33 +1858,47 @@
 	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
 	call _key_expansion_128
 .Ldec_key:
-	sub $0x10, %rcx
-	movaps (%rdi), %xmm0
-	movaps (%rcx), %xmm1
-	movaps %xmm0, 240(%rcx)
-	movaps %xmm1, 240(%rdi)
-	add $0x10, %rdi
-	lea 240-16(%rcx), %rsi
+	sub $0x10, TKEYP
+	movaps (KEYP), %xmm0
+	movaps (TKEYP), %xmm1
+	movaps %xmm0, 240(TKEYP)
+	movaps %xmm1, 240(KEYP)
+	add $0x10, KEYP
+	lea 240-16(TKEYP), UKEYP
 .align 4
 .Ldec_key_loop:
-	movaps (%rdi), %xmm0
+	movaps (KEYP), %xmm0
 	AESIMC %xmm0 %xmm1
-	movaps %xmm1, (%rsi)
-	add $0x10, %rdi
-	sub $0x10, %rsi
-	cmp %rcx, %rdi
+	movaps %xmm1, (UKEYP)
+	add $0x10, KEYP
+	sub $0x10, UKEYP
+	cmp TKEYP, KEYP
 	jb .Ldec_key_loop
-	xor %rax, %rax
+	xor AREG, AREG
+#ifndef __x86_64__
+	popl KEYP
+#endif
 	ret
 
 /*
  * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  */
 ENTRY(aesni_enc)
+#ifndef __x86_64__
+	pushl KEYP
+	pushl KLEN
+	movl 12(%esp), KEYP
+	movl 16(%esp), OUTP
+	movl 20(%esp), INP
+#endif
 	movl 480(KEYP), KLEN		# key length
 	movups (INP), STATE		# input
 	call _aesni_enc1
 	movups STATE, (OUTP)		# output
+#ifndef __x86_64__
+	popl KLEN
+	popl KEYP
+#endif
 	ret
 
 /*
@@ -236,6 +1913,7 @@
  *	KEY
  *	TKEYP (T1)
  */
+.align 4
 _aesni_enc1:
 	movaps (KEYP), KEY		# key
 	mov KEYP, TKEYP
@@ -298,6 +1976,7 @@
  *	KEY
  *	TKEYP (T1)
  */
+.align 4
 _aesni_enc4:
 	movaps (KEYP), KEY		# key
 	mov KEYP, TKEYP
@@ -391,11 +2070,22 @@
  * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  */
 ENTRY(aesni_dec)
+#ifndef __x86_64__
+	pushl KEYP
+	pushl KLEN
+	movl 12(%esp), KEYP
+	movl 16(%esp), OUTP
+	movl 20(%esp), INP
+#endif
 	mov 480(KEYP), KLEN		# key length
 	add $240, KEYP
 	movups (INP), STATE		# input
 	call _aesni_dec1
 	movups STATE, (OUTP)		#output
+#ifndef __x86_64__
+	popl KLEN
+	popl KEYP
+#endif
 	ret
 
 /*
@@ -410,6 +2100,7 @@
  *	KEY
  *	TKEYP (T1)
  */
+.align 4
 _aesni_dec1:
 	movaps (KEYP), KEY		# key
 	mov KEYP, TKEYP
@@ -472,6 +2163,7 @@
  *	KEY
  *	TKEYP (T1)
  */
+.align 4
 _aesni_dec4:
 	movaps (KEYP), KEY		# key
 	mov KEYP, TKEYP
@@ -566,6 +2258,15 @@
  *		      size_t len)
  */
 ENTRY(aesni_ecb_enc)
+#ifndef __x86_64__
+	pushl LEN
+	pushl KEYP
+	pushl KLEN
+	movl 16(%esp), KEYP
+	movl 20(%esp), OUTP
+	movl 24(%esp), INP
+	movl 28(%esp), LEN
+#endif
 	test LEN, LEN		# check length
 	jz .Lecb_enc_ret
 	mov 480(KEYP), KLEN
@@ -602,6 +2303,11 @@
 	cmp $16, LEN
 	jge .Lecb_enc_loop1
 .Lecb_enc_ret:
+#ifndef __x86_64__
+	popl KLEN
+	popl KEYP
+	popl LEN
+#endif
 	ret
 
 /*
@@ -609,6 +2315,15 @@
  *		      size_t len);
  */
 ENTRY(aesni_ecb_dec)
+#ifndef __x86_64__
+	pushl LEN
+	pushl KEYP
+	pushl KLEN
+	movl 16(%esp), KEYP
+	movl 20(%esp), OUTP
+	movl 24(%esp), INP
+	movl 28(%esp), LEN
+#endif
 	test LEN, LEN
 	jz .Lecb_dec_ret
 	mov 480(KEYP), KLEN
@@ -646,6 +2361,11 @@
 	cmp $16, LEN
 	jge .Lecb_dec_loop1
 .Lecb_dec_ret:
+#ifndef __x86_64__
+	popl KLEN
+	popl KEYP
+	popl LEN
+#endif
 	ret
 
 /*
@@ -653,6 +2373,17 @@
  *		      size_t len, u8 *iv)
  */
 ENTRY(aesni_cbc_enc)
+#ifndef __x86_64__
+	pushl IVP
+	pushl LEN
+	pushl KEYP
+	pushl KLEN
+	movl 20(%esp), KEYP
+	movl 24(%esp), OUTP
+	movl 28(%esp), INP
+	movl 32(%esp), LEN
+	movl 36(%esp), IVP
+#endif
 	cmp $16, LEN
 	jb .Lcbc_enc_ret
 	mov 480(KEYP), KLEN
@@ -670,6 +2401,12 @@
 	jge .Lcbc_enc_loop
 	movups STATE, (IVP)
 .Lcbc_enc_ret:
+#ifndef __x86_64__
+	popl KLEN
+	popl KEYP
+	popl LEN
+	popl IVP
+#endif
 	ret
 
 /*
@@ -677,6 +2414,17 @@
  *		      size_t len, u8 *iv)
  */
 ENTRY(aesni_cbc_dec)
+#ifndef __x86_64__
+	pushl IVP
+	pushl LEN
+	pushl KEYP
+	pushl KLEN
+	movl 20(%esp), KEYP
+	movl 24(%esp), OUTP
+	movl 28(%esp), INP
+	movl 32(%esp), LEN
+	movl 36(%esp), IVP
+#endif
 	cmp $16, LEN
 	jb .Lcbc_dec_just_ret
 	mov 480(KEYP), KLEN
@@ -690,16 +2438,30 @@
 	movaps IN1, STATE1
 	movups 0x10(INP), IN2
 	movaps IN2, STATE2
+#ifdef __x86_64__
 	movups 0x20(INP), IN3
 	movaps IN3, STATE3
 	movups 0x30(INP), IN4
 	movaps IN4, STATE4
+#else
+	movups 0x20(INP), IN1
+	movaps IN1, STATE3
+	movups 0x30(INP), IN2
+	movaps IN2, STATE4
+#endif
 	call _aesni_dec4
 	pxor IV, STATE1
+#ifdef __x86_64__
 	pxor IN1, STATE2
 	pxor IN2, STATE3
 	pxor IN3, STATE4
 	movaps IN4, IV
+#else
+	pxor (INP), STATE2
+	pxor 0x10(INP), STATE3
+	pxor IN1, STATE4
+	movaps IN2, IV
+#endif
 	movups STATE1, (OUTP)
 	movups STATE2, 0x10(OUTP)
 	movups STATE3, 0x20(OUTP)
@@ -727,8 +2489,15 @@
 .Lcbc_dec_ret:
 	movups IV, (IVP)
 .Lcbc_dec_just_ret:
+#ifndef __x86_64__
+	popl KLEN
+	popl KEYP
+	popl LEN
+	popl IVP
+#endif
 	ret
 
+#ifdef __x86_64__
 .align 16
 .Lbswap_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
@@ -744,6 +2513,7 @@
  *	INC:	== 1, in little endian
  *	BSWAP_MASK == endian swapping mask
  */
+.align 4
 _aesni_inc_init:
 	movaps .Lbswap_mask, BSWAP_MASK
 	movaps IV, CTR
@@ -768,6 +2538,7 @@
  *	CTR:	== output IV, in little endian
  *	TCTR_LOW: == lower qword of CTR
  */
+.align 4
 _aesni_inc:
 	paddq INC, CTR
 	add $1, TCTR_LOW
@@ -839,3 +2610,4 @@
 	movups IV, (IVP)
 .Lctr_enc_just_ret:
 	ret
+#endif
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc..e1e60c7 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -5,6 +5,14 @@
  * Copyright (C) 2008, Intel Corp.
  *    Author: Huang Ying <ying.huang@intel.com>
  *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Adrian Hoban <adrian.hoban@intel.com>
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +29,10 @@
 #include <crypto/ctr.h>
 #include <asm/i387.h>
 #include <asm/aes.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/aead.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
 
 #if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
 #define HAS_CTR
@@ -42,8 +54,31 @@
 	struct cryptd_ablkcipher *cryptd_tfm;
 };
 
-#define AESNI_ALIGN	16
+/* This data is stored at the end of the crypto_tfm struct.
+ * It's a type of per "session" data storage location.
+ * This needs to be 16 byte aligned.
+ */
+struct aesni_rfc4106_gcm_ctx {
+	u8 hash_subkey[16];
+	struct crypto_aes_ctx aes_key_expanded;
+	u8 nonce[4];
+	struct cryptd_aead *cryptd_tfm;
+};
+
+struct aesni_gcm_set_hash_subkey_result {
+	int err;
+	struct completion completion;
+};
+
+struct aesni_hash_subkey_req_data {
+	u8 iv[16];
+	struct aesni_gcm_set_hash_subkey_result result;
+	struct scatterlist sg;
+};
+
+#define AESNI_ALIGN	(16)
 #define AES_BLOCK_MASK	(~(AES_BLOCK_SIZE-1))
+#define RFC4106_HASH_SUBKEY_SIZE 16
 
 asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 			     unsigned int key_len);
@@ -59,9 +94,62 @@
 			      const u8 *in, unsigned int len, u8 *iv);
 asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
+#ifdef CONFIG_X86_64
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
+/* asmlinkage void aesni_gcm_enc()
+ * void *ctx,  AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Ciphertext output. Encrypt in-place is allowed.
+ * const u8 *in, Plaintext input
+ * unsigned long plaintext_len, Length of data in bytes for encryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ *         concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ *         Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this
+ *          is going to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
+ *          Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
+			const u8 *in, unsigned long plaintext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+/* asmlinkage void aesni_gcm_dec()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Plaintext output. Decrypt in-place is allowed.
+ * const u8 *in, Ciphertext input
+ * unsigned long ciphertext_len, Length of data in bytes for decryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ *         concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ *         Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
+ * to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
+ * Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
+			const u8 *in, unsigned long ciphertext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+static inline struct
+aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
+{
+	return
+		(struct aesni_rfc4106_gcm_ctx *)
+		PTR_ALIGN((u8 *)
+		crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN);
+}
+#endif
+
 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
 {
 	unsigned long addr = (unsigned long)raw_ctx;
@@ -324,6 +412,7 @@
 	},
 };
 
+#ifdef CONFIG_X86_64
 static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
 			    struct blkcipher_walk *walk)
 {
@@ -389,6 +478,7 @@
 		},
 	},
 };
+#endif
 
 static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int key_len)
@@ -536,6 +626,7 @@
 	},
 };
 
+#ifdef CONFIG_X86_64
 static int ablk_ctr_init(struct crypto_tfm *tfm)
 {
 	struct cryptd_ablkcipher *cryptd_tfm;
@@ -612,6 +703,7 @@
 	},
 };
 #endif
+#endif
 
 #ifdef HAS_LRW
 static int ablk_lrw_init(struct crypto_tfm *tfm)
@@ -730,6 +822,424 @@
 };
 #endif
 
+#ifdef CONFIG_X86_64
+static int rfc4106_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_aead *cryptd_tfm;
+	struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
+		PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+	cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ctx->cryptd_tfm = cryptd_tfm;
+	tfm->crt_aead.reqsize = sizeof(struct aead_request)
+		+ crypto_aead_reqsize(&cryptd_tfm->base);
+	return 0;
+}
+
+static void rfc4106_exit(struct crypto_tfm *tfm)
+{
+	struct aesni_rfc4106_gcm_ctx *ctx =
+		(struct aesni_rfc4106_gcm_ctx *)
+		PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+	if (!IS_ERR(ctx->cryptd_tfm))
+		cryptd_free_aead(ctx->cryptd_tfm);
+	return;
+}
+
+static void
+rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
+{
+	struct aesni_gcm_set_hash_subkey_result *result = req->data;
+
+	if (err == -EINPROGRESS)
+		return;
+	result->err = err;
+	complete(&result->completion);
+}
+
+static int
+rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
+{
+	struct crypto_ablkcipher *ctr_tfm;
+	struct ablkcipher_request *req;
+	int ret = -EINVAL;
+	struct aesni_hash_subkey_req_data *req_data;
+
+	ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
+	if (IS_ERR(ctr_tfm))
+		return PTR_ERR(ctr_tfm);
+
+	crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
+
+	ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
+	if (ret) {
+		crypto_free_ablkcipher(ctr_tfm);
+		return ret;
+	}
+
+	req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
+	if (!req) {
+		crypto_free_ablkcipher(ctr_tfm);
+		return -EINVAL;
+	}
+
+	req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
+	if (!req_data) {
+		crypto_free_ablkcipher(ctr_tfm);
+		return -ENOMEM;
+	}
+	memset(req_data->iv, 0, sizeof(req_data->iv));
+
+	/* Clear the data in the hash sub key container to zero.*/
+	/* We want to cipher all zeros to create the hash sub key. */
+	memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
+
+	init_completion(&req_data->result.completion);
+	sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
+	ablkcipher_request_set_tfm(req, ctr_tfm);
+	ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+					CRYPTO_TFM_REQ_MAY_BACKLOG,
+					rfc4106_set_hash_subkey_done,
+					&req_data->result);
+
+	ablkcipher_request_set_crypt(req, &req_data->sg,
+		&req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
+
+	ret = crypto_ablkcipher_encrypt(req);
+	if (ret == -EINPROGRESS || ret == -EBUSY) {
+		ret = wait_for_completion_interruptible
+			(&req_data->result.completion);
+		if (!ret)
+			ret = req_data->result.err;
+	}
+	ablkcipher_request_free(req);
+	kfree(req_data);
+	crypto_free_ablkcipher(ctr_tfm);
+	return ret;
+}
+
+static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
+						   unsigned int key_len)
+{
+	int ret = 0;
+	struct crypto_tfm *tfm = crypto_aead_tfm(parent);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+	u8 *new_key_mem = NULL;
+
+	if (key_len < 4) {
+		crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	/*Account for 4 byte nonce at the end.*/
+	key_len -= 4;
+	if (key_len != AES_KEYSIZE_128) {
+		crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
+	/*This must be on a 16 byte boundary!*/
+	if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN)
+		return -EINVAL;
+
+	if ((unsigned long)key % AESNI_ALIGN) {
+		/*key is not aligned: use an auxuliar aligned pointer*/
+		new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL);
+		if (!new_key_mem)
+			return -ENOMEM;
+
+		new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
+		memcpy(new_key_mem, key, key_len);
+		key = new_key_mem;
+	}
+
+	if (!irq_fpu_usable())
+		ret = crypto_aes_expand_key(&(ctx->aes_key_expanded),
+		key, key_len);
+	else {
+		kernel_fpu_begin();
+		ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len);
+		kernel_fpu_end();
+	}
+	/*This must be on a 16 byte boundary!*/
+	if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) {
+		ret = -EINVAL;
+		goto exit;
+	}
+	ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+exit:
+	kfree(new_key_mem);
+	return ret;
+}
+
+/* This is the Integrity Check Value (aka the authentication tag length and can
+ * be 8, 12 or 16 bytes long. */
+static int rfc4106_set_authsize(struct crypto_aead *parent,
+				unsigned int authsize)
+{
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+	switch (authsize) {
+	case 8:
+	case 12:
+	case 16:
+		break;
+	default:
+		return -EINVAL;
+	}
+	crypto_aead_crt(parent)->authsize = authsize;
+	crypto_aead_crt(cryptd_child)->authsize = authsize;
+	return 0;
+}
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+	int ret;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+	if (!irq_fpu_usable()) {
+		struct aead_request *cryptd_req =
+			(struct aead_request *) aead_request_ctx(req);
+		memcpy(cryptd_req, req, sizeof(*req));
+		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+		return crypto_aead_encrypt(cryptd_req);
+	} else {
+		kernel_fpu_begin();
+		ret = cryptd_child->base.crt_aead.encrypt(req);
+		kernel_fpu_end();
+		return ret;
+	}
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+	int ret;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+	if (!irq_fpu_usable()) {
+		struct aead_request *cryptd_req =
+			(struct aead_request *) aead_request_ctx(req);
+		memcpy(cryptd_req, req, sizeof(*req));
+		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+		return crypto_aead_decrypt(cryptd_req);
+	} else {
+		kernel_fpu_begin();
+		ret = cryptd_child->base.crt_aead.decrypt(req);
+		kernel_fpu_end();
+		return ret;
+	}
+}
+
+static struct crypto_alg rfc4106_alg = {
+	.cra_name = "rfc4106(gcm(aes))",
+	.cra_driver_name = "rfc4106-gcm-aesni",
+	.cra_priority = 400,
+	.cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = 1,
+	.cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+	.cra_alignmask = 0,
+	.cra_type = &crypto_nivaead_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
+	.cra_init = rfc4106_init,
+	.cra_exit = rfc4106_exit,
+	.cra_u = {
+		.aead = {
+			.setkey = rfc4106_set_key,
+			.setauthsize = rfc4106_set_authsize,
+			.encrypt = rfc4106_encrypt,
+			.decrypt = rfc4106_decrypt,
+			.geniv = "seqiv",
+			.ivsize = 8,
+			.maxauthsize = 16,
+		},
+	},
+};
+
+static int __driver_rfc4106_encrypt(struct aead_request *req)
+{
+	u8 one_entry_in_sg = 0;
+	u8 *src, *dst, *assoc;
+	__be32 counter = cpu_to_be32(1);
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	void *aes_ctx = &(ctx->aes_key_expanded);
+	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+	u8 iv_tab[16+AESNI_ALIGN];
+	u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN);
+	struct scatter_walk src_sg_walk;
+	struct scatter_walk assoc_sg_walk;
+	struct scatter_walk dst_sg_walk;
+	unsigned int i;
+
+	/* Assuming we are supporting rfc4106 64-bit extended */
+	/* sequence numbers We need to have the AAD length equal */
+	/* to 8 or 12 bytes */
+	if (unlikely(req->assoclen != 8 && req->assoclen != 12))
+		return -EINVAL;
+	/* IV below built */
+	for (i = 0; i < 4; i++)
+		*(iv+i) = ctx->nonce[i];
+	for (i = 0; i < 8; i++)
+		*(iv+4+i) = req->iv[i];
+	*((__be32 *)(iv+12)) = counter;
+
+	if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+		one_entry_in_sg = 1;
+		scatterwalk_start(&src_sg_walk, req->src);
+		scatterwalk_start(&assoc_sg_walk, req->assoc);
+		src = scatterwalk_map(&src_sg_walk, 0);
+		assoc = scatterwalk_map(&assoc_sg_walk, 0);
+		dst = src;
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_start(&dst_sg_walk, req->dst);
+			dst = scatterwalk_map(&dst_sg_walk, 0);
+		}
+
+	} else {
+		/* Allocate memory for src, dst, assoc */
+		src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
+			GFP_ATOMIC);
+		if (unlikely(!src))
+			return -ENOMEM;
+		assoc = (src + req->cryptlen + auth_tag_len);
+		scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
+		scatterwalk_map_and_copy(assoc, req->assoc, 0,
+					req->assoclen, 0);
+		dst = src;
+	}
+
+	aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
+		ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
+		+ ((unsigned long)req->cryptlen), auth_tag_len);
+
+	/* The authTag (aka the Integrity Check Value) needs to be written
+	 * back to the packet. */
+	if (one_entry_in_sg) {
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_unmap(dst, 0);
+			scatterwalk_done(&dst_sg_walk, 0, 0);
+		}
+		scatterwalk_unmap(src, 0);
+		scatterwalk_unmap(assoc, 0);
+		scatterwalk_done(&src_sg_walk, 0, 0);
+		scatterwalk_done(&assoc_sg_walk, 0, 0);
+	} else {
+		scatterwalk_map_and_copy(dst, req->dst, 0,
+			req->cryptlen + auth_tag_len, 1);
+		kfree(src);
+	}
+	return 0;
+}
+
+static int __driver_rfc4106_decrypt(struct aead_request *req)
+{
+	u8 one_entry_in_sg = 0;
+	u8 *src, *dst, *assoc;
+	unsigned long tempCipherLen = 0;
+	__be32 counter = cpu_to_be32(1);
+	int retval = 0;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	void *aes_ctx = &(ctx->aes_key_expanded);
+	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+	u8 iv_and_authTag[32+AESNI_ALIGN];
+	u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN);
+	u8 *authTag = iv + 16;
+	struct scatter_walk src_sg_walk;
+	struct scatter_walk assoc_sg_walk;
+	struct scatter_walk dst_sg_walk;
+	unsigned int i;
+
+	if (unlikely((req->cryptlen < auth_tag_len) ||
+		(req->assoclen != 8 && req->assoclen != 12)))
+		return -EINVAL;
+	/* Assuming we are supporting rfc4106 64-bit extended */
+	/* sequence numbers We need to have the AAD length */
+	/* equal to 8 or 12 bytes */
+
+	tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
+	/* IV below built */
+	for (i = 0; i < 4; i++)
+		*(iv+i) = ctx->nonce[i];
+	for (i = 0; i < 8; i++)
+		*(iv+4+i) = req->iv[i];
+	*((__be32 *)(iv+12)) = counter;
+
+	if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+		one_entry_in_sg = 1;
+		scatterwalk_start(&src_sg_walk, req->src);
+		scatterwalk_start(&assoc_sg_walk, req->assoc);
+		src = scatterwalk_map(&src_sg_walk, 0);
+		assoc = scatterwalk_map(&assoc_sg_walk, 0);
+		dst = src;
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_start(&dst_sg_walk, req->dst);
+			dst = scatterwalk_map(&dst_sg_walk, 0);
+		}
+
+	} else {
+		/* Allocate memory for src, dst, assoc */
+		src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
+		if (!src)
+			return -ENOMEM;
+		assoc = (src + req->cryptlen + auth_tag_len);
+		scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
+		scatterwalk_map_and_copy(assoc, req->assoc, 0,
+			req->assoclen, 0);
+		dst = src;
+	}
+
+	aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
+		ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
+		authTag, auth_tag_len);
+
+	/* Compare generated tag with passed in tag. */
+	retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
+		-EBADMSG : 0;
+
+	if (one_entry_in_sg) {
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_unmap(dst, 0);
+			scatterwalk_done(&dst_sg_walk, 0, 0);
+		}
+		scatterwalk_unmap(src, 0);
+		scatterwalk_unmap(assoc, 0);
+		scatterwalk_done(&src_sg_walk, 0, 0);
+		scatterwalk_done(&assoc_sg_walk, 0, 0);
+	} else {
+		scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1);
+		kfree(src);
+	}
+	return retval;
+}
+
+static struct crypto_alg __rfc4106_alg = {
+	.cra_name		= "__gcm-aes-aesni",
+	.cra_driver_name	= "__driver-gcm-aes-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_AEAD,
+	.cra_blocksize		= 1,
+	.cra_ctxsize	= sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_aead_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(__rfc4106_alg.cra_list),
+	.cra_u = {
+		.aead = {
+			.encrypt	= __driver_rfc4106_encrypt,
+			.decrypt	= __driver_rfc4106_decrypt,
+		},
+	},
+};
+#endif
+
 static int __init aesni_init(void)
 {
 	int err;
@@ -738,6 +1248,7 @@
 		printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
 		return -ENODEV;
 	}
+
 	if ((err = crypto_register_alg(&aesni_alg)))
 		goto aes_err;
 	if ((err = crypto_register_alg(&__aesni_alg)))
@@ -746,18 +1257,24 @@
 		goto blk_ecb_err;
 	if ((err = crypto_register_alg(&blk_cbc_alg)))
 		goto blk_cbc_err;
-	if ((err = crypto_register_alg(&blk_ctr_alg)))
-		goto blk_ctr_err;
 	if ((err = crypto_register_alg(&ablk_ecb_alg)))
 		goto ablk_ecb_err;
 	if ((err = crypto_register_alg(&ablk_cbc_alg)))
 		goto ablk_cbc_err;
+#ifdef CONFIG_X86_64
+	if ((err = crypto_register_alg(&blk_ctr_alg)))
+		goto blk_ctr_err;
 	if ((err = crypto_register_alg(&ablk_ctr_alg)))
 		goto ablk_ctr_err;
+	if ((err = crypto_register_alg(&__rfc4106_alg)))
+		goto __aead_gcm_err;
+	if ((err = crypto_register_alg(&rfc4106_alg)))
+		goto aead_gcm_err;
 #ifdef HAS_CTR
 	if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
 		goto ablk_rfc3686_ctr_err;
 #endif
+#endif
 #ifdef HAS_LRW
 	if ((err = crypto_register_alg(&ablk_lrw_alg)))
 		goto ablk_lrw_err;
@@ -770,7 +1287,6 @@
 	if ((err = crypto_register_alg(&ablk_xts_alg)))
 		goto ablk_xts_err;
 #endif
-
 	return err;
 
 #ifdef HAS_XTS
@@ -784,18 +1300,24 @@
 	crypto_unregister_alg(&ablk_lrw_alg);
 ablk_lrw_err:
 #endif
+#ifdef CONFIG_X86_64
 #ifdef HAS_CTR
 	crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
 ablk_rfc3686_ctr_err:
 #endif
+	crypto_unregister_alg(&rfc4106_alg);
+aead_gcm_err:
+	crypto_unregister_alg(&__rfc4106_alg);
+__aead_gcm_err:
 	crypto_unregister_alg(&ablk_ctr_alg);
 ablk_ctr_err:
+	crypto_unregister_alg(&blk_ctr_alg);
+blk_ctr_err:
+#endif
 	crypto_unregister_alg(&ablk_cbc_alg);
 ablk_cbc_err:
 	crypto_unregister_alg(&ablk_ecb_alg);
 ablk_ecb_err:
-	crypto_unregister_alg(&blk_ctr_alg);
-blk_ctr_err:
 	crypto_unregister_alg(&blk_cbc_alg);
 blk_cbc_err:
 	crypto_unregister_alg(&blk_ecb_alg);
@@ -818,13 +1340,17 @@
 #ifdef HAS_LRW
 	crypto_unregister_alg(&ablk_lrw_alg);
 #endif
+#ifdef CONFIG_X86_64
 #ifdef HAS_CTR
 	crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
 #endif
+	crypto_unregister_alg(&rfc4106_alg);
+	crypto_unregister_alg(&__rfc4106_alg);
 	crypto_unregister_alg(&ablk_ctr_alg);
+	crypto_unregister_alg(&blk_ctr_alg);
+#endif
 	crypto_unregister_alg(&ablk_cbc_alg);
 	crypto_unregister_alg(&ablk_ecb_alg);
-	crypto_unregister_alg(&blk_ctr_alg);
 	crypto_unregister_alg(&blk_cbc_alg);
 	crypto_unregister_alg(&blk_ecb_alg);
 	crypto_unregister_alg(&__aesni_alg);
diff --git a/crypto/Kconfig b/crypto/Kconfig
index e4bac29..4b7cb0e 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -110,7 +110,6 @@
 
 config CRYPTO_GF128MUL
 	tristate "GF(2^128) multiplication functions (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
 	help
 	  Efficient table driven implementation of multiplications in the
 	  field GF(2^128).  This is needed by some cypher modes. This
@@ -539,8 +538,9 @@
 
 config CRYPTO_AES_NI_INTEL
 	tristate "AES cipher algorithms (AES-NI)"
-	depends on (X86 || UML_X86) && 64BIT
-	select CRYPTO_AES_X86_64
+	depends on (X86 || UML_X86)
+	select CRYPTO_AES_X86_64 if 64BIT
+	select CRYPTO_AES_586 if !64BIT
 	select CRYPTO_CRYPTD
 	select CRYPTO_ALGAPI
 	select CRYPTO_FPU
@@ -563,9 +563,10 @@
 
 	  See <http://csrc.nist.gov/encryption/aes/> for more information.
 
-	  In addition to AES cipher algorithm support, the
-	  acceleration for some popular block cipher mode is supported
-	  too, including ECB, CBC, CTR, LRW, PCBC, XTS.
+	  In addition to AES cipher algorithm support, the acceleration
+	  for some popular block cipher mode is supported too, including
+	  ECB, CBC, LRW, PCBC, XTS. The 64 bit version has additional
+	  acceleration for CTR.
 
 config CRYPTO_ANUBIS
 	tristate "Anubis cipher algorithm"
@@ -841,6 +842,27 @@
 	  ANSI X9.31 A.2.4. Note that this option must be enabled if
 	  CRYPTO_FIPS is selected
 
+config CRYPTO_USER_API
+	tristate
+
+config CRYPTO_USER_API_HASH
+	tristate "User-space interface for hash algorithms"
+	depends on NET
+	select CRYPTO_HASH
+	select CRYPTO_USER_API
+	help
+	  This option enables the user-spaces interface for hash
+	  algorithms.
+
+config CRYPTO_USER_API_SKCIPHER
+	tristate "User-space interface for symmetric key cipher algorithms"
+	depends on NET
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_USER_API
+	help
+	  This option enables the user-spaces interface for symmetric
+	  key cipher algorithms.
+
 source "drivers/crypto/Kconfig"
 
 endif	# if CRYPTO
diff --git a/crypto/Makefile b/crypto/Makefile
index 423b7de..e9a399c 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -3,32 +3,32 @@
 #
 
 obj-$(CONFIG_CRYPTO) += crypto.o
-crypto-objs := api.o cipher.o compress.o
+crypto-y := api.o cipher.o compress.o
 
 obj-$(CONFIG_CRYPTO_WORKQUEUE) += crypto_wq.o
 
 obj-$(CONFIG_CRYPTO_FIPS) += fips.o
 
 crypto_algapi-$(CONFIG_PROC_FS) += proc.o
-crypto_algapi-objs := algapi.o scatterwalk.o $(crypto_algapi-y)
+crypto_algapi-y := algapi.o scatterwalk.o $(crypto_algapi-y)
 obj-$(CONFIG_CRYPTO_ALGAPI2) += crypto_algapi.o
 
 obj-$(CONFIG_CRYPTO_AEAD2) += aead.o
 
-crypto_blkcipher-objs := ablkcipher.o
-crypto_blkcipher-objs += blkcipher.o
+crypto_blkcipher-y := ablkcipher.o
+crypto_blkcipher-y += blkcipher.o
 obj-$(CONFIG_CRYPTO_BLKCIPHER2) += crypto_blkcipher.o
 obj-$(CONFIG_CRYPTO_BLKCIPHER2) += chainiv.o
 obj-$(CONFIG_CRYPTO_BLKCIPHER2) += eseqiv.o
 obj-$(CONFIG_CRYPTO_SEQIV) += seqiv.o
 
-crypto_hash-objs += ahash.o
-crypto_hash-objs += shash.o
+crypto_hash-y += ahash.o
+crypto_hash-y += shash.o
 obj-$(CONFIG_CRYPTO_HASH2) += crypto_hash.o
 
 obj-$(CONFIG_CRYPTO_PCOMP2) += pcompress.o
 
-cryptomgr-objs := algboss.o testmgr.o
+cryptomgr-y := algboss.o testmgr.o
 
 obj-$(CONFIG_CRYPTO_MANAGER2) += cryptomgr.o
 obj-$(CONFIG_CRYPTO_HMAC) += hmac.o
@@ -85,6 +85,9 @@
 obj-$(CONFIG_CRYPTO_ANSI_CPRNG) += ansi_cprng.o
 obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o
 obj-$(CONFIG_CRYPTO_GHASH) += ghash-generic.o
+obj-$(CONFIG_CRYPTO_USER_API) += af_alg.o
+obj-$(CONFIG_CRYPTO_USER_API_HASH) += algif_hash.o
+obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
 
 #
 # generic algorithms and the async_tx api
diff --git a/crypto/af_alg.c b/crypto/af_alg.c
new file mode 100644
index 0000000..940d70c
--- /dev/null
+++ b/crypto/af_alg.c
@@ -0,0 +1,483 @@
+/*
+ * af_alg: User-space algorithm interface
+ *
+ * This file provides the user-space API for algorithms.
+ *
+ * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <asm/atomic.h>
+#include <crypto/if_alg.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/rwsem.h>
+
+struct alg_type_list {
+	const struct af_alg_type *type;
+	struct list_head list;
+};
+
+static atomic_long_t alg_memory_allocated;
+
+static struct proto alg_proto = {
+	.name			= "ALG",
+	.owner			= THIS_MODULE,
+	.memory_allocated	= &alg_memory_allocated,
+	.obj_size		= sizeof(struct alg_sock),
+};
+
+static LIST_HEAD(alg_types);
+static DECLARE_RWSEM(alg_types_sem);
+
+static const struct af_alg_type *alg_get_type(const char *name)
+{
+	const struct af_alg_type *type = ERR_PTR(-ENOENT);
+	struct alg_type_list *node;
+
+	down_read(&alg_types_sem);
+	list_for_each_entry(node, &alg_types, list) {
+		if (strcmp(node->type->name, name))
+			continue;
+
+		if (try_module_get(node->type->owner))
+			type = node->type;
+		break;
+	}
+	up_read(&alg_types_sem);
+
+	return type;
+}
+
+int af_alg_register_type(const struct af_alg_type *type)
+{
+	struct alg_type_list *node;
+	int err = -EEXIST;
+
+	down_write(&alg_types_sem);
+	list_for_each_entry(node, &alg_types, list) {
+		if (!strcmp(node->type->name, type->name))
+			goto unlock;
+	}
+
+	node = kmalloc(sizeof(*node), GFP_KERNEL);
+	err = -ENOMEM;
+	if (!node)
+		goto unlock;
+
+	type->ops->owner = THIS_MODULE;
+	node->type = type;
+	list_add(&node->list, &alg_types);
+	err = 0;
+
+unlock:
+	up_write(&alg_types_sem);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(af_alg_register_type);
+
+int af_alg_unregister_type(const struct af_alg_type *type)
+{
+	struct alg_type_list *node;
+	int err = -ENOENT;
+
+	down_write(&alg_types_sem);
+	list_for_each_entry(node, &alg_types, list) {
+		if (strcmp(node->type->name, type->name))
+			continue;
+
+		list_del(&node->list);
+		kfree(node);
+		err = 0;
+		break;
+	}
+	up_write(&alg_types_sem);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(af_alg_unregister_type);
+
+static void alg_do_release(const struct af_alg_type *type, void *private)
+{
+	if (!type)
+		return;
+
+	type->release(private);
+	module_put(type->owner);
+}
+
+int af_alg_release(struct socket *sock)
+{
+	if (sock->sk)
+		sock_put(sock->sk);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(af_alg_release);
+
+static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct sockaddr_alg *sa = (void *)uaddr;
+	const struct af_alg_type *type;
+	void *private;
+
+	if (sock->state == SS_CONNECTED)
+		return -EINVAL;
+
+	if (addr_len != sizeof(*sa))
+		return -EINVAL;
+
+	sa->salg_type[sizeof(sa->salg_type) - 1] = 0;
+	sa->salg_name[sizeof(sa->salg_name) - 1] = 0;
+
+	type = alg_get_type(sa->salg_type);
+	if (IS_ERR(type) && PTR_ERR(type) == -ENOENT) {
+		request_module("algif-%s", sa->salg_type);
+		type = alg_get_type(sa->salg_type);
+	}
+
+	if (IS_ERR(type))
+		return PTR_ERR(type);
+
+	private = type->bind(sa->salg_name, sa->salg_feat, sa->salg_mask);
+	if (IS_ERR(private)) {
+		module_put(type->owner);
+		return PTR_ERR(private);
+	}
+
+	lock_sock(sk);
+
+	swap(ask->type, type);
+	swap(ask->private, private);
+
+	release_sock(sk);
+
+	alg_do_release(type, private);
+
+	return 0;
+}
+
+static int alg_setkey(struct sock *sk, char __user *ukey,
+		      unsigned int keylen)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	const struct af_alg_type *type = ask->type;
+	u8 *key;
+	int err;
+
+	key = sock_kmalloc(sk, keylen, GFP_KERNEL);
+	if (!key)
+		return -ENOMEM;
+
+	err = -EFAULT;
+	if (copy_from_user(key, ukey, keylen))
+		goto out;
+
+	err = type->setkey(ask->private, key, keylen);
+
+out:
+	sock_kfree_s(sk, key, keylen);
+
+	return err;
+}
+
+static int alg_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	const struct af_alg_type *type;
+	int err = -ENOPROTOOPT;
+
+	lock_sock(sk);
+	type = ask->type;
+
+	if (level != SOL_ALG || !type)
+		goto unlock;
+
+	switch (optname) {
+	case ALG_SET_KEY:
+		if (sock->state == SS_CONNECTED)
+			goto unlock;
+		if (!type->setkey)
+			goto unlock;
+
+		err = alg_setkey(sk, optval, optlen);
+	}
+
+unlock:
+	release_sock(sk);
+
+	return err;
+}
+
+int af_alg_accept(struct sock *sk, struct socket *newsock)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	const struct af_alg_type *type;
+	struct sock *sk2;
+	int err;
+
+	lock_sock(sk);
+	type = ask->type;
+
+	err = -EINVAL;
+	if (!type)
+		goto unlock;
+
+	sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto);
+	err = -ENOMEM;
+	if (!sk2)
+		goto unlock;
+
+	sock_init_data(newsock, sk2);
+	sock_graft(sk2, newsock);
+
+	err = type->accept(ask->private, sk2);
+	if (err) {
+		sk_free(sk2);
+		goto unlock;
+	}
+
+	sk2->sk_family = PF_ALG;
+
+	sock_hold(sk);
+	alg_sk(sk2)->parent = sk;
+	alg_sk(sk2)->type = type;
+
+	newsock->ops = type->ops;
+	newsock->state = SS_CONNECTED;
+
+	err = 0;
+
+unlock:
+	release_sock(sk);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(af_alg_accept);
+
+static int alg_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	return af_alg_accept(sock->sk, newsock);
+}
+
+static const struct proto_ops alg_proto_ops = {
+	.family		=	PF_ALG,
+	.owner		=	THIS_MODULE,
+
+	.connect	=	sock_no_connect,
+	.socketpair	=	sock_no_socketpair,
+	.getname	=	sock_no_getname,
+	.ioctl		=	sock_no_ioctl,
+	.listen		=	sock_no_listen,
+	.shutdown	=	sock_no_shutdown,
+	.getsockopt	=	sock_no_getsockopt,
+	.mmap		=	sock_no_mmap,
+	.sendpage	=	sock_no_sendpage,
+	.sendmsg	=	sock_no_sendmsg,
+	.recvmsg	=	sock_no_recvmsg,
+	.poll		=	sock_no_poll,
+
+	.bind		=	alg_bind,
+	.release	=	af_alg_release,
+	.setsockopt	=	alg_setsockopt,
+	.accept		=	alg_accept,
+};
+
+static void alg_sock_destruct(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+
+	alg_do_release(ask->type, ask->private);
+}
+
+static int alg_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	struct sock *sk;
+	int err;
+
+	if (sock->type != SOCK_SEQPACKET)
+		return -ESOCKTNOSUPPORT;
+	if (protocol != 0)
+		return -EPROTONOSUPPORT;
+
+	err = -ENOMEM;
+	sk = sk_alloc(net, PF_ALG, GFP_KERNEL, &alg_proto);
+	if (!sk)
+		goto out;
+
+	sock->ops = &alg_proto_ops;
+	sock_init_data(sock, sk);
+
+	sk->sk_family = PF_ALG;
+	sk->sk_destruct = alg_sock_destruct;
+
+	return 0;
+out:
+	return err;
+}
+
+static const struct net_proto_family alg_family = {
+	.family	=	PF_ALG,
+	.create	=	alg_create,
+	.owner	=	THIS_MODULE,
+};
+
+int af_alg_make_sg(struct af_alg_sgl *sgl, void __user *addr, int len,
+		   int write)
+{
+	unsigned long from = (unsigned long)addr;
+	unsigned long npages;
+	unsigned off;
+	int err;
+	int i;
+
+	err = -EFAULT;
+	if (!access_ok(write ? VERIFY_READ : VERIFY_WRITE, addr, len))
+		goto out;
+
+	off = from & ~PAGE_MASK;
+	npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (npages > ALG_MAX_PAGES)
+		npages = ALG_MAX_PAGES;
+
+	err = get_user_pages_fast(from, npages, write, sgl->pages);
+	if (err < 0)
+		goto out;
+
+	npages = err;
+	err = -EINVAL;
+	if (WARN_ON(npages == 0))
+		goto out;
+
+	err = 0;
+
+	sg_init_table(sgl->sg, npages);
+
+	for (i = 0; i < npages; i++) {
+		int plen = min_t(int, len, PAGE_SIZE - off);
+
+		sg_set_page(sgl->sg + i, sgl->pages[i], plen, off);
+
+		off = 0;
+		len -= plen;
+		err += plen;
+	}
+
+out:
+	return err;
+}
+EXPORT_SYMBOL_GPL(af_alg_make_sg);
+
+void af_alg_free_sg(struct af_alg_sgl *sgl)
+{
+	int i;
+
+	i = 0;
+	do {
+		put_page(sgl->pages[i]);
+	} while (!sg_is_last(sgl->sg + (i++)));
+}
+EXPORT_SYMBOL_GPL(af_alg_free_sg);
+
+int af_alg_cmsg_send(struct msghdr *msg, struct af_alg_control *con)
+{
+	struct cmsghdr *cmsg;
+
+	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+		if (cmsg->cmsg_level != SOL_ALG)
+			continue;
+
+		switch(cmsg->cmsg_type) {
+		case ALG_SET_IV:
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(*con->iv)))
+				return -EINVAL;
+			con->iv = (void *)CMSG_DATA(cmsg);
+			if (cmsg->cmsg_len < CMSG_LEN(con->iv->ivlen +
+						      sizeof(*con->iv)))
+				return -EINVAL;
+			break;
+
+		case ALG_SET_OP:
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(u32)))
+				return -EINVAL;
+			con->op = *(u32 *)CMSG_DATA(cmsg);
+			break;
+
+		default:
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(af_alg_cmsg_send);
+
+int af_alg_wait_for_completion(int err, struct af_alg_completion *completion)
+{
+	switch (err) {
+	case -EINPROGRESS:
+	case -EBUSY:
+		wait_for_completion(&completion->completion);
+		INIT_COMPLETION(completion->completion);
+		err = completion->err;
+		break;
+	};
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(af_alg_wait_for_completion);
+
+void af_alg_complete(struct crypto_async_request *req, int err)
+{
+	struct af_alg_completion *completion = req->data;
+
+	completion->err = err;
+	complete(&completion->completion);
+}
+EXPORT_SYMBOL_GPL(af_alg_complete);
+
+static int __init af_alg_init(void)
+{
+	int err = proto_register(&alg_proto, 0);
+
+	if (err)
+		goto out;
+
+	err = sock_register(&alg_family);
+	if (err != 0)
+		goto out_unregister_proto;
+
+out:
+	return err;
+
+out_unregister_proto:
+	proto_unregister(&alg_proto);
+	goto out;
+}
+
+static void __exit af_alg_exit(void)
+{
+	sock_unregister(PF_ALG);
+	proto_unregister(&alg_proto);
+}
+
+module_init(af_alg_init);
+module_exit(af_alg_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(AF_ALG);
diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c
new file mode 100644
index 0000000..62122a1
--- /dev/null
+++ b/crypto/algif_hash.c
@@ -0,0 +1,319 @@
+/*
+ * algif_hash: User-space interface for hash algorithms
+ *
+ * This file provides the user-space API for hash algorithms.
+ *
+ * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/hash.h>
+#include <crypto/if_alg.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <net/sock.h>
+
+struct hash_ctx {
+	struct af_alg_sgl sgl;
+
+	u8 *result;
+
+	struct af_alg_completion completion;
+
+	unsigned int len;
+	bool more;
+
+	struct ahash_request req;
+};
+
+static int hash_sendmsg(struct kiocb *unused, struct socket *sock,
+			struct msghdr *msg, size_t ignored)
+{
+	int limit = ALG_MAX_PAGES * PAGE_SIZE;
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct hash_ctx *ctx = ask->private;
+	unsigned long iovlen;
+	struct iovec *iov;
+	long copied = 0;
+	int err;
+
+	if (limit > sk->sk_sndbuf)
+		limit = sk->sk_sndbuf;
+
+	lock_sock(sk);
+	if (!ctx->more) {
+		err = crypto_ahash_init(&ctx->req);
+		if (err)
+			goto unlock;
+	}
+
+	ctx->more = 0;
+
+	for (iov = msg->msg_iov, iovlen = msg->msg_iovlen; iovlen > 0;
+	     iovlen--, iov++) {
+		unsigned long seglen = iov->iov_len;
+		char __user *from = iov->iov_base;
+
+		while (seglen) {
+			int len = min_t(unsigned long, seglen, limit);
+			int newlen;
+
+			newlen = af_alg_make_sg(&ctx->sgl, from, len, 0);
+			if (newlen < 0)
+				goto unlock;
+
+			ahash_request_set_crypt(&ctx->req, ctx->sgl.sg, NULL,
+						newlen);
+
+			err = af_alg_wait_for_completion(
+				crypto_ahash_update(&ctx->req),
+				&ctx->completion);
+
+			af_alg_free_sg(&ctx->sgl);
+
+			if (err)
+				goto unlock;
+
+			seglen -= newlen;
+			from += newlen;
+			copied += newlen;
+		}
+	}
+
+	err = 0;
+
+	ctx->more = msg->msg_flags & MSG_MORE;
+	if (!ctx->more) {
+		ahash_request_set_crypt(&ctx->req, NULL, ctx->result, 0);
+		err = af_alg_wait_for_completion(crypto_ahash_final(&ctx->req),
+						 &ctx->completion);
+	}
+
+unlock:
+	release_sock(sk);
+
+	return err ?: copied;
+}
+
+static ssize_t hash_sendpage(struct socket *sock, struct page *page,
+			     int offset, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct hash_ctx *ctx = ask->private;
+	int err;
+
+	lock_sock(sk);
+	sg_init_table(ctx->sgl.sg, 1);
+	sg_set_page(ctx->sgl.sg, page, size, offset);
+
+	ahash_request_set_crypt(&ctx->req, ctx->sgl.sg, ctx->result, size);
+
+	if (!(flags & MSG_MORE)) {
+		if (ctx->more)
+			err = crypto_ahash_finup(&ctx->req);
+		else
+			err = crypto_ahash_digest(&ctx->req);
+	} else {
+		if (!ctx->more) {
+			err = crypto_ahash_init(&ctx->req);
+			if (err)
+				goto unlock;
+		}
+
+		err = crypto_ahash_update(&ctx->req);
+	}
+
+	err = af_alg_wait_for_completion(err, &ctx->completion);
+	if (err)
+		goto unlock;
+
+	ctx->more = flags & MSG_MORE;
+
+unlock:
+	release_sock(sk);
+
+	return err ?: size;
+}
+
+static int hash_recvmsg(struct kiocb *unused, struct socket *sock,
+			struct msghdr *msg, size_t len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct hash_ctx *ctx = ask->private;
+	unsigned ds = crypto_ahash_digestsize(crypto_ahash_reqtfm(&ctx->req));
+	int err;
+
+	if (len > ds)
+		len = ds;
+	else if (len < ds)
+		msg->msg_flags |= MSG_TRUNC;
+
+	lock_sock(sk);
+	if (ctx->more) {
+		ctx->more = 0;
+		ahash_request_set_crypt(&ctx->req, NULL, ctx->result, 0);
+		err = af_alg_wait_for_completion(crypto_ahash_final(&ctx->req),
+						 &ctx->completion);
+		if (err)
+			goto unlock;
+	}
+
+	err = memcpy_toiovec(msg->msg_iov, ctx->result, len);
+
+unlock:
+	release_sock(sk);
+
+	return err ?: len;
+}
+
+static int hash_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct hash_ctx *ctx = ask->private;
+	struct ahash_request *req = &ctx->req;
+	char state[crypto_ahash_statesize(crypto_ahash_reqtfm(req))];
+	struct sock *sk2;
+	struct alg_sock *ask2;
+	struct hash_ctx *ctx2;
+	int err;
+
+	err = crypto_ahash_export(req, state);
+	if (err)
+		return err;
+
+	err = af_alg_accept(ask->parent, newsock);
+	if (err)
+		return err;
+
+	sk2 = newsock->sk;
+	ask2 = alg_sk(sk2);
+	ctx2 = ask2->private;
+	ctx2->more = 1;
+
+	err = crypto_ahash_import(&ctx2->req, state);
+	if (err) {
+		sock_orphan(sk2);
+		sock_put(sk2);
+	}
+
+	return err;
+}
+
+static struct proto_ops algif_hash_ops = {
+	.family		=	PF_ALG,
+
+	.connect	=	sock_no_connect,
+	.socketpair	=	sock_no_socketpair,
+	.getname	=	sock_no_getname,
+	.ioctl		=	sock_no_ioctl,
+	.listen		=	sock_no_listen,
+	.shutdown	=	sock_no_shutdown,
+	.getsockopt	=	sock_no_getsockopt,
+	.mmap		=	sock_no_mmap,
+	.bind		=	sock_no_bind,
+	.setsockopt	=	sock_no_setsockopt,
+	.poll		=	sock_no_poll,
+
+	.release	=	af_alg_release,
+	.sendmsg	=	hash_sendmsg,
+	.sendpage	=	hash_sendpage,
+	.recvmsg	=	hash_recvmsg,
+	.accept		=	hash_accept,
+};
+
+static void *hash_bind(const char *name, u32 type, u32 mask)
+{
+	return crypto_alloc_ahash(name, type, mask);
+}
+
+static void hash_release(void *private)
+{
+	crypto_free_ahash(private);
+}
+
+static int hash_setkey(void *private, const u8 *key, unsigned int keylen)
+{
+	return crypto_ahash_setkey(private, key, keylen);
+}
+
+static void hash_sock_destruct(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct hash_ctx *ctx = ask->private;
+
+	sock_kfree_s(sk, ctx->result,
+		     crypto_ahash_digestsize(crypto_ahash_reqtfm(&ctx->req)));
+	sock_kfree_s(sk, ctx, ctx->len);
+	af_alg_release_parent(sk);
+}
+
+static int hash_accept_parent(void *private, struct sock *sk)
+{
+	struct hash_ctx *ctx;
+	struct alg_sock *ask = alg_sk(sk);
+	unsigned len = sizeof(*ctx) + crypto_ahash_reqsize(private);
+	unsigned ds = crypto_ahash_digestsize(private);
+
+	ctx = sock_kmalloc(sk, len, GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->result = sock_kmalloc(sk, ds, GFP_KERNEL);
+	if (!ctx->result) {
+		sock_kfree_s(sk, ctx, len);
+		return -ENOMEM;
+	}
+
+	memset(ctx->result, 0, ds);
+
+	ctx->len = len;
+	ctx->more = 0;
+	af_alg_init_completion(&ctx->completion);
+
+	ask->private = ctx;
+
+	ahash_request_set_tfm(&ctx->req, private);
+	ahash_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				   af_alg_complete, &ctx->completion);
+
+	sk->sk_destruct = hash_sock_destruct;
+
+	return 0;
+}
+
+static const struct af_alg_type algif_type_hash = {
+	.bind		=	hash_bind,
+	.release	=	hash_release,
+	.setkey		=	hash_setkey,
+	.accept		=	hash_accept_parent,
+	.ops		=	&algif_hash_ops,
+	.name		=	"hash",
+	.owner		=	THIS_MODULE
+};
+
+static int __init algif_hash_init(void)
+{
+	return af_alg_register_type(&algif_type_hash);
+}
+
+static void __exit algif_hash_exit(void)
+{
+	int err = af_alg_unregister_type(&algif_type_hash);
+	BUG_ON(err);
+}
+
+module_init(algif_hash_init);
+module_exit(algif_hash_exit);
+MODULE_LICENSE("GPL");
diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
new file mode 100644
index 0000000..6a6dfc0
--- /dev/null
+++ b/crypto/algif_skcipher.c
@@ -0,0 +1,632 @@
+/*
+ * algif_skcipher: User-space interface for skcipher algorithms
+ *
+ * This file provides the user-space API for symmetric key ciphers.
+ *
+ * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/scatterwalk.h>
+#include <crypto/skcipher.h>
+#include <crypto/if_alg.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <net/sock.h>
+
+struct skcipher_sg_list {
+	struct list_head list;
+
+	int cur;
+
+	struct scatterlist sg[0];
+};
+
+struct skcipher_ctx {
+	struct list_head tsgl;
+	struct af_alg_sgl rsgl;
+
+	void *iv;
+
+	struct af_alg_completion completion;
+
+	unsigned used;
+
+	unsigned int len;
+	bool more;
+	bool merge;
+	bool enc;
+
+	struct ablkcipher_request req;
+};
+
+#define MAX_SGL_ENTS ((PAGE_SIZE - sizeof(struct skcipher_sg_list)) / \
+		      sizeof(struct scatterlist) - 1)
+
+static inline int skcipher_sndbuf(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct skcipher_ctx *ctx = ask->private;
+
+	return max_t(int, max_t(int, sk->sk_sndbuf & PAGE_MASK, PAGE_SIZE) -
+			  ctx->used, 0);
+}
+
+static inline bool skcipher_writable(struct sock *sk)
+{
+	return PAGE_SIZE <= skcipher_sndbuf(sk);
+}
+
+static int skcipher_alloc_sgl(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct skcipher_ctx *ctx = ask->private;
+	struct skcipher_sg_list *sgl;
+	struct scatterlist *sg = NULL;
+
+	sgl = list_entry(ctx->tsgl.prev, struct skcipher_sg_list, list);
+	if (!list_empty(&ctx->tsgl))
+		sg = sgl->sg;
+
+	if (!sg || sgl->cur >= MAX_SGL_ENTS) {
+		sgl = sock_kmalloc(sk, sizeof(*sgl) +
+				       sizeof(sgl->sg[0]) * (MAX_SGL_ENTS + 1),
+				   GFP_KERNEL);
+		if (!sgl)
+			return -ENOMEM;
+
+		sg_init_table(sgl->sg, MAX_SGL_ENTS + 1);
+		sgl->cur = 0;
+
+		if (sg)
+			scatterwalk_sg_chain(sg, MAX_SGL_ENTS + 1, sgl->sg);
+
+		list_add_tail(&sgl->list, &ctx->tsgl);
+	}
+
+	return 0;
+}
+
+static void skcipher_pull_sgl(struct sock *sk, int used)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct skcipher_ctx *ctx = ask->private;
+	struct skcipher_sg_list *sgl;
+	struct scatterlist *sg;
+	int i;
+
+	while (!list_empty(&ctx->tsgl)) {
+		sgl = list_first_entry(&ctx->tsgl, struct skcipher_sg_list,
+				       list);
+		sg = sgl->sg;
+
+		for (i = 0; i < sgl->cur; i++) {
+			int plen = min_t(int, used, sg[i].length);
+
+			if (!sg_page(sg + i))
+				continue;
+
+			sg[i].length -= plen;
+			sg[i].offset += plen;
+
+			used -= plen;
+			ctx->used -= plen;
+
+			if (sg[i].length)
+				return;
+
+			put_page(sg_page(sg + i));
+			sg_assign_page(sg + i, NULL);
+		}
+
+		list_del(&sgl->list);
+		sock_kfree_s(sk, sgl,
+			     sizeof(*sgl) + sizeof(sgl->sg[0]) *
+					    (MAX_SGL_ENTS + 1));
+	}
+
+	if (!ctx->used)
+		ctx->merge = 0;
+}
+
+static void skcipher_free_sgl(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct skcipher_ctx *ctx = ask->private;
+
+	skcipher_pull_sgl(sk, ctx->used);
+}
+
+static int skcipher_wait_for_wmem(struct sock *sk, unsigned flags)
+{
+	long timeout;
+	DEFINE_WAIT(wait);
+	int err = -ERESTARTSYS;
+
+	if (flags & MSG_DONTWAIT)
+		return -EAGAIN;
+
+	set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	for (;;) {
+		if (signal_pending(current))
+			break;
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		timeout = MAX_SCHEDULE_TIMEOUT;
+		if (sk_wait_event(sk, &timeout, skcipher_writable(sk))) {
+			err = 0;
+			break;
+		}
+	}
+	finish_wait(sk_sleep(sk), &wait);
+
+	return err;
+}
+
+static void skcipher_wmem_wakeup(struct sock *sk)
+{
+	struct socket_wq *wq;
+
+	if (!skcipher_writable(sk))
+		return;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
+							   POLLRDNORM |
+							   POLLRDBAND);
+	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	rcu_read_unlock();
+}
+
+static int skcipher_wait_for_data(struct sock *sk, unsigned flags)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct skcipher_ctx *ctx = ask->private;
+	long timeout;
+	DEFINE_WAIT(wait);
+	int err = -ERESTARTSYS;
+
+	if (flags & MSG_DONTWAIT) {
+		return -EAGAIN;
+	}
+
+	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+
+	for (;;) {
+		if (signal_pending(current))
+			break;
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		timeout = MAX_SCHEDULE_TIMEOUT;
+		if (sk_wait_event(sk, &timeout, ctx->used)) {
+			err = 0;
+			break;
+		}
+	}
+	finish_wait(sk_sleep(sk), &wait);
+
+	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+
+	return err;
+}
+
+static void skcipher_data_wakeup(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct skcipher_ctx *ctx = ask->private;
+	struct socket_wq *wq;
+
+	if (!ctx->used)
+		return;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
+							   POLLRDNORM |
+							   POLLRDBAND);
+	sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+	rcu_read_unlock();
+}
+
+static int skcipher_sendmsg(struct kiocb *unused, struct socket *sock,
+			    struct msghdr *msg, size_t size)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct skcipher_ctx *ctx = ask->private;
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(&ctx->req);
+	unsigned ivsize = crypto_ablkcipher_ivsize(tfm);
+	struct skcipher_sg_list *sgl;
+	struct af_alg_control con = {};
+	long copied = 0;
+	bool enc = 0;
+	int err;
+	int i;
+
+	if (msg->msg_controllen) {
+		err = af_alg_cmsg_send(msg, &con);
+		if (err)
+			return err;
+
+		switch (con.op) {
+		case ALG_OP_ENCRYPT:
+			enc = 1;
+			break;
+		case ALG_OP_DECRYPT:
+			enc = 0;
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		if (con.iv && con.iv->ivlen != ivsize)
+			return -EINVAL;
+	}
+
+	err = -EINVAL;
+
+	lock_sock(sk);
+	if (!ctx->more && ctx->used)
+		goto unlock;
+
+	if (!ctx->used) {
+		ctx->enc = enc;
+		if (con.iv)
+			memcpy(ctx->iv, con.iv->iv, ivsize);
+	}
+
+	while (size) {
+		struct scatterlist *sg;
+		unsigned long len = size;
+		int plen;
+
+		if (ctx->merge) {
+			sgl = list_entry(ctx->tsgl.prev,
+					 struct skcipher_sg_list, list);
+			sg = sgl->sg + sgl->cur - 1;
+			len = min_t(unsigned long, len,
+				    PAGE_SIZE - sg->offset - sg->length);
+
+			err = memcpy_fromiovec(page_address(sg_page(sg)) +
+					       sg->offset + sg->length,
+					       msg->msg_iov, len);
+			if (err)
+				goto unlock;
+
+			sg->length += len;
+			ctx->merge = (sg->offset + sg->length) &
+				     (PAGE_SIZE - 1);
+
+			ctx->used += len;
+			copied += len;
+			size -= len;
+			continue;
+		}
+
+		if (!skcipher_writable(sk)) {
+			err = skcipher_wait_for_wmem(sk, msg->msg_flags);
+			if (err)
+				goto unlock;
+		}
+
+		len = min_t(unsigned long, len, skcipher_sndbuf(sk));
+
+		err = skcipher_alloc_sgl(sk);
+		if (err)
+			goto unlock;
+
+		sgl = list_entry(ctx->tsgl.prev, struct skcipher_sg_list, list);
+		sg = sgl->sg;
+		do {
+			i = sgl->cur;
+			plen = min_t(int, len, PAGE_SIZE);
+
+			sg_assign_page(sg + i, alloc_page(GFP_KERNEL));
+			err = -ENOMEM;
+			if (!sg_page(sg + i))
+				goto unlock;
+
+			err = memcpy_fromiovec(page_address(sg_page(sg + i)),
+					       msg->msg_iov, plen);
+			if (err) {
+				__free_page(sg_page(sg + i));
+				sg_assign_page(sg + i, NULL);
+				goto unlock;
+			}
+
+			sg[i].length = plen;
+			len -= plen;
+			ctx->used += plen;
+			copied += plen;
+			size -= plen;
+			sgl->cur++;
+		} while (len && sgl->cur < MAX_SGL_ENTS);
+
+		ctx->merge = plen & (PAGE_SIZE - 1);
+	}
+
+	err = 0;
+
+	ctx->more = msg->msg_flags & MSG_MORE;
+	if (!ctx->more && !list_empty(&ctx->tsgl))
+		sgl = list_entry(ctx->tsgl.prev, struct skcipher_sg_list, list);
+
+unlock:
+	skcipher_data_wakeup(sk);
+	release_sock(sk);
+
+	return copied ?: err;
+}
+
+static ssize_t skcipher_sendpage(struct socket *sock, struct page *page,
+				 int offset, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct skcipher_ctx *ctx = ask->private;
+	struct skcipher_sg_list *sgl;
+	int err = -EINVAL;
+
+	lock_sock(sk);
+	if (!ctx->more && ctx->used)
+		goto unlock;
+
+	if (!size)
+		goto done;
+
+	if (!skcipher_writable(sk)) {
+		err = skcipher_wait_for_wmem(sk, flags);
+		if (err)
+			goto unlock;
+	}
+
+	err = skcipher_alloc_sgl(sk);
+	if (err)
+		goto unlock;
+
+	ctx->merge = 0;
+	sgl = list_entry(ctx->tsgl.prev, struct skcipher_sg_list, list);
+
+	get_page(page);
+	sg_set_page(sgl->sg + sgl->cur, page, size, offset);
+	sgl->cur++;
+	ctx->used += size;
+
+done:
+	ctx->more = flags & MSG_MORE;
+	if (!ctx->more && !list_empty(&ctx->tsgl))
+		sgl = list_entry(ctx->tsgl.prev, struct skcipher_sg_list, list);
+
+unlock:
+	skcipher_data_wakeup(sk);
+	release_sock(sk);
+
+	return err ?: size;
+}
+
+static int skcipher_recvmsg(struct kiocb *unused, struct socket *sock,
+			    struct msghdr *msg, size_t ignored, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct skcipher_ctx *ctx = ask->private;
+	unsigned bs = crypto_ablkcipher_blocksize(crypto_ablkcipher_reqtfm(
+		&ctx->req));
+	struct skcipher_sg_list *sgl;
+	struct scatterlist *sg;
+	unsigned long iovlen;
+	struct iovec *iov;
+	int err = -EAGAIN;
+	int used;
+	long copied = 0;
+
+	lock_sock(sk);
+	for (iov = msg->msg_iov, iovlen = msg->msg_iovlen; iovlen > 0;
+	     iovlen--, iov++) {
+		unsigned long seglen = iov->iov_len;
+		char __user *from = iov->iov_base;
+
+		while (seglen) {
+			sgl = list_first_entry(&ctx->tsgl,
+					       struct skcipher_sg_list, list);
+			sg = sgl->sg;
+
+			while (!sg->length)
+				sg++;
+
+			used = ctx->used;
+			if (!used) {
+				err = skcipher_wait_for_data(sk, flags);
+				if (err)
+					goto unlock;
+			}
+
+			used = min_t(unsigned long, used, seglen);
+
+			used = af_alg_make_sg(&ctx->rsgl, from, used, 1);
+			err = used;
+			if (err < 0)
+				goto unlock;
+
+			if (ctx->more || used < ctx->used)
+				used -= used % bs;
+
+			err = -EINVAL;
+			if (!used)
+				goto free;
+
+			ablkcipher_request_set_crypt(&ctx->req, sg,
+						     ctx->rsgl.sg, used,
+						     ctx->iv);
+
+			err = af_alg_wait_for_completion(
+				ctx->enc ?
+					crypto_ablkcipher_encrypt(&ctx->req) :
+					crypto_ablkcipher_decrypt(&ctx->req),
+				&ctx->completion);
+
+free:
+			af_alg_free_sg(&ctx->rsgl);
+
+			if (err)
+				goto unlock;
+
+			copied += used;
+			from += used;
+			seglen -= used;
+			skcipher_pull_sgl(sk, used);
+		}
+	}
+
+	err = 0;
+
+unlock:
+	skcipher_wmem_wakeup(sk);
+	release_sock(sk);
+
+	return copied ?: err;
+}
+
+
+static unsigned int skcipher_poll(struct file *file, struct socket *sock,
+				  poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct skcipher_ctx *ctx = ask->private;
+	unsigned int mask;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
+
+	if (ctx->used)
+		mask |= POLLIN | POLLRDNORM;
+
+	if (skcipher_writable(sk))
+		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+	return mask;
+}
+
+static struct proto_ops algif_skcipher_ops = {
+	.family		=	PF_ALG,
+
+	.connect	=	sock_no_connect,
+	.socketpair	=	sock_no_socketpair,
+	.getname	=	sock_no_getname,
+	.ioctl		=	sock_no_ioctl,
+	.listen		=	sock_no_listen,
+	.shutdown	=	sock_no_shutdown,
+	.getsockopt	=	sock_no_getsockopt,
+	.mmap		=	sock_no_mmap,
+	.bind		=	sock_no_bind,
+	.accept		=	sock_no_accept,
+	.setsockopt	=	sock_no_setsockopt,
+
+	.release	=	af_alg_release,
+	.sendmsg	=	skcipher_sendmsg,
+	.sendpage	=	skcipher_sendpage,
+	.recvmsg	=	skcipher_recvmsg,
+	.poll		=	skcipher_poll,
+};
+
+static void *skcipher_bind(const char *name, u32 type, u32 mask)
+{
+	return crypto_alloc_ablkcipher(name, type, mask);
+}
+
+static void skcipher_release(void *private)
+{
+	crypto_free_ablkcipher(private);
+}
+
+static int skcipher_setkey(void *private, const u8 *key, unsigned int keylen)
+{
+	return crypto_ablkcipher_setkey(private, key, keylen);
+}
+
+static void skcipher_sock_destruct(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct skcipher_ctx *ctx = ask->private;
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(&ctx->req);
+
+	skcipher_free_sgl(sk);
+	sock_kfree_s(sk, ctx->iv, crypto_ablkcipher_ivsize(tfm));
+	sock_kfree_s(sk, ctx, ctx->len);
+	af_alg_release_parent(sk);
+}
+
+static int skcipher_accept_parent(void *private, struct sock *sk)
+{
+	struct skcipher_ctx *ctx;
+	struct alg_sock *ask = alg_sk(sk);
+	unsigned int len = sizeof(*ctx) + crypto_ablkcipher_reqsize(private);
+
+	ctx = sock_kmalloc(sk, len, GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->iv = sock_kmalloc(sk, crypto_ablkcipher_ivsize(private),
+			       GFP_KERNEL);
+	if (!ctx->iv) {
+		sock_kfree_s(sk, ctx, len);
+		return -ENOMEM;
+	}
+
+	memset(ctx->iv, 0, crypto_ablkcipher_ivsize(private));
+
+	INIT_LIST_HEAD(&ctx->tsgl);
+	ctx->len = len;
+	ctx->used = 0;
+	ctx->more = 0;
+	ctx->merge = 0;
+	ctx->enc = 0;
+	af_alg_init_completion(&ctx->completion);
+
+	ask->private = ctx;
+
+	ablkcipher_request_set_tfm(&ctx->req, private);
+	ablkcipher_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+					af_alg_complete, &ctx->completion);
+
+	sk->sk_destruct = skcipher_sock_destruct;
+
+	return 0;
+}
+
+static const struct af_alg_type algif_type_skcipher = {
+	.bind		=	skcipher_bind,
+	.release	=	skcipher_release,
+	.setkey		=	skcipher_setkey,
+	.accept		=	skcipher_accept_parent,
+	.ops		=	&algif_skcipher_ops,
+	.name		=	"skcipher",
+	.owner		=	THIS_MODULE
+};
+
+static int __init algif_skcipher_init(void)
+{
+	return af_alg_register_type(&algif_type_skcipher);
+}
+
+static void __exit algif_skcipher_exit(void)
+{
+	int err = af_alg_unregister_type(&algif_type_skcipher);
+	BUG_ON(err);
+}
+
+module_init(algif_skcipher_init);
+module_exit(algif_skcipher_exit);
+MODULE_LICENSE("GPL");
diff --git a/crypto/authenc.c b/crypto/authenc.c
index a5a22cf..5ef7ba6 100644
--- a/crypto/authenc.c
+++ b/crypto/authenc.c
@@ -107,20 +107,6 @@
 	goto out;
 }
 
-static void authenc_chain(struct scatterlist *head, struct scatterlist *sg,
-			  int chain)
-{
-	if (chain) {
-		head->length += sg->length;
-		sg = scatterwalk_sg_next(sg);
-	}
-
-	if (sg)
-		scatterwalk_sg_chain(head, 2, sg);
-	else
-		sg_mark_end(head);
-}
-
 static void authenc_geniv_ahash_update_done(struct crypto_async_request *areq,
 					    int err)
 {
@@ -345,7 +331,7 @@
 	if (ivsize) {
 		sg_init_table(cipher, 2);
 		sg_set_buf(cipher, iv, ivsize);
-		authenc_chain(cipher, dst, vdst == iv + ivsize);
+		scatterwalk_crypto_chain(cipher, dst, vdst == iv + ivsize, 2);
 		dst = cipher;
 		cryptlen += ivsize;
 	}
@@ -354,7 +340,7 @@
 		authenc_ahash_fn = crypto_authenc_ahash;
 		sg_init_table(asg, 2);
 		sg_set_page(asg, sg_page(assoc), assoc->length, assoc->offset);
-		authenc_chain(asg, dst, 0);
+		scatterwalk_crypto_chain(asg, dst, 0, 2);
 		dst = asg;
 		cryptlen += req->assoclen;
 	}
@@ -499,7 +485,7 @@
 	if (ivsize) {
 		sg_init_table(cipher, 2);
 		sg_set_buf(cipher, iv, ivsize);
-		authenc_chain(cipher, src, vsrc == iv + ivsize);
+		scatterwalk_crypto_chain(cipher, src, vsrc == iv + ivsize, 2);
 		src = cipher;
 		cryptlen += ivsize;
 	}
@@ -508,7 +494,7 @@
 		authenc_ahash_fn = crypto_authenc_ahash;
 		sg_init_table(asg, 2);
 		sg_set_page(asg, sg_page(assoc), assoc->length, assoc->offset);
-		authenc_chain(asg, src, 0);
+		scatterwalk_crypto_chain(asg, src, 0, 2);
 		src = asg;
 		cryptlen += req->assoclen;
 	}
diff --git a/crypto/cast5.c b/crypto/cast5.c
index a1d2294..4a230dd 100644
--- a/crypto/cast5.c
+++ b/crypto/cast5.c
@@ -604,36 +604,23 @@
 	 * Rounds 3, 6, 9, 12, and 15 use f function Type 3.
 	 */
 
+	t = l; l = r; r = t ^ F1(r, Km[0], Kr[0]);
+	t = l; l = r; r = t ^ F2(r, Km[1], Kr[1]);
+	t = l; l = r; r = t ^ F3(r, Km[2], Kr[2]);
+	t = l; l = r; r = t ^ F1(r, Km[3], Kr[3]);
+	t = l; l = r; r = t ^ F2(r, Km[4], Kr[4]);
+	t = l; l = r; r = t ^ F3(r, Km[5], Kr[5]);
+	t = l; l = r; r = t ^ F1(r, Km[6], Kr[6]);
+	t = l; l = r; r = t ^ F2(r, Km[7], Kr[7]);
+	t = l; l = r; r = t ^ F3(r, Km[8], Kr[8]);
+	t = l; l = r; r = t ^ F1(r, Km[9], Kr[9]);
+	t = l; l = r; r = t ^ F2(r, Km[10], Kr[10]);
+	t = l; l = r; r = t ^ F3(r, Km[11], Kr[11]);
 	if (!(c->rr)) {
-		t = l; l = r; r = t ^ F1(r, Km[0], Kr[0]);
-		t = l; l = r; r = t ^ F2(r, Km[1], Kr[1]);
-		t = l; l = r; r = t ^ F3(r, Km[2], Kr[2]);
-		t = l; l = r; r = t ^ F1(r, Km[3], Kr[3]);
-		t = l; l = r; r = t ^ F2(r, Km[4], Kr[4]);
-		t = l; l = r; r = t ^ F3(r, Km[5], Kr[5]);
-		t = l; l = r; r = t ^ F1(r, Km[6], Kr[6]);
-		t = l; l = r; r = t ^ F2(r, Km[7], Kr[7]);
-		t = l; l = r; r = t ^ F3(r, Km[8], Kr[8]);
-		t = l; l = r; r = t ^ F1(r, Km[9], Kr[9]);
-		t = l; l = r; r = t ^ F2(r, Km[10], Kr[10]);
-		t = l; l = r; r = t ^ F3(r, Km[11], Kr[11]);
 		t = l; l = r; r = t ^ F1(r, Km[12], Kr[12]);
 		t = l; l = r; r = t ^ F2(r, Km[13], Kr[13]);
 		t = l; l = r; r = t ^ F3(r, Km[14], Kr[14]);
 		t = l; l = r; r = t ^ F1(r, Km[15], Kr[15]);
-	} else {
-		t = l; l = r; r = t ^ F1(r, Km[0], Kr[0]);
-		t = l; l = r; r = t ^ F2(r, Km[1], Kr[1]);
-		t = l; l = r; r = t ^ F3(r, Km[2], Kr[2]);
-		t = l; l = r; r = t ^ F1(r, Km[3], Kr[3]);
-		t = l; l = r; r = t ^ F2(r, Km[4], Kr[4]);
-		t = l; l = r; r = t ^ F3(r, Km[5], Kr[5]);
-		t = l; l = r; r = t ^ F1(r, Km[6], Kr[6]);
-		t = l; l = r; r = t ^ F2(r, Km[7], Kr[7]);
-		t = l; l = r; r = t ^ F3(r, Km[8], Kr[8]);
-		t = l; l = r; r = t ^ F1(r, Km[9], Kr[9]);
-		t = l; l = r; r = t ^ F2(r, Km[10], Kr[10]);
-		t = l; l = r; r = t ^ F3(r, Km[11], Kr[11]);
 	}
 
 	/* c1...c64 <-- (R16,L16).  (Exchange final blocks L16, R16 and
@@ -663,32 +650,19 @@
 		t = l; l = r; r = t ^ F3(r, Km[14], Kr[14]);
 		t = l; l = r; r = t ^ F2(r, Km[13], Kr[13]);
 		t = l; l = r; r = t ^ F1(r, Km[12], Kr[12]);
-		t = l; l = r; r = t ^ F3(r, Km[11], Kr[11]);
-		t = l; l = r; r = t ^ F2(r, Km[10], Kr[10]);
-		t = l; l = r; r = t ^ F1(r, Km[9], Kr[9]);
-		t = l; l = r; r = t ^ F3(r, Km[8], Kr[8]);
-		t = l; l = r; r = t ^ F2(r, Km[7], Kr[7]);
-		t = l; l = r; r = t ^ F1(r, Km[6], Kr[6]);
-		t = l; l = r; r = t ^ F3(r, Km[5], Kr[5]);
-		t = l; l = r; r = t ^ F2(r, Km[4], Kr[4]);
-		t = l; l = r; r = t ^ F1(r, Km[3], Kr[3]);
-		t = l; l = r; r = t ^ F3(r, Km[2], Kr[2]);
-		t = l; l = r; r = t ^ F2(r, Km[1], Kr[1]);
-		t = l; l = r; r = t ^ F1(r, Km[0], Kr[0]);
-	} else {
-		t = l; l = r; r = t ^ F3(r, Km[11], Kr[11]);
-		t = l; l = r; r = t ^ F2(r, Km[10], Kr[10]);
-		t = l; l = r; r = t ^ F1(r, Km[9], Kr[9]);
-		t = l; l = r; r = t ^ F3(r, Km[8], Kr[8]);
-		t = l; l = r; r = t ^ F2(r, Km[7], Kr[7]);
-		t = l; l = r; r = t ^ F1(r, Km[6], Kr[6]);
-		t = l; l = r; r = t ^ F3(r, Km[5], Kr[5]);
-		t = l; l = r; r = t ^ F2(r, Km[4], Kr[4]);
-		t = l; l = r; r = t ^ F1(r, Km[3], Kr[3]);
-		t = l; l = r; r = t ^ F3(r, Km[2], Kr[2]);
-		t = l; l = r; r = t ^ F2(r, Km[1], Kr[1]);
-		t = l; l = r; r = t ^ F1(r, Km[0], Kr[0]);
 	}
+	t = l; l = r; r = t ^ F3(r, Km[11], Kr[11]);
+	t = l; l = r; r = t ^ F2(r, Km[10], Kr[10]);
+	t = l; l = r; r = t ^ F1(r, Km[9], Kr[9]);
+	t = l; l = r; r = t ^ F3(r, Km[8], Kr[8]);
+	t = l; l = r; r = t ^ F2(r, Km[7], Kr[7]);
+	t = l; l = r; r = t ^ F1(r, Km[6], Kr[6]);
+	t = l; l = r; r = t ^ F3(r, Km[5], Kr[5]);
+	t = l; l = r; r = t ^ F2(r, Km[4], Kr[4]);
+	t = l; l = r; r = t ^ F1(r, Km[3], Kr[3]);
+	t = l; l = r; r = t ^ F3(r, Km[2], Kr[2]);
+	t = l; l = r; r = t ^ F2(r, Km[1], Kr[1]);
+	t = l; l = r; r = t ^ F1(r, Km[0], Kr[0]);
 
 	dst[0] = cpu_to_be32(r);
 	dst[1] = cpu_to_be32(l);
diff --git a/crypto/crypto_wq.c b/crypto/crypto_wq.c
index fdcf624..b980ee1 100644
--- a/crypto/crypto_wq.c
+++ b/crypto/crypto_wq.c
@@ -20,7 +20,8 @@
 
 static int __init crypto_wq_init(void)
 {
-	kcrypto_wq = create_workqueue("crypto");
+	kcrypto_wq = alloc_workqueue("crypto",
+				     WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE, 1);
 	if (unlikely(!kcrypto_wq))
 		return -ENOMEM;
 	return 0;
diff --git a/crypto/deflate.c b/crypto/deflate.c
index 463dc85..cbc7a33 100644
--- a/crypto/deflate.c
+++ b/crypto/deflate.c
@@ -48,12 +48,11 @@
 	int ret = 0;
 	struct z_stream_s *stream = &ctx->comp_stream;
 
-	stream->workspace = vmalloc(zlib_deflate_workspacesize());
+	stream->workspace = vzalloc(zlib_deflate_workspacesize());
 	if (!stream->workspace) {
 		ret = -ENOMEM;
 		goto out;
 	}
-	memset(stream->workspace, 0, zlib_deflate_workspacesize());
 	ret = zlib_deflateInit2(stream, DEFLATE_DEF_LEVEL, Z_DEFLATED,
 	                        -DEFLATE_DEF_WINBITS, DEFLATE_DEF_MEMLEVEL,
 	                        Z_DEFAULT_STRATEGY);
diff --git a/crypto/eseqiv.c b/crypto/eseqiv.c
index 3ca3b66..42ce9f5 100644
--- a/crypto/eseqiv.c
+++ b/crypto/eseqiv.c
@@ -62,20 +62,6 @@
 	skcipher_givcrypt_complete(req, err);
 }
 
-static void eseqiv_chain(struct scatterlist *head, struct scatterlist *sg,
-			 int chain)
-{
-	if (chain) {
-		head->length += sg->length;
-		sg = scatterwalk_sg_next(sg);
-	}
-
-	if (sg)
-		scatterwalk_sg_chain(head, 2, sg);
-	else
-		sg_mark_end(head);
-}
-
 static int eseqiv_givencrypt(struct skcipher_givcrypt_request *req)
 {
 	struct crypto_ablkcipher *geniv = skcipher_givcrypt_reqtfm(req);
@@ -124,13 +110,13 @@
 
 	sg_init_table(reqctx->src, 2);
 	sg_set_buf(reqctx->src, giv, ivsize);
-	eseqiv_chain(reqctx->src, osrc, vsrc == giv + ivsize);
+	scatterwalk_crypto_chain(reqctx->src, osrc, vsrc == giv + ivsize, 2);
 
 	dst = reqctx->src;
 	if (osrc != odst) {
 		sg_init_table(reqctx->dst, 2);
 		sg_set_buf(reqctx->dst, giv, ivsize);
-		eseqiv_chain(reqctx->dst, odst, vdst == giv + ivsize);
+		scatterwalk_crypto_chain(reqctx->dst, odst, vdst == giv + ivsize, 2);
 
 		dst = reqctx->dst;
 	}
diff --git a/crypto/gcm.c b/crypto/gcm.c
index 2f5fbba..1a25263 100644
--- a/crypto/gcm.c
+++ b/crypto/gcm.c
@@ -1102,21 +1102,6 @@
 	return crypto_aead_setauthsize(ctx->child, authsize);
 }
 
-/* this is the same as crypto_authenc_chain */
-static void crypto_rfc4543_chain(struct scatterlist *head,
-				 struct scatterlist *sg, int chain)
-{
-	if (chain) {
-		head->length += sg->length;
-		sg = scatterwalk_sg_next(sg);
-	}
-
-	if (sg)
-		scatterwalk_sg_chain(head, 2, sg);
-	else
-		sg_mark_end(head);
-}
-
 static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req,
 						 int enc)
 {
@@ -1154,13 +1139,13 @@
 
 	sg_init_table(payload, 2);
 	sg_set_buf(payload, req->iv, 8);
-	crypto_rfc4543_chain(payload, dst, vdst == req->iv + 8);
+	scatterwalk_crypto_chain(payload, dst, vdst == req->iv + 8, 2);
 	assoclen += 8 + req->cryptlen - (enc ? 0 : authsize);
 
 	sg_init_table(assoc, 2);
 	sg_set_page(assoc, sg_page(req->assoc), req->assoc->length,
 		    req->assoc->offset);
-	crypto_rfc4543_chain(assoc, payload, 0);
+	scatterwalk_crypto_chain(assoc, payload, 0, 2);
 
 	aead_request_set_tfm(subreq, ctx->child);
 	aead_request_set_callback(subreq, req->base.flags, req->base.complete,
diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index 75586f1..29a89da 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -455,7 +455,8 @@
 
 	get_online_cpus();
 
-	pcrypt->wq = create_workqueue(name);
+	pcrypt->wq = alloc_workqueue(name,
+				     WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE, 1);
 	if (!pcrypt->wq)
 		goto err;
 
diff --git a/crypto/rmd128.c b/crypto/rmd128.c
index 1ceb673..8a0f68b 100644
--- a/crypto/rmd128.c
+++ b/crypto/rmd128.c
@@ -5,7 +5,7 @@
  *
  * Based on the reference implementation by Antoon Bosselaers, ESAT-COSIC
  *
- * Copyright (c) 2008 Adrian-Ken Rueegsegger <rueegsegger (at) swiss-it.ch>
+ * Copyright (c) 2008 Adrian-Ken Rueegsegger <ken@codelabs.ch>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -325,4 +325,5 @@
 module_exit(rmd128_mod_fini);
 
 MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Adrian-Ken Rueegsegger <ken@codelabs.ch>");
 MODULE_DESCRIPTION("RIPEMD-128 Message Digest");
diff --git a/crypto/rmd160.c b/crypto/rmd160.c
index 472261fc..525d7bb 100644
--- a/crypto/rmd160.c
+++ b/crypto/rmd160.c
@@ -5,7 +5,7 @@
  *
  * Based on the reference implementation by Antoon Bosselaers, ESAT-COSIC
  *
- * Copyright (c) 2008 Adrian-Ken Rueegsegger <rueegsegger (at) swiss-it.ch>
+ * Copyright (c) 2008 Adrian-Ken Rueegsegger <ken@codelabs.ch>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -369,4 +369,5 @@
 module_exit(rmd160_mod_fini);
 
 MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Adrian-Ken Rueegsegger <ken@codelabs.ch>");
 MODULE_DESCRIPTION("RIPEMD-160 Message Digest");
diff --git a/crypto/rmd256.c b/crypto/rmd256.c
index 72eafa8..69293d9 100644
--- a/crypto/rmd256.c
+++ b/crypto/rmd256.c
@@ -5,7 +5,7 @@
  *
  * Based on the reference implementation by Antoon Bosselaers, ESAT-COSIC
  *
- * Copyright (c) 2008 Adrian-Ken Rueegsegger <rueegsegger (at) swiss-it.ch>
+ * Copyright (c) 2008 Adrian-Ken Rueegsegger <ken@codelabs.ch>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -344,4 +344,5 @@
 module_exit(rmd256_mod_fini);
 
 MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Adrian-Ken Rueegsegger <ken@codelabs.ch>");
 MODULE_DESCRIPTION("RIPEMD-256 Message Digest");
diff --git a/crypto/rmd320.c b/crypto/rmd320.c
index 86becab..09f97df 100644
--- a/crypto/rmd320.c
+++ b/crypto/rmd320.c
@@ -5,7 +5,7 @@
  *
  * Based on the reference implementation by Antoon Bosselaers, ESAT-COSIC
  *
- * Copyright (c) 2008 Adrian-Ken Rueegsegger <rueegsegger (at) swiss-it.ch>
+ * Copyright (c) 2008 Adrian-Ken Rueegsegger <ken@codelabs.ch>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -393,4 +393,5 @@
 module_exit(rmd320_mod_fini);
 
 MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Adrian-Ken Rueegsegger <ken@codelabs.ch>");
 MODULE_DESCRIPTION("RIPEMD-320 Message Digest");
diff --git a/crypto/shash.c b/crypto/shash.c
index 22fd943..76f74b9 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -310,7 +310,13 @@
 
 static int shash_async_import(struct ahash_request *req, const void *in)
 {
-	return crypto_shash_import(ahash_request_ctx(req), in);
+	struct crypto_shash **ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
+	struct shash_desc *desc = ahash_request_ctx(req);
+
+	desc->tfm = *ctx;
+	desc->flags = req->base.flags;
+
+	return crypto_shash_import(desc, in);
 }
 
 static void crypto_exit_shash_ops_async(struct crypto_tfm *tfm)
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 3ca68f9..9aac5e5 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -8,6 +8,13 @@
  * Copyright (c) 2002 Jean-Francois Dive <jef@linuxbe.org>
  * Copyright (c) 2007 Nokia Siemens Networks
  *
+ * Updated RFC4106 AES-GCM testing.
+ *    Authors: Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *             Adrian Hoban <adrian.hoban@intel.com>
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Copyright (c) 2010, Intel Corporation.
+ *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
  * Software Foundation; either version 2 of the License, or (at your option)
@@ -980,6 +987,10 @@
 		ret += tcrypt_test("ansi_cprng");
 		break;
 
+	case 151:
+		ret += tcrypt_test("rfc4106(gcm(aes))");
+		break;
+
 	case 200:
 		test_cipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0,
 				speed_template_16_24_32);
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index fa8c8f7..27ea9fe 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -6,6 +6,13 @@
  * Copyright (c) 2007 Nokia Siemens Networks
  * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
  *
+ * Updated RFC4106 AES-GCM testing.
+ *    Authors: Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *             Adrian Hoban <adrian.hoban@intel.com>
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
+ *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
  * Software Foundation; either version 2 of the License, or (at your option)
@@ -2242,6 +2249,23 @@
 			}
 		}
 	}, {
+		.alg = "rfc4106(gcm(aes))",
+		.test = alg_test_aead,
+		.suite = {
+			.aead = {
+				.enc = {
+					.vecs = aes_gcm_rfc4106_enc_tv_template,
+					.count = AES_GCM_4106_ENC_TEST_VECTORS
+				},
+				.dec = {
+					.vecs = aes_gcm_rfc4106_dec_tv_template,
+					.count = AES_GCM_4106_DEC_TEST_VECTORS
+				}
+			}
+		}
+	}, {
+
+
 		.alg = "rfc4309(ccm(aes))",
 		.test = alg_test_aead,
 		.fips_allowed = 1,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 74e3537..834af7f 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -6,6 +6,15 @@
  * Copyright (c) 2007 Nokia Siemens Networks
  * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
  *
+ * Updated RFC4106 AES-GCM testing. Some test vectors were taken from
+ * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/
+ * gcm/gcm-test-vectors.tar.gz
+ *     Authors: Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *              Adrian Hoban <adrian.hoban@intel.com>
+ *              Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *              Tadeusz Struk (tadeusz.struk@intel.com)
+ *     Copyright (c) 2010, Intel Corporation.
+ *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
  * Software Foundation; either version 2 of the License, or (at your option)
@@ -2947,6 +2956,8 @@
 #define AES_CTR_3686_DEC_TEST_VECTORS 6
 #define AES_GCM_ENC_TEST_VECTORS 9
 #define AES_GCM_DEC_TEST_VECTORS 8
+#define AES_GCM_4106_ENC_TEST_VECTORS 7
+#define AES_GCM_4106_DEC_TEST_VECTORS 7
 #define AES_CCM_ENC_TEST_VECTORS 7
 #define AES_CCM_DEC_TEST_VECTORS 7
 #define AES_CCM_4309_ENC_TEST_VECTORS 7
@@ -5829,6 +5840,356 @@
 	}
 };
 
+static struct aead_testvec aes_gcm_rfc4106_enc_tv_template[] = {
+        { /* Generated using Crypto++ */
+		.key    = zeroed_string,
+		.klen	= 20,
+                .iv     = zeroed_string,
+                .input  = zeroed_string,
+                .ilen   = 16,
+                .assoc  = zeroed_string,
+                .alen   = 8,
+		.result	= "\x03\x88\xDA\xCE\x60\xB6\xA3\x92"
+                          "\xF3\x28\xC2\xB9\x71\xB2\xFE\x78"
+                          "\x97\xFE\x4C\x23\x37\x42\x01\xE0"
+                          "\x81\x9F\x8D\xC5\xD7\x41\xA0\x1B",
+		.rlen	= 32,
+        },{
+		.key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c"
+			  "\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                          "\x00\x00\x00\x00",
+		.klen	= 20,
+                .iv     = "\x00\x00\x00\x00\x00\x00\x00\x01"
+                          "\x00\x00\x00\x00",
+                .input  = zeroed_string,
+                .ilen   = 16,
+                .assoc  = zeroed_string,
+                .alen   = 8,
+		.result	= "\xC0\x0D\x8B\x42\x0F\x8F\x34\x18"
+                          "\x88\xB1\xC5\xBC\xC5\xB6\xD6\x28"
+                          "\x6A\x9D\xDF\x11\x5E\xFE\x5E\x9D"
+                          "\x2F\x70\x44\x92\xF7\xF2\xE3\xEF",
+		.rlen	= 32,
+
+        }, {
+		.key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c"
+			  "\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                          "\x00\x00\x00\x00",
+		.klen	= 20,
+                .iv     = zeroed_string,
+                .input  = "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01",
+                .ilen   = 16,
+                .assoc  = zeroed_string,
+                .alen   = 8,
+		.result	= "\x4B\xB1\xB5\xE3\x25\x71\x70\xDE"
+                          "\x7F\xC9\x9C\xA5\x14\x19\xF2\xAC"
+                          "\x0B\x8F\x88\x69\x17\xE6\xB4\x3C"
+                          "\xB1\x68\xFD\x14\x52\x64\x61\xB2",
+		.rlen	= 32,
+        }, {
+		.key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c"
+			  "\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                          "\x00\x00\x00\x00",
+		.klen	= 20,
+                .iv     = zeroed_string,
+                .input  = "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01",
+                .ilen   = 16,
+                .assoc  = "\x01\x01\x01\x01\x01\x01\x01\x01",
+                .alen   = 8,
+		.result	= "\x4B\xB1\xB5\xE3\x25\x71\x70\xDE"
+                          "\x7F\xC9\x9C\xA5\x14\x19\xF2\xAC"
+                          "\x90\x92\xB7\xE3\x5F\xA3\x9A\x63"
+                          "\x7E\xD7\x1F\xD8\xD3\x7C\x4B\xF5",
+		.rlen	= 32,
+        }, {
+		.key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c"
+			  "\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                          "\x00\x00\x00\x00",
+		.klen	= 20,
+                .iv     = "\x00\x00\x00\x00\x00\x00\x00\x01"
+                          "\x00\x00\x00\x00",
+                .input  = "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01",
+                .ilen   = 16,
+                .assoc  = "\x01\x01\x01\x01\x01\x01\x01\x01",
+                .alen   = 8,
+		.result	= "\xC1\x0C\x8A\x43\x0E\x8E\x35\x19"
+                          "\x89\xB0\xC4\xBD\xC4\xB7\xD7\x29"
+                          "\x64\x50\xF9\x32\x13\xFB\x74\x61"
+                          "\xF4\xED\x52\xD3\xC5\x10\x55\x3C",
+		.rlen	= 32,
+        }, {
+		.key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c"
+			  "\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                          "\x00\x00\x00\x00",
+		.klen	= 20,
+                .iv     = "\x00\x00\x00\x00\x00\x00\x00\x01"
+                          "\x00\x00\x00\x00",
+                .input  = "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01",
+                .ilen   = 64,
+                .assoc  = "\x01\x01\x01\x01\x01\x01\x01\x01",
+                .alen   = 8,
+		.result	= "\xC1\x0C\x8A\x43\x0E\x8E\x35\x19"
+                          "\x89\xB0\xC4\xBD\xC4\xB7\xD7\x29"
+                          "\x98\x14\xA1\x42\x37\x80\xFD\x90"
+                          "\x68\x12\x01\xA8\x91\x89\xB9\x83"
+                          "\x5B\x11\x77\x12\x9B\xFF\x24\x89"
+                          "\x94\x5F\x18\x12\xBA\x27\x09\x39"
+                          "\x99\x96\x76\x42\x15\x1C\xCD\xCB"
+                          "\xDC\xD3\xDA\x65\x73\xAF\x80\xCD"
+                          "\xD2\xB6\xC2\x4A\x76\xC2\x92\x85"
+                          "\xBD\xCF\x62\x98\x58\x14\xE5\xBD",
+		.rlen	= 80,
+        }, {
+		.key    = "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+                          "\x00\x00\x00\x00",
+		.klen	= 20,
+                .iv     = "\x00\x00\x45\x67\x89\xab\xcd\xef"
+                          "\x00\x00\x00\x00",
+                .input  = "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff",
+                .ilen   = 192,
+                .assoc  = "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+                          "\xaa\xaa\xaa\xaa",
+                .alen   = 12,
+		.result	= "\xC1\x76\x33\x85\xE2\x9B\x5F\xDE"
+			  "\xDE\x89\x3D\x42\xE7\xC9\x69\x8A"
+			  "\x44\x6D\xC3\x88\x46\x2E\xC2\x01"
+			  "\x5E\xF6\x0C\x39\xF0\xC4\xA5\x82"
+			  "\xCD\xE8\x31\xCC\x0A\x4C\xE4\x44"
+			  "\x41\xA9\x82\x6F\x22\xA1\x23\x1A"
+			  "\xA8\xE3\x16\xFD\x31\x5C\x27\x31"
+			  "\xF1\x7F\x01\x63\xA3\xAF\x70\xA1"
+			  "\xCF\x07\x57\x41\x67\xD0\xC4\x42"
+			  "\xDB\x18\xC6\x4C\x4C\xE0\x3D\x9F"
+			  "\x05\x07\xFB\x13\x7D\x4A\xCA\x5B"
+			  "\xF0\xBF\x64\x7E\x05\xB1\x72\xEE"
+			  "\x7C\x3B\xD4\xCD\x14\x03\xB2\x2C"
+			  "\xD3\xA9\xEE\xFA\x17\xFC\x9C\xDF"
+			  "\xC7\x75\x40\xFF\xAE\xAD\x1E\x59"
+			  "\x2F\x30\x24\xFB\xAD\x6B\x10\xFA"
+			  "\x6C\x9F\x5B\xE7\x25\xD5\xD0\x25"
+			  "\xAC\x4A\x4B\xDA\xFC\x7A\x85\x1B"
+			  "\x7E\x13\x06\x82\x08\x17\xA4\x35"
+			  "\xEC\xC5\x8D\x63\x96\x81\x0A\x8F"
+			  "\xA3\x05\x38\x95\x20\x1A\x47\x04"
+			  "\x6F\x6D\xDA\x8F\xEF\xC1\x76\x35"
+			  "\x6B\xC7\x4D\x0F\x94\x12\xCA\x3E"
+			  "\x2E\xD5\x03\x2E\x86\x7E\xAA\x3B"
+			  "\x37\x08\x1C\xCF\xBA\x5D\x71\x46"
+			  "\x80\x72\xB0\x4C\x82\x0D\x60\x3C",
+		.rlen	= 208,
+	}
+};
+
+static struct aead_testvec aes_gcm_rfc4106_dec_tv_template[] = {
+        { /* Generated using Crypto++ */
+		.key    = zeroed_string,
+		.klen	= 20,
+                .iv     = zeroed_string,
+		.input	= "\x03\x88\xDA\xCE\x60\xB6\xA3\x92"
+                          "\xF3\x28\xC2\xB9\x71\xB2\xFE\x78"
+                          "\x97\xFE\x4C\x23\x37\x42\x01\xE0"
+                          "\x81\x9F\x8D\xC5\xD7\x41\xA0\x1B",
+		.ilen	= 32,
+                .assoc  = zeroed_string,
+                .alen   = 8,
+                .result = zeroed_string,
+                .rlen   = 16,
+
+        },{
+		.key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c"
+			  "\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                          "\x00\x00\x00\x00",
+		.klen	= 20,
+                .iv     = "\x00\x00\x00\x00\x00\x00\x00\x01"
+                          "\x00\x00\x00\x00",
+		.input	= "\xC0\x0D\x8B\x42\x0F\x8F\x34\x18"
+                          "\x88\xB1\xC5\xBC\xC5\xB6\xD6\x28"
+                          "\x6A\x9D\xDF\x11\x5E\xFE\x5E\x9D"
+                          "\x2F\x70\x44\x92\xF7\xF2\xE3\xEF",
+		.ilen	= 32,
+                .assoc  = zeroed_string,
+                .alen   = 8,
+                .result = zeroed_string,
+                .rlen   = 16,
+        }, {
+		.key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c"
+			  "\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                          "\x00\x00\x00\x00",
+		.klen	= 20,
+                .iv     = zeroed_string,
+		.input	= "\x4B\xB1\xB5\xE3\x25\x71\x70\xDE"
+                          "\x7F\xC9\x9C\xA5\x14\x19\xF2\xAC"
+                          "\x0B\x8F\x88\x69\x17\xE6\xB4\x3C"
+                          "\xB1\x68\xFD\x14\x52\x64\x61\xB2",
+		.ilen	= 32,
+                .assoc  = zeroed_string,
+                .alen   = 8,
+                .result = "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01",
+                .rlen   = 16,
+        }, {
+		.key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c"
+			  "\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                          "\x00\x00\x00\x00",
+		.klen	= 20,
+                .iv     = zeroed_string,
+		.input	= "\x4B\xB1\xB5\xE3\x25\x71\x70\xDE"
+                          "\x7F\xC9\x9C\xA5\x14\x19\xF2\xAC"
+                          "\x90\x92\xB7\xE3\x5F\xA3\x9A\x63"
+                          "\x7E\xD7\x1F\xD8\xD3\x7C\x4B\xF5",
+		.ilen	= 32,
+                .assoc  = "\x01\x01\x01\x01\x01\x01\x01\x01",
+                .alen   = 8,
+                .result = "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01",
+                .rlen   = 16,
+
+        }, {
+		.key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c"
+			  "\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                          "\x00\x00\x00\x00",
+		.klen	= 20,
+                .iv     = "\x00\x00\x00\x00\x00\x00\x00\x01"
+                          "\x00\x00\x00\x00",
+		.input	= "\xC1\x0C\x8A\x43\x0E\x8E\x35\x19"
+                          "\x89\xB0\xC4\xBD\xC4\xB7\xD7\x29"
+                          "\x64\x50\xF9\x32\x13\xFB\x74\x61"
+                          "\xF4\xED\x52\xD3\xC5\x10\x55\x3C",
+		.ilen	= 32,
+                .assoc  = "\x01\x01\x01\x01\x01\x01\x01\x01",
+                .alen   = 8,
+                .result = "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01",
+                .rlen   = 16,
+        }, {
+		.key    = "\xfe\xff\xe9\x92\x86\x65\x73\x1c"
+			  "\x6d\x6a\x8f\x94\x67\x30\x83\x08"
+                          "\x00\x00\x00\x00",
+		.klen	= 20,
+                .iv     = "\x00\x00\x00\x00\x00\x00\x00\x01"
+                          "\x00\x00\x00\x00",
+		.input	= "\xC1\x0C\x8A\x43\x0E\x8E\x35\x19"
+                          "\x89\xB0\xC4\xBD\xC4\xB7\xD7\x29"
+                          "\x98\x14\xA1\x42\x37\x80\xFD\x90"
+                          "\x68\x12\x01\xA8\x91\x89\xB9\x83"
+                          "\x5B\x11\x77\x12\x9B\xFF\x24\x89"
+                          "\x94\x5F\x18\x12\xBA\x27\x09\x39"
+                          "\x99\x96\x76\x42\x15\x1C\xCD\xCB"
+                          "\xDC\xD3\xDA\x65\x73\xAF\x80\xCD"
+                          "\xD2\xB6\xC2\x4A\x76\xC2\x92\x85"
+                          "\xBD\xCF\x62\x98\x58\x14\xE5\xBD",
+		.ilen	= 80,
+                .assoc  = "\x01\x01\x01\x01\x01\x01\x01\x01",
+                .alen   = 8,
+                .result = "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01"
+                          "\x01\x01\x01\x01\x01\x01\x01\x01",
+                .rlen   = 64,
+        }, {
+		.key    = "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+                          "\x00\x00\x00\x00",
+		.klen	= 20,
+                .iv     = "\x00\x00\x45\x67\x89\xab\xcd\xef"
+                          "\x00\x00\x00\x00",
+		.input	= "\xC1\x76\x33\x85\xE2\x9B\x5F\xDE"
+			  "\xDE\x89\x3D\x42\xE7\xC9\x69\x8A"
+			  "\x44\x6D\xC3\x88\x46\x2E\xC2\x01"
+			  "\x5E\xF6\x0C\x39\xF0\xC4\xA5\x82"
+			  "\xCD\xE8\x31\xCC\x0A\x4C\xE4\x44"
+			  "\x41\xA9\x82\x6F\x22\xA1\x23\x1A"
+			  "\xA8\xE3\x16\xFD\x31\x5C\x27\x31"
+			  "\xF1\x7F\x01\x63\xA3\xAF\x70\xA1"
+			  "\xCF\x07\x57\x41\x67\xD0\xC4\x42"
+			  "\xDB\x18\xC6\x4C\x4C\xE0\x3D\x9F"
+			  "\x05\x07\xFB\x13\x7D\x4A\xCA\x5B"
+			  "\xF0\xBF\x64\x7E\x05\xB1\x72\xEE"
+			  "\x7C\x3B\xD4\xCD\x14\x03\xB2\x2C"
+			  "\xD3\xA9\xEE\xFA\x17\xFC\x9C\xDF"
+			  "\xC7\x75\x40\xFF\xAE\xAD\x1E\x59"
+			  "\x2F\x30\x24\xFB\xAD\x6B\x10\xFA"
+			  "\x6C\x9F\x5B\xE7\x25\xD5\xD0\x25"
+			  "\xAC\x4A\x4B\xDA\xFC\x7A\x85\x1B"
+			  "\x7E\x13\x06\x82\x08\x17\xA4\x35"
+			  "\xEC\xC5\x8D\x63\x96\x81\x0A\x8F"
+			  "\xA3\x05\x38\x95\x20\x1A\x47\x04"
+			  "\x6F\x6D\xDA\x8F\xEF\xC1\x76\x35"
+			  "\x6B\xC7\x4D\x0F\x94\x12\xCA\x3E"
+			  "\x2E\xD5\x03\x2E\x86\x7E\xAA\x3B"
+			  "\x37\x08\x1C\xCF\xBA\x5D\x71\x46"
+			  "\x80\x72\xB0\x4C\x82\x0D\x60\x3C",
+		.ilen	= 208,
+                .assoc  = "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+                          "\xaa\xaa\xaa\xaa",
+                .alen   = 12,
+                .result = "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff"
+                          "\xff\xff\xff\xff\xff\xff\xff\xff",
+                .rlen   = 192,
+
+	}
+};
+
 static struct aead_testvec aes_ccm_enc_tv_template[] = {
 	{ /* From RFC 3610 */
 		.key	= "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7"
diff --git a/crypto/zlib.c b/crypto/zlib.c
index c3015733..739b8fc 100644
--- a/crypto/zlib.c
+++ b/crypto/zlib.c
@@ -95,11 +95,10 @@
 	zlib_comp_exit(ctx);
 
 	workspacesize = zlib_deflate_workspacesize();
-	stream->workspace = vmalloc(workspacesize);
+	stream->workspace = vzalloc(workspacesize);
 	if (!stream->workspace)
 		return -ENOMEM;
 
-	memset(stream->workspace, 0, workspacesize);
 	ret = zlib_deflateInit2(stream,
 				tb[ZLIB_COMP_LEVEL]
 					? nla_get_u32(tb[ZLIB_COMP_LEVEL])
diff --git a/drivers/char/hw_random/via-rng.c b/drivers/char/hw_random/via-rng.c
index 794aacb..d0387a8 100644
--- a/drivers/char/hw_random/via-rng.c
+++ b/drivers/char/hw_random/via-rng.c
@@ -24,6 +24,7 @@
  * warranty of any kind, whether express or implied.
  */
 
+#include <crypto/padlock.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/hw_random.h>
@@ -34,7 +35,6 @@
 #include <asm/i387.h>
 
 
-#define PFX	KBUILD_MODNAME ": "
 
 
 enum {
@@ -81,8 +81,7 @@
 	ts_state = irq_ts_save();
 
 	asm(".byte 0x0F,0xA7,0xC0 /* xstore %%edi (addr=%0) */"
-		:"=m"(*addr), "=a"(eax_out)
-		:"D"(addr), "d"(edx_in));
+		: "=m" (*addr), "=a" (eax_out), "+d" (edx_in), "+D" (addr));
 
 	irq_ts_restore(ts_state);
 	return eax_out;
@@ -90,8 +89,10 @@
 
 static int via_rng_data_present(struct hwrng *rng, int wait)
 {
+	char buf[16 + PADLOCK_ALIGNMENT - STACK_ALIGN] __attribute__
+		((aligned(STACK_ALIGN)));
+	u32 *via_rng_datum = (u32 *)PTR_ALIGN(&buf[0], PADLOCK_ALIGNMENT);
 	u32 bytes_out;
-	u32 *via_rng_datum = (u32 *)(&rng->priv);
 	int i;
 
 	/* We choose the recommended 1-byte-per-instruction RNG rate,
@@ -115,6 +116,7 @@
 			break;
 		udelay(10);
 	}
+	rng->priv = *via_rng_datum;
 	return bytes_out ? 1 : 0;
 }
 
diff --git a/drivers/crypto/mv_cesa.c b/drivers/crypto/mv_cesa.c
index 7d279e5..c99305a 100644
--- a/drivers/crypto/mv_cesa.c
+++ b/drivers/crypto/mv_cesa.c
@@ -857,7 +857,7 @@
 			printk(KERN_WARNING MV_CESA
 			       "Base driver '%s' could not be loaded!\n",
 			       base_hash_name);
-			err = PTR_ERR(fallback_tfm);
+			err = PTR_ERR(base_hash);
 			goto err_bad_base;
 		}
 	}
diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c
index 76141262..80dc094 100644
--- a/drivers/crypto/n2_core.c
+++ b/drivers/crypto/n2_core.c
@@ -1542,7 +1542,7 @@
 	return err;
 }
 
-static void __exit n2_unregister_algs(void)
+static void __devexit n2_unregister_algs(void)
 {
 	mutex_lock(&spu_lock);
 	if (!--algs_registered)
diff --git a/drivers/crypto/omap-aes.c b/drivers/crypto/omap-aes.c
index 799ca51..add2a1a 100644
--- a/drivers/crypto/omap-aes.c
+++ b/drivers/crypto/omap-aes.c
@@ -74,11 +74,9 @@
 #define FLAGS_CBC		BIT(1)
 #define FLAGS_GIV		BIT(2)
 
-#define FLAGS_NEW_KEY		BIT(4)
-#define FLAGS_NEW_IV		BIT(5)
-#define FLAGS_INIT		BIT(6)
-#define FLAGS_FAST		BIT(7)
-#define FLAGS_BUSY		8
+#define FLAGS_INIT		BIT(4)
+#define FLAGS_FAST		BIT(5)
+#define FLAGS_BUSY		BIT(6)
 
 struct omap_aes_ctx {
 	struct omap_aes_dev *dd;
@@ -98,19 +96,18 @@
 struct omap_aes_dev {
 	struct list_head	list;
 	unsigned long		phys_base;
-	void __iomem 		*io_base;
+	void __iomem		*io_base;
 	struct clk		*iclk;
 	struct omap_aes_ctx	*ctx;
 	struct device		*dev;
 	unsigned long		flags;
+	int			err;
 
-	u32			*iv;
-	u32			ctrl;
+	spinlock_t		lock;
+	struct crypto_queue	queue;
 
-	spinlock_t			lock;
-	struct crypto_queue		queue;
-
-	struct tasklet_struct		task;
+	struct tasklet_struct	done_task;
+	struct tasklet_struct	queue_task;
 
 	struct ablkcipher_request	*req;
 	size_t				total;
@@ -179,9 +176,13 @@
 
 static int omap_aes_hw_init(struct omap_aes_dev *dd)
 {
-	int err = 0;
-
+	/*
+	 * clocks are enabled when request starts and disabled when finished.
+	 * It may be long delays between requests.
+	 * Device might go to off mode to save power.
+	 */
 	clk_enable(dd->iclk);
+
 	if (!(dd->flags & FLAGS_INIT)) {
 		/* is it necessary to reset before every operation? */
 		omap_aes_write_mask(dd, AES_REG_MASK, AES_REG_MASK_SOFTRESET,
@@ -193,39 +194,26 @@
 		__asm__ __volatile__("nop");
 		__asm__ __volatile__("nop");
 
-		err = omap_aes_wait(dd, AES_REG_SYSSTATUS,
-				AES_REG_SYSSTATUS_RESETDONE);
-		if (!err)
-			dd->flags |= FLAGS_INIT;
+		if (omap_aes_wait(dd, AES_REG_SYSSTATUS,
+				AES_REG_SYSSTATUS_RESETDONE))
+			return -ETIMEDOUT;
+
+		dd->flags |= FLAGS_INIT;
+		dd->err = 0;
 	}
 
-	return err;
+	return 0;
 }
 
-static void omap_aes_hw_cleanup(struct omap_aes_dev *dd)
-{
-	clk_disable(dd->iclk);
-}
-
-static void omap_aes_write_ctrl(struct omap_aes_dev *dd)
+static int omap_aes_write_ctrl(struct omap_aes_dev *dd)
 {
 	unsigned int key32;
-	int i;
+	int i, err;
 	u32 val, mask;
 
-	val = FLD_VAL(((dd->ctx->keylen >> 3) - 1), 4, 3);
-	if (dd->flags & FLAGS_CBC)
-		val |= AES_REG_CTRL_CBC;
-	if (dd->flags & FLAGS_ENCRYPT)
-		val |= AES_REG_CTRL_DIRECTION;
-
-	if (dd->ctrl == val && !(dd->flags & FLAGS_NEW_IV) &&
-		   !(dd->ctx->flags & FLAGS_NEW_KEY))
-		goto out;
-
-	/* only need to write control registers for new settings */
-
-	dd->ctrl = val;
+	err = omap_aes_hw_init(dd);
+	if (err)
+		return err;
 
 	val = 0;
 	if (dd->dma_lch_out >= 0)
@@ -237,30 +225,43 @@
 
 	omap_aes_write_mask(dd, AES_REG_MASK, val, mask);
 
-	pr_debug("Set key\n");
 	key32 = dd->ctx->keylen / sizeof(u32);
-	/* set a key */
+
+	/* it seems a key should always be set even if it has not changed */
 	for (i = 0; i < key32; i++) {
 		omap_aes_write(dd, AES_REG_KEY(i),
 			__le32_to_cpu(dd->ctx->key[i]));
 	}
-	dd->ctx->flags &= ~FLAGS_NEW_KEY;
 
-	if (dd->flags & FLAGS_NEW_IV) {
-		pr_debug("Set IV\n");
-		omap_aes_write_n(dd, AES_REG_IV(0), dd->iv, 4);
-		dd->flags &= ~FLAGS_NEW_IV;
-	}
+	if ((dd->flags & FLAGS_CBC) && dd->req->info)
+		omap_aes_write_n(dd, AES_REG_IV(0), dd->req->info, 4);
+
+	val = FLD_VAL(((dd->ctx->keylen >> 3) - 1), 4, 3);
+	if (dd->flags & FLAGS_CBC)
+		val |= AES_REG_CTRL_CBC;
+	if (dd->flags & FLAGS_ENCRYPT)
+		val |= AES_REG_CTRL_DIRECTION;
 
 	mask = AES_REG_CTRL_CBC | AES_REG_CTRL_DIRECTION |
 			AES_REG_CTRL_KEY_SIZE;
 
-	omap_aes_write_mask(dd, AES_REG_CTRL, dd->ctrl, mask);
+	omap_aes_write_mask(dd, AES_REG_CTRL, val, mask);
 
-out:
-	/* start DMA or disable idle mode */
-	omap_aes_write_mask(dd, AES_REG_MASK, AES_REG_MASK_START,
-			    AES_REG_MASK_START);
+	/* IN */
+	omap_set_dma_dest_params(dd->dma_lch_in, 0, OMAP_DMA_AMODE_CONSTANT,
+				 dd->phys_base + AES_REG_DATA, 0, 4);
+
+	omap_set_dma_dest_burst_mode(dd->dma_lch_in, OMAP_DMA_DATA_BURST_4);
+	omap_set_dma_src_burst_mode(dd->dma_lch_in, OMAP_DMA_DATA_BURST_4);
+
+	/* OUT */
+	omap_set_dma_src_params(dd->dma_lch_out, 0, OMAP_DMA_AMODE_CONSTANT,
+				dd->phys_base + AES_REG_DATA, 0, 4);
+
+	omap_set_dma_src_burst_mode(dd->dma_lch_out, OMAP_DMA_DATA_BURST_4);
+	omap_set_dma_dest_burst_mode(dd->dma_lch_out, OMAP_DMA_DATA_BURST_4);
+
+	return 0;
 }
 
 static struct omap_aes_dev *omap_aes_find_dev(struct omap_aes_ctx *ctx)
@@ -288,8 +289,16 @@
 {
 	struct omap_aes_dev *dd = data;
 
-	if (lch == dd->dma_lch_out)
-		tasklet_schedule(&dd->task);
+	if (ch_status != OMAP_DMA_BLOCK_IRQ) {
+		pr_err("omap-aes DMA error status: 0x%hx\n", ch_status);
+		dd->err = -EIO;
+		dd->flags &= ~FLAGS_INIT; /* request to re-initialize */
+	} else if (lch == dd->dma_lch_in) {
+		return;
+	}
+
+	/* dma_lch_out - completed */
+	tasklet_schedule(&dd->done_task);
 }
 
 static int omap_aes_dma_init(struct omap_aes_dev *dd)
@@ -339,18 +348,6 @@
 		goto err_dma_out;
 	}
 
-	omap_set_dma_dest_params(dd->dma_lch_in, 0, OMAP_DMA_AMODE_CONSTANT,
-				 dd->phys_base + AES_REG_DATA, 0, 4);
-
-	omap_set_dma_dest_burst_mode(dd->dma_lch_in, OMAP_DMA_DATA_BURST_4);
-	omap_set_dma_src_burst_mode(dd->dma_lch_in, OMAP_DMA_DATA_BURST_4);
-
-	omap_set_dma_src_params(dd->dma_lch_out, 0, OMAP_DMA_AMODE_CONSTANT,
-				dd->phys_base + AES_REG_DATA, 0, 4);
-
-	omap_set_dma_src_burst_mode(dd->dma_lch_out, OMAP_DMA_DATA_BURST_4);
-	omap_set_dma_dest_burst_mode(dd->dma_lch_out, OMAP_DMA_DATA_BURST_4);
-
 	return 0;
 
 err_dma_out:
@@ -406,6 +403,11 @@
 		if (!count)
 			return off;
 
+		/*
+		 * buflen and total are AES_BLOCK_SIZE size aligned,
+		 * so count should be also aligned
+		 */
+
 		sg_copy_buf(buf + off, *sg, *offset, count, out);
 
 		off += count;
@@ -461,7 +463,9 @@
 	omap_start_dma(dd->dma_lch_in);
 	omap_start_dma(dd->dma_lch_out);
 
-	omap_aes_write_ctrl(dd);
+	/* start DMA or disable idle mode */
+	omap_aes_write_mask(dd, AES_REG_MASK, AES_REG_MASK_START,
+			    AES_REG_MASK_START);
 
 	return 0;
 }
@@ -488,8 +492,10 @@
 		count = min(dd->total, sg_dma_len(dd->in_sg));
 		count = min(count, sg_dma_len(dd->out_sg));
 
-		if (count != dd->total)
+		if (count != dd->total) {
+			pr_err("request length != buffer length\n");
 			return -EINVAL;
+		}
 
 		pr_debug("fast\n");
 
@@ -525,23 +531,25 @@
 
 	dd->total -= count;
 
-	err = omap_aes_hw_init(dd);
-
 	err = omap_aes_crypt_dma(tfm, addr_in, addr_out, count);
+	if (err) {
+		dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
+		dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_TO_DEVICE);
+	}
 
 	return err;
 }
 
 static void omap_aes_finish_req(struct omap_aes_dev *dd, int err)
 {
-	struct omap_aes_ctx *ctx;
+	struct ablkcipher_request *req = dd->req;
 
 	pr_debug("err: %d\n", err);
 
-	ctx = crypto_ablkcipher_ctx(crypto_ablkcipher_reqtfm(dd->req));
+	clk_disable(dd->iclk);
+	dd->flags &= ~FLAGS_BUSY;
 
-	if (!dd->total)
-		dd->req->base.complete(&dd->req->base, err);
+	req->base.complete(&req->base, err);
 }
 
 static int omap_aes_crypt_dma_stop(struct omap_aes_dev *dd)
@@ -553,8 +561,6 @@
 
 	omap_aes_write_mask(dd, AES_REG_MASK, 0, AES_REG_MASK_START);
 
-	omap_aes_hw_cleanup(dd);
-
 	omap_stop_dma(dd->dma_lch_in);
 	omap_stop_dma(dd->dma_lch_out);
 
@@ -574,40 +580,39 @@
 		}
 	}
 
-	if (err || !dd->total)
-		omap_aes_finish_req(dd, err);
-
 	return err;
 }
 
-static int omap_aes_handle_req(struct omap_aes_dev *dd)
+static int omap_aes_handle_queue(struct omap_aes_dev *dd,
+			       struct ablkcipher_request *req)
 {
 	struct crypto_async_request *async_req, *backlog;
 	struct omap_aes_ctx *ctx;
 	struct omap_aes_reqctx *rctx;
-	struct ablkcipher_request *req;
 	unsigned long flags;
-
-	if (dd->total)
-		goto start;
+	int err, ret = 0;
 
 	spin_lock_irqsave(&dd->lock, flags);
+	if (req)
+		ret = ablkcipher_enqueue_request(&dd->queue, req);
+	if (dd->flags & FLAGS_BUSY) {
+		spin_unlock_irqrestore(&dd->lock, flags);
+		return ret;
+	}
 	backlog = crypto_get_backlog(&dd->queue);
 	async_req = crypto_dequeue_request(&dd->queue);
-	if (!async_req)
-		clear_bit(FLAGS_BUSY, &dd->flags);
+	if (async_req)
+		dd->flags |= FLAGS_BUSY;
 	spin_unlock_irqrestore(&dd->lock, flags);
 
 	if (!async_req)
-		return 0;
+		return ret;
 
 	if (backlog)
 		backlog->complete(backlog, -EINPROGRESS);
 
 	req = ablkcipher_request_cast(async_req);
 
-	pr_debug("get new req\n");
-
 	/* assign new request to device */
 	dd->req = req;
 	dd->total = req->nbytes;
@@ -621,27 +626,22 @@
 	rctx->mode &= FLAGS_MODE_MASK;
 	dd->flags = (dd->flags & ~FLAGS_MODE_MASK) | rctx->mode;
 
-	dd->iv = req->info;
-	if ((dd->flags & FLAGS_CBC) && dd->iv)
-		dd->flags |= FLAGS_NEW_IV;
-	else
-		dd->flags &= ~FLAGS_NEW_IV;
-
+	dd->ctx = ctx;
 	ctx->dd = dd;
-	if (dd->ctx != ctx) {
-		/* assign new context to device */
-		dd->ctx = ctx;
-		ctx->flags |= FLAGS_NEW_KEY;
+
+	err = omap_aes_write_ctrl(dd);
+	if (!err)
+		err = omap_aes_crypt_dma_start(dd);
+	if (err) {
+		/* aes_task will not finish it, so do it here */
+		omap_aes_finish_req(dd, err);
+		tasklet_schedule(&dd->queue_task);
 	}
 
-	if (!IS_ALIGNED(req->nbytes, AES_BLOCK_SIZE))
-		pr_err("request size is not exact amount of AES blocks\n");
-
-start:
-	return omap_aes_crypt_dma_start(dd);
+	return ret; /* return ret, which is enqueue return value */
 }
 
-static void omap_aes_task(unsigned long data)
+static void omap_aes_done_task(unsigned long data)
 {
 	struct omap_aes_dev *dd = (struct omap_aes_dev *)data;
 	int err;
@@ -650,40 +650,50 @@
 
 	err = omap_aes_crypt_dma_stop(dd);
 
-	err = omap_aes_handle_req(dd);
+	err = dd->err ? : err;
+
+	if (dd->total && !err) {
+		err = omap_aes_crypt_dma_start(dd);
+		if (!err)
+			return; /* DMA started. Not fininishing. */
+	}
+
+	omap_aes_finish_req(dd, err);
+	omap_aes_handle_queue(dd, NULL);
 
 	pr_debug("exit\n");
 }
 
+static void omap_aes_queue_task(unsigned long data)
+{
+	struct omap_aes_dev *dd = (struct omap_aes_dev *)data;
+
+	omap_aes_handle_queue(dd, NULL);
+}
+
 static int omap_aes_crypt(struct ablkcipher_request *req, unsigned long mode)
 {
 	struct omap_aes_ctx *ctx = crypto_ablkcipher_ctx(
 			crypto_ablkcipher_reqtfm(req));
 	struct omap_aes_reqctx *rctx = ablkcipher_request_ctx(req);
 	struct omap_aes_dev *dd;
-	unsigned long flags;
-	int err;
 
 	pr_debug("nbytes: %d, enc: %d, cbc: %d\n", req->nbytes,
 		  !!(mode & FLAGS_ENCRYPT),
 		  !!(mode & FLAGS_CBC));
 
+	if (!IS_ALIGNED(req->nbytes, AES_BLOCK_SIZE)) {
+		pr_err("request size is not exact amount of AES blocks\n");
+		return -EINVAL;
+	}
+
 	dd = omap_aes_find_dev(ctx);
 	if (!dd)
 		return -ENODEV;
 
 	rctx->mode = mode;
 
-	spin_lock_irqsave(&dd->lock, flags);
-	err = ablkcipher_enqueue_request(&dd->queue, req);
-	spin_unlock_irqrestore(&dd->lock, flags);
-
-	if (!test_and_set_bit(FLAGS_BUSY, &dd->flags))
-		omap_aes_handle_req(dd);
-
-	pr_debug("exit\n");
-
-	return err;
+	return omap_aes_handle_queue(dd, req);
 }
 
 /* ********************** ALG API ************************************ */
@@ -701,7 +711,6 @@
 
 	memcpy(ctx->key, key, keylen);
 	ctx->keylen = keylen;
-	ctx->flags |= FLAGS_NEW_KEY;
 
 	return 0;
 }
@@ -750,7 +759,7 @@
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct omap_aes_ctx),
-	.cra_alignmask	 	= 0,
+	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= omap_aes_cra_init,
@@ -770,7 +779,7 @@
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct omap_aes_ctx),
-	.cra_alignmask	 	= 0,
+	.cra_alignmask		= 0,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= omap_aes_cra_init,
@@ -849,7 +858,8 @@
 		 (reg & AES_REG_REV_MAJOR) >> 4, reg & AES_REG_REV_MINOR);
 	clk_disable(dd->iclk);
 
-	tasklet_init(&dd->task, omap_aes_task, (unsigned long)dd);
+	tasklet_init(&dd->done_task, omap_aes_done_task, (unsigned long)dd);
+	tasklet_init(&dd->queue_task, omap_aes_queue_task, (unsigned long)dd);
 
 	err = omap_aes_dma_init(dd);
 	if (err)
@@ -876,7 +886,8 @@
 		crypto_unregister_alg(&algs[j]);
 	omap_aes_dma_cleanup(dd);
 err_dma:
-	tasklet_kill(&dd->task);
+	tasklet_kill(&dd->done_task);
+	tasklet_kill(&dd->queue_task);
 	iounmap(dd->io_base);
 err_io:
 	clk_put(dd->iclk);
@@ -903,7 +914,8 @@
 	for (i = 0; i < ARRAY_SIZE(algs); i++)
 		crypto_unregister_alg(&algs[i]);
 
-	tasklet_kill(&dd->task);
+	tasklet_kill(&dd->done_task);
+	tasklet_kill(&dd->queue_task);
 	omap_aes_dma_cleanup(dd);
 	iounmap(dd->io_base);
 	clk_put(dd->iclk);
diff --git a/drivers/crypto/omap-sham.c b/drivers/crypto/omap-sham.c
index a081c7c..2e71123 100644
--- a/drivers/crypto/omap-sham.c
+++ b/drivers/crypto/omap-sham.c
@@ -72,10 +72,9 @@
 
 #define DEFAULT_TIMEOUT_INTERVAL	HZ
 
-#define FLAGS_FIRST		0x0001
 #define FLAGS_FINUP		0x0002
 #define FLAGS_FINAL		0x0004
-#define FLAGS_FAST		0x0008
+#define FLAGS_SG		0x0008
 #define FLAGS_SHA1		0x0010
 #define FLAGS_DMA_ACTIVE	0x0020
 #define FLAGS_OUTPUT_READY	0x0040
@@ -83,13 +82,17 @@
 #define FLAGS_INIT		0x0100
 #define FLAGS_CPU		0x0200
 #define FLAGS_HMAC		0x0400
-
-/* 3rd byte */
-#define FLAGS_BUSY		16
+#define FLAGS_ERROR		0x0800
+#define FLAGS_BUSY		0x1000
 
 #define OP_UPDATE	1
 #define OP_FINAL	2
 
+#define OMAP_ALIGN_MASK		(sizeof(u32)-1)
+#define OMAP_ALIGNED		__attribute__((aligned(sizeof(u32))))
+
+#define BUFLEN		PAGE_SIZE
+
 struct omap_sham_dev;
 
 struct omap_sham_reqctx {
@@ -97,8 +100,8 @@
 	unsigned long		flags;
 	unsigned long		op;
 
+	u8			digest[SHA1_DIGEST_SIZE] OMAP_ALIGNED;
 	size_t			digcnt;
-	u8			*buffer;
 	size_t			bufcnt;
 	size_t			buflen;
 	dma_addr_t		dma_addr;
@@ -107,6 +110,8 @@
 	struct scatterlist	*sg;
 	unsigned int		offset;	/* offset in current sg */
 	unsigned int		total;	/* total request */
+
+	u8			buffer[0] OMAP_ALIGNED;
 };
 
 struct omap_sham_hmac_ctx {
@@ -136,6 +141,7 @@
 	int			irq;
 	struct clk		*iclk;
 	spinlock_t		lock;
+	int			err;
 	int			dma;
 	int			dma_lch;
 	struct tasklet_struct	done_task;
@@ -194,53 +200,68 @@
 static void omap_sham_copy_hash(struct ahash_request *req, int out)
 {
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(req);
+	u32 *hash = (u32 *)ctx->digest;
+	int i;
+
+	/* MD5 is almost unused. So copy sha1 size to reduce code */
+	for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(u32); i++) {
+		if (out)
+			hash[i] = omap_sham_read(ctx->dd,
+						SHA_REG_DIGEST(i));
+		else
+			omap_sham_write(ctx->dd,
+					SHA_REG_DIGEST(i), hash[i]);
+	}
+}
+
+static void omap_sham_copy_ready_hash(struct ahash_request *req)
+{
+	struct omap_sham_reqctx *ctx = ahash_request_ctx(req);
+	u32 *in = (u32 *)ctx->digest;
 	u32 *hash = (u32 *)req->result;
 	int i;
 
+	if (!hash)
+		return;
+
 	if (likely(ctx->flags & FLAGS_SHA1)) {
 		/* SHA1 results are in big endian */
 		for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(u32); i++)
-			if (out)
-				hash[i] = be32_to_cpu(omap_sham_read(ctx->dd,
-							SHA_REG_DIGEST(i)));
-			else
-				omap_sham_write(ctx->dd, SHA_REG_DIGEST(i),
-							cpu_to_be32(hash[i]));
+			hash[i] = be32_to_cpu(in[i]);
 	} else {
 		/* MD5 results are in little endian */
 		for (i = 0; i < MD5_DIGEST_SIZE / sizeof(u32); i++)
-			if (out)
-				hash[i] = le32_to_cpu(omap_sham_read(ctx->dd,
-							SHA_REG_DIGEST(i)));
-			else
-				omap_sham_write(ctx->dd, SHA_REG_DIGEST(i),
-							cpu_to_le32(hash[i]));
+			hash[i] = le32_to_cpu(in[i]);
 	}
 }
 
-static int omap_sham_write_ctrl(struct omap_sham_dev *dd, size_t length,
+static int omap_sham_hw_init(struct omap_sham_dev *dd)
+{
+	clk_enable(dd->iclk);
+
+	if (!(dd->flags & FLAGS_INIT)) {
+		omap_sham_write_mask(dd, SHA_REG_MASK,
+			SHA_REG_MASK_SOFTRESET, SHA_REG_MASK_SOFTRESET);
+
+		if (omap_sham_wait(dd, SHA_REG_SYSSTATUS,
+					SHA_REG_SYSSTATUS_RESETDONE))
+			return -ETIMEDOUT;
+
+		dd->flags |= FLAGS_INIT;
+		dd->err = 0;
+	}
+
+	return 0;
+}
+
+static void omap_sham_write_ctrl(struct omap_sham_dev *dd, size_t length,
 				 int final, int dma)
 {
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(dd->req);
 	u32 val = length << 5, mask;
 
-	if (unlikely(!ctx->digcnt)) {
-
-		clk_enable(dd->iclk);
-
-		if (!(dd->flags & FLAGS_INIT)) {
-			omap_sham_write_mask(dd, SHA_REG_MASK,
-				SHA_REG_MASK_SOFTRESET, SHA_REG_MASK_SOFTRESET);
-
-			if (omap_sham_wait(dd, SHA_REG_SYSSTATUS,
-						SHA_REG_SYSSTATUS_RESETDONE))
-				return -ETIMEDOUT;
-
-			dd->flags |= FLAGS_INIT;
-		}
-	} else {
+	if (likely(ctx->digcnt))
 		omap_sham_write(dd, SHA_REG_DIGCNT, ctx->digcnt);
-	}
 
 	omap_sham_write_mask(dd, SHA_REG_MASK,
 		SHA_REG_MASK_IT_EN | (dma ? SHA_REG_MASK_DMA_EN : 0),
@@ -260,29 +281,26 @@
 			SHA_REG_CTRL_ALGO | SHA_REG_CTRL_LENGTH;
 
 	omap_sham_write_mask(dd, SHA_REG_CTRL, val, mask);
-
-	return 0;
 }
 
 static int omap_sham_xmit_cpu(struct omap_sham_dev *dd, const u8 *buf,
 			      size_t length, int final)
 {
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(dd->req);
-	int err, count, len32;
+	int count, len32;
 	const u32 *buffer = (const u32 *)buf;
 
 	dev_dbg(dd->dev, "xmit_cpu: digcnt: %d, length: %d, final: %d\n",
 						ctx->digcnt, length, final);
 
-	err = omap_sham_write_ctrl(dd, length, final, 0);
-	if (err)
-		return err;
+	omap_sham_write_ctrl(dd, length, final, 0);
+
+	/* should be non-zero before next lines to disable clocks later */
+	ctx->digcnt += length;
 
 	if (omap_sham_wait(dd, SHA_REG_CTRL, SHA_REG_CTRL_INPUT_READY))
 		return -ETIMEDOUT;
 
-	ctx->digcnt += length;
-
 	if (final)
 		ctx->flags |= FLAGS_FINAL; /* catch last interrupt */
 
@@ -298,16 +316,11 @@
 			      size_t length, int final)
 {
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(dd->req);
-	int err, len32;
+	int len32;
 
 	dev_dbg(dd->dev, "xmit_dma: digcnt: %d, length: %d, final: %d\n",
 						ctx->digcnt, length, final);
 
-	/* flush cache entries related to our page */
-	if (dma_addr == ctx->dma_addr)
-		dma_sync_single_for_device(dd->dev, dma_addr, length,
-					   DMA_TO_DEVICE);
-
 	len32 = DIV_ROUND_UP(length, sizeof(u32));
 
 	omap_set_dma_transfer_params(dd->dma_lch, OMAP_DMA_DATA_TYPE_S32, len32,
@@ -317,9 +330,7 @@
 	omap_set_dma_src_params(dd->dma_lch, 0, OMAP_DMA_AMODE_POST_INC,
 				dma_addr, 0, 0);
 
-	err = omap_sham_write_ctrl(dd, length, final, 1);
-	if (err)
-		return err;
+	omap_sham_write_ctrl(dd, length, final, 1);
 
 	ctx->digcnt += length;
 
@@ -371,15 +382,29 @@
 	return 0;
 }
 
+static int omap_sham_xmit_dma_map(struct omap_sham_dev *dd,
+					struct omap_sham_reqctx *ctx,
+					size_t length, int final)
+{
+	ctx->dma_addr = dma_map_single(dd->dev, ctx->buffer, ctx->buflen,
+				       DMA_TO_DEVICE);
+	if (dma_mapping_error(dd->dev, ctx->dma_addr)) {
+		dev_err(dd->dev, "dma %u bytes error\n", ctx->buflen);
+		return -EINVAL;
+	}
+
+	ctx->flags &= ~FLAGS_SG;
+
+	/* next call does not fail... so no unmap in the case of error */
+	return omap_sham_xmit_dma(dd, ctx->dma_addr, length, final);
+}
+
 static int omap_sham_update_dma_slow(struct omap_sham_dev *dd)
 {
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(dd->req);
 	unsigned int final;
 	size_t count;
 
-	if (!ctx->total)
-		return 0;
-
 	omap_sham_append_sg(ctx);
 
 	final = (ctx->flags & FLAGS_FINUP) && !ctx->total;
@@ -390,30 +415,68 @@
 	if (final || (ctx->bufcnt == ctx->buflen && ctx->total)) {
 		count = ctx->bufcnt;
 		ctx->bufcnt = 0;
-		return omap_sham_xmit_dma(dd, ctx->dma_addr, count, final);
+		return omap_sham_xmit_dma_map(dd, ctx, count, final);
 	}
 
 	return 0;
 }
 
-static int omap_sham_update_dma_fast(struct omap_sham_dev *dd)
+/* Start address alignment */
+#define SG_AA(sg)	(IS_ALIGNED(sg->offset, sizeof(u32)))
+/* SHA1 block size alignment */
+#define SG_SA(sg)	(IS_ALIGNED(sg->length, SHA1_MD5_BLOCK_SIZE))
+
+static int omap_sham_update_dma_start(struct omap_sham_dev *dd)
 {
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(dd->req);
-	unsigned int length;
+	unsigned int length, final, tail;
+	struct scatterlist *sg;
 
-	ctx->flags |= FLAGS_FAST;
+	if (!ctx->total)
+		return 0;
 
-	length = min(ctx->total, sg_dma_len(ctx->sg));
-	ctx->total = length;
+	if (ctx->bufcnt || ctx->offset)
+		return omap_sham_update_dma_slow(dd);
+
+	dev_dbg(dd->dev, "fast: digcnt: %d, bufcnt: %u, total: %u\n",
+			ctx->digcnt, ctx->bufcnt, ctx->total);
+
+	sg = ctx->sg;
+
+	if (!SG_AA(sg))
+		return omap_sham_update_dma_slow(dd);
+
+	if (!sg_is_last(sg) && !SG_SA(sg))
+		/* size is not SHA1_BLOCK_SIZE aligned */
+		return omap_sham_update_dma_slow(dd);
+
+	length = min(ctx->total, sg->length);
+
+	if (sg_is_last(sg)) {
+		if (!(ctx->flags & FLAGS_FINUP)) {
+			/* not last sg must be SHA1_MD5_BLOCK_SIZE aligned */
+			tail = length & (SHA1_MD5_BLOCK_SIZE - 1);
+			/* without finup() we need one block to close hash */
+			if (!tail)
+				tail = SHA1_MD5_BLOCK_SIZE;
+			length -= tail;
+		}
+	}
 
 	if (!dma_map_sg(dd->dev, ctx->sg, 1, DMA_TO_DEVICE)) {
 		dev_err(dd->dev, "dma_map_sg  error\n");
 		return -EINVAL;
 	}
 
-	ctx->total -= length;
+	ctx->flags |= FLAGS_SG;
 
-	return omap_sham_xmit_dma(dd, sg_dma_address(ctx->sg), length, 1);
+	ctx->total -= length;
+	ctx->offset = length; /* offset where to start slow */
+
+	final = (ctx->flags & FLAGS_FINUP) && !ctx->total;
+
+	/* next call does not fail... so no unmap in the case of error */
+	return omap_sham_xmit_dma(dd, sg_dma_address(ctx->sg), length, final);
 }
 
 static int omap_sham_update_cpu(struct omap_sham_dev *dd)
@@ -433,8 +496,17 @@
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(dd->req);
 
 	omap_stop_dma(dd->dma_lch);
-	if (ctx->flags & FLAGS_FAST)
+	if (ctx->flags & FLAGS_SG) {
 		dma_unmap_sg(dd->dev, ctx->sg, 1, DMA_TO_DEVICE);
+		if (ctx->sg->length == ctx->offset) {
+			ctx->sg = sg_next(ctx->sg);
+			if (ctx->sg)
+				ctx->offset = 0;
+		}
+	} else {
+		dma_unmap_single(dd->dev, ctx->dma_addr, ctx->buflen,
+				 DMA_TO_DEVICE);
+	}
 
 	return 0;
 }
@@ -454,14 +526,7 @@
 	spin_unlock_irqrestore(&dd->lock, flags);
 
 	if (ctx->digcnt)
-		clk_disable(dd->iclk);
-
-	if (ctx->dma_addr)
-		dma_unmap_single(dd->dev, ctx->dma_addr, ctx->buflen,
-				 DMA_TO_DEVICE);
-
-	if (ctx->buffer)
-		free_page((unsigned long)ctx->buffer);
+		omap_sham_copy_ready_hash(req);
 
 	dev_dbg(dd->dev, "digcnt: %d, bufcnt: %d\n", ctx->digcnt, ctx->bufcnt);
 }
@@ -489,8 +554,6 @@
 
 	ctx->flags = 0;
 
-	ctx->flags |= FLAGS_FIRST;
-
 	dev_dbg(dd->dev, "init: digest size: %d\n",
 		crypto_ahash_digestsize(tfm));
 
@@ -499,21 +562,7 @@
 
 	ctx->bufcnt = 0;
 	ctx->digcnt = 0;
-
-	ctx->buflen = PAGE_SIZE;
-	ctx->buffer = (void *)__get_free_page(
-				(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
-				GFP_KERNEL : GFP_ATOMIC);
-	if (!ctx->buffer)
-		return -ENOMEM;
-
-	ctx->dma_addr = dma_map_single(dd->dev, ctx->buffer, ctx->buflen,
-					DMA_TO_DEVICE);
-	if (dma_mapping_error(dd->dev, ctx->dma_addr)) {
-		dev_err(dd->dev, "dma %u bytes error\n", ctx->buflen);
-		free_page((unsigned long)ctx->buffer);
-		return -EINVAL;
-	}
+	ctx->buflen = BUFLEN;
 
 	if (tctx->flags & FLAGS_HMAC) {
 		struct omap_sham_hmac_ctx *bctx = tctx->base;
@@ -538,10 +587,8 @@
 
 	if (ctx->flags & FLAGS_CPU)
 		err = omap_sham_update_cpu(dd);
-	else if (ctx->flags & FLAGS_FAST)
-		err = omap_sham_update_dma_fast(dd);
 	else
-		err = omap_sham_update_dma_slow(dd);
+		err = omap_sham_update_dma_start(dd);
 
 	/* wait for dma completion before can take more data */
 	dev_dbg(dd->dev, "update: err: %d, digcnt: %d\n", err, ctx->digcnt);
@@ -560,15 +607,12 @@
 		use_dma = 0;
 
 	if (use_dma)
-		err = omap_sham_xmit_dma(dd, ctx->dma_addr, ctx->bufcnt, 1);
+		err = omap_sham_xmit_dma_map(dd, ctx, ctx->bufcnt, 1);
 	else
 		err = omap_sham_xmit_cpu(dd, ctx->buffer, ctx->bufcnt, 1);
 
 	ctx->bufcnt = 0;
 
-	if (err != -EINPROGRESS)
-		omap_sham_cleanup(req);
-
 	dev_dbg(dd->dev, "final_req: err: %d\n", err);
 
 	return err;
@@ -576,6 +620,7 @@
 
 static int omap_sham_finish_req_hmac(struct ahash_request *req)
 {
+	struct omap_sham_reqctx *ctx = ahash_request_ctx(req);
 	struct omap_sham_ctx *tctx = crypto_tfm_ctx(req->base.tfm);
 	struct omap_sham_hmac_ctx *bctx = tctx->base;
 	int bs = crypto_shash_blocksize(bctx->shash);
@@ -590,48 +635,56 @@
 
 	return crypto_shash_init(&desc.shash) ?:
 	       crypto_shash_update(&desc.shash, bctx->opad, bs) ?:
-	       crypto_shash_finup(&desc.shash, req->result, ds, req->result);
+	       crypto_shash_finup(&desc.shash, ctx->digest, ds, ctx->digest);
 }
 
 static void omap_sham_finish_req(struct ahash_request *req, int err)
 {
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(req);
+	struct omap_sham_dev *dd = ctx->dd;
 
 	if (!err) {
 		omap_sham_copy_hash(ctx->dd->req, 1);
 		if (ctx->flags & FLAGS_HMAC)
 			err = omap_sham_finish_req_hmac(req);
+	} else {
+		ctx->flags |= FLAGS_ERROR;
 	}
 
-	if (ctx->flags & FLAGS_FINAL)
+	if ((ctx->flags & FLAGS_FINAL) || err)
 		omap_sham_cleanup(req);
 
-	clear_bit(FLAGS_BUSY, &ctx->dd->flags);
+	clk_disable(dd->iclk);
+	dd->flags &= ~FLAGS_BUSY;
 
 	if (req->base.complete)
 		req->base.complete(&req->base, err);
 }
 
-static int omap_sham_handle_queue(struct omap_sham_dev *dd)
+static int omap_sham_handle_queue(struct omap_sham_dev *dd,
+				  struct ahash_request *req)
 {
 	struct crypto_async_request *async_req, *backlog;
 	struct omap_sham_reqctx *ctx;
-	struct ahash_request *req, *prev_req;
+	struct ahash_request *prev_req;
 	unsigned long flags;
-	int err = 0;
-
-	if (test_and_set_bit(FLAGS_BUSY, &dd->flags))
-		return 0;
+	int err = 0, ret = 0;
 
 	spin_lock_irqsave(&dd->lock, flags);
+	if (req)
+		ret = ahash_enqueue_request(&dd->queue, req);
+	if (dd->flags & FLAGS_BUSY) {
+		spin_unlock_irqrestore(&dd->lock, flags);
+		return ret;
+	}
 	backlog = crypto_get_backlog(&dd->queue);
 	async_req = crypto_dequeue_request(&dd->queue);
-	if (!async_req)
-		clear_bit(FLAGS_BUSY, &dd->flags);
+	if (async_req)
+		dd->flags |= FLAGS_BUSY;
 	spin_unlock_irqrestore(&dd->lock, flags);
 
 	if (!async_req)
-		return 0;
+		return ret;
 
 	if (backlog)
 		backlog->complete(backlog, -EINPROGRESS);
@@ -646,7 +699,22 @@
 	dev_dbg(dd->dev, "handling new req, op: %lu, nbytes: %d\n",
 						ctx->op, req->nbytes);
 
-	if (req != prev_req && ctx->digcnt)
+
+	err = omap_sham_hw_init(dd);
+	if (err)
+		goto err1;
+
+	omap_set_dma_dest_params(dd->dma_lch, 0,
+			OMAP_DMA_AMODE_CONSTANT,
+			dd->phys_base + SHA_REG_DIN(0), 0, 16);
+
+	omap_set_dma_dest_burst_mode(dd->dma_lch,
+			OMAP_DMA_DATA_BURST_16);
+
+	omap_set_dma_src_burst_mode(dd->dma_lch,
+			OMAP_DMA_DATA_BURST_4);
+
+	if (ctx->digcnt)
 		/* request has changed - restore hash */
 		omap_sham_copy_hash(req, 0);
 
@@ -658,7 +726,7 @@
 	} else if (ctx->op == OP_FINAL) {
 		err = omap_sham_final_req(dd);
 	}
-
+err1:
 	if (err != -EINPROGRESS) {
 		/* done_task will not finish it, so do it here */
 		omap_sham_finish_req(req, err);
@@ -667,7 +735,7 @@
 
 	dev_dbg(dd->dev, "exit, err: %d\n", err);
 
-	return err;
+	return ret;
 }
 
 static int omap_sham_enqueue(struct ahash_request *req, unsigned int op)
@@ -675,18 +743,10 @@
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(req);
 	struct omap_sham_ctx *tctx = crypto_tfm_ctx(req->base.tfm);
 	struct omap_sham_dev *dd = tctx->dd;
-	unsigned long flags;
-	int err;
 
 	ctx->op = op;
 
-	spin_lock_irqsave(&dd->lock, flags);
-	err = ahash_enqueue_request(&dd->queue, req);
-	spin_unlock_irqrestore(&dd->lock, flags);
-
-	omap_sham_handle_queue(dd);
-
-	return err;
+	return omap_sham_handle_queue(dd, req);
 }
 
 static int omap_sham_update(struct ahash_request *req)
@@ -709,21 +769,13 @@
 			*/
 			omap_sham_append_sg(ctx);
 			return 0;
-		} else if (ctx->bufcnt + ctx->total <= 64) {
+		} else if (ctx->bufcnt + ctx->total <= SHA1_MD5_BLOCK_SIZE) {
+			/*
+			* faster to use CPU for short transfers
+			*/
 			ctx->flags |= FLAGS_CPU;
-		} else if (!ctx->bufcnt && sg_is_last(ctx->sg)) {
-			/* may be can use faster functions */
-			int aligned = IS_ALIGNED((u32)ctx->sg->offset,
-								sizeof(u32));
-
-			if (aligned && (ctx->flags & FLAGS_FIRST))
-				/* digest: first and final */
-				ctx->flags |= FLAGS_FAST;
-
-			ctx->flags &= ~FLAGS_FIRST;
 		}
-	} else if (ctx->bufcnt + ctx->total <= ctx->buflen) {
-		/* if not finaup -> not fast */
+	} else if (ctx->bufcnt + ctx->total < ctx->buflen) {
 		omap_sham_append_sg(ctx);
 		return 0;
 	}
@@ -761,12 +813,14 @@
 
 	ctx->flags |= FLAGS_FINUP;
 
-	/* OMAP HW accel works only with buffers >= 9 */
-	/* HMAC is always >= 9 because of ipad */
-	if ((ctx->digcnt + ctx->bufcnt) < 9)
-		err = omap_sham_final_shash(req);
-	else if (ctx->bufcnt)
-		return omap_sham_enqueue(req, OP_FINAL);
+	if (!(ctx->flags & FLAGS_ERROR)) {
+		/* OMAP HW accel works only with buffers >= 9 */
+		/* HMAC is always >= 9 because of ipad */
+		if ((ctx->digcnt + ctx->bufcnt) < 9)
+			err = omap_sham_final_shash(req);
+		else if (ctx->bufcnt)
+			return omap_sham_enqueue(req, OP_FINAL);
+	}
 
 	omap_sham_cleanup(req);
 
@@ -836,6 +890,8 @@
 	struct omap_sham_ctx *tctx = crypto_tfm_ctx(tfm);
 	const char *alg_name = crypto_tfm_alg_name(tfm);
 
+	pr_info("enter\n");
+
 	/* Allocate a fallback and abort if it failed. */
 	tctx->fallback = crypto_alloc_shash(alg_name, 0,
 					    CRYPTO_ALG_NEED_FALLBACK);
@@ -846,7 +902,7 @@
 	}
 
 	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
-				 sizeof(struct omap_sham_reqctx));
+				 sizeof(struct omap_sham_reqctx) + BUFLEN);
 
 	if (alg_base) {
 		struct omap_sham_hmac_ctx *bctx = tctx->base;
@@ -932,7 +988,7 @@
 						CRYPTO_ALG_NEED_FALLBACK,
 		.cra_blocksize		= SHA1_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct omap_sham_ctx),
-		.cra_alignmask		= 0,
+		.cra_alignmask		= OMAP_ALIGN_MASK,
 		.cra_module		= THIS_MODULE,
 		.cra_init		= omap_sham_cra_init,
 		.cra_exit		= omap_sham_cra_exit,
@@ -956,7 +1012,7 @@
 		.cra_blocksize		= SHA1_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct omap_sham_ctx) +
 					sizeof(struct omap_sham_hmac_ctx),
-		.cra_alignmask		= 0,
+		.cra_alignmask		= OMAP_ALIGN_MASK,
 		.cra_module		= THIS_MODULE,
 		.cra_init		= omap_sham_cra_sha1_init,
 		.cra_exit		= omap_sham_cra_exit,
@@ -980,7 +1036,7 @@
 		.cra_blocksize		= SHA1_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct omap_sham_ctx) +
 					sizeof(struct omap_sham_hmac_ctx),
-		.cra_alignmask		= 0,
+		.cra_alignmask		= OMAP_ALIGN_MASK,
 		.cra_module		= THIS_MODULE,
 		.cra_init		= omap_sham_cra_md5_init,
 		.cra_exit		= omap_sham_cra_exit,
@@ -993,7 +1049,7 @@
 	struct omap_sham_dev *dd = (struct omap_sham_dev *)data;
 	struct ahash_request *req = dd->req;
 	struct omap_sham_reqctx *ctx = ahash_request_ctx(req);
-	int ready = 1;
+	int ready = 0, err = 0;
 
 	if (ctx->flags & FLAGS_OUTPUT_READY) {
 		ctx->flags &= ~FLAGS_OUTPUT_READY;
@@ -1003,15 +1059,18 @@
 	if (dd->flags & FLAGS_DMA_ACTIVE) {
 		dd->flags &= ~FLAGS_DMA_ACTIVE;
 		omap_sham_update_dma_stop(dd);
-		omap_sham_update_dma_slow(dd);
+		if (!dd->err)
+			err = omap_sham_update_dma_start(dd);
 	}
 
-	if (ready && !(dd->flags & FLAGS_DMA_ACTIVE)) {
-		dev_dbg(dd->dev, "update done\n");
+	err = dd->err ? : err;
+
+	if (err != -EINPROGRESS && (ready || err)) {
+		dev_dbg(dd->dev, "update done: err: %d\n", err);
 		/* finish curent request */
-		omap_sham_finish_req(req, 0);
+		omap_sham_finish_req(req, err);
 		/* start new request */
-		omap_sham_handle_queue(dd);
+		omap_sham_handle_queue(dd, NULL);
 	}
 }
 
@@ -1019,7 +1078,7 @@
 {
 	struct omap_sham_dev *dd = (struct omap_sham_dev *)data;
 
-	omap_sham_handle_queue(dd);
+	omap_sham_handle_queue(dd, NULL);
 }
 
 static irqreturn_t omap_sham_irq(int irq, void *dev_id)
@@ -1041,6 +1100,7 @@
 	omap_sham_read(dd, SHA_REG_CTRL);
 
 	ctx->flags |= FLAGS_OUTPUT_READY;
+	dd->err = 0;
 	tasklet_schedule(&dd->done_task);
 
 	return IRQ_HANDLED;
@@ -1050,8 +1110,13 @@
 {
 	struct omap_sham_dev *dd = data;
 
-	if (likely(lch == dd->dma_lch))
-		tasklet_schedule(&dd->done_task);
+	if (ch_status != OMAP_DMA_BLOCK_IRQ) {
+		pr_err("omap-sham DMA error status: 0x%hx\n", ch_status);
+		dd->err = -EIO;
+		dd->flags &= ~FLAGS_INIT; /* request to re-initialize */
+	}
+
+	tasklet_schedule(&dd->done_task);
 }
 
 static int omap_sham_dma_init(struct omap_sham_dev *dd)
@@ -1066,15 +1131,6 @@
 		dev_err(dd->dev, "Unable to request DMA channel\n");
 		return err;
 	}
-	omap_set_dma_dest_params(dd->dma_lch, 0,
-			OMAP_DMA_AMODE_CONSTANT,
-			dd->phys_base + SHA_REG_DIN(0), 0, 16);
-
-	omap_set_dma_dest_burst_mode(dd->dma_lch,
-			OMAP_DMA_DATA_BURST_16);
-
-	omap_set_dma_src_burst_mode(dd->dma_lch,
-			OMAP_DMA_DATA_BURST_4);
 
 	return 0;
 }
diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
index 8a515ba..db33d30 100644
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -9,6 +9,7 @@
 
 #include <crypto/algapi.h>
 #include <crypto/aes.h>
+#include <crypto/padlock.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/types.h>
@@ -21,7 +22,6 @@
 #include <asm/byteorder.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
-#include "padlock.h"
 
 /*
  * Number of data blocks actually fetched for each xcrypt insn.
diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c
index d3a27e0..adf075b 100644
--- a/drivers/crypto/padlock-sha.c
+++ b/drivers/crypto/padlock-sha.c
@@ -13,6 +13,7 @@
  */
 
 #include <crypto/internal/hash.h>
+#include <crypto/padlock.h>
 #include <crypto/sha.h>
 #include <linux/err.h>
 #include <linux/module.h>
@@ -22,13 +23,6 @@
 #include <linux/kernel.h>
 #include <linux/scatterlist.h>
 #include <asm/i387.h>
-#include "padlock.h"
-
-#ifdef CONFIG_64BIT
-#define STACK_ALIGN 16
-#else
-#define STACK_ALIGN 4
-#endif
 
 struct padlock_sha_desc {
 	struct shash_desc fallback;
diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h
new file mode 100644
index 0000000..c5813c8
--- /dev/null
+++ b/include/crypto/if_alg.h
@@ -0,0 +1,92 @@
+/*
+ * if_alg: User-space algorithm interface
+ *
+ * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#ifndef _CRYPTO_IF_ALG_H
+#define _CRYPTO_IF_ALG_H
+
+#include <linux/compiler.h>
+#include <linux/completion.h>
+#include <linux/if_alg.h>
+#include <linux/types.h>
+#include <net/sock.h>
+
+#define ALG_MAX_PAGES			16
+
+struct crypto_async_request;
+
+struct alg_sock {
+	/* struct sock must be the first member of struct alg_sock */
+	struct sock sk;
+
+	struct sock *parent;
+
+	const struct af_alg_type *type;
+	void *private;
+};
+
+struct af_alg_completion {
+	struct completion completion;
+	int err;
+};
+
+struct af_alg_control {
+	struct af_alg_iv *iv;
+	int op;
+};
+
+struct af_alg_type {
+	void *(*bind)(const char *name, u32 type, u32 mask);
+	void (*release)(void *private);
+	int (*setkey)(void *private, const u8 *key, unsigned int keylen);
+	int (*accept)(void *private, struct sock *sk);
+
+	struct proto_ops *ops;
+	struct module *owner;
+	char name[14];
+};
+
+struct af_alg_sgl {
+	struct scatterlist sg[ALG_MAX_PAGES];
+	struct page *pages[ALG_MAX_PAGES];
+};
+
+int af_alg_register_type(const struct af_alg_type *type);
+int af_alg_unregister_type(const struct af_alg_type *type);
+
+int af_alg_release(struct socket *sock);
+int af_alg_accept(struct sock *sk, struct socket *newsock);
+
+int af_alg_make_sg(struct af_alg_sgl *sgl, void __user *addr, int len,
+		   int write);
+void af_alg_free_sg(struct af_alg_sgl *sgl);
+
+int af_alg_cmsg_send(struct msghdr *msg, struct af_alg_control *con);
+
+int af_alg_wait_for_completion(int err, struct af_alg_completion *completion);
+void af_alg_complete(struct crypto_async_request *req, int err);
+
+static inline struct alg_sock *alg_sk(struct sock *sk)
+{
+	return (struct alg_sock *)sk;
+}
+
+static inline void af_alg_release_parent(struct sock *sk)
+{
+	sock_put(alg_sk(sk)->parent);
+}
+
+static inline void af_alg_init_completion(struct af_alg_completion *completion)
+{
+	init_completion(&completion->completion);
+}
+
+#endif	/* _CRYPTO_IF_ALG_H */
diff --git a/drivers/crypto/padlock.h b/include/crypto/padlock.h
similarity index 82%
rename from drivers/crypto/padlock.h
rename to include/crypto/padlock.h
index b728e45..d2cfa2e 100644
--- a/drivers/crypto/padlock.h
+++ b/include/crypto/padlock.h
@@ -15,9 +15,15 @@
 
 #define PADLOCK_ALIGNMENT 16
 
-#define PFX	"padlock: "
+#define PFX	KBUILD_MODNAME ": "
 
 #define PADLOCK_CRA_PRIORITY	300
 #define PADLOCK_COMPOSITE_PRIORITY 400
 
+#ifdef CONFIG_64BIT
+#define STACK_ALIGN 16
+#else
+#define STACK_ALIGN 4
+#endif
+
 #endif	/* _CRYPTO_PADLOCK_H */
diff --git a/include/crypto/scatterwalk.h b/include/crypto/scatterwalk.h
index 833d208..4fd95a3 100644
--- a/include/crypto/scatterwalk.h
+++ b/include/crypto/scatterwalk.h
@@ -68,6 +68,21 @@
 	return (++sg)->length ? sg : (void *)sg_page(sg);
 }
 
+static inline void scatterwalk_crypto_chain(struct scatterlist *head,
+					    struct scatterlist *sg,
+					    int chain, int num)
+{
+	if (chain) {
+		head->length += sg->length;
+		sg = scatterwalk_sg_next(sg);
+	}
+
+	if (sg)
+		scatterwalk_sg_chain(head, num, sg);
+	else
+		sg_mark_end(head);
+}
+
 static inline unsigned long scatterwalk_samebuf(struct scatter_walk *walk_in,
 						struct scatter_walk *walk_out)
 {
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index d1580c1..2296d8b 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -158,6 +158,7 @@
 header-y += if.h
 header-y += if_addr.h
 header-y += if_addrlabel.h
+header-y += if_alg.h
 header-y += if_arcnet.h
 header-y += if_arp.h
 header-y += if_bonding.h
diff --git a/include/linux/if_alg.h b/include/linux/if_alg.h
new file mode 100644
index 0000000..0f9acce
--- /dev/null
+++ b/include/linux/if_alg.h
@@ -0,0 +1,40 @@
+/*
+ * if_alg: User-space algorithm interface
+ *
+ * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#ifndef _LINUX_IF_ALG_H
+#define _LINUX_IF_ALG_H
+
+#include <linux/types.h>
+
+struct sockaddr_alg {
+	__u16	salg_family;
+	__u8	salg_type[14];
+	__u32	salg_feat;
+	__u32	salg_mask;
+	__u8	salg_name[64];
+};
+
+struct af_alg_iv {
+	__u32	ivlen;
+	__u8	iv[0];
+};
+
+/* Socket options */
+#define ALG_SET_KEY			1
+#define ALG_SET_IV			2
+#define ALG_SET_OP			3
+
+/* Operations */
+#define ALG_OP_DECRYPT			0
+#define ALG_OP_ENCRYPT			1
+
+#endif	/* _LINUX_IF_ALG_H */
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 5f65f14..edbb1d0 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -191,7 +191,8 @@
 #define AF_PHONET	35	/* Phonet sockets		*/
 #define AF_IEEE802154	36	/* IEEE802154 sockets		*/
 #define AF_CAIF		37	/* CAIF sockets			*/
-#define AF_MAX		38	/* For now.. */
+#define AF_ALG		38	/* Algorithm sockets		*/
+#define AF_MAX		39	/* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC	AF_UNSPEC
@@ -232,6 +233,7 @@
 #define PF_PHONET	AF_PHONET
 #define PF_IEEE802154	AF_IEEE802154
 #define PF_CAIF		AF_CAIF
+#define PF_ALG		AF_ALG
 #define PF_MAX		AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
@@ -305,6 +307,7 @@
 #define SOL_RDS		276
 #define SOL_IUCV	277
 #define SOL_CAIF	278
+#define SOL_ALG		279
 
 /* IPX options */
 #define IPX_TYPE	1
diff --git a/net/core/sock.c b/net/core/sock.c
index a658aeb..7dfed79 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -157,7 +157,7 @@
   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
-  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" ,
+  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
   "sk_lock-AF_MAX"
 };
 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
@@ -173,7 +173,7 @@
   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
-  "slock-AF_IEEE802154", "slock-AF_CAIF" ,
+  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
   "slock-AF_MAX"
 };
 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
@@ -189,7 +189,7 @@
   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
-  "clock-AF_IEEE802154", "clock-AF_CAIF" ,
+  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
   "clock-AF_MAX"
 };