From 89e12654312dddbbdbf17b5adc95b22cb672f947 Mon Sep 17 00:00:00 2001 From: Sebastian Siewior Date: Wed, 17 Oct 2007 23:18:57 +0800 Subject: [CRYPTO] aes: Move common defines into a header file These three defines are used in all AES related hardware. Signed-off-by: Sebastian Siewior Signed-off-by: Herbert Xu --- arch/s390/crypto/aes_s390.c | 7 +------ arch/x86/crypto/aes_32.c | 4 +--- arch/x86/crypto/aes_64.c | 6 +----- 3 files changed, 3 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 512669691ad..812511bbb54 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -16,17 +16,12 @@ * */ +#include #include #include #include #include "crypt_s390.h" -#define AES_MIN_KEY_SIZE 16 -#define AES_MAX_KEY_SIZE 32 - -/* data block size for all key lengths */ -#define AES_BLOCK_SIZE 16 - #define AES_KEYLEN_128 1 #define AES_KEYLEN_192 2 #define AES_KEYLEN_256 4 diff --git a/arch/x86/crypto/aes_32.c b/arch/x86/crypto/aes_32.c index 49aad9397f1..9b0ab50394b 100644 --- a/arch/x86/crypto/aes_32.c +++ b/arch/x86/crypto/aes_32.c @@ -38,6 +38,7 @@ */ #include +#include #include #include #include @@ -48,9 +49,6 @@ asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); -#define AES_MIN_KEY_SIZE 16 -#define AES_MAX_KEY_SIZE 32 -#define AES_BLOCK_SIZE 16 #define AES_KS_LENGTH 4 * AES_BLOCK_SIZE #define RC_LENGTH 29 diff --git a/arch/x86/crypto/aes_64.c b/arch/x86/crypto/aes_64.c index 5cdb13ea5cc..0b38a4cd2ce 100644 --- a/arch/x86/crypto/aes_64.c +++ b/arch/x86/crypto/aes_64.c @@ -54,6 +54,7 @@ */ #include +#include #include #include #include @@ -61,11 +62,6 @@ #include #include -#define AES_MIN_KEY_SIZE 16 -#define AES_MAX_KEY_SIZE 32 - -#define AES_BLOCK_SIZE 16 - /* * #define byte(x, nr) ((unsigned char)((x) >> (nr*8))) */ -- cgit v1.2.3 From 81190b321548bb0bf2d6e1f172695275b0fd1363 Mon Sep 17 00:00:00 2001 From: Sebastian Siewior Date: Thu, 8 Nov 2007 21:25:04 +0800 Subject: [CRYPTO] aes-x86-64: Remove setkey The setkey() function can be shared with the generic algorithm. Signed-off-by: Sebastian Siewior Signed-off-by: Herbert Xu --- arch/x86/crypto/aes-x86_64-asm_64.S | 68 ++++----- arch/x86/crypto/aes_64.c | 282 +----------------------------------- 2 files changed, 37 insertions(+), 313 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S index 26b40de4d0b..a120f526c3d 100644 --- a/arch/x86/crypto/aes-x86_64-asm_64.S +++ b/arch/x86/crypto/aes-x86_64-asm_64.S @@ -8,10 +8,10 @@ * including this sentence is retained in full. 
*/ -.extern aes_ft_tab -.extern aes_it_tab -.extern aes_fl_tab -.extern aes_il_tab +.extern crypto_ft_tab +.extern crypto_it_tab +.extern crypto_fl_tab +.extern crypto_il_tab .text @@ -56,13 +56,13 @@ .align 8; \ FUNC: movq r1,r2; \ movq r3,r4; \ - leaq BASE+KEY+52(r8),r9; \ + leaq BASE+KEY+48+4(r8),r9; \ movq r10,r11; \ movl (r7),r5 ## E; \ movl 4(r7),r1 ## E; \ movl 8(r7),r6 ## E; \ movl 12(r7),r7 ## E; \ - movl BASE(r8),r10 ## E; \ + movl BASE+0(r8),r10 ## E; \ xorl -48(r9),r5 ## E; \ xorl -44(r9),r1 ## E; \ xorl -40(r9),r6 ## E; \ @@ -154,37 +154,37 @@ FUNC: movq r1,r2; \ /* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */ entry(aes_enc_blk,0,enc128,enc192) - encrypt_round(aes_ft_tab,-96) - encrypt_round(aes_ft_tab,-80) -enc192: encrypt_round(aes_ft_tab,-64) - encrypt_round(aes_ft_tab,-48) -enc128: encrypt_round(aes_ft_tab,-32) - encrypt_round(aes_ft_tab,-16) - encrypt_round(aes_ft_tab, 0) - encrypt_round(aes_ft_tab, 16) - encrypt_round(aes_ft_tab, 32) - encrypt_round(aes_ft_tab, 48) - encrypt_round(aes_ft_tab, 64) - encrypt_round(aes_ft_tab, 80) - encrypt_round(aes_ft_tab, 96) - encrypt_final(aes_fl_tab,112) + encrypt_round(crypto_ft_tab,-96) + encrypt_round(crypto_ft_tab,-80) +enc192: encrypt_round(crypto_ft_tab,-64) + encrypt_round(crypto_ft_tab,-48) +enc128: encrypt_round(crypto_ft_tab,-32) + encrypt_round(crypto_ft_tab,-16) + encrypt_round(crypto_ft_tab, 0) + encrypt_round(crypto_ft_tab, 16) + encrypt_round(crypto_ft_tab, 32) + encrypt_round(crypto_ft_tab, 48) + encrypt_round(crypto_ft_tab, 64) + encrypt_round(crypto_ft_tab, 80) + encrypt_round(crypto_ft_tab, 96) + encrypt_final(crypto_fl_tab,112) return /* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */ entry(aes_dec_blk,240,dec128,dec192) - decrypt_round(aes_it_tab,-96) - decrypt_round(aes_it_tab,-80) -dec192: decrypt_round(aes_it_tab,-64) - decrypt_round(aes_it_tab,-48) -dec128: decrypt_round(aes_it_tab,-32) - decrypt_round(aes_it_tab,-16) - decrypt_round(aes_it_tab, 0) - decrypt_round(aes_it_tab, 16) - decrypt_round(aes_it_tab, 32) - decrypt_round(aes_it_tab, 48) - decrypt_round(aes_it_tab, 64) - decrypt_round(aes_it_tab, 80) - decrypt_round(aes_it_tab, 96) - decrypt_final(aes_il_tab,112) + decrypt_round(crypto_it_tab,-96) + decrypt_round(crypto_it_tab,-80) +dec192: decrypt_round(crypto_it_tab,-64) + decrypt_round(crypto_it_tab,-48) +dec128: decrypt_round(crypto_it_tab,-32) + decrypt_round(crypto_it_tab,-16) + decrypt_round(crypto_it_tab, 0) + decrypt_round(crypto_it_tab, 16) + decrypt_round(crypto_it_tab, 32) + decrypt_round(crypto_it_tab, 48) + decrypt_round(crypto_it_tab, 64) + decrypt_round(crypto_it_tab, 80) + decrypt_round(crypto_it_tab, 96) + decrypt_final(crypto_il_tab,112) return diff --git a/arch/x86/crypto/aes_64.c b/arch/x86/crypto/aes_64.c index 0b38a4cd2ce..d7a41a97dd3 100644 --- a/arch/x86/crypto/aes_64.c +++ b/arch/x86/crypto/aes_64.c @@ -1,284 +1,9 @@ /* - * Cryptographic API. + * Glue Code for AES Cipher Algorithm * - * AES Cipher Algorithm. - * - * Based on Brian Gladman's code. - * - * Linux developers: - * Alexander Kjeldaas - * Herbert Valerio Riedel - * Kyle McMartin - * Adam J. Richter (conversion to 2.5 API). - * Andreas Steinmetz (adapted to x86_64 assembler) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * --------------------------------------------------------------------------- - * Copyright (c) 2002, Dr Brian Gladman , Worcester, UK. - * All rights reserved. - * - * LICENSE TERMS - * - * The free distribution and use of this software in both source and binary - * form is allowed (with or without changes) provided that: - * - * 1. distributions of this source code include the above copyright - * notice, this list of conditions and the following disclaimer; - * - * 2. distributions in binary form include the above copyright - * notice, this list of conditions and the following disclaimer - * in the documentation and/or other associated materials; - * - * 3. the copyright holder's name is not used to endorse products - * built using this software without specific written permission. - * - * ALTERNATIVELY, provided that this notice is retained in full, this product - * may be distributed under the terms of the GNU General Public License (GPL), - * in which case the provisions of the GPL apply INSTEAD OF those given above. - * - * DISCLAIMER - * - * This software is provided 'as is' with no explicit or implied warranties - * in respect of its properties, including, but not limited to, correctness - * and/or fitness for purpose. - * --------------------------------------------------------------------------- */ -/* Some changes from the Gladman version: - s/RIJNDAEL(e_key)/E_KEY/g - s/RIJNDAEL(d_key)/D_KEY/g -*/ - -#include #include -#include -#include -#include -#include -#include -#include - -/* - * #define byte(x, nr) ((unsigned char)((x) >> (nr*8))) - */ -static inline u8 byte(const u32 x, const unsigned n) -{ - return x >> (n << 3); -} - -struct aes_ctx -{ - u32 key_length; - u32 buf[120]; -}; - -#define E_KEY (&ctx->buf[0]) -#define D_KEY (&ctx->buf[60]) - -static u8 pow_tab[256] __initdata; -static u8 log_tab[256] __initdata; -static u8 sbx_tab[256] __initdata; -static u8 isb_tab[256] __initdata; -static u32 rco_tab[10]; -u32 aes_ft_tab[4][256]; -u32 aes_it_tab[4][256]; - -u32 aes_fl_tab[4][256]; -u32 aes_il_tab[4][256]; - -static inline u8 f_mult(u8 a, u8 b) -{ - u8 aa = log_tab[a], cc = aa + log_tab[b]; - - return pow_tab[cc + (cc < aa ? 1 : 0)]; -} - -#define ff_mult(a, b) (a && b ? f_mult(a, b) : 0) - -#define ls_box(x) \ - (aes_fl_tab[0][byte(x, 0)] ^ \ - aes_fl_tab[1][byte(x, 1)] ^ \ - aes_fl_tab[2][byte(x, 2)] ^ \ - aes_fl_tab[3][byte(x, 3)]) - -static void __init gen_tabs(void) -{ - u32 i, t; - u8 p, q; - - /* log and power tables for GF(2**8) finite field with - 0x011b as modular polynomial - the simplest primitive - root is 0x03, used here to generate the tables */ - - for (i = 0, p = 1; i < 256; ++i) { - pow_tab[i] = (u8)p; - log_tab[p] = (u8)i; - - p ^= (p << 1) ^ (p & 0x80 ? 0x01b : 0); - } - - log_tab[1] = 0; - - for (i = 0, p = 1; i < 10; ++i) { - rco_tab[i] = p; - - p = (p << 1) ^ (p & 0x80 ? 0x01b : 0); - } - - for (i = 0; i < 256; ++i) { - p = (i ? 
pow_tab[255 - log_tab[i]] : 0); - q = ((p >> 7) | (p << 1)) ^ ((p >> 6) | (p << 2)); - p ^= 0x63 ^ q ^ ((q >> 6) | (q << 2)); - sbx_tab[i] = p; - isb_tab[p] = (u8)i; - } - - for (i = 0; i < 256; ++i) { - p = sbx_tab[i]; - - t = p; - aes_fl_tab[0][i] = t; - aes_fl_tab[1][i] = rol32(t, 8); - aes_fl_tab[2][i] = rol32(t, 16); - aes_fl_tab[3][i] = rol32(t, 24); - - t = ((u32)ff_mult(2, p)) | - ((u32)p << 8) | - ((u32)p << 16) | ((u32)ff_mult(3, p) << 24); - - aes_ft_tab[0][i] = t; - aes_ft_tab[1][i] = rol32(t, 8); - aes_ft_tab[2][i] = rol32(t, 16); - aes_ft_tab[3][i] = rol32(t, 24); - - p = isb_tab[i]; - - t = p; - aes_il_tab[0][i] = t; - aes_il_tab[1][i] = rol32(t, 8); - aes_il_tab[2][i] = rol32(t, 16); - aes_il_tab[3][i] = rol32(t, 24); - - t = ((u32)ff_mult(14, p)) | - ((u32)ff_mult(9, p) << 8) | - ((u32)ff_mult(13, p) << 16) | - ((u32)ff_mult(11, p) << 24); - - aes_it_tab[0][i] = t; - aes_it_tab[1][i] = rol32(t, 8); - aes_it_tab[2][i] = rol32(t, 16); - aes_it_tab[3][i] = rol32(t, 24); - } -} - -#define star_x(x) (((x) & 0x7f7f7f7f) << 1) ^ ((((x) & 0x80808080) >> 7) * 0x1b) - -#define imix_col(y, x) \ - u = star_x(x); \ - v = star_x(u); \ - w = star_x(v); \ - t = w ^ (x); \ - (y) = u ^ v ^ w; \ - (y) ^= ror32(u ^ t, 8) ^ \ - ror32(v ^ t, 16) ^ \ - ror32(t, 24) - -/* initialise the key schedule from the user supplied key */ - -#define loop4(i) \ -{ \ - t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \ - t ^= E_KEY[4 * i]; E_KEY[4 * i + 4] = t; \ - t ^= E_KEY[4 * i + 1]; E_KEY[4 * i + 5] = t; \ - t ^= E_KEY[4 * i + 2]; E_KEY[4 * i + 6] = t; \ - t ^= E_KEY[4 * i + 3]; E_KEY[4 * i + 7] = t; \ -} - -#define loop6(i) \ -{ \ - t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \ - t ^= E_KEY[6 * i]; E_KEY[6 * i + 6] = t; \ - t ^= E_KEY[6 * i + 1]; E_KEY[6 * i + 7] = t; \ - t ^= E_KEY[6 * i + 2]; E_KEY[6 * i + 8] = t; \ - t ^= E_KEY[6 * i + 3]; E_KEY[6 * i + 9] = t; \ - t ^= E_KEY[6 * i + 4]; E_KEY[6 * i + 10] = t; \ - t ^= E_KEY[6 * i + 5]; E_KEY[6 * i + 11] = t; \ -} - -#define loop8(i) \ -{ \ - t = ror32(t, 8); ; t = ls_box(t) ^ rco_tab[i]; \ - t ^= E_KEY[8 * i]; E_KEY[8 * i + 8] = t; \ - t ^= E_KEY[8 * i + 1]; E_KEY[8 * i + 9] = t; \ - t ^= E_KEY[8 * i + 2]; E_KEY[8 * i + 10] = t; \ - t ^= E_KEY[8 * i + 3]; E_KEY[8 * i + 11] = t; \ - t = E_KEY[8 * i + 4] ^ ls_box(t); \ - E_KEY[8 * i + 12] = t; \ - t ^= E_KEY[8 * i + 5]; E_KEY[8 * i + 13] = t; \ - t ^= E_KEY[8 * i + 6]; E_KEY[8 * i + 14] = t; \ - t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t; \ -} - -static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, - unsigned int key_len) -{ - struct aes_ctx *ctx = crypto_tfm_ctx(tfm); - const __le32 *key = (const __le32 *)in_key; - u32 *flags = &tfm->crt_flags; - u32 i, j, t, u, v, w; - - if (key_len % 8) { - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } - - ctx->key_length = key_len; - - D_KEY[key_len + 24] = E_KEY[0] = le32_to_cpu(key[0]); - D_KEY[key_len + 25] = E_KEY[1] = le32_to_cpu(key[1]); - D_KEY[key_len + 26] = E_KEY[2] = le32_to_cpu(key[2]); - D_KEY[key_len + 27] = E_KEY[3] = le32_to_cpu(key[3]); - - switch (key_len) { - case 16: - t = E_KEY[3]; - for (i = 0; i < 10; ++i) - loop4(i); - break; - - case 24: - E_KEY[4] = le32_to_cpu(key[4]); - t = E_KEY[5] = le32_to_cpu(key[5]); - for (i = 0; i < 8; ++i) - loop6 (i); - break; - - case 32: - E_KEY[4] = le32_to_cpu(key[4]); - E_KEY[5] = le32_to_cpu(key[5]); - E_KEY[6] = le32_to_cpu(key[6]); - t = E_KEY[7] = le32_to_cpu(key[7]); - for (i = 0; i < 7; ++i) - loop8(i); - break; - } - - D_KEY[0] = E_KEY[key_len + 24]; - D_KEY[1] = 
E_KEY[key_len + 25]; - D_KEY[2] = E_KEY[key_len + 26]; - D_KEY[3] = E_KEY[key_len + 27]; - - for (i = 4; i < key_len + 24; ++i) { - j = key_len + 24 - (i & ~3) + (i & 3); - imix_col(D_KEY[j], E_KEY[i]); - } - - return 0; -} asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); @@ -299,14 +24,14 @@ static struct crypto_alg aes_alg = { .cra_priority = 200, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct aes_ctx), + .cra_ctxsize = sizeof(struct crypto_aes_ctx), .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), .cra_u = { .cipher = { .cia_min_keysize = AES_MIN_KEY_SIZE, .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = aes_set_key, + .cia_setkey = crypto_aes_set_key, .cia_encrypt = aes_encrypt, .cia_decrypt = aes_decrypt } @@ -315,7 +40,6 @@ static struct crypto_alg aes_alg = { static int __init aes_init(void) { - gen_tabs(); return crypto_register_alg(&aes_alg); } -- cgit v1.2.3 From 5157dea8139cf0edc4834d528531e642c0d27e37 Mon Sep 17 00:00:00 2001 From: Sebastian Siewior Date: Sat, 10 Nov 2007 19:07:16 +0800 Subject: [CRYPTO] aes-i586: Remove setkey The setkey() function can be shared with the generic algorithm. Signed-off-by: Sebastian Siewior Signed-off-by: Herbert Xu --- arch/x86/crypto/aes-i586-asm_32.S | 89 ++++---- arch/x86/crypto/aes_32.c | 461 +------------------------------------- 2 files changed, 46 insertions(+), 504 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S index f942f0c8f63..1093bede3e0 100644 --- a/arch/x86/crypto/aes-i586-asm_32.S +++ b/arch/x86/crypto/aes-i586-asm_32.S @@ -46,9 +46,9 @@ #define in_blk 16 /* offsets in crypto_tfm structure */ -#define ekey (crypto_tfm_ctx_offset + 0) -#define nrnd (crypto_tfm_ctx_offset + 256) -#define dkey (crypto_tfm_ctx_offset + 260) +#define klen (crypto_tfm_ctx_offset + 0) +#define ekey (crypto_tfm_ctx_offset + 4) +#define dkey (crypto_tfm_ctx_offset + 244) // register mapping for encrypt and decrypt subroutines @@ -221,8 +221,8 @@ .global aes_enc_blk -.extern ft_tab -.extern fl_tab +.extern crypto_ft_tab +.extern crypto_fl_tab .align 4 @@ -236,7 +236,7 @@ aes_enc_blk: 1: push %ebx mov in_blk+4(%esp),%r2 push %esi - mov nrnd(%ebp),%r3 // number of rounds + mov klen(%ebp),%r3 // key size push %edi #if ekey != 0 lea ekey(%ebp),%ebp // key pointer @@ -255,26 +255,26 @@ aes_enc_blk: sub $8,%esp // space for register saves on stack add $16,%ebp // increment to next round key - cmp $12,%r3 + cmp $24,%r3 jb 4f // 10 rounds for 128-bit key lea 32(%ebp),%ebp je 3f // 12 rounds for 192-bit key lea 32(%ebp),%ebp -2: fwd_rnd1( -64(%ebp) ,ft_tab) // 14 rounds for 256-bit key - fwd_rnd2( -48(%ebp) ,ft_tab) -3: fwd_rnd1( -32(%ebp) ,ft_tab) // 12 rounds for 192-bit key - fwd_rnd2( -16(%ebp) ,ft_tab) -4: fwd_rnd1( (%ebp) ,ft_tab) // 10 rounds for 128-bit key - fwd_rnd2( +16(%ebp) ,ft_tab) - fwd_rnd1( +32(%ebp) ,ft_tab) - fwd_rnd2( +48(%ebp) ,ft_tab) - fwd_rnd1( +64(%ebp) ,ft_tab) - fwd_rnd2( +80(%ebp) ,ft_tab) - fwd_rnd1( +96(%ebp) ,ft_tab) - fwd_rnd2(+112(%ebp) ,ft_tab) - fwd_rnd1(+128(%ebp) ,ft_tab) - fwd_rnd2(+144(%ebp) ,fl_tab) // last round uses a different table +2: fwd_rnd1( -64(%ebp), crypto_ft_tab) // 14 rounds for 256-bit key + fwd_rnd2( -48(%ebp), crypto_ft_tab) +3: fwd_rnd1( -32(%ebp), crypto_ft_tab) // 12 rounds for 192-bit key + fwd_rnd2( -16(%ebp), crypto_ft_tab) +4: fwd_rnd1( (%ebp), 
crypto_ft_tab) // 10 rounds for 128-bit key + fwd_rnd2( +16(%ebp), crypto_ft_tab) + fwd_rnd1( +32(%ebp), crypto_ft_tab) + fwd_rnd2( +48(%ebp), crypto_ft_tab) + fwd_rnd1( +64(%ebp), crypto_ft_tab) + fwd_rnd2( +80(%ebp), crypto_ft_tab) + fwd_rnd1( +96(%ebp), crypto_ft_tab) + fwd_rnd2(+112(%ebp), crypto_ft_tab) + fwd_rnd1(+128(%ebp), crypto_ft_tab) + fwd_rnd2(+144(%ebp), crypto_fl_tab) // last round uses a different table // move final values to the output array. CAUTION: the // order of these assigns rely on the register mappings @@ -297,8 +297,8 @@ aes_enc_blk: .global aes_dec_blk -.extern it_tab -.extern il_tab +.extern crypto_it_tab +.extern crypto_il_tab .align 4 @@ -312,14 +312,11 @@ aes_dec_blk: 1: push %ebx mov in_blk+4(%esp),%r2 push %esi - mov nrnd(%ebp),%r3 // number of rounds + mov klen(%ebp),%r3 // key size push %edi #if dkey != 0 lea dkey(%ebp),%ebp // key pointer #endif - mov %r3,%r0 - shl $4,%r0 - add %r0,%ebp // input four columns and xor in first round key @@ -333,27 +330,27 @@ aes_dec_blk: xor 12(%ebp),%r5 sub $8,%esp // space for register saves on stack - sub $16,%ebp // increment to next round key - cmp $12,%r3 + add $16,%ebp // increment to next round key + cmp $24,%r3 jb 4f // 10 rounds for 128-bit key - lea -32(%ebp),%ebp + lea 32(%ebp),%ebp je 3f // 12 rounds for 192-bit key - lea -32(%ebp),%ebp - -2: inv_rnd1( +64(%ebp), it_tab) // 14 rounds for 256-bit key - inv_rnd2( +48(%ebp), it_tab) -3: inv_rnd1( +32(%ebp), it_tab) // 12 rounds for 192-bit key - inv_rnd2( +16(%ebp), it_tab) -4: inv_rnd1( (%ebp), it_tab) // 10 rounds for 128-bit key - inv_rnd2( -16(%ebp), it_tab) - inv_rnd1( -32(%ebp), it_tab) - inv_rnd2( -48(%ebp), it_tab) - inv_rnd1( -64(%ebp), it_tab) - inv_rnd2( -80(%ebp), it_tab) - inv_rnd1( -96(%ebp), it_tab) - inv_rnd2(-112(%ebp), it_tab) - inv_rnd1(-128(%ebp), it_tab) - inv_rnd2(-144(%ebp), il_tab) // last round uses a different table + lea 32(%ebp),%ebp + +2: inv_rnd1( -64(%ebp), crypto_it_tab) // 14 rounds for 256-bit key + inv_rnd2( -48(%ebp), crypto_it_tab) +3: inv_rnd1( -32(%ebp), crypto_it_tab) // 12 rounds for 192-bit key + inv_rnd2( -16(%ebp), crypto_it_tab) +4: inv_rnd1( (%ebp), crypto_it_tab) // 10 rounds for 128-bit key + inv_rnd2( +16(%ebp), crypto_it_tab) + inv_rnd1( +32(%ebp), crypto_it_tab) + inv_rnd2( +48(%ebp), crypto_it_tab) + inv_rnd1( +64(%ebp), crypto_it_tab) + inv_rnd2( +80(%ebp), crypto_it_tab) + inv_rnd1( +96(%ebp), crypto_it_tab) + inv_rnd2(+112(%ebp), crypto_it_tab) + inv_rnd1(+128(%ebp), crypto_it_tab) + inv_rnd2(+144(%ebp), crypto_il_tab) // last round uses a different table // move final values to the output array. CAUTION: the // order of these assigns rely on the register mappings diff --git a/arch/x86/crypto/aes_32.c b/arch/x86/crypto/aes_32.c index 9b0ab50394b..8556d9561c2 100644 --- a/arch/x86/crypto/aes_32.c +++ b/arch/x86/crypto/aes_32.c @@ -1,468 +1,14 @@ -/* - * +/* * Glue Code for optimized 586 assembler version of AES - * - * Copyright (c) 2002, Dr Brian Gladman <>, Worcester, UK. - * All rights reserved. - * - * LICENSE TERMS - * - * The free distribution and use of this software in both source and binary - * form is allowed (with or without changes) provided that: - * - * 1. distributions of this source code include the above copyright - * notice, this list of conditions and the following disclaimer; - * - * 2. distributions in binary form include the above copyright - * notice, this list of conditions and the following disclaimer - * in the documentation and/or other associated materials; - * - * 3. 
the copyright holder's name is not used to endorse products - * built using this software without specific written permission. - * - * ALTERNATIVELY, provided that this notice is retained in full, this product - * may be distributed under the terms of the GNU General Public License (GPL), - * in which case the provisions of the GPL apply INSTEAD OF those given above. - * - * DISCLAIMER - * - * This software is provided 'as is' with no explicit or implied warranties - * in respect of its properties, including, but not limited to, correctness - * and/or fitness for purpose. - * - * Copyright (c) 2003, Adam J. Richter (conversion to - * 2.5 API). - * Copyright (c) 2003, 2004 Fruhwirth Clemens - * Copyright (c) 2004 Red Hat, Inc., James Morris - * */ -#include #include -#include #include -#include -#include #include -#include asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); -#define AES_KS_LENGTH 4 * AES_BLOCK_SIZE -#define RC_LENGTH 29 - -struct aes_ctx { - u32 ekey[AES_KS_LENGTH]; - u32 rounds; - u32 dkey[AES_KS_LENGTH]; -}; - -#define WPOLY 0x011b -#define bytes2word(b0, b1, b2, b3) \ - (((u32)(b3) << 24) | ((u32)(b2) << 16) | ((u32)(b1) << 8) | (b0)) - -/* define the finite field multiplies required for Rijndael */ -#define f2(x) ((x) ? pow[log[x] + 0x19] : 0) -#define f3(x) ((x) ? pow[log[x] + 0x01] : 0) -#define f9(x) ((x) ? pow[log[x] + 0xc7] : 0) -#define fb(x) ((x) ? pow[log[x] + 0x68] : 0) -#define fd(x) ((x) ? pow[log[x] + 0xee] : 0) -#define fe(x) ((x) ? pow[log[x] + 0xdf] : 0) -#define fi(x) ((x) ? pow[255 - log[x]]: 0) - -static inline u32 upr(u32 x, int n) -{ - return (x << 8 * n) | (x >> (32 - 8 * n)); -} - -static inline u8 bval(u32 x, int n) -{ - return x >> 8 * n; -} - -/* The forward and inverse affine transformations used in the S-box */ -#define fwd_affine(x) \ - (w = (u32)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(u8)(w^(w>>8))) - -#define inv_affine(x) \ - (w = (u32)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(u8)(w^(w>>8))) - -static u32 rcon_tab[RC_LENGTH]; - -u32 ft_tab[4][256]; -u32 fl_tab[4][256]; -static u32 im_tab[4][256]; -u32 il_tab[4][256]; -u32 it_tab[4][256]; - -static void gen_tabs(void) -{ - u32 i, w; - u8 pow[512], log[256]; - - /* - * log and power tables for GF(2^8) finite field with - * WPOLY as modular polynomial - the simplest primitive - * root is 0x03, used here to generate the tables. - */ - i = 0; w = 1; - - do { - pow[i] = (u8)w; - pow[i + 255] = (u8)w; - log[w] = (u8)i++; - w ^= (w << 1) ^ (w & 0x80 ? 
WPOLY : 0); - } while (w != 1); - - for(i = 0, w = 1; i < RC_LENGTH; ++i) { - rcon_tab[i] = bytes2word(w, 0, 0, 0); - w = f2(w); - } - - for(i = 0; i < 256; ++i) { - u8 b; - - b = fwd_affine(fi((u8)i)); - w = bytes2word(f2(b), b, b, f3(b)); - - /* tables for a normal encryption round */ - ft_tab[0][i] = w; - ft_tab[1][i] = upr(w, 1); - ft_tab[2][i] = upr(w, 2); - ft_tab[3][i] = upr(w, 3); - w = bytes2word(b, 0, 0, 0); - - /* - * tables for last encryption round - * (may also be used in the key schedule) - */ - fl_tab[0][i] = w; - fl_tab[1][i] = upr(w, 1); - fl_tab[2][i] = upr(w, 2); - fl_tab[3][i] = upr(w, 3); - - b = fi(inv_affine((u8)i)); - w = bytes2word(fe(b), f9(b), fd(b), fb(b)); - - /* tables for the inverse mix column operation */ - im_tab[0][b] = w; - im_tab[1][b] = upr(w, 1); - im_tab[2][b] = upr(w, 2); - im_tab[3][b] = upr(w, 3); - - /* tables for a normal decryption round */ - it_tab[0][i] = w; - it_tab[1][i] = upr(w,1); - it_tab[2][i] = upr(w,2); - it_tab[3][i] = upr(w,3); - - w = bytes2word(b, 0, 0, 0); - - /* tables for last decryption round */ - il_tab[0][i] = w; - il_tab[1][i] = upr(w,1); - il_tab[2][i] = upr(w,2); - il_tab[3][i] = upr(w,3); - } -} - -#define four_tables(x,tab,vf,rf,c) \ -( tab[0][bval(vf(x,0,c),rf(0,c))] ^ \ - tab[1][bval(vf(x,1,c),rf(1,c))] ^ \ - tab[2][bval(vf(x,2,c),rf(2,c))] ^ \ - tab[3][bval(vf(x,3,c),rf(3,c))] \ -) - -#define vf1(x,r,c) (x) -#define rf1(r,c) (r) -#define rf2(r,c) ((r-c)&3) - -#define inv_mcol(x) four_tables(x,im_tab,vf1,rf1,0) -#define ls_box(x,c) four_tables(x,fl_tab,vf1,rf2,c) - -#define ff(x) inv_mcol(x) - -#define ke4(k,i) \ -{ \ - k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; \ - k[4*(i)+5] = ss[1] ^= ss[0]; \ - k[4*(i)+6] = ss[2] ^= ss[1]; \ - k[4*(i)+7] = ss[3] ^= ss[2]; \ -} - -#define kel4(k,i) \ -{ \ - k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; \ - k[4*(i)+5] = ss[1] ^= ss[0]; \ - k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \ -} - -#define ke6(k,i) \ -{ \ - k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \ - k[6*(i)+ 7] = ss[1] ^= ss[0]; \ - k[6*(i)+ 8] = ss[2] ^= ss[1]; \ - k[6*(i)+ 9] = ss[3] ^= ss[2]; \ - k[6*(i)+10] = ss[4] ^= ss[3]; \ - k[6*(i)+11] = ss[5] ^= ss[4]; \ -} - -#define kel6(k,i) \ -{ \ - k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \ - k[6*(i)+ 7] = ss[1] ^= ss[0]; \ - k[6*(i)+ 8] = ss[2] ^= ss[1]; \ - k[6*(i)+ 9] = ss[3] ^= ss[2]; \ -} - -#define ke8(k,i) \ -{ \ - k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \ - k[8*(i)+ 9] = ss[1] ^= ss[0]; \ - k[8*(i)+10] = ss[2] ^= ss[1]; \ - k[8*(i)+11] = ss[3] ^= ss[2]; \ - k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0); \ - k[8*(i)+13] = ss[5] ^= ss[4]; \ - k[8*(i)+14] = ss[6] ^= ss[5]; \ - k[8*(i)+15] = ss[7] ^= ss[6]; \ -} - -#define kel8(k,i) \ -{ \ - k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \ - k[8*(i)+ 9] = ss[1] ^= ss[0]; \ - k[8*(i)+10] = ss[2] ^= ss[1]; \ - k[8*(i)+11] = ss[3] ^= ss[2]; \ -} - -#define kdf4(k,i) \ -{ \ - ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; \ - ss[1] = ss[1] ^ ss[3]; \ - ss[2] = ss[2] ^ ss[3]; \ - ss[3] = ss[3]; \ - ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \ - ss[i % 4] ^= ss[4]; \ - ss[4] ^= k[4*(i)]; \ - k[4*(i)+4] = ff(ss[4]); \ - ss[4] ^= k[4*(i)+1]; \ - k[4*(i)+5] = ff(ss[4]); \ - ss[4] ^= k[4*(i)+2]; \ - k[4*(i)+6] = ff(ss[4]); \ - ss[4] ^= k[4*(i)+3]; \ - k[4*(i)+7] = ff(ss[4]); \ -} - -#define kd4(k,i) \ -{ \ - ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \ - ss[i % 4] ^= ss[4]; \ - ss[4] = ff(ss[4]); \ - k[4*(i)+4] = ss[4] ^= k[4*(i)]; \ - k[4*(i)+5] = ss[4] ^= 
k[4*(i)+1]; \ - k[4*(i)+6] = ss[4] ^= k[4*(i)+2]; \ - k[4*(i)+7] = ss[4] ^= k[4*(i)+3]; \ -} - -#define kdl4(k,i) \ -{ \ - ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \ - ss[i % 4] ^= ss[4]; \ - k[4*(i)+4] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \ - k[4*(i)+5] = ss[1] ^ ss[3]; \ - k[4*(i)+6] = ss[0]; \ - k[4*(i)+7] = ss[1]; \ -} - -#define kdf6(k,i) \ -{ \ - ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \ - k[6*(i)+ 6] = ff(ss[0]); \ - ss[1] ^= ss[0]; \ - k[6*(i)+ 7] = ff(ss[1]); \ - ss[2] ^= ss[1]; \ - k[6*(i)+ 8] = ff(ss[2]); \ - ss[3] ^= ss[2]; \ - k[6*(i)+ 9] = ff(ss[3]); \ - ss[4] ^= ss[3]; \ - k[6*(i)+10] = ff(ss[4]); \ - ss[5] ^= ss[4]; \ - k[6*(i)+11] = ff(ss[5]); \ -} - -#define kd6(k,i) \ -{ \ - ss[6] = ls_box(ss[5],3) ^ rcon_tab[i]; \ - ss[0] ^= ss[6]; ss[6] = ff(ss[6]); \ - k[6*(i)+ 6] = ss[6] ^= k[6*(i)]; \ - ss[1] ^= ss[0]; \ - k[6*(i)+ 7] = ss[6] ^= k[6*(i)+ 1]; \ - ss[2] ^= ss[1]; \ - k[6*(i)+ 8] = ss[6] ^= k[6*(i)+ 2]; \ - ss[3] ^= ss[2]; \ - k[6*(i)+ 9] = ss[6] ^= k[6*(i)+ 3]; \ - ss[4] ^= ss[3]; \ - k[6*(i)+10] = ss[6] ^= k[6*(i)+ 4]; \ - ss[5] ^= ss[4]; \ - k[6*(i)+11] = ss[6] ^= k[6*(i)+ 5]; \ -} - -#define kdl6(k,i) \ -{ \ - ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \ - k[6*(i)+ 6] = ss[0]; \ - ss[1] ^= ss[0]; \ - k[6*(i)+ 7] = ss[1]; \ - ss[2] ^= ss[1]; \ - k[6*(i)+ 8] = ss[2]; \ - ss[3] ^= ss[2]; \ - k[6*(i)+ 9] = ss[3]; \ -} - -#define kdf8(k,i) \ -{ \ - ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \ - k[8*(i)+ 8] = ff(ss[0]); \ - ss[1] ^= ss[0]; \ - k[8*(i)+ 9] = ff(ss[1]); \ - ss[2] ^= ss[1]; \ - k[8*(i)+10] = ff(ss[2]); \ - ss[3] ^= ss[2]; \ - k[8*(i)+11] = ff(ss[3]); \ - ss[4] ^= ls_box(ss[3],0); \ - k[8*(i)+12] = ff(ss[4]); \ - ss[5] ^= ss[4]; \ - k[8*(i)+13] = ff(ss[5]); \ - ss[6] ^= ss[5]; \ - k[8*(i)+14] = ff(ss[6]); \ - ss[7] ^= ss[6]; \ - k[8*(i)+15] = ff(ss[7]); \ -} - -#define kd8(k,i) \ -{ \ - u32 __g = ls_box(ss[7],3) ^ rcon_tab[i]; \ - ss[0] ^= __g; \ - __g = ff(__g); \ - k[8*(i)+ 8] = __g ^= k[8*(i)]; \ - ss[1] ^= ss[0]; \ - k[8*(i)+ 9] = __g ^= k[8*(i)+ 1]; \ - ss[2] ^= ss[1]; \ - k[8*(i)+10] = __g ^= k[8*(i)+ 2]; \ - ss[3] ^= ss[2]; \ - k[8*(i)+11] = __g ^= k[8*(i)+ 3]; \ - __g = ls_box(ss[3],0); \ - ss[4] ^= __g; \ - __g = ff(__g); \ - k[8*(i)+12] = __g ^= k[8*(i)+ 4]; \ - ss[5] ^= ss[4]; \ - k[8*(i)+13] = __g ^= k[8*(i)+ 5]; \ - ss[6] ^= ss[5]; \ - k[8*(i)+14] = __g ^= k[8*(i)+ 6]; \ - ss[7] ^= ss[6]; \ - k[8*(i)+15] = __g ^= k[8*(i)+ 7]; \ -} - -#define kdl8(k,i) \ -{ \ - ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \ - k[8*(i)+ 8] = ss[0]; \ - ss[1] ^= ss[0]; \ - k[8*(i)+ 9] = ss[1]; \ - ss[2] ^= ss[1]; \ - k[8*(i)+10] = ss[2]; \ - ss[3] ^= ss[2]; \ - k[8*(i)+11] = ss[3]; \ -} - -static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, - unsigned int key_len) -{ - int i; - u32 ss[8]; - struct aes_ctx *ctx = crypto_tfm_ctx(tfm); - const __le32 *key = (const __le32 *)in_key; - u32 *flags = &tfm->crt_flags; - - /* encryption schedule */ - - ctx->ekey[0] = ss[0] = le32_to_cpu(key[0]); - ctx->ekey[1] = ss[1] = le32_to_cpu(key[1]); - ctx->ekey[2] = ss[2] = le32_to_cpu(key[2]); - ctx->ekey[3] = ss[3] = le32_to_cpu(key[3]); - - switch(key_len) { - case 16: - for (i = 0; i < 9; i++) - ke4(ctx->ekey, i); - kel4(ctx->ekey, 9); - ctx->rounds = 10; - break; - - case 24: - ctx->ekey[4] = ss[4] = le32_to_cpu(key[4]); - ctx->ekey[5] = ss[5] = le32_to_cpu(key[5]); - for (i = 0; i < 7; i++) - ke6(ctx->ekey, i); - kel6(ctx->ekey, 7); - ctx->rounds = 12; - break; - - case 32: - ctx->ekey[4] = ss[4] = le32_to_cpu(key[4]); - ctx->ekey[5] = ss[5] = le32_to_cpu(key[5]); - 
ctx->ekey[6] = ss[6] = le32_to_cpu(key[6]); - ctx->ekey[7] = ss[7] = le32_to_cpu(key[7]); - for (i = 0; i < 6; i++) - ke8(ctx->ekey, i); - kel8(ctx->ekey, 6); - ctx->rounds = 14; - break; - - default: - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; - } - - /* decryption schedule */ - - ctx->dkey[0] = ss[0] = le32_to_cpu(key[0]); - ctx->dkey[1] = ss[1] = le32_to_cpu(key[1]); - ctx->dkey[2] = ss[2] = le32_to_cpu(key[2]); - ctx->dkey[3] = ss[3] = le32_to_cpu(key[3]); - - switch (key_len) { - case 16: - kdf4(ctx->dkey, 0); - for (i = 1; i < 9; i++) - kd4(ctx->dkey, i); - kdl4(ctx->dkey, 9); - break; - - case 24: - ctx->dkey[4] = ff(ss[4] = le32_to_cpu(key[4])); - ctx->dkey[5] = ff(ss[5] = le32_to_cpu(key[5])); - kdf6(ctx->dkey, 0); - for (i = 1; i < 7; i++) - kd6(ctx->dkey, i); - kdl6(ctx->dkey, 7); - break; - - case 32: - ctx->dkey[4] = ff(ss[4] = le32_to_cpu(key[4])); - ctx->dkey[5] = ff(ss[5] = le32_to_cpu(key[5])); - ctx->dkey[6] = ff(ss[6] = le32_to_cpu(key[6])); - ctx->dkey[7] = ff(ss[7] = le32_to_cpu(key[7])); - kdf8(ctx->dkey, 0); - for (i = 1; i < 6; i++) - kd8(ctx->dkey, i); - kdl8(ctx->dkey, 6); - break; - } - return 0; -} - static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { aes_enc_blk(tfm, dst, src); @@ -479,14 +25,14 @@ static struct crypto_alg aes_alg = { .cra_priority = 200, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct aes_ctx), + .cra_ctxsize = sizeof(struct crypto_aes_ctx), .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), .cra_u = { .cipher = { .cia_min_keysize = AES_MIN_KEY_SIZE, .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = aes_set_key, + .cia_setkey = crypto_aes_set_key, .cia_encrypt = aes_encrypt, .cia_decrypt = aes_decrypt } @@ -495,7 +41,6 @@ static struct crypto_alg aes_alg = { static int __init aes_init(void) { - gen_tabs(); return crypto_register_alg(&aes_alg); } -- cgit v1.2.3 From 06e1a8f0505426a97292174a959560fd86ea0a3d Mon Sep 17 00:00:00 2001 From: Sebastian Siewior Date: Fri, 30 Nov 2007 00:15:11 +1100 Subject: [CRYPTO] aes-asm: Merge common glue code The 32 bit and 64 bit glue code now use the same piece of code. This patch unifies them. 
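The merge is possible because both assembler implementations now assume the same context layout behind crypto_tfm_ctx() and defer key expansion to the generic crypto_aes_set_key(). A minimal sketch of that shared layout, with field sizes inferred from the klen/ekey/dkey offsets (0, 4 and 244) used by the i586 assembler above; treat the exact declaration as an assumption rather than a verbatim copy of the common AES header:

struct crypto_aes_ctx {
	u32 key_length;		/* klen: 16, 24 or 32; the asm compares it against $24 */
	u32 key_enc[60];	/* ekey: 240 bytes, room for 15 four-word round keys */
	u32 key_dec[60];	/* dkey: hence the 4 + 240 = 244 offset */
};

int crypto_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
		       unsigned int key_len);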
Signed-off-by: Sebastian Siewior Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 4 ++-- arch/x86/crypto/aes_32.c | 58 ---------------------------------------------- arch/x86/crypto/aes_64.c | 56 -------------------------------------------- arch/x86/crypto/aes_glue.c | 57 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 116 deletions(-) delete mode 100644 arch/x86/crypto/aes_32.c delete mode 100644 arch/x86/crypto/aes_64.c create mode 100644 arch/x86/crypto/aes_glue.c (limited to 'arch') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 46bb609e244..b8fbb43df6d 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -8,8 +8,8 @@ obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o -aes-i586-y := aes-i586-asm_32.o aes_32.o +aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_32.o -aes-x86_64-y := aes-x86_64-asm_64.o aes_64.o +aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_64.o diff --git a/arch/x86/crypto/aes_32.c b/arch/x86/crypto/aes_32.c deleted file mode 100644 index 8556d9561c2..00000000000 --- a/arch/x86/crypto/aes_32.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Glue Code for optimized 586 assembler version of AES - */ - -#include -#include -#include - -asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); -asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); - -static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - aes_enc_blk(tfm, dst, src); -} - -static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - aes_dec_blk(tfm, dst, src); -} - -static struct crypto_alg aes_alg = { - .cra_name = "aes", - .cra_driver_name = "aes-i586", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx), - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), - .cra_u = { - .cipher = { - .cia_min_keysize = AES_MIN_KEY_SIZE, - .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = crypto_aes_set_key, - .cia_encrypt = aes_encrypt, - .cia_decrypt = aes_decrypt - } - } -}; - -static int __init aes_init(void) -{ - return crypto_register_alg(&aes_alg); -} - -static void __exit aes_fini(void) -{ - crypto_unregister_alg(&aes_alg); -} - -module_init(aes_init); -module_exit(aes_fini); - -MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, i586 asm optimized"); -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_AUTHOR("Fruhwirth Clemens, James Morris, Brian Gladman, Adam Richter"); -MODULE_ALIAS("aes"); diff --git a/arch/x86/crypto/aes_64.c b/arch/x86/crypto/aes_64.c deleted file mode 100644 index d7a41a97dd3..00000000000 --- a/arch/x86/crypto/aes_64.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Glue Code for AES Cipher Algorithm - * - */ - -#include - -asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); -asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); - -static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - aes_enc_blk(tfm, dst, src); -} - -static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - aes_dec_blk(tfm, dst, src); -} - -static struct crypto_alg aes_alg = { - .cra_name = "aes", - .cra_driver_name = "aes-x86_64", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, 
- .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx), - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), - .cra_u = { - .cipher = { - .cia_min_keysize = AES_MIN_KEY_SIZE, - .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = crypto_aes_set_key, - .cia_encrypt = aes_encrypt, - .cia_decrypt = aes_decrypt - } - } -}; - -static int __init aes_init(void) -{ - return crypto_register_alg(&aes_alg); -} - -static void __exit aes_fini(void) -{ - crypto_unregister_alg(&aes_alg); -} - -module_init(aes_init); -module_exit(aes_fini); - -MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("aes"); diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c new file mode 100644 index 00000000000..71f45782711 --- /dev/null +++ b/arch/x86/crypto/aes_glue.c @@ -0,0 +1,57 @@ +/* + * Glue Code for the asm optimized version of the AES Cipher Algorithm + * + */ + +#include + +asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); +asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); + +static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + aes_enc_blk(tfm, dst, src); +} + +static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + aes_dec_blk(tfm, dst, src); +} + +static struct crypto_alg aes_alg = { + .cra_name = "aes", + .cra_driver_name = "aes-asm", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), + .cra_u = { + .cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = crypto_aes_set_key, + .cia_encrypt = aes_encrypt, + .cia_decrypt = aes_decrypt + } + } +}; + +static int __init aes_init(void) +{ + return crypto_register_alg(&aes_alg); +} + +static void __exit aes_fini(void) +{ + crypto_unregister_alg(&aes_alg); +} + +module_init(aes_init); +module_exit(aes_fini); + +MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, asm optimized"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("aes"); +MODULE_ALIAS("aes-asm"); -- cgit v1.2.3 From b0c3e75d857f3785a4b274e26b1c0b2327580dda Mon Sep 17 00:00:00 2001 From: Sebastian Siewior Date: Sat, 1 Dec 2007 12:47:37 +1100 Subject: [CRYPTO] aes_s390: Add fallback driver Some CPUs support only 128 bit keys in HW. This patch adds SW fallback support for the other key sizes that may be required. The generic algorithm (and the block mode) must be available in case of a fallback. Signed-off-by: Sebastian Siewior Signed-off-by: Jan Glauber Signed-off-by: Herbert Xu --- arch/s390/crypto/aes_s390.c | 226 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 205 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 812511bbb54..85246112ab5 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -6,6 +6,7 @@ * s390 Version: * Copyright IBM Corp. 
2005,2007 * Author(s): Jan Glauber (jang@de.ibm.com) + * Sebastian Siewior (sebastian@breakpoint.cc> SW-Fallback * * Derived from "crypto/aes_generic.c" * @@ -18,6 +19,7 @@ #include #include +#include #include #include #include "crypt_s390.h" @@ -34,45 +36,89 @@ struct s390_aes_ctx { long enc; long dec; int key_len; + union { + struct crypto_blkcipher *blk; + struct crypto_cipher *cip; + } fallback; }; -static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, - unsigned int key_len) +/* + * Check if the key_len is supported by the HW. + * Returns 0 if it is, a positive number if it is not and software fallback is + * required or a negative number in case the key size is not valid + */ +static int need_fallback(unsigned int key_len) { - struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); - u32 *flags = &tfm->crt_flags; - switch (key_len) { case 16: if (!(keylen_flag & AES_KEYLEN_128)) - goto fail; + return 1; break; case 24: if (!(keylen_flag & AES_KEYLEN_192)) - goto fail; - + return 1; break; case 32: if (!(keylen_flag & AES_KEYLEN_256)) - goto fail; + return 1; break; default: - goto fail; + return -1; break; } + return 0; +} + +static int setkey_fallback_cip(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + int ret; + + sctx->fallback.blk->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK; + sctx->fallback.blk->base.crt_flags |= (tfm->crt_flags & + CRYPTO_TFM_REQ_MASK); + + ret = crypto_cipher_setkey(sctx->fallback.cip, in_key, key_len); + if (ret) { + tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; + tfm->crt_flags |= (sctx->fallback.blk->base.crt_flags & + CRYPTO_TFM_RES_MASK); + } + return ret; +} + +static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + int ret; + + ret = need_fallback(key_len); + if (ret < 0) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } sctx->key_len = key_len; - memcpy(sctx->key, in_key, key_len); - return 0; -fail: - *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; - return -EINVAL; + if (!ret) { + memcpy(sctx->key, in_key, key_len); + return 0; + } + + return setkey_fallback_cip(tfm, in_key, key_len); } static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { const struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + if (unlikely(need_fallback(sctx->key_len))) { + crypto_cipher_encrypt_one(sctx->fallback.cip, out, in); + return; + } + switch (sctx->key_len) { case 16: crypt_s390_km(KM_AES_128_ENCRYPT, &sctx->key, out, in, @@ -93,6 +139,11 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { const struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + if (unlikely(need_fallback(sctx->key_len))) { + crypto_cipher_decrypt_one(sctx->fallback.cip, out, in); + return; + } + switch (sctx->key_len) { case 16: crypt_s390_km(KM_AES_128_DECRYPT, &sctx->key, out, in, @@ -109,6 +160,29 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) } } +static int fallback_init_cip(struct crypto_tfm *tfm) +{ + const char *name = tfm->__crt_alg->cra_name; + struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + + sctx->fallback.cip = crypto_alloc_cipher(name, 0, + CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK); + + if (IS_ERR(sctx->fallback.cip)) { + printk(KERN_ERR "Error allocating fallback algo %s\n", name); + return PTR_ERR(sctx->fallback.blk); + } + + return 0; +} + +static void fallback_exit_cip(struct crypto_tfm *tfm) +{ + struct s390_aes_ctx *sctx 
= crypto_tfm_ctx(tfm); + + crypto_free_cipher(sctx->fallback.cip); + sctx->fallback.cip = NULL; +} static struct crypto_alg aes_alg = { .cra_name = "aes", @@ -120,6 +194,8 @@ static struct crypto_alg aes_alg = { .cra_ctxsize = sizeof(struct s390_aes_ctx), .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), + .cra_init = fallback_init_cip, + .cra_exit = fallback_exit_cip, .cra_u = { .cipher = { .cia_min_keysize = AES_MIN_KEY_SIZE, @@ -131,10 +207,76 @@ static struct crypto_alg aes_alg = { } }; +static int setkey_fallback_blk(struct crypto_tfm *tfm, const u8 *key, + unsigned int len) +{ + struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + unsigned int ret; + + sctx->fallback.blk->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK; + sctx->fallback.blk->base.crt_flags |= (tfm->crt_flags & + CRYPTO_TFM_REQ_MASK); + + ret = crypto_blkcipher_setkey(sctx->fallback.blk, key, len); + if (ret) { + tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; + tfm->crt_flags |= (sctx->fallback.blk->base.crt_flags & + CRYPTO_TFM_RES_MASK); + } + return ret; +} + +static int fallback_blk_dec(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + unsigned int ret; + struct crypto_blkcipher *tfm; + struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + + memcpy(crypto_blkcipher_crt(sctx->fallback.blk)->iv, desc->info, + AES_BLOCK_SIZE); + + tfm = desc->tfm; + desc->tfm = sctx->fallback.blk; + + ret = crypto_blkcipher_decrypt(desc, dst, src, nbytes); + + desc->tfm = tfm; + return ret; +} + +static int fallback_blk_enc(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + unsigned int ret; + struct crypto_blkcipher *tfm; + struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + + memcpy(crypto_blkcipher_crt(sctx->fallback.blk)->iv, desc->info, + AES_BLOCK_SIZE); + + tfm = desc->tfm; + desc->tfm = sctx->fallback.blk; + + ret = crypto_blkcipher_encrypt(desc, dst, src, nbytes); + + desc->tfm = tfm; + return ret; +} + static int ecb_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + int ret; + + ret = need_fallback(key_len); + if (ret > 0) { + sctx->key_len = key_len; + return setkey_fallback_blk(tfm, in_key, key_len); + } switch (key_len) { case 16: @@ -183,6 +325,9 @@ static int ecb_aes_encrypt(struct blkcipher_desc *desc, struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; + if (unlikely(need_fallback(sctx->key_len))) + return fallback_blk_enc(desc, dst, src, nbytes); + blkcipher_walk_init(&walk, dst, src, nbytes); return ecb_aes_crypt(desc, sctx->enc, sctx->key, &walk); } @@ -194,10 +339,37 @@ static int ecb_aes_decrypt(struct blkcipher_desc *desc, struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; + if (unlikely(need_fallback(sctx->key_len))) + return fallback_blk_dec(desc, dst, src, nbytes); + blkcipher_walk_init(&walk, dst, src, nbytes); return ecb_aes_crypt(desc, sctx->dec, sctx->key, &walk); } +static int fallback_init_blk(struct crypto_tfm *tfm) +{ + const char *name = tfm->__crt_alg->cra_name; + struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + + sctx->fallback.blk = crypto_alloc_blkcipher(name, 0, + CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK); + + if (IS_ERR(sctx->fallback.blk)) { + printk(KERN_ERR "Error allocating fallback algo %s\n", name); + return PTR_ERR(sctx->fallback.blk); + } + + return 0; +} + +static void 
fallback_exit_blk(struct crypto_tfm *tfm) +{ + struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + + crypto_free_blkcipher(sctx->fallback.blk); + sctx->fallback.blk = NULL; +} + static struct crypto_alg ecb_aes_alg = { .cra_name = "ecb(aes)", .cra_driver_name = "ecb-aes-s390", @@ -209,6 +381,8 @@ static struct crypto_alg ecb_aes_alg = { .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(ecb_aes_alg.cra_list), + .cra_init = fallback_init_blk, + .cra_exit = fallback_exit_blk, .cra_u = { .blkcipher = { .min_keysize = AES_MIN_KEY_SIZE, @@ -224,6 +398,13 @@ static int cbc_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + int ret; + + ret = need_fallback(key_len); + if (ret > 0) { + sctx->key_len = key_len; + return setkey_fallback_blk(tfm, in_key, key_len); + } switch (key_len) { case 16: @@ -278,6 +459,9 @@ static int cbc_aes_encrypt(struct blkcipher_desc *desc, struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; + if (unlikely(need_fallback(sctx->key_len))) + return fallback_blk_enc(desc, dst, src, nbytes); + blkcipher_walk_init(&walk, dst, src, nbytes); return cbc_aes_crypt(desc, sctx->enc, sctx->iv, &walk); } @@ -289,6 +473,9 @@ static int cbc_aes_decrypt(struct blkcipher_desc *desc, struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; + if (unlikely(need_fallback(sctx->key_len))) + return fallback_blk_dec(desc, dst, src, nbytes); + blkcipher_walk_init(&walk, dst, src, nbytes); return cbc_aes_crypt(desc, sctx->dec, sctx->iv, &walk); } @@ -304,6 +491,8 @@ static struct crypto_alg cbc_aes_alg = { .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(cbc_aes_alg.cra_list), + .cra_init = fallback_init_blk, + .cra_exit = fallback_exit_blk, .cra_u = { .blkcipher = { .min_keysize = AES_MIN_KEY_SIZE, @@ -331,14 +520,10 @@ static int __init aes_init(void) return -EOPNOTSUPP; /* z9 109 and z9 BC/EC only support 128 bit key length */ - if (keylen_flag == AES_KEYLEN_128) { - aes_alg.cra_u.cipher.cia_max_keysize = AES_MIN_KEY_SIZE; - ecb_aes_alg.cra_u.blkcipher.max_keysize = AES_MIN_KEY_SIZE; - cbc_aes_alg.cra_u.blkcipher.max_keysize = AES_MIN_KEY_SIZE; + if (keylen_flag == AES_KEYLEN_128) printk(KERN_INFO "aes_s390: hardware acceleration only available for" "128 bit keys\n"); - } ret = crypto_register_alg(&aes_alg); if (ret) @@ -377,4 +562,3 @@ MODULE_ALIAS("aes"); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); MODULE_LICENSE("GPL"); - -- cgit v1.2.3 From 2d74d405fc5ea78b20a4a2efd24201db424e07b1 Mon Sep 17 00:00:00 2001 From: Sebastian Siewior Date: Mon, 10 Dec 2007 15:49:41 +0800 Subject: [CRYPTO] s390-aes: Use correct encrypt/decrypt function in fallback crypto_blkcipher_decrypt is wrong because it does not care about the IV. 
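For context: the two blkcipher entry points differ only in where the IV comes from. Roughly, the inline wrappers of this era look as follows (a paraphrase of the include/linux/crypto.h helpers; a sketch, not a verbatim copy):

static inline int crypto_blkcipher_encrypt(struct blkcipher_desc *desc,
					   struct scatterlist *dst,
					   struct scatterlist *src,
					   unsigned int nbytes)
{
	/* points desc->info at the tfm's own internal IV buffer */
	desc->info = crypto_blkcipher_crt(desc->tfm)->iv;
	return crypto_blkcipher_crt(desc->tfm)->encrypt(desc, dst, src, nbytes);
}

static inline int crypto_blkcipher_encrypt_iv(struct blkcipher_desc *desc,
					      struct scatterlist *dst,
					      struct scatterlist *src,
					      unsigned int nbytes)
{
	/* uses desc->info, i.e. the IV the caller passed in */
	return crypto_blkcipher_crt(desc->tfm)->encrypt(desc, dst, src, nbytes);
}

The plain variants redirect desc->info to the fallback tfm's private IV buffer, so the IV updates made by the chaining mode never reach the caller; the _iv variants operate on the caller-supplied IV directly, which is also why the memcpy() workaround removed below becomes unnecessary.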
Signed-off-by: Sebastian Siewior Signed-off-by: Herbert Xu --- arch/s390/crypto/aes_s390.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 85246112ab5..46c97058ebe 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -234,13 +234,10 @@ static int fallback_blk_dec(struct blkcipher_desc *desc, struct crypto_blkcipher *tfm; struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); - memcpy(crypto_blkcipher_crt(sctx->fallback.blk)->iv, desc->info, - AES_BLOCK_SIZE); - tfm = desc->tfm; desc->tfm = sctx->fallback.blk; - ret = crypto_blkcipher_decrypt(desc, dst, src, nbytes); + ret = crypto_blkcipher_decrypt_iv(desc, dst, src, nbytes); desc->tfm = tfm; return ret; @@ -254,13 +251,10 @@ static int fallback_blk_enc(struct blkcipher_desc *desc, struct crypto_blkcipher *tfm; struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); - memcpy(crypto_blkcipher_crt(sctx->fallback.blk)->iv, desc->info, - AES_BLOCK_SIZE); - tfm = desc->tfm; desc->tfm = sctx->fallback.blk; - ret = crypto_blkcipher_encrypt(desc, dst, src, nbytes); + ret = crypto_blkcipher_encrypt_iv(desc, dst, src, nbytes); desc->tfm = tfm; return ret; -- cgit v1.2.3 From 974e4b752ee623854c5dc2bbfc7c7725029ce173 Mon Sep 17 00:00:00 2001 From: Tan Swee Heng Date: Mon, 10 Dec 2007 15:52:56 +0800 Subject: [CRYPTO] salsa20_i586: Salsa20 stream cipher algorithm (i586 version) This patch contains the salsa20-i586 implementation. The original assembly code came from . I have reformatted it (added indents) so that it matches the other algorithms in arch/x86/crypto. Signed-off-by: Tan Swee Heng Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/salsa20-i586-asm_32.S | 1114 +++++++++++++++++++++++++++++++++ arch/x86/crypto/salsa20_glue.c | 127 ++++ 3 files changed, 1243 insertions(+) create mode 100644 arch/x86/crypto/salsa20-i586-asm_32.S create mode 100644 arch/x86/crypto/salsa20_glue.c (limited to 'arch') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index b8fbb43df6d..25cc8441046 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -4,12 +4,14 @@ obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o +obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_32.o +salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_64.o diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S new file mode 100644 index 00000000000..72eb306680b --- /dev/null +++ b/arch/x86/crypto/salsa20-i586-asm_32.S @@ -0,0 +1,1114 @@ +# salsa20_pm.s version 20051229 +# D. J. Bernstein +# Public domain. 
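# (annotation, not part of the original file: a road map for the listing
#  below. The 16-word Salsa20 state x0..x15 lives at 100(%esp)..160(%esp),
#  the saved input words j0..j15 at 164(%esp)..224(%esp). Each pass of
#  ._mainloop performs four rounds built from add/rotate/xor steps with
#  the Salsa20 rotation constants 7, 9, 13 and 18; the round counter
#  starts at 20 and drops by 4 per pass, giving the usual 20 rounds.
#  Each output word is then (x + j) XORed with the message word, and
#  blocks shorter than 64 bytes are bounced through the temporary
#  buffer at 0(%esp).)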
+ +# enter ECRYPT_encrypt_bytes +.text +.p2align 5 +.globl ECRYPT_encrypt_bytes +ECRYPT_encrypt_bytes: + mov %esp,%eax + and $31,%eax + add $256,%eax + sub %eax,%esp + # eax_stack = eax + movl %eax,80(%esp) + # ebx_stack = ebx + movl %ebx,84(%esp) + # esi_stack = esi + movl %esi,88(%esp) + # edi_stack = edi + movl %edi,92(%esp) + # ebp_stack = ebp + movl %ebp,96(%esp) + # x = arg1 + movl 4(%esp,%eax),%edx + # m = arg2 + movl 8(%esp,%eax),%esi + # out = arg3 + movl 12(%esp,%eax),%edi + # bytes = arg4 + movl 16(%esp,%eax),%ebx + # bytes -= 0 + sub $0,%ebx + # goto done if unsigned<= + jbe ._done +._start: + # in0 = *(uint32 *) (x + 0) + movl 0(%edx),%eax + # in1 = *(uint32 *) (x + 4) + movl 4(%edx),%ecx + # in2 = *(uint32 *) (x + 8) + movl 8(%edx),%ebp + # j0 = in0 + movl %eax,164(%esp) + # in3 = *(uint32 *) (x + 12) + movl 12(%edx),%eax + # j1 = in1 + movl %ecx,168(%esp) + # in4 = *(uint32 *) (x + 16) + movl 16(%edx),%ecx + # j2 = in2 + movl %ebp,172(%esp) + # in5 = *(uint32 *) (x + 20) + movl 20(%edx),%ebp + # j3 = in3 + movl %eax,176(%esp) + # in6 = *(uint32 *) (x + 24) + movl 24(%edx),%eax + # j4 = in4 + movl %ecx,180(%esp) + # in7 = *(uint32 *) (x + 28) + movl 28(%edx),%ecx + # j5 = in5 + movl %ebp,184(%esp) + # in8 = *(uint32 *) (x + 32) + movl 32(%edx),%ebp + # j6 = in6 + movl %eax,188(%esp) + # in9 = *(uint32 *) (x + 36) + movl 36(%edx),%eax + # j7 = in7 + movl %ecx,192(%esp) + # in10 = *(uint32 *) (x + 40) + movl 40(%edx),%ecx + # j8 = in8 + movl %ebp,196(%esp) + # in11 = *(uint32 *) (x + 44) + movl 44(%edx),%ebp + # j9 = in9 + movl %eax,200(%esp) + # in12 = *(uint32 *) (x + 48) + movl 48(%edx),%eax + # j10 = in10 + movl %ecx,204(%esp) + # in13 = *(uint32 *) (x + 52) + movl 52(%edx),%ecx + # j11 = in11 + movl %ebp,208(%esp) + # in14 = *(uint32 *) (x + 56) + movl 56(%edx),%ebp + # j12 = in12 + movl %eax,212(%esp) + # in15 = *(uint32 *) (x + 60) + movl 60(%edx),%eax + # j13 = in13 + movl %ecx,216(%esp) + # j14 = in14 + movl %ebp,220(%esp) + # j15 = in15 + movl %eax,224(%esp) + # x_backup = x + movl %edx,64(%esp) +._bytesatleast1: + # bytes - 64 + cmp $64,%ebx + # goto nocopy if unsigned>= + jae ._nocopy + # ctarget = out + movl %edi,228(%esp) + # out = &tmp + leal 0(%esp),%edi + # i = bytes + mov %ebx,%ecx + # while (i) { *out++ = *m++; --i } + rep movsb + # out = &tmp + leal 0(%esp),%edi + # m = &tmp + leal 0(%esp),%esi +._nocopy: + # out_backup = out + movl %edi,72(%esp) + # m_backup = m + movl %esi,68(%esp) + # bytes_backup = bytes + movl %ebx,76(%esp) + # in0 = j0 + movl 164(%esp),%eax + # in1 = j1 + movl 168(%esp),%ecx + # in2 = j2 + movl 172(%esp),%edx + # in3 = j3 + movl 176(%esp),%ebx + # x0 = in0 + movl %eax,100(%esp) + # x1 = in1 + movl %ecx,104(%esp) + # x2 = in2 + movl %edx,108(%esp) + # x3 = in3 + movl %ebx,112(%esp) + # in4 = j4 + movl 180(%esp),%eax + # in5 = j5 + movl 184(%esp),%ecx + # in6 = j6 + movl 188(%esp),%edx + # in7 = j7 + movl 192(%esp),%ebx + # x4 = in4 + movl %eax,116(%esp) + # x5 = in5 + movl %ecx,120(%esp) + # x6 = in6 + movl %edx,124(%esp) + # x7 = in7 + movl %ebx,128(%esp) + # in8 = j8 + movl 196(%esp),%eax + # in9 = j9 + movl 200(%esp),%ecx + # in10 = j10 + movl 204(%esp),%edx + # in11 = j11 + movl 208(%esp),%ebx + # x8 = in8 + movl %eax,132(%esp) + # x9 = in9 + movl %ecx,136(%esp) + # x10 = in10 + movl %edx,140(%esp) + # x11 = in11 + movl %ebx,144(%esp) + # in12 = j12 + movl 212(%esp),%eax + # in13 = j13 + movl 216(%esp),%ecx + # in14 = j14 + movl 220(%esp),%edx + # in15 = j15 + movl 224(%esp),%ebx + # x12 = in12 + movl %eax,148(%esp) + # x13 = in13 + 
movl %ecx,152(%esp) + # x14 = in14 + movl %edx,156(%esp) + # x15 = in15 + movl %ebx,160(%esp) + # i = 20 + mov $20,%ebp + # p = x0 + movl 100(%esp),%eax + # s = x5 + movl 120(%esp),%ecx + # t = x10 + movl 140(%esp),%edx + # w = x15 + movl 160(%esp),%ebx +._mainloop: + # x0 = p + movl %eax,100(%esp) + # x10 = t + movl %edx,140(%esp) + # p += x12 + addl 148(%esp),%eax + # x5 = s + movl %ecx,120(%esp) + # t += x6 + addl 124(%esp),%edx + # x15 = w + movl %ebx,160(%esp) + # r = x1 + movl 104(%esp),%esi + # r += s + add %ecx,%esi + # v = x11 + movl 144(%esp),%edi + # v += w + add %ebx,%edi + # p <<<= 7 + rol $7,%eax + # p ^= x4 + xorl 116(%esp),%eax + # t <<<= 7 + rol $7,%edx + # t ^= x14 + xorl 156(%esp),%edx + # r <<<= 7 + rol $7,%esi + # r ^= x9 + xorl 136(%esp),%esi + # v <<<= 7 + rol $7,%edi + # v ^= x3 + xorl 112(%esp),%edi + # x4 = p + movl %eax,116(%esp) + # x14 = t + movl %edx,156(%esp) + # p += x0 + addl 100(%esp),%eax + # x9 = r + movl %esi,136(%esp) + # t += x10 + addl 140(%esp),%edx + # x3 = v + movl %edi,112(%esp) + # p <<<= 9 + rol $9,%eax + # p ^= x8 + xorl 132(%esp),%eax + # t <<<= 9 + rol $9,%edx + # t ^= x2 + xorl 108(%esp),%edx + # s += r + add %esi,%ecx + # s <<<= 9 + rol $9,%ecx + # s ^= x13 + xorl 152(%esp),%ecx + # w += v + add %edi,%ebx + # w <<<= 9 + rol $9,%ebx + # w ^= x7 + xorl 128(%esp),%ebx + # x8 = p + movl %eax,132(%esp) + # x2 = t + movl %edx,108(%esp) + # p += x4 + addl 116(%esp),%eax + # x13 = s + movl %ecx,152(%esp) + # t += x14 + addl 156(%esp),%edx + # x7 = w + movl %ebx,128(%esp) + # p <<<= 13 + rol $13,%eax + # p ^= x12 + xorl 148(%esp),%eax + # t <<<= 13 + rol $13,%edx + # t ^= x6 + xorl 124(%esp),%edx + # r += s + add %ecx,%esi + # r <<<= 13 + rol $13,%esi + # r ^= x1 + xorl 104(%esp),%esi + # v += w + add %ebx,%edi + # v <<<= 13 + rol $13,%edi + # v ^= x11 + xorl 144(%esp),%edi + # x12 = p + movl %eax,148(%esp) + # x6 = t + movl %edx,124(%esp) + # p += x8 + addl 132(%esp),%eax + # x1 = r + movl %esi,104(%esp) + # t += x2 + addl 108(%esp),%edx + # x11 = v + movl %edi,144(%esp) + # p <<<= 18 + rol $18,%eax + # p ^= x0 + xorl 100(%esp),%eax + # t <<<= 18 + rol $18,%edx + # t ^= x10 + xorl 140(%esp),%edx + # s += r + add %esi,%ecx + # s <<<= 18 + rol $18,%ecx + # s ^= x5 + xorl 120(%esp),%ecx + # w += v + add %edi,%ebx + # w <<<= 18 + rol $18,%ebx + # w ^= x15 + xorl 160(%esp),%ebx + # x0 = p + movl %eax,100(%esp) + # x10 = t + movl %edx,140(%esp) + # p += x3 + addl 112(%esp),%eax + # p <<<= 7 + rol $7,%eax + # x5 = s + movl %ecx,120(%esp) + # t += x9 + addl 136(%esp),%edx + # x15 = w + movl %ebx,160(%esp) + # r = x4 + movl 116(%esp),%esi + # r += s + add %ecx,%esi + # v = x14 + movl 156(%esp),%edi + # v += w + add %ebx,%edi + # p ^= x1 + xorl 104(%esp),%eax + # t <<<= 7 + rol $7,%edx + # t ^= x11 + xorl 144(%esp),%edx + # r <<<= 7 + rol $7,%esi + # r ^= x6 + xorl 124(%esp),%esi + # v <<<= 7 + rol $7,%edi + # v ^= x12 + xorl 148(%esp),%edi + # x1 = p + movl %eax,104(%esp) + # x11 = t + movl %edx,144(%esp) + # p += x0 + addl 100(%esp),%eax + # x6 = r + movl %esi,124(%esp) + # t += x10 + addl 140(%esp),%edx + # x12 = v + movl %edi,148(%esp) + # p <<<= 9 + rol $9,%eax + # p ^= x2 + xorl 108(%esp),%eax + # t <<<= 9 + rol $9,%edx + # t ^= x8 + xorl 132(%esp),%edx + # s += r + add %esi,%ecx + # s <<<= 9 + rol $9,%ecx + # s ^= x7 + xorl 128(%esp),%ecx + # w += v + add %edi,%ebx + # w <<<= 9 + rol $9,%ebx + # w ^= x13 + xorl 152(%esp),%ebx + # x2 = p + movl %eax,108(%esp) + # x8 = t + movl %edx,132(%esp) + # p += x1 + addl 104(%esp),%eax + # x7 = s + movl 
%ecx,128(%esp) + # t += x11 + addl 144(%esp),%edx + # x13 = w + movl %ebx,152(%esp) + # p <<<= 13 + rol $13,%eax + # p ^= x3 + xorl 112(%esp),%eax + # t <<<= 13 + rol $13,%edx + # t ^= x9 + xorl 136(%esp),%edx + # r += s + add %ecx,%esi + # r <<<= 13 + rol $13,%esi + # r ^= x4 + xorl 116(%esp),%esi + # v += w + add %ebx,%edi + # v <<<= 13 + rol $13,%edi + # v ^= x14 + xorl 156(%esp),%edi + # x3 = p + movl %eax,112(%esp) + # x9 = t + movl %edx,136(%esp) + # p += x2 + addl 108(%esp),%eax + # x4 = r + movl %esi,116(%esp) + # t += x8 + addl 132(%esp),%edx + # x14 = v + movl %edi,156(%esp) + # p <<<= 18 + rol $18,%eax + # p ^= x0 + xorl 100(%esp),%eax + # t <<<= 18 + rol $18,%edx + # t ^= x10 + xorl 140(%esp),%edx + # s += r + add %esi,%ecx + # s <<<= 18 + rol $18,%ecx + # s ^= x5 + xorl 120(%esp),%ecx + # w += v + add %edi,%ebx + # w <<<= 18 + rol $18,%ebx + # w ^= x15 + xorl 160(%esp),%ebx + # x0 = p + movl %eax,100(%esp) + # x10 = t + movl %edx,140(%esp) + # p += x12 + addl 148(%esp),%eax + # x5 = s + movl %ecx,120(%esp) + # t += x6 + addl 124(%esp),%edx + # x15 = w + movl %ebx,160(%esp) + # r = x1 + movl 104(%esp),%esi + # r += s + add %ecx,%esi + # v = x11 + movl 144(%esp),%edi + # v += w + add %ebx,%edi + # p <<<= 7 + rol $7,%eax + # p ^= x4 + xorl 116(%esp),%eax + # t <<<= 7 + rol $7,%edx + # t ^= x14 + xorl 156(%esp),%edx + # r <<<= 7 + rol $7,%esi + # r ^= x9 + xorl 136(%esp),%esi + # v <<<= 7 + rol $7,%edi + # v ^= x3 + xorl 112(%esp),%edi + # x4 = p + movl %eax,116(%esp) + # x14 = t + movl %edx,156(%esp) + # p += x0 + addl 100(%esp),%eax + # x9 = r + movl %esi,136(%esp) + # t += x10 + addl 140(%esp),%edx + # x3 = v + movl %edi,112(%esp) + # p <<<= 9 + rol $9,%eax + # p ^= x8 + xorl 132(%esp),%eax + # t <<<= 9 + rol $9,%edx + # t ^= x2 + xorl 108(%esp),%edx + # s += r + add %esi,%ecx + # s <<<= 9 + rol $9,%ecx + # s ^= x13 + xorl 152(%esp),%ecx + # w += v + add %edi,%ebx + # w <<<= 9 + rol $9,%ebx + # w ^= x7 + xorl 128(%esp),%ebx + # x8 = p + movl %eax,132(%esp) + # x2 = t + movl %edx,108(%esp) + # p += x4 + addl 116(%esp),%eax + # x13 = s + movl %ecx,152(%esp) + # t += x14 + addl 156(%esp),%edx + # x7 = w + movl %ebx,128(%esp) + # p <<<= 13 + rol $13,%eax + # p ^= x12 + xorl 148(%esp),%eax + # t <<<= 13 + rol $13,%edx + # t ^= x6 + xorl 124(%esp),%edx + # r += s + add %ecx,%esi + # r <<<= 13 + rol $13,%esi + # r ^= x1 + xorl 104(%esp),%esi + # v += w + add %ebx,%edi + # v <<<= 13 + rol $13,%edi + # v ^= x11 + xorl 144(%esp),%edi + # x12 = p + movl %eax,148(%esp) + # x6 = t + movl %edx,124(%esp) + # p += x8 + addl 132(%esp),%eax + # x1 = r + movl %esi,104(%esp) + # t += x2 + addl 108(%esp),%edx + # x11 = v + movl %edi,144(%esp) + # p <<<= 18 + rol $18,%eax + # p ^= x0 + xorl 100(%esp),%eax + # t <<<= 18 + rol $18,%edx + # t ^= x10 + xorl 140(%esp),%edx + # s += r + add %esi,%ecx + # s <<<= 18 + rol $18,%ecx + # s ^= x5 + xorl 120(%esp),%ecx + # w += v + add %edi,%ebx + # w <<<= 18 + rol $18,%ebx + # w ^= x15 + xorl 160(%esp),%ebx + # x0 = p + movl %eax,100(%esp) + # x10 = t + movl %edx,140(%esp) + # p += x3 + addl 112(%esp),%eax + # p <<<= 7 + rol $7,%eax + # x5 = s + movl %ecx,120(%esp) + # t += x9 + addl 136(%esp),%edx + # x15 = w + movl %ebx,160(%esp) + # r = x4 + movl 116(%esp),%esi + # r += s + add %ecx,%esi + # v = x14 + movl 156(%esp),%edi + # v += w + add %ebx,%edi + # p ^= x1 + xorl 104(%esp),%eax + # t <<<= 7 + rol $7,%edx + # t ^= x11 + xorl 144(%esp),%edx + # r <<<= 7 + rol $7,%esi + # r ^= x6 + xorl 124(%esp),%esi + # v <<<= 7 + rol $7,%edi + # v ^= x12 + xorl 
148(%esp),%edi + # x1 = p + movl %eax,104(%esp) + # x11 = t + movl %edx,144(%esp) + # p += x0 + addl 100(%esp),%eax + # x6 = r + movl %esi,124(%esp) + # t += x10 + addl 140(%esp),%edx + # x12 = v + movl %edi,148(%esp) + # p <<<= 9 + rol $9,%eax + # p ^= x2 + xorl 108(%esp),%eax + # t <<<= 9 + rol $9,%edx + # t ^= x8 + xorl 132(%esp),%edx + # s += r + add %esi,%ecx + # s <<<= 9 + rol $9,%ecx + # s ^= x7 + xorl 128(%esp),%ecx + # w += v + add %edi,%ebx + # w <<<= 9 + rol $9,%ebx + # w ^= x13 + xorl 152(%esp),%ebx + # x2 = p + movl %eax,108(%esp) + # x8 = t + movl %edx,132(%esp) + # p += x1 + addl 104(%esp),%eax + # x7 = s + movl %ecx,128(%esp) + # t += x11 + addl 144(%esp),%edx + # x13 = w + movl %ebx,152(%esp) + # p <<<= 13 + rol $13,%eax + # p ^= x3 + xorl 112(%esp),%eax + # t <<<= 13 + rol $13,%edx + # t ^= x9 + xorl 136(%esp),%edx + # r += s + add %ecx,%esi + # r <<<= 13 + rol $13,%esi + # r ^= x4 + xorl 116(%esp),%esi + # v += w + add %ebx,%edi + # v <<<= 13 + rol $13,%edi + # v ^= x14 + xorl 156(%esp),%edi + # x3 = p + movl %eax,112(%esp) + # x9 = t + movl %edx,136(%esp) + # p += x2 + addl 108(%esp),%eax + # x4 = r + movl %esi,116(%esp) + # t += x8 + addl 132(%esp),%edx + # x14 = v + movl %edi,156(%esp) + # p <<<= 18 + rol $18,%eax + # p ^= x0 + xorl 100(%esp),%eax + # t <<<= 18 + rol $18,%edx + # t ^= x10 + xorl 140(%esp),%edx + # s += r + add %esi,%ecx + # s <<<= 18 + rol $18,%ecx + # s ^= x5 + xorl 120(%esp),%ecx + # w += v + add %edi,%ebx + # w <<<= 18 + rol $18,%ebx + # w ^= x15 + xorl 160(%esp),%ebx + # i -= 4 + sub $4,%ebp + # goto mainloop if unsigned > + ja ._mainloop + # x0 = p + movl %eax,100(%esp) + # x5 = s + movl %ecx,120(%esp) + # x10 = t + movl %edx,140(%esp) + # x15 = w + movl %ebx,160(%esp) + # out = out_backup + movl 72(%esp),%edi + # m = m_backup + movl 68(%esp),%esi + # in0 = x0 + movl 100(%esp),%eax + # in1 = x1 + movl 104(%esp),%ecx + # in0 += j0 + addl 164(%esp),%eax + # in1 += j1 + addl 168(%esp),%ecx + # in0 ^= *(uint32 *) (m + 0) + xorl 0(%esi),%eax + # in1 ^= *(uint32 *) (m + 4) + xorl 4(%esi),%ecx + # *(uint32 *) (out + 0) = in0 + movl %eax,0(%edi) + # *(uint32 *) (out + 4) = in1 + movl %ecx,4(%edi) + # in2 = x2 + movl 108(%esp),%eax + # in3 = x3 + movl 112(%esp),%ecx + # in2 += j2 + addl 172(%esp),%eax + # in3 += j3 + addl 176(%esp),%ecx + # in2 ^= *(uint32 *) (m + 8) + xorl 8(%esi),%eax + # in3 ^= *(uint32 *) (m + 12) + xorl 12(%esi),%ecx + # *(uint32 *) (out + 8) = in2 + movl %eax,8(%edi) + # *(uint32 *) (out + 12) = in3 + movl %ecx,12(%edi) + # in4 = x4 + movl 116(%esp),%eax + # in5 = x5 + movl 120(%esp),%ecx + # in4 += j4 + addl 180(%esp),%eax + # in5 += j5 + addl 184(%esp),%ecx + # in4 ^= *(uint32 *) (m + 16) + xorl 16(%esi),%eax + # in5 ^= *(uint32 *) (m + 20) + xorl 20(%esi),%ecx + # *(uint32 *) (out + 16) = in4 + movl %eax,16(%edi) + # *(uint32 *) (out + 20) = in5 + movl %ecx,20(%edi) + # in6 = x6 + movl 124(%esp),%eax + # in7 = x7 + movl 128(%esp),%ecx + # in6 += j6 + addl 188(%esp),%eax + # in7 += j7 + addl 192(%esp),%ecx + # in6 ^= *(uint32 *) (m + 24) + xorl 24(%esi),%eax + # in7 ^= *(uint32 *) (m + 28) + xorl 28(%esi),%ecx + # *(uint32 *) (out + 24) = in6 + movl %eax,24(%edi) + # *(uint32 *) (out + 28) = in7 + movl %ecx,28(%edi) + # in8 = x8 + movl 132(%esp),%eax + # in9 = x9 + movl 136(%esp),%ecx + # in8 += j8 + addl 196(%esp),%eax + # in9 += j9 + addl 200(%esp),%ecx + # in8 ^= *(uint32 *) (m + 32) + xorl 32(%esi),%eax + # in9 ^= *(uint32 *) (m + 36) + xorl 36(%esi),%ecx + # *(uint32 *) (out + 32) = in8 + movl %eax,32(%edi) + # *(uint32 *) 
(out + 36) = in9 + movl %ecx,36(%edi) + # in10 = x10 + movl 140(%esp),%eax + # in11 = x11 + movl 144(%esp),%ecx + # in10 += j10 + addl 204(%esp),%eax + # in11 += j11 + addl 208(%esp),%ecx + # in10 ^= *(uint32 *) (m + 40) + xorl 40(%esi),%eax + # in11 ^= *(uint32 *) (m + 44) + xorl 44(%esi),%ecx + # *(uint32 *) (out + 40) = in10 + movl %eax,40(%edi) + # *(uint32 *) (out + 44) = in11 + movl %ecx,44(%edi) + # in12 = x12 + movl 148(%esp),%eax + # in13 = x13 + movl 152(%esp),%ecx + # in12 += j12 + addl 212(%esp),%eax + # in13 += j13 + addl 216(%esp),%ecx + # in12 ^= *(uint32 *) (m + 48) + xorl 48(%esi),%eax + # in13 ^= *(uint32 *) (m + 52) + xorl 52(%esi),%ecx + # *(uint32 *) (out + 48) = in12 + movl %eax,48(%edi) + # *(uint32 *) (out + 52) = in13 + movl %ecx,52(%edi) + # in14 = x14 + movl 156(%esp),%eax + # in15 = x15 + movl 160(%esp),%ecx + # in14 += j14 + addl 220(%esp),%eax + # in15 += j15 + addl 224(%esp),%ecx + # in14 ^= *(uint32 *) (m + 56) + xorl 56(%esi),%eax + # in15 ^= *(uint32 *) (m + 60) + xorl 60(%esi),%ecx + # *(uint32 *) (out + 56) = in14 + movl %eax,56(%edi) + # *(uint32 *) (out + 60) = in15 + movl %ecx,60(%edi) + # bytes = bytes_backup + movl 76(%esp),%ebx + # in8 = j8 + movl 196(%esp),%eax + # in9 = j9 + movl 200(%esp),%ecx + # in8 += 1 + add $1,%eax + # in9 += 0 + carry + adc $0,%ecx + # j8 = in8 + movl %eax,196(%esp) + # j9 = in9 + movl %ecx,200(%esp) + # bytes - 64 + cmp $64,%ebx + # goto bytesatleast65 if unsigned> + ja ._bytesatleast65 + # goto bytesatleast64 if unsigned>= + jae ._bytesatleast64 + # m = out + mov %edi,%esi + # out = ctarget + movl 228(%esp),%edi + # i = bytes + mov %ebx,%ecx + # while (i) { *out++ = *m++; --i } + rep movsb +._bytesatleast64: + # x = x_backup + movl 64(%esp),%eax + # in8 = j8 + movl 196(%esp),%ecx + # in9 = j9 + movl 200(%esp),%edx + # *(uint32 *) (x + 32) = in8 + movl %ecx,32(%eax) + # *(uint32 *) (x + 36) = in9 + movl %edx,36(%eax) +._done: + # eax = eax_stack + movl 80(%esp),%eax + # ebx = ebx_stack + movl 84(%esp),%ebx + # esi = esi_stack + movl 88(%esp),%esi + # edi = edi_stack + movl 92(%esp),%edi + # ebp = ebp_stack + movl 96(%esp),%ebp + # leave + add %eax,%esp + ret +._bytesatleast65: + # bytes -= 64 + sub $64,%ebx + # out += 64 + add $64,%edi + # m += 64 + add $64,%esi + # goto bytesatleast1 + jmp ._bytesatleast1 +# enter ECRYPT_keysetup +.text +.p2align 5 +.globl ECRYPT_keysetup +ECRYPT_keysetup: + mov %esp,%eax + and $31,%eax + add $256,%eax + sub %eax,%esp + # eax_stack = eax + movl %eax,64(%esp) + # ebx_stack = ebx + movl %ebx,68(%esp) + # esi_stack = esi + movl %esi,72(%esp) + # edi_stack = edi + movl %edi,76(%esp) + # ebp_stack = ebp + movl %ebp,80(%esp) + # k = arg2 + movl 8(%esp,%eax),%ecx + # kbits = arg3 + movl 12(%esp,%eax),%edx + # x = arg1 + movl 4(%esp,%eax),%eax + # in1 = *(uint32 *) (k + 0) + movl 0(%ecx),%ebx + # in2 = *(uint32 *) (k + 4) + movl 4(%ecx),%esi + # in3 = *(uint32 *) (k + 8) + movl 8(%ecx),%edi + # in4 = *(uint32 *) (k + 12) + movl 12(%ecx),%ebp + # *(uint32 *) (x + 4) = in1 + movl %ebx,4(%eax) + # *(uint32 *) (x + 8) = in2 + movl %esi,8(%eax) + # *(uint32 *) (x + 12) = in3 + movl %edi,12(%eax) + # *(uint32 *) (x + 16) = in4 + movl %ebp,16(%eax) + # kbits - 256 + cmp $256,%edx + # goto kbits128 if unsigned< + jb ._kbits128 +._kbits256: + # in11 = *(uint32 *) (k + 16) + movl 16(%ecx),%edx + # in12 = *(uint32 *) (k + 20) + movl 20(%ecx),%ebx + # in13 = *(uint32 *) (k + 24) + movl 24(%ecx),%esi + # in14 = *(uint32 *) (k + 28) + movl 28(%ecx),%ecx + # *(uint32 *) (x + 44) = in11 + movl %edx,44(%eax) + # 
*(uint32 *) (x + 48) = in12 + movl %ebx,48(%eax) + # *(uint32 *) (x + 52) = in13 + movl %esi,52(%eax) + # *(uint32 *) (x + 56) = in14 + movl %ecx,56(%eax) + # in0 = 1634760805 + mov $1634760805,%ecx + # in5 = 857760878 + mov $857760878,%edx + # in10 = 2036477234 + mov $2036477234,%ebx + # in15 = 1797285236 + mov $1797285236,%esi + # *(uint32 *) (x + 0) = in0 + movl %ecx,0(%eax) + # *(uint32 *) (x + 20) = in5 + movl %edx,20(%eax) + # *(uint32 *) (x + 40) = in10 + movl %ebx,40(%eax) + # *(uint32 *) (x + 60) = in15 + movl %esi,60(%eax) + # goto keysetupdone + jmp ._keysetupdone +._kbits128: + # in11 = *(uint32 *) (k + 0) + movl 0(%ecx),%edx + # in12 = *(uint32 *) (k + 4) + movl 4(%ecx),%ebx + # in13 = *(uint32 *) (k + 8) + movl 8(%ecx),%esi + # in14 = *(uint32 *) (k + 12) + movl 12(%ecx),%ecx + # *(uint32 *) (x + 44) = in11 + movl %edx,44(%eax) + # *(uint32 *) (x + 48) = in12 + movl %ebx,48(%eax) + # *(uint32 *) (x + 52) = in13 + movl %esi,52(%eax) + # *(uint32 *) (x + 56) = in14 + movl %ecx,56(%eax) + # in0 = 1634760805 + mov $1634760805,%ecx + # in5 = 824206446 + mov $824206446,%edx + # in10 = 2036477238 + mov $2036477238,%ebx + # in15 = 1797285236 + mov $1797285236,%esi + # *(uint32 *) (x + 0) = in0 + movl %ecx,0(%eax) + # *(uint32 *) (x + 20) = in5 + movl %edx,20(%eax) + # *(uint32 *) (x + 40) = in10 + movl %ebx,40(%eax) + # *(uint32 *) (x + 60) = in15 + movl %esi,60(%eax) +._keysetupdone: + # eax = eax_stack + movl 64(%esp),%eax + # ebx = ebx_stack + movl 68(%esp),%ebx + # esi = esi_stack + movl 72(%esp),%esi + # edi = edi_stack + movl 76(%esp),%edi + # ebp = ebp_stack + movl 80(%esp),%ebp + # leave + add %eax,%esp + ret +# enter ECRYPT_ivsetup +.text +.p2align 5 +.globl ECRYPT_ivsetup +ECRYPT_ivsetup: + mov %esp,%eax + and $31,%eax + add $256,%eax + sub %eax,%esp + # eax_stack = eax + movl %eax,64(%esp) + # ebx_stack = ebx + movl %ebx,68(%esp) + # esi_stack = esi + movl %esi,72(%esp) + # edi_stack = edi + movl %edi,76(%esp) + # ebp_stack = ebp + movl %ebp,80(%esp) + # iv = arg2 + movl 8(%esp,%eax),%ecx + # x = arg1 + movl 4(%esp,%eax),%eax + # in6 = *(uint32 *) (iv + 0) + movl 0(%ecx),%edx + # in7 = *(uint32 *) (iv + 4) + movl 4(%ecx),%ecx + # in8 = 0 + mov $0,%ebx + # in9 = 0 + mov $0,%esi + # *(uint32 *) (x + 24) = in6 + movl %edx,24(%eax) + # *(uint32 *) (x + 28) = in7 + movl %ecx,28(%eax) + # *(uint32 *) (x + 32) = in8 + movl %ebx,32(%eax) + # *(uint32 *) (x + 36) = in9 + movl %esi,36(%eax) + # eax = eax_stack + movl 64(%esp),%eax + # ebx = ebx_stack + movl 68(%esp),%ebx + # esi = esi_stack + movl 72(%esp),%esi + # edi = edi_stack + movl 76(%esp),%edi + # ebp = ebp_stack + movl 80(%esp),%ebp + # leave + add %eax,%esp + ret diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c new file mode 100644 index 00000000000..3be443995ed --- /dev/null +++ b/arch/x86/crypto/salsa20_glue.c @@ -0,0 +1,127 @@ +/* + * Glue code for optimized assembly version of Salsa20. + * + * Copyright (c) 2007 Tan Swee Heng + * + * The assembly codes are public domain assembly codes written by Daniel. J. + * Bernstein . The codes are modified to include indentation + * and to remove extraneous comments and functions that are not needed. + * - i586 version, renamed as salsa20-i586-asm_32.S + * available from + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + */ + +#include +#include +#include + +#define SALSA20_IV_SIZE 8U +#define SALSA20_MIN_KEY_SIZE 16U +#define SALSA20_MAX_KEY_SIZE 32U + +// use the ECRYPT_* function names +#define salsa20_keysetup ECRYPT_keysetup +#define salsa20_ivsetup ECRYPT_ivsetup +#define salsa20_encrypt_bytes ECRYPT_encrypt_bytes + +struct salsa20_ctx +{ + u32 input[16]; +}; + +asmlinkage void salsa20_keysetup(struct salsa20_ctx *ctx, const u8 *k, + u32 keysize, u32 ivsize); +asmlinkage void salsa20_ivsetup(struct salsa20_ctx *ctx, const u8 *iv); +asmlinkage void salsa20_encrypt_bytes(struct salsa20_ctx *ctx, + const u8 *src, u8 *dst, u32 bytes); + +static int setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keysize) +{ + struct salsa20_ctx *ctx = crypto_tfm_ctx(tfm); + salsa20_keysetup(ctx, key, keysize*8, SALSA20_IV_SIZE*8); + return 0; +} + +static int encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct blkcipher_walk walk; + struct crypto_blkcipher *tfm = desc->tfm; + struct salsa20_ctx *ctx = crypto_blkcipher_ctx(tfm); + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, 64); + + salsa20_ivsetup(ctx, walk.iv); + + if (likely(walk.nbytes == nbytes)) + { + salsa20_encrypt_bytes(ctx, walk.src.virt.addr, + walk.dst.virt.addr, nbytes); + return blkcipher_walk_done(desc, &walk, 0); + } + + while (walk.nbytes >= 64) { + salsa20_encrypt_bytes(ctx, walk.src.virt.addr, + walk.dst.virt.addr, + walk.nbytes - (walk.nbytes % 64)); + err = blkcipher_walk_done(desc, &walk, walk.nbytes % 64); + } + + if (walk.nbytes) { + salsa20_encrypt_bytes(ctx, walk.src.virt.addr, + walk.dst.virt.addr, walk.nbytes); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} + +static struct crypto_alg alg = { + .cra_name = "salsa20", + .cra_driver_name = "salsa20-asm", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_type = &crypto_blkcipher_type, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct salsa20_ctx), + .cra_alignmask = 3, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg.cra_list), + .cra_u = { + .blkcipher = { + .setkey = setkey, + .encrypt = encrypt, + .decrypt = encrypt, + .min_keysize = SALSA20_MIN_KEY_SIZE, + .max_keysize = SALSA20_MAX_KEY_SIZE, + .ivsize = SALSA20_IV_SIZE, + } + } +}; + +static int __init init(void) +{ + return crypto_register_alg(&alg); +} + +static void __exit fini(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)"); +MODULE_ALIAS("salsa20"); +MODULE_ALIAS("salsa20-asm"); -- cgit v1.2.3 From 9a7dafbba47384c330779c75a1546684efaa8c1a Mon Sep 17 00:00:00 2001 From: Tan Swee Heng Date: Tue, 18 Dec 2007 00:04:40 +0800 Subject: [CRYPTO] salsa20: Add x86-64 assembly version This is the x86-64 version of the Salsa20 stream cipher algorithm. The original assembly code came from . It has been reformatted for clarity. 
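
The add/rotate/xor sequences annotated in the assembly are Salsa20 quarter-rounds. As a reading aid only (this sketch is not part of the patch; the ROTL32 helper and the function name are illustrative), the "a = x12 + x0; a <<<= 7; x4 ^= a; ..." pattern in the register comments corresponds to:

	#include <stdint.h>

	#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

	/* One Salsa20 quarter-round, shown as it acts on the state
	 * words (x0, x4, x8, x12) in the first column of a round. */
	static void quarterround(uint32_t *y0, uint32_t *y1,
				 uint32_t *y2, uint32_t *y3)
	{
		*y1 ^= ROTL32(*y0 + *y3, 7);
		*y2 ^= ROTL32(*y1 + *y0, 9);
		*y3 ^= ROTL32(*y2 + *y1, 13);
		*y0 ^= ROTL32(*y3 + *y2, 18);
	}

Each pass of ._mainloop performs a column round followed by a row round, twice over (each round being four such quarter-rounds); the counter starts at 20 and drops by 4 per pass, giving Salsa20's full 20 rounds before the saved j0..j15 words are added back in and the result is XORed with the message. The constants stored by ECRYPT_keysetup (1634760805, 857760878, 2036477234, 1797285236) are the little-endian words of the string "expand 32-byte k"; the 128-bit key path substitutes the words of "expand 16-byte k".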
Signed-off-by: Tan Swee Heng Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/salsa20-x86_64-asm_64.S | 920 ++++++++++++++++++++++++++++++++ arch/x86/crypto/salsa20_glue.c | 2 + 3 files changed, 924 insertions(+) create mode 100644 arch/x86/crypto/salsa20-x86_64-asm_64.S (limited to 'arch') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 25cc8441046..09200e12f14 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o +obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_32.o @@ -15,3 +16,4 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_64.o +salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S new file mode 100644 index 00000000000..6214a9b0970 --- /dev/null +++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S @@ -0,0 +1,920 @@ +# enter ECRYPT_encrypt_bytes +.text +.p2align 5 +.globl ECRYPT_encrypt_bytes +ECRYPT_encrypt_bytes: + mov %rsp,%r11 + and $31,%r11 + add $256,%r11 + sub %r11,%rsp + # x = arg1 + mov %rdi,%r8 + # m = arg2 + mov %rsi,%rsi + # out = arg3 + mov %rdx,%rdi + # bytes = arg4 + mov %rcx,%rdx + # unsigned>? bytes - 0 + cmp $0,%rdx + # comment:fp stack unchanged by jump + # goto done if !unsigned> + jbe ._done + # comment:fp stack unchanged by fallthrough +# start: +._start: + # r11_stack = r11 + movq %r11,0(%rsp) + # r12_stack = r12 + movq %r12,8(%rsp) + # r13_stack = r13 + movq %r13,16(%rsp) + # r14_stack = r14 + movq %r14,24(%rsp) + # r15_stack = r15 + movq %r15,32(%rsp) + # rbx_stack = rbx + movq %rbx,40(%rsp) + # rbp_stack = rbp + movq %rbp,48(%rsp) + # in0 = *(uint64 *) (x + 0) + movq 0(%r8),%rcx + # in2 = *(uint64 *) (x + 8) + movq 8(%r8),%r9 + # in4 = *(uint64 *) (x + 16) + movq 16(%r8),%rax + # in6 = *(uint64 *) (x + 24) + movq 24(%r8),%r10 + # in8 = *(uint64 *) (x + 32) + movq 32(%r8),%r11 + # in10 = *(uint64 *) (x + 40) + movq 40(%r8),%r12 + # in12 = *(uint64 *) (x + 48) + movq 48(%r8),%r13 + # in14 = *(uint64 *) (x + 56) + movq 56(%r8),%r14 + # j0 = in0 + movq %rcx,56(%rsp) + # j2 = in2 + movq %r9,64(%rsp) + # j4 = in4 + movq %rax,72(%rsp) + # j6 = in6 + movq %r10,80(%rsp) + # j8 = in8 + movq %r11,88(%rsp) + # j10 = in10 + movq %r12,96(%rsp) + # j12 = in12 + movq %r13,104(%rsp) + # j14 = in14 + movq %r14,112(%rsp) + # x_backup = x + movq %r8,120(%rsp) +# bytesatleast1: +._bytesatleast1: + # unsigned>= 32 + shr $32,%rdi + # x3 = j2 + movq 64(%rsp),%rsi + # x2 = x3 + mov %rsi,%rcx + # (uint64) x3 >>= 32 + shr $32,%rsi + # x5 = j4 + movq 72(%rsp),%r8 + # x4 = x5 + mov %r8,%r9 + # (uint64) x5 >>= 32 + shr $32,%r8 + # x5_stack = x5 + movq %r8,160(%rsp) + # x7 = j6 + movq 80(%rsp),%r8 + # x6 = x7 + mov %r8,%rax + # (uint64) x7 >>= 32 + shr $32,%r8 + # x9 = j8 + movq 88(%rsp),%r10 + # x8 = x9 + mov %r10,%r11 + # (uint64) x9 >>= 32 + shr $32,%r10 + # x11 = j10 + movq 96(%rsp),%r12 + # x10 = x11 + mov %r12,%r13 + # x10_stack = x10 + movq %r13,168(%rsp) + # (uint64) x11 >>= 32 + shr $32,%r12 + # x13 = j12 + movq 104(%rsp),%r13 + # x12 = x13 + mov %r13,%r14 + # (uint64) x13 >>= 32 + shr $32,%r13 + # x15 = j14 + movq 112(%rsp),%r15 + # x14 = x15 + mov 
%r15,%rbx + # (uint64) x15 >>= 32 + shr $32,%r15 + # x15_stack = x15 + movq %r15,176(%rsp) + # i = 20 + mov $20,%r15 +# mainloop: +._mainloop: + # i_backup = i + movq %r15,184(%rsp) + # x5 = x5_stack + movq 160(%rsp),%r15 + # a = x12 + x0 + lea (%r14,%rdx),%rbp + # (uint32) a <<<= 7 + rol $7,%ebp + # x4 ^= a + xor %rbp,%r9 + # b = x1 + x5 + lea (%rdi,%r15),%rbp + # (uint32) b <<<= 7 + rol $7,%ebp + # x9 ^= b + xor %rbp,%r10 + # a = x0 + x4 + lea (%rdx,%r9),%rbp + # (uint32) a <<<= 9 + rol $9,%ebp + # x8 ^= a + xor %rbp,%r11 + # b = x5 + x9 + lea (%r15,%r10),%rbp + # (uint32) b <<<= 9 + rol $9,%ebp + # x13 ^= b + xor %rbp,%r13 + # a = x4 + x8 + lea (%r9,%r11),%rbp + # (uint32) a <<<= 13 + rol $13,%ebp + # x12 ^= a + xor %rbp,%r14 + # b = x9 + x13 + lea (%r10,%r13),%rbp + # (uint32) b <<<= 13 + rol $13,%ebp + # x1 ^= b + xor %rbp,%rdi + # a = x8 + x12 + lea (%r11,%r14),%rbp + # (uint32) a <<<= 18 + rol $18,%ebp + # x0 ^= a + xor %rbp,%rdx + # b = x13 + x1 + lea (%r13,%rdi),%rbp + # (uint32) b <<<= 18 + rol $18,%ebp + # x5 ^= b + xor %rbp,%r15 + # x10 = x10_stack + movq 168(%rsp),%rbp + # x5_stack = x5 + movq %r15,160(%rsp) + # c = x6 + x10 + lea (%rax,%rbp),%r15 + # (uint32) c <<<= 7 + rol $7,%r15d + # x14 ^= c + xor %r15,%rbx + # c = x10 + x14 + lea (%rbp,%rbx),%r15 + # (uint32) c <<<= 9 + rol $9,%r15d + # x2 ^= c + xor %r15,%rcx + # c = x14 + x2 + lea (%rbx,%rcx),%r15 + # (uint32) c <<<= 13 + rol $13,%r15d + # x6 ^= c + xor %r15,%rax + # c = x2 + x6 + lea (%rcx,%rax),%r15 + # (uint32) c <<<= 18 + rol $18,%r15d + # x10 ^= c + xor %r15,%rbp + # x15 = x15_stack + movq 176(%rsp),%r15 + # x10_stack = x10 + movq %rbp,168(%rsp) + # d = x11 + x15 + lea (%r12,%r15),%rbp + # (uint32) d <<<= 7 + rol $7,%ebp + # x3 ^= d + xor %rbp,%rsi + # d = x15 + x3 + lea (%r15,%rsi),%rbp + # (uint32) d <<<= 9 + rol $9,%ebp + # x7 ^= d + xor %rbp,%r8 + # d = x3 + x7 + lea (%rsi,%r8),%rbp + # (uint32) d <<<= 13 + rol $13,%ebp + # x11 ^= d + xor %rbp,%r12 + # d = x7 + x11 + lea (%r8,%r12),%rbp + # (uint32) d <<<= 18 + rol $18,%ebp + # x15 ^= d + xor %rbp,%r15 + # x15_stack = x15 + movq %r15,176(%rsp) + # x5 = x5_stack + movq 160(%rsp),%r15 + # a = x3 + x0 + lea (%rsi,%rdx),%rbp + # (uint32) a <<<= 7 + rol $7,%ebp + # x1 ^= a + xor %rbp,%rdi + # b = x4 + x5 + lea (%r9,%r15),%rbp + # (uint32) b <<<= 7 + rol $7,%ebp + # x6 ^= b + xor %rbp,%rax + # a = x0 + x1 + lea (%rdx,%rdi),%rbp + # (uint32) a <<<= 9 + rol $9,%ebp + # x2 ^= a + xor %rbp,%rcx + # b = x5 + x6 + lea (%r15,%rax),%rbp + # (uint32) b <<<= 9 + rol $9,%ebp + # x7 ^= b + xor %rbp,%r8 + # a = x1 + x2 + lea (%rdi,%rcx),%rbp + # (uint32) a <<<= 13 + rol $13,%ebp + # x3 ^= a + xor %rbp,%rsi + # b = x6 + x7 + lea (%rax,%r8),%rbp + # (uint32) b <<<= 13 + rol $13,%ebp + # x4 ^= b + xor %rbp,%r9 + # a = x2 + x3 + lea (%rcx,%rsi),%rbp + # (uint32) a <<<= 18 + rol $18,%ebp + # x0 ^= a + xor %rbp,%rdx + # b = x7 + x4 + lea (%r8,%r9),%rbp + # (uint32) b <<<= 18 + rol $18,%ebp + # x5 ^= b + xor %rbp,%r15 + # x10 = x10_stack + movq 168(%rsp),%rbp + # x5_stack = x5 + movq %r15,160(%rsp) + # c = x9 + x10 + lea (%r10,%rbp),%r15 + # (uint32) c <<<= 7 + rol $7,%r15d + # x11 ^= c + xor %r15,%r12 + # c = x10 + x11 + lea (%rbp,%r12),%r15 + # (uint32) c <<<= 9 + rol $9,%r15d + # x8 ^= c + xor %r15,%r11 + # c = x11 + x8 + lea (%r12,%r11),%r15 + # (uint32) c <<<= 13 + rol $13,%r15d + # x9 ^= c + xor %r15,%r10 + # c = x8 + x9 + lea (%r11,%r10),%r15 + # (uint32) c <<<= 18 + rol $18,%r15d + # x10 ^= c + xor %r15,%rbp + # x15 = x15_stack + movq 176(%rsp),%r15 + # x10_stack = x10 + movq 
%rbp,168(%rsp) + # d = x14 + x15 + lea (%rbx,%r15),%rbp + # (uint32) d <<<= 7 + rol $7,%ebp + # x12 ^= d + xor %rbp,%r14 + # d = x15 + x12 + lea (%r15,%r14),%rbp + # (uint32) d <<<= 9 + rol $9,%ebp + # x13 ^= d + xor %rbp,%r13 + # d = x12 + x13 + lea (%r14,%r13),%rbp + # (uint32) d <<<= 13 + rol $13,%ebp + # x14 ^= d + xor %rbp,%rbx + # d = x13 + x14 + lea (%r13,%rbx),%rbp + # (uint32) d <<<= 18 + rol $18,%ebp + # x15 ^= d + xor %rbp,%r15 + # x15_stack = x15 + movq %r15,176(%rsp) + # x5 = x5_stack + movq 160(%rsp),%r15 + # a = x12 + x0 + lea (%r14,%rdx),%rbp + # (uint32) a <<<= 7 + rol $7,%ebp + # x4 ^= a + xor %rbp,%r9 + # b = x1 + x5 + lea (%rdi,%r15),%rbp + # (uint32) b <<<= 7 + rol $7,%ebp + # x9 ^= b + xor %rbp,%r10 + # a = x0 + x4 + lea (%rdx,%r9),%rbp + # (uint32) a <<<= 9 + rol $9,%ebp + # x8 ^= a + xor %rbp,%r11 + # b = x5 + x9 + lea (%r15,%r10),%rbp + # (uint32) b <<<= 9 + rol $9,%ebp + # x13 ^= b + xor %rbp,%r13 + # a = x4 + x8 + lea (%r9,%r11),%rbp + # (uint32) a <<<= 13 + rol $13,%ebp + # x12 ^= a + xor %rbp,%r14 + # b = x9 + x13 + lea (%r10,%r13),%rbp + # (uint32) b <<<= 13 + rol $13,%ebp + # x1 ^= b + xor %rbp,%rdi + # a = x8 + x12 + lea (%r11,%r14),%rbp + # (uint32) a <<<= 18 + rol $18,%ebp + # x0 ^= a + xor %rbp,%rdx + # b = x13 + x1 + lea (%r13,%rdi),%rbp + # (uint32) b <<<= 18 + rol $18,%ebp + # x5 ^= b + xor %rbp,%r15 + # x10 = x10_stack + movq 168(%rsp),%rbp + # x5_stack = x5 + movq %r15,160(%rsp) + # c = x6 + x10 + lea (%rax,%rbp),%r15 + # (uint32) c <<<= 7 + rol $7,%r15d + # x14 ^= c + xor %r15,%rbx + # c = x10 + x14 + lea (%rbp,%rbx),%r15 + # (uint32) c <<<= 9 + rol $9,%r15d + # x2 ^= c + xor %r15,%rcx + # c = x14 + x2 + lea (%rbx,%rcx),%r15 + # (uint32) c <<<= 13 + rol $13,%r15d + # x6 ^= c + xor %r15,%rax + # c = x2 + x6 + lea (%rcx,%rax),%r15 + # (uint32) c <<<= 18 + rol $18,%r15d + # x10 ^= c + xor %r15,%rbp + # x15 = x15_stack + movq 176(%rsp),%r15 + # x10_stack = x10 + movq %rbp,168(%rsp) + # d = x11 + x15 + lea (%r12,%r15),%rbp + # (uint32) d <<<= 7 + rol $7,%ebp + # x3 ^= d + xor %rbp,%rsi + # d = x15 + x3 + lea (%r15,%rsi),%rbp + # (uint32) d <<<= 9 + rol $9,%ebp + # x7 ^= d + xor %rbp,%r8 + # d = x3 + x7 + lea (%rsi,%r8),%rbp + # (uint32) d <<<= 13 + rol $13,%ebp + # x11 ^= d + xor %rbp,%r12 + # d = x7 + x11 + lea (%r8,%r12),%rbp + # (uint32) d <<<= 18 + rol $18,%ebp + # x15 ^= d + xor %rbp,%r15 + # x15_stack = x15 + movq %r15,176(%rsp) + # x5 = x5_stack + movq 160(%rsp),%r15 + # a = x3 + x0 + lea (%rsi,%rdx),%rbp + # (uint32) a <<<= 7 + rol $7,%ebp + # x1 ^= a + xor %rbp,%rdi + # b = x4 + x5 + lea (%r9,%r15),%rbp + # (uint32) b <<<= 7 + rol $7,%ebp + # x6 ^= b + xor %rbp,%rax + # a = x0 + x1 + lea (%rdx,%rdi),%rbp + # (uint32) a <<<= 9 + rol $9,%ebp + # x2 ^= a + xor %rbp,%rcx + # b = x5 + x6 + lea (%r15,%rax),%rbp + # (uint32) b <<<= 9 + rol $9,%ebp + # x7 ^= b + xor %rbp,%r8 + # a = x1 + x2 + lea (%rdi,%rcx),%rbp + # (uint32) a <<<= 13 + rol $13,%ebp + # x3 ^= a + xor %rbp,%rsi + # b = x6 + x7 + lea (%rax,%r8),%rbp + # (uint32) b <<<= 13 + rol $13,%ebp + # x4 ^= b + xor %rbp,%r9 + # a = x2 + x3 + lea (%rcx,%rsi),%rbp + # (uint32) a <<<= 18 + rol $18,%ebp + # x0 ^= a + xor %rbp,%rdx + # b = x7 + x4 + lea (%r8,%r9),%rbp + # (uint32) b <<<= 18 + rol $18,%ebp + # x5 ^= b + xor %rbp,%r15 + # x10 = x10_stack + movq 168(%rsp),%rbp + # x5_stack = x5 + movq %r15,160(%rsp) + # c = x9 + x10 + lea (%r10,%rbp),%r15 + # (uint32) c <<<= 7 + rol $7,%r15d + # x11 ^= c + xor %r15,%r12 + # c = x10 + x11 + lea (%rbp,%r12),%r15 + # (uint32) c <<<= 9 + rol $9,%r15d + # x8 ^= 
c + xor %r15,%r11 + # c = x11 + x8 + lea (%r12,%r11),%r15 + # (uint32) c <<<= 13 + rol $13,%r15d + # x9 ^= c + xor %r15,%r10 + # c = x8 + x9 + lea (%r11,%r10),%r15 + # (uint32) c <<<= 18 + rol $18,%r15d + # x10 ^= c + xor %r15,%rbp + # x15 = x15_stack + movq 176(%rsp),%r15 + # x10_stack = x10 + movq %rbp,168(%rsp) + # d = x14 + x15 + lea (%rbx,%r15),%rbp + # (uint32) d <<<= 7 + rol $7,%ebp + # x12 ^= d + xor %rbp,%r14 + # d = x15 + x12 + lea (%r15,%r14),%rbp + # (uint32) d <<<= 9 + rol $9,%ebp + # x13 ^= d + xor %rbp,%r13 + # d = x12 + x13 + lea (%r14,%r13),%rbp + # (uint32) d <<<= 13 + rol $13,%ebp + # x14 ^= d + xor %rbp,%rbx + # d = x13 + x14 + lea (%r13,%rbx),%rbp + # (uint32) d <<<= 18 + rol $18,%ebp + # x15 ^= d + xor %rbp,%r15 + # x15_stack = x15 + movq %r15,176(%rsp) + # i = i_backup + movq 184(%rsp),%r15 + # unsigned>? i -= 4 + sub $4,%r15 + # comment:fp stack unchanged by jump + # goto mainloop if unsigned> + ja ._mainloop + # (uint32) x2 += j2 + addl 64(%rsp),%ecx + # x3 <<= 32 + shl $32,%rsi + # x3 += j2 + addq 64(%rsp),%rsi + # (uint64) x3 >>= 32 + shr $32,%rsi + # x3 <<= 32 + shl $32,%rsi + # x2 += x3 + add %rsi,%rcx + # (uint32) x6 += j6 + addl 80(%rsp),%eax + # x7 <<= 32 + shl $32,%r8 + # x7 += j6 + addq 80(%rsp),%r8 + # (uint64) x7 >>= 32 + shr $32,%r8 + # x7 <<= 32 + shl $32,%r8 + # x6 += x7 + add %r8,%rax + # (uint32) x8 += j8 + addl 88(%rsp),%r11d + # x9 <<= 32 + shl $32,%r10 + # x9 += j8 + addq 88(%rsp),%r10 + # (uint64) x9 >>= 32 + shr $32,%r10 + # x9 <<= 32 + shl $32,%r10 + # x8 += x9 + add %r10,%r11 + # (uint32) x12 += j12 + addl 104(%rsp),%r14d + # x13 <<= 32 + shl $32,%r13 + # x13 += j12 + addq 104(%rsp),%r13 + # (uint64) x13 >>= 32 + shr $32,%r13 + # x13 <<= 32 + shl $32,%r13 + # x12 += x13 + add %r13,%r14 + # (uint32) x0 += j0 + addl 56(%rsp),%edx + # x1 <<= 32 + shl $32,%rdi + # x1 += j0 + addq 56(%rsp),%rdi + # (uint64) x1 >>= 32 + shr $32,%rdi + # x1 <<= 32 + shl $32,%rdi + # x0 += x1 + add %rdi,%rdx + # x5 = x5_stack + movq 160(%rsp),%rdi + # (uint32) x4 += j4 + addl 72(%rsp),%r9d + # x5 <<= 32 + shl $32,%rdi + # x5 += j4 + addq 72(%rsp),%rdi + # (uint64) x5 >>= 32 + shr $32,%rdi + # x5 <<= 32 + shl $32,%rdi + # x4 += x5 + add %rdi,%r9 + # x10 = x10_stack + movq 168(%rsp),%r8 + # (uint32) x10 += j10 + addl 96(%rsp),%r8d + # x11 <<= 32 + shl $32,%r12 + # x11 += j10 + addq 96(%rsp),%r12 + # (uint64) x11 >>= 32 + shr $32,%r12 + # x11 <<= 32 + shl $32,%r12 + # x10 += x11 + add %r12,%r8 + # x15 = x15_stack + movq 176(%rsp),%rdi + # (uint32) x14 += j14 + addl 112(%rsp),%ebx + # x15 <<= 32 + shl $32,%rdi + # x15 += j14 + addq 112(%rsp),%rdi + # (uint64) x15 >>= 32 + shr $32,%rdi + # x15 <<= 32 + shl $32,%rdi + # x14 += x15 + add %rdi,%rbx + # out = out_backup + movq 136(%rsp),%rdi + # m = m_backup + movq 144(%rsp),%rsi + # x0 ^= *(uint64 *) (m + 0) + xorq 0(%rsi),%rdx + # *(uint64 *) (out + 0) = x0 + movq %rdx,0(%rdi) + # x2 ^= *(uint64 *) (m + 8) + xorq 8(%rsi),%rcx + # *(uint64 *) (out + 8) = x2 + movq %rcx,8(%rdi) + # x4 ^= *(uint64 *) (m + 16) + xorq 16(%rsi),%r9 + # *(uint64 *) (out + 16) = x4 + movq %r9,16(%rdi) + # x6 ^= *(uint64 *) (m + 24) + xorq 24(%rsi),%rax + # *(uint64 *) (out + 24) = x6 + movq %rax,24(%rdi) + # x8 ^= *(uint64 *) (m + 32) + xorq 32(%rsi),%r11 + # *(uint64 *) (out + 32) = x8 + movq %r11,32(%rdi) + # x10 ^= *(uint64 *) (m + 40) + xorq 40(%rsi),%r8 + # *(uint64 *) (out + 40) = x10 + movq %r8,40(%rdi) + # x12 ^= *(uint64 *) (m + 48) + xorq 48(%rsi),%r14 + # *(uint64 *) (out + 48) = x12 + movq %r14,48(%rdi) + # x14 ^= *(uint64 *) (m + 56) + 
xorq 56(%rsi),%rbx + # *(uint64 *) (out + 56) = x14 + movq %rbx,56(%rdi) + # bytes = bytes_backup + movq 152(%rsp),%rdx + # in8 = j8 + movq 88(%rsp),%rcx + # in8 += 1 + add $1,%rcx + # j8 = in8 + movq %rcx,88(%rsp) + # unsigned>? unsigned + ja ._bytesatleast65 + # comment:fp stack unchanged by jump + # goto bytesatleast64 if !unsigned< + jae ._bytesatleast64 + # m = out + mov %rdi,%rsi + # out = ctarget + movq 128(%rsp),%rdi + # i = bytes + mov %rdx,%rcx + # while (i) { *out++ = *m++; --i } + rep movsb + # comment:fp stack unchanged by fallthrough +# bytesatleast64: +._bytesatleast64: + # x = x_backup + movq 120(%rsp),%rdi + # in8 = j8 + movq 88(%rsp),%rsi + # *(uint64 *) (x + 32) = in8 + movq %rsi,32(%rdi) + # r11 = r11_stack + movq 0(%rsp),%r11 + # r12 = r12_stack + movq 8(%rsp),%r12 + # r13 = r13_stack + movq 16(%rsp),%r13 + # r14 = r14_stack + movq 24(%rsp),%r14 + # r15 = r15_stack + movq 32(%rsp),%r15 + # rbx = rbx_stack + movq 40(%rsp),%rbx + # rbp = rbp_stack + movq 48(%rsp),%rbp + # comment:fp stack unchanged by fallthrough +# done: +._done: + # leave + add %r11,%rsp + mov %rdi,%rax + mov %rsi,%rdx + ret +# bytesatleast65: +._bytesatleast65: + # bytes -= 64 + sub $64,%rdx + # out += 64 + add $64,%rdi + # m += 64 + add $64,%rsi + # comment:fp stack unchanged by jump + # goto bytesatleast1 + jmp ._bytesatleast1 +# enter ECRYPT_keysetup +.text +.p2align 5 +.globl ECRYPT_keysetup +ECRYPT_keysetup: + mov %rsp,%r11 + and $31,%r11 + add $256,%r11 + sub %r11,%rsp + # k = arg2 + mov %rsi,%rsi + # kbits = arg3 + mov %rdx,%rdx + # x = arg1 + mov %rdi,%rdi + # in0 = *(uint64 *) (k + 0) + movq 0(%rsi),%r8 + # in2 = *(uint64 *) (k + 8) + movq 8(%rsi),%r9 + # *(uint64 *) (x + 4) = in0 + movq %r8,4(%rdi) + # *(uint64 *) (x + 12) = in2 + movq %r9,12(%rdi) + # unsigned + * - x86-64 version, renamed as salsa20-x86_64-asm_64.S + * available from * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free -- cgit v1.2.3 From 15e7b4452b72ae890f2fcb027b4c4fa63a1c9a7a Mon Sep 17 00:00:00 2001 From: Sebastian Siewior Date: Mon, 14 Jan 2008 17:07:57 +1100 Subject: [CRYPTO] twofish: Merge common glue code There is almost no difference between 32 & 64 bit glue code. 
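
As the diff below shows, the two deleted files were identical apart from their header comments, one extra #include in the 64-bit version, and the strings identifying each driver. Schematically (an illustration of the unification, not a hunk from this patch):

	-	.cra_driver_name  =  "twofish-i586",	/* twofish_32.c   */
	-	.cra_driver_name  =  "twofish-x86_64",	/* twofish_64.c   */
	+	.cra_driver_name  =  "twofish-asm",	/* twofish_glue.c */

The merged file also adds MODULE_ALIAS("twofish-asm") to match the new driver name, while the "twofish" cra_name and module alias are unchanged, so existing users are unaffected.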
Signed-off-by: Sebastian Siewior Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 4 +- arch/x86/crypto/twofish_32.c | 97 ------------------------------------------ arch/x86/crypto/twofish_64.c | 97 ------------------------------------------ arch/x86/crypto/twofish_glue.c | 97 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 99 insertions(+), 196 deletions(-) delete mode 100644 arch/x86/crypto/twofish_32.c delete mode 100644 arch/x86/crypto/twofish_64.c create mode 100644 arch/x86/crypto/twofish_glue.c (limited to 'arch') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 09200e12f14..3874c2de540 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -11,9 +11,9 @@ obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o aes-i586-y := aes-i586-asm_32.o aes_glue.o -twofish-i586-y := twofish-i586-asm_32.o twofish_32.o +twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o -twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_64.o +twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o diff --git a/arch/x86/crypto/twofish_32.c b/arch/x86/crypto/twofish_32.c deleted file mode 100644 index e3004dfe9c7..00000000000 --- a/arch/x86/crypto/twofish_32.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Glue Code for optimized 586 assembler version of TWOFISH - * - * Originally Twofish for GPG - * By Matthew Skala , July 26, 1998 - * 256-bit key length added March 20, 1999 - * Some modifications to reduce the text size by Werner Koch, April, 1998 - * Ported to the kerneli patch by Marc Mutz - * Ported to CryptoAPI by Colin Slater - * - * The original author has disclaimed all copyright interest in this - * code and thus put it in the public domain. The subsequent authors - * have put this under the GNU General Public License. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 - * USA - * - * This code is a "clean room" implementation, written from the paper - * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey, - * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available - * through http://www.counterpane.com/twofish.html - * - * For background information on multiplication in finite fields, used for - * the matrix operations in the key schedule, see the book _Contemporary - * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the - * Third Edition. 
- */ - -#include -#include -#include -#include -#include - - -asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); -asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); - -static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - twofish_enc_blk(tfm, dst, src); -} - -static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - twofish_dec_blk(tfm, dst, src); -} - -static struct crypto_alg alg = { - .cra_name = "twofish", - .cra_driver_name = "twofish-i586", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 3, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(alg.cra_list), - .cra_u = { - .cipher = { - .cia_min_keysize = TF_MIN_KEY_SIZE, - .cia_max_keysize = TF_MAX_KEY_SIZE, - .cia_setkey = twofish_setkey, - .cia_encrypt = twofish_encrypt, - .cia_decrypt = twofish_decrypt - } - } -}; - -static int __init init(void) -{ - return crypto_register_alg(&alg); -} - -static void __exit fini(void) -{ - crypto_unregister_alg(&alg); -} - -module_init(init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION ("Twofish Cipher Algorithm, i586 asm optimized"); -MODULE_ALIAS("twofish"); diff --git a/arch/x86/crypto/twofish_64.c b/arch/x86/crypto/twofish_64.c deleted file mode 100644 index 182d91d5cfb..00000000000 --- a/arch/x86/crypto/twofish_64.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Glue Code for optimized x86_64 assembler version of TWOFISH - * - * Originally Twofish for GPG - * By Matthew Skala , July 26, 1998 - * 256-bit key length added March 20, 1999 - * Some modifications to reduce the text size by Werner Koch, April, 1998 - * Ported to the kerneli patch by Marc Mutz - * Ported to CryptoAPI by Colin Slater - * - * The original author has disclaimed all copyright interest in this - * code and thus put it in the public domain. The subsequent authors - * have put this under the GNU General Public License. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 - * USA - * - * This code is a "clean room" implementation, written from the paper - * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey, - * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available - * through http://www.counterpane.com/twofish.html - * - * For background information on multiplication in finite fields, used for - * the matrix operations in the key schedule, see the book _Contemporary - * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the - * Third Edition. 
- */ - -#include -#include -#include -#include -#include -#include - -asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); -asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); - -static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - twofish_enc_blk(tfm, dst, src); -} - -static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - twofish_dec_blk(tfm, dst, src); -} - -static struct crypto_alg alg = { - .cra_name = "twofish", - .cra_driver_name = "twofish-x86_64", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = TF_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct twofish_ctx), - .cra_alignmask = 3, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(alg.cra_list), - .cra_u = { - .cipher = { - .cia_min_keysize = TF_MIN_KEY_SIZE, - .cia_max_keysize = TF_MAX_KEY_SIZE, - .cia_setkey = twofish_setkey, - .cia_encrypt = twofish_encrypt, - .cia_decrypt = twofish_decrypt - } - } -}; - -static int __init init(void) -{ - return crypto_register_alg(&alg); -} - -static void __exit fini(void) -{ - crypto_unregister_alg(&alg); -} - -module_init(init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized"); -MODULE_ALIAS("twofish"); diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c new file mode 100644 index 00000000000..cefaf8b9aa1 --- /dev/null +++ b/arch/x86/crypto/twofish_glue.c @@ -0,0 +1,97 @@ +/* + * Glue Code for assembler optimized version of TWOFISH + * + * Originally Twofish for GPG + * By Matthew Skala , July 26, 1998 + * 256-bit key length added March 20, 1999 + * Some modifications to reduce the text size by Werner Koch, April, 1998 + * Ported to the kerneli patch by Marc Mutz + * Ported to CryptoAPI by Colin Slater + * + * The original author has disclaimed all copyright interest in this + * code and thus put it in the public domain. The subsequent authors + * have put this under the GNU General Public License. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + * This code is a "clean room" implementation, written from the paper + * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey, + * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available + * through http://www.counterpane.com/twofish.html + * + * For background information on multiplication in finite fields, used for + * the matrix operations in the key schedule, see the book _Contemporary + * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the + * Third Edition. 
+ */ + +#include +#include +#include +#include +#include + +asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); +asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); + +static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + twofish_enc_blk(tfm, dst, src); +} + +static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + twofish_dec_blk(tfm, dst, src); +} + +static struct crypto_alg alg = { + .cra_name = "twofish", + .cra_driver_name = "twofish-asm", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 3, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg.cra_list), + .cra_u = { + .cipher = { + .cia_min_keysize = TF_MIN_KEY_SIZE, + .cia_max_keysize = TF_MAX_KEY_SIZE, + .cia_setkey = twofish_setkey, + .cia_encrypt = twofish_encrypt, + .cia_decrypt = twofish_decrypt + } + } +}; + +static int __init init(void) +{ + return crypto_register_alg(&alg); +} + +static void __exit fini(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized"); +MODULE_ALIAS("twofish"); +MODULE_ALIAS("twofish-asm"); -- cgit v1.2.3