1 files changed, 520 insertions, 0 deletions
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
new file mode 100644
index 0000000..0b33743
--- /dev/null
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -0,0 +1,520 @@
+/*
+ * Camellia Cipher Algorithm (x86_64)
+ *
+ * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ */
+
+.file "camellia-x86_64-asm_64.S"
+.text
+
+.extern camellia_sp10011110; /* eight lookup tables defined outside this file */
+.extern camellia_sp22000222; /* each is indexed by one byte and read as 8-byte entries (see xor2ror16) */
+.extern camellia_sp03303033;
+.extern camellia_sp00444404;
+.extern camellia_sp02220222;
+.extern camellia_sp30333033;
+.extern camellia_sp44044404;
+.extern camellia_sp11101110;
+
+#define sp10011110 camellia_sp10011110 /* short aliases used by the round macros */
+#define sp22000222 camellia_sp22000222
+#define sp03303033 camellia_sp03303033
+#define sp00444404 camellia_sp00444404
+#define sp02220222 camellia_sp02220222
+#define sp30333033 camellia_sp30333033
+#define sp44044404 camellia_sp44044404
+#define sp11101110 camellia_sp11101110
+
+#define CAMELLIA_TABLE_BYTE_LEN 272 /* byte offset of key_length; presumably sizeof(key_table) in the C struct - TODO confirm */
+
+/* struct camellia_ctx: byte offsets of the fields, relative to CTX */
+#define key_table 0 /* subkeys, addressed in 8-byte units */
+#define key_length CAMELLIA_TABLE_BYTE_LEN /* compared with 16 to choose 24 or 32 as the final subkey index */
+
+/* register macros (RIO/RT0 share %rsi; RT1 is the callee-saved %rbp) */
+#define CTX %rdi /* context pointer; never written in this file */
+#define RIO %rsi /* I/O pointer: src while loading, dst while storing */
+#define RIOd %esi /* not referenced by the code in this file */
+
+#define RAB0 %rax /* block 0: quadword loaded from offset 0 */
+#define RCD0 %rcx /* block 0: quadword loaded from offset 8 */
+#define RAB1 %rbx /* block 1 (2-way only): offset 16; callee-saved */
+#define RCD1 %rdx /* block 1 (2-way only): offset 24 */
+
+#define RAB0d %eax /* 32-bit views, used by fls/fls2 */
+#define RCD0d %ecx
+#define RAB1d %ebx
+#define RCD1d %edx
+
+#define RAB0bl %al /* lowest bytes, used as table indices */
+#define RCD0bl %cl
+#define RAB1bl %bl
+#define RCD1bl %dl
+
+#define RAB0bh %ah /* second-lowest bytes; only rax/rbx/rcx/rdx have these */
+#define RCD0bh %ch
+#define RAB1bh %bh
+#define RCD1bh %dh
+
+#define RT0 %rsi /* scratch; overwrites RIO, which is why dst is kept in RDST */
+#define RT1 %rbp /* scratch; the caller value is kept in RRBP meanwhile */
+#define RT2 %r8 /* scratch / subkey accumulator */
+
+#define RT0d %esi
+#define RT1d %ebp
+#define RT2d %r8d
+
+#define RT2bl %r8b
+
+#define RXOR %r9 /* xor flag when encrypting; spare register when decrypting */
+#define RRBP %r10 /* holds the caller's %rbp */
+#define RDST %r11 /* holds dst until the output stage */
+
+#define RXORd %r9d
+#define RXORbl %r9b
+
+#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) /* dst ^= T0[byte 0 of ab] ^ T1[byte 1 of ab]; then ab is rotated right by 16 */ \
+	movzbl ab ## bl,		tmp2 ## d; /* tmp2 = lowest byte of ab, zero-extended */ \
+	movzbl ab ## bh,		tmp1 ## d; /* tmp1 = second-lowest byte; ab must be one of rax/rbx/rcx/rdx */ \
+	rorq $16,			ab; /* expose the next two bytes for the following invocation */ \
+	xorq T0(, tmp2, 8),		dst; /* tables hold 8-byte entries; addressed absolutely, not %rip-relative */ \
+	xorq T1(, tmp1, 8),		dst;
+
+/**********************************************************************
+  1-way camellia
+ **********************************************************************/
+#define roundsm(ab, subkey, cd) /* one round: cd0 ^= subkey ^ (eight table lookups keyed by the bytes of ab0) */ \
+	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; /* RT2 = 8-byte subkey at byte offset subkey * 8 */ \
+	\
+	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); /* bytes 0-1 -> cd0 */ \
+	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); /* bytes 2-3 -> RT2 */ \
+	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); /* bytes 4-5 -> cd0 */ \
+	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); /* bytes 6-7 -> RT2; ab0 has now rotated by 64 bits, so it is unchanged */ \
+	\
+	xorq RT2,					cd ## 0; /* fold the second accumulator (subkey included) into cd0 */
+
+#define fls(l, r, kl, kr) /* key-dependent mixing of the 32-bit halves of l0 (subkey kl) and r0 (subkey kr); clobbers RT0-RT2 */ \
+	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; /* low 32 bits of subkey kl; upper half of RT0 is zeroed */ \
+	andl l ## 0d,					RT0d; \
+	roll $1,					RT0d; \
+	shlq $32,					RT0; \
+	xorq RT0,					l ## 0; /* l.hi ^= rol32(l.lo & kl.lo, 1) */ \
+	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
+	orq r ## 0,					RT1; \
+	shrq $32,					RT1; \
+	xorq RT1,					r ## 0; /* r.lo ^= (r.hi | kr.hi) */ \
+	\
+	movq (key_table + ((kl) * 2) * 4)(CTX),		RT2; \
+	orq l ## 0,					RT2; \
+	shrq $32,					RT2; \
+	xorq RT2,					l ## 0; /* l.lo ^= (l.hi | kl.hi), using the l.hi updated above */ \
+	movl (key_table + ((kr) * 2) * 4)(CTX),		RT0d; \
+	andl r ## 0d,					RT0d; \
+	roll $1,					RT0d; \
+	shlq $32,					RT0; \
+	xorq RT0,					r ## 0; /* r.hi ^= rol32(r.lo & kr.lo, 1), using the r.lo updated above */
+
+#define enc_rounds(i) /* six rounds with subkeys i+2 .. i+7 in ascending order, alternating which half is updated */ \
+	roundsm(RAB, i + 2, RCD); \
+	roundsm(RCD, i + 3, RAB); \
+	roundsm(RAB, i + 4, RCD); \
+	roundsm(RCD, i + 5, RAB); \
+	roundsm(RAB, i + 6, RCD); \
+	roundsm(RCD, i + 7, RAB);
+
+#define enc_fls(i) /* fls layer between round groups: subkey i+0 is applied to RAB0, i+1 to RCD0 */ \
+	fls(RAB, RCD, i + 0, i + 1);
+
+#define enc_inpack() /* load 16 bytes from RIO into RAB0/RCD0 and apply subkey 0 to RAB0 */ \
+	movq (RIO),			RAB0; \
+	bswapq				RAB0; /* reverse the byte order of the quadword */ \
+	rolq $32,			RAB0; /* swap the two 32-bit halves */ \
+	movq 4*2(RIO),			RCD0; /* second quadword, byte offset 8 */ \
+	bswapq				RCD0; \
+	rorq $32,			RCD0; /* rotating by 32 is the same in either direction */ \
+	xorq key_table(CTX),		RAB0; /* only RAB0 is combined with subkey 0 here */
+
+#define enc_outunpack(op, max) /* apply subkey number max to RCD0, then write RCD0 to offset 0 and RAB0 to offset 8 using op (mov or xor) */ \
+	xorq key_table(CTX, max, 8),	RCD0; /* max is a register holding the subkey index (24 or 32 in the callers) */ \
+	rorq $32,			RCD0; \
+	bswapq				RCD0; \
+	op ## q RCD0,			(RIO); /* note the halves are stored in swapped order */ \
+	rolq $32,			RAB0; \
+	bswapq				RAB0; \
+	op ## q RAB0,			4*2(RIO);
+
+#define dec_rounds(i) /* six rounds with subkeys i+7 .. i+2, i.e. the enc_rounds subkeys in reverse order */ \
+	roundsm(RAB, i + 7, RCD); \
+	roundsm(RCD, i + 6, RAB); \
+	roundsm(RAB, i + 5, RCD); \
+	roundsm(RCD, i + 4, RAB); \
+	roundsm(RAB, i + 3, RCD); \
+	roundsm(RCD, i + 2, RAB);
+
+#define dec_fls(i) /* fls layer with the two subkeys exchanged relative to enc_fls: i+1 for RAB0, i+0 for RCD0 */ \
+	fls(RAB, RCD, i + 1, i + 0);
+
+#define dec_inpack(max) /* load 16 bytes from RIO into RAB0/RCD0 and apply subkey number max to RAB0 */ \
+	movq (RIO),			RAB0; \
+	bswapq				RAB0; /* reverse the byte order of the quadword */ \
+	rolq $32,			RAB0; /* swap the two 32-bit halves */ \
+	movq 4*2(RIO),			RCD0; /* second quadword, byte offset 8 */ \
+	bswapq				RCD0; \
+	rorq $32,			RCD0; \
+	xorq key_table(CTX, max, 8),	RAB0; /* max is a register holding the subkey index */
+
+#define dec_outunpack() /* apply subkey 0 to RCD0, then store RCD0 at offset 0 and RAB0 at offset 8 (always a plain store) */ \
+	xorq key_table(CTX),		RCD0; \
+	rorq $32,			RCD0; \
+	bswapq				RCD0; \
+	movq RCD0,			(RIO); \
+	rolq $32,			RAB0; \
+	bswapq				RAB0; \
+	movq RAB0,			4*2(RIO);
+
+.global __camellia_enc_blk;
+.type   __camellia_enc_blk,@function;
+
+__camellia_enc_blk:
+	/* Encrypt one 16-byte block; only caller-saved regs and flags are left changed.
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: bool xor (non-zero low byte: XOR the result into dst)
+	 */
+	movq %rbp, RRBP; /* %rbp doubles as scratch RT1 below */
+	movq %rcx, RXOR; /* %rcx is about to become RCD0 */
+	movq %rsi, RDST; /* %rsi is about to become RIO/RT0 */
+	movq %rdx, RIO;
+
+	enc_inpack();
+
+	enc_rounds(0);
+	enc_fls(8);
+	enc_rounds(8);
+	enc_fls(16);
+	enc_rounds(16);
+	movl $24, RT1d; /* max */
+
+	cmpb $16, key_length(CTX); /* only the low byte of key_length is compared */
+	je .L__enc_done;
+
+	enc_fls(24);
+	enc_rounds(24);
+	movl $32, RT1d; /* max */
+
+.L__enc_done:
+	testb RXORbl, RXORbl;
+	movq RDST, RIO; /* mov leaves the flags from testb intact */
+	jnz .L__enc_xor;
+
+	enc_outunpack(mov, RT1);
+
+	movq RRBP, %rbp;
+	ret;
+
+.L__enc_xor:
+	enc_outunpack(xor, RT1);
+
+	movq RRBP, %rbp;
+	ret;
+
+.size __camellia_enc_blk, .-__camellia_enc_blk;
+
+.global camellia_dec_blk;
+.type   camellia_dec_blk,@function;
+
+camellia_dec_blk:
+	/* Decrypt one 16-byte block; only caller-saved regs and flags are left changed.
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+	cmpl $16, key_length(CTX);
+	movl $32, RT2d; /* mov does not disturb the flags set by cmpl */
+	movl $24, RXORd; /* RXOR is just a temporary here */
+	cmovel RXORd, RT2d; /* max */
+
+	movq %rbp, RRBP; /* %rbp doubles as scratch RT1 below */
+	movq %rsi, RDST; /* %rsi is about to become RIO/RT0 */
+	movq %rdx, RIO;
+
+	dec_inpack(RT2);
+
+	cmpb $24, RT2bl; /* must be tested now: the round macros overwrite RT2 */
+	je .L__dec_rounds16;
+
+	dec_rounds(24);
+	dec_fls(24);
+
+.L__dec_rounds16:
+	dec_rounds(16);
+	dec_fls(16);
+	dec_rounds(8);
+	dec_fls(8);
+	dec_rounds(0);
+
+	movq RDST, RIO;
+	dec_outunpack();
+	movq RRBP, %rbp;
+	ret;
+
+.size camellia_dec_blk, .-camellia_dec_blk;
+
+/**********************************************************************
+  2-way camellia
+ **********************************************************************/
+#define roundsm2(ab, subkey, cd) /* one round on two blocks: cd0 and cd1 each get subkey ^ (eight lookups on ab0 resp. ab1) */ \
+	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; /* RT2 = 8-byte subkey at byte offset subkey * 8 */ \
+	xorq RT2,					cd ## 1; /* block 1 takes the subkey directly; RT2 then accumulates for block 0 only */ \
+	\
+	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); /* block 0: same lookup sequence as roundsm */ \
+	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
+	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
+	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
+	\
+		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); /* block 1: every lookup goes straight into cd1 */ \
+		xorq RT2,					cd ## 0; /* finish block 0, placed between block 1 lookups */ \
+		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
+		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
+		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1); /* ab1 has now rotated by 64 bits, so it is unchanged */
+
+#define fls2(l, r, kl, kr) /* the fls computation applied to block 0 (l0, r0) and block 1 (l1, r1), interleaved; clobbers RT0-RT2 */ \
+	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; /* low 32 bits of subkey kl */ \
+	andl l ## 0d,					RT0d; \
+	roll $1,					RT0d; \
+	shlq $32,					RT0; \
+	xorq RT0,					l ## 0; /* l0.hi ^= rol32(l0.lo & kl.lo, 1) */ \
+	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
+	orq r ## 0,					RT1; \
+	shrq $32,					RT1; \
+	xorq RT1,					r ## 0; /* r0.lo ^= (r0.hi | kr.hi) */ \
+	\
+		movl (key_table + ((kl) * 2) * 4)(CTX),		RT2d; /* same two steps for block 1, with a different scratch register */ \
+		andl l ## 1d,					RT2d; \
+		roll $1,					RT2d; \
+		shlq $32,					RT2; \
+		xorq RT2,					l ## 1; /* l1.hi ^= rol32(l1.lo & kl.lo, 1) */ \
+		movq (key_table + ((kr) * 2) * 4)(CTX),		RT0; \
+		orq r ## 1,					RT0; \
+		shrq $32,					RT0; \
+		xorq RT0,					r ## 1; /* r1.lo ^= (r1.hi | kr.hi) */ \
+	\
+	movq (key_table + ((kl) * 2) * 4)(CTX),		RT1; \
+	orq l ## 0,					RT1; \
+	shrq $32,					RT1; \
+	xorq RT1,					l ## 0; /* l0.lo ^= (l0.hi | kl.hi), using the l0.hi updated above */ \
+	movl (key_table + ((kr) * 2) * 4)(CTX),		RT2d; \
+	andl r ## 0d,					RT2d; \
+	roll $1,					RT2d; \
+	shlq $32,					RT2; \
+	xorq RT2,					r ## 0; /* r0.hi ^= rol32(r0.lo & kr.lo, 1), using the r0.lo updated above */ \
+	\
+		movq (key_table + ((kl) * 2) * 4)(CTX),		RT0; \
+		orq l ## 1,					RT0; \
+		shrq $32,					RT0; \
+		xorq RT0,					l ## 1; /* l1.lo ^= (l1.hi | kl.hi) */ \
+		movl (key_table + ((kr) * 2) * 4)(CTX),		RT1d; \
+		andl r ## 1d,					RT1d; \
+		roll $1,					RT1d; \
+		shlq $32,					RT1; \
+		xorq RT1,					r ## 1; /* r1.hi ^= rol32(r1.lo & kr.lo, 1) */
+
+#define enc_rounds2(i) /* six two-block rounds with subkeys i+2 .. i+7 in ascending order */ \
+	roundsm2(RAB, i + 2, RCD); \
+	roundsm2(RCD, i + 3, RAB); \
+	roundsm2(RAB, i + 4, RCD); \
+	roundsm2(RCD, i + 5, RAB); \
+	roundsm2(RAB, i + 6, RCD); \
+	roundsm2(RCD, i + 7, RAB);
+
+#define enc_fls2(i) /* two-block fls layer: subkey i+0 for the RAB registers, i+1 for the RCD registers */ \
+	fls2(RAB, RCD, i + 0, i + 1);
+
+#define enc_inpack2() /* load 32 bytes from RIO into RAB0/RCD0/RAB1/RCD1 and apply subkey 0 to RAB0 and RAB1 */ \
+	movq (RIO),			RAB0; \
+	bswapq				RAB0; /* reverse the byte order of the quadword */ \
+	rorq $32,			RAB0; /* swap the two 32-bit halves */ \
+	movq 4*2(RIO),			RCD0; /* byte offset 8 */ \
+	bswapq				RCD0; \
+	rolq $32,			RCD0; \
+	xorq key_table(CTX),		RAB0; \
+	\
+		movq 8*2(RIO),			RAB1; /* second block starts at byte offset 16 */ \
+		bswapq				RAB1; \
+		rorq $32,			RAB1; \
+		movq 12*2(RIO),			RCD1; /* byte offset 24 */ \
+		bswapq				RCD1; \
+		rolq $32,			RCD1; \
+		xorq key_table(CTX),		RAB1;
+
+#define enc_outunpack2(op, max) /* apply subkey number max to RCD0/RCD1, then write both blocks to RIO using op (mov or xor) */ \
+	xorq key_table(CTX, max, 8),	RCD0; /* max is a register holding the subkey index */ \
+	rolq $32,			RCD0; \
+	bswapq				RCD0; \
+	op ## q RCD0,			(RIO); /* within each block the RCD half is written first, the RAB half second */ \
+	rorq $32,			RAB0; \
+	bswapq				RAB0; \
+	op ## q RAB0,			4*2(RIO); \
+	\
+		xorq key_table(CTX, max, 8),	RCD1; \
+		rolq $32,			RCD1; \
+		bswapq				RCD1; \
+		op ## q RCD1,			8*2(RIO); \
+		rorq $32,			RAB1; \
+		bswapq				RAB1; \
+		op ## q RAB1,			12*2(RIO);
+
+#define dec_rounds2(i) /* six two-block rounds with subkeys i+7 .. i+2, the reverse of enc_rounds2 */ \
+	roundsm2(RAB, i + 7, RCD); \
+	roundsm2(RCD, i + 6, RAB); \
+	roundsm2(RAB, i + 5, RCD); \
+	roundsm2(RCD, i + 4, RAB); \
+	roundsm2(RAB, i + 3, RCD); \
+	roundsm2(RCD, i + 2, RAB);
+
+#define dec_fls2(i) /* two-block fls layer with the subkeys exchanged relative to enc_fls2 */ \
+	fls2(RAB, RCD, i + 1, i + 0);
+
+#define dec_inpack2(max) /* load 32 bytes from RIO into RAB0/RCD0/RAB1/RCD1 and apply subkey number max to RAB0 and RAB1 */ \
+	movq (RIO),			RAB0; \
+	bswapq				RAB0; /* reverse the byte order of the quadword */ \
+	rorq $32,			RAB0; /* swap the two 32-bit halves */ \
+	movq 4*2(RIO),			RCD0; /* byte offset 8 */ \
+	bswapq				RCD0; \
+	rolq $32,			RCD0; \
+	xorq key_table(CTX, max, 8),	RAB0; /* max is a register holding the subkey index */ \
+	\
+		movq 8*2(RIO),			RAB1; /* second block starts at byte offset 16 */ \
+		bswapq				RAB1; \
+		rorq $32,			RAB1; \
+		movq 12*2(RIO),			RCD1; /* byte offset 24 */ \
+		bswapq				RCD1; \
+		rolq $32,			RCD1; \
+		xorq key_table(CTX, max, 8),	RAB1;
+
+#define dec_outunpack2() /* apply subkey 0 to RCD0/RCD1, then store both blocks to RIO (always a plain store) */ \
+	xorq key_table(CTX),		RCD0; \
+	rolq $32,			RCD0; \
+	bswapq				RCD0; \
+	movq RCD0,			(RIO); /* within each block the RCD half is written first, the RAB half second */ \
+	rorq $32,			RAB0; \
+	bswapq				RAB0; \
+	movq RAB0,			4*2(RIO); \
+	\
+		xorq key_table(CTX),		RCD1; \
+		rolq $32,			RCD1; \
+		bswapq				RCD1; \
+		movq RCD1,			8*2(RIO); \
+		rorq $32,			RAB1; \
+		bswapq				RAB1; \
+		movq RAB1,			12*2(RIO);
+
+.global __camellia_enc_blk_2way;
+.type   __camellia_enc_blk_2way,@function;
+
+__camellia_enc_blk_2way:
+	/* Encrypt two consecutive 16-byte blocks (32 bytes read from src, 32 written to dst).
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: bool xor (non-zero low byte: XOR the result into dst)
+	 */
+	pushq %rbx; /* RAB1 lives in the callee-saved %rbx */
+
+	movq %rbp, RRBP; /* %rbp doubles as scratch RT1 below */
+	movq %rcx, RXOR; /* %rcx is about to become RCD0 */
+	movq %rsi, RDST; /* %rsi is about to become RIO/RT0 */
+	movq %rdx, RIO;
+
+	enc_inpack2();
+
+	enc_rounds2(0);
+	enc_fls2(8);
+	enc_rounds2(8);
+	enc_fls2(16);
+	enc_rounds2(16);
+	movl $24, RT2d; /* max */
+
+	cmpb $16, key_length(CTX); /* only the low byte of key_length is compared */
+	je .L__enc2_done;
+
+	enc_fls2(24);
+	enc_rounds2(24);
+	movl $32, RT2d; /* max */
+
+.L__enc2_done:
+	testb RXORbl, RXORbl;
+	movq RDST, RIO; /* mov leaves the flags from testb intact */
+	jnz .L__enc2_xor;
+
+	enc_outunpack2(mov, RT2);
+	movq RRBP, %rbp;
+	popq %rbx;
+	ret;
+
+.L__enc2_xor:
+	enc_outunpack2(xor, RT2);
+	movq RRBP, %rbp;
+	popq %rbx;
+	ret;
+
+.size __camellia_enc_blk_2way, .-__camellia_enc_blk_2way;
+
+.global camellia_dec_blk_2way;
+.type   camellia_dec_blk_2way,@function;
+
+camellia_dec_blk_2way:
+	/* Decrypt two consecutive 16-byte blocks (32 bytes read from src, 32 written to dst).
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+	cmpl $16, key_length(CTX);
+	movl $32, RT2d; /* mov does not disturb the flags set by cmpl */
+	movl $24, RXORd; /* RXOR is a temporary here ... */
+	cmovel RXORd, RT2d; /* max */
+
+	movq %rbx, RXOR; /* ... and then holds the caller's %rbx (RAB1) without touching the stack */
+	movq %rbp, RRBP; /* %rbp doubles as scratch RT1 below */
+	movq %rsi, RDST; /* %rsi is about to become RIO/RT0 */
+	movq %rdx, RIO;
+
+	dec_inpack2(RT2);
+
+	cmpb $24, RT2bl; /* must be tested now: the round macros overwrite RT2 */
+	je .L__dec2_rounds16;
+
+	dec_rounds2(24);
+	dec_fls2(24);
+
+.L__dec2_rounds16:
+	dec_rounds2(16);
+	dec_fls2(16);
+	dec_rounds2(8);
+	dec_fls2(8);
+	dec_rounds2(0);
+
+	movq RDST, RIO;
+	dec_outunpack2();
+	movq RRBP, %rbp;
+	movq RXOR, %rbx;
+	ret;
+
+.size camellia_dec_blk_2way, .-camellia_dec_blk_2way;