/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 */

.file	"blowfish-x86_64-asm.S"
.text

/*
 * Layout of the crypto context that CTX points at, as byte offsets.
 *
 * Every entry is a 32-bit word: the code in this file steps through p
 * in units of 4 bytes and indexes s0..s3 with a scale factor of 4.
 * The context starts with 16 + 2 round-key words, followed by four
 * lookup tables of 256 words each.
 */
#define p	0
#define s0	(4 * (16 + 2))
#define s1	(4 * (16 + 2 + (1 * 256)))
#define s2	(4 * (16 + 2 + (2 * 256)))
#define s3	(4 * (16 + 2 + (3 * 256)))

/*
 * Register assignments.
 *
 * CTX holds the context pointer and RIO the current input/output
 * pointer.  RT1 is the same register as RIO (%rsi), so RIO does not
 * survive an expansion of F().
 */
#define CTX	%rdi
#define RIO	%rsi

/* Cipher state: one 64-bit register per block. */
#define RX0	%rax
#define RX1	%rbx
#define RX2	%rcx
#define RX3	%rdx

/* Low 32 bits of the state registers. */
#define RX0d	%eax
#define RX1d	%ebx
#define RX2d	%ecx
#define RX3d	%edx

/* Bits 7..0 of the state registers. */
#define RX0bl	%al
#define RX1bl	%bl
#define RX2bl	%cl
#define RX3bl	%dl

/* Bits 15..8 of the state registers. */
#define RX0bh	%ah
#define RX1bh	%bh
#define RX2bh	%ch
#define RX3bh	%dh

/*
 * Scratch registers that F() fills with table indices.  RT0d is written
 * by movzbl from the bh registers above; an instruction that names one
 * of those cannot carry a REX prefix, so RT0 cannot be %r8..%r15.
 */
#define RT0	%rbp
#define RT1	%rsi

#define RT0d	%ebp
#define RT1d	%esi

/* Second argument of F(): one accumulator per block. */
#define RK0	%r8
#define RK1	%r9
#define RK2	%r10
#define RK3	%r11

/* 32-bit views of the accumulators. */
#define RK0d	%r8d
#define RK1d	%r9d
#define RK2d	%r10d
#define RK3d	%r11d

/* Round key preloaded by the 4-way code. */
#define RKEY	%r12

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(x, k): one Feistel step on the 64-bit state register x.
 *
 * The four bytes of the low half of x, most significant first, index
 * s0, s1, s2 and s3.  The lookups are combined in the 32-bit view of k
 * as ((s0 + s1) ^ s2) + s3, the two halves of x are swapped, and the
 * result is xor-ed into what is now the low half.
 *
 * x has to be one of RX0..RX3 and k one of RK0..RK3, because the byte
 * and 32-bit views are built by token pasting.
 * Clobbers RT0, RT1 (which is also RIO), k and the flags.
 */
#define F(x, k) \
	rorq	$16, x;			/* byte views now see bits 31..16 */ \
	movzbl	x ## bh, RT0d;		/* bits 31..24 */ \
	movzbl	x ## bl, RT1d;		/* bits 23..16 */ \
	rolq	$16, x;			/* undo the rotate */ \
	movl	s0(CTX,RT0,4), k ## d; \
	addl	s1(CTX,RT1,4), k ## d; \
	movzbl	x ## bh, RT0d;		/* bits 15..8 */ \
	movzbl	x ## bl, RT1d;		/* bits 7..0 */ \
	rolq	$32, x;			/* swap the 32-bit halves */ \
	xorl	s2(CTX,RT0,4), k ## d; \
	addl	s3(CTX,RT1,4), k ## d; \
	xorq	k, x;			/* upper half of k is zero */

/*
 * Xor round-key entries n and n+1, read as one 64-bit word, into RX0.
 */
#define add_roundkey_enc(n) \
	xorq	p+4*(n)(CTX), RX0;

/*
 * Two encryption rounds on RX0: add round-key entries n and n+1, then
 * apply F() twice.  Each F() swaps the halves of RX0, so after the pair
 * the halves are back in the order they started in.
 */
#define round_enc(n) \
	add_roundkey_enc(n); \
	F(RX0, RK0); \
	F(RX0, RK0);

/*
 * Final key addition for encryption; expands to the same single xorq as
 * add_roundkey_enc(n).  No expansion of this macro is visible in this
 * file.
 */
#define round_final_enc(n) \
	add_roundkey_enc(n)

/*
 * Decryption counterpart of add_roundkey_enc(): load round-key entries
 * n-1 and n as one 64-bit word, swap the two 32-bit entries so that
 * entry n ends up in the low half, and xor the result into RX0.
 * Clobbers RT0 and the flags.
 *
 * n is parenthesised so that an expression argument cannot regroup with
 * the "-1"; preload_roundkey_dec() spells its offset the same way.
 */
#define add_roundkey_dec(n) \
	movq	p+4*((n)-1)(CTX), RT0; \
	rorq	$32, RT0; \
	xorq	RT0, RX0;

/*
 * Two decryption rounds on RX0: add round-key entries n and n-1, then
 * apply F() twice.
 *
 * The last line carries no continuation backslash, so the definition
 * ends here regardless of what the following line contains.
 */
#define round_dec(n) \
	add_roundkey_dec(n); \
	F(RX0, RK0); \
	F(RX0, RK0);

/*
 * Load the 8 bytes at (RIO) into RX0, then swap the 32-bit halves and
 * reverse all eight bytes.  This leaves the first four input bytes as a
 * big-endian word in the low half of RX0 and the last four in the high
 * half.
 */
#define read_block() \
	movq	(RIO), RX0; \
	rorq	$32, RX0; \
	bswapq	RX0;

/*
 * Reverse the byte order of RX0 and store the 8 bytes to (RIO).
 */
#define write_block() \
	bswapq	RX0; \
	movq	RX0, (RIO);

/*
 * Reverse the byte order of RX0 and xor it into the 8 bytes already
 * stored at (RIO).
 */
#define xor_block() \
	bswapq	RX0; \
	xorq	RX0, (RIO);

.align 8
.global __blowfish_enc_blk
.type   __blowfish_enc_blk,@function;

__blowfish_enc_blk:
	/*
	 * Encrypt one 8-byte block.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor; when its low byte is non-zero the result
	 *	      is xor-ed into dst instead of being stored there
	 */
	pushq	%rbp;
	pushq	%rbx;

	/* F() overwrites %rsi (RT1): keep dst and the xor flag on the stack */
	pushq	%rsi;
	pushq	%rcx;
	movq	%rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	/* %rbp is no longer needed as RT0: reuse it for the xor flag */
	popq	%rbp;
	popq	RIO;

	test	%bpl, %bpl;
	jnz	.L__enc_xor;

	write_block();

.L__enc_ret:
	popq	%rbx;
	popq	%rbp;

	ret;

.L__enc_xor:
	xor_block();

	jmp	.L__enc_ret;

.size __blowfish_enc_blk, . - __blowfish_enc_blk

.align 8
.global blowfish_dec_blk
.type   blowfish_dec_blk,@function;

blowfish_dec_blk:
	/*
	 * Decrypt one 8-byte block.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	pushq	%rbp;
	pushq	%rbx;

	/* F() overwrites %rsi (RT1): keep dst on the stack meanwhile */
	pushq	%rsi;
	movq	%rdx, RIO;

	read_block();

	/* round keys are consumed from entry 17 down to entry 0 */
	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	popq	RIO;
	write_block();

	popq	%rbx;
	popq	%rbp;

	ret;

.size blowfish_dec_blk, . - blowfish_dec_blk

/**********************************************************************
 * 4-way blowfish, four blocks parallel
 **********************************************************************/

/*
 * Xor the round key currently held in RKEY into all four state
 * registers.
 */
#define add_preloaded_roundkey4() \
	xorq	RKEY, RX0; \
	xorq	RKEY, RX1; \
	xorq	RKEY, RX2; \
	xorq	RKEY, RX3;

/*
 * Load round-key entries n and n+1 into RKEY as one 64-bit word.
 */
#define preload_roundkey_enc(n) \
	movq	p+4*(n)(CTX), RKEY;

/*
 * Xor the preloaded round key into all four blocks, then preload
 * entries n+2 and n+3 for the next pair of rounds.
 *
 * n is parenthesised before the "+ 2" so that an expression argument
 * keeps its own grouping.
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);

/*
 * Two encryption rounds on four blocks: add the preloaded round key
 * (and preload the next one), then run F() over all four blocks twice.
 * Each block uses its own accumulator RK0..RK3.
 */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	/* first round */ \
	F(RX0, RK0); \
	F(RX1, RK1); \
	F(RX2, RK2); \
	F(RX3, RK3); \
	\
	/* second round */ \
	F(RX0, RK0); \
	F(RX1, RK1); \
	F(RX2, RK2); \
	F(RX3, RK3);

/*
 * Load round-key entries n-1 and n into RKEY as one 64-bit word and
 * swap the two 32-bit entries, so that entry n ends up in the low half.
 */
#define preload_roundkey_dec(n) \
	movq	p+4*((n)-1)(CTX), RKEY; \
	rorq	$32, RKEY;

/*
 * Xor the preloaded round key into all four blocks, then preload the
 * key for index n-2, which the next pair of decryption rounds uses.
 *
 * n is parenthesised before the "- 2" so that an expression argument
 * keeps its own grouping.
 */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);

/*
 * Two decryption rounds on four blocks: add the preloaded round key
 * (and preload the next one), then run F() over all four blocks twice.
 * Each block uses its own accumulator RK0..RK3.
 */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	/* first round */ \
	F(RX0, RK0); \
	F(RX1, RK1); \
	F(RX2, RK2); \
	F(RX3, RK3); \
	\
	/* second round */ \
	F(RX0, RK0); \
	F(RX1, RK1); \
	F(RX2, RK2); \
	F(RX3, RK3);

/*
 * Load four consecutive 8-byte blocks from (RIO) into RX0..RX3.  Each
 * one gets the same treatment as in read_block(): swap the 32-bit
 * halves, then reverse all eight bytes.
 */
#define read_block4() \
	/* block 0 */ \
	movq	(RIO), RX0; \
	rorq	$32, RX0; \
	bswapq	RX0; \
	\
	/* block 1 */ \
	movq	8(RIO), RX1; \
	rorq	$32, RX1; \
	bswapq	RX1; \
	\
	/* block 2 */ \
	movq	16(RIO), RX2; \
	rorq	$32, RX2; \
	bswapq	RX2; \
	\
	/* block 3 */ \
	movq	24(RIO), RX3; \
	rorq	$32, RX3; \
	bswapq	RX3;

/*
 * Reverse the byte order of RX0..RX3 and store them as four
 * consecutive 8-byte blocks starting at (RIO).
 */
#define write_block4() \
	/* block 0 */ \
	bswapq	RX0; \
	movq	RX0, (RIO); \
	\
	/* block 1 */ \
	bswapq	RX1; \
	movq	RX1, 8(RIO); \
	\
	/* block 2 */ \
	bswapq	RX2; \
	movq	RX2, 16(RIO); \
	\
	/* block 3 */ \
	bswapq	RX3; \
	movq	RX3, 24(RIO);

/*
 * Reverse the byte order of RX0..RX3 and xor them into the four
 * consecutive 8-byte blocks already stored at (RIO).
 */
#define xor_block4() \
	/* block 0 */ \
	bswapq	RX0; \
	xorq	RX0, (RIO); \
	\
	/* block 1 */ \
	bswapq	RX1; \
	xorq	RX1, 8(RIO); \
	\
	/* block 2 */ \
	bswapq	RX2; \
	xorq	RX2, 16(RIO); \
	\
	/* block 3 */ \
	bswapq	RX3; \
	xorq	RX3, 24(RIO);

.align 8
.global __blowfish_enc_blk_4way
.type   __blowfish_enc_blk_4way,@function;

__blowfish_enc_blk_4way:
	/*
	 * Encrypt four consecutive 8-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor; when its low byte is non-zero the results
	 *	      are xor-ed into dst instead of being stored there
	 */
	pushq	%rbp;
	pushq	%rbx;
	pushq	RKEY;
	preload_roundkey_enc(0);

	/*
	 * %rsi is RT1 and %rcx is RX2, so both are overwritten below:
	 * keep dst and the xor flag on the stack.
	 */
	pushq	%rsi;
	pushq	%rcx;
	movq	%rdx, RIO;

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	/* %rbp is no longer needed as RT0: reuse it for the xor flag */
	popq	%rbp;
	popq	RIO;

	test	%bpl, %bpl;
	jnz	.L__enc_xor4;

	write_block4();

.L__enc_ret4:
	popq	RKEY;
	popq	%rbx;
	popq	%rbp;

	ret;

.L__enc_xor4:
	xor_block4();

	jmp	.L__enc_ret4;

.size __blowfish_enc_blk_4way, . - __blowfish_enc_blk_4way

.align 8
.global blowfish_dec_blk_4way
.type   blowfish_dec_blk_4way,@function;

blowfish_dec_blk_4way:
	/*
	 * Decrypt four consecutive 8-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	pushq	%rbp;
	pushq	%rbx;
	pushq	RKEY;
	preload_roundkey_dec(17);

	/* F() overwrites %rsi (RT1): keep dst on the stack meanwhile */
	pushq	%rsi;
	movq	%rdx, RIO;

	read_block4();

	/* round keys are consumed from entry 17 down to entry 0 */
	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	popq	RIO;
	write_block4();

	popq	RKEY;
	popq	%rbx;
	popq	%rbp;

	ret;

.size blowfish_dec_blk_4way, . - blowfish_dec_blk_4way
