- 论坛徽章:
- 0
|
需要一个简单快速的加密算法.
看过 DES, AES 后, 感觉在速度上难以符合要求.
就设计了一个简单的( 当然没有 DES, AES 安全性高 ):
以 8 个字节( 64 bit )为单位, 按密钥指定的顺序, 将每一个 bit 调整位置. 然后再和一个 64 bit 的异或码异或.
解密时按相反的顺序.
先写的 c 代码, 用了查表法.
后写的汇编代码, 也用了查表法.
谁知道将 c 代码用 gcc -O2 优化后, 竟然比我的汇编代码还快.
我可真生气了.
用 gcc -S -O2 根据 .c 文件生成一个 .s 出来,
发现 .s 代码, 竟和我自己写的汇编代码惊人的一致.
仔细对比后, 参照 .s 将:
shr $1, %ebx
jnc \@.next
改为:
test $1<<\bit, %ebx
jz \@.next
才总算扳平. 编解 100,000 个 1472 字节大小的包, c: 1.764646 秒, asm: 1.642376 秒, 汇编仅占约 0.12 秒的优势.
在 x86 (Intel(R) Pentium(R) 4 CPU 3.06GHz) 机器上, 一秒钟可以一编一解 82.5 MBytes 的数据.
但在 ARM 上每秒钟才可以编解 10xx 个包(大概 1.40 MBytes), 太慢了.
想在 ARM 上用汇编优化, 不知道有多大的优化空间?
毕竟 gcc -O2 太厉害了, 在 pc 上用普通 x86 指令优化差别不大.
贴出代码, 求高人指点:
- /* Key for the 64-bit cipher: two XOR masks plus the bit-move tables. */
- typedef struct xbit_t
- {
-     u32_t xor_low;       /* XOR mask for bits 0..31 */
-     u32_t xor_high;      /* XOR mask for bits 32..63 */
-     u8_t  enc_move[64];  /* bit-move table used for encryption */
-     u8_t  dec_move[64];  /* bit-move table used for decryption */
- } xbit_t;
- #include <stdlib.h>
- #include <time.h>
- #include "def_type.h"
- #include "xbit.h"
- /* One slot of the array that gen_key sorts to shuffle the bit indexes. */
- typedef struct sort_t
- {
-     i32_t sort_key;  /* random value the slots are ordered by */
-     i32_t bit_move;  /* bit index carried along with the key */
- } sort_t;
- // qsort() comparator: orders sort_t entries by ascending sort_key.
- // Returns a negative, zero or positive value for a < b, a == b, a > b.
- // A three-way comparison is used instead of subtracting the keys, because
- // the difference of two signed values can overflow; the pointers also stay
- // const-qualified instead of being cast to a non-const type.
- static i32_t compar( const void *a, const void *b )
- {
-     const sort_t *lhs = a;
-     const sort_t *rhs = b;
-     return ( lhs->sort_key > rhs->sort_key ) - ( lhs->sort_key < rhs->sort_key );
- }
- // Purpose: generate a key.
- // Params : key : structure that receives the XOR masks and both bit-move tables
- // NOTE(review): the generator is re-seeded from time(NULL) on every call, so
- // two calls made within the same second yield the same key - confirm this is
- // acceptable for the callers.
- void gen_key( xbit_t *key )
- {
-     i32_t n;
-     u8_t *bytes;
-     sort_t slots[ ARR_E( key->enc_move ) ];
-     // seed the pseudo-random number generator
-     srandom( time(NULL) );
-     // XOR masks: one random() call per byte, low word first, then high word
-     bytes = ( u8_t * )&key->xor_low;
-     n = 0;
-     while( n < sizeof( key->xor_low ) )
-     {
-         bytes[n] = random();
-         n++;
-     }
-     bytes = ( u8_t * )&key->xor_high;
-     n = 0;
-     while( n < sizeof( key->xor_high ) )
-     {
-         bytes[n] = random();
-         n++;
-     }
-     // bit-move tables: tag every index with a random key, then sort by that key
-     for( n = 0; n < ARR_E( slots ); n++ )
-     {
-         slots[n].bit_move = n;
-         slots[n].sort_key = random();
-     }
-     qsort( slots, ARR_E( slots ), sizeof( slots[0] ), compar );
-     // enc_move takes the sorted order; dec_move is filled as its inverse mapping
-     for( n = 0; n < ARR_E( slots ); n++ )
-     {
-         i32_t moved = slots[n].bit_move;
-         key->enc_move[n] = moved;
-         key->dec_move[moved] = n;
-     }
- }
- #ifndef USE_ASM
- // Low-word mask per destination bit index n (0..63):
- // entry n is 1 << n for n in 0..31 and 0 for n in 32..63, so OR-ing
- // tab_low[n] into the low result word only has an effect for bits 0..31.
- static const u32_t tab_low[] = \
- {
- 0x00000001, 0x00000002, 0x00000004, 0x00000008,
- 0x00000010, 0x00000020, 0x00000040, 0x00000080,
- 0x00000100, 0x00000200, 0x00000400, 0x00000800,
- 0x00001000, 0x00002000, 0x00004000, 0x00008000,
- 0x00010000, 0x00020000, 0x00040000, 0x00080000,
- 0x00100000, 0x00200000, 0x00400000, 0x00800000,
- 0x01000000, 0x02000000, 0x04000000, 0x08000000,
- 0x10000000, 0x20000000, 0x40000000, 0x80000000,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- };
- // High-word mask per destination bit index n (0..63):
- // entry n is 0 for n in 0..31 and 1 << (n - 32) for n in 32..63, so OR-ing
- // tab_high[n] into the high result word only has an effect for bits 32..63.
- static const u32_t tab_high[] = \
- {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0x00000001, 0x00000002, 0x00000004, 0x00000008,
- 0x00000010, 0x00000020, 0x00000040, 0x00000080,
- 0x00000100, 0x00000200, 0x00000400, 0x00000800,
- 0x00001000, 0x00002000, 0x00004000, 0x00008000,
- 0x00010000, 0x00020000, 0x00040000, 0x00080000,
- 0x00100000, 0x00200000, 0x00400000, 0x00800000,
- 0x01000000, 0x02000000, 0x04000000, 0x08000000,
- 0x10000000, 0x20000000, 0x40000000, 0x80000000
- };
- // Purpose: encrypt data while copying it from src to dst.
- // Params : key : key to encrypt with
- //          dst : where the encrypted data is stored
- //          src : where the data to encrypt is read from
- //          len : data length in bytes; only whole 8-byte groups are handled,
- //                a trailing len % 8 bytes are neither encrypted nor copied
- // Each group is read as two u32_t words (source bits 0..31, then 32..63).
- // Every set source bit i sets destination bit key->enc_move[i]; the two
- // result words are then XORed with key->xor_low and key->xor_high.
- // src and dst are accessed through u32_t pointers, so both must be suitably
- // aligned for u32_t.
- void xbit_enc( const xbit_t *key, void *dst, const void *src, u32_t len )
- {
-     const u8_t  *bit_move = key->enc_move;   // bit-move table (encryption)
-     const u32_t  xor_low  = key->xor_low;    // XOR mask, bits 0..31
-     const u32_t  xor_high = key->xor_high;   // XOR mask, bits 32..63
-     // typed cursors: keeps src const and avoids arithmetic on void *,
-     // which is a GNU extension rather than ISO C
-     const u32_t *in     = src;
-     u32_t       *out    = dst;
-     u32_t        groups = len >> 3;          // number of 8-byte groups
-     while( groups-- )
-     {
-         u32_t l = 0, h = 0;
-         u32_t word, bit;
-         // both input words are fully consumed before anything is stored
-         for( word = 0; word < 2; word++ )
-         {
-             const u32_t s = in[word];
-             const u8_t *move = bit_move + 32 * word;
-             // fixed trip count: the compiler is free to unroll this loop;
-             // the mask is built from an unsigned 1 so bit 31 is well defined
-             for( bit = 0; bit < 32; bit++ )
-             {
-                 if( s & ( (u32_t)1 << bit ) )
-                 {
-                     l |= tab_low [ move[bit] ];
-                     h |= tab_high[ move[bit] ];
-                 }
-             }
-         }
-         out[0] = l ^ xor_low;
-         out[1] = h ^ xor_high;
-         in  += 2;
-         out += 2;
-     }
- }
- // Purpose: decrypt data while copying it.
- // Params : key : key
- // dst : where the decrypted data is stored
- // src : where the data to decrypt is read from
- // len : data length ( in bytes )
- void xbit_dec( const xbit_t *key, void *dst, const void *src, u32_t len )
- {
- // (body omitted in the original post)
- }
- #endif
- // 汇编代码
- .file "xbit-x86.S"
- // MV32BIT bit, cnt: recursive macro that expands to 32 copies of the
- // "test one source bit, OR in its destination mask" step, for cnt = 0..31.
- //   bit : byte offset from %ebp of the move-table entry for the current bit
- //   cnt : number of the source bit inside the 32-bit word (defaults to 0)
- // In:  %ebx = source word, %ebp = base of the move table
- // Out: %eax / %edx accumulate the low / high result word
- // Clobbers %ebx (shifted right by 16 half way through), %ecx and the flags.
- // The at-sign pseudo-variable in the label makes it unique per expansion.
- .macro MV32BIT bit, cnt = 0
- // bits 0..7 of each 16-bit half are tested in %bl, bits 8..15 in %bh
- .if \cnt % 16 < 8
- test $1<<(\cnt%8), %bl
- .else
- test $1<<(\cnt%8), %bh
- .endif
- jz .\@next
- // source bit set: %ecx = destination index, then OR in both table masks
- movzbl \bit(%ebp), %ecx
- or TAB(,%ecx,4), %eax
- or TAB+128(,%ecx,4), %edx
- .\@next:
- // after source bit 15, bring the upper 16 bits of the word down into %bx
- .ifeq \cnt - 15
- shr $16, %ebx
- .endif
- // recurse for the next bit until bit 31 has been emitted
- .if \cnt - 31
- MV32BIT "(\bit+1)", "(\cnt+1)"
- .endif
- .endm
- .text
- .p2align 4,,15
- .globl xbit_enc
- .type xbit_enc, @function
- // void xbit_enc( const xbit_t *key, void *dst, const void *src, u32_t len )
- // Encrypts len / 8 groups of 8 bytes from src into dst.
- // The callee-saved registers are pushed instead of being stored below the
- // stack pointer: i386 has no red zone, so anything below the stack pointer
- // may be overwritten, for example by a signal frame.
- // Stack after the four pushes: 20 = key, 24 = dst, 28 = src, 32 = len.
- xbit_enc:
- push %ebp
- push %edi
- push %esi
- push %ebx
- shrl $3, 32(%esp) # len /= 8, the group count stays in the argument slot
- jz .ENC_EXIT # len > 0 ?
- mov 20(%esp), %ebp # ebp = key
- mov 24(%esp), %edi # edi = dst
- mov 28(%esp), %esi # esi = src
- lea 8(%ebp), %ebp # ebp = key->enc_move
- .p2align 4,,7
- .ENC_LOOP:
- xor %eax, %eax # l = 0
- xor %edx, %edx # h = 0
- mov (%esi), %ebx # ebx = src[0]
- MV32BIT 0 # edx:eax <- ebx
- mov 4(%esi), %ebx # ebx = src[1]
- MV32BIT 32 # edx:eax <- ebx
- xor -8(%ebp), %eax # l ^= xor_low
- xor -4(%ebp), %edx # h ^= xor_high
- mov %eax, (%edi) # dst[0] = l
- mov %edx, 4(%edi) # dst[1] = h
- lea 8(%esi), %esi # src += 8
- lea 8(%edi), %edi # dst += 8
- decl 32(%esp) # len -= 1
- jnz .ENC_LOOP # len > 0 ?
- .p2align 4,,7
- .ENC_EXIT:
- pop %ebx
- pop %esi
- pop %edi
- pop %ebp
- ret
- .size xbit_enc, .-xbit_enc
- .p2align 4,,15
- .globl xbit_dec
- .type xbit_dec, @function
- // (body of xbit_dec omitted in the original post)
- .size xbit_dec, .-xbit_dec
- // TAB: 96 longs (384 bytes) laid out as 32 single-bit masks, 32 zeros,
- // 32 single-bit masks. Indexed with a destination bit number n in 0..63:
- //   TAB(,n,4)     gives the low-word mask  (1 << n for n < 32, else 0)
- //   TAB+128(,n,4) gives the high-word mask (0 for n < 32, else 1 << (n - 32))
- // The two views share the run of zeros in the middle.
- .section .rodata
- .align 32
- .type TAB, @object
- .size TAB, 384
- TAB:
- .long 0x00000001, 0x00000002, 0x00000004, 0x00000008
- .long 0x00000010, 0x00000020, 0x00000040, 0x00000080
- .long 0x00000100, 0x00000200, 0x00000400, 0x00000800
- .long 0x00001000, 0x00002000, 0x00004000, 0x00008000
- .long 0x00010000, 0x00020000, 0x00040000, 0x00080000
- .long 0x00100000, 0x00200000, 0x00400000, 0x00800000
- .long 0x01000000, 0x02000000, 0x04000000, 0x08000000
- .long 0x10000000, 0x20000000, 0x40000000, 0x80000000
- .long 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .long 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .long 0x00000001, 0x00000002, 0x00000004, 0x00000008
- .long 0x00000010, 0x00000020, 0x00000040, 0x00000080
- .long 0x00000100, 0x00000200, 0x00000400, 0x00000800
- .long 0x00001000, 0x00002000, 0x00004000, 0x00008000
- .long 0x00010000, 0x00020000, 0x00040000, 0x00080000
- .long 0x00100000, 0x00200000, 0x00400000, 0x00800000
- .long 0x01000000, 0x02000000, 0x04000000, 0x08000000
- .long 0x10000000, 0x20000000, 0x40000000, 0x80000000
- .ident "GCC: (GNU) 4.1.2 20061115 (prerelease) (Debian 4.1.1-21)"
- .section .note.GNU-stack,"",@progbits
复制代码
(附件有源码)
[ 本帖最后由 guoruimin 于 2008-4-30 08:44 编辑 ] |
|