diff options
Diffstat (limited to 'crc_i386.S')
-rw-r--r-- | crc_i386.S | 240 |
1 files changed, 240 insertions, 0 deletions
diff --git a/crc_i386.S b/crc_i386.S new file mode 100644 index 0000000..d23967b --- /dev/null +++ b/crc_i386.S @@ -0,0 +1,240 @@ +/* + Copyright (c) 1990-2005 Info-ZIP. All rights reserved. + + See the accompanying file LICENSE, version 2004-May-22 or later + (the contents of which are also included in zip.h) for terms of use. + If, for some reason, both of these files are missing, the Info-ZIP license + also may be found at: ftp://ftp.info-zip.org/pub/infozip/license.html +*/ +/* + * crc_i386.S, optimized CRC calculation function for Zip and UnZip, + * created by Paul Kienitz and Christian Spieler. + * + * GRR 961110: incorporated Scott Field optimizations from win32/crc_i386.asm + * => overall 6% speedup in "unzip -tq" on 9MB zipfile (486-66) + * + * SPC 970402: revised for Rodney Brown's optimizations (32-bit-wide + * aligned reads for most of the data from buffer), can be + * disabled by defining the macro NO_32_BIT_LOADS + * + * SPC 971012: added Rodney Brown's additional tweaks for 32-bit-optimized + * CPUs (like the Pentium Pro, Pentium II, and probably some + * Pentium clones). This optimization is controlled by the + * preprocessor switch "__686" and is disabled by default. + * (This default is based on the assumption that most users + * do not yet work on a Pentium Pro or Pentium II machine ...) + * + * COS 050116: Enabled the 686 build by default, because there are hardly any + * pre-686 CPUs in serious use nowadays. (See SPC 970402 above.) + * + * FLAT memory model assumed. Calling interface: + * - args are pushed onto the stack from right to left, + * - return value is given in the EAX register, + * - all other registers (with exception of EFLAGS) are preserved. (With + * GNU C 2.7.x, %edx and %ecx are `scratch' registers, but preserving + * them nevertheless adds only 4 single byte instructions.) + * + * This source generates the function + * ulg crc32(ulg crc, ZCONST uch *buf, extent len). + * + * Loop unrolling can be disabled by defining the macro NO_UNROLLED_LOOPS. + * This results in shorter code at the expense of reduced performance. + */ + +/* This file is NOT used in conjunction with zlib. */ +#ifndef USE_ZLIB + +/* Preprocess with -DNO_UNDERLINE if your C compiler does not prefix + * external symbols with an underline character '_'. + */ +#if defined(NO_UNDERLINE) || defined(__ELF__) +# define _crc32 crc32 +# define _get_crc_table get_crc_table +#endif +/* Use 16-byte alignment if your assembler supports it. Warning: gas + * uses a log(x) parameter (.align 4 means 16-byte alignment). On SVR4 + * the parameter is a number of bytes. + */ +#ifndef ALIGNMENT +# define ALIGNMENT .align 4,0x90 +#endif + +#if defined(i386) || defined(_i386) || defined(_I386) || defined(__i386) + +/* This version is for 386 Unix, OS/2, MSDOS in 32 bit mode (gcc & gas). + * Warning: it uses the AT&T syntax: mov source,dest + * This file is only optional. If you want to use the C version, + * remove -DASM_CRC from CFLAGS in Makefile and set OBJA to an empty string. + */ + + .file "crc_i386.S" + +#if !defined(PRE_686) && !defined(__686) + /* Optimize for Pentium Pro and compatible CPUs by default. */ +# define __686 +#endif + +#if defined(NO_STD_STACKFRAME) && defined(USE_STD_STACKFRAME) +# undef USE_STACKFRAME +#else + /* The default is to use standard stack frame entry, because it + * results in smaller code! + */ +# ifndef USE_STD_STACKFRAME +# define USE_STD_STACKFRAME +# endif +#endif + +#ifdef USE_STD_STACKFRAME +# define _STD_ENTRY pushl %ebp ; movl %esp,%ebp +# define arg1 8(%ebp) +# define arg2 12(%ebp) +# define arg3 16(%ebp) +# define _STD_LEAVE popl %ebp +#else /* !USE_STD_STACKFRAME */ +# define _STD_ENTRY +# define arg1 24(%esp) +# define arg2 28(%esp) +# define arg3 32(%esp) +# define _STD_LEAVE +#endif /* ?USE_STD_STACKFRAME */ + +/* + * These two (three) macros make up the loop body of the CRC32 cruncher. + * registers modified: + * eax : crc value "c" + * esi : pointer to next data byte (or lword) "buf++" + * registers read: + * edi : pointer to base of crc_table array + * scratch registers: + * ebx : index into crc_table array + * (requires upper three bytes = 0 when __686 is undefined) + */ +#ifndef __686 /* optimize for 386, 486, Pentium */ +#define Do_CRC /* c = (c >> 8) ^ table[c & 0xFF] */\ + movb %al, %bl ;/* tmp = c & 0xFF */\ + shrl $8, %eax ;/* c = (c >> 8) */\ + xorl (%edi, %ebx, 4), %eax ;/* c ^= table[tmp] */ +#else /* __686 : optimize for Pentium Pro and compatible CPUs */ +#define Do_CRC /* c = (c >> 8) ^ table[c & 0xFF] */\ + movzbl %al, %ebx ;/* tmp = c & 0xFF */\ + shrl $8, %eax ;/* c = (c >> 8) */\ + xorl (%edi, %ebx, 4), %eax ;/* c ^=table[tmp] */ +#endif /* ?__686 */ + +#define Do_CRC_byte /* c = (c >> 8) ^ table[(c^*buf++)&0xFF] */\ + xorb (%esi), %al ;/* c ^= *buf */\ + incl %esi ;/* buf++ */\ + Do_CRC + +#ifndef NO_32_BIT_LOADS +#define Do_CRC_lword \ + xorl (%esi), %eax ;/* c ^= *(ulg *)buf */\ + addl $4, %esi ;/* ((ulg *)buf)++ */\ + Do_CRC \ + Do_CRC \ + Do_CRC \ + Do_CRC +#endif /* !NO_32_BIT_LOADS */ + + + .text + + .globl _crc32 + +_crc32: /* ulg crc32(ulg crc, uch *buf, extent len) */ + _STD_ENTRY + pushl %edi + pushl %esi + pushl %ebx + pushl %edx + pushl %ecx + + movl arg2, %esi /* 2nd arg: uch *buf */ + subl %eax, %eax /* > if (!buf) */ + testl %esi, %esi /* > return 0; */ + jz .L_fine /* > else { */ + call _get_crc_table + movl %eax, %edi + movl arg1, %eax /* 1st arg: ulg crc */ +#ifndef __686 + subl %ebx, %ebx /* ebx=0; bl usable as dword */ +#endif + movl arg3, %ecx /* 3rd arg: extent len */ + notl %eax /* > c = ~crc; */ + + testl %ecx, %ecx +#ifndef NO_UNROLLED_LOOPS + jz .L_bail +# ifndef NO_32_BIT_LOADS + /* Assert now have positive length */ +.L_align_loop: + testl $3, %esi /* Align buf on lword boundary */ + jz .L_aligned_now + Do_CRC_byte + decl %ecx + jnz .L_align_loop +.L_aligned_now: +# endif /* !NO_32_BIT_LOADS */ + movl %ecx, %edx /* save len in edx */ + shrl $3, %ecx /* ecx = len / 8 */ + jz .L_No_Eights +/* align loop head at start of 486 internal cache line !! */ + ALIGNMENT +.L_Next_Eight: +# ifndef NO_32_BIT_LOADS + /* Do_CRC_lword */ + xorl (%esi), %eax ;/* c ^= *(ulg *)buf */ + addl $4, %esi ;/* ((ulg *)buf)++ */ + Do_CRC + Do_CRC + Do_CRC + Do_CRC + /* Do_CRC_lword */ + xorl (%esi), %eax ;/* c ^= *(ulg *)buf */ + addl $4, %esi ;/* ((ulg *)buf)++ */ + Do_CRC + Do_CRC + Do_CRC + Do_CRC +# else /* NO_32_BIT_LOADS */ + Do_CRC_byte + Do_CRC_byte + Do_CRC_byte + Do_CRC_byte + Do_CRC_byte + Do_CRC_byte + Do_CRC_byte + Do_CRC_byte +# endif /* ?NO_32_BIT_LOADS */ + decl %ecx + jnz .L_Next_Eight + +.L_No_Eights: + movl %edx, %ecx + andl $7, %ecx /* ecx = len % 8 */ +#endif /* !NO_UNROLLED_LOOPS */ + jz .L_bail /* > if (len) */ +/* align loop head at start of 486 internal cache line !! */ + ALIGNMENT +.L_loupe: /* > do { */ + Do_CRC_byte /* c = CRC32(c, *buf++); */ + decl %ecx /* > } while (--len); */ + jnz .L_loupe + +.L_bail: /* > } */ + notl %eax /* > return ~c; */ +.L_fine: + popl %ecx + popl %edx + popl %ebx + popl %esi + popl %edi + _STD_LEAVE + ret + +#else + error: this asm version is for 386 only +#endif /* i386 || _i386 || _I386 || __i386 */ + +#endif /* !USE_ZLIB */ |