/* * mp32opt.ia64.S * * Assembler optimized multiprecision integer routines for ia64 (Intel Itanium) * * Compile target is GNU Assembler * * Copyright (c) 2000, 2001 Virtual Unlimited B.V. * * Author: Bob Deblier * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ /* * I will need to implement 64 bit multiprecision assembler-optimized routines * before this platform can be tested adequately. The current 32 bit ones suffer * from loading into integer registers, conversion to floating point, doing the xma * and converting back to integer; 64 bit values can be loaded directly into * floating point registers, which should shave off a lot of cycles. 
*/

#include "config.gas.h"

/*
 * Register aliases shared by the routines below.  All of them are static
 * general registers from the scratch range, so none needs to be preserved
 * for the caller.
 *
 * NOTE: these are preprocessor macros; do not use the bare words "size",
 * "dst" or "src" in directives or symbol names in this file.
 */
#define saved_pfs	r14	/* ar.pfs as it was on entry                 */
#define saved_lc	r15	/* ar.lc as it was on entry                  */
#define size		r16	/* word count, later (word count - 1)        */
#define dst		r17	/* running destination pointer               */
#define src		r18	/* running source pointer                    */
#define saved_pr	r21	/* predicate registers as they were on entry */

	.file	"mp32opt.ia64.S"

	.text
	.explicit

/*
 * mp32zero
 *
 * Stores a zero 32-bit word to in0 consecutive words starting at in1.
 *
 * In:    in0 = number of 32-bit words (only the low 32 bits are used;
 *              presumably a uint32 in the C prototype - confirm)
 *        in1 = address of the first word
 * Out:   nothing
 * Uses:  r14-r17, p6, ar.lc (restored)
 *
 * A count of zero returns immediately without touching memory.
 */
	.align	32
	.global	mp32zero#
	.proc	mp32zero#

mp32zero:
	alloc	saved_pfs = ar.pfs,2,0,0,0
	mov	saved_lc = ar.lc
	cmp4.eq	p6,p0 = in0,r0		/* p6 = (count == 0)                  */
	adds	size = -1,in0		/* loop counter wants count - 1       */
	mov	dst = in1;;

	zxt4	size = size		/* drop whatever is in the upper bits */
(p6)	br.ret.spnt b0;;		/* nothing to do for an empty vector  */

	mov	ar.lc = size;;

.L00:
	st4	[dst] = r0,4
	/* plain counted loop: needs neither ar.ec nor predicate rotation */
	br.cloop.sptk .L00;;

	mov	ar.lc = saved_lc
	mov	ar.pfs = saved_pfs
	br.ret.sptk b0

	.endp	mp32zero#

/*
 * mp32copy
 *
 * Copies in0 32-bit words from in2 to in1, lowest address first.
 *
 * In:    in0 = number of 32-bit words (only the low 32 bits are used)
 *        in1 = destination address
 *        in2 = source address
 * Out:   nothing
 * Uses:  r14-r18, r21, rotating r32-r39, p6, p16-p63 (restored),
 *        ar.lc (restored), ar.ec
 *
 * The loop is a two-stage software pipeline: stage one (p16) loads a word
 * into r32, the register rotation renames it to r33, and stage two (p17)
 * stores it one iteration later.  ar.ec = 2 provides the extra epilogue
 * iteration that drains the final store.
 *
 * The input registers live in the rotating region, which is why in1/in2
 * are copied to static registers before the loop starts.
 */
	.align	32
	.global	mp32copy#
	.proc	mp32copy#

mp32copy:
	alloc	saved_pfs = ar.pfs,3,5,0,8
	mov	saved_lc = ar.lc
	cmp4.eq	p6,p0 = in0,r0		/* p6 = (count == 0)                  */
	adds	size = -1,in0
	mov	dst = in1
	mov	src = in2;;

	mov	saved_pr = pr		/* rotating predicates must survive   */
	zxt4	size = size
(p6)	br.ret.spnt b0;;		/* nothing to do for an empty vector  */

	mov	ar.lc = size
	mov	ar.ec = 2
	mov	pr.rot = (1 << 16);;	/* p16 = 1, all other rotating = 0    */

.L01:
(p17)	st4	[dst] = r33,4
(p16)	ld4	r32 = [src],4;;
	br.ctop.sptk .L01;;

	mov	ar.lc = saved_lc
	mov	pr = saved_pr,-1
	mov	ar.pfs = saved_pfs
	br.ret.sptk b0

	.endp	mp32copy#

/*
 * mp32z - DISABLED, unfinished fragment.
 *
 * Kept exactly as found: it has no loop branch, no return and no closing
 * directive, and "br.exit" is not a complete branch mnemonic.  Do not
 * enable without finishing it.
 */
	.if 0
	.align	32
	.global	mp32z
	.type	mp32z,@function

mp32z:
	alloc	r14 = ar.pfs,2,6,0,8
	mov	r15 = ar.lc
	sub	r16= in0,r0,1
	mov	r17 = in1
	mov	r18 = in2;;
	mov	r8 = 1
	mov	pr.rot = 1 << 16
	mov	ar.ec = 2
	mov	ar.lc = r16;;
.L02:
	(p16) ld4 r32 = [r18],4
	(p18) cmp.eq p0,p32 = r34,r0
	(p33) mov r8 = r0
	(p33) br.exit
	.endif

/*
 * mp32add
 *
 * Adds the in0-word vector at in2 into the in0-word vector at in1, in
 * place.  Words are processed from the highest address down to the
 * lowest, propagating a carry, i.e. the word at the highest address is
 * treated as the least significant one.
 *
 * In:    in0 = number of 32-bit words (only the low 32 bits are used)
 *        in1 = address of the vector that is updated
 *        in2 = address of the vector that is added
 * Out:   r8  = carry out of the last word processed (0 or 1);
 *              0 when the count is zero
 * Uses:  r14-r20, p6, p7, ar.lc (restored)
 *
 * ld4 zero-extends, so each 64-bit sum carries its carry in bit 32.  That
 * bit is tested at the top of the next iteration to pick the "+1" form of
 * the add, and extracted as the return value after the loop.
 */
	.align	32
	.global	mp32add#
	.proc	mp32add#

mp32add:
	alloc	saved_pfs = ar.pfs,3,0,0,0
	mov	saved_lc = ar.lc
	cmp4.eq	p6,p0 = in0,r0		/* p6 = (count == 0)                  */
	adds	size = -1,in0		/* index of the last word             */
	mov	r8 = r0;;		/* clear carry                        */

	zxt4	size = size
(p6)	br.ret.spnt b0;;		/* empty vectors: carry stays zero    */

	/* point at the last word of each vector and load the loop count */
	shladd	dst = size,2,in1
	shladd	src = size,2,in2
	mov	ar.lc = size;;

.L20:
	ld4	r20 = [src],-4
	ld4	r19 = [dst]
	tbit.z	p6,p7 = r8,32;;		/* p6 = no carry in, p7 = carry in    */
(p6)	add	r8 = r19,r20
(p7)	add	r8 = r19,r20,1;;
	st4	[dst] = r8,-4
	br.cloop.sptk .L20;;

	extr.u	r8 = r8,32,1		/* return the final carry             */
	mov	ar.lc = saved_lc
	mov	ar.pfs = saved_pfs
	br.ret.sptk b0

	.endp	mp32add#

/*
 * mp32sub
 *
 * Subtracts the in0-word vector at in2 from the in0-word vector at in1,
 * in place.  Words are processed from the highest address down to the
 * lowest, propagating a borrow.
 *
 * In:    in0 = number of 32-bit words (only the low 32 bits are used)
 *        in1 = address of the vector that is updated
 *        in2 = address of the vector that is subtracted
 * Out:   r8  = borrow out of the last word processed (0 or 1);
 *              0 when the count is zero
 * Uses:  r14-r20, p6, p7, ar.lc (restored)
 *
 * Both operands are zero-extended 32-bit values, so a 64-bit difference
 * that went negative has bit 32 set; that bit is the borrow.
 */
	.align	32
	.global	mp32sub#
	.proc	mp32sub#

mp32sub:
	alloc	saved_pfs = ar.pfs,3,0,0,0
	mov	saved_lc = ar.lc
	cmp4.eq	p6,p0 = in0,r0		/* p6 = (count == 0)                  */
	adds	size = -1,in0		/* index of the last word             */
	mov	r8 = r0;;		/* clear borrow                       */

	zxt4	size = size
(p6)	br.ret.spnt b0;;		/* empty vectors: borrow stays zero   */

	/* point at the last word of each vector and load the loop count */
	shladd	dst = size,2,in1
	shladd	src = size,2,in2
	mov	ar.lc = size;;

.L30:
	ld4	r20 = [src],-4
	ld4	r19 = [dst]
	tbit.z	p6,p7 = r8,32;;		/* p6 = no borrow in, p7 = borrow in  */
(p6)	sub	r8 = r19,r20
(p7)	sub	r8 = r19,r20,1;;
	st4	[dst] = r8,-4
	br.cloop.sptk .L30;;

	extr.u	r8 = r8,32,1		/* return the final borrow            */
	mov	ar.lc = saved_lc
	mov	ar.pfs = saved_pfs
	br.ret.sptk b0

	.endp	mp32sub#

/*
 * mp32setmul / mp32addmul - DISABLED (not assembled).
 *
 * Both walk the vector at in2 from the highest address down, multiply
 * each word by in3 through the floating-point unit (setf.sig / xma.l /
 * getf.sig) and keep the upper 32 bits of every 64-bit result as the
 * carry into the next word.  mp32setmul stores the low 32 bits to the
 * vector at in1; mp32addmul first adds the word already stored there.
 * r8 returns the final carry.
 *
 * NOTE(review): before enabling, add the same zero-count guard and
 * zero-extension of in0/in3 that the routines above use.
 */
	.if 0
	.align	32
	.global	mp32setmul#
	.proc	mp32setmul#

mp32setmul:
	alloc	r14 = ar.pfs,4,0,0,0
	mov	r15 = ar.lc
	/* load mul */
	setf.sig f96 = in3
	/* adjust size by -1 */
	sub	r16 = in0,r0,1
	/* clear carry */
	mov	r8 = r0;;
	/* adjust addresses */
	shladd	r17 = r16,2,in1
	shladd	r18 = r16,2,in2
	/* load loop count */
	mov	ar.lc = r16;;
.L40:
	ld4	r19 = [r18],-4;;
	setf.sig f98 = r8
	setf.sig f97 = r19;;
	/* multiplication can only be done in f registers, but we do have a multiply-add */
	xma.l	f98 = f96,f97,f98;;
	getf.sig r8 = f98;;
	st4	[r17] = r8,-4
	shr.u	r8 = r8,32
	br.cloop.sptk .L40;;
	mov	ar.lc = r15
	mov	ar.pfs = r14
	br.ret.sptk b0

	.endp	mp32setmul#

	.align	32
	.global	mp32addmul#
	.proc	mp32addmul#

mp32addmul:
	alloc	saved_pfs = ar.pfs,4,0,0,0
	mov	saved_lc = ar.lc
	/* load mul */
	setf.sig f96 = in3
	/* adjust size by -1 */
	sub	size = in0,r0,1
	/* clear carry */
	mov	r8 = r0;;
	/* adjust addresses */
	shladd	dst = size,2,in1
	shladd	src = size,2,in2
	/* load loop count */
	mov	ar.lc = size;;
.L50:
	ld4	r19 = [dst]
	/* the multiplicand comes from the source vector; only the store below steps the destination pointer */
	ld4	r20 = [src],-4;;
	setf.sig f98 = r8
	setf.sig f97 = r20;;
	/* multiplication can only be done in f registers, but we do have a multiply-add */
	xma.l	f98 = f96,f97,f98;;
	getf.sig r8 = f98;;
	add	r8 = r8,r19;;
	st4	[dst] = r8,-4
	shr.u	r8 = r8,32
	br.cloop.sptk .L50;;
	mov	ar.lc = saved_lc
	mov	ar.pfs = saved_pfs
	br.ret.sptk b0

	.endp	mp32addmul#
	.endif

/*
 * mp32addsqrtrc - DISABLED, empty placeholder.
 */
	.if 0
	.align	16
	.global	mp32addsqrtrc#
	.proc	mp32addsqrtrc#

mp32addsqrtrc:

	.endp	mp32addsqrtrc#
	.endif