A simple example

Here is the complete code for the simplest assembler implementation of a DDOT primitive (we are implementing the case where incX and incY are known to be 1):
#
#  These macros show integer register usage
#
#define N       %eax    /* number of elements still to be processed */
#define X       %edx    /* address of the current element of the first vector */
#define Y       %ecx    /* address of the current element of the second vector */

#
#double ATL_UDOT(const int N, const double *X, const int incX,
#                             const double *Y, const int incY)
#
#  Dot product of two vectors of doubles, unit stride assumed: the incX
#  and incY arguments are never read.  All arguments are fetched from the
#  stack and the sum is left in %st(0).  A count of zero or less gives 0.0
#  without touching either vector.
#
.global ATL_UDOT
        .type   ATL_UDOT,@function
ATL_UDOT:
#
#       Load parameters (12(%esp) holds incX, which is skipped)
#
        movl    4(%esp), N
        movl    8(%esp), X
        movl    16(%esp), Y
#
#       Dot product starts at 0
#
        fldz
#
#       Nothing to accumulate for a count <= 0: without this check the loop
#       body would run once and the counter would then wrap past zero
#
        testl   N, N
        jle     DONE
LOOP:
        fldl    (X)
        fldl    (Y)
        fmulp   %st, %st(1)
#
#       Step both pointers by one double (8 bytes)
#
        addl    $8, X
        addl    $8, Y
        faddp   %st, %st(1)
#
#       The integer adds above are done before dec so that jnz tests the count
#
        dec     N
        jnz     LOOP
DONE:
        ret
        .size   ATL_UDOT, .-ATL_UDOT

Notice that because we are able to confine ourselves to the three scratch registers (%eax, %edx and %ecx), we have an empty function prologue and epilogue (we do not save any registers or move the stack pointer).



Clint Whaley 2012-07-10