A simple double precision real implementation of GER would then be:
void ATL_dger1_a1_x1_yX(const int M, const int N, const double alpha,
const double *X, const int incX, const double *Y,
const int incY, double *A, const int lda)
{
int i, j;
register double y0;
for (j=0; j < N; j++)
{
y0 = Y[j*incY];
for (i=0; i < M; i++) A[i+j*lda] += X[i] * y0;
}
}
Which we can test and time with:
speedy. make dr1tstcase r1rout=CASES/ATL_dger1_1x1_1.c xu=1 yu=1
TEST CONJ=0, M=997, N=177, lda=1006, incY=1, STARTED
TEST CONJ=0, M=997, N=177, lda=1006, incY=1, PASSED
TEST CONJ=0, M=997, N=177, lda=1006, incY=3, STARTED
TEST CONJ=0, M=997, N=177, lda=1006, incY=3, PASSED
TEST CONJ=0, M=997, N=177, lda=1006, incY=-3, STARTED
TEST CONJ=0, M=997, N=177, lda=1006, incY=-3, PASSED
speedy. make dr1case r1rout=CASES/ATL_dger1_1x1_1.c xu=1 yu=1
dger_0 : 31.168831 MFLOPS
dger_0 : 31.788079 MFLOPS
dger_0 : 31.578947 MFLOPS
dger_0 : 31.51 MFLOPS
A simple double precision complex implementation would be:
#ifdef Conj_
void ATL_zger1c_a1_x1_yX
#else
void ATL_zger1u_a1_x1_yX
#endif
(const int M, const int N, const double *alpha, const double *X,
const int incX, const double *Y, const int incY, double *A, const int lda)
{
const int M2 = M<<1, N2 = N<<1;
int i, j;
register double ry, iy, ra, ia, rx, ix;
for (j=0; j < N2; j += 2)
{
ry = Y[incY*j];
iy = Y[incY*j+1];
for (i=0; i < M2; i += 2)
{
rx = X[i];
ix = X[i+1];
ra = rx * ry;
ia = ix * ry;
#ifdef Conj_
ra += ix * iy;
ia -= rx * iy;
#else
ra -= ix * iy;
ia += rx * iy;
#endif
A[i+j*lda] += ra;
A[i+j*lda+1] += ia;
}
}
}
Which we test in time by:
speedy. make zr1tstcase r1rout=CASES/ATL_zger1_1x1_1.c xu=1 yu=1
TEST CONJ=0, M=997, N=177, lda=1006, incY=1, STARTED
TEST CONJ=0, M=997, N=177, lda=1006, incY=1, PASSED
TEST CONJ=0, M=997, N=177, lda=1006, incY=3, STARTED
TEST CONJ=0, M=997, N=177, lda=1006, incY=3, PASSED
TEST CONJ=0, M=997, N=177, lda=1006, incY=-3, STARTED
TEST CONJ=0, M=997, N=177, lda=1006, incY=-3, PASSED
TEST CONJ=1, M=997, N=177, lda=1006, incY=1, STARTED
TEST CONJ=1, M=997, N=177, lda=1006, incY=1, PASSED
TEST CONJ=1, M=997, N=177, lda=1006, incY=3, STARTED
TEST CONJ=1, M=997, N=177, lda=1006, incY=3, PASSED
TEST CONJ=1, M=997, N=177, lda=1006, incY=-3, STARTED
TEST CONJ=1, M=997, N=177, lda=1006, incY=-3, PASSED
speedy. make zr1case r1rout=CASES/ATL_zger1_1x1_1.c xu=1 yu=1
zger_0 : 39.024390 MFLOPS
zger_0 : 38.709677 MFLOPS
zger_0 : 39.024390 MFLOPS
zger_0 : 38.92 MFLOPS