A simple double precision real implementation of GER would then be:
void ATL_dger1_a1_x1_yX(const int M, const int N, const double alpha, const double *X, const int incX, const double *Y, const int incY, double *A, const int lda) { int i, j; register double y0; for (j=0; j < N; j++) { y0 = Y[j*incY]; for (i=0; i < M; i++) A[i+j*lda] += X[i] * y0; } }Which we can test and time with:
speedy. make dr1tstcase r1rout=CASES/ATL_dger1_1x1_1.c xu=1 yu=1 TEST CONJ=0, M=997, N=177, lda=1006, incY=1, STARTED TEST CONJ=0, M=997, N=177, lda=1006, incY=1, PASSED TEST CONJ=0, M=997, N=177, lda=1006, incY=3, STARTED TEST CONJ=0, M=997, N=177, lda=1006, incY=3, PASSED TEST CONJ=0, M=997, N=177, lda=1006, incY=-3, STARTED TEST CONJ=0, M=997, N=177, lda=1006, incY=-3, PASSED speedy. make dr1case r1rout=CASES/ATL_dger1_1x1_1.c xu=1 yu=1 dger_0 : 31.168831 MFLOPS dger_0 : 31.788079 MFLOPS dger_0 : 31.578947 MFLOPS dger_0 : 31.51 MFLOPS
A simple double precision complex implementation would be:
#ifdef Conj_ void ATL_zger1c_a1_x1_yX #else void ATL_zger1u_a1_x1_yX #endif (const int M, const int N, const double *alpha, const double *X, const int incX, const double *Y, const int incY, double *A, const int lda) { const int M2 = M<<1, N2 = N<<1; int i, j; register double ry, iy, ra, ia, rx, ix; for (j=0; j < N2; j += 2) { ry = Y[incY*j]; iy = Y[incY*j+1]; for (i=0; i < M2; i += 2) { rx = X[i]; ix = X[i+1]; ra = rx * ry; ia = ix * ry; #ifdef Conj_ ra += ix * iy; ia -= rx * iy; #else ra -= ix * iy; ia += rx * iy; #endif A[i+j*lda] += ra; A[i+j*lda+1] += ia; } } }
Which we test in time by:
speedy. make zr1tstcase r1rout=CASES/ATL_zger1_1x1_1.c xu=1 yu=1 TEST CONJ=0, M=997, N=177, lda=1006, incY=1, STARTED TEST CONJ=0, M=997, N=177, lda=1006, incY=1, PASSED TEST CONJ=0, M=997, N=177, lda=1006, incY=3, STARTED TEST CONJ=0, M=997, N=177, lda=1006, incY=3, PASSED TEST CONJ=0, M=997, N=177, lda=1006, incY=-3, STARTED TEST CONJ=0, M=997, N=177, lda=1006, incY=-3, PASSED TEST CONJ=1, M=997, N=177, lda=1006, incY=1, STARTED TEST CONJ=1, M=997, N=177, lda=1006, incY=1, PASSED TEST CONJ=1, M=997, N=177, lda=1006, incY=3, STARTED TEST CONJ=1, M=997, N=177, lda=1006, incY=3, PASSED TEST CONJ=1, M=997, N=177, lda=1006, incY=-3, STARTED TEST CONJ=1, M=997, N=177, lda=1006, incY=-3, PASSED speedy. make zr1case r1rout=CASES/ATL_zger1_1x1_1.c xu=1 yu=1 zger_0 : 39.024390 MFLOPS zger_0 : 38.709677 MFLOPS zger_0 : 39.024390 MFLOPS zger_0 : 38.92 MFLOPS