GER examples

A simple double precision real implementation of GER would then be:

void ATL_dger1_a1_x1_yX(const int M, const int N, const double alpha,
                        const double *X, const int incX, const double *Y,
                        const int incY, double *A, const int lda)
{
   int i, j;
   register double y0;

   for (j=0; j < N; j++)
   {
      y0 = Y[j*incY];
      for (i=0; i < M; i++) A[i+j*lda] += X[i] * y0;
   }
}
Which we can test and time with:
speedy. make dr1tstcase r1rout=CASES/ATL_dger1_1x1_1.c xu=1 yu=1
   TEST CONJ=0, M=997, N=177, lda=1006, incY=1, STARTED
   TEST CONJ=0, M=997, N=177, lda=1006, incY=1, PASSED
   TEST CONJ=0, M=997, N=177, lda=1006, incY=3, STARTED
   TEST CONJ=0, M=997, N=177, lda=1006, incY=3, PASSED
   TEST CONJ=0, M=997, N=177, lda=1006, incY=-3, STARTED
   TEST CONJ=0, M=997, N=177, lda=1006, incY=-3, PASSED

speedy. make dr1case r1rout=CASES/ATL_dger1_1x1_1.c xu=1 yu=1
      dger_0 : 31.168831 MFLOPS
      dger_0 : 31.788079 MFLOPS
      dger_0 : 31.578947 MFLOPS
   dger_0 : 31.51 MFLOPS

A simple double precision complex implementation would be:

#ifdef Conj_
   void ATL_zger1c_a1_x1_yX
#else
   void ATL_zger1u_a1_x1_yX
#endif
   (const int M, const int N, const double *alpha, const double *X, 
    const int incX, const double *Y, const int incY, double *A, const int lda)
{
   const int M2 = M<<1, N2 = N<<1;
   int i, j;
   register double ry, iy, ra, ia, rx, ix;

   for (j=0; j < N2; j += 2)
   {
      ry = Y[incY*j];
      iy = Y[incY*j+1];
      for (i=0; i < M2; i += 2)
      {
         rx = X[i];
         ix = X[i+1];
         ra = rx * ry;
         ia = ix * ry;
         #ifdef Conj_
            ra += ix * iy;
            ia -= rx * iy;
         #else
            ra -= ix * iy;
            ia += rx * iy;
         #endif
         A[i+j*lda] += ra;
         A[i+j*lda+1] += ia;
      }
   }
}

Which we test in time by:

speedy. make zr1tstcase r1rout=CASES/ATL_zger1_1x1_1.c xu=1 yu=1
   TEST CONJ=0, M=997, N=177, lda=1006, incY=1, STARTED
   TEST CONJ=0, M=997, N=177, lda=1006, incY=1, PASSED
   TEST CONJ=0, M=997, N=177, lda=1006, incY=3, STARTED
   TEST CONJ=0, M=997, N=177, lda=1006, incY=3, PASSED
   TEST CONJ=0, M=997, N=177, lda=1006, incY=-3, STARTED
   TEST CONJ=0, M=997, N=177, lda=1006, incY=-3, PASSED
   TEST CONJ=1, M=997, N=177, lda=1006, incY=1, STARTED
   TEST CONJ=1, M=997, N=177, lda=1006, incY=1, PASSED
   TEST CONJ=1, M=997, N=177, lda=1006, incY=3, STARTED
   TEST CONJ=1, M=997, N=177, lda=1006, incY=3, PASSED
   TEST CONJ=1, M=997, N=177, lda=1006, incY=-3, STARTED
   TEST CONJ=1, M=997, N=177, lda=1006, incY=-3, PASSED

speedy. make zr1case r1rout=CASES/ATL_zger1_1x1_1.c xu=1 yu=1
      zger_0 : 39.024390 MFLOPS
      zger_0 : 38.709677 MFLOPS
      zger_0 : 39.024390 MFLOPS
   zger_0 : 38.92 MFLOPS



Clint Whaley 2012-07-10