GEMV examples

Probably the simplest, notranspose GEMV kernel implementation is:

#ifdef BETA0
void ATL_dgemvN_a1_x1_b0_y1
#elif defined (BETA1)
void ATL_dgemvN_a1_x1_b1_y1
#else
void ATL_dgemvN_a1_x1_bX_y1
#endif
   (const int M, const int N, const double alpha, const double *A, 
    const int lda, const double *X, const int incX, const double beta,
    double *Y, const int incY)
{
   int i, j;
   register double y0;

   for (i=0; i != M; i++)
   {
      #ifdef BETA0
         y0 = 0.0;
      #elif defined(BETA1)
         y0 = Y[i];
      #elif defined(BETAX)
         y0 = Y[i] * beta;
      #endif
      for (j=0; j != N; j++) y0 += A[i+j*lda] * X[j];
      Y[i] = y0;
   }
}

Saving this file to ATLAS/tune/blas/gemv/CASES/ATL_dgemvN_1x1_1.c,
from the BLDdir/tune/blas/gemv/ directory, we can test and time the implementation by:

make dmvtstcaseN mvrout=CASES/ATL_dgemvN_1x1_1.c yu=1 xu=1
BEGINNING GEMV PRIMITIVE TESTING:

   TEST TA=N, M=997, N=177, lda=1004, beta=0.000000 STARTED
   TEST TA=N, M=997, N=177, lda=1004, beta=0.000000 PASSED
   TEST TA=N, M=997, N=177, lda=1004, beta=1.000000 STARTED
   TEST TA=N, M=997, N=177, lda=1004, beta=1.000000 PASSED
   TEST TA=N, M=997, N=177, lda=1004, beta=0.800000 STARTED
   TEST TA=N, M=997, N=177, lda=1004, beta=0.800000 PASSED


GEMV PRIMITIVE PASSED ALL TESTS

speedy. make dmvcaseN mvrout=CASES/ATL_dgemvN_1x1_1.c yu=1 xu=1
      gemvN_0 : 49.484536 MFLOPS
      gemvN_0 : 49.484536 MFLOPS
      gemvN_0 : 49.230769 MFLOPS
   gemvN_0 : 49.40 MFLOPS

In the above examples, we pass yu, the unrolling along the $y$ vector, and xu, the unrolling along the $x$ vector, so that when ATLAS blocks the operation, it knows the correct unrolling to use to avoid cleanup code.

A similarly sophisticated transpose primitive is:

#ifdef BETA0
void ATL_dgemvT_a1_x1_b0_y1
#elif defined (BETA1)
void ATL_dgemvT_a1_x1_b1_y1
#else
void ATL_dgemvT_a1_x1_bX_y1
#endif
   (const int M, const int N, const double alpha, const double *A,
    const int lda, const double *X, const int incX, const double beta,
    double *Y, const int incY)

{
   int i, j;
   register double y0;

   for (j=0; j != M; j++)
   {
      #ifdef BETA0
         y0 = 0.0;
      #elif defined(BETA1)
         y0 = Y[j];
      #else
         y0 = Y[j] * beta;
      #endif
      for (i=0; i != N; i++) y0 += A[i+j*lda] * X[i];
      Y[j] = y0;
   }
}

Which we could test and time by:

speedy. make dmvtstcaseT mvrout=CASES/ATL_dgemvT_1x1_1.c xu=1 yu=1
BEGINNING GEMV PRIMITIVE TESTING:

   TEST TA=T, M=997, N=177, lda=1004, beta=0.000000 STARTED
   TEST TA=T, M=997, N=177, lda=1004, beta=0.000000 PASSED
   TEST TA=T, M=997, N=177, lda=1004, beta=1.000000 STARTED
   TEST TA=T, M=997, N=177, lda=1004, beta=1.000000 PASSED
   TEST TA=T, M=997, N=177, lda=1004, beta=0.800000 STARTED
   TEST TA=T, M=997, N=177, lda=1004, beta=0.800000 PASSED


GEMV PRIMITIVE PASSED ALL TESTS

speedy. make dmvcaseT mvrout=CASES/ATL_dgemvT_1x1_1.c xu=1 yu=1
      gemvT_0 : 37.647059 MFLOPS
      gemvT_0 : 37.647059 MFLOPS
      gemvT_0 : 37.647059 MFLOPS
   gemvT_0 : 37.65 MFLOPS

An unsophisticated notranspose GEMV implementation for double precision complex would be:

#ifdef BETA0
   #ifdef Conj_
      void ATL_dgemvNc_a1_x1_b0_y1
   #else
      void ATL_dgemvN_a1_x1_b0_y1
   #endif
#elif defined (BETA1)
   #ifdef Conj_
      void ATL_dgemvNc_a1_x1_b1_y1
   #else
      void ATL_dgemvN_a1_x1_b1_y1
   #endif
#elif defined (BETAXI0)
   #ifdef Conj_
      void ATL_dgemvNc_a1_x1_bXi0_y1
   #else
      void ATL_dgemvN_a1_x1_bXi0_y1
   #endif
#else
   #ifdef Conj_
      void ATL_dgemvNc_a1_x1_bX_y1
   #else
      void ATL_dgemvN_a1_x1_bX_y1
   #endif
#endif
   (const int M, const int N, const double *alpha,
    const double *A, const int lda, const double *X, const int incX,
    const double *beta, double *Y, const int incY)
{
   int i, j;
   const int M2 = M<<1, N2 = N<<1;
   #if defined(BETAX)
      const double rbeta = *beta, ibeta = beta[1];
   #elif defined(BETAXI0)
      const double rbeta = *beta;
   #endif
   register double ra, ia, rx, ix, ry, iy;

   for (i=0; i != M2; i += 2)
   {
      #ifdef BETA0
         ry = iy = 0.0;
      #elif defined(BETAX)
         rx = rbeta; ix = ibeta;
         ra = Y[i]; ia = Y[i+1];
         ry = ra * rx - ia * ix;
         iy = ra * ix + ia * rx;
      #else
         ry = Y[i];
         iy = Y[i+1];
         #ifdef BETAXI0
            rx = rbeta;
            ry *= rx;
            iy *= rx;
         #endif
      #endif
      for(j=0; j != N2; j += 2)
      {
         ra = A[i+j*lda]; ia = A[i+j*lda+1];
         rx = X[j]; ix = X[j+1];
         ry += ra * rx;
         iy += ra * ix;
         #ifdef Conj_
            ry += ia * ix;
            iy -= ia * rx;
         #else
            ry -= ia * ix;
            iy += ia * rx;
         #endif
      }
      Y[i] = ry;
      Y[i+1] = iy;
   }
}

Which, when saved to ATLAS/tune/blas/gemv/CASES/ATL_zgemvN_1x1_1.c, we could test and time by:

 make zmvtstcaseN mvrout=CASES/ATL_zgemvN_1x1_1.c xu=1 yu=1

BEGINNING GEMV PRIMITIVE TESTING:

   TEST TA=N, M=997, N=177, lda=1004, beta=(0.000000,0.000000) STARTED
   TEST TA=N, M=997, N=177, lda=1004, beta=(0.000000,0.000000) PASSED
   TEST TA=-, M=997, N=177, lda=1004, beta=(0.000000,0.000000) STARTED
   TEST TA=-, M=997, N=177, lda=1004, beta=(0.000000,0.000000) PASSED
   TEST TA=N, M=997, N=177, lda=1004, beta=(1.000000,0.000000) STARTED
   TEST TA=N, M=997, N=177, lda=1004, beta=(1.000000,0.000000) PASSED
   TEST TA=-, M=997, N=177, lda=1004, beta=(1.000000,0.000000) STARTED
   TEST TA=-, M=997, N=177, lda=1004, beta=(1.000000,0.000000) PASSED
   TEST TA=N, M=997, N=177, lda=1004, beta=(0.800000,0.000000) STARTED
   TEST TA=N, M=997, N=177, lda=1004, beta=(0.800000,0.000000) PASSED
   TEST TA=-, M=997, N=177, lda=1004, beta=(0.800000,0.000000) STARTED
   TEST TA=-, M=997, N=177, lda=1004, beta=(0.800000,0.000000) PASSED
   TEST TA=N, M=997, N=177, lda=1004, beta=(0.800000,0.300000) STARTED
   TEST TA=N, M=997, N=177, lda=1004, beta=(0.800000,0.300000) PASSED
   TEST TA=-, M=997, N=177, lda=1004, beta=(0.800000,0.300000) STARTED
   TEST TA=-, M=997, N=177, lda=1004, beta=(0.800000,0.300000) PASSED


GEMV PRIMITIVE PASSED ALL TESTS

speedy. make zmvcaseN mvrout=CASES/ATL_zgemvN_1x1_1.c xu=1 yu=1
      gemvN_0 : 78.688525 MFLOPS
      gemvN_0 : 78.688525 MFLOPS
      gemvN_0 : 78.688525 MFLOPS
   gemvN_0 : 78.69 MFLOPS



Clint Whaley 2012-07-10