Probably the simplest, notranspose GEMV kernel implementation is:
#ifdef BETA0
void ATL_dgemvN_a1_x1_b0_y1
#elif defined (BETA1)
void ATL_dgemvN_a1_x1_b1_y1
#else
void ATL_dgemvN_a1_x1_bX_y1
#endif
(const int M, const int N, const double alpha, const double *A,
const int lda, const double *X, const int incX, const double beta,
double *Y, const int incY)
{
int i, j;
register double y0;
for (i=0; i != M; i++)
{
#ifdef BETA0
y0 = 0.0;
#elif defined(BETA1)
y0 = Y[i];
#elif defined(BETAX)
y0 = Y[i] * beta;
#endif
for (j=0; j != N; j++) y0 += A[i+j*lda] * X[j];
Y[i] = y0;
}
}
Saving this file to
ATLAS/tune/blas/gemv/CASES/ATL_dgemvN_1x1_1.c,
from the
BLDdir/tune/blas/gemv/ directory, we can test and time
the implementation by:
make dmvtstcaseN mvrout=CASES/ATL_dgemvN_1x1_1.c yu=1 xu=1
BEGINNING GEMV PRIMITIVE TESTING:
TEST TA=N, M=997, N=177, lda=1004, beta=0.000000 STARTED
TEST TA=N, M=997, N=177, lda=1004, beta=0.000000 PASSED
TEST TA=N, M=997, N=177, lda=1004, beta=1.000000 STARTED
TEST TA=N, M=997, N=177, lda=1004, beta=1.000000 PASSED
TEST TA=N, M=997, N=177, lda=1004, beta=0.800000 STARTED
TEST TA=N, M=997, N=177, lda=1004, beta=0.800000 PASSED
GEMV PRIMITIVE PASSED ALL TESTS
speedy. make dmvcaseN mvrout=CASES/ATL_dgemvN_1x1_1.c yu=1 xu=1
gemvN_0 : 49.484536 MFLOPS
gemvN_0 : 49.484536 MFLOPS
gemvN_0 : 49.230769 MFLOPS
gemvN_0 : 49.40 MFLOPS
In the above examples, we pass yu, the unrolling along the
vector,
and xu, the unrolling along the
vector, so that when ATLAS
blocks the operation, it knows the correct unrolling to use to avoid
cleanup code.
A similarly sophisticated transpose primitive is:
#ifdef BETA0
void ATL_dgemvT_a1_x1_b0_y1
#elif defined (BETA1)
void ATL_dgemvT_a1_x1_b1_y1
#else
void ATL_dgemvT_a1_x1_bX_y1
#endif
(const int M, const int N, const double alpha, const double *A,
const int lda, const double *X, const int incX, const double beta,
double *Y, const int incY)
{
int i, j;
register double y0;
for (j=0; j != M; j++)
{
#ifdef BETA0
y0 = 0.0;
#elif defined(BETA1)
y0 = Y[j];
#else
y0 = Y[j] * beta;
#endif
for (i=0; i != N; i++) y0 += A[i+j*lda] * X[i];
Y[j] = y0;
}
}
Which we could test and time by:
speedy. make dmvtstcaseT mvrout=CASES/ATL_dgemvT_1x1_1.c xu=1 yu=1
BEGINNING GEMV PRIMITIVE TESTING:
TEST TA=T, M=997, N=177, lda=1004, beta=0.000000 STARTED
TEST TA=T, M=997, N=177, lda=1004, beta=0.000000 PASSED
TEST TA=T, M=997, N=177, lda=1004, beta=1.000000 STARTED
TEST TA=T, M=997, N=177, lda=1004, beta=1.000000 PASSED
TEST TA=T, M=997, N=177, lda=1004, beta=0.800000 STARTED
TEST TA=T, M=997, N=177, lda=1004, beta=0.800000 PASSED
GEMV PRIMITIVE PASSED ALL TESTS
speedy. make dmvcaseT mvrout=CASES/ATL_dgemvT_1x1_1.c xu=1 yu=1
gemvT_0 : 37.647059 MFLOPS
gemvT_0 : 37.647059 MFLOPS
gemvT_0 : 37.647059 MFLOPS
gemvT_0 : 37.65 MFLOPS
An unsophisticated notranspose GEMV implementation for double precision complex would be:
#ifdef BETA0
#ifdef Conj_
void ATL_dgemvNc_a1_x1_b0_y1
#else
void ATL_dgemvN_a1_x1_b0_y1
#endif
#elif defined (BETA1)
#ifdef Conj_
void ATL_dgemvNc_a1_x1_b1_y1
#else
void ATL_dgemvN_a1_x1_b1_y1
#endif
#elif defined (BETAXI0)
#ifdef Conj_
void ATL_dgemvNc_a1_x1_bXi0_y1
#else
void ATL_dgemvN_a1_x1_bXi0_y1
#endif
#else
#ifdef Conj_
void ATL_dgemvNc_a1_x1_bX_y1
#else
void ATL_dgemvN_a1_x1_bX_y1
#endif
#endif
(const int M, const int N, const double *alpha,
const double *A, const int lda, const double *X, const int incX,
const double *beta, double *Y, const int incY)
{
int i, j;
const int M2 = M<<1, N2 = N<<1;
#if defined(BETAX)
const double rbeta = *beta, ibeta = beta[1];
#elif defined(BETAXI0)
const double rbeta = *beta;
#endif
register double ra, ia, rx, ix, ry, iy;
for (i=0; i != M2; i += 2)
{
#ifdef BETA0
ry = iy = 0.0;
#elif defined(BETAX)
rx = rbeta; ix = ibeta;
ra = Y[i]; ia = Y[i+1];
ry = ra * rx - ia * ix;
iy = ra * ix + ia * rx;
#else
ry = Y[i];
iy = Y[i+1];
#ifdef BETAXI0
rx = rbeta;
ry *= rx;
iy *= rx;
#endif
#endif
for(j=0; j != N2; j += 2)
{
ra = A[i+j*lda]; ia = A[i+j*lda+1];
rx = X[j]; ix = X[j+1];
ry += ra * rx;
iy += ra * ix;
#ifdef Conj_
ry += ia * ix;
iy -= ia * rx;
#else
ry -= ia * ix;
iy += ia * rx;
#endif
}
Y[i] = ry;
Y[i+1] = iy;
}
}
Which, when saved to ATLAS/tune/blas/gemv/CASES/ATL_zgemvN_1x1_1.c, we could test and time by:
make zmvtstcaseN mvrout=CASES/ATL_zgemvN_1x1_1.c xu=1 yu=1
BEGINNING GEMV PRIMITIVE TESTING:
TEST TA=N, M=997, N=177, lda=1004, beta=(0.000000,0.000000) STARTED
TEST TA=N, M=997, N=177, lda=1004, beta=(0.000000,0.000000) PASSED
TEST TA=-, M=997, N=177, lda=1004, beta=(0.000000,0.000000) STARTED
TEST TA=-, M=997, N=177, lda=1004, beta=(0.000000,0.000000) PASSED
TEST TA=N, M=997, N=177, lda=1004, beta=(1.000000,0.000000) STARTED
TEST TA=N, M=997, N=177, lda=1004, beta=(1.000000,0.000000) PASSED
TEST TA=-, M=997, N=177, lda=1004, beta=(1.000000,0.000000) STARTED
TEST TA=-, M=997, N=177, lda=1004, beta=(1.000000,0.000000) PASSED
TEST TA=N, M=997, N=177, lda=1004, beta=(0.800000,0.000000) STARTED
TEST TA=N, M=997, N=177, lda=1004, beta=(0.800000,0.000000) PASSED
TEST TA=-, M=997, N=177, lda=1004, beta=(0.800000,0.000000) STARTED
TEST TA=-, M=997, N=177, lda=1004, beta=(0.800000,0.000000) PASSED
TEST TA=N, M=997, N=177, lda=1004, beta=(0.800000,0.300000) STARTED
TEST TA=N, M=997, N=177, lda=1004, beta=(0.800000,0.300000) PASSED
TEST TA=-, M=997, N=177, lda=1004, beta=(0.800000,0.300000) STARTED
TEST TA=-, M=997, N=177, lda=1004, beta=(0.800000,0.300000) PASSED
GEMV PRIMITIVE PASSED ALL TESTS
speedy. make zmvcaseN mvrout=CASES/ATL_zgemvN_1x1_1.c xu=1 yu=1
gemvN_0 : 78.688525 MFLOPS
gemvN_0 : 78.688525 MFLOPS
gemvN_0 : 78.688525 MFLOPS
gemvN_0 : 78.69 MFLOPS