


/*
 *
 *   	File:  method4_variant.cu
 *   	author: Lung-Sheng Chien
 *			Department of Mathematics, Tsing Hua University, R.O.C. (Taiwan).
 *			Email: d947207@oz.nthu.edu.tw
 *	 	date: 2010/01/18
 *
 *		description: see HandTunedSgemm_2010_v1.pdf
 *
 * How to compile .cu to .cubin 
 *	"C:\CUDA\bin64\nvcc.exe"  -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" -I"C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"  -O2 -arch compute_13 -code sm_13 -cubin  method4_variant.cu
 *	"C:\CUDA\bin64\nvcc.exe"  -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" -I"C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"  -O2 -maxrregcount 80 -arch compute_13 -code sm_13 -cubin  method4_variant.cu
 *
 * How to use decuda/cudasm   
 *	decuda -o  method4_variant.asm  method4_variant.cubin 
 *	cudasm -o  method4_variant_cudasm.cubin  method4_variant.asm 
 *	cudasm -o  method4_variant_cudasm.cubin  method4_variant_correct.asm 
 *
 */


// Written by Vasily Volkov.
// Copyright (c) 2009, The Regents of the University of California.
// All rights reserved.


// Tiling constants used by method4_variant_sgemmNN.

// Rows of C covered by one thread block per row-tile: the kernel uses
// blockIdx.x * VECTOR_LENGTH as the block's first row.
#define VECTOR_LENGTH  64
// Not referenced by the code in this section; the kernel keeps four
// accumulator arrays (c0..c3) per thread -- presumably this is that count.
#define NUM_VECTOR     4 
// Columns of C per thread block (blockIdx.y * BLOCK_SIZE_Y is the first column);
// also the length of each per-thread accumulator array.
#define BLOCK_SIZE_Y   16	

// Number of k-steps consumed per staged tile of B (outer loop decrements k by this).
#define BLOCK_SIZE_X   16

// Thread-index extents used when mapping threads to rows and to the B tile.
#define THREAD_BLOCK_X  16   // THREAD_BLOCK_X = BLOCK_SIZE_X
#define THREAD_BLOCK_Y  4    // THREAD_BLOCK_Y = VECTOR_LENGTH / BLOCK_SIZE_X


//
//  Write up to 16 accumulated values into C, one element every ldc floats.
//
//    num   : number of entries to write; nothing is written when num <= 0,
//            and at most 16 entries are written however large num is
//    alpha : scale applied to every accumulator value
//    c     : accumulator values, c[0..15]
//    beta  : scale applied to the existing contents of C; when it compares
//            equal to zero C is overwritten without being read
//    C     : destination of the first entry
//    ldc   : distance, in floats, between consecutive destinations
//
__device__ void store_block( int num, float alpha, float *c, float beta, float *C, int ldc )
{
    // Upper bound on the number of entries; kept as a compile-time constant so
    // the loops below have a fixed trip count and can be unrolled.
    const int kMaxEntries = 16;

    if( num <= 0 ) return;

    const bool overwrite = ( beta == 0 );

    if( overwrite )
    {
        // C is never read on this path, so inf/NaN already sitting in C
        // cannot leak into the result.
        C[0] = alpha*c[0];
#pragma unroll
        for( int idx = 1; idx < kMaxEntries; ++idx )
        {
            if( idx >= num ) return;
            C += ldc;
            C[0] = alpha*c[idx];
        }
    }
    else
    {
        // General case: blend the accumulator with the value already in C.
        C[0] = alpha*c[0] + beta*C[0];
#pragma unroll
        for( int idx = 1; idx < kMaxEntries; ++idx )
        {
            if( idx >= num ) return;
            C += ldc;
            C[0] = alpha*c[idx] + beta*C[0];
        }
    }
}


//
//  C = alpha*A*B + beta*C
//

/*
 	nextBlockStride_A = VECTOR_LENGTH * blockDim.x 
 	nextColumnStride_A = lda - 3* nextBlockStride_A ;

#pragma unroll 	for sub-matrix of A	and -maxrregcount 80  	
	lmem = 0
	smem = 1176
	reg  = 80
	active threads = 192

 */

//
//  SGEMM kernel, no transposes:  C = alpha*A*B + beta*C
//
//  The indexing below treats A, B and C as column-major (the row index is the
//  fastest-varying one; columns of B and C are ldb / ldc floats apart).
//
//  Each thread accumulates four strips of BLOCK_SIZE_Y values (c0..c3), one
//  strip per row-tile, and all four strips reuse the same staged tile of B.
//
//    m, n               : extents used only to guard the stores to C
//                         (rows >= m and columns >= n are not written)
//    A, lda             : left operand; lda itself is not used in the body,
//                         the two stride arguments below describe the walk over A
//    B, ldb             : right operand and its column stride
//    C, ldc             : result and its column stride
//    k                  : inner dimension, consumed BLOCK_SIZE_X steps at a
//                         time, followed by a tail of k % BLOCK_SIZE_X steps
//    alpha, beta        : scalars of the update (applied in store_block)
//    nextBlockStride_A  : offset added to the A pointer between the four loads
//                         of one k-step, and to the row index / C pointer
//                         between the four stored strips
//    nextColumnStride_A : offset added after the fourth load to reach the next
//                         k-step; the file comment above gives it as
//                         lda - 3*nextBlockStride_A
//
//  NOTE(review): the shared tile is indexed as b[threadIdx.x][threadIdx.y + i],
//  so the launch presumably uses blockDim = (THREAD_BLOCK_X, THREAD_BLOCK_Y)
//  -- confirm against the host code.
//  NOTE(review): loads from A and B are not checked against m, n or k; only
//  the stores to C are guarded, so the caller must make those reads valid.
//
static __global__ void  method4_variant_sgemmNN( int m, int n, const float *A, int lda, 
const float *B, int ldb, float* C, int ldc, int k, float alpha, float beta,
int nextBlockStride_A , int nextColumnStride_A )
{
// four sub-matrices of C share the same sub-matrix of B
// (the +1 padding changes the row pitch of the tile to BLOCK_SIZE_Y+1 floats)
	__shared__ float b[BLOCK_SIZE_X][BLOCK_SIZE_Y+1] ;
	
	const int inx = threadIdx.x;
	const int iny = threadIdx.y;
	const int ibx = blockIdx.x * VECTOR_LENGTH ;
	const int iby = blockIdx.y * BLOCK_SIZE_Y ;
	// first row of C handled by this thread
	int row = ibx + inx + iny*THREAD_BLOCK_X ;

	A += row;
	B += inx + ( iby + iny ) * ldb;
	C += row  + iby * ldc;

	// per-thread accumulators, one array per row-tile
	float c0[BLOCK_SIZE_Y] ;
	float c1[BLOCK_SIZE_Y] ;
	float c2[BLOCK_SIZE_Y] ;
	float c3[BLOCK_SIZE_Y] ;
#pragma unroll	
	for( int i = 0 ; i < BLOCK_SIZE_Y ; i++){
		c0[i] = 0.0f ;
	}	
#pragma unroll	
	for( int i = 0 ; i < BLOCK_SIZE_Y ; i++){
		c1[i] = 0.0f ;
	}		
#pragma unroll	
	for( int i = 0 ; i < BLOCK_SIZE_Y ; i++){
		c2[i] = 0.0f ;
	}		
#pragma unroll	
	for( int i = 0 ; i < BLOCK_SIZE_Y ; i++){
		c3[i] = 0.0f ;
	}			    

// one element of the current row of the B tile
	float b_reg ;
		
	for( ; k > 0; k -= BLOCK_SIZE_X ){
// fetch sub-matrix of B by all threads	
#pragma unroll
		for( int i = 0; i < BLOCK_SIZE_Y ; i += THREAD_BLOCK_Y ){
			b[inx][iny+i]  = B[i*ldb];
		}
		__syncthreads();

		// fewer than BLOCK_SIZE_X steps remain: the tile just staged is
		// consumed by the tail loop after this loop instead
		if( k < BLOCK_SIZE_X )  break;
	
		float *b_ptr = (float*)b ;
#pragma unroll 
		for( int i = 0; i < BLOCK_SIZE_X; i++  ){

			float A0_reg = A[0] ; A += nextBlockStride_A ;
			float A1_reg = A[0] ; A += nextBlockStride_A ;
			float A2_reg = A[0] ; A += nextBlockStride_A ;
			float A3_reg = A[0] ; A += nextColumnStride_A ;	
					
// fetch b[i][:] into register b_reg and update all four accumulators.
// B is used unscaled, exactly as in the tail loop below, so every k-step
// contributes A*B with the same weight.
#pragma unroll
			for( int j = 0 ; j < BLOCK_SIZE_Y ; j++){
				b_reg = b_ptr[j] ;
				c0[j] += A0_reg * b_reg ;
				c1[j] += A1_reg * b_reg ;	
				c2[j] += A2_reg * b_reg ;	
				c3[j] += A3_reg * b_reg ;	
			} 						
			b_ptr += (BLOCK_SIZE_Y+1) ;	// advance to the next row of b
		}// for each column index of sub-matrix of A
		// all threads are done reading b before it is overwritten
		__syncthreads();
			
		B += BLOCK_SIZE_X ;
	}

// rank k-update: remaining k (< BLOCK_SIZE_X) steps; k is 0 here when the
// loop above ran to completion, in which case this loop does nothing
	float *b_ptr = (float*)b ;
#pragma unroll 1	 		
	for( int i = 0; i < k ; i++  ){

		float A0_reg = A[0] ; A += nextBlockStride_A ;
		float A1_reg = A[0] ; A += nextBlockStride_A ;
		float A2_reg = A[0] ; A += nextBlockStride_A ;
		float A3_reg = A[0] ; A += nextColumnStride_A ;	
					
// fetch b[i][:] into register b_reg
#pragma unroll
		for( int j = 0 ; j < BLOCK_SIZE_Y ; j++){
			b_reg = b_ptr[j] ;
			c0[j] += A0_reg * b_reg ;
			c1[j] += A1_reg * b_reg ;	
			c2[j] += A2_reg * b_reg ;	
			c3[j] += A3_reg * b_reg ;	
		} 						
		b_ptr += (BLOCK_SIZE_Y+1) ;	// advance to the next row of b
	}// for each column index of sub-matrix of A
		
		
// store the four strips of C; each strip is skipped (together with all
// later ones) as soon as its row index falls outside the matrix
    if( row >= m )  return;
    
    store_block( n - iby, alpha, c0, beta, C, ldc);
    
    row += nextBlockStride_A ;
    if( row >= m )  return; 
        
    C += nextBlockStride_A ;
    store_block( n - iby, alpha, c1, beta, C, ldc);
    
    row += nextBlockStride_A ;
    if( row >= m )  return; 
        
    C += nextBlockStride_A ;
    store_block( n - iby, alpha, c2, beta, C, ldc);
    
    row += nextBlockStride_A ;
    if( row >= m )  return; 
        
    C += nextBlockStride_A ;
    store_block( n - iby, alpha, c3, beta, C, ldc);    
}

