

/*
 *   	File:  method6_variant.cu
 *   	author: Lung-Sheng Chien
 *			Department of Mathematics, Tsing Hua University, R.O.C. (Taiwan).
 *			Email: d947207@oz.nthu.edu.tw
 *	 	date: 2010/01/31
 *
 *		description: see HandTunedSgemm_2010_v1.1.pdf 
 * 
 * How to compile .cu to .cubin  
 *	"C:\CUDA\bin64\nvcc.exe"  -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" -I"C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"  -O2 -arch compute_13 -code sm_13 -cubin  method6_variant.cu
 *	"C:\CUDA\bin64\nvcc.exe"  -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" -I"C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"  -O2 -maxrregcount 49 -arch compute_13 -code sm_13 -cubin  method6_variant.cu
 *
 *	How to use decuda/cudasm 
 * 		decuda -o  method6_variant.asm  method6_variant.cubin 
 *		cudasm -o  method6_variant_cudasm.cubin   method6_variant.asm
 *		cudasm -o  method6_variant_cudasm.cubin   method6_variant_correct.asm
 *
 */


// Row distance between the two rows of C handled by one thread
// (the kernel stores c0 at `row` and c1 at `row + VECTOR_LENGTH`).
#define VECTOR_LENGTH  64
// Number of VECTOR_LENGTH-row slabs covered by one thread block
// (blockIdx.x advances by VECTOR_LENGTH * NUM_VECTOR rows).
#define NUM_VECTOR     2 
// Number of columns of C (and of the shared B tile) handled by one thread block;
// also the length of the per-thread accumulator arrays c0[] / c1[].
#define BLOCK_SIZE_Y   16	

// Depth of one shared B tile along the k dimension (k is consumed in steps of BLOCK_SIZE_X).
#define BLOCK_SIZE_X   16

// Thread-block shape assumed by the kernel's index arithmetic
// (row = ibx + threadIdx.x + threadIdx.y * THREAD_BLOCK_X).
#define THREAD_BLOCK_X  16   // THREAD_BLOCK_X = BLOCK_SIZE_X
#define THREAD_BLOCK_Y  4    // THREAD_BLOCK_Y = VECTOR_LENGTH / BLOCK_SIZE_X


/*
 * store_block: write up to 16 accumulated values into one row of C.
 *
 *   num   : number of values to store; at most 16 are written, and nothing
 *           is written when num <= 0
 *   alpha : scale applied to every accumulator value
 *   c     : accumulator values c[0..15]
 *   beta  : scale applied to the value already in C; when beta == 0 the old
 *           value of C is not read at all, so Inf/NaN already present in C
 *           cannot leak into the result
 *   C     : address of the first destination element
 *   ldc   : distance (in floats) between consecutive destination elements
 *
 * Element i of c goes to C[i*ldc], computed as alpha*c[i] (beta == 0) or
 * alpha*c[i] + beta*C[i*ldc] (otherwise).
 */
__device__ void store_block( int num, float alpha, float *c, float beta, float *C, int ldc )
{
    // Fixed upper bound (the kernel in this file passes 16-element accumulator
    // arrays).  The trip count is kept constant, with an early exit for short
    // blocks, so that the loops below can be fully unrolled.
    const int max_cols = 16;

    if( beta == 0 )
    {
        // Pure overwrite: C is written but never read.
#pragma unroll
        for( int col = 0 ; col < max_cols ; ++col )
        {
            if( col >= num ) return;    // also covers num <= 0 on the first pass
            C[0] = alpha*c[col];
            C += ldc;
        }
    }
    else
    {
        // Read-modify-write: blend the new value with the existing one.
#pragma unroll
        for( int col = 0 ; col < max_cols ; ++col )
        {
            if( col >= num ) return;    // also covers num <= 0 on the first pass
            C[0] = alpha*c[col] + beta*C[0];
            C += ldc;
        }
    }
}

	  
//
//  C = alpha*A*B + beta*C
//

/*
  Resource usage reported for the two forms of the inner-loop load of B:

  (1) b_reg = b_ptr[j] * 4.0f ;
      compiled with -maxrregcount 49
      reg  = 47
      active threads = 320

  (2) b_reg = b_ptr[j] ;
      compiled without -maxrregcount
      smem = 1180
      reg  = 54
      active threads = 256
 */


/*
 * method6_variant_sgemmNN
 *
 * Each thread accumulates, in c0[] and c1[], two strips of BLOCK_SIZE_Y
 * products of A and B: one for row `row` of C and one for row
 * `row + VECTOR_LENGTH`, both covering columns iby .. iby+BLOCK_SIZE_Y-1.
 * The strips are then written as alpha*c + beta*C by store_block().
 *
 * Addressing used by the code (element (i,j) of a matrix is base[i + j*ld]):
 *   A : A[row + p*lda] and A[row + VECTOR_LENGTH + p*lda], p = 0 .. k-1
 *   B : B[inx + p0 + (iby + iny + i)*ldb], staged through the shared tile b[][]
 *   C : C[row + (iby + j)*ldc] and C[row + VECTOR_LENGTH + (iby + j)*ldc]
 *
 * Parameters
 *   m, n        : limits on the row / column index of C; rows >= m are never
 *                 stored and store_block() is given n - iby as column count
 *   A, lda      : left operand and its leading dimension
 *   B, ldb      : right operand and its leading dimension
 *   C, ldc      : output (also input when beta != 0) and its leading dimension
 *   k           : length of the inner (summation) dimension
 *   alpha, beta : scalars applied in store_block()
 *   B_bound     : in the last block row of the grid (blockIdx.y == gridDim.y-1)
 *                 B is only read at addresses strictly below this pointer.
 *                 Presumably one past the last element of B -- TODO confirm
 *                 against the host-side launcher.
 *
 * Launch shape: the index arithmetic (row, b_base) only stays inside the
 * shared tile for threadIdx.x < BLOCK_SIZE_X and threadIdx.y < THREAD_BLOCK_Y,
 * i.e. presumably blockDim = (THREAD_BLOCK_X, THREAD_BLOCK_Y) -- TODO confirm.
 *
 * NOTE(review): the full-tile loop multiplies every B value by 4.0f, while
 * the tail loop does not.  As written, the compiled source therefore does not
 * by itself produce alpha*A*B + beta*C.  The file header describes a
 * decuda/cudasm round trip that assembles a "_correct.asm" file, so the
 * multiply looks like a placeholder that is patched at assembly level --
 * confirm before running this kernel straight from the .cu source.
 */
__global__ void  method6_variant_sgemmNN( int m, int n, const float *A, int lda, 
const float *B, int ldb, float* C, int ldc, int k, float alpha, float beta,
float* B_bound )
{
// Shared tile of B used for both rows of C handled by a thread.
// The inner dimension is padded by one float (BLOCK_SIZE_Y+1).
	__shared__ float b[BLOCK_SIZE_X][BLOCK_SIZE_Y+1] ;
	
	const int inx = threadIdx.x;
	const int iny = threadIdx.y;
	// First row of C covered by this block, and first column of C covered by this block.
	const int ibx = blockIdx.x * VECTOR_LENGTH * NUM_VECTOR ;
	const int iby = blockIdx.y * BLOCK_SIZE_Y ;
	// First of the two rows of C owned by this thread (the second is row + VECTOR_LENGTH).
	int row = ibx + inx + iny*THREAD_BLOCK_X ;

// sel == 1: every block except the last one in y loads B without bound checks.
// sel == 0: the last block in y checks each load address against B_bound.
	int sel = 0 ;
	if ( blockIdx.y < (gridDim.y - 1) ){
			sel = 1 ;
	}

// A walks down row `row`, A1 walks down row `row + VECTOR_LENGTH`.
// A row index at or beyond m is replaced by m-1 so the loads still hit a
// row below m; the values computed for such rows are discarded before the
// stores at the end of the kernel.
	float *A1 ;
	if ( row >= m ){
		A += (m-1) ;
		A1 = (float*)A ; 		
	}else{
		if ( (row + VECTOR_LENGTH) >= m ){
			A1 = (float*)A + (m-1) ;
		}else{
			A1 = (float*)A + (row + VECTOR_LENGTH) ;
		}
		A += row;
	}
	
	// This thread's first element of B: row inx (k direction), column iby + iny.
	B += inx + ( iby + iny ) * ldb;
	// This thread's first element of C: row `row`, column iby.
	C += row  + iby * ldc;
	
	// Per-thread accumulators: c0 for row `row`, c1 for row `row + VECTOR_LENGTH`.
	float c0[BLOCK_SIZE_Y] ;
	float c1[BLOCK_SIZE_Y] ;
#pragma unroll	
	for( int i = 0 ; i < BLOCK_SIZE_Y ; i++){
		c0[i] = 0.0f ;
	}	
#pragma unroll	
	for( int i = 0 ; i < BLOCK_SIZE_Y ; i++){
		c1[i] = 0.0f ;
	}		
    
// b_reg holds one value of the shared B tile so it can feed both c0 and c1.
	float  b_reg ;
	float *b_ptr ;
	// Address of b[inx][iny]: the first tile slot this thread fills.
	float *b_base = (float*)b + inx*(BLOCK_SIZE_Y+1) + iny ;
		
	// Consume k in steps of BLOCK_SIZE_X; a final partial step is left to the
	// tail loop after this one.
	for( ; k > 0; k -= BLOCK_SIZE_X ){
// All threads cooperate to load the tile: each thread fills
// b[inx][iny + i] for i = 0, THREAD_BLOCK_Y, 2*THREAD_BLOCK_Y, ...
		if ( sel ){
#pragma unroll
		for( int i = 0; i < BLOCK_SIZE_Y ; i += THREAD_BLOCK_Y ){
//			b[inx][iny+i]  = B[i*ldb];
				b_base[i] = B[i*ldb];
		}
		}else{
			// Last block in y: stop loading as soon as an address reaches B_bound.
			// Tile slots that are skipped keep whatever they held before.
			for( int i = 0; i < BLOCK_SIZE_Y ; i += THREAD_BLOCK_Y ){
				if( &B[i*ldb] < B_bound ){
					b_base[i] = B[i*ldb];
				}else{
					break ;
				}
			}						
		}// if (sel)
		// Make the freshly written tile visible to every thread of the block.
		__syncthreads();

		// Fewer than BLOCK_SIZE_X steps remain: leave with the tile loaded and
		// k holding the remaining count, to be handled by the tail loop below.
		if( k < BLOCK_SIZE_X )  break;
	
		b_ptr = (float*)b ;
#pragma unroll		 		
		for( int i = 0; i < BLOCK_SIZE_X; i++  ){
			// One element from each of the two A rows, then step to the next column of A.
			float A0_reg = A[0]  ; A += lda ;
			float A1_reg = A1[0] ; A1 += lda ;
#pragma unroll
			for( int j = 0 ; j < BLOCK_SIZE_Y ; j++){
				// NOTE(review): the factor 4.0f is not applied in the tail loop below;
				// see the note in the kernel header comment.
				b_reg = b_ptr[j] * 4.0f ;
				c0[j] += A0_reg * b_reg ;
				c1[j] += A1_reg * b_reg ;	
			} 						
			b_ptr += (BLOCK_SIZE_Y+1) ;	// advance to the next row of b (row stride is BLOCK_SIZE_Y+1 floats)
		}// for each column index of sub-matrix of A
		// Everyone must be done reading the tile before it is overwritten.
		 __syncthreads();
			
		// Move down BLOCK_SIZE_X rows of B for the next tile.
		B += BLOCK_SIZE_X ;
	};
	
// Tail: process the remaining k (< BLOCK_SIZE_X) steps from the tile that
// was loaded just before the break above; k is 0 here when nothing remains.
  b_ptr = (float*)b ;
	for(int i = 0 ; i < k ; i++){
			float A0_reg = A[0]  ; A += lda ;
			float A1_reg = A1[0] ; A1 += lda ;		
#pragma unroll
			for( int j = 0 ; j < BLOCK_SIZE_Y ; j++){
				b_reg = b_ptr[j] ;
				c0[j] += A0_reg * b_reg ;
				c1[j] += A1_reg * b_reg ;	
			} 						
			b_ptr += (BLOCK_SIZE_Y+1) ;	// advance to the next row of b (row stride is BLOCK_SIZE_Y+1 floats)
	}	

// Each thread owns two rows of C, so the row bound is checked once per row:
// nothing is stored for a row index at or beyond m.
    if( row >= m )  return;  
    // n - iby columns remain from this block's first column; store_block caps the count.
    store_block( n - iby, alpha, c0, beta, C, ldc);
    
    row += VECTOR_LENGTH ;
    if( row >= m )  return;
    
    // Second row owned by this thread lies VECTOR_LENGTH rows further down.
    C += VECTOR_LENGTH ;
    store_block( n - iby, alpha, c1, beta, C, ldc);
	
}	

