

/*
 *   	File:  method6_v3.cu
 *   	author: Lung-Sheng Chien
 *			Department of Mathematics, Tsing Hua University, R.O.C. (Taiwan).
 *			Email: d947207@oz.nthu.edu.tw
 *	 	date: 2010/01/31
 *
 *		description: use int B_bound, but not good
 *   
 *			see HandTunedSgemm_2010_v1.pdf 
 * 
 * How to compile .cu to .cubin  
 	"C:\CUDA\bin64\nvcc.exe"  -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" -I"C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"  -O2 -arch compute_13 -code sm_13 -cubin  method6.cu
 	
	"C:\CUDA\bin64\nvcc.exe"  -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" -I"C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"  -O2 -maxrregcount 48 -arch compute_13 -code sm_13 -cubin  method6.cu

 */

// Written by Vasily Volkov.
// Copyright (c) 2009, The Regents of the University of California.
// All rights reserved.


// Tiling constants shared by the device routines in this file.
//
// VECTOR_LENGTH : row distance between the two C rows owned by one thread
//                 (each thread updates `row` and `row + VECTOR_LENGTH`).
#define VECTOR_LENGTH  64
// NUM_VECTOR    : number of VECTOR_LENGTH-row strips per thread block; a block's
//                 first row is blockIdx.x * VECTOR_LENGTH * NUM_VECTOR.
#define NUM_VECTOR     2 
// BLOCK_SIZE_Y  : number of C columns per thread block (blockIdx.y * BLOCK_SIZE_Y
//                 is the block's first column) and the length of each thread's
//                 accumulator arrays.
#define BLOCK_SIZE_Y   16	

// BLOCK_SIZE_X  : number of k-steps consumed per main-loop iteration, i.e. the
//                 number of rows of B staged in the scratch tile at a time.
#define BLOCK_SIZE_X   16

// Thread-block shape used in the index arithmetic (row = ibx + inx + iny*THREAD_BLOCK_X).
// NOTE(review): the launch configuration is not visible in this file; the kernels
// presumably expect blockDim = (THREAD_BLOCK_X, THREAD_BLOCK_Y) -- confirm in the host code.
#define THREAD_BLOCK_X  16   // THREAD_BLOCK_X = BLOCK_SIZE_X
#define THREAD_BLOCK_Y  4    // THREAD_BLOCK_Y = VECTOR_LENGTH / BLOCK_SIZE_X


// Writes up to 16 accumulated values from c[] into C, one element every ldc
// floats:
//     C[j*ldc] = alpha*c[j]                 when beta == 0
//     C[j*ldc] = alpha*c[j] + beta*C[j*ldc] otherwise
// for j = 0 .. min(num, 16) - 1.
//
//   num   : number of entries to store; nothing is written when num <= 0 and
//           at most 16 entries are written however large num is
//   alpha : scale applied to the accumulated values
//   c     : accumulator array, read at indices 0 .. 15 at most
//   beta  : scale applied to the existing contents of C
//   C     : destination of the first entry
//   ldc   : distance, in floats, between consecutive destinations
__device__ void store_block( int num, float alpha, float *c, float beta, float *C, int ldc )
{
    // Upper limit on the number of entries written by one call.
    const int kMaxEntries = 16;

    float *dst = C;

    if( beta == 0 )
    {
        // Pure overwrite: C is never read on this path, so Inf/NaN values
        // already sitting in C cannot leak into the result.
#pragma unroll
        for( int idx = 0; idx < kMaxEntries; ++idx )
        {
            if( idx >= num ) return;
            dst[0] = alpha*c[idx];
            dst += ldc;
        }
    }
    else
    {
        // Read-modify-write: blend the new value with the existing entry.
#pragma unroll
        for( int idx = 0; idx < kMaxEntries; ++idx )
        {
            if( idx >= num ) return;
            dst[0] = alpha*c[idx] + beta*dst[0];
            dst += ldc;
        }
    }
}


// case 1: no check 
//
// Accumulates and stores this thread's share of C = alpha*A*B + beta*C with
// no bounds check on the loads from B.
//
// Each thread owns two rows of C, `row` and `row + VECTOR_LENGTH`, and
// BLOCK_SIZE_Y consecutive columns starting at column blockIdx.y * BLOCK_SIZE_Y.
// Matrices are addressed as element(r, c) = base[r + c*ld].
//
//   m, n        : rows / columns of C used for clamping and for the store count
//   A, lda      : left operand; read at A[row] and advanced by lda per k-step
//   B, ldb      : right operand; read at B[inx + col*ldb]
//   C, ldc      : output, written through store_block
//   k           : inner dimension; consumed BLOCK_SIZE_X steps at a time, the
//                 remainder (< BLOCK_SIZE_X) is handled by the tail loop
//   alpha, beta : scalars forwarded to store_block
//   B_bound     : not used by this variant
//   b           : scratch tile of BLOCK_SIZE_X x (BLOCK_SIZE_Y+1) floats that all
//                 threads of the block fill and read (the kernel in this file
//                 passes a __shared__ array)
//
// NOTE(review): the index arithmetic assumes blockDim = (THREAD_BLOCK_X,
// THREAD_BLOCK_Y); the launch configuration is not visible here -- confirm.
__device__  void  method6_sgemmNN_case1( int m, int n, float *A, int lda, 
float *B, int ldb, float* C, int ldc, int k, float alpha, float beta,
int B_bound, float* b )
{
	const int inx = threadIdx.x;
	const int iny = threadIdx.y;
	// first row / first column of C handled by this thread block
	const int ibx = blockIdx.x * VECTOR_LENGTH * NUM_VECTOR ;
	const int iby = blockIdx.y * BLOCK_SIZE_Y ;
	// first of the two C rows owned by this thread
	int row = ibx + inx + iny*THREAD_BLOCK_X ;

// A1 is the starting address of the next sub-matrix of A (row + VECTOR_LENGTH).
// Rows that fall outside the matrix are redirected to row m-1 so that the loads
// below stay inside A; their results are discarded before the store.
	float *A1 ;
	if ( row >= m ){
		A += (m-1) ;
		A1 = (float*)A ; 		
	}else{
		if ( (row + VECTOR_LENGTH) >= m ){
			A1 = (float*)A + (m-1) ;
		}else{
			A1 = (float*)A + (row + VECTOR_LENGTH) ;
		}
		A += row;
	}
	
	// this thread's element of the current B tile: row inx, column iby + iny
	B += inx + ( iby + iny ) * ldb;
	// this thread's first output element: (row, iby)
	C += row  + iby * ldc;
	
	// per-thread accumulators: c0 for `row`, c1 for `row + VECTOR_LENGTH`
	float c0[BLOCK_SIZE_Y] ;
	float c1[BLOCK_SIZE_Y] ;
#pragma unroll	
	for( int i = 0 ; i < BLOCK_SIZE_Y ; i++){
		c0[i] = 0.0f ;
	}	
#pragma unroll	
	for( int i = 0 ; i < BLOCK_SIZE_Y ; i++){
		c1[i] = 0.0f ;
	}		
    
// put one row of sub-matrix of B into register
	float  b_reg ;
	float *b_ptr ;
	// b_base = &b[inx][iny] with a row pitch of BLOCK_SIZE_Y+1 floats
	float *b_base = (float*)b + inx*(BLOCK_SIZE_Y+1) + iny ;
		
	for( ; k > 0; k -= BLOCK_SIZE_X ){
// fetch sub-matrix of B by all threads: each thread copies
// BLOCK_SIZE_Y / THREAD_BLOCK_Y elements, b[inx][iny+i] = B[i*ldb].
// The loads are unchecked, including for the final partial tile.
// NOTE(review): the caller must guarantee these reads stay inside B -- confirm.
#pragma unroll
		for( int i = 0; i < BLOCK_SIZE_Y ; i += THREAD_BLOCK_Y ){
//			b[inx][iny+i]  = B[i*ldb];
				b_base[i] = B[i*ldb];
		}
		// tile must be completely written before any thread reads it
		__syncthreads();

		// fewer than BLOCK_SIZE_X steps remain: leave the loop and let the
		// tail loop below consume the tile that was just loaded
		if( k < BLOCK_SIZE_X )  break;
	
		b_ptr = (float*)b ;
#pragma unroll		 		
		for( int i = 0; i < BLOCK_SIZE_X; i++  ){
			// one element from each of the two owned rows, then step A by lda
			float A0_reg = A[0]  ; A += lda ;
			float A1_reg = A1[0] ; A1 += lda ;
#pragma unroll
			for( int j = 0 ; j < BLOCK_SIZE_Y ; j++){
				b_reg = b_ptr[j] ;
				c0[j] += A0_reg * b_reg ;
				c1[j] += A1_reg * b_reg ;	
			} 						
			b_ptr += (BLOCK_SIZE_Y+1) ;	// b_ptr = &b[i][0]
		}// for each column index of sub-matrix of A
		// all reads of the tile must finish before the next iteration overwrites it
		 __syncthreads();
			
		// advance BLOCK_SIZE_X rows within B
		B += BLOCK_SIZE_X ;
	};
	
// rank-k update: here k is the number of leftover steps (0 when the original k
// was a multiple of BLOCK_SIZE_X), using the tile loaded before the break above
  b_ptr = (float*)b ;
	for(int i = 0 ; i < k ; i++){
			float A0_reg = A[0]  ; A += lda ;
			float A1_reg = A1[0] ; A1 += lda ;		
#pragma unroll
			for( int j = 0 ; j < BLOCK_SIZE_Y ; j++){
				b_reg = b_ptr[j] ;
				c0[j] += A0_reg * b_reg ;
				c1[j] += A1_reg * b_reg ;	
			} 						
			b_ptr += (BLOCK_SIZE_Y+1) ;	// b_ptr = &b[i][0]			
	}	

// Each thread updates two sub-matrices of C, so the row check is done once per
// sub-matrix: threads whose row is outside the matrix write nothing.
// store_block receives n - iby as the number of columns it may write.
    if( row >= m )  return;  
    store_block( n - iby, alpha, c0, beta, C, ldc);
    
    row += VECTOR_LENGTH ;
    if( row >= m )  return;
    
    C += VECTOR_LENGTH ;
    store_block( n - iby, alpha, c1, beta, C, ldc);
}

// case 2: check B
//
// Same computation as the unchecked variant (this thread's share of
// C = alpha*A*B + beta*C), but every load from B is guarded: an element is read
// only when its offset from B is strictly less than B_bound.
//
// Each thread owns two rows of C, `row` and `row + VECTOR_LENGTH`, and
// BLOCK_SIZE_Y consecutive columns starting at column blockIdx.y * BLOCK_SIZE_Y.
// Matrices are addressed as element(r, c) = base[r + c*ld].
//
//   m, n        : rows / columns of C used for clamping and for the store count
//   A, lda      : left operand; read at A[row] and advanced by lda per k-step
//   B, ldb      : right operand; read at B[offset] with offset = r + c*ldb
//   C, ldc      : output, written through store_block
//   k           : inner dimension; consumed BLOCK_SIZE_X steps at a time, the
//                 remainder (< BLOCK_SIZE_X) is handled by the tail loop
//   alpha, beta : scalars forwarded to store_block
//   B_bound     : exclusive upper limit on the element offset read from B
//   b           : scratch tile of BLOCK_SIZE_X x (BLOCK_SIZE_Y+1) floats that all
//                 threads of the block fill and read (the kernel in this file
//                 passes a __shared__ array)
//
// NOTE(review): the index arithmetic assumes blockDim = (THREAD_BLOCK_X,
// THREAD_BLOCK_Y); the launch configuration is not visible here -- confirm.
__device__  void  method6_sgemmNN_case2( int m, int n, float *A, int lda, 
float *B, int ldb, float* C, int ldc, int k, float alpha, float beta,
int B_bound,  float* b )
{
	const int inx = threadIdx.x;
	const int iny = threadIdx.y;
	// first row / first column of C handled by this thread block
	const int ibx = blockIdx.x * VECTOR_LENGTH * NUM_VECTOR ;
	const int iby = blockIdx.y * BLOCK_SIZE_Y ;
	// first of the two C rows owned by this thread
	int row = ibx + inx + iny*THREAD_BLOCK_X ;

// A1 is the starting address of the next sub-matrix of A (row + VECTOR_LENGTH).
// Rows that fall outside the matrix are redirected to row m-1 so that the loads
// below stay inside A; their results are discarded before the store.
	float *A1 ;
	if ( row >= m ){
		A += (m-1) ;
		A1 = (float*)A ; 		
	}else{
		if ( (row + VECTOR_LENGTH) >= m ){
			A1 = (float*)A + (m-1) ;
		}else{
			A1 = (float*)A + (row + VECTOR_LENGTH) ;
		}
		A += row;
	}
	
// B itself is not advanced in this variant; an integer offset is tracked instead
// so that it can be compared against B_bound.
//	B += inx + ( iby + iny ) * ldb;
	int B_offset = inx + ( iby + iny ) * ldb; 
	// this thread's first output element: (row, iby)
	C += row  + iby * ldc;
	
	// per-thread accumulators: c0 for `row`, c1 for `row + VECTOR_LENGTH`
	float c0[BLOCK_SIZE_Y] ;
	float c1[BLOCK_SIZE_Y] ;
#pragma unroll	
	for( int i = 0 ; i < BLOCK_SIZE_Y ; i++){
		c0[i] = 0.0f ;
	}	
#pragma unroll	
	for( int i = 0 ; i < BLOCK_SIZE_Y ; i++){
		c1[i] = 0.0f ;
	}		
    
// put one row of sub-matrix of B into register
	float  b_reg ;
	float *b_ptr ;
	// b_base = &b[inx][iny] with a row pitch of BLOCK_SIZE_Y+1 floats
	float *b_base = (float*)b + inx*(BLOCK_SIZE_Y+1) + iny ;
		
	for( ; k > 0; k -= BLOCK_SIZE_X ){
// fetch sub-matrix of B by all threads	
// (earlier pointer-comparison version of the bound check, kept for reference)
//#pragma unroll
/*
		for( int i = 0; i < BLOCK_SIZE_Y ; i += THREAD_BLOCK_Y ){
				if( &B[i*ldb] < B_bound ){
					b_base[i] = B[i*ldb];
				}else{
					break ;
				}
		}
*/				
		// Guarded fetch: copy b[inx][iny+i] = B[offset] while offset < B_bound.
		// The offset grows by THREAD_BLOCK_Y*ldb per step; at the first offset
		// that reaches B_bound the loop stops and the remaining b_base[i]
		// entries are left unwritten (they keep their previous contents).
		// NOTE(review): presumably those entries only feed accumulator columns
		// or k-steps that are never used for a stored result -- confirm.
		int B_offset_sweep = B_offset ;
		for( int i = 0; i < BLOCK_SIZE_Y ; i += THREAD_BLOCK_Y ){
				if( B_offset_sweep < B_bound ){
					b_base[i] = B[ B_offset_sweep ];
					B_offset_sweep += THREAD_BLOCK_Y *ldb ;
				}else{
					break ;
				}
		}			
		// tile must be completely written before any thread reads it
		__syncthreads();

		// fewer than BLOCK_SIZE_X steps remain: leave the loop and let the
		// tail loop below consume the tile that was just loaded
		if( k < BLOCK_SIZE_X )  break;
	
		b_ptr = (float*)b ;
#pragma unroll		 		
		for( int i = 0; i < BLOCK_SIZE_X; i++  ){
			// one element from each of the two owned rows, then step A by lda
			float A0_reg = A[0]  ; A += lda ;
			float A1_reg = A1[0] ; A1 += lda ;
#pragma unroll
			for( int j = 0 ; j < BLOCK_SIZE_Y ; j++){
				b_reg = b_ptr[j] ;
				c0[j] += A0_reg * b_reg ;
				c1[j] += A1_reg * b_reg ;	
			} 						
			b_ptr += (BLOCK_SIZE_Y+1) ;	// b_ptr = &b[i][0]
		}// for each column index of sub-matrix of A
		// all reads of the tile must finish before the next iteration overwrites it
		 __syncthreads();
			
		// advance BLOCK_SIZE_X rows within B (offset form of B += BLOCK_SIZE_X)
//		B += BLOCK_SIZE_X ;
		B_offset += BLOCK_SIZE_X ;
	};
	
// rank-k update: here k is the number of leftover steps (0 when the original k
// was a multiple of BLOCK_SIZE_X), using the tile loaded before the break above
  b_ptr = (float*)b ;
	for(int i = 0 ; i < k ; i++){
			float A0_reg = A[0]  ; A += lda ;
			float A1_reg = A1[0] ; A1 += lda ;		
#pragma unroll
			for( int j = 0 ; j < BLOCK_SIZE_Y ; j++){
				b_reg = b_ptr[j] ;
				c0[j] += A0_reg * b_reg ;
				c1[j] += A1_reg * b_reg ;	
			} 						
			b_ptr += (BLOCK_SIZE_Y+1) ;	// b_ptr = &b[i][0]			
	}	

// Each thread updates two sub-matrices of C, so the row check is done once per
// sub-matrix: threads whose row is outside the matrix write nothing.
// store_block receives n - iby as the number of columns it may write.
    if( row >= m )  return;  
    store_block( n - iby, alpha, c0, beta, C, ldc);
    
    row += VECTOR_LENGTH ;
    if( row >= m )  return;
    
    C += VECTOR_LENGTH ;
    store_block( n - iby, alpha, c1, beta, C, ldc);
}


// case 5: only rank-k update needs to consider B
//
// Accumulates and stores this thread's share of C = alpha*A*B + beta*C.
// Full BLOCK_SIZE_X-deep tiles of B are fetched without any bounds check; only
// the final partial tile (fewer than BLOCK_SIZE_X steps left) is fetched with
// each element offset compared against B_bound.
// In this file the routine is referenced only from the commented-out dispatcher.
//
// Each thread owns two rows of C, `row` and `row + VECTOR_LENGTH`, and
// BLOCK_SIZE_Y consecutive columns starting at column blockIdx.y * BLOCK_SIZE_Y.
// Matrices are addressed as element(r, c) = base[r + c*ld].
//
//   m, n        : rows / columns of C used for clamping and for the store count
//   A, lda      : left operand; read at A[row] and advanced by lda per k-step
//   B_matrix,
//   ldb         : right operand; base pointer kept so offsets can be checked
//   C, ldc      : output, written through store_block
//   k           : inner dimension; consumed BLOCK_SIZE_X steps at a time, the
//                 remainder (< BLOCK_SIZE_X) is handled by the guarded tail
//   alpha, beta : scalars forwarded to store_block
//   B_bound     : exclusive upper limit on the element offset read from
//                 B_matrix in the tail fetch
//   b           : scratch tile of BLOCK_SIZE_X x (BLOCK_SIZE_Y+1) floats that all
//                 threads of the block fill and read
//
// NOTE(review): the index arithmetic assumes blockDim = (THREAD_BLOCK_X,
// THREAD_BLOCK_Y); the launch configuration is not visible here -- confirm.
__device__  void  method6_sgemmNN_case5( int m, int n, float *A, int lda, 
float *B_matrix, int ldb, float* C, int ldc, int k, float alpha, float beta,
int B_bound, float* b )
{
	const int inx = threadIdx.x;
	const int iny = threadIdx.y;
	// first row / first column of C handled by this thread block
	const int ibx = blockIdx.x * VECTOR_LENGTH * NUM_VECTOR ;
	const int iby = blockIdx.y * BLOCK_SIZE_Y ;
	// first of the two C rows owned by this thread
	int row = ibx + inx + iny*THREAD_BLOCK_X ;

// A1 is the starting address of the next sub-matrix of A (row + VECTOR_LENGTH).
// Rows that fall outside the matrix are redirected to row m-1 so that the loads
// below stay inside A; their results are discarded before the store.
	float *A1 ;
	if ( row >= m ){
		A += (m-1) ;
		A1 = (float*)A ; 		
	}else{
		if ( (row + VECTOR_LENGTH) >= m ){
			A1 = (float*)A + (m-1) ;
		}else{
			A1 = (float*)A + (row + VECTOR_LENGTH) ;
		}
		A += row;
	}

// B is a moving pointer to this thread's element of the current tile
// (row inx, column iby + iny); B_matrix stays at the base of the matrix.
//	B += inx + ( iby + iny ) * ldb;
	float *B = B_matrix + ( inx + ( iby + iny ) * ldb ) ;
	// this thread's first output element: (row, iby)
	C += row  + iby * ldc;
	
	// per-thread accumulators: c0 for `row`, c1 for `row + VECTOR_LENGTH`
	float c0[BLOCK_SIZE_Y] ;
	float c1[BLOCK_SIZE_Y] ;
#pragma unroll	
	for( int i = 0 ; i < BLOCK_SIZE_Y ; i++){
		c0[i] = 0.0f ;
	}	
#pragma unroll	
	for( int i = 0 ; i < BLOCK_SIZE_Y ; i++){
		c1[i] = 0.0f ;
	}		
    
// put one row of sub-matrix of B into register
	float  b_reg ;
	float *b_ptr ;
	// b_base = &b[inx][iny] with a row pitch of BLOCK_SIZE_Y+1 floats
	float *b_base = (float*)b + inx*(BLOCK_SIZE_Y+1) + iny ;
		
	for( ; k > 0; k -= BLOCK_SIZE_X ){
		
		// stop before fetching when fewer than BLOCK_SIZE_X steps remain;
		// the partial tile is fetched with bound checks after the loop
		if( k < BLOCK_SIZE_X )  break;
		
// fetch sub-matrix of B by all threads (unchecked): each thread copies
// BLOCK_SIZE_Y / THREAD_BLOCK_Y elements, b[inx][iny+i] = B[i*ldb]
#pragma unroll
		for( int i = 0; i < BLOCK_SIZE_Y ; i += THREAD_BLOCK_Y ){
				b_base[i] = B[i*ldb];
		}
		// tile must be completely written before any thread reads it
		__syncthreads();

		b_ptr = (float*)b ;
#pragma unroll		 		
		for( int i = 0; i < BLOCK_SIZE_X; i++  ){
			// one element from each of the two owned rows, then step A by lda
			float A0_reg = A[0]  ; A += lda ;
			float A1_reg = A1[0] ; A1 += lda ;
#pragma unroll
			for( int j = 0 ; j < BLOCK_SIZE_Y ; j++){
				b_reg = b_ptr[j] ;
				c0[j] += A0_reg * b_reg ;
				c1[j] += A1_reg * b_reg ;	
			} 						
			b_ptr += (BLOCK_SIZE_Y+1) ;	// b_ptr = &b[i][0]
		}// for each column index of sub-matrix of A
		// all reads of the tile must finish before it is overwritten again
		 __syncthreads();
			
		// advance BLOCK_SIZE_X rows within B
		B += BLOCK_SIZE_X ;
	};
	
	// k is identical for every thread of the block, so the barrier inside this
	// branch is reached by all threads or by none
	if ( 0 < k ){
		// Guarded fetch of the last, partial tile. The pointer difference is
		// narrowed to int and compared with B_bound; at the first offset that
		// reaches B_bound the loop stops and the remaining b_base[i] entries
		// are left unwritten (they keep their previous contents).
		// NOTE(review): presumably those entries only feed accumulator columns
		// or k-steps that are never used for a stored result -- confirm.
		int B_offset = B - B_matrix ;
		for( int i = 0; i < BLOCK_SIZE_Y ; i += THREAD_BLOCK_Y ){
				if( B_offset < B_bound ){
					b_base[i] = B_matrix[ B_offset ];
					B_offset += THREAD_BLOCK_Y * ldb ;
				}else{
					break ;
				}
		}
		__syncthreads();	
		
// rank-k update over the k leftover steps (k < BLOCK_SIZE_X here)
  	b_ptr = (float*)b ;
		for(int i = 0 ; i < k ; i++){
			float A0_reg = A[0]  ; A += lda ;
			float A1_reg = A1[0] ; A1 += lda ;		
#pragma unroll
			for( int j = 0 ; j < BLOCK_SIZE_Y ; j++){
				b_reg = b_ptr[j] ;
				c0[j] += A0_reg * b_reg ;
				c1[j] += A1_reg * b_reg ;	
			} 						
			b_ptr += (BLOCK_SIZE_Y+1) ;	// b_ptr = &b[i][0]			
		}	
	}//if (k > 0)
	
// Each thread updates two sub-matrices of C, so the row check is done once per
// sub-matrix: threads whose row is outside the matrix write nothing.
// store_block receives n - iby as the number of columns it may write.
    if( row >= m )  return;  
    store_block( n - iby, alpha, c0, beta, C, ldc);
    
    row += VECTOR_LENGTH ;
    if( row >= m )  return;
    
    C += VECTOR_LENGTH ;
    store_block( n - iby, alpha, c1, beta, C, ldc);
}
 
	  
//
//  C = alpha*A*B + beta*C
//


//	smem = 1180
//	reg  = 50
//  active threads = 256
/*
__global__ void  method6_sgemmNN( int m, int n, const float *A, int lda, 
const float *B, int ldb, float* C, int ldc, int k, float alpha, float beta,
int B_bound, int category )
{
// two sub-matrix of C share the same sub-matrix of B    
	__shared__ float b[BLOCK_SIZE_X][BLOCK_SIZE_Y+1] ;
	
	int case_sel ; 
	if ( 1 == category ){
		if ( blockIdx.y < (gridDim.y - 1) ){
			case_sel = 1 ;
		}else{
			case_sel = 5 ;
		}
		
	}else{ //  category = 2
		if ( blockIdx.y < (gridDim.y - 1) ){
			case_sel = 1 ;
		}else{
			case_sel = 2 ;
		}		
		
	}// if ( category)
	
	switch( case_sel ){
	case 1:
		method6_sgemmNN_case1( m, n, (float*)A, lda, (float*)B, ldb, C, ldc, k, alpha, beta, B_bound,
					(float*)b ) ;		
		break ;	
	case 2: 
		method6_sgemmNN_case2( m, n, (float*)A, lda, (float*)B, ldb, C, ldc, k, alpha, beta, B_bound,
					(float*)b ) ;		
		break ;						
	default: // case 5
		method6_sgemmNN_case5( m, n, (float*)A, lda, (float*)B, ldb, C, ldc, k, alpha, beta, B_bound,
					(float*)b ) ;			
		break ;			
	}// switch( case_sel)
	
}	
*/


// Kernel entry point for C = alpha*A*B + beta*C.
//
// Allocates the block's scratch tile for B and forwards all arguments to one of
// two device routines, chosen by the block's y index:
//   - blocks with blockIdx.y <  gridDim.y - 1 run method6_sgemmNN_case1
//     (loads from B are not bounds-checked);
//   - blocks with blockIdx.y >= gridDim.y - 1 run method6_sgemmNN_case2
//     (loads from B are checked against B_bound).
//
// `category` is accepted for interface compatibility but is not read here.
__global__ void  method6_sgemmNN( int m, int n, const float *A, int lda, 
const float *B, int ldb, float* C, int ldc, int k, float alpha, float beta,
int B_bound, int category )
{
	// One tile of B, shared by the two sub-matrices of C that each thread
	// updates; the extra column is padding on the row pitch.
	__shared__ float b_tile[BLOCK_SIZE_X][BLOCK_SIZE_Y+1] ;

	// The device routines take non-const pointers, so constness is dropped here.
	float *a_mat  = (float*)A ;
	float *b_mat  = (float*)B ;
	float *b_smem = (float*)b_tile ;

	const bool needs_b_check = !( blockIdx.y < (gridDim.y - 1) ) ;

	if ( needs_b_check ){
		method6_sgemmNN_case2( m, n, a_mat, lda, b_mat, ldb, C, ldc, k, alpha, beta, B_bound, b_smem ) ;
	}else{
		method6_sgemmNN_case1( m, n, a_mat, lda, b_mat, ldb, C, ldc, k, alpha, beta, B_bound, b_smem ) ;
	}
}	



