

/*
 *		File: volkov.cu
 *		author: Lung-Sheng Chien
 *			Department of Mathematics, Tsing Hua University, R.O.C. (Taiwan).
 *			Email: d947207@oz.nthu.edu.tw
 *	 	date: 2010/2/17
 *
 *		description: modification based on Volkov's code
 
 * How to compile .cu to .cubin 
		"C:\CUDA\bin64\nvcc.exe"  -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" -I"C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"  -O2 -arch compute_13 -code sm_13 -cubin  volkov.cu

		"C:\CUDA\bin64\nvcc.exe"  -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 8\VC\bin" -I"C:\Program Files (x86)\Microsoft Visual Studio 8\VC\include"  -O2 -maxrregcount 48 -arch compute_13 -code sm_13 -cubin  volkov.cu

 * How to use decuda/cudasm 
 * 	decuda -o  volkov.asm  volkov.cubin 
 *	cudasm -o  volkov_cudasm.cubin   volkov.asm
 *	cudasm -o  volkov_cudasm.cubin   volkov_correct.asm
 *
 *   decuda -o decuda_ldsb32_cudasm_decuda.asm decuda_ldsb32_cudasm.cubin
 */
 
// Written by Vasily Volkov.
// Copyright (c) 2009, The Regents of the University of California.
// All rights reserved.

// Single-precision complex value. The arithmetic in this file uses .x as the
// real part and .y as the imaginary part.
typedef float2 Complex;


//
//  store_block: write one thread's strip of accumulated results back to C,
//
//      C(j) = alpha * acc(j) + beta * C(j)     for j = 0 .. min(num, 16) - 1
//
//  where C(j) is the element C[j*ldc] and acc(j) = c[2*j] + i*c[2*j+1].
//
//  num   : number of columns still to be written; nothing is written when
//          num <= 0, and never more than 16 columns however large num is
//  alpha : complex factor applied to the accumulated product
//  c     : interleaved complex accumulators, c[2*j] = real part and
//          c[2*j+1] = imaginary part of column j (up to 32 floats are read)
//  beta  : complex factor applied to the value already stored in C
//  C     : address of the element in the first column to write
//  ldc   : distance, in Complex elements, between consecutive columns of C
//
//  Both loops have a compile-time trip count of 16 so that "#pragma unroll"
//  can expand them completely and every index into c[] becomes a constant.
//
__device__ void store_block( int num, Complex alpha, float *c, Complex beta, Complex *C, int ldc )
{
    if( num <= 0 ) return;

    if( (0.0f == beta.x) && (0.0f == beta.y) )
    {
        //
        //  beta == 0: C is never read, so Inf/NaN sitting in an
        //  uninitialized C cannot leak into the result
        //
#pragma unroll
        for( int j = 0; j < 16; j++ )
        {
            Complex g_C ;
            g_C.x = alpha.x * c[2*j] - alpha.y * c[2*j+1] ;
            g_C.y = alpha.y * c[2*j] + alpha.x * c[2*j+1] ;
            C[0] = g_C ;

            // stop before stepping the pointer past the last valid column
            if( (j + 1) >= num ) return ;
            C += ldc ;
        }
    }
    else
    {
        //
        //  general case: read the old value of C, scale it by beta and add
        //  it to the scaled accumulator
        //
#pragma unroll
        for( int j = 0; j < 16; j++ )
        {
            Complex g_C = C[0] ;
            Complex g_C_out ;
            g_C_out.x = alpha.x * c[2*j] - alpha.y * c[2*j+1] + ( beta.x * g_C.x - beta.y * g_C.y ) ;
            g_C_out.y = alpha.y * c[2*j] + alpha.x * c[2*j+1] + ( beta.x * g_C.y + beta.y * g_C.x ) ;
            C[0] = g_C_out ;

            // stop before stepping the pointer past the last valid column
            if( (j + 1) >= num ) return ;
            C += ldc ;
        }
    }
}

//
//  volkov: single-precision complex matrix multiply-add
//
//      C = alpha*A*B + beta*C
//
//  The indexing below addresses A (m x k), B (k x n) and C (m x n) as
//  column-major arrays with leading dimensions lda, ldb and ldc.
//
//  Work split visible in the code:
//    - each block covers 64 rows (blockIdx.x * 64) and 16 columns
//      (blockIdx.y * 16) of C;
//    - each thread owns one row of that tile (threadIdx.x + threadIdx.y*16)
//      and accumulates all 16 columns of it;
//    - a 16 x 16 tile of B is staged in shared memory, split into real (b1)
//      and imaginary (b2) parts.
//  NOTE(review): the row formula and the stride-4 loads of B are consistent
//  with a 16 x 4 thread block; the launch configuration is not visible in
//  this file - confirm against the host code.
//
//  m, n     : number of rows / columns of C
//  k        : inner dimension; consumed 16 at a time, remainder handled last
//  alpha    : complex factor applied to A*B
//  beta     : complex factor applied to the old C
//  B_bound  : address limit for B; in the last block row along y, elements of
//             B at or beyond this address are not loaded
//
//  Resource figures recorded by the original author (presumably from the
//  compiler's report for this kernel):
//
/*
	lmem = 0
	smem = 2264
	reg  = 51
	active threads = 256
 */
 
__global__ void   volkov( int m, int n, const Complex *A, int lda, 
const Complex *B, int ldb, Complex* C, int ldc, int k, Complex alpha, Complex beta,
Complex* B_bound )
{
	// Real (b1) and imaginary (b2) parts of the current 16 x 16 tile of B.
	// Rows are 17 floats long; the extra column is presumably padding
	// against shared-memory bank conflicts (not stated in the code).
	__shared__ float b1[16][17];
	__shared__ float b2[16][17];
	
	const int inx = threadIdx.x;
	const int iny = threadIdx.y;
	const int ibx = blockIdx.x * 64;	// first row of C covered by this block
	const int iby = blockIdx.y * 16;	// first column of C covered by this block
	const int row = ibx + inx + iny*16;	// row of A and C owned by this thread
	
	// sel == 1: not the last block row along y, so B is loaded without
	// address checks. sel == 0: last block row, every load of B is checked
	// against B_bound.
	int sel = 0 ;
	if ( blockIdx.y < (gridDim.y - 1) ){
			sel = 1 ;
	}

	// Threads whose row lies past the end of the matrix read row m-1 instead
	// (presumably to keep their loads of A in bounds); they return before
	// storing anything, see the "row >= m" test at the bottom.
	if ( row >= m ){
		A += (m-1) ;
	}else{
		A += row ;
	}
		
	// B: row inx, column iby + iny.  C: this thread's row, column iby.
	B += inx + ( iby + iny ) * ldb;
	C += row  + iby * ldc;
	
	// 16 complex accumulators, interleaved: c[2*j] real, c[2*j+1] imaginary.
	float c[32] ;
#pragma unroll
	for( int i = 0; i < 32; i++ ){
		c[i] = 0.0f ;
	}	
 
// B_reg holds one element of B on its way into shared memory;
// b1_base / b2_base are the flat addresses of b1[inx][iny] / b2[inx][iny].
	Complex B_reg ;
	float *b1_base = (float*)b1 + inx*17 + iny ; 
  float *b2_base = (float*)b2 + inx*17 + iny ; 
   
	// Each pass consumes 16 steps of the inner dimension.
	for( ; k > 0; k -= 16 )
	{
		// Load the tile of B: this thread fetches columns iny, iny+4, iny+8
		// and iny+12 of row inx and writes them to b1/b2[inx][iny + i].
		if ( sel ){
#pragma unroll
			for( int i = 0; i < 16 ; i += 4 ){
				B_reg = B[i*ldb];
				b1_base[i] = B_reg.x ;
				b2_base[i] = B_reg.y ;
			}
		}else{
			// Checked variant: stop at the first address at or beyond
			// B_bound; the shared-memory entries not written here keep
			// whatever they held before.
			for( int i = 0; i < 16 ; i += 4 ){
				if( &B[i*ldb] < B_bound ){
					B_reg = B[i*ldb];
					b1_base[i] = B_reg.x ;
					b2_base[i] = B_reg.y ;					
				}else{
					break ;
				}
			}						
		}// if (sel)		
		// The whole tile must be written before any thread reads it.
		__syncthreads();

		// Fewer than 16 steps left: leave the loop and let the remainder
		// loop below handle them. k is a kernel argument updated identically
		// by every thread, so the whole block takes this branch together.
		if( k < 16 )  break;

		float *b1_ptr = (float*)b1 ;
		float *b2_ptr = (float*)b2 ;
		// Full tile: 16 steps along the inner dimension, 16 columns each.
#pragma unroll
		for( int i = 0; i < 16; i++ ){ 
				// one element of this thread's row of A, then step to the next column of A
				Complex A_reg = A[0] ; A += lda ;
#pragma unroll
				for( int j = 0 ; j < 16 ; j++){
					float b_reg_x = b1_ptr[j] ;
					float b_reg_y = b2_ptr[j] ;
// c[j] += A_reg * b_reg ;   (complex multiply-accumulate)
					c[2*j  ] += (A_reg.x * b_reg_x - A_reg.y * b_reg_y) ;					
					c[2*j+1] += (A_reg.y * b_reg_x + A_reg.x * b_reg_y) ;
				} 						
				b1_ptr += 17 ;	// advance to the next row of b1 (row stride is 17 floats)
				b2_ptr += 17 ;	// advance to the next row of b2 (row stride is 17 floats)
		}  
		// Every thread must be done reading the tile before the next pass overwrites it.
		__syncthreads();
		// Move down 16 rows in B for the next tile.
		B += 16;
	};

// rank-k update: the remaining k (< 16) steps, using the tile loaded just
// before the break above. Runs zero times when the loop ended with k <= 0.
		float *b1_ptr = (float*)b1 ;
		float *b2_ptr = (float*)b2 ;
#pragma unroll
		for( int i = 0; i < k ; i++ ){ 
				// one element of this thread's row of A, then step to the next column of A
				Complex A_reg = A[0] ; A += lda ;
#pragma unroll
				for( int j = 0 ; j < 16 ; j++){
					float b_reg_x = b1_ptr[j] ;
					float b_reg_y = b2_ptr[j] ;
// c[j] += A_reg * b_reg ;   (complex multiply-accumulate)
					c[2*j  ] += (A_reg.x * b_reg_x - A_reg.y * b_reg_y) ;					
					c[2*j+1] += (A_reg.y * b_reg_x + A_reg.x * b_reg_y) ;
				} 						
				b1_ptr += 17 ;	// advance to the next row of b1 (row stride is 17 floats)
				b2_ptr += 17 ;	// advance to the next row of b2 (row stride is 17 floats)
		}  

	// Threads past the last row computed on a substitute row of A; drop them.
	if( row >= m )  return;
    
	// n - iby columns remain from this block's first column; store_block
	// writes at most 16 of them and nothing when the count is <= 0.
	store_block( n - iby, alpha, c, beta, C, ldc);
}	


 
