
/* 
 *	 File: profile_CUBLAS.cpp
 *   author: Lung-Sheng Chien
 *		Department of Mathematics, Tsing Hua University, R.O.C. (Taiwan).
 *		Email: d947207@oz.nthu.edu.tw
 *	 date: 2010/01/15
 *
 *	 description: time profile of CGEMM in CUBLAS.
 *		C = A * B, where A, B and C are square single-precision complex matrices of dimension n.
 *
 *		sweep n from 5 to 4096 and output data to file "output.txt"
 *
 *		File format:
 *		[dimension n]   [GPU time in ms]   [flops (Gflops/s)]
 *
 *		in the output file, '%' is leading character of MATLAB-style comment, since 
 *		we use MATLAB to do post-processing of data.
 *
 *	 Remark:
 *		1.	before time profiling, we execute CGEMM once to remove warm-up time;
 *			if dimension n <= 2048, we execute CGEMM 8 times and take the average,
 *			if n > 2048, we execute CGEMM only 2 times in order to save calibration time.
 */

#include <lsc_cuda_utility.h>
 
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>

#include <iostream>
#include <fstream>
#include <iomanip>

// Single-precision complex scalar/element type used for A, B, C, alpha and beta.
typedef float2 Complex;

using namespace std ;

/*
 *  Forward declaration; the definition appears later in this file.
 *
 *  Times C = alpha * A * B + beta * C with cublasCgemm for an m-by-k matrix A
 *  and a k-by-n matrix B.
 *
 *  gpu_time  (output): average time of one CGEMM call.
 *  flop_rate (output): achieved floating-point rate derived from gpu_time.
 */
void  profile_CUBLAS_unit( unsigned int m, unsigned int n, unsigned int k, 
		Complex alpha, Complex beta,
		float &gpu_time, double &flop_rate );

void  profile_CUBLAS_overN( char* out_file = "output.txt" )
{
	ofstream  fout( out_file, ios::out) ;
	if( !fout ){
		cerr << "File is NOT opened" << endl ; 
		exit(1) ;
	}
	fout.precision(3) ;
	fout << std::fixed ;

	cudaDeviceProp deviceProp;
	int device = 2 ;
	cudaGetDeviceProperties(&deviceProp, device);
	cudaSetDevice( device );
	printf("use device %d, name = %s\n", device, deviceProp.name );

	fout << "% CGEMM in CUBLAS  " << endl ;
	fout << "% use device: " << deviceProp.name << endl ;
	fout << "%  n   gpu_time (ms)   flops (Gflops/s)" << endl ;

	float  gpuTime ; 
	double flop_rate ;
	Complex alpha = {1.0f, 0.0f} ;
	Complex beta  = {0.0f, 0.0f} ;
		
	for( int n1 = 5 ; n1 <= 4096 ; n1++){

		profile_CUBLAS_unit( n1, n1, n1, alpha, beta, gpuTime, flop_rate ) ;

		fout << setw(7) << n1 << setw(14) << gpuTime  << setw(14) << flop_rate << endl ;	

		printf("n = %d is complete \n", n1 );
		
	}// for n1

	fout.close() ;

	printf("write data to %s\n", out_file );
}

/*
 *  Abort the run with a diagnostic when a required step failed.
 *  Unlike assert(), this check is still active when NDEBUG is defined.
 */
static void  profile_CUBLAS_require( bool ok, const char *what )
{
	if ( !ok ){
		fprintf( stderr, "profile_CUBLAS_unit: %s failed\n", what ) ;
		exit(1) ;
	}
}

/*
 *  Time C = alpha * A * B + beta * C computed by cublasCgemm('N','N',...).
 *
 *  m, n, k   : A is m-by-k, B is k-by-n, C is m-by-n.
 *  alpha,beta: complex scalars passed to cublasCgemm.
 *  gpu_time  : (output) average wall time of one CGEMM call as reported by the
 *              cutil timer (documented as milliseconds in the file header).
 *  flop_rate : (output) 8*m*n*k real operations divided by gpu_time, scaled by
 *              1000 / 1024^3; set to 0 when the measured time is not positive.
 *
 *  Procedure: allocate and randomly fill A and B on the host, copy them to the
 *  device, run one untimed warm-up CGEMM, then time 8 calls (2 calls when
 *  n > 2048) and average. C is copied back to the host afterwards; its values
 *  are not inspected.
 *
 *  Any failing allocation, CUBLAS call or timer call terminates the process.
 *
 *  NOTE(review): the device buffer for C is never initialized, so its contents
 *  only have a defined meaning as CGEMM input when beta == 0 -- confirm before
 *  calling with another beta.
 */
void  profile_CUBLAS_unit( unsigned int m, unsigned int n, unsigned int k, 
		Complex alpha, Complex beta,
		float &gpu_time, double &flop_rate )
{
	// fewer repetitions for large problems to keep the whole sweep short
	const int numIterations = ( 2048 < n ) ? 2 : 8 ;

	cublasStatus stat ;
	Complex *devPtrA = NULL , *devPtrB = NULL , *devPtrC = NULL ;

	unsigned int timer = 0 ;
	cutilCheckError( cutCreateTimer(&timer) );

// allocate host memory for matrices A, B and the result C
	// Byte counts are kept in size_t: an unsigned int would silently truncate
	// sizeof(Complex)*count for large matrices and under-allocate the buffers.
	unsigned int size_A = m*k;
	size_t mem_size_A = sizeof(Complex) * (size_t) size_A;
	Complex* h_A = (Complex*) malloc(mem_size_A);
	profile_CUBLAS_require( NULL != h_A, "host allocation of A" ) ;

	unsigned int size_B = k*n;
	size_t mem_size_B = sizeof(Complex) * (size_t) size_B;
	Complex* h_B = (Complex*) malloc(mem_size_B);
	profile_CUBLAS_require( NULL != h_B, "host allocation of B" ) ;

	unsigned int size_C = m*n;
	size_t mem_size_C = sizeof(Complex) * (size_t) size_C;
	Complex* h_C = (Complex*) malloc(mem_size_C);
	profile_CUBLAS_require( NULL != h_C, "host allocation of C" ) ;
	
// initialize host memory
	randomInit(h_A, size_A);
	randomInit(h_B, size_B);

// step 1: initialize CUBLAS and allocate device memory
	stat = cublasInit() ;
	profile_CUBLAS_require( CUBLAS_STATUS_SUCCESS == stat, "cublasInit" ) ;

	int hA = m ;	// rows of A and C (also used as their leading dimension)
	int wA = k ;	// columns of A, rows of B (leading dimension of B)
	int wB = n ;	// columns of B and C

	stat = cublasAlloc( hA*wA, sizeof(Complex), (void**) &devPtrA) ;
	profile_CUBLAS_require( CUBLAS_STATUS_SUCCESS == stat, "cublasAlloc of A" ) ;
 
	stat = cublasAlloc( wA*wB, sizeof(Complex), (void**) &devPtrB) ;
	profile_CUBLAS_require( CUBLAS_STATUS_SUCCESS == stat, "cublasAlloc of B" ) ;

	stat = cublasAlloc( hA*wB, sizeof(Complex), (void**) &devPtrC) ;
	profile_CUBLAS_require( CUBLAS_STATUS_SUCCESS == stat, "cublasAlloc of C" ) ;
	
// step 2: copy data from host to device
	stat = cublasSetMatrix( hA, wA, sizeof(Complex), h_A, hA, devPtrA, hA) ;
	profile_CUBLAS_require( CUBLAS_STATUS_SUCCESS == stat, "cublasSetMatrix of A" ) ;
	
	stat = cublasSetMatrix( wA, wB, sizeof(Complex), h_B, wA, devPtrB, wA) ;
	profile_CUBLAS_require( CUBLAS_STATUS_SUCCESS == stat, "cublasSetMatrix of B" ) ;

// step 3: execute the kernel and evaluate average timing
	// one untimed call so that warm-up cost is excluded from the measurement
	cublasCgemm('N', 'N', hA, wB, wA, alpha, devPtrA, hA, 
		devPtrB, wA, beta, devPtrC, hA) ;
 
	// make sure the warm-up has finished before the timer starts
	cudaThreadSynchronize();  

	cutilCheckError( cutStartTimer(timer) );
	for (int i = 0; i < numIterations; ++i){
		cublasCgemm('N', 'N', hA, wB, wA, alpha, devPtrA, hA, 
			devPtrB, wA, beta, devPtrC, hA) ;
	}
	// wait for all queued CGEMM calls before stopping the timer
	cudaThreadSynchronize();
	cutilCheckError( cutStopTimer(timer) );

	const float totalTime = cutGetTimerValue(timer) ;
	gpu_time = totalTime / ((float)numIterations) ;

	stat = cublasGetError() ;
	profile_CUBLAS_require( CUBLAS_STATUS_SUCCESS == stat, "cublasCgemm" ) ;

	// check if kernel execution generated an error
	cutilCheckMsg("Kernel execution failed");

// step 4: copy result from device to host
	stat = cublasGetMatrix(hA, wB, sizeof(Complex), devPtrC, hA, h_C, hA) ;
	cudaThreadSynchronize();  
	profile_CUBLAS_require( CUBLAS_STATUS_SUCCESS == stat, "cublasGetMatrix of C" ) ;

// step 5: compute flops
	// 8 real floating-point operations are counted per complex multiply-add
	double flops = ((double)m*(double)n*(double)k)*8.0 ;
	if ( gpu_time > 0.0f ){
		flop_rate = (flops /gpu_time) * ( 1000.0 / 1024.0 ) /1024.0 / 1024.0 ; 
	}else{
		// timer reported no elapsed time: avoid dividing by zero
		flop_rate = 0.0 ;
	}

// step 6: cleanup memory
	free( h_A ); free( h_B ); free( h_C );
  
	stat = cublasFree( devPtrA ) ;
	profile_CUBLAS_require( CUBLAS_STATUS_SUCCESS == stat, "cublasFree of A" ) ;
	stat = cublasFree( devPtrB ) ;
	profile_CUBLAS_require( CUBLAS_STATUS_SUCCESS == stat, "cublasFree of B" ) ;
	stat = cublasFree( devPtrC ) ;
	profile_CUBLAS_require( CUBLAS_STATUS_SUCCESS == stat, "cublasFree of C" ) ;

	stat = cublasShutdown() ;
	profile_CUBLAS_require( CUBLAS_STATUS_SUCCESS == stat, "cublasShutdown" ) ;

	cutilCheckError( cutDeleteTimer(timer));

}