


/*
 *   File: profile_general_cgemm_suqare.cpp
 *   author: Lung-Sheng Chien
 *		Department of Mathematics, Tsing Hua University, R.O.C. (Taiwan).
 *		Email: d947207@oz.nthu.edu.tw
 *	 date: 2010/01/15
 *
 *	 description: this is a template for timing profiling; it measures the average GPU execution time
 *		of several CGEMM methods. Without loss of generality, only C = A * B is considered.
 *		In this measurement, A, B and C are set to be square matrix with dimension n.
 *		storage is column-major, leading dimension of A, B, C are lda, ldb, and ldc respectively.
 *		in this test, we don't do any padding, say lda = m, ldb = k, ldc = m.
 *		
 *		sweep n from 5 to 4096 and write the data to the output file (parameter out_file, e.g. "output.txt")
 *
 *		File format:
 *		[dimension n]   [GPU time in ms]   [flops (Gflops/s)]
 *
 *		in the output file, '%' is leading character of MATLAB-style comment, since 
 *		we use MATLAB to do post-processing of data.
 *
 *	 Remark:
 *		1.	before timing, we execute CGEMM once to remove the warm-up time;
 *			if dimension n <= 2048, we execute CGEMM 8 times and take the average, 
 *			if n > 2048, we execute CGEMM only 2 times in order to save calibration time.
 *
 *		2.  CUBLAS is used as the checker: we compare the result of our method to the result of CUBLAS. 
 *			One can disable this checking process by setting variable compareToCPU = false 
 *			in function "profile_general_cgemm_square_unit"; this saves calibration time.
 *
 *		3.	the methods we test in this subroutine have out of array bound problem when 
 *			dimension n is not multiple of 64. This out of array bound occurs when access
 *			matrix element of A and B and it may lead to segmentation fault potentially.
 *			However we allocate device memory for matrix A, B and C in order such that 
 *			segmentation fault does not happen.
 *			
 *		4.	we use the driver API to load the binary file, so one must make sure that the
 *			triplet = ( cubin file, kernel name, kernel wrapper ) is consistent
 *			when calling "profile_general_cgemm_square"
 *
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>

#include <iostream>
#include <fstream>
#include <iomanip>

using namespace std ;

#include <lsc_cuda_utility.h>

// Single-precision complex scalar / matrix element (a pair of floats).
typedef float2 Complex ;

// Signature of a host-side wrapper that launches one CGEMM kernel.
//   hfunc         : kernel handle obtained from the loaded module
//   C, A, B       : device pointers of the result and the two operands
//   hA, wA, wB    : matrix dimensions; the profiler passes (m, k, n) here
//   lda, ldb, ldc : leading dimensions of A, B and C (column-major storage)
//   alpha, beta   : complex scalars forwarded to the kernel
typedef void (*cgemmWrapper_prototype)(
		CUfunction hfunc,
		CUdeviceptr C, CUdeviceptr A, CUdeviceptr B,
		int hA, int wA, int wB,
		int lda, int ldb, int ldc,
		Complex alpha, Complex beta ) ;

// Forward declaration: times one (m, n, k) problem; defined below
// profile_general_cgemm_square, which is its caller.
void profile_general_cgemm_square_unit(
		cgemmWrapper_prototype cgemmWrapper, CUfunction cgemm,
		unsigned int m, unsigned int n, unsigned int k,
		Complex alpha, Complex beta,
		double &gpuTime, double &flop_GB,
		float &rel_max_err ) ;

/*
 *  Profile one CGEMM kernel on square matrices (m = n = k) for n = 5, 6, ..., 4096.
 *
 *  cubin_filename : binary module that is loaded through the driver API
 *  cgemm_funcName : name of the kernel inside that module
 *  cgemmWrapper   : host wrapper that launches the kernel (must match the kernel)
 *  out_file       : text file receiving one line "n  gpu_time(ms)  Gflops/s" per size;
 *                   lines starting with '%' are MATLAB-style comments
 *
 *  The process exits with status 1 when the output file cannot be opened, the module
 *  cannot be loaded or the kernel cannot be found. The sweep stops early (without the
 *  final "write data to ..." message) as soon as the relative error reported by
 *  profile_general_cgemm_square_unit exceeds 1.E-4.
 *
 *  A current CUDA context is required before calling this function
 *  (cuModuleLoad / cuCtxGetDevice operate on the current context).
 */
void  profile_general_cgemm_square( char* cubin_filename, char* cgemm_funcName, 
						   cgemmWrapper_prototype cgemmWrapper,
						    char* out_file )
{
	const int  n_first = 5 ;     // smallest dimension of the sweep
	const int  n_last  = 4096 ;  // largest dimension of the sweep

	printf("### profile C = A*B (complex) on square matrices \n");

	ofstream  fout(out_file, ios::out) ;
	if( !fout ){
		cerr << "File is NOT opened" << endl ; 
		exit(1) ;
	}
	fout.precision(3) ;
	fout << std::fixed ;

// ------------------ load context ------------------------
	CUresult status ;

// Create module from binary file
	CUmodule cuModule;
	status = cuModuleLoad(&cuModule, cubin_filename );
	if ( CUDA_SUCCESS != status ){
		cerr << "Error: module " << cubin_filename << " cannot be loaded" << endl ; 
		exit(1) ;
	}else{
		cout << "Succ: load module " << cubin_filename << endl ; 
	}
 
// Get function handle from module
	CUfunction  cgemm ;
	status = cuModuleGetFunction(&cgemm, cuModule, cgemm_funcName ) ;
	if ( CUDA_SUCCESS != status ){
		cerr << "Error: kernel " <<  cgemm_funcName << " cannot be found" << endl ;
		exit(1) ;
	}else{
		cout << "Succ: load kernel " <<  cgemm_funcName << endl ;
	}

// --------------------------------------------------------
	fout << "%" << endl ;
	fout << "%  compute C = A*B (complex), A:mxk, B:kxn, C:mxn" << endl;
	fout << "%" << endl ;
	fout << "%  cubin file = " << cubin_filename << endl ;
	fout << "%  kernel function = " << cgemm_funcName << endl ;

// Query the device name; never print an uninitialized buffer when the query fails
	char dev_name[128] = "unknown" ;
	CUdevice device ;
	if ( CUDA_SUCCESS != cuCtxGetDevice( &device ) ||
	     CUDA_SUCCESS != cuDeviceGetName(dev_name, (int) sizeof(dev_name), device) ){
		cerr << "Warning: cannot query device name" << endl ;
		strcpy(dev_name, "unknown") ;
	}

	fout << "%  use device: " << dev_name << endl ;

	fout << "%  m=n=k    gpu_time (ms)   flops (Gflops/s)" << endl ;

	double  gpuTime ;
	double flop_rate ;
	float  rel_max_err = 1.0f ;
	float  eps = 1.E-4f ;   // tolerance on the relative error of the result
	Complex  alpha = {1.0f, 0.0f} ;
//	Complex  alpha = {72.41f, 0.133f} ;
	Complex  beta  = {0.0f, 0.0f} ; 
	
	bool all_passed = true ;
	for( int n1 = n_first ; n1 <= n_last ; n1++){
		profile_general_cgemm_square_unit(cgemmWrapper, cgemm,  
					n1, n1, n1, alpha, beta,
					gpuTime, flop_rate, rel_max_err ) ;

		// stop the sweep on the first size whose result is not accurate enough
		if ( eps < rel_max_err ){
			printf("Error: rel_max_err(n = %d) = %7.2E \n", n1, rel_max_err);
			all_passed = false ;
			break ;
		}
		fout << setw(7) << n1 << setw(14) << gpuTime << setw(14) << flop_rate << endl ;	

		printf("n = %d is complete with rel_max_err = %.2E\n", n1, rel_max_err );
		
	}// for n1

	fout.close() ;

// release the module on every path; the kernel handle is not used after this point
	status = cuModuleUnload( cuModule ) ;
	if ( CUDA_SUCCESS != status ){
		cerr << "Warning: module " << cubin_filename << " cannot be unloaded" << endl ;
	}

	if ( all_passed ){
		printf("write data to %s\n", out_file );
	}

}


// Allocate num_bytes of host memory for a complex matrix; report and exit on failure.
// "what" names the matrix in the error message.
static Complex*  cgemm_profile_host_alloc( unsigned int num_bytes, const char* what )
{
	Complex* ptr = (Complex*) malloc(num_bytes) ;
	// malloc(0) may legally return NULL, so only a failed non-empty request is an error
	if ( (NULL == ptr) && (0 < num_bytes) ){
		fprintf(stderr, "Error: cannot allocate %u bytes of host memory for %s\n", num_bytes, what );
		exit(1) ;
	}
	return ptr ;
}

/*
 *  Time one CGEMM problem C = alpha*A*B + beta*C with A: m x k, B: k x n, C: m x n
 *  (column-major, no padding: lda = m, ldb = k, ldc = m).
 *
 *  cgemmWrapper : host wrapper that launches the kernel
 *  cgemm        : kernel handle passed to the wrapper
 *  m, n, k      : matrix dimensions
 *  alpha, beta  : complex scalars forwarded to the wrapper
 *  gpuTime      : (out) average time of one call, as reported by the cutil timer
 *                 (milliseconds according to the file header)
 *  flop_GB      : (out) 8*m*n*k operations per second, in units of 2^30
 *  rel_max_err  : (out) relative sup-norm error against the CUBLAS reference,
 *                 or 0 when the comparison is disabled (compareToCPU = false)
 *
 *  The kernel is executed once as warm-up, then 8 times (2 times if n > 2048)
 *  inside the timed region.
 *
 *  NOTE(review): d_C is not initialized from h_C before the kernel runs, so results
 *  are only meaningful for beta = 0, which is what the caller in this file uses.
 */
void  profile_general_cgemm_square_unit( cgemmWrapper_prototype cgemmWrapper, CUfunction  cgemm, 
		unsigned int m, unsigned int n, unsigned int k, Complex alpha, Complex beta,
		double &gpuTime, double &flop_GB,
		float &rel_max_err )
{
	bool compareToCPU = false ;

	int lda, ldb, ldc ;
	int numIterations = 8 ;

	Complex* h_A ;
	Complex* h_B ;
	Complex* h_C ;
	Complex* reference ;

	unsigned int timer;
	float naiveTime ;
	float max_err = 0.0f ; 

	cutilCheckError( cutCreateTimer(&timer) ) ;
	
	lda = m ;
	ldb = k ;
	ldc = m ; 

// allocate host memory for matrices A and B
	unsigned int size_A = lda*k ;
	unsigned int mem_size_A = sizeof(Complex) * size_A ;
	h_A = cgemm_profile_host_alloc( mem_size_A, "A" ) ;

	unsigned int size_B = ldb*n;
	unsigned int mem_size_B = sizeof(Complex) * size_B;
	h_B = cgemm_profile_host_alloc( mem_size_B, "B" ) ;

    // allocate host memory for the result
	unsigned int size_C = ldc*n;
	unsigned int mem_size_C = sizeof(Complex) * size_C;
	h_C = cgemm_profile_host_alloc( mem_size_C, "C" ) ;
	
	// the reference buffer is only needed when the result is checked
	reference = compareToCPU ? cgemm_profile_host_alloc( mem_size_C, "reference" ) : NULL ;

	// initialize host memory
	randomInit(h_A, size_A);
	randomInit(h_B, size_B);
	randomInit(h_C, size_C);

    // allocate device memory
    // keep the order C, A, B: remark 3 of the file header relies on the allocation
    // order to keep the kernels' out-of-bound accesses of A and B harmless
	CUdeviceptr d_A, d_B, d_C;
	cutilDrvSafeCallNoSync( cuMemAlloc( &d_C, mem_size_C ) ) ;	
	cutilDrvSafeCallNoSync( cuMemAlloc( &d_A, mem_size_A ) ) ;
	cutilDrvSafeCallNoSync( cuMemAlloc( &d_B, mem_size_B ) ) ;

// step 2: copy data from host to device
// cuMemcpyHtoD (CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount)
	cutilDrvSafeCallNoSync( cuMemcpyHtoD(d_A, h_A, mem_size_A) ) ;
	cutilDrvSafeCallNoSync( cuMemcpyHtoD(d_B, h_B, mem_size_B) ) ;

// step 3: execute the kernel and evaluate average timing
// remove warmup 
	(*cgemmWrapper)( cgemm, d_C, d_A, d_B, m, k, n, lda, ldb, ldc, alpha, beta ) ;
	// the kernel is launched through the driver API, so synchronize the driver context
	cutilDrvSafeCallNoSync( cuCtxSynchronize() ) ;

	// fewer repetitions for large matrices to keep the total run time reasonable
	if ( 2048 < n ){ numIterations = 2 ; }

	cutilCheckError( cutStartTimer(timer) ) ;
	for (int i = 0; i < numIterations; ++i){
		(*cgemmWrapper)( cgemm, d_C, d_A, d_B, m, k, n, lda, ldb, ldc, alpha, beta ) ;
	}
	// wait for all launches to finish before the timer is stopped
	cutilDrvSafeCallNoSync( cuCtxSynchronize() ) ;
	cutilCheckError( cutStopTimer(timer) ) ;
	naiveTime = cutGetTimerValue(timer) ;

	gpuTime = naiveTime /((double)numIterations) ;

	// check if kernel execution generated and error
	cutilCheckMsg("Kernel execution failed");

    // copy result from device to host
	cutilDrvSafeCallNoSync( cuMemcpyDtoH(h_C, d_C, mem_size_C) ) ;

// compute flops
// one complex multiply-add is counted as 8 real operations; the rate is scaled
// from "per millisecond" to "per second" and expressed in units of 1024^3
	double flops = ((double)m*(double)n*(double)k)*8.0 ;
	flop_GB = (flops /gpuTime) * ( 1000.0 / 1024.0 ) /1024.0 / 1024.0 ; 

	if( compareToCPU ){
		
// step 4: compute reference solution via CPU
		matrixMul_cublas(reference, h_A, h_B, m, k, n, lda, ldb, ldc, alpha, beta ) ;
 
// step 5: check result
		compare_supnorm( m, n, reference, ldc, h_C, ldc, max_err, rel_max_err ) ;

	}else{
		printf("Warning: don't compare to CPU, programers take care \n");
		rel_max_err = 0.0f ;
	}

// step 6: cleanup memory
	free( h_A );
	free( h_B );
	free( h_C );
	free( reference);   // free(NULL) is a no-op when the comparison is disabled

	cutilDrvSafeCallNoSync( cuMemFree(d_A) ) ;
	cutilDrvSafeCallNoSync( cuMemFree(d_B) ) ;
	cutilDrvSafeCallNoSync( cuMemFree(d_C) ) ;

	cutilCheckError( cutDeleteTimer(timer));

}