
/*
 *   File: profile_general_sgemm_suqare.cpp
 *   author: Lung-Sheng Chien
 *		Department of Mathematics, Tsing Hua University, R.O.C. (Taiwan).
 *		Email: d947207@oz.nthu.edu.tw
 *	 date: 2010/01/30
 *
 *	 description: the same as profile_sgemm_suqare.cpp, except that
 *				the order of the device allocations is changed from
 *			cutilDrvSafeCallNoSync( cuMemAlloc( &d_A, mem_size_A ) ) ;
 *			cutilDrvSafeCallNoSync( cuMemAlloc( &d_B, mem_size_B ) ) ;
 *			cutilDrvSafeCallNoSync( cuMemAlloc( &d_C, mem_size_C ) ) ;
 *
 *				to
 *
 *			cutilDrvSafeCallNoSync( cuMemAlloc( &d_C, mem_size_C ) ) ;
 *			cutilDrvSafeCallNoSync( cuMemAlloc( &d_A, mem_size_A ) ) ;
 *			cutilDrvSafeCallNoSync( cuMemAlloc( &d_B, mem_size_B ) ) ;
 *
 */


#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>

#include <iostream>
#include <fstream>
#include <iomanip>

using namespace std ;

#include <lsc_cuda_utility.h>


// Pointer to a host-side routine that launches a sgemm kernel.
//   hfunc          : kernel handle (callers in this file obtain it from cuModuleGetFunction)
//   C, A, B        : device pointers of the three matrices
//   hA, wA, wB     : matrix dimensions; callers in this file pass (m, k, n) here
//                    (presumably height of A, width of A, width of B -- confirm against the wrapper)
//   lda, ldb, ldc  : leading dimensions of A, B and C
//   alpha, beta    : scalar coefficients handed through to the kernel
typedef  void  (*sgemmWrapper_prototype)(CUfunction hfunc,
		CUdeviceptr C, CUdeviceptr A, CUdeviceptr B, int hA, int wA, int wB,
		int lda, int ldb, int ldc,
		float alpha, float beta ) ;

// Forward declaration: times one (m, n, k) problem and checks it against a host reference.
//   first argument : wrapper used to launch the kernel (unnamed in this declaration)
//   sgemm          : kernel handle forwarded to the wrapper
//   m, n, k        : problem dimensions
//   alpha, beta    : scalar coefficients forwarded to the wrapper
//   ave_naiveTime  : [out] average time of one launch (the definition names this gpuTime)
//   flop_GB        : [out] achieved floating point rate
//   rel_max_err    : [out] relative error reported by the comparison with the reference
void  profile_general_sgemm_square_unit( sgemmWrapper_prototype, CUfunction  sgemm, 
		unsigned int m, unsigned int n, unsigned int k, float alpha, float beta,
		double &ave_naiveTime, double &flop_GB,
		float &rel_max_err ) ;

/*
 *  Profile a sgemm kernel on square problems m = n = k = 5, 6, ..., 4096 and
 *  write one line "n  time  rate" per size to a text file.
 *
 *  cubin_filename : module file handed to cuModuleLoad
 *  sgemm_funcName : name of the kernel looked up with cuModuleGetFunction
 *  sgemmWrapper   : host routine used to launch the kernel
 *  out_file       : name of the text file that receives the table
 *
 *  This function never creates a CUDA context itself; the driver calls below
 *  operate on whatever context is current when it is entered.
 *
 *  The process exits with status 1 when the output file cannot be opened or
 *  when the module / kernel cannot be loaded. The sweep stops at the first
 *  size whose relative error is not within 1.E-4; the rows written so far are
 *  kept in the output file.
 */
void  profile_general_sgemm_square( char* cubin_filename, char* sgemm_funcName, 
						   sgemmWrapper_prototype sgemmWrapper,
						    char* out_file = "output.txt" )
{
	printf("### profile C = A*B on square matrices \n");

	ofstream  fout(out_file, ios::out) ;
	if( !fout ){
		cerr << "File is NOT opened" << endl ; 
		exit(1) ;
	}
	fout.precision(3) ;
	fout << std::fixed ;

// ------------------ load context ------------------------
	CUresult status ;

// Create module from binary file
	CUmodule cuModule;
	status = cuModuleLoad(&cuModule, cubin_filename );
	if ( CUDA_SUCCESS != status ){
		cerr << "Error: module " << cubin_filename << " cannot be loaded" << endl ; 
		exit(1) ;
	}else{
		cout << "Succ: load module " << cubin_filename << endl ; 
	}
 
// Get function handle from module
	CUfunction  sgemm ;
	status = cuModuleGetFunction(&sgemm, cuModule, sgemm_funcName ) ;
	if ( CUDA_SUCCESS != status ){
		cerr << "Error: kernel " <<  sgemm_funcName << " cannot be found" << endl ;
		exit(1) ;
	}else{
		cout << "Succ: load kernel " <<  sgemm_funcName << endl ;
	}

// --------------------------------------------------------
	fout << "%" << endl ;
	fout << "%  compute C = A*B, A:mxk, B:kxn, C:mxn" << endl;
	fout << "%" << endl ;
	fout << "%  cubin file = " << cubin_filename << endl ;
	fout << "%  kernel function = " << sgemm_funcName << endl ;

// The device name is only informational, so a failed query is reported and
// replaced by a placeholder instead of printing an uninitialized buffer.
	char dev_name[128] = "unknown" ;
	CUdevice device ;
	status = cuCtxGetDevice( &device) ;
	if ( CUDA_SUCCESS == status ){
		status = cuDeviceGetName(dev_name, 128, device) ;
	}
	if ( CUDA_SUCCESS != status ){
		cerr << "Warning: device name cannot be queried" << endl ;
	}

	fout << "%  use device: " << dev_name << endl ;

	fout << "%  m=n=k    gpu_time (ms)   flops (Gflops/s)" << endl ;

	double  gpuTime ;
	double flop_rate ;
	float  rel_max_err = 1.0f ;
	float  eps = 1.E-4f ;
	float  alpha = 1.0f ;
	float  beta  = 0.0f ; 

	bool accurate = true ;
	for( int n1 = 5 ; n1 <=  4096 ; n1++){
		profile_general_sgemm_square_unit(sgemmWrapper, sgemm,  
					n1, n1, n1, alpha, beta,
					gpuTime, flop_rate, rel_max_err ) ;

		// written as !(err <= eps) so that a NaN error also counts as a failure
		if ( !( rel_max_err <= eps ) ){
			printf("Error: rel_max_err(n = %d) = %7.2E \n", n1, rel_max_err);
			accurate = false ;
			break ;
		}
		fout << setw(7) << n1 << setw(14) << gpuTime << setw(14) << flop_rate << endl ;	

		printf("n = %d is complete with rel_max_err = %.2E\n", n1, rel_max_err );
		
	}// for n1

	fout.close() ;

	if ( accurate ){
		printf("write data to %s\n", out_file );
	}

// The kernel handle is not used past this point, so release the module on
// both the normal and the early-stop path.
	status = cuModuleUnload( cuModule ) ;
	if ( CUDA_SUCCESS != status ){
		cerr << "Warning: module " << cubin_filename << " cannot be unloaded" << endl ;
	}

}


/*
 *  Time one sgemm problem of size (m, n, k) launched through sgemmWrapper and
 *  compare the device result with a reference computed by matrixMul_cublas.
 *
 *  sgemmWrapper : host routine that launches the kernel
 *  sgemm        : kernel handle forwarded to sgemmWrapper
 *  m, n, k      : problem dimensions; A is stored with lda = m and k columns,
 *                 B with ldb = k and n columns, C with ldc = m and n columns
 *  alpha, beta  : scalar coefficients forwarded to the wrapper and to the reference
 *  gpuTime      : [out] timer value divided by the number of timed launches
 *                 (treated as milliseconds by the rate formula below)
 *  flop_GB      : [out] 2*m*n*k operations per second, divided by 1024^3
 *  rel_max_err  : [out] relative error reported by compare_supnorm
 *
 *  Device buffers are allocated in the order C, A, B.
 *
 *  NOTE(review): d_C is never uploaded from h_C and `reference` is not seeded
 *  before matrixMul_cublas. That is harmless for beta = 0 (the only value used
 *  by the caller in this file); for beta != 0 the check presumably compares
 *  against undefined input -- confirm before relying on it.
 */
void  profile_general_sgemm_square_unit( sgemmWrapper_prototype sgemmWrapper, CUfunction  sgemm, 
		unsigned int m, unsigned int n, unsigned int k, float alpha, float beta,
		double &gpuTime, double &flop_GB,
		float &rel_max_err )
{
	bool compareToCPU = true ;

	int numIterations = 8 ;

	unsigned int timer;
	float naiveTime ;
	float max_err = 0.0f ; 

	// checked like the matching cutDeleteTimer at the end of this function
	cutilCheckError( cutCreateTimer(&timer) ) ;
	
	const int lda = m ;
	const int ldb = k ;
	const int ldc = m ; 

// element and byte counts are formed in size_t so that large problems do not
// wrap around in 32-bit arithmetic
	const size_t size_A = (size_t)lda * k ;
	const size_t mem_size_A = sizeof(float) * size_A ;

	const size_t size_B = (size_t)ldb * n ;
	const size_t mem_size_B = sizeof(float) * size_B ;

	const size_t size_C = (size_t)ldc * n ;
	const size_t mem_size_C = sizeof(float) * size_C ;

// allocate host memory for A, B, the result and the reference solution
	float* h_A = (float*) malloc(mem_size_A) ;
	float* h_B = (float*) malloc(mem_size_B) ;
	float* h_C = (float*) malloc(mem_size_C) ;
	float* reference = (float*) malloc(mem_size_C) ;

	// explicit test: an assert would vanish when NDEBUG is defined
	if ( !h_A || !h_B || !h_C || !reference ){
		fprintf(stderr, "Error: cannot allocate host memory for m = %u, n = %u, k = %u\n", m, n, k ) ;
		exit(1) ;
	}

	// initialize host memory
	randomInit(h_A, size_A);
	randomInit(h_B, size_B);
	randomInit(h_C, size_C);

    // allocate device memory, C first (this ordering is what this file profiles)
	CUdeviceptr d_A, d_B, d_C;
	cutilDrvSafeCallNoSync( cuMemAlloc( &d_C, mem_size_C ) ) ;
	cutilDrvSafeCallNoSync( cuMemAlloc( &d_A, mem_size_A ) ) ;
	cutilDrvSafeCallNoSync( cuMemAlloc( &d_B, mem_size_B ) ) ;
	
// step 2: copy data from host to device
	cutilDrvSafeCallNoSync( cuMemcpyHtoD(d_A, h_A, mem_size_A) ) ;
	cutilDrvSafeCallNoSync( cuMemcpyHtoD(d_B, h_B, mem_size_B) ) ;

// step 3: execute the kernel and evaluate average timing
// one warm-up launch that is kept out of the timed region
	(*sgemmWrapper)( sgemm, d_C, d_A, d_B, m, k, n, lda, ldb, ldc, alpha, beta ) ;
	// driver-API synchronization of the current context, with its status checked
	cutilDrvSafeCallNoSync( cuCtxSynchronize() ) ;

	// fewer timed launches for the large problems
	if ( 2048 < n ){ numIterations = 2 ; }

	cutStartTimer(timer);
	for (int i = 0; i < numIterations; ++i){
		(*sgemmWrapper)( sgemm, d_C, d_A, d_B, m, k, n, lda, ldb, ldc, alpha, beta ) ;
	}
	// wait for all timed launches before the timer is stopped
	cutilDrvSafeCallNoSync( cuCtxSynchronize() ) ;
	cutStopTimer(timer);
	naiveTime = cutGetTimerValue(timer) ;

	gpuTime = naiveTime /((double)numIterations) ;

	// check if kernel execution generated an error
	cutilCheckMsg("Kernel execution failed");

    // copy result from device to host
	cutilDrvSafeCallNoSync( cuMemcpyDtoH(h_C, d_C, mem_size_C) ) ;

// compute flops: 2*m*n*k operations, time scaled by 1000 to seconds,
// result divided by 1024^3
	double flops = ((double)m*(double)n*(double)k)*2.0 ;
	flop_GB = (flops /gpuTime) * ( 1000.0 / 1024.0 ) /1024.0 / 1024.0 ; 

	if( compareToCPU ){
		
// step 4: compute reference solution
		matrixMul_cublas(reference, h_A, h_B, m, k, n, lda, ldb, ldc, alpha, beta ) ;
 
// step 5: check result
		compare_supnorm( m, n, reference, ldc, h_C, ldc, max_err, rel_max_err ) ;

	}else{
		printf("Warning: don't compare to CPU, programers take care \n");
		max_err = 0.0f ;
		rel_max_err = 0.0f ;
	}

// step 6: cleanup memory
	free( h_A );
	free( h_B );
	free( h_C );
	free( reference);

	cutilDrvSafeCallNoSync( cuMemFree(d_A) ) ;
	cutilDrvSafeCallNoSync( cuMemFree(d_B) ) ;
	cutilDrvSafeCallNoSync( cuMemFree(d_C) ) ;

	cutilCheckError( cutDeleteTimer(timer));

}