
/*
 *   File: main.cpp
 *   author: Lung-Sheng Chien
 *		Department of Mathematics, Tsing Hua University, R.O.C. (Taiwan).
 *		Email: d947207@oz.nthu.edu.tw
 *	 date: 2010/01/15
 *
 *	 description: calibrate timing of CUBLAS , volkov, method 1~8
 *
 *		1. set parameter "device_num" to a device
 *		2. use auto_profile() to calibrate all methods
 *		   please note
 *		   (1) put the executable file in directory "release"
 *		   (2) modify the macro definition OUTPUT_DIR
 *			for example, if you want to calibrate on GTX275, then you 
 *			can create directory /data/GTX275 and modify OUTPUT_DIR as 
 *			
 *			#define  OUTPUT_DIR  "../data/GTX275/"
 *
 */

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include "lsc_cuda_utility.h"

#include "wrapper.h"

// Pointer-to-function type shared by the per-method launch wrappers
// (volkov_DrvWrapper, method1_DrvWrapper, ...) whose addresses are handed to the
// profile_* / check_* routines declared below.
//   hfunc       : driver-API handle of the kernel to run
//   C, A, B     : driver-API device pointers of the three matrices
//   alpha, beta : scalar factors forwarded to the kernel
// NOTE(review): hA / wA / wB look like "height of A", "width of A" and "width of B",
// and lda / ldb / ldc like leading dimensions -- confirm against wrapper.h.
typedef  void  (*sgemmWrapper_prototype)(CUfunction hfunc,
		CUdeviceptr C, CUdeviceptr A, CUdeviceptr B, int hA, int wA, int wB,
		int lda, int ldb, int ldc,
		float alpha, float beta ) ;

// Forward declarations of the profiling / checking routines; their definitions are
// not in this file.

// Profiles kernel "sgemm_funcName" from module "cubin_filename" through "sgemmWrapper"
// and reports to "out_file". auto_profile() uses it for kernels that do not consider
// out-of-array bounds.
// NOTE(review): the exact set of matrix sizes swept is decided by the definition -- confirm there.
void  profile_sgemm_square(
	char* cubin_filename, char* sgemm_funcName, sgemmWrapper_prototype sgemmWrapper,
	char* out_file ) ;

// Same parameters as profile_sgemm_square; auto_profile() uses it for kernels that
// do consider out-of-array bounds.
void  profile_general_sgemm_square( char* cubin_filename, char* sgemm_funcName, 
						   sgemmWrapper_prototype sgemmWrapper,
						    char* out_file );

// Profiles CUBLAS and reports to "out_file"; takes no kernel module or wrapper.
void  profile_CUBLAS_overN( char* out_file ) ;

// Takes the same kernel description as profile_general_sgemm_square but no output file.
// NOTE(review): presumably a correctness check of the kernel -- confirm in its definition.
void  check_general_sgemm_square( char* cubin_filename, char* sgemm_funcName, 
						   sgemmWrapper_prototype sgemmWrapper ) ;

// Runs the whole calibration sequence; defined at the bottom of this file.
void  auto_profile( void ) ;


// cat(x, y): join two string literals into a single literal, e.g.
//     cat(OUTPUT_DIR, "cublas/threads512.txt")
// yields "../data/TeslaC1060/cublas/threads512.txt".
//
// The arguments are simply placed next to each other so that (1) macro arguments
// such as OUTPUT_DIR are fully expanded first and (2) the compiler merges the two
// adjacent string literals. Token pasting with ## must NOT be used here: it would
// glue the unexpanded name OUTPUT_DIR onto a string-literal token, which is not a
// valid preprocessing token and is rejected by GCC and Clang.
#define  cat(x, y)  x y

// Directory (relative to the executable, with trailing '/') that receives all
// timing files; change it to match the device being calibrated.
#define  OUTPUT_DIR  "../data/TeslaC1060/"

/*
 *  Entry point: create a context on one device, then run the full calibration.
 *
 *  usage:  <executable> [device_num]
 *
 *  device_num is optional; when it is omitted the built-in default below is used,
 *  so existing invocations without arguments behave as before.
 *
 *  returns 0 on success, 1 when device_num is not a non-negative integer.
 */
int main(int argc, char** argv)
{
	int device_num = 2 ; // default: TeslaC1060

	if ( argc > 1 ){
		// optional override, so another device can be calibrated without recompiling
		char* end = NULL ;
		long requested = strtol( argv[1], &end, 10 ) ;
		// reject empty / non-numeric / negative input and values that do not fit in int
		if ( (end == argv[1]) || ('\0' != *end) || (requested < 0) ||
			 ((long)(int)requested != requested) ){
			fprintf( stderr, "usage: %s [device_num]\n", argv[0] ) ;
			return 1 ;
		}
		device_num = (int) requested ;
	}

	initContext_Drv( device_num ) ; // initial context

// show_device_info() ; // show device information

	auto_profile( ) ; // calibrate Volkov's code and 8 methods

// alternative to auto_profile(): check a single kernel instead of profiling everything
//	check_general_sgemm_square( "../method5/method5_v3.cubin", "method5_sgemmNN", &method5_DrvWrapper ) ;

	return 0 ;
}


/*
 *  Run every profiling job one after another: CUBLAS, Volkov's kernel and its
 *  variant, then method 1 ~ 8 together with their variants.
 *
 *  Each job names the kernel module (.cubin), the kernel inside it, the wrapper
 *  used to launch it, and the report file, which is always located under OUTPUT_DIR.
 *  The sub-directories of OUTPUT_DIR are not created here.
 *
 *  NOTE(review): the report names of the method6 / method7 variants say
 *  "threads320" while the corresponding non-variant reports say "threads256";
 *  confirm which thread count is right before renaming anything.
 */
void  auto_profile( void ) 
{
// reference timing
	profile_CUBLAS_overN( cat(OUTPUT_DIR, "cublas/threads512.txt") ) ;

// Volkov's code and its variant
	profile_sgemm_square( "../volkov/volkov_sgemm.cubin", "volkov_sgemm",
		&volkov_DrvWrapper, cat(OUTPUT_DIR, "volkov/threads512.txt") ) ;

	profile_sgemm_square( "../volkov/decuda_ldsb32_cudasm.cubin", "volkov_variant_sgemmNN",
		&volkov_DrvWrapper, cat(OUTPUT_DIR, "volkov/variant_threads512.txt") ) ;

// method 1, 2, 3, 4 don't consider out-of-array bound, so we use "profile_sgemm_square"
	profile_sgemm_square( "../method1/method1.cubin", "method1_sgemmNN",
		&method1_DrvWrapper, cat(OUTPUT_DIR, "method1/threads320.txt") ) ;

	profile_sgemm_square( "../method1/decuda_ldsb32_cudasm.cubin", "method1_variant_sgemmNN",
		&method1_DrvWrapper, cat(OUTPUT_DIR, "method1/variant_threads320.txt") ) ;

	profile_sgemm_square( "../method2/method2.cubin", "method2_sgemmNN",
		&method2_DrvWrapper, cat(OUTPUT_DIR, "method2/threads256.txt") ) ;

	profile_sgemm_square( "../method2/decuda_ldsb32_cudasm.cubin", "method2_variant_sgemmNN",
		&method2_DrvWrapper, cat(OUTPUT_DIR, "method2/variant_threads256.txt") ) ;

	profile_sgemm_square( "../method3/method3.cubin", "method3_sgemmNN",
		&method3_DrvWrapper, cat(OUTPUT_DIR, "method3/threads256.txt") ) ;

	profile_sgemm_square( "../method3/decuda_ldsb32_cudasm.cubin", "method3_variant_sgemmNN",
		&method3_DrvWrapper, cat(OUTPUT_DIR, "method3/variant_threads256.txt") ) ;

	profile_sgemm_square( "../method4/method4.cubin", "method4_sgemmNN",
		&method4_DrvWrapper, cat(OUTPUT_DIR, "method4/threads192.txt") ) ;

	profile_sgemm_square( "../method4/decuda_ldsb32_cudasm.cubin", "method4_variant_sgemmNN",
		&method4_DrvWrapper, cat(OUTPUT_DIR, "method4/variant_threads192.txt") ) ;

// method 5, 6, 7, 8 consider out-of-array bound, so we use "profile_general_sgemm_square"
	profile_general_sgemm_square( "../method5/method5_v1.cubin", "method5_sgemmNN",
		&method5_DrvWrapper, cat(OUTPUT_DIR, "method5/threads384_v1.txt") ) ;

	profile_general_sgemm_square( "../method5/method5_v2.cubin", "method5_sgemmNN",
		&method5_DrvWrapper, cat(OUTPUT_DIR, "method5/threads384_v2.txt") ) ;

	profile_general_sgemm_square( "../method5/method5_v3.cubin", "method5_sgemmNN",
		&method5_DrvWrapper, cat(OUTPUT_DIR, "method5/threads512_v3.txt") ) ;

	profile_general_sgemm_square( "../method6/method6.cubin", "method6_sgemmNN",
		&method6_DrvWrapper, cat(OUTPUT_DIR, "method6/threads256.txt") ) ;

	profile_general_sgemm_square( "../method6/decuda_ldsb32_cudasm.cubin", "method6_variant_sgemmNN",
		&method6_DrvWrapper, cat(OUTPUT_DIR, "method6/variant_threads320.txt") ) ;

	profile_general_sgemm_square( "../method7/method7.cubin", "method7_sgemmNN",
		&method7_DrvWrapper, cat(OUTPUT_DIR, "method7/threads256.txt") ) ;

	profile_general_sgemm_square( "../method7/decuda_ldsb32_cudasm.cubin", "method7_variant_sgemmNN",
		&method7_DrvWrapper, cat(OUTPUT_DIR, "method7/variant_threads320.txt") ) ;

	profile_general_sgemm_square( "../method8/method8.cubin", "method8_sgemmNN",
		&method8_DrvWrapper, cat(OUTPUT_DIR, "method8/threads256.txt") ) ;

	// the variant of method 8 is profiled with the general routine as well, in line
	// with the rule stated above and with the variants of method 6 and 7
	profile_general_sgemm_square( "../method8/decuda_ldsb32_cudasm.cubin", "method8_variant_sgemmNN",
		&method8_DrvWrapper, cat(OUTPUT_DIR, "method8/variant_threads256.txt") ) ;
}

