

#include "lsc_cuda_utility.h"

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include <cuda_runtime_api.h>

// includes, project
#include <cutil.h>

typedef float2 Complex; 

void show_device_info( void )
{
	int deviceCount;
	cudaDeviceProp deviceProp;
	int device;
	
	cudaGetDeviceCount(&deviceCount);
	printf("device count = %d\n", deviceCount);
	for (device = 0; device < deviceCount; ++device) {
		cudaGetDeviceProperties(&deviceProp, device);
		printf("device %d: name = %s \n", device, deviceProp.name);	
		printf("           totalGlobalMem = %6.1f MB \n", ((double)deviceProp.totalGlobalMem)/1024./1024. );
		printf("           sharedMemPerBlock = %6.1f kB \n", ((double)deviceProp.sharedMemPerBlock)/1024. );
		printf("           totalConstMem = %6.1f kB \n", ((double)deviceProp.totalConstMem)/1024. );	
		printf("           regsPerBlock = %d \n", deviceProp.regsPerBlock );	
		printf("           warpSize = %d \n", deviceProp.warpSize );	
		printf("           maxThreadsPerBlock = %d \n", deviceProp.maxThreadsPerBlock );	
		printf("           maxThreadsDim[3] = (%d, %d, %d) \n", 
					deviceProp.maxThreadsDim[0],
					deviceProp.maxThreadsDim[1],
					deviceProp.maxThreadsDim[2] );	
		printf("           maxGridSize[3] = (%d, %d, %d) \n", 
					deviceProp.maxGridSize[0],
					deviceProp.maxGridSize[1],
					deviceProp.maxGridSize[2] );				
		printf("           compute capability = %d.%d \n", deviceProp.major, deviceProp.minor );	
		printf("           multiProcessorCount (SM) = %d \n", deviceProp.multiProcessorCount );	
		printf("           clockRate = %6.1f MHz\n", ((double)deviceProp.clockRate)/1024. );	
		printf("           kernelExecTimeoutEnabled = %d \n", deviceProp.kernelExecTimeoutEnabled );	
	}// for each device
	
}	

// Fills data[0 .. size-1] with pseudo-random complex values.
//
//   data : caller-allocated array of at least `size` elements (this function
//          does not allocate anything)
//   size : number of elements to fill
//
// The real (.x) and imaginary (.y) parts are each drawn from rand() and scaled
// by RAND_MAX. The sequence depends on the current rand() seed; this function
// does not call srand().
void randomInit(Complex* data, unsigned long long int size)
{

    // The index has the same type as `size`: an `int` index would overflow
    // (undefined behaviour) before reaching a bound larger than INT_MAX.
    for (unsigned long long int i = 0; i < size; ++i){
        data[i].x = (float)rand() / (float)RAND_MAX;
        data[i].y = (float)rand() / (float)RAND_MAX;
    }

}


// Larger / smaller of two values. These are function-like macros, so each
// argument may be evaluated twice: do not pass expressions with side effects.
#define qMax(a, b)	((b) < (a) ? (a) : (b))
#define qMin(a, b)	((a) < (b) ? (a) : (b))
// Square of x. The outer parentheses make the expansion a single operand, so
// that e.g. `a / SQUARE(b)` means a / (b*b) rather than (a / b) * b.
// x is evaluated twice.
#define SQUARE(x)  ((x)*(x))

// Compares two m-by-n complex matrices element by element in the max (sup) norm.
//
//   m, n    : number of rows / columns to compare
//   A, lda  : reference matrix; element (i,j) is read from A[j*lda + i]
//   B, ldb  : matrix under test; element (i,j) is read from B[j*ldb + i]
//   max_err     (out): largest |A(i,j) - B(i,j)| over all compared elements
//   rel_max_err (out): max_err divided by the largest |A(i,j)|
//
// Special cases:
//   - If both max_err and the largest |A(i,j)| are zero (e.g. m or n is 0, or
//     A and B are both all-zero), rel_max_err is 0 rather than the NaN that
//     0/0 would produce.
//   - If any element magnitude is NaN, the corresponding maximum is NaN, so an
//     invalid entry cannot be hidden by later, valid entries.
void compare_supnorm( int m, int n, Complex *A , int lda, Complex *B, int ldb,
       float &max_err, float &rel_max_err )
{
	Complex aij, bij ;
	float abs_aij ;
	Complex aij_minus_bij ;
	float abs_err = 0.0f ;
	float max_ref = 0.0f ;
 		
	max_err = 0.0f ;
	// Columns form the outer loop so that the inner loop walks consecutive
	// addresses (the row index i is the fastest-varying one in j*ld + i).
	for( int j = 0 ; j < n ; j++){
		for( int i = 0 ; i < m ; i++){
			// Index in 64-bit arithmetic: j*lda can exceed the range of int
			// for large matrices.
			aij = A[ (long long)j * lda + i ] ;
			bij = B[ (long long)j * ldb + i ] ;
			abs_aij = sqrt( SQUARE(aij.x) + SQUARE(aij.y) ) ;
			// `v != v` is true only for NaN. A plain max would replace a NaN
			// accumulator with the next finite value; this keeps it.
			if ( abs_aij > max_ref || abs_aij != abs_aij ){
				max_ref = abs_aij ;
			}
			
			aij_minus_bij.x = aij.x - bij.x ;
			aij_minus_bij.y = aij.y - bij.y ;
			
			abs_err = sqrt( SQUARE(aij_minus_bij.x) + SQUARE(aij_minus_bij.y) ) ;
			if ( abs_err > max_err || abs_err != abs_err ){
				max_err = abs_err ;
			}
		}// for each row
	}// for each column

	if ( 0.0f == max_ref && 0.0f == max_err ){
		// Nothing to normalise against and no error: avoid 0/0.
		rel_max_err = 0.0f ;
	}else{
		// A zero reference with a non-zero error still yields +inf here.
		rel_max_err = max_err / max_ref ;
	}
}

// Initializes the CUDA driver API and creates a context on one device.
//
//   device_num : ordinal of the device to use; must satisfy
//                0 <= device_num < number of devices
//
// On success the device ordinal and name are printed to stdout. The context
// handle is neither returned nor destroyed here.
// NOTE(review): per the driver API documentation, cuCtxCreate makes the new
// context current to the calling thread -- callers presumably rely on that.
//
// Any failure (driver initialization, no device, invalid device_num, device
// query, context creation) writes a message to stderr and terminates the
// process with EXIT_FAILURE.
void  initContext_Drv( int device_num ) 
{
	CUresult status ;

// Initialize
	status = cuInit(0) ;
	if ( CUDA_SUCCESS != status ){
		fprintf(stderr, "Error: cuInit fail (CUresult = %d)\n", (int)status);
		exit(EXIT_FAILURE) ;
	}

// Get number of devices supporting CUDA
	int deviceCount = 0;
	status = cuDeviceGetCount(&deviceCount);
	if ( CUDA_SUCCESS != status || deviceCount == 0 ) {
		fprintf(stderr, "There is no device supporting CUDA.\n");
		exit(EXIT_FAILURE) ;
	}

// Reject ordinals the driver did not report before asking for a handle
	if ( device_num < 0 || device_num >= deviceCount ){
		fprintf(stderr, "Error: device %d out of range, device count = %d\n",
				device_num, deviceCount);
		exit(EXIT_FAILURE) ;
	}

// Get handle for device "device_num"
	CUdevice cuDevice = 0;
	status = cuDeviceGet(&cuDevice, device_num);
	if ( CUDA_SUCCESS != status ){
		fprintf(stderr, "Error: cuDeviceGet fail (CUresult = %d)\n", (int)status);
		exit(EXIT_FAILURE) ;
	}

	char dev_name[128] ;
	status = cuDeviceGetName(dev_name, 128, cuDevice) ;
	if ( CUDA_SUCCESS != status ){
		// dev_name would be uninitialized; do not print it.
		fprintf(stderr, "Error: cuDeviceGetName fail (CUresult = %d)\n", (int)status);
		exit(EXIT_FAILURE) ;
	}
	printf("device %d, device name = %s\n",  device_num, dev_name );	

// Create context
	CUcontext cuContext;
	status = cuCtxCreate(&cuContext, 0, cuDevice);
	if ( CUDA_SUCCESS != status ){
		fprintf(stderr, "Error: context fail\n");
		exit(EXIT_FAILURE) ;
	}
}

