

/*
// volkov_DrvWrapper.cpp

   note: CGEMM deals with type "complex"
 
   because alpha and beta are complex (float2) rather than float,
   we must change 
		cuParamSetf(hfunc, offset, alpha);
		cuParamSetf(hfunc, offset, beta);
   to

	cuParamSetv(hfunc, offset, &alpha, sizeof(alpha)) ;
	cuParamSetv(hfunc, offset, &beta, sizeof(beta)) ;

 */


#include <cstddef>
#include <cstdio>

#include <cutil_inline.h>

// Complex value type used for the alpha and beta kernel arguments: a CUDA float2
// (presumably .x = real part, .y = imaginary part -- confirm against the kernel).
typedef float2 Complex;

// Rounds `offset` up, in place, to the next multiple of `alignment` and
// evaluates to the new value. The mask trick only works when `alignment`
// is a power of two. `offset` must be a modifiable lvalue; both arguments
// are evaluated more than once, so pass side-effect-free expressions.
// The whole expansion is parenthesized so that the macro also behaves
// correctly when it appears inside a larger expression.
#define ALIGN_UP(offset, alignment)  \
		((offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1))


void  volkov_DrvWrapper(CUfunction hfunc,
		CUdeviceptr C, CUdeviceptr A, CUdeviceptr B, int hA, int wA, int wB,
		int lda, int ldb, int ldc,
		Complex alpha, Complex beta )
{
// setup parameter 
	int m = hA ;
	int n = wB ;
	int offset = 0 ;
	void* ptr;

// parameter 1: m = # of row of C
	ALIGN_UP(offset, __alignof(m));
	cuParamSeti(hfunc, offset, m) ;
	offset += sizeof(m);

// parameter 2: n = # of column of C
	ALIGN_UP(offset, __alignof(n));
	cuParamSeti(hfunc, offset, n) ;
	offset += sizeof(n);

// parameter 3: const Complex *A 
	ptr = (void*)(size_t)A;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(hfunc, offset, &ptr, sizeof(ptr)) ;
	offset += sizeof( ptr );

// parameter 4: int lda 
	ALIGN_UP(offset, __alignof(lda));
	cuParamSeti(hfunc, offset, lda);
	offset += sizeof(lda);

// parameter 5: const Complex *B
	ptr = (void*)(size_t)B;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(hfunc, offset, &ptr, sizeof(ptr)) ;
	offset += sizeof(ptr);

// parameter 6: int ldb
	ALIGN_UP(offset, __alignof(ldb));
	cuParamSeti(hfunc, offset, ldb);
	offset += sizeof(ldb);

// parameter 7: Complex *C
	ptr = (void*)(size_t)C;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(hfunc, offset, &ptr, sizeof(ptr)) ;
	offset += sizeof(ptr);

// parameter 8: int ldc
	ALIGN_UP(offset, __alignof(ldc));
	cuParamSeti(hfunc, offset, ldc);
	offset += sizeof(ldc);

// parameter 9: int k: # of column of A
	ALIGN_UP(offset, __alignof(wA));
	cuParamSeti(hfunc, offset, wA);
	offset += sizeof(wA);

// parameter 10: Complex alpha 
	ALIGN_UP(offset, __alignof(alpha));
	cuParamSetv(hfunc, offset, &alpha, sizeof(alpha)) ;
	offset += sizeof(alpha);
	
// parameter 11: Complex beta
	ALIGN_UP(offset, __alignof(beta));
	cuParamSetv(hfunc, offset, &beta, sizeof(beta)) ;
	offset += sizeof(beta);
	
// parameter 12: Complex* B_bound = B + ldb * n * sizeof(Complex) 	
	CUdeviceptr B_bound = B + ldb * n * sizeof(Complex) ;
	ptr = (void*)(size_t)B_bound;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(hfunc, offset, &ptr, sizeof(ptr)) ;
	offset += sizeof(ptr);	
	
		
	cuParamSetSize(hfunc, offset);

// setup execution configuration
	cuFuncSetBlockShape(hfunc, 16, 4, 1);
//	cuFuncSetSharedSize(hfunc, 0); // no shared memory

	cuLaunchGrid(hfunc, (m+63)/64, (n+15)/16 ) ;
}

