


// volkov_device.cpp

#include <cstdio>

#include <cutil_inline.h>

// Rounds the variable `off` up, in place, to the next multiple of `align`.
// `align` must be a power of two: the ~(align - 1) mask only clears the low
// bits correctly in that case. `off` is expanded more than once, so pass a
// plain variable without side effects.
#define ALIGN_UP(off, align)  off = ((off) + (align) - 1) & ~((align) - 1)


void  volkov_DrvWrapper(CUfunction hfunc,
		CUdeviceptr C, CUdeviceptr A, CUdeviceptr B, int hA, int wA, int wB,
		int lda, int ldb, int ldc,
		float alpha, float beta )
{
// setup parameter 
	int m = hA ;
	int n = wB ;
	int offset = 0 ;
	void* ptr;

// parameter 1: m = # of row of C
	ALIGN_UP(offset, __alignof(m));
	cuParamSeti(hfunc, offset, m) ;
	offset += sizeof(m);

// parameter 2: n = # of column of C
	ALIGN_UP(offset, __alignof(n));
	cuParamSeti(hfunc, offset, n) ;
	offset += sizeof(n);

// parameter 3: const float *A 
	ptr = (void*)(size_t)A;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(hfunc, offset, &ptr, sizeof(ptr)) ;
	offset += sizeof( ptr );

// parameter 4: int lda 
	ALIGN_UP(offset, __alignof(lda));
	cuParamSeti(hfunc, offset, lda);
	offset += sizeof(lda);

// parameter 5: const float *B
	ptr = (void*)(size_t)B;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(hfunc, offset, &ptr, sizeof(ptr)) ;
	offset += sizeof(ptr);

// parameter 6: int ldb
	ALIGN_UP(offset, __alignof(ldb));
	cuParamSeti(hfunc, offset, ldb);
	offset += sizeof(ldb);

// parameter 7: float *C
	ptr = (void*)(size_t)C;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(hfunc, offset, &ptr, sizeof(ptr)) ;
	offset += sizeof(ptr);

// parameter 8: int ldc
	ALIGN_UP(offset, __alignof(ldc));
	cuParamSeti(hfunc, offset, ldc);
	offset += sizeof(ldc);

// parameter 9: int k: # of column of A
	ALIGN_UP(offset, __alignof(wA));
	cuParamSeti(hfunc, offset, wA);
	offset += sizeof(wA);

// parameter 10: float alpha 
	ALIGN_UP(offset, __alignof(alpha));
	cuParamSetf(hfunc, offset, alpha);
	offset += sizeof(alpha);

// parameter 11: float beta
	ALIGN_UP(offset, __alignof(beta));
	cuParamSetf(hfunc, offset, beta);
	offset += sizeof(beta);

	cuParamSetSize(hfunc, offset);

// setup execution configuration
	cuFuncSetBlockShape(hfunc, 16, 4, 1);
//	cuFuncSetSharedSize(hfunc, 0); // no shared memory

	cuLaunchGrid(hfunc, (m+63)/64, (n+15)/16 ) ;
}

