

// method4_DrvWrapper.cpp

#include <cstdio>

#include <cutil_inline.h>

// Tiling constants used by method4_DrvWrapper to size the launch grid.
// grid.x is the ceiling of m / (VECTOR_LENGTH * NUM_VECTOR), and
// VECTOR_LENGTH * grid.x is passed to the kernel as nextBlockStride_A.
#define VECTOR_LENGTH  64
#define NUM_VECTOR     4 
// grid.y is the ceiling of n / BLOCK_SIZE_Y.
#define BLOCK_SIZE_Y   16	

// Not referenced by the host code below except through THREAD_BLOCK_X / THREAD_BLOCK_Y.
// NOTE(review): presumably mirrors a tile width used by the kernel -- confirm against the kernel source.
#define BLOCK_SIZE_X   16

// Thread-block shape handed to cuFuncSetBlockShape (x, y, 1).
#define THREAD_BLOCK_X  16   // THREAD_BLOCK_X = BLOCK_SIZE_X
#define THREAD_BLOCK_Y  4    // THREAD_BLOCK_Y = VECTOR_LENGTH / BLOCK_SIZE_X


// Rounds `offset` up to the next multiple of `alignment` and stores the result
// back into `offset`. The mask arithmetic is only correct when `alignment` is a
// power of two. `offset` must be a modifiable lvalue; the macro expands to an
// assignment expression without a trailing semicolon.
#define ALIGN_UP(offset, alignment)  \
		offset = ((offset) + (alignment) - 1) & ~((alignment) - 1) 

void  method4_DrvWrapper(CUfunction hfunc,
		CUdeviceptr C, CUdeviceptr A, CUdeviceptr B, int hA, int wA, int wB,
		int lda, int ldb, int ldc,
		float alpha, float beta )
{
// setup parameter 
	int m = hA ;
	int n = wB ;
	int offset = 0 ;
	void* ptr;

// parameter 1: m = # of row of C
	ALIGN_UP(offset, __alignof(m));
	cuParamSeti(hfunc, offset, m) ;
	offset += sizeof(m);

// parameter 2: n = # of column of C
	ALIGN_UP(offset, __alignof(n));
	cuParamSeti(hfunc, offset, n) ;
	offset += sizeof(n);

// parameter 3: const float *A 
	ptr = (void*)(size_t)A;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(hfunc, offset, &ptr, sizeof(ptr)) ;
	offset += sizeof( ptr );

// parameter 4: int lda 
	ALIGN_UP(offset, __alignof(lda));
	cuParamSeti(hfunc, offset, lda);
	offset += sizeof(lda);

// parameter 5: const float *B
	ptr = (void*)(size_t)B;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(hfunc, offset, &ptr, sizeof(ptr)) ;
	offset += sizeof(ptr);

// parameter 6: int ldb
	ALIGN_UP(offset, __alignof(ldb));
	cuParamSeti(hfunc, offset, ldb);
	offset += sizeof(ldb);

// parameter 7: float *C
	ptr = (void*)(size_t)C;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(hfunc, offset, &ptr, sizeof(ptr)) ;
	offset += sizeof(ptr);

// parameter 8: int ldc
	ALIGN_UP(offset, __alignof(ldc));
	cuParamSeti(hfunc, offset, ldc);
	offset += sizeof(ldc);

// parameter 9: int k: # of column of A
	ALIGN_UP(offset, __alignof(wA));
	cuParamSeti(hfunc, offset, wA);
	offset += sizeof(wA);

// parameter 10: float alpha 
	ALIGN_UP(offset, __alignof(alpha));
	cuParamSetf(hfunc, offset, alpha);
	offset += sizeof(alpha);

// parameter 11: float beta
	ALIGN_UP(offset, __alignof(beta));
	cuParamSetf(hfunc, offset, beta);
	offset += sizeof(beta);

// parameter 12: int nextBlockStride_A = VECTOR_LENGTH * grid.x 
	dim3 grid( (m+VECTOR_LENGTH*NUM_VECTOR-1)/(VECTOR_LENGTH*NUM_VECTOR), 
			   (n+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y ) ;
	
	int nextBlockStride_A = VECTOR_LENGTH * grid.x ;
	ALIGN_UP(offset, __alignof(nextBlockStride_A));
	cuParamSeti(hfunc, offset, nextBlockStride_A);
	offset += sizeof(nextBlockStride_A);

// parameter 13: int nextColumnStride_A = lda - 3* nextBlockStride_A 
	int nextColumnStride_A = lda - 3* nextBlockStride_A ;
	ALIGN_UP(offset, __alignof(nextColumnStride_A));
	cuParamSeti(hfunc, offset, nextColumnStride_A);
	offset += sizeof(nextColumnStride_A);


	cuParamSetSize(hfunc, offset);

// setup execution configuration

// dim3 threads( THREAD_BLOCK_X, THREAD_BLOCK_Y );
	cuFuncSetBlockShape(hfunc, THREAD_BLOCK_X, THREAD_BLOCK_Y, 1);

//	cuFuncSetSharedSize(hfunc, 0); // no shared memory

// dim3 grid( (m+VECTOR_LENGTH*NUM_VECTOR-1)/(VECTOR_LENGTH*NUM_VECTOR), (n+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y ) ;
	cuLaunchGrid(hfunc, grid.x,  grid.y ) ;
}
