


// rank1_update_volkov_unroll1.cpp

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#include <cutil_inline.h>
typedef float2 Complex;

/*
 * Emit "<op> $r<dst>, <shared-memory operand>" without the rest of the
 * instruction.  The operand is written as s[$ofs<ofs_id>+0x....] when the
 * base offset `ofs` is non-zero and as the absolute form s[0x....] when
 * it is zero; `res` is the residue printed as a 4-digit hex immediate.
 */
static void rank1_emit_smem_head( FILE *fout, const char *op, int dst,
	int ofs_id, int ofs, int res )
{
	fprintf( fout, "%s $r%d, ", op, dst ) ;
	if ( 0 != ofs ){
		fprintf( fout, "s[$ofs%d+0x%.4x]", ofs_id, (unsigned)res ) ;
	}else{
		fprintf( fout, "s[0x%.4x]", (unsigned)res ) ;
	}
}

/*
 * Generate the instruction text of a 16 x 16 complex rank-1 update and
 * write it to "output.txt" in the current directory (the file is
 * truncated first).
 *
 * Layout of the generated text:
 *   1. pre_code, verbatim
 *   2. shl.u32 $r2, s[lda_loc], 3
 *   3. 16 outer steps, each of which
 *        - loads a 64-bit value from g[$r<A>] into ($r0, $r1),
 *        - adds $r2 to $r<A>,
 *        - for 16 values of j: reads b1/b2 from shared memory, forms the
 *          complex product in two scratch registers and accumulates it
 *          into registers c[2*j] and c[2*j+1],
 *        - advances b1_ptr / b2_ptr by 17 * sizeOfsmem
 *   4. post_code, verbatim
 *
 * Parameters
 *   c           register numbers of the accumulators; entries 0..31 are read
 *   A           number of the register that addresses global memory
 *   lda_loc     shared-memory address printed in the shl operand
 *   sizeOfsmem  byte size of one shared-memory element (address scale)
 *   b1_ptr      starting shared-memory byte address of the first b operand
 *   b2_ptr      starting shared-memory byte address of the second b operand
 *   pre_code    text written before the generated code (NULL = nothing)
 *   post_code   text written after the generated code (NULL = nothing)
 *
 * On failure to open, write or close the output file an error message is
 * printed to stderr and the process exits with status 1.
 */
static void rank1_update( const int *c, int A, int lda_loc, int sizeOfsmem,
	int b1_ptr, int b2_ptr, const char *pre_code, const char *post_code )
{
	/*
	 * TILE       : trip count of both the outer (i) and inner (j) loops
	 * OFS_ALIGN  : a shared-memory address is split into a multiple of
	 *              0x80 (kept in $ofs1/$ofs2) and a residue below 0x80
	 * B_ROW_STEP : elements skipped in b1/b2 after each outer step
	 */
	enum { TILE = 16, OFS_ALIGN = 0x80, B_ROW_STEP = 17 } ;

	const int k1 = 1 ;          /* unroll factor of the j loop */
	const int j_step = k1 ;     /* TILE must be a multiple of j_step */
	/* Scratch registers; pair (2*k, 2*k+1) holds the k-th complex product. */
	const int freeReg[21] = {3, 4, 5, 6, 7,
		48, 49,
		50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
		60, 61, 62, 63 } ;
	const char outFile[] = "output.txt" ;
	int ofs1_cur = 0 ;          /* value last loaded into $ofs1 (0 = none) */
	int ofs2_cur = 0 ;          /* value last loaded into $ofs2 (0 = none) */
	int write_failed ;
	int i, j, k ;
	FILE *fout ;

	fout = fopen( outFile, "w" ) ;
	if ( NULL == fout ){
		fprintf( stderr, "Error: File %s is not open\n", outFile ) ;
		exit(1) ;
	}

	if ( NULL != pre_code ){
		fputs( pre_code, fout ) ;
	}

	/* $r2 = s[lda_loc] << 3 (original note: lda * sizeof( Complex )) */
	fprintf( fout, "shl.u32 $r2, s[0x%.4x], 0x00000003\n", (unsigned)lda_loc ) ;

	for ( i = 0 ; i < TILE ; i++ ){
		/* ($r0, $r1) = A[0] ; $r0 = A_reg.x, $r1 = A_reg.y */
		fprintf( fout, "mov.b64 $r0, g[$r%d]\n", A ) ;
		/* A += lda */
		fprintf( fout, "add.u32 $r%d, $r2, $r%d\n", A, A ) ;

		for ( j = 0 ; j < TILE ; j += j_step ){
			/* step 1: form the k1 complex products in scratch registers */
			for ( k = 0 ; k < k1 ; k++ ){
				const int re = freeReg[2*k] ;
				const int im = freeReg[2*k+1] ;
				const int addr1 = b1_ptr + (j+k) * sizeOfsmem ;
				const int b1_res = addr1 % OFS_ALIGN ;
				const int ofs1 = addr1 - b1_res ;
				const int addr2 = b2_ptr + (j+k) * sizeOfsmem ;
				const int b2_res = addr2 % OFS_ALIGN ;
				const int ofs2 = addr2 - b2_res ;

				/* Reload an offset register only when its base changes;
				 * a zero base uses the absolute s[0x....] form instead. */
				if ( (0 != ofs1) && (ofs1_cur != ofs1) ){
					ofs1_cur = ofs1 ;
					fprintf( fout, "mov.b32 $ofs1, 0x%.8x\n", (unsigned)ofs1_cur ) ;
				}
				if ( (0 != ofs2) && (ofs2_cur != ofs2) ){
					ofs2_cur = ofs2 ;
					fprintf( fout, "mov.b32 $ofs2, 0x%.8x\n", (unsigned)ofs2_cur ) ;
				}

				/* re = b_reg_y * A_reg.y */
				rank1_emit_smem_head( fout, "mul.rn.f32", re, 2, ofs2, b2_res ) ;
				fprintf( fout, ", $r1\n" ) ;

				/* re = b_reg_x * A_reg.x - re */
				rank1_emit_smem_head( fout, "mad.rn.f32", re, 1, ofs1, b1_res ) ;
				fprintf( fout, ", $r0, -$r%d\n", re ) ;

				/* im = b_reg_x * A_reg.y */
				rank1_emit_smem_head( fout, "mul.rn.f32", im, 1, ofs1, b1_res ) ;
				fprintf( fout, ", $r1\n" ) ;

				/* im = b_reg_y * A_reg.x + im */
				rank1_emit_smem_head( fout, "mad.rn.f32", im, 2, ofs2, b2_res ) ;
				fprintf( fout, ", $r0, $r%d\n", im ) ;
			} /* for k */

			/* step 2: accumulate the products into the c registers */
			for ( k = 0 ; k < k1 ; k++ ){
				const int c_re = c[2*j + 2*k] ;
				const int c_im = c[2*j + 2*k + 1] ;

				/* c[2*j] += re */
				fprintf( fout, "add.rn.f32 $r%d, $r%d, $r%d\n",
					c_re, c_re, freeReg[2*k] ) ;
				/* c[2*j+1] += im */
				fprintf( fout, "add.rn.f32 $r%d, $r%d, $r%d\n",
					c_im, c_im, freeReg[2*k+1] ) ;
			} /* for k */
		} /* for j */

		/* b1_ptr += 17 ; b2_ptr += 17 (in elements) */
		b1_ptr += B_ROW_STEP * sizeOfsmem ;
		b2_ptr += B_ROW_STEP * sizeOfsmem ;
	} /* for i */

	if ( NULL != post_code ){
		fputs( post_code, fout ) ;
	}

	/* A short write would leave a silently truncated program: check the
	 * stream error flag and the result of the final flush in fclose. */
	write_failed = ferror( fout ) ;
	if ( 0 != fclose( fout ) ){
		write_failed = 1 ;
	}
	if ( write_failed ){
		fprintf( stderr, "Error: failed while writing %s\n", outFile ) ;
		exit(1) ;
	}
}


/*
 see /method1/from_decuda_149_1736.asm
 r45 = A 
 s[0x20] = lda
 b1_ptr = 0x60
 b2_ptr = 0x4a0

*/
void rank1_update_volkov_unroll1(void)
{
	/*
	 * Register number assigned to each of the 32 accumulator slots,
	 * listed in slot order (slot 0 first).  The values are passed to
	 * rank1_update(), which prints them as $r<number>.
	 */
	int c_regs[32] = {
		/* slots  0.. 5 */ 47, 46, 44, 43, 42, 41,
		/* slots  6..12 */ 36, 35, 34, 33, 32, 31, 30,
		/* slots 13..19 */ 29, 27, 26, 25, 24, 23, 22,
		/* slots 20..29 */ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
		/* slots 30..31 */ 20, 21
	} ;

	/* Text placed before and after the generated update code. */
	char prologue[] = "set.le.s32 $p0|$o127, $r38, c1[0x0008]\n@$p0.ne bra.label label5\n" ;
	char epilogue[] = "bar.sync.u32 0x00000000\nadd.b32 $r38, $r38, 0xfffffff0\n" ;

	int a_reg = 45 ;                       /* r45 = A */
	int lda_addr = 0x20 ;                  /* s[0x20] = lda */
	int elem_bytes = sizeof( float ) ;     /* size of one shared-memory element */
	int b1_start = 0x60 ;
	int b2_start = 0x4a0 ;

	rank1_update( c_regs, a_reg, lda_addr, elem_bytes,
		b1_start, b2_start, prologue, epilogue ) ;
}