

// rank1_update_method3.cpp

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#include <cutil_inline.h>
/* Single-precision complex value stored as a float2.
 * float2 is not declared in this file -- presumably it comes from the CUDA
 * headers pulled in through cutil_inline.h (TODO confirm).
 * In this file the name only appears in comments ("sizeof( Complex )"). */
typedef float2 Complex;


/* Fixed geometry of the generated code. */
enum {
	R1U_TILE_DIM   = 16,	/* trip count of both the outer (i) and inner (j) loops */
	R1U_ROW_PITCH  = 17,	/* elements between b[i][0] and b[i+1][0] in shared memory */
	R1U_OFS_WINDOW = 0x80	/* an address is split into base (multiple of this) + residue;
				   presumably the reach of the immediate displacement -- TODO confirm */
};

/* A shared-memory byte address in the form the generated code refers to it. */
struct r1u_smem_ref {
	int reg ;	/* N of the address register $ofsN used when base != 0 */
	int base ;	/* addr rounded down to R1U_OFS_WINDOW; 0 selects direct addressing */
	int res ;	/* addr % R1U_OFS_WINDOW, emitted as the immediate displacement */
};

/* Split addr into base + residue and, when the base is non-zero and differs
 * from what $ofs<reg> currently holds (*loaded), emit the mov that reloads it. */
static struct r1u_smem_ref r1u_locate( FILE *fout, int reg, int addr, int *loaded )
{
	struct r1u_smem_ref ref ;

	ref.reg  = reg ;
	ref.res  = addr % R1U_OFS_WINDOW ;
	ref.base = addr - ref.res ;
	if ( (0 != ref.base) && (*loaded != ref.base) ){
		*loaded = ref.base ;
		fprintf( fout, "mov.b32 $ofs%d, 0x%.8x\n", reg, (unsigned)ref.base ) ;
	}
	return ref ;
}

/* Emit "<head><shared-memory operand><tail>", writing the operand as
 * s[$ofsN+0xRRRR] when a base register is in use and s[0xRRRR] otherwise. */
static void r1u_emit_smem_op( FILE *fout, const char *head,
	const struct r1u_smem_ref *ref, const char *tail )
{
	fputs( head, fout ) ;
	if ( 0 != ref->base ){
		fprintf( fout, "s[$ofs%d+0x%.4x]", ref->reg, (unsigned)ref->res ) ;
	}else{
		fprintf( fout, "s[0x%.4x]", (unsigned)ref->res ) ;
	}
	fputs( tail, fout ) ;
}

/*
 * Write a fully unrolled 16-step complex rank-1 update, as assembly text
 * (the syntax looks like decuda output), to the file "output.txt".
 *
 * Layout of the file: pre_code, one "shl" that puts lda*8 into $r4, then
 * 8 outer steps (two A elements each) x 16 inner steps, then post_code.
 * For every pair of rows (i, i+1) and every column j the emitted code does
 *     c[2j]   += b1[i][j]*A_i.x   - b2[i][j]*A_i.y
 *              + b1[i+1][j]*A_i1.x - b2[i+1][j]*A_i1.y
 *     c[2j+1] += b1[i][j]*A_i.y   + b2[i][j]*A_i.x
 *              + b1[i+1][j]*A_i1.y + b2[i+1][j]*A_i1.x
 * i.e. b1 supplies the .x and b2 the .y component of each b element.
 *
 * c           table of 32 register numbers; c[2j] / c[2j+1] accumulate the
 *             .x / .y result of column j (only read here)
 * A           number of the register that addresses A in global memory; the
 *             generated code advances it by $r4 after every load
 * lda_loc     shared-memory byte address holding lda
 * sizeOfsmem  size in bytes of one shared-memory element
 * b1_ptr      shared-memory byte address of b1[0][0]
 * b2_ptr      shared-memory byte address of b2[0][0]
 * pre_code    text copied verbatim before the generated code (NULL = none)
 * post_code   text copied verbatim after the generated code (NULL = none)
 *
 * The generated code uses $r0-$r7 and $ofs1-$ofs4 as scratch.
 * Calls exit(1) if the file cannot be opened or cannot be written completely.
 */
static void rank1_update( const int *c, int A, int lda_loc, int sizeOfsmem,
	int b1_ptr, int b2_ptr, const char *pre_code, const char *post_code )
{
	const char outFile[] = "output.txt" ;
	const int rowBytes = R1U_ROW_PITCH * sizeOfsmem ;
	int loaded[4] = { 0, 0, 0, 0 } ;	/* value last moved into $ofs1..$ofs4 */
	int b3_ptr = b1_ptr + rowBytes ;	/* b1[1][0] */
	int b4_ptr = b2_ptr + rowBytes ;	/* b2[1][0] */
	int writeFailed ;
	int i, j ;
	FILE *fout = fopen( outFile, "w" ) ;

	if ( NULL == fout ){
		fprintf( stderr, "Error: File %s is not open\n", outFile ) ;
		exit(1) ;
	}

	if ( NULL != pre_code ){
		fprintf( fout, "%s", pre_code ) ;
	}

	/* $r4 <-- lda * 8 (byte stride between the A elements that are loaded) */
	fprintf( fout, "shl.u32 $r4, s[0x%.4x], 0x00000003\n", (unsigned)lda_loc ) ;

	for ( i = 0 ; i < R1U_TILE_DIM ; i += 2 ){
		/* ($r0,$r1) <-- A element for row i,   then A += lda */
		fprintf( fout, "mov.b64 $r0, g[$r%d]\n", A ) ;
		fprintf( fout, "add.u32 $r%d, $r4, $r%d\n", A, A ) ;
		/* ($r2,$r3) <-- A element for row i+1, then A += lda */
		fprintf( fout, "mov.b64 $r2, g[$r%d]\n", A ) ;
		fprintf( fout, "add.u32 $r%d, $r4, $r%d\n", A, A ) ;

		for ( j = 0 ; j < R1U_TILE_DIM ; j++ ){
			const int colBytes = j * sizeOfsmem ;
			const int cx = c[2*j] ;		/* register accumulating the .x part */
			const int cy = c[2*j+1] ;	/* register accumulating the .y part */

			/* Reload address registers first, in the order $ofs1..$ofs4. */
			const struct r1u_smem_ref b1 = r1u_locate( fout, 1, b1_ptr + colBytes, &loaded[0] ) ;
			const struct r1u_smem_ref b2 = r1u_locate( fout, 2, b2_ptr + colBytes, &loaded[1] ) ;
			const struct r1u_smem_ref b3 = r1u_locate( fout, 3, b3_ptr + colBytes, &loaded[2] ) ;
			const struct r1u_smem_ref b4 = r1u_locate( fout, 4, b4_ptr + colBytes, &loaded[3] ) ;

			/* The row-(i+1) chain (through $r6, straight into c) is interleaved
			 * with the row-i chain (through $r5/$r7); keep this exact order. */

			/* $r6 <-- b2[i+1][j] */
			r1u_emit_smem_op( fout, "lds.b32 $r6, ", &b4, "\n" ) ;
			/* cx <-- b2[i+1][j] * $r3 - cx   (sign is restored two steps below) */
			fprintf( fout, "mad.rn.f32 $r%d, $r6, $r3, -$r%d\n", cx, cx ) ;
			/* $r5 <-- b2[i][j] * $r1 */
			r1u_emit_smem_op( fout, "mul.rn.f32 $r5, ", &b2, ", $r1\n" ) ;
			/* cy <-- b2[i+1][j] * $r2 + cy */
			fprintf( fout, "mad.rn.f32 $r%d, $r6, $r2, $r%d\n", cy, cy ) ;
			/* $r5 <-- b1[i][j] * $r0 - $r5 */
			r1u_emit_smem_op( fout, "mad.rn.f32 $r5, ", &b1, ", $r0, -$r5\n" ) ;
			/* $r6 <-- b1[i+1][j] */
			r1u_emit_smem_op( fout, "lds.b32 $r6, ", &b3, "\n" ) ;
			/* cx <-- b1[i+1][j] * $r2 - cx */
			fprintf( fout, "mad.rn.f32 $r%d, $r6, $r2, -$r%d\n", cx, cx ) ;
			/* $r7 <-- b1[i][j] * $r1 */
			r1u_emit_smem_op( fout, "mul.rn.f32 $r7, ", &b1, ", $r1\n" ) ;
			/* cy <-- b1[i+1][j] * $r3 + cy */
			fprintf( fout, "mad.rn.f32 $r%d, $r6, $r3, $r%d\n", cy, cy ) ;
			/* $r7 <-- b2[i][j] * $r0 + $r7 */
			r1u_emit_smem_op( fout, "mad.rn.f32 $r7, ", &b2, ", $r0, $r7\n" ) ;
			/* cx += $r5 ; cy += $r7 */
			fprintf( fout, "add.rn.f32 $r%d, $r%d, $r5\n", cx, cx ) ;
			fprintf( fout, "add.rn.f32 $r%d, $r%d, $r7\n", cy, cy ) ;
		}// for j

		/* two rows were consumed: advance every row pointer by two pitches */
		b1_ptr += 2 * rowBytes ;
		b2_ptr += 2 * rowBytes ;
		b3_ptr += 2 * rowBytes ;
		b4_ptr += 2 * rowBytes ;
	}// for i

	if ( NULL != post_code ){
		fprintf( fout, "%s", post_code ) ;
	}

	/* A short write must not pass for a complete listing: check the stream
	 * error flag and the flush done by fclose before reporting success. */
	writeFailed = ferror( fout ) ;
	if ( 0 != fclose( fout ) ){
		writeFailed = 1 ;
	}
	if ( writeFailed ){
		fprintf( stderr, "Error: File %s could not be written\n", outFile ) ;
		exit(1) ;
	}
}


/*
// O.K.
static void rank1_update( int *c, int A, int lda_loc, int sizeOfsmem,
	int b1_ptr, int b2_ptr, char *pre_code, char *post_code )
{
	int i, j ;
	int x ;
	int ofs1, b1_res ;
	int ofs2, b2_res ;
	int ofs3, b3_res ;
	int ofs4, b4_res ;
	int ofs1_cur = 0 ;
	int ofs2_cur = 0 ;
	int ofs3_cur = 0 ;	
	int ofs4_cur = 0 ;
	
// b1_ptr = b1[0][0]
// b2_ptr = b2[0][0]
// b3_ptr = b1[1][0] 
// b4_ptr = b2[1][0]
	int b3_ptr = b1_ptr + 17 * sizeOfsmem ; 
	int b4_ptr = b2_ptr + 17 * sizeOfsmem ;
	
	char  outFile[32] = "output.txt" ;
	FILE *fout = fopen(outFile, "w" ) ;
	if ( NULL == fout ){
		printf("Error: File %s is not open\n", outFile );
		exit(1) ;
	}
// add pre_code 	
	fprintf(fout, "%s", pre_code ) ;

// $r2 <-- lda  * sizeof( Complex )
	fprintf(fout, "shl.u32 $r2, s[0x%.4x], 0x00000003\n", lda_loc );
	for ( i = 0 ; i < 16 ; i++ ){
// (r0, r1) = A[0] ;  // r0 = A_reg.x, r1 = A_reg.y	
		fprintf( fout, "mov.b64 $r0, g[$r%d]\n", A ) ;
//  A += lda ;
		fprintf( fout, "add.u32 $r%d, $r2, $r%d\n", A, A ) ;
 
			  
		for( j = 0 ; j < 16 ; j++ ){
// step 1: update c[0:2*(k1-1)]
//				b1[i][j] = ( $ofs1, b1_res ) ;
//				b2[i][j] = ( $ofs2, b2_res ) ;
				x = b1_ptr + j * sizeOfsmem ;
				b1_res = x % 0x80 ;
				ofs1 = x - b1_res ;
		
				x = b2_ptr + j * sizeOfsmem ;
				b2_res = x % 0x80 ;
				ofs2 = x - b2_res ;

				if ( (0 != ofs1) && ( ofs1_cur != ofs1) ){
					ofs1_cur = ofs1 ;
					fprintf(fout,"mov.b32 $ofs1, 0x%.8x\n",  ofs1_cur );
				}
				if ( (0 != ofs2) && ( ofs2_cur != ofs2) ){
					ofs2_cur = ofs2 ;
					fprintf(fout,"mov.b32 $ofs2, 0x%.8x\n",  ofs2_cur );
				}
 			 																								
//  (MUL) r5  = b_reg_y * A_reg.y ;
				if (0 != ofs2){
					fprintf(fout, "mul.rn.f32 $r5, s[$ofs2+0x%.4x], $r1\n", b2_res );
				}else{
					fprintf(fout, "mul.rn.f32 $r5, s[0x%.4x], $r1\n", b2_res );
				}

//  r5  = b_reg_x * A_reg.x V r5 ;
				if (0 != ofs1){
					fprintf(fout, "mad.rn.f32 $r5, s[$ofs1+0x%.4x], $r0, -$r5\n", b1_res );
				}else{
					fprintf(fout, "mad.rn.f32 $r5, s[0x%.4x], $r0, -$r5\n", b1_res );
				}

//			c[2*j] += r5 ;
				fprintf(fout, "add.rn.f32 $r%d, $r%d, $r5\n", c[2*j], c[2*j] );
										
												
//		 r5  = b_reg_x * A_reg.y ;
				if (0 != ofs1){
					fprintf(fout, "mul.rn.f32 $r5, s[$ofs1+0x%.4x], $r1\n", b1_res) ;
				}else{
					fprintf(fout, "mul.rn.f32 $r5, s[0x%.4x], $r1\n", b1_res ) ;
				}

//     r5 = b_reg_y * A_reg.x + r5 ;
				if (0 != ofs2){
					fprintf(fout, "mad.rn.f32 $r5, s[$ofs2+0x%.4x], $r0, $r5\n", b2_res );
				}else{
					fprintf(fout, "mad.rn.f32 $r5, s[0x%.4x], $r0, $r5\n", b2_res );
				}
				
//		 c[2*j+1] += r5 ;
				fprintf(fout, "add.rn.f32 $r%d, $r%d, $r5\n", c[2*j+1], c[2*j+1] );
		
		}// for j

//    b1_ptr += 17 * 2 ;
		b1_ptr += 17 * sizeOfsmem  ; 
//		b2_ptr += 17 * 2 ;
		b2_ptr += 17 * sizeOfsmem  ; 
		
		b3_ptr += 17 * sizeOfsmem * 2 ; 
		b4_ptr += 17 * sizeOfsmem * 2 ; 
	}// for i

// add post_code 	
	fprintf(fout, "%s", post_code ) ;
	fclose( fout ) ;
}

*/

/*
// (r2, r3) <-- A[0]
//  r4 <-- lda * 8
// if we use (r3,r4) <-- A[0], then run-time error
static void rank1_update( int *c, int A, int lda_loc, int sizeOfsmem,
	int b1_ptr, int b2_ptr, char *pre_code, char *post_code )
{
	int i, j ;
	int ofs1, b1_res ;
	int ofs2, b2_res ;
	int ofs1_cur = 0 ;
	int ofs2_cur = 0 ;

	char  outFile[32] = "output.txt" ;
	FILE *fout = fopen(outFile, "w" ) ;
	if ( NULL == fout ){
		printf("Error: File %s is not open\n", outFile );
		exit(1) ;
	}
// add pre_code 	
	fprintf(fout, "%s", pre_code ) ;

// $r4 <-- lda  * sizeof( Complex )
	fprintf(fout, "shl.u32 $r4, s[0x%.4x], 0x00000003\n", lda_loc );
	for ( i = 0 ; i < 16 ; i++){
// (r2, r3)  A[0] ;  // r0 = A_reg.x, r1 = A_reg.y	
		fprintf( fout, "mov.b64 $r2, g[$r%d]\n", A ) ;
//  A += lda ;
		fprintf( fout, "add.u32 $r%d, $r4, $r%d\n", A, A ) ;
		for ( j = 0 ; j < 16 ; j++){
//          b1_ptr[j] = ( $ofs1, b1_res ) ;
//          b2_ptr[j] = ( $ofs2, b2_res ) ;
			int x = b1_ptr + j * sizeOfsmem ;
			b1_res = x % 0x80 ;
			ofs1 = x - b1_res ;
		
			x = b2_ptr + j * sizeOfsmem ;
			b2_res = x % 0x80 ;
			ofs2 = x - b2_res ;

			if ( (0 != ofs1) && ( ofs1_cur != ofs1) ){
				ofs1_cur = ofs1 ;
				fprintf(fout,"mov.b32 $ofs3, 0x%.8x\n",  ofs1_cur );
			}
			if ( (0 != ofs2) && ( ofs2_cur != ofs2) ){
				ofs2_cur = ofs2 ;
				fprintf(fout,"mov.b32 $ofs4, 0x%.8x\n",  ofs2_cur );
			}

//	  r6 = b_reg_y <-- s[ $ofs2 + b2_res ] ;
			if (0 != ofs2){
				fprintf(fout, "lds.b32 $r6, s[$ofs4+0x%.4x]\n", b2_res );
			}else{
				fprintf(fout, "lds.b32 $r6, s[0x%.4x]\n", b2_res );
			}
			
//   c[2*j] <-- b_reg_y * A_reg.y - c[2*j] ;
				fprintf(fout, "mad.rn.f32 $r%d, $r6, $r3, -$r%d\n", c[2*j], c[2*j] );


//    c[2j+1] <-- b_reg_y * A_reg.x + (b_reg_x * A_reg.y + c[2*j+1]) ;
				fprintf(fout, "mad.rn.f32 $r%d, $r6, $r2, $r%d\n", c[2*j+1], c[2*j+1] );
						
//	  r6 = b_reg_x <-- s[ $ofs1 + b1_res ] ;          
			if (0 != ofs1){
				fprintf(fout, "lds.b32 $r6, s[$ofs3+0x%.4x]\n", b1_res );
			}else{
				fprintf(fout, "lds.b32 $r6, s[0x%.4x]\n", b1_res );
			}

//    c[2*j] <-- b_reg_x * A_reg.x V(b_reg_y * A_reg.y - c[2*j]) ;

				fprintf(fout, "mad.rn.f32 $r%d, $r6, $r2, -$r%d\n", c[2*j], c[2*j] );

//    c[2*j+1] <-- b_reg_x * A_reg.y + c[2*j+1] ; 
				fprintf(fout, "mad.rn.f32 $r%d, $r6, $r3, $r%d\n", c[2*j+1], c[2*j+1] ) ;
				
		}// for j
//      b1_ptr += 17 ;
		b1_ptr += 17 * sizeOfsmem ; 
//		b2_ptr += 17 ;
		b2_ptr += 17 * sizeOfsmem ; 
	}// for i

// add post_code 	
	fprintf(fout, "%s", post_code ) ;
	fclose( fout ) ;
}
*/


/*
 * Driver for rank1_update(): feeds it the register and shared-memory layout
 * noted from /method1/from_decuda_149_1736.asm
 *
 *   $r45     = A
 *   s[0x20]  = lda
 *   b1_ptr   = 0x60
 *   b2_ptr   = 0x4a0
 *
 * plus the text that is placed before and after the generated code.
 */
void rank1_update_method3(void)
{
	/* c[k] is the number of the register passed to rank1_update as entry k */
	int c[32] = {
		47, 46, 44, 43, 42, 41, 36, 35,		/* c[ 0.. 7] */
		34, 33, 32, 31, 30, 29, 27, 26,		/* c[ 8..15] */
		25, 24, 23, 22, 10, 11, 12, 13,		/* c[16..23] */
		14, 15, 16, 17, 18, 19, 20, 21		/* c[24..31] */
	} ;

	char pre_code[] = "set.le.s32 $p0|$o127, $r38, c1[0x0008]\n@$p0.ne bra.label label5\n" ;
	char post_code[] = "bar.sync.u32 0x00000000\nadd.b32 $r38, $r38, 0xfffffff0\n" ;

	int a_reg = 45 ;			/* r45 = A */
	int lda_smem = 0x20 ;			/* s[0x20] = lda */
	int elem_bytes = sizeof( float ) ;	/* size of one shared-memory element */
	int b1_base = 0x60 ;
	int b2_base = 0x4a0 ;

	rank1_update( c, a_reg, lda_smem, elem_bytes,
		b1_base, b2_base, pre_code, post_code ) ;
}