

/*   FAST ZGEMM CORE SUBROUTINE FOR ALPHA CPU               */
/*      By Alessandro MIRONE alex@greco2.polytechnique.fr   */
/*   http://luli.polytechnique.fr/~alex                     */

/*   This routine is based on a previous fast dgemm              */
/*            By GOTO@statabo.rim.or.jp                          */
/* ftp://ftp.eni.co.jp/.2/Linux-Alpha-JP/ftp.statabo.rim.or.jp/  */


#define ONE  1.000000000
#define ZERO 0.000000000


/*
 * zgemmnn_ -- complex double-precision matrix multiply-accumulate with both
 * operands in their normal (non-transposed) form:
 *
 *     C := C + alpha * A * B
 *
 * Every complex matrix is accessed through a double* as interleaved
 * (real, imaginary) pairs, stored column by column: element (i, j) of A
 * lives at a[2*(i + j*lda)] (real part) and a[2*(i + j*lda) + 1]
 * (imaginary part); B and C are addressed the same way with ldb / ldc.
 *
 * All scalar arguments are passed by pointer (presumably so the routine can
 * be called from Fortran -- the trailing underscore suggests this; confirm
 * against the callers).
 *
 *   mwholeptr  number of rows of A and C
 *   nwholeptr  number of columns of B and C
 *   kptr       number of columns of A / rows of B
 *   alphaptr   complex scalar alpha: alphaptr[0] = real, alphaptr[1] = imag
 *   a, ldaptr  matrix A and its leading dimension (in complex elements)
 *   b, ldbptr  matrix B and its leading dimension (in complex elements)
 *   betaptr    IGNORED and never dereferenced: C is always accumulated into,
 *              i.e. the routine behaves as if beta were 1
 *   c, ldcptr  matrix C (read and updated in place) and its leading dimension
 *
 * Returns 0.  If m, n or k is not positive there is nothing to accumulate
 * and C is left untouched.
 *
 * Layout of the computation:
 *   1. the even-by-even part of C is computed in 2x2 tiles, with A cut into
 *      panels of P1 rows by Q1 columns;
 *   2. if m is odd, the last row of C is computed (two columns at a time);
 *   3. if n is odd, the last column of C is computed (two rows at a time);
 *   4. if both are odd, the single bottom-right element is computed.
 *
 * The statements inside the kernels are interleaved by hand (loads for the
 * next step are issued while the products of the current step are still
 * being accumulated).  Their order also fixes the floating-point rounding,
 * so do not reorder them casually.
 */
int  zgemmnn_(int *mwholeptr, int *nwholeptr, int *kptr,
	       double *alphaptr,  double *a,
	       int *ldaptr, double *b, int *ldbptr, double *betaptr,
	       double *c, int *ldcptr)
{
  int mwhole, nwhole, k, lda, ldb, ldc;
  double alphar, alphai;

  int i, j, l, m, n;

  int ls, is, lslda2;
  int lda2, ldb2, ldc2, jlda2, jldb2, jldc2;
  int mP1, kQ1;

  double *a_offset, *b_offset;
  double *a0_offset, *a_orig;
  double *b0_offset, *b1_offset;
  double *c0_offset, *c3_offset;
  double temp1, temp2, temp3, temp4;

  double atemp1, atemp2, atemp3, atemp4;
  double atemp5, atemp6, atemp7, atemp8;
  double btemp1, btemp2, btemp3, btemp4;
  double btemp5, btemp6, btemp7, btemp8;
  double ctemp1, ctemp2, ctemp3, ctemp4;
  double ctemp5, ctemp6, ctemp7, ctemp8;

  int min_i, min_l;

/* Number of rows of A handled per panel. */
#define P1 64

/* Number of columns of A handled per panel.  This used to be 48; it was
   changed because sizeof(complex<double>) = 2*sizeof(double). */
#define Q1 24

  (void)betaptr;  /* beta is not used, see the header comment */

  mwhole = *mwholeptr;
  nwhole = *nwholeptr;
  k = *kptr;
  lda = *ldaptr;
  ldb = *ldbptr;
  ldc = *ldcptr;
  alphar = *alphaptr;
  alphai = *(alphaptr+1);

  /* Empty product: nothing to add to C.  Without this guard the edge
     sections below would still load one element of A and B and accumulate
     its product when k <= 0. */
  if (mwhole <= 0 || nwhole <= 0 || k <= 0)
    return 0;

  /* The complex<double> arrays are addressed through double pointers, so
     moving from one column to the next skips twice the leading dimension. */
  lda2 = (lda<<1);
  ldb2 = (ldb<<1);
  ldc2 = (ldc<<1);

  lslda2 = lda2*Q1;  /* moves us Q1 columns forward in a */
  m = mwhole - (mwhole % 2);  /* even part of the row count */
  n = nwhole - (nwhole % 2);  /* even part of the column count */

  mP1 = m - P1;
  kQ1 = k - Q1;

  /* ---- 1. even-by-even part of C, in 2x2 tiles ---- */

  for (is = 0; is < m; is += P1) {  /* is is the first row of the A panel */
    min_i = (P1>>1);
    if (is > mP1) min_i = ((m - is)>>1);  /* rows are processed two at a
                                             time, hence the >>1 */

    a_orig = a + (is<<1);  /* our panel of a starts at a_orig */
    for (ls = 0; ls < k; ls += Q1) {  /* ls is the first column of the panel */
      min_l = (Q1);
      if (ls > kQ1) min_l = ((k - ls));  /* last panel may be narrower */

      jldb2 = 0;
      jldc2 = 0;

      for (j = 0; j < (n); j += 2) {  /* with the panel of a fixed, sweep
                                         through the columns of b in pairs */
        a_offset  = a_orig;

        b_offset  = b + jldb2 + (ls<<1);
        c0_offset = c + jldc2 + (is<<1);
        c3_offset = c0_offset + ldc2;

        for (i = min_i; i > 0; i--) {  /* one 2x2 tile of c per iteration */
          b0_offset = b_offset;
          b1_offset = b_offset + ldb2;
          a0_offset = a_offset;

          ctemp1 = ZERO;
          ctemp2 = ZERO;
          ctemp3 = ZERO;
          ctemp4 = ZERO;
          ctemp5 = ZERO;
          ctemp6 = ZERO;
          ctemp7 = ZERO;
          ctemp8 = ZERO;

          temp1 = *(b0_offset+0);  /* real part of b00 */
          temp2 = *(b1_offset+0);  /* real part of b01 */

          atemp1 = *(a0_offset+0);  /* real part of a00 */
          atemp2 = *(a0_offset+1);  /* imaginary part of a00 */
          atemp3 = *(a0_offset+2);  /* real part of a10 */
          atemp4 = *(a0_offset+3);  /* imaginary part of a10 */

          btemp1 = ZERO;
          btemp2 = ZERO;
          btemp3 = ZERO;
          btemp4 = ZERO;
          btemp5 = ZERO;
          btemp6 = ZERO;
          btemp7 = ZERO;
          btemp8 = ZERO;

          a0_offset += lda2;  /* move on to the next column of a */

          l = min_l - 1;

          if (l > 0) {
            do {
              temp3 = *(b0_offset+1);  /* imaginary part of b00 */
              temp4 = *(b1_offset+1);  /* imaginary part of b01 */

              /* fold in the imag-b products left over from the previous
                 step (all zero on the first pass) */
              ctemp1 = ctemp1 - btemp1;
              ctemp2 = ctemp2 + btemp2;
              ctemp3 = ctemp3 - btemp3;
              ctemp4 = ctemp4 + btemp4;

              btemp1 = temp1 * atemp1;  /* real contribution to c00 */
              btemp2 = temp1 * atemp2;  /* imaginary contribution to c00 */
              btemp3 = temp1 * atemp3;  /* real contribution to c10 */
              btemp4 = temp1 * atemp4;  /* imaginary contribution to c10 */

              ctemp5 = ctemp5 - btemp5;
              ctemp6 = ctemp6 + btemp6;
              ctemp7 = ctemp7 - btemp7;
              ctemp8 = ctemp8 + btemp8;

              btemp5 = temp2 * atemp1;  /* real contribution to c01 */
              btemp6 = temp2 * atemp2;  /* imaginary contribution to c01 */
              btemp7 = temp2 * atemp3;  /* real contribution to c11 */
              btemp8 = temp2 * atemp4;  /* imaginary contribution to c11 */

              ctemp1 = ctemp1 + btemp1;
              ctemp2 = ctemp2 + btemp2;
              ctemp3 = ctemp3 + btemp3;
              ctemp4 = ctemp4 + btemp4;

              /* products with the imaginary part of b; they are folded
                 into the accumulators at the top of the next step */

              btemp2 = temp3 * atemp1;  /* imaginary contribution to c00 */
              btemp1 = temp3 * atemp2;  /* real contribution to c00 */
              btemp4 = temp3 * atemp3;  /* imaginary contribution to c10 */
              btemp3 = temp3 * atemp4;  /* real contribution to c10 */

              ctemp5 = ctemp5 + btemp5;
              ctemp6 = ctemp6 + btemp6;
              ctemp7 = ctemp7 + btemp7;
              ctemp8 = ctemp8 + btemp8;

              btemp6 = temp4 * atemp1;  /* imaginary contribution to c01 */
              btemp5 = temp4 * atemp2;  /* real contribution to c01 */
              btemp8 = temp4 * atemp3;  /* imaginary contribution to c11 */
              btemp7 = temp4 * atemp4;  /* real contribution to c11 */

              /* load the operands of the next step */

              temp1 = *(b0_offset+2);
              temp2 = *(b1_offset+2);

              atemp1 = *(a0_offset+0);
              atemp2 = *(a0_offset+1);
              atemp3 = *(a0_offset+2);
              atemp4 = *(a0_offset+3);

              a0_offset += lda2;  /* next column of a */
              b0_offset += 2;     /* next row of b */
              b1_offset += 2;     /* next row of b */

              l--;
            } while (l > 0);
          }

          /* last step of the panel: same arithmetic, no further loads
             from a or b */

          temp3 = *(b0_offset+1);
          temp4 = *(b1_offset+1);

          ctemp1 = ctemp1 - btemp1;
          ctemp2 = ctemp2 + btemp2;
          ctemp3 = ctemp3 - btemp3;
          ctemp4 = ctemp4 + btemp4;

          ctemp5 = ctemp5 - btemp5;
          ctemp6 = ctemp6 + btemp6;
          ctemp7 = ctemp7 - btemp7;
          ctemp8 = ctemp8 + btemp8;

          btemp1 = temp1 * atemp1;
          btemp2 = temp1 * atemp2;
          btemp3 = temp1 * atemp3;
          btemp4 = temp1 * atemp4;

          btemp5 = temp2 * atemp1;
          btemp6 = temp2 * atemp2;
          btemp7 = temp2 * atemp3;
          btemp8 = temp2 * atemp4;

          /* from here on the atemp5..8 registers hold the current values
             of the second column of the c tile */

          atemp5 = *(c3_offset+0);
          atemp6 = *(c3_offset+1);
          atemp7 = *(c3_offset+2);
          atemp8 = *(c3_offset+3);

          ctemp1 = ctemp1 + btemp1;
          ctemp2 = ctemp2 + btemp2;
          ctemp3 = ctemp3 + btemp3;
          ctemp4 = ctemp4 + btemp4;

          ctemp5 = ctemp5 + btemp5;
          ctemp6 = ctemp6 + btemp6;
          ctemp7 = ctemp7 + btemp7;
          ctemp8 = ctemp8 + btemp8;

          btemp2 = temp3 * atemp1;
          btemp1 = temp3 * atemp2;
          btemp4 = temp3 * atemp3;
          btemp3 = temp3 * atemp4;

          btemp6 = temp4 * atemp1;
          btemp5 = temp4 * atemp2;
          btemp8 = temp4 * atemp3;
          btemp7 = temp4 * atemp4;

          /* atemp1..4 now hold the current values of the first column of
             the c tile */
          atemp1 = *(c0_offset+0);
          atemp2 = *(c0_offset+1);
          atemp3 = *(c0_offset+2);
          atemp4 = *(c0_offset+3);

          ctemp1 = ctemp1 - btemp1;
          ctemp2 = ctemp2 + btemp2;
          ctemp3 = ctemp3 - btemp3;
          ctemp4 = ctemp4 + btemp4;

          /* scale the accumulated products by alpha (complex multiply) */
          btemp1 = alphar*ctemp1 - alphai*ctemp2;
          btemp2 = alphar*ctemp2 + alphai*ctemp1;
          btemp3 = alphar*ctemp3 - alphai*ctemp4;
          btemp4 = alphar*ctemp4 + alphai*ctemp3;

          ctemp5 = ctemp5 - btemp5;
          ctemp6 = ctemp6 + btemp6;
          ctemp7 = ctemp7 - btemp7;
          ctemp8 = ctemp8 + btemp8;

          btemp5 = alphar*ctemp5 - alphai*ctemp6;
          btemp6 = alphar*ctemp6 + alphai*ctemp5;
          btemp7 = alphar*ctemp7 - alphai*ctemp8;
          btemp8 = alphar*ctemp8 + alphai*ctemp7;

          /* add to the existing contents of c and store */
          ctemp1 = atemp1 + btemp1;
          ctemp2 = atemp2 + btemp2;
          ctemp3 = atemp3 + btemp3;
          ctemp4 = atemp4 + btemp4;

          ctemp5 = atemp5 + btemp5;
          ctemp6 = atemp6 + btemp6;
          ctemp7 = atemp7 + btemp7;
          ctemp8 = atemp8 + btemp8;

          *(c0_offset+0) = ctemp1;
          *(c0_offset+1) = ctemp2;
          *(c0_offset+2) = ctemp3;
          *(c0_offset+3) = ctemp4;

          *(c3_offset+0) = ctemp5;
          *(c3_offset+1) = ctemp6;
          *(c3_offset+2) = ctemp7;
          *(c3_offset+3) = ctemp8;

          /* two complex rows further down */
          c0_offset += 4;
          c3_offset += 4;
          a_offset += 4;
        }
        jldb2 += (ldb2<<1);  /* two columns further in b */
        jldc2 += (ldc2<<1);  /* two columns further in c */
      }
      a_orig += lslda2;
    }
  }

  /* ---- 2. m is odd: last row of c, two columns at a time ---- */

  if (mwhole == (m + 1)) {
    jldb2 = 0;
    jldc2 = 0;
    a_offset = a + (m<<1);

    for (j = 0; j < (n); j += 2) {  /* with the row of a fixed, sweep
                                       through the columns of b in pairs */
      b_offset  = b + jldb2;
      c0_offset = c + jldc2 + (m<<1);
      c3_offset = c0_offset + ldc2;

      b0_offset = b_offset;
      b1_offset = b_offset + ldb2;
      a0_offset = a_offset;

      ctemp1 = ZERO;
      ctemp2 = ZERO;
      ctemp3 = ZERO;
      ctemp4 = ZERO;
      ctemp5 = ZERO;
      ctemp6 = ZERO;
      ctemp7 = ZERO;
      ctemp8 = ZERO;

      temp1 = *(b0_offset+0);  /* real part of b00 */
      temp2 = *(b1_offset+0);  /* real part of b01 */

      atemp1 = *(a0_offset+0);  /* real part of a00 */
      atemp2 = *(a0_offset+1);  /* imaginary part of a00 */

      btemp1 = ZERO;
      btemp2 = ZERO;
      btemp3 = ZERO;
      btemp4 = ZERO;
      btemp5 = ZERO;
      btemp6 = ZERO;
      btemp7 = ZERO;
      btemp8 = ZERO;

      a0_offset += lda2;  /* move on to the next column of a */

      l = k - 1;
      if (l > 0) {
        do {
          temp3 = *(b0_offset+1);  /* imaginary part of b00 */
          temp4 = *(b1_offset+1);  /* imaginary part of b01 */

          ctemp1 = ctemp1 + btemp1;
          ctemp2 = ctemp2 + btemp2;
          ctemp3 = ctemp3 + btemp3;
          ctemp4 = ctemp4 + btemp4;

          btemp1 = temp1 * atemp1;  /* real contribution to c00 */
          btemp2 = temp1 * atemp2;  /* imaginary contribution to c00 */
          btemp3 = temp2 * atemp1;  /* real contribution to c01 */
          btemp4 = temp2 * atemp2;  /* imaginary contribution to c01 */

          temp1 = *(b0_offset+2);
          temp2 = *(b1_offset+2);

          ctemp5 = ctemp5 + btemp5;
          ctemp6 = ctemp6 + btemp6;
          ctemp7 = ctemp7 + btemp7;
          ctemp8 = ctemp8 + btemp8;

          btemp6 = temp3 * atemp1;  /* imaginary contribution to c00 */
          btemp5 = temp3 * atemp2;  /* real contribution to c00 (subtracted) */
          btemp8 = temp4 * atemp1;  /* imaginary contribution to c01 */
          btemp7 = temp4 * atemp2;  /* real contribution to c01 (subtracted) */

          atemp1 = *(a0_offset+0);
          atemp2 = *(a0_offset+1);

          a0_offset += lda2;  /* next column of a */
          b0_offset += 2;     /* next row of b */
          b1_offset += 2;     /* next row of b */

          l--;
        } while (l > 0);
      }
      temp3 = *(b0_offset+1);  /* imaginary part of b00 */
      temp4 = *(b1_offset+1);  /* imaginary part of b01 */

      ctemp1 = ctemp1 + btemp1;
      ctemp2 = ctemp2 + btemp2;
      ctemp3 = ctemp3 + btemp3;
      ctemp4 = ctemp4 + btemp4;

      btemp1 = temp1 * atemp1;  /* real contribution to c00 */
      btemp2 = temp1 * atemp2;  /* imaginary contribution to c00 */
      btemp3 = temp2 * atemp1;  /* real contribution to c01 */
      btemp4 = temp2 * atemp2;  /* imaginary contribution to c01 */

      ctemp5 = ctemp5 + btemp5;
      ctemp6 = ctemp6 + btemp6;
      ctemp7 = ctemp7 + btemp7;
      ctemp8 = ctemp8 + btemp8;

      btemp6 = temp3 * atemp1;  /* imaginary contribution to c00 */
      btemp5 = temp3 * atemp2;  /* real contribution to c00 (subtracted) */
      btemp8 = temp4 * atemp1;  /* imaginary contribution to c01 */
      btemp7 = temp4 * atemp2;  /* real contribution to c01 (subtracted) */

      ctemp1 = ctemp1 + btemp1;
      ctemp2 = ctemp2 + btemp2;
      ctemp3 = ctemp3 + btemp3;
      ctemp4 = ctemp4 + btemp4;

      ctemp5 = ctemp5 + btemp5;
      ctemp6 = ctemp6 + btemp6;
      ctemp7 = ctemp7 + btemp7;
      ctemp8 = ctemp8 + btemp8;

      atemp1 = *(c0_offset+0);
      atemp2 = *(c0_offset+1);
      atemp3 = *(c3_offset+0);
      atemp4 = *(c3_offset+1);

      /* combine the real-b and imag-b partial sums into the complex sums */
      ctemp1 = ctemp1 - ctemp5;
      ctemp2 = ctemp2 + ctemp6;
      ctemp3 = ctemp3 - ctemp7;
      ctemp4 = ctemp4 + ctemp8;

      /* scale by alpha, add to c and store */
      btemp1 = alphar*ctemp1 - alphai*ctemp2;
      btemp2 = alphar*ctemp2 + alphai*ctemp1;
      btemp3 = alphar*ctemp3 - alphai*ctemp4;
      btemp4 = alphar*ctemp4 + alphai*ctemp3;

      ctemp1 = atemp1 + btemp1;
      ctemp2 = atemp2 + btemp2;
      ctemp3 = atemp3 + btemp3;
      ctemp4 = atemp4 + btemp4;

      *(c0_offset+0) = ctemp1;
      *(c0_offset+1) = ctemp2;
      *(c3_offset+0) = ctemp3;
      *(c3_offset+1) = ctemp4;

      jldb2 += (ldb2<<1);
      jldc2 += (ldc2<<1);
    }
  }

  /* ---- 3. n is odd: last column of c, two rows at a time ---- */

  if (nwhole == (n + 1)) {
    jlda2 = 0;
    jldc2 = 0;
    b_offset = b + n*ldb2;

    for (j = 0; j < (m); j += 2) {  /* with the column of b fixed, sweep
                                       through the rows of a in pairs */
      a_offset  = a + jlda2;
      c0_offset = c + jldc2 + n*ldc2;

      a0_offset = a_offset;
      b0_offset = b_offset;

      ctemp1 = ZERO;
      ctemp2 = ZERO;
      ctemp3 = ZERO;
      ctemp4 = ZERO;
      ctemp5 = ZERO;
      ctemp6 = ZERO;
      ctemp7 = ZERO;
      ctemp8 = ZERO;

      temp1 = *(a0_offset+0);  /* real part of a00 */
      temp2 = *(a0_offset+2);  /* real part of a10 */

      atemp1 = *(b0_offset+0);  /* real part of b00 */
      atemp2 = *(b0_offset+1);  /* imaginary part of b00 */

      btemp1 = ZERO;
      btemp2 = ZERO;
      btemp3 = ZERO;
      btemp4 = ZERO;
      btemp5 = ZERO;
      btemp6 = ZERO;
      btemp7 = ZERO;
      btemp8 = ZERO;

      b0_offset += 2;  /* move on to the next row of b */

      l = k - 1;
      if (l > 0) {
        do {
          temp3 = *(a0_offset+1);  /* imaginary part of a00 */
          temp4 = *(a0_offset+3);  /* imaginary part of a10 */
          a0_offset += lda2;       /* next column of a */
          ctemp1 = ctemp1 + btemp1;
          ctemp2 = ctemp2 + btemp2;
          ctemp3 = ctemp3 + btemp3;
          ctemp4 = ctemp4 + btemp4;

          btemp1 = temp1 * atemp1;  /* real contribution to c00 */
          btemp2 = temp1 * atemp2;  /* imaginary contribution to c00 */
          btemp3 = temp2 * atemp1;  /* real contribution to c10 */
          btemp4 = temp2 * atemp2;  /* imaginary contribution to c10 */

          temp1 = *(a0_offset);
          temp2 = *(a0_offset+2);

          ctemp5 = ctemp5 + btemp5;
          ctemp6 = ctemp6 + btemp6;
          ctemp7 = ctemp7 + btemp7;
          ctemp8 = ctemp8 + btemp8;

          btemp6 = temp3 * atemp1;  /* imaginary contribution to c00 */
          btemp5 = temp3 * atemp2;  /* real contribution to c00 (subtracted) */
          btemp8 = temp4 * atemp1;  /* imaginary contribution to c10 */
          btemp7 = temp4 * atemp2;  /* real contribution to c10 (subtracted) */

          atemp1 = *(b0_offset+0);
          atemp2 = *(b0_offset+1);

          b0_offset += 2;  /* next row of b */

          l--;
        } while (l > 0);
      }
      temp3 = *(a0_offset+1);  /* imaginary part of a00 */
      temp4 = *(a0_offset+3);  /* imaginary part of a10 */

      ctemp1 = ctemp1 + btemp1;
      ctemp2 = ctemp2 + btemp2;
      ctemp3 = ctemp3 + btemp3;
      ctemp4 = ctemp4 + btemp4;

      btemp1 = temp1 * atemp1;  /* real contribution to c00 */
      btemp2 = temp1 * atemp2;  /* imaginary contribution to c00 */
      btemp3 = temp2 * atemp1;  /* real contribution to c10 */
      btemp4 = temp2 * atemp2;  /* imaginary contribution to c10 */

      ctemp5 = ctemp5 + btemp5;
      ctemp6 = ctemp6 + btemp6;
      ctemp7 = ctemp7 + btemp7;
      ctemp8 = ctemp8 + btemp8;

      btemp6 = temp3 * atemp1;  /* imaginary contribution to c00 */
      btemp5 = temp3 * atemp2;  /* real contribution to c00 (subtracted) */
      btemp8 = temp4 * atemp1;  /* imaginary contribution to c10 */
      btemp7 = temp4 * atemp2;  /* real contribution to c10 (subtracted) */

      ctemp1 = ctemp1 + btemp1;
      ctemp2 = ctemp2 + btemp2;
      ctemp3 = ctemp3 + btemp3;
      ctemp4 = ctemp4 + btemp4;

      ctemp5 = ctemp5 + btemp5;
      ctemp6 = ctemp6 + btemp6;
      ctemp7 = ctemp7 + btemp7;
      ctemp8 = ctemp8 + btemp8;

      atemp1 = *(c0_offset+0);
      atemp2 = *(c0_offset+1);
      atemp3 = *(c0_offset+2);
      atemp4 = *(c0_offset+3);

      /* combine the real-a and imag-a partial sums into the complex sums */
      ctemp1 = ctemp1 - ctemp5;
      ctemp2 = ctemp2 + ctemp6;
      ctemp3 = ctemp3 - ctemp7;
      ctemp4 = ctemp4 + ctemp8;

      /* scale by alpha, add to c and store */
      btemp1 = alphar*ctemp1 - alphai*ctemp2;
      btemp2 = alphar*ctemp2 + alphai*ctemp1;
      btemp3 = alphar*ctemp3 - alphai*ctemp4;
      btemp4 = alphar*ctemp4 + alphai*ctemp3;

      ctemp1 = atemp1 + btemp1;
      ctemp2 = atemp2 + btemp2;
      ctemp3 = atemp3 + btemp3;
      ctemp4 = atemp4 + btemp4;

      *(c0_offset+0) = ctemp1;
      *(c0_offset+1) = ctemp2;
      *(c0_offset+2) = ctemp3;
      *(c0_offset+3) = ctemp4;

      jlda2 += 4;  /* two complex rows further down */
      jldc2 += 4;
    }
  }

  /* ---- 4. m and n both odd: the single bottom-right element of c ---- */

  if (nwhole == (n + 1) && mwhole == (m + 1)) {
    b0_offset = b + n*ldb2;
    a0_offset = a + (m<<1);

    c0_offset = c + n*ldc2 + (m<<1);

    ctemp1 = ZERO;
    ctemp2 = ZERO;

    ctemp5 = ZERO;
    ctemp6 = ZERO;

    temp1 = *(b0_offset+0);  /* real part of b00 */

    atemp1 = *(a0_offset+0);  /* real part of a00 */
    atemp2 = *(a0_offset+1);  /* imaginary part of a00 */

    btemp1 = ZERO;
    btemp2 = ZERO;

    btemp5 = ZERO;
    btemp6 = ZERO;

    a0_offset += lda2;  /* move on to the next column of a */

    l = k - 1;
    if (l > 0) {
      do {
        temp3 = *(b0_offset+1);  /* imaginary part of b00 */

        ctemp1 = ctemp1 + btemp1;
        ctemp2 = ctemp2 + btemp2;

        btemp1 = temp1 * atemp1;  /* real contribution to c00 */
        btemp2 = temp1 * atemp2;  /* imaginary contribution to c00 */

        temp1 = *(b0_offset+2);

        ctemp5 = ctemp5 + btemp5;
        ctemp6 = ctemp6 + btemp6;

        btemp6 = temp3 * atemp1;  /* imaginary contribution to c00 */
        btemp5 = temp3 * atemp2;  /* real contribution to c00 (subtracted) */

        atemp1 = *(a0_offset+0);
        atemp2 = *(a0_offset+1);

        a0_offset += lda2;  /* next column of a */
        b0_offset += 2;     /* next row of b */

        l--;
      } while (l > 0);
    }
    temp3 = *(b0_offset+1);  /* imaginary part of b00 */

    ctemp1 = ctemp1 + btemp1;
    ctemp2 = ctemp2 + btemp2;

    btemp1 = temp1 * atemp1;  /* real contribution to c00 */
    btemp2 = temp1 * atemp2;  /* imaginary contribution to c00 */

    ctemp5 = ctemp5 + btemp5;
    ctemp6 = ctemp6 + btemp6;

    btemp6 = temp3 * atemp1;  /* imaginary contribution to c00 */
    btemp5 = temp3 * atemp2;  /* real contribution to c00 (subtracted) */

    ctemp1 = ctemp1 + btemp1;
    ctemp2 = ctemp2 + btemp2;

    ctemp5 = ctemp5 + btemp5;
    ctemp6 = ctemp6 + btemp6;

    atemp1 = *(c0_offset+0);
    atemp2 = *(c0_offset+1);

    /* combine the real-b and imag-b partial sums into the complex sum */
    ctemp1 = ctemp1 - ctemp5;
    ctemp2 = ctemp2 + ctemp6;

    /* scale by alpha, add to c and store */
    btemp1 = alphar*ctemp1 - alphai*ctemp2;
    btemp2 = alphar*ctemp2 + alphai*ctemp1;

    ctemp1 = atemp1 + btemp1;
    ctemp2 = atemp2 + btemp2;

    *(c0_offset+0) = ctemp1;
    *(c0_offset+1) = ctemp2;
  }

  return 0;
}

/*
extern  "C"
{
  void zgemmnn_(int *mwholeptr, int *nwholeptr, int *kptr,
	       double *alphaptr,  double *a,
	       int *ldaptr, double *b, int *ldbptr, double *betaptr,
	       double *c, int *ldcptr);
}


#include<stdio.h>
#include<stdlib.h>
#include<iostream.h>
#include<Complex.h>
#include <time.h>

  JUST TO TEST IF IT WORKS

main()
{
  
  int m,k,n,i,j,h;
  fscanf(stdin,"%d",&m) ;
  fscanf(stdin,"%d",&k) ;
  fscanf(stdin,"%d",&n) ;

  j=1;
  complex<double> *A= new  complex<double>  [k*m];
  complex<double> *B= new  complex<double>             [n*k];
  complex<double> *C= new  complex<double>         [n*m];
  complex<double> *C1 = new  complex<double>     [n*m];
  for(j=0;j<k;j++)
    {
      for(int i=0;i<m;i++)
	{
	  // cin >> A[j*m+i] ;
	  A[j*m+i]=complex<double> ((1.*random())/RAND_MAX ,(1.*random())/RAND_MAX);
	}
    }
  for(j=0;j<n;j++)
    {
      for(int i=0;i<k;i++)
	{
	  // cin >> B[j*k +i];
	  B[j*k+i]=complex<double> ((1.*random())/RAND_MAX ,(1.*random())/RAND_MAX);
	}
    }
  cout << " chamo" << endl ;
  cout << time(0) << endl;
  double alphar=random()*1./RAND_MAX ,alphai=random()*1./RAND_MAX;
  complex<double> alpha=complex<double> (alphar,alphai);
  zgemmnn_( &m,  &n,  &k,(double *)  &alpha, (double *) A,
	      &m , (double *) B, &k, 1,	   (double *) C ,&m);
  cout << " OK" << endl ;
  cout << time(0) << endl;
  double sum=0;
  for(i=0;i<m;i++)
    {
      for(int j=0;j<n;j++)
	{
	  C1[j*m+ i]=Complex(0.0,0.0);
	  for(int h=0; h<k; h++)
	    {
	      C1[j*m+i] += A[h*m + i]*B[j*k+ h] ;
	    }
	  sum+= abs(C1[j*m+ i]*complex<double>(alphar,alphai)-C[j*m+ i]);
	  // cout << C[j][i]<< " " ;
	}
   }
      cout << " OK "<< endl ;
   cout << time(0) << endl;
   cout << sum << endl ;
}



*/
