#include "stdafx.h"
#include <math.h>

#ifdef P4_ARCH
#include "P4.h"
#endif


static	double ZOOM = 1.0f;

static  long   one_sixth;
static  long   BiCoef[10];
static  WORD   CubicTable[140000];

static inline long CubicInt(long x);
static inline double CubicDouble(double x);

#define _1 0x10000
#define _2 0x20000
#define _3 0x30000
#define _4 0x40000
#define _6 0x60000

static void resize(BYTE *frame,BYTE *resized_frame,
			int src_w,int src_h,int dst_w,int dst_h,
			int startx,int incx,int starty,int incy,int endy,int fill,int how,int crop);

static void Init_BiCubic();

__inline void fbcopy( void *d, void *s, int i)
{
	_asm {
		mov ecx, i
		mov esi, s
		mov edi, d
		rep movsb
	}
}

static  double src_width1;
static  double src_height1;
static  int   src_w1;
static  int   dst_w1;
static  int   dst_h1;
static  int   src_h1;
static  double dst_width1;
static  double dst_height1;
static  long  starty1;
static  long  startx1;
static  long  endy1;
static  long  incx1;
static  long  incy1;
static  long  start_crop1;

// Reinitialise filter when setting change
void InitFilters(int w,int h,int nw,int nh) {

  int nwidth  = nw;
  int nheight = nh;
  int width   = w;
  int height  = h;

  start_crop1 = 0;

  // Init variable for Y plane scaling
  src_width1=(double)width;
  src_height1=(double)height;
  dst_width1=(double)nwidth;
  dst_height1=(double)nheight;
  src_w1=(int)width;
  src_h1=(int)height;
  dst_w1=(int)dst_width1;
  dst_h1=(int)dst_height1;

  // Important
  // Crop 1 pixel left and 2 pixel right to allow full line bicubic
  // Crop 1 pixel bottom to allow full bilinear

  starty1= 0;
  endy1=   dst_h1;
  startx1= _1+1;
  incx1  = (long) (65536.0f*(src_width1-2.0)/dst_width1);  
  incy1  = (long) (65536.0f*(src_height1-1.0)/dst_height1);

  Init_BiCubic();
}

// ***********************************************************************
// this function does not perform any memory allocation
// dst must be allocated by the caller
// ***********************************************************************
void ResizeFrame(BYTE *src,BYTE *dst,int how)
{
  resize(src,dst,
	     src_w1,src_h1,dst_w1,dst_h1,
	     startx1,incx1,starty1,incy1,endy1,0,
	     how ,start_crop1);
}


#define COMPUTE_PIX(c) \
__asm mov   eax,c \
__asm mul   r1  \
__asm shld  edx,eax,16 \
__asm xor   eax,eax \
__asm mov   ah,[edi] \
__asm shl   eax,8  \
__asm mul   edx \
__asm shld  edx,eax,16 \
__asm add   av,edx \
__asm inc   edi

static void Init_BiCubic() {
  int i;
	
  one_sixth = (long ) (1.0/6.0 * 65536.0 );

  BiCoef[0] = (long)(65536.0*0.15/4.0);
  BiCoef[1] = (long)(65536.0*0.25/4.0);
  BiCoef[2] = (long)(65536.0*0.30/4.0);
  BiCoef[3] = (long)(65536.0*0.30);

  // Init 0.16 fixed point bicubic table
  for(i=0;i<140000;i++)
	  CubicTable[i]=(WORD)(CubicDouble((double)i/32768.0 - 2.0)*65536.0);

}


// A fast an accurate routine to compute bicubic coef
// in 16.16 fixed point format
inline long CubicInt(long x) {

  long a,b,c,d,r;

  _asm {
  mov a,0
  mov b,0
  mov c,0
  mov d,0
 
  // if((x + 2) <= 0) a = 0;
  // else a = pow((x + 2), 3.0F);

  mov eax , x
  add eax , _2
  cmp eax,0
  jle _end
  mov  ebx,eax
  mul ebx
  shld edx,eax,16
  mov  eax,edx
  mul ebx
  shld edx,eax,16
  mov  a , edx

  //if((x + 1.0F) <= 0.0F) b = 0.0F;
  //else b = pow((x + 1.0F), 3.0F);

  mov  eax , x
  add  eax , _1
  cmp  eax,0
  jle  _end
  mov  ebx,eax
  mul ebx
  shld edx,eax,16
  mov  eax,edx
  mul ebx
  shld edx,eax,16
  mov  b , edx
  
  //if(x <= 0) c = 0.0F;
  //else c = pow(x, 3.0F);

  mov eax , x
  cmp eax,0
  jle _end
  mov  ebx,eax
  mul ebx
  shld edx,eax,16
  mov  eax,edx
  mul ebx
  shld edx,eax,16
  mov  c , edx

  // if((x - 1.0F) <= 0.0F) d = 0.0F;
  // d = pow((x - 1.0F), 3.0F);

  mov eax , x
  sub eax , _1
  cmp eax,0
  jle _end
  mov  ebx,eax
  mul ebx
  shld edx,eax,16
  mov  eax,edx
  mul ebx
  shld edx,eax,16
  mov  d , edx

  //return (one_sixth * (a - (4.0F * b) + (6.0F * c) - (4.0F * d)));
_end:
  mov  eax,c
  mov  ebx,_6
  imul ebx
  shld edx,eax,16   // edx = 6*c
  mov  eax,d
  sal  eax,2
  sub  edx,eax      // edx = 6*c - 4*d
  mov  eax,b
  sal  eax,2
  sub  edx,eax      // edx = 6*c - 4*d - 4*b
  mov  eax,a
  add  eax,edx      // eax = a + 6*c - 4*d - 4*b
  mov  ebx,one_sixth
  imul ebx
  shld edx,eax,16
  mov  r,edx       // eax = 1/6 * (a + 6*c - 4*d - 4*b)
  }

  return r;
}

inline double CubicDouble(double x) {
  double a,b,c,d;

  if((x + 2) <= 0) a = 0;
  else a = pow((x + 2), 3.0);

  if((x + 1.0) <= 0.0) b = 0.0;
  else b = pow((x + 1.0), 3.0);
  
  if(x <= 0) c = 0.0;
  else c = pow(x, 3.0);

  if((x - 1.0) <= 0.0) d = 0.0;
  else d = pow((x - 1.0), 3.0);

  return (1.0/6.0 * (a - (4.0 * b) + (6.0 * c) - (4.0 * d)));
}

inline BYTE nearest_neighbour( BYTE *src , int src_w , long x, long y )
{
	if( x & 0x00008000 )
	  src += (x >> 16) + 1 ;
	else
	  src += (x >> 16);

	return *src;
}

void resize(BYTE *frame,BYTE *resized_frame,
			int src_w,int src_h,int dst_w,int dst_h,
			int startx,int incx,int starty,int incy,int endy,
			int fill,int how,int crop) {
  int   i,j;
  long  x;
  long  y;
  int   ys;
  BYTE *dst;
  BYTE *src;
  long av,av1,av2;
  int  a,b,a2;
  long ycor;
  BYTE *src_off;
  short MMPACK[8];
  short MMPACK2[8];
  short MMPACKP1[4];
  short MMPACKP2[4];
  short MMPACKR1[4];
  short MMPACKR2[4];
  short MMPACKR3[4];
  short MMPACKR4[4];

  BYTE  MMRES[128];
  __int64 *MMPACKPtr;
  __int64 *MMPACK2Ptr;
  __int64 *MMPACKP1Ptr;
  __int64 *MMPACKP2Ptr;
  __int64 *MMPACKR1Ptr;
  __int64 *MMPACKR2Ptr;
  __int64 *MMPACKR3Ptr;
  __int64 *MMPACKR4Ptr;
  __int64 *MMRESPtr;

  WORD *CubicTablePtr;

  MMPACKPtr = (__int64 *)MMPACK;
  MMPACK2Ptr = (__int64 *)MMPACK2;
  MMPACKP1Ptr = (__int64 *)MMPACKP1;
  MMPACKP2Ptr = (__int64 *)MMPACKP2;
  MMPACKR1Ptr = (__int64 *)MMPACKR1;
  MMPACKR2Ptr = (__int64 *)MMPACKR2;
  MMPACKR3Ptr = (__int64 *)MMPACKR3;
  MMPACKR4Ptr = (__int64 *)MMPACKR4;
  MMRESPtr    = (__int64 *)MMRES;


#ifdef P4_ARCH
  if( how==6 ) 
    resizeP4(frame,resized_frame,
			 src_w,src_h,dst_w,dst_h,
			 startx,incx,starty,incy,endy,
			 fill,how,crop);
#endif

  CubicTablePtr = CubicTable;

  y     = 0;
  memset(resized_frame , fill , dst_w*starty);
  dst=resized_frame+(dst_w*starty);
  
  for(j=starty;j<endy;j++) {
	  x=startx;
	  src=frame+((y>>16)*src_w)+crop;
	  switch( how ) {

 			  // ******************************************************	  
	  case 0: // Nearest neihbourg
 			  // ******************************************************

		  for(i=0;i<dst_w;i++) {
		    *dst=nearest_neighbour(src ,src_w ,x,  y );
			dst++;
			x+=incx;
		  }

		break;

 			  // ******************************************************
	  case 1: // Bilinear filtering
 			  // ******************************************************

	      for(i=0;i<dst_w;i++) {
			
		    src_off = src + (x >> 16);

			a  = (long)(x & 0x0000FFFF);
			b  = (long)(y & 0x0000FFFF);
			
			// P1..P2
			//  :P  :
			// P3..P4
			// P = ( p1*(1-a) + p2*a )*(1-b) + ( p3*(1-a) + p4*a )*b;

			_asm {
				mov edi,src_off
				
			    // P1
				xor eax,eax
				mov ah,[edi]
				shl eax,8
				mov ebx,_1
				sub ebx,a
				mul ebx
				shld edx,eax,16
				mov  ecx,edx
			    // P2
				xor eax,eax
				mov ah,[edi+1]
				shl eax,8
				mov ebx,a
				mul ebx
				shld edx,eax,16
				add  ecx,edx
				
				mov  eax,_1
				sub  eax,b
				mul  ecx
				shld edx,eax,16
				mov  av,edx

				add edi,src_w
			    // P3
				xor eax,eax
				mov ah,[edi]
				shl eax,8
				mov ebx,_1
				sub ebx,a
				mul ebx
				shld edx,eax,16
				mov  ecx,edx
			    // P4
				xor eax,eax
				mov ah,[edi+1]
				shl eax,8
				mov ebx,a
				mul ebx
				shld edx,eax,16
				add  ecx,edx
				
				mov  eax,b
				mul  ecx
				shld edx,eax,16
				add  av,edx

				// Round it
				mov       eax,av
				and       eax,0x00008000
				shl       eax,1
				add       av,eax

			}
			
			
			*dst=(BYTE)(av >> 16);
			dst++;
			x+=incx;
		  }

		break;

			  // ******************************************************
	  case 2: // Bilinear filtering MMX
 			  // ******************************************************


_bilinear_mmx:
	      for(i=0;i<dst_w;i+=2) {
			
			// Perform 2 pixel in one go
			//
			// P1..P2 P3..P4 
			//  :P'1: :P'2:   
			// P5..P6 P7..P8
			// P = ( p1*(1-a) + p2*a )*(1-b) + ( p5*(1-a) + p6*a )*b;

			_asm {

				mov    esi,src
				pxor   mm0,mm0

				// Pack a and (1-a)
				mov  edi,MMPACKPtr
				mov  eax,x
				and  eax,0x0000FFFF
				push eax
				shr  eax,1
				mov [edi],ax
				pop  eax
				mov  ebx,_1
				sub  ebx,eax
				shr  ebx,1
				mov [edi+2],bx
				
				mov  eax,incx
				add  eax,x
				and  eax,0x0000FFFF
				push eax
				shr  eax,1
				mov [edi+4],ax
				pop  eax
				mov  ebx,_1
				sub  ebx,eax
				shr  ebx,1
				mov [edi+6],bx

				movq      mm1,[edi]
				movq      mm4,[edi]
				
				// Pack pixel
				mov  edi,MMPACK2Ptr
				mov  eax,x
				shr  eax,16
				mov  bx,[esi+eax]
				mov  [edi  ],bh
				mov  [edi+1],bl
				mov  eax,x
				add  eax,incx
				shr  eax,16
				mov  bx,[esi+eax]
				mov  [edi+2],bh
				mov  [edi+3],bl

				add  esi,src_w

				mov  eax,x
				shr  eax,16
				mov  bx,[esi+eax]
				mov  [edi+4],bh
				mov  [edi+5],bl
				mov  eax,x
				add  eax,incx
				shr  eax,16
				mov  bx,[esi+eax]
				mov  [edi+6],bh
				mov  [edi+7],bl

				// Perform P1*(1-a) + P2*a  &&  P3*(1-a') + P4*a'
				// Perform P5*(1-a) + P6*a  &&  P7*(1-a') + P8*a'
				movd mm2,[edi]
				movd mm3,[edi+4]
				punpcklbw mm2,mm0    
				punpcklbw mm3,mm0
				psllw     mm2,1
				psllw     mm3,1
				pmaddwd   mm2,mm1
				pmaddwd   mm3,mm4
				movq      [edi  ],mm2
				movq      [edi+8],mm3

				// Perform 32bit multiplication to be accurate
				
				// (P1*(1-a) + P2*a) * (1-b)
				mov       ebx,y
				and       ebx,0x0000FFFF
				mov       eax,_1
				sub       eax,ebx
				push      eax
				mul       dword ptr [edi]
				shld      edx,eax,16
				mov       av1,edx

				// (P3*(1-a) + P4*a) * (1-b)
		        pop       eax
				mul       dword ptr [edi+4]
				shld      edx,eax,16
				mov       av2,edx

				// (P4*(1-a) + P5*a) * b
				mov		  eax,y
				and       eax,0x0000FFFF
				push      eax
				mul       dword ptr [edi+8]
				shld      edx,eax,16
				add       av1,edx

				// Round av1
				mov       eax,av1
				and       eax,0x00008000
				shl       eax,1
				add       av1,eax

				// (P6*(1-a) + P7*a) * b
		        pop       eax
				mul       dword ptr [edi+12]
				shld      edx,eax,16
				add       av2,edx

				// Round av2
				mov       eax,av2
				and       eax,0x00008000
				shl       eax,1
				add       av2,eax

			}

		    *dst=(BYTE)(av1 >> 16);
		    dst++;
			*dst=(BYTE)(av2 >> 16);						
			dst++;
			x+=(incx << 1);
		  }
		
		break;

 			  // ******************************************************
	  case 3: // BiCubic filtering
   			  // ******************************************************

		ys = (y>>16);
		
		if((ys < 1) || ( ys >= (src_h-2) )) {

			// Can't perform bicubic (do bilinear)
			goto _bilinear_mmx;
		  		  
		} else {

		  // perform bicubic

	      for(i=0;i<dst_w;i++) {
			
			long r1,r2_1,r2_2,r2_3,r2_4;

			a  = (long)(y & 0x0000FFFF);
			b  = (long)(x & 0x0000FFFF);
        	av = 0;

			src_off = src + ((x >> 16) - src_w);

			/*
		    long r2,m,n;

			for(m=1; m<5; m++) {
				r1 = CubicTable[((m << 16)-a)>>1?];
				
				for(n=-3; n<1; n++) {
					r2 = CubicTable[(b-(n << 16))>>1?];
					_asm {
						xor  ebx,ebx
						xor  eax,eax
						mov  edi , src_off
						add  edi , n
						add  edi , 2
						mov  ax  , r2
						mov  bx  , r1
						mul  ebx
						shld edx , eax,16
						mov  eax , edx
						xor  ebx,ebx
						mov  bl  , [edi]
						sal  ebx , 16
						mul  ebx
						shld edx , eax,16
						add  av,edx
					}
				}

				src_off += src_w;
			}
		    *dst=(BYTE)(av >> 16);
			dst++;
			*/


			_asm {
			
				mov esi,CubicTablePtr

				// ******** Compute r2 coeficient 
				xor		  eax,eax
				mov		  ebx,b
				add       ebx,_3
				and       ebx,0xFFFFFFFE
				mov		  ax,[esi+ebx]     // eax=r2
				mov		  r2_1,eax

				mov		  ebx,b
				add       ebx,_2
				and       ebx,0xFFFFFFFE
				mov		  ax,[esi+ebx]     // eax=r2
				mov		  r2_2,eax

				mov		  ebx,b
				add       ebx,_1
				and       ebx,0xFFFFFFFE
				mov		  ax,[esi+ebx]     // eax=r2
				mov		  r2_3,eax

				mov		  ebx,b
				xor		  eax,eax
				and       ebx,0xFFFFFFFE
				mov		  ax,[esi+ebx]     // eax=r2
				mov		  r2_4,eax

				// ******************************* First Line
				mov       edi,src_off
				dec       edi
				mov		  ebx,a
				neg		  ebx
				add       ebx,_1
				xor		  eax,eax
				and       ebx,0xFFFFFFFE
				mov		  ax,[esi+ebx]
				mov		  r1,eax
			}
			
			COMPUTE_PIX( r2_1 )
			COMPUTE_PIX( r2_2 )
			COMPUTE_PIX( r2_3 )
			COMPUTE_PIX( r2_4 )

			_asm {

				// ******************************* Second Line
				mov       eax,src_w
				add		  src_off,eax
				mov       edi,src_off
				dec       edi
				mov		  ebx,a
				neg		  ebx
				add       ebx,_2
				xor		  eax,eax
				and       ebx,0xFFFFFFFE
				mov		  ax,[esi+ebx]
				mov		  r1,eax
			}

			COMPUTE_PIX( r2_1 )
			COMPUTE_PIX( r2_2 )
			COMPUTE_PIX( r2_3 )
			COMPUTE_PIX( r2_4 )

			_asm {

				// ******************************* Third Line
				mov       eax,src_w
				add		  src_off,eax
				mov       edi,src_off
				dec       edi
				mov		  ebx,a
				neg		  ebx
				add       ebx,_3
				xor		  eax,eax
				and       ebx,0xFFFFFFFE
				mov		  ax,[esi+ebx]
				mov		  r1,eax
			}

			COMPUTE_PIX( r2_1 )
			COMPUTE_PIX( r2_2 )
			COMPUTE_PIX( r2_3 )
			COMPUTE_PIX( r2_4 )
			
			_asm {
			    // ******************************* Last Line
			    mov       eax,src_w
			    add		  src_off,eax
			    mov       edi,src_off
			    dec       edi
			    mov		  ebx,a
			    neg		  ebx
			    add       ebx,_4
			    xor		  eax,eax
			    and       ebx,0xFFFFFFFE
			    mov		  ax,[esi+ebx]
			    mov		  r1,eax
			}

			COMPUTE_PIX( r2_1 )
			COMPUTE_PIX( r2_2 )
			COMPUTE_PIX( r2_3 )
			COMPUTE_PIX( r2_4 )

			_asm {
			  // Round av
			  mov       ebx,av
			  and       ebx,0x00008000
			  shl       ebx,1
			  add       av,ebx
			}

		    *dst=(BYTE)(av >> 16);
			dst++;
			x+=incx;
		  }

		  
		} 
		break;	    

				// ******************************************************
      case 4:   // BiCubic SSE  use the pmulhuw instruction
		        // Very accurate method
				// ******************************************************

		ys = (y>>16);
		
		if( (ys < 1) || ( ys >= (src_h-2) )) {

			// Can't perform bicubic (do bilinear)
			goto _bilinear_mmx;
		  		  
		} else {

			// Perfrom bicubic

			b   = -(long)(y & 0x0000FFFF);

			// ****************************************************
		    // Precompute R1 ceoficient
		    // ****************************************************
		    _asm {
				mov       esi,CubicTablePtr
				// Extract new r1 for first line
				mov       edi,MMPACKR1Ptr
				mov		  eax,b
				add       eax,_1
				and       eax,0xFFFFFFFE
				mov		  bx,[esi+eax]     // bx = r1
				mov       [edi  ],bx
				mov       [edi+2],bx
				mov       [edi+4],bx
				mov       [edi+6],bx
				// Extract new r1 for second line
				mov       edi,MMPACKR2Ptr
				mov		  eax,b
				add       eax,_2
				and       eax,0xFFFFFFFE
				mov		  bx,[esi+eax]     // bx = r1
				mov       [edi  ],bx
				mov       [edi+2],bx
				mov       [edi+4],bx
				mov       [edi+6],bx
				// Extract new r1 for third line
				mov       edi,MMPACKR3Ptr
				mov		  eax,b
				add       eax,_3
				and       eax,0xFFFFFFFE
				mov		  bx,[esi+eax]     // bx = r1
				mov       [edi  ],bx
				mov       [edi+2],bx
				mov       [edi+4],bx
				mov       [edi+6],bx
				// Extract new r1 for fourth line
				mov       edi,MMPACKR4Ptr
				mov		  eax,b
				add       eax,_4
				and       eax,0xFFFFFFFE
				mov		  bx,[esi+eax]     // bx = r1
				mov       [edi  ],bx
				mov       [edi+2],bx
				mov       [edi+4],bx
				mov       [edi+6],bx
			}

	        for(i=0;i<dst_w;i+=2) {

			  a   =  (long)(x & 0x0000FFFF);
			  a2  =  (long)((x+incx) & 0x0000FFFF);			
			  _asm {

				mov       esi,CubicTablePtr
				// ******************************************************************
				// Compute r2 BiCubic coeficient and pack them into MMPACK & MMPACK2
				// ******************************************************************

				// First pixel
				mov       edi,MMPACKP1Ptr
				mov		  eax,a
				add       eax,_3
				mov       ebx,eax
				and       ebx,0xFFFFFFFE
				mov		  cx,[esi+ebx]
				mov       [edi  ],cx
				sub		  eax,_1
				mov       ebx,eax
				and       ebx,0xFFFFFFFE
				mov		  cx,[esi+ebx]
				mov       [edi+2],cx
				sub		  eax,_1
				mov       ebx,eax
				and       ebx,0xFFFFFFFE
				mov		  cx,[esi+ebx]
				mov       [edi+4],cx
				sub		  eax,_1
				mov       ebx,eax
				and       ebx,0xFFFFFFFE
				mov		  cx,[esi+ebx]
				mov       [edi+6],cx

				// Second pixel
				mov       edi,MMPACKP2Ptr
				mov		  eax,a2
				add       eax,_3
				mov       ebx,eax
				and       ebx,0xFFFFFFFE
				mov		  cx,[esi+ebx]
				mov       [edi  ],cx
				sub		  eax,_1
				mov       ebx,eax
				and       ebx,0xFFFFFFFE
				mov		  cx,[esi+ebx]
				mov       [edi+2],cx
				sub		  eax,_1
				mov       ebx,eax
				and       ebx,0xFFFFFFFE
				mov		  cx,[esi+ebx]
				mov       [edi+4],cx
				sub		  eax,_1
				mov       ebx,eax
				and       ebx,0xFFFFFFFE
				mov		  cx,[esi+ebx]
				mov       [edi+6],cx

				// ecx = pointer to source pixel for P1
				mov       ecx,src
				mov       eax,x
				shr       eax,16
				dec       eax
				add       ecx,eax
				sub       ecx,src_w

				// edx = pointer to source pixel for P2
				mov       edx,src
				mov       eax,x
				add       eax,incx
				shr       eax,16
				dec       eax
				add       edx,eax
				sub       edx,src_w

				mov       edi,MMRESPtr

				// ***************** First pixel line

				movq      mm1,MMPACKR1    // mm1 = r1r1r1r1
				movq      mm2,MMPACKR1    // mm2 = r1r1r1r1
				movq      mm3,MMPACKP1    // mm3 = r2r2r2r2
				movq      mm4,MMPACKP2    // mm4 = r2r2r2r2

				// Multiply r1 and r2

				pmulhuw	  mm3,mm1          // mm3 = PACKED r1*r2 High
				pmulhuw	  mm4,mm2          // mm4 = PACKED r1*r2 High

				movd	  mm1,[ecx]		   // mm1 = PACKED 4 source bytes
				movd	  mm2,[edx]		   // mm2 = PACKED 4 source bytes
				pxor      mm5,mm5
				pxor      mm6,mm6
				punpcklbw mm5,mm1          // extend to 16 bit
				punpcklbw mm6,mm2          // extend to 16 bit
				
				pmulhuw	  mm5,mm3          // mm5 = PACKED r1*r2*src High
				pmulhuw	  mm6,mm4          // mm6 = PACKED r1*r2*src High
				
				movq      [edi  ],mm5
				movq      [edi+8],mm6
				
				add       ecx,src_w
				add       edx,src_w

				// ***************** Second pixel line

				movq      mm1,MMPACKR2    // mm1 = r1r1r1r1
				movq      mm2,MMPACKR2    // mm2 = r1r1r1r1
				movq      mm3,MMPACKP1    // mm3 = r2r2r2r2
				movq      mm4,MMPACKP2    // mm4 = r2r2r2r2

				// Multiply r1 and r2

				pmulhuw	  mm3,mm1          // mm3 = PACKED r1*r2 High
				pmulhuw	  mm4,mm2          // mm4 = PACKED r1*r2 High

				movd	  mm1,[ecx]		   // mm1 = PACKED 4 source bytes
				movd	  mm2,[edx]		   // mm2 = PACKED 4 source bytes
				pxor      mm5,mm5
				pxor      mm6,mm6
				punpcklbw mm5,mm1          // extend to 16 bit
				punpcklbw mm6,mm2          // extend to 16 bit
				
				pmulhuw	  mm5,mm3          // mm5 = PACKED r1*r2*src High
				pmulhuw	  mm6,mm4          // mm6 = PACKED r1*r2*src High
				
				movq      [edi+16],mm5
				movq      [edi+24],mm6
				
				add       ecx,src_w
				add       edx,src_w

				// ***************** Third pixel line

				movq      mm1,MMPACKR3    // mm1 = r1r1r1r1
				movq      mm2,MMPACKR3    // mm2 = r1r1r1r1
				movq      mm3,MMPACKP1    // mm3 = r2r2r2r2
				movq      mm4,MMPACKP2    // mm4 = r2r2r2r2

				// Multiply r1 and r2

				pmulhuw	  mm3,mm1          // mm3 = PACKED r1*r2 High
				pmulhuw	  mm4,mm2          // mm4 = PACKED r1*r2 High

				movd	  mm1,[ecx]		   // mm1 = PACKED 4 source bytes
				movd	  mm2,[edx]		   // mm2 = PACKED 4 source bytes
				pxor      mm5,mm5
				pxor      mm6,mm6
				punpcklbw mm5,mm1          // extend to 16 bit
				punpcklbw mm6,mm2          // extend to 16 bit
				
				pmulhuw	  mm5,mm3          // mm5 = PACKED r1*r2*src High
				pmulhuw	  mm6,mm4          // mm6 = PACKED r1*r2*src High
				
				movq      [edi+32],mm5
				movq      [edi+40],mm6
				
				add       ecx,src_w
				add       edx,src_w

				// ***************** Fourth pixel line

				movq      mm1,MMPACKR4    // mm1 = r1r1r1r1
				movq      mm2,MMPACKR4    // mm2 = r1r1r1r1
				movq      mm3,MMPACKP1    // mm3 = r2r2r2r2
				movq      mm4,MMPACKP2    // mm4 = r2r2r2r2

				// Multiply r1 and r2

				pmulhuw	  mm3,mm1          // mm3 = PACKED r1*r2 High
				pmulhuw	  mm4,mm2          // mm4 = PACKED r1*r2 High

				movd	  mm1,[ecx]		   // mm1 = PACKED 4 source bytes
				movd	  mm2,[edx]		   // mm2 = PACKED 4 source bytes
				pxor      mm5,mm5
				pxor      mm6,mm6
				punpcklbw mm5,mm1          // extend to 16 bit
				punpcklbw mm6,mm2          // extend to 16 bit
				
				pmulhuw	  mm5,mm3          // mm5 = PACKED r1*r2*src High
				pmulhuw	  mm6,mm4          // mm6 = PACKED r1*r2*src High
				
				// ****************** Compute pixel values

				movq      mm0,[edi   ]
				movq      mm1,[edi+ 8]
				paddw     mm5,mm0
				paddw     mm6,mm1
				movq      mm0,[edi+16]
				movq      mm1,[edi+24]
				paddw     mm5,mm0
				paddw     mm6,mm1
				movq      mm0,[edi+32]
				movq      mm1,[edi+40]
				paddw     mm5,mm0
				paddw     mm6,mm1
				movq      [edi  ],mm5
				movq      [edi+8],mm6

				mov       ax,[edi   ]
				mov       bx,[edi+ 8]
				add       ax,[edi+ 2]
				add       bx,[edi+10]
				add       ax,[edi+ 4]
				add       bx,[edi+12]
				add       ax,[edi+ 6]
				add       bx,[edi+14]

				// Round
				mov       cx,ax
				mov       dx,bx
				and       cx,0x0080
				and       dx,0x0080
				shl       cx,1
				shl       dx,1
				add       ax,cx
				add       bx,dx

				// Store
				mov       edi,dst
				mov      [edi  ],ah
				mov      [edi+1],bh
				add		 dst,2

			  }
		      x+=(incx << 1);
			}
		  
		} 
		break;	    

			  // ******************************************************
	  case 5: // Old Pseudo Bicubic filtering
			  // ******************************************************

		if( (y>>16) >= (src_h-2) ) {
	      for(i=0;i<dst_w;i++) {
		    *dst=nearest_neighbour(src,src_w,x,y);
			dst++;
			x+=incx;
		  }		
		} else if( (y>>16) > 1 ) {

		  long *BiCoefPtr = BiCoef;

		  if( y & 0x00008000 ) ycor=src_w;
		  else      		   ycor=0;
 		  
	      for(i=0;i<dst_w;i++) {

			
			if( x & 0x00008000 )
			  src_off = src + (x >> 16) + 1;
			else
			  src_off = src + (x >> 16);
			
			src_off += ycor;

			
			//av = *(src_off-2) +
			//     *(src_off+2) + 
			//     *(src_off-2*src_w) + 
			//     *(src_off+2*src_w) ;
			//coef = BiCoef[0];
			

		    _asm {

			mov  ecx,src_w
			mov  esi,src_off
			mov  edi , BiCoefPtr
			shl  ecx,1
			xor  ebx,ebx
			xor  eax,eax
			mov  al , [esi-2]
			mov  bl , [esi+2]
			add  eax , ebx 
			add  esi , ecx
			mov  bl , [esi]
			add  eax , ebx 
			sub  esi , ecx
			sub  esi , ecx
			mov  bl , [esi]
			add  eax , ebx 

			shl    eax , 16
			mov    ebx , [edi]
			mul    ebx
			shld   edx , eax,16
			mov    av , edx
				
			
			//av = *(src_off-1-src_w) +
			//    *(src_off+1-src_w) + 
			//     *(src_off-1+src_w) + 
			//     *(src_off+1+src_w) ;
			//coef = BiCoef[1];
			

			mov  esi,src_off
			mov  ecx,src_w
			xor  ebx,ebx
			xor  eax,eax
			sub  esi, ecx
			mov  al , [esi-1]
			mov  bl , [esi+1]
			add  eax , ebx 
			add  esi , ecx
			add  esi , ecx
			mov  bl , [esi-1]
			add  eax , ebx 
			mov  bl , [esi+1]
			add  eax , ebx 

			shl  eax , 16  
 			mov  ebx , [edi+4]
			mul  ebx
			shld edx , eax,16
			add  av , edx

			
			//av = *(src_off-1) +
			//     *(src_off+1) + 
			//     *(src_off-src_w) + 
			//     *(src_off+src_w) ;
			//coef = BiCoef[2];
			

			mov  esi,src_off
			mov  ecx,src_w
			xor  ebx,ebx
			xor  eax,eax
			mov  al , [esi-1]
			mov  bl , [esi+1]
 			add  eax , ebx 
			sub  esi, ecx
			mov  bl , [esi]
			add  eax , ebx 
			add  esi, ecx
			add  esi, ecx
			mov  bl , [esi]
			add  eax , ebx

			shl  eax , 16
			mov  ebx , [edi+8]
			mul  ebx
			shld edx , eax,16
			add  av , edx

			
			//av = *src_off;
			//coef = BiCoef[3];
			

			mov  esi,src_off
			xor  eax,eax
			mov  al  , [esi] 
			shl  eax , 16
			mov  ebx , [edi+12]
			mul  ebx
			shld edx , eax,16
			add  edx , av
			sar  edx , 16
			mov  edi , dst
			mov [edi] , dl
			}

		    dst++;
			x+=incx;
		  }
		} else { // Y=0
	      for(i=0;i<dst_w;i++) {
		    *dst=nearest_neighbour(src,src_w,x,y);
			dst++;
			x+=incx;
		  }				
		}
		break;

	  }
	  y+=incy;
	}
	__asm emms

	memset(resized_frame+(endy*dst_w) , fill , dst_w*(dst_h-endy));
}

#ifdef P4_ARCH

//----------------------------------------------------------------
void* memsetP4(void* pDest, int byte_val, size_t cbSize)
//----------------------------------------------------------------
{
    size_t cbLeadMain, cbLeadLead, cbMain, cbTrailMain, cbTrailTrail;

    __asm
    {
        //if (cbSize < 8)
        //{
        //  return memcmp(pDest, pSrc, cbSize);  // std intrinsic function
        //}
        mov         ecx, cbSize // ecx = cbSize
        cmp         ecx, 32
        mov         edi, pDest
        jb          l_memset
    
        movd        xmm0, byte_val
		punpcklbw	xmm0, xmm0
		pshuflw		xmm0, xmm0, _MM_SHUFFLE(0,0,0,0);

        //cbLead = ((((long)pDest) + 31) & ~31) - (long)pDest;
        lea         eax, [edi + 31]
        and         eax, ~31
        sub         eax, edi    // eax = cbLead

        sub         ecx, eax    // cbSize (ecx) -= cbLead

        //cbLeadMain = cbLead & ~7;
        mov         edx, eax    // edx = cbLead
        and         eax, ~7     // eax = cbLeadMain
        mov         cbLeadMain, eax

        //cbLeadLead = cbLead - cbLeadMain;
        sub         edx, eax    // ecx = cbLeadLead
        mov         cbLeadLead, edx
        /*ASSERT(cbLeadLead > cbSize);*/  // since cbSize >= 0 && cbLeadLead < 8

        //cbTrail = cbMain = cbSize - cbLead;
        mov         eax, ecx    // eax = cbSize
        and         ecx, 63     // 
        sub         eax, ecx    // cbTrail = cbSize & 63

        mov         cbMain, eax // cbMain = cbSize - cbTrail

        //cbTrailMain = cbTrail & ~7;
        mov         edx, ecx    // edx = cbTrail
        and         ecx, ~7     // ecx = cbTrailMain
        mov         cbTrailMain, ecx

        //cbTrailTrail = cbTrail - cbTrailMain;
        sub         edx, ecx    // edx = cbTrailTrail
        mov         cbTrailTrail, edx


        
        // main part starts ----------------------------------------------------
        mov         ecx, cbLeadLead
        mov         eax, cbLeadMain

        test        ecx, ecx  // cbLeadLead? // this jump can be removed
        jz          l_leadloop
        movq        [edi], xmm0
        add         edi, ecx

l_leadloop:
        sub         eax, 8
        jl          l_startmainloop

        movq        [edi], xmm0
		add			edi, 8
        jmp         short l_leadloop

l_startmainloop:
        mov         ecx, cbMain
        test        ecx, ecx  // cbMain?
        jz          l_starttrailloop
    
//l_startlargemainloop:
        punpcklqdq  xmm0, xmm0

l_alignedmainloop:
        movntdq     [edi +  0], xmm0
        movntdq     [edi + 16], xmm0
        movntdq     [edi + 32], xmm0
        movntdq     [edi + 48], xmm0

        add         edi, 64
        sub         ecx, 64
        ja          l_alignedmainloop

        sfence		// only needed after movntdq

l_starttrailloop:
        mov         eax, cbTrailMain
        mov         ecx, cbTrailTrail  // remove if no cbTrailTrail test below

l_trailloop:
        sub         eax, 8
        jl          l_trailtrail

        movq        [edi], xmm0
		add			edi, 8
        jmp         short l_trailloop

l_trailtrail:
        test        ecx, ecx  // cbTrailTrail? // this test can be removed
        jz          l_finish
        // fall through to l_memset

l_memset:
        mov         eax, byte_val
    rep stosb

l_finish:
        //emms
    }

    return pDest;
}

void resizeP4(BYTE *frame,BYTE *resized_frame,
			int src_w,int src_h,int dst_w,int dst_h,
			int startx,int incx,int starty,int incy,int endy,
			int fill,int how,int crop) {
  int   i,j;
  long  x;
  long  y;
  int   ys;
  BYTE *dst;
  BYTE *src;
  int  a,b,a2;
  BYTE *src_off;

  BYTE *sdst;
  long x0, x1, x2, x3;
  __m128i qa, qa1, qs, qi;
  __m128i qb, qb1;
  BYTE *ps0, *ps1, *ps2, *ps3;
  BYTE *pt0, *pt1, *pt2, *pt3;
  __m128i qc, qd, qc0, qc1, qd0, qd1;

  CONST_M128I_UINT1(epi32_0x0000FFFF, 0x0000FFFF);
  CONST_M128I_UINT1(epi32_0x00010000, 0x00010000);
  CONST_M128I_UINT1(epi32_0x00008000, 0x00008000);
  CONST_M128I_UINT(epi32_mask31, ~0, 0, ~0, 0);
  y     = 0;
  memsetP4(resized_frame , fill , dst_w*starty);
  dst=resized_frame+(dst_w*starty);
  
  for(j=starty;j<endy;j++) {
	  x=startx;
	  src=frame+((y>>16)*src_w)+crop;
	  switch( how ) {

	  case 6: // P4 bilinear

		x0 = x;
		x1 = x + incx;
		x2 = x + incx * 2;
		x3 = x + incx * 3;
		qs = _mm_set_epi32(x3, x2, x1, x0);
		qi = _mm_set1_epi32(incx * 4);

		b = y & 0x0000FFFF;
		qb = _mm_cvtsi32_si128(b);
		qb1 = _mm_cvtsi32_si128(_1 - b);
		qb = _mm_unpacklo_epi64(qb, qb);
		qb1 = _mm_unpacklo_epi64(qb1, qb1);
		// qb: b,b
		// qb1: (1-b),(1-b)

		sdst = dst;

		for (i = 0; i < dst_w; i += 4)
		{
			x0 = x;
			x1 = x + incx;
			x2 = x + incx * 2;
			x3 = x + incx * 3;

			ps0 = src + ((x + incx * 8) >> 16);
			pt0 = ps0 + src_w;
			_mm_prefetch((char*)ps0, _MM_HINT_NTA);
			_mm_prefetch((char*)pt0, _MM_HINT_NTA);

			ps0 = src + (x0 >> 16);
			ps1 = src + (x1 >> 16);
			ps2 = src + (x2 >> 16);
			ps3 = src + (x3 >> 16);
			pt0 = ps0 + src_w;
			pt1 = ps1 + src_w;
			pt2 = ps2 + src_w;
			pt3 = ps3 + src_w;

			/*
			// bytewise (seems slower)
			qc = _mm_setzero_si128();
			qd = _mm_setzero_si128();

			qc = _mm_insert_epi16(qc, (unsigned short)ps0[0], 0);
			qd = _mm_insert_epi16(qd, (unsigned short)pt0[0], 0);
			qc = _mm_insert_epi16(qc, (unsigned short)ps0[1], 1);
			qd = _mm_insert_epi16(qd, (unsigned short)pt0[1], 1);

			qc = _mm_insert_epi16(qc, (unsigned short)ps1[0], 2);
			qd = _mm_insert_epi16(qd, (unsigned short)pt1[0], 2);
			qc = _mm_insert_epi16(qc, (unsigned short)ps1[1], 3);
			qd = _mm_insert_epi16(qd, (unsigned short)pt1[1], 3);

			qc = _mm_insert_epi16(qc, (unsigned short)ps2[0], 4);
			qd = _mm_insert_epi16(qd, (unsigned short)pt2[0], 4);
			qc = _mm_insert_epi16(qc, (unsigned short)ps2[1], 5);
			qd = _mm_insert_epi16(qd, (unsigned short)pt2[1], 5);

			qc = _mm_insert_epi16(qc, (unsigned short)ps3[0], 6);
			qd = _mm_insert_epi16(qd, (unsigned short)pt3[0], 6);
			qc = _mm_insert_epi16(qc, (unsigned short)ps3[1], 7);
			qd = _mm_insert_epi16(qd, (unsigned short)pt3[1], 7);
			*/

			// wordwise
			qc = _mm_setzero_si128();
			qd = _mm_setzero_si128();

			qc = _mm_insert_epi16(qc, *(unsigned short*)ps0, 0);
			qd = _mm_insert_epi16(qd, *(unsigned short*)pt0, 0);

			qc = _mm_insert_epi16(qc, *(unsigned short*)ps1, 1);
			qd = _mm_insert_epi16(qd, *(unsigned short*)pt1, 1);

			qc = _mm_insert_epi16(qc, *(unsigned short*)ps2, 2);
			qd = _mm_insert_epi16(qd, *(unsigned short*)pt2, 2);

			qc = _mm_insert_epi16(qc, *(unsigned short*)ps3, 3);
			qd = _mm_insert_epi16(qd, *(unsigned short*)pt3, 3);

			qc0 = _mm_setzero_si128();
			qc = _mm_unpacklo_epi8(qc, qc0);
			qd = _mm_unpacklo_epi8(qd, qc0);

			// qc: pf,pc,p9,p8,p5,p4,p1,p0
			// qd: pe,pd,pb,pa,p7,p6,p3,p2

			qa = _mm_and_si128(qs, epi32_0x0000FFFF);
			qa1 = _mm_sub_epi32(epi32_0x00010000, qa);
			qa = _mm_srli_epi32(qa, 1);
			qa1 = _mm_srli_epi32(qa1, 1); // qa1 < USHORT_MAX
			qa = _mm_or_si128(_mm_slli_epi32(qa, 16), qa1);
			// qa: a,(1-a),a,(1-a),a,(1-a),a,(1-a) // shr by 1

			// P0..P1
			//  :P  :
			// P2..P3
			// P = ( p0*(1-a) + p1*a )*(1-b) + ( p2*(1-a) + p3*a )*b;

			// qa: a,(1-a),a,(1-a),a,(1-a),a,(1-a) // shr by 1

			qc = _mm_slli_epi16(qc, 1);
			qd = _mm_slli_epi16(qd, 1);

			qc = _mm_madd_epi16(qc, qa);	// fc,98,54,10
			qd = _mm_madd_epi16(qd, qa);	// ed,ba,76,32

			// qb1: (1-b),(1-b)
			// qb: b,b

			// 64-bit unsigned mul, use bits 63:16
			qc0 = _mm_mul_epu32(qc, qb1);	// 98,10
			qd0 = _mm_mul_epu32(qd, qb);	// ba,32
			qc1 = _mm_mul_epu32(_mm_srli_epi64(qc, 32), qb1);	// fc,54
			qd1 = _mm_mul_epu32(_mm_srli_epi64(qd, 32), qb);	// ed,76
			qc0 = _mm_srli_epi64(qc0, 16);
			qd0 = _mm_srli_epi64(qd0, 16);
			qc1 = _mm_slli_epi64(qc1, 16);
			qd1 = _mm_slli_epi64(qd1, 16);

			// take higher 32 bit
			qc = _mm_combine_si128(qc1, qc0, epi32_mask31);	// fc,98,54,10
			qd = _mm_combine_si128(qd1, qd0, epi32_mask31);	// ed,ba,76,32

			qc = _mm_add_epi32(qc, qd);

			// round
			qc0 = _mm_slli_epi32(_mm_and_si128(qc, epi32_0x00008000), 1);
			qc = _mm_add_epi32(qc, qc0);

			// pack
			qc = _mm_srli_epi32(qc, 16);
			qc = _mm_packs_epi32(qc, qc);
			qc = _mm_packus_epi16(qc, qc);

			qs = _mm_add_epi32(qs, qi);
			x += incx * 4;
			*(int*)dst = _mm_cvtsi128_si32(qc);
			//_mm_stream_si32((int*)dst, _mm_cvtsi128_si32(qc));
			dst += 4;
		}

		dst = sdst + dst_w;
	  } // switch (how)

	  y+=incy;
	}

	memsetP4(resized_frame+(endy*dst_w) , fill , dst_w*(dst_h-endy));
}

#endif // P4_ARCH