[PD-cvs] pd/src d_arithmetic.c,1.1.1.1.16.4,1.1.1.1.16.5 d_math.c,1.1.1.1.16.2,1.1.1.1.16.3 m_simd_def.h,1.1.2.2,1.1.2.3 m_simd_sse_gcc.h,1.1.2.2,1.1.2.3 m_simd_sse_vc.h,1.1.2.2,1.1.2.3 m_simd_ve_gcc.c,1.1.2.2,1.1.2.3 m_simd_ve_gcc.h,1.1.2.2,1.1.2.3

xovo at users.sourceforge.net
Mon Dec 29 03:02:00 CET 2003


Update of /cvsroot/pure-data/pd/src
In directory sc8-pr-cvs1:/tmp/cvs-serv19981

Modified Files:
      Tag: devel_0_37
	d_arithmetic.c d_math.c m_simd_def.h m_simd_sse_gcc.h 
	m_simd_sse_vc.h m_simd_ve_gcc.c m_simd_ve_gcc.h 
Log Message:
more PPC SIMD improvements



Index: d_arithmetic.c
===================================================================
RCS file: /cvsroot/pure-data/pd/src/d_arithmetic.c,v
retrieving revision 1.1.1.1.16.4
retrieving revision 1.1.1.1.16.5
diff -C2 -d -r1.1.1.1.16.4 -r1.1.1.1.16.5
*** d_arithmetic.c	23 Dec 2003 01:15:39 -0000	1.1.1.1.16.4
--- d_arithmetic.c	29 Dec 2003 02:01:57 -0000	1.1.1.1.16.5
***************
*** 355,358 ****
--- 355,376 ----
  }
  
+ /* T.Grill - squaring: optimized * for equal input signals */
+ static t_int *sqr_perf8(t_int *w)
+ {
+     t_float *in = (t_float *)(w[1]);
+     t_float *out = (t_float *)(w[2]);
+     int n = (int)(w[3]);
+ 
+     for (; n; n -= 8, in += 8, out += 8)
+     {
+     	float f0 = in[0], f1 = in[1], f2 = in[2], f3 = in[3];
+     	float f4 = in[4], f5 = in[5], f6 = in[6], f7 = in[7];
+ 
+     	out[0] = f0 * f0; out[1] = f1 * f1; out[2] = f2 * f2; out[3] = f3 * f3;
+     	out[4] = f4 * f4; out[5] = f5 * f5; out[6] = f6 * f6; out[7] = f7 * f7;
+     }
+     return (w+4);
+ }
+ 
  t_int *scalartimes_perform(t_int *w)
  {
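
For context: sqr_perf8 follows Pd's standard perform-routine convention, where dsp_add() packs the routine and its arguments as t_int words into the DSP chain and the routine returns a pointer just past its own arguments. A minimal sketch of that shape (illustrative only, not part of this commit):

    #include "m_pd.h"

    /* a perform routine taking 3 arguments (in, out, n): it reads
       w[1..3] and returns w+4 so the chain advances past them */
    static t_int *copy_perform(t_int *w)
    {
        t_float *in  = (t_float *)(w[1]);
        t_float *out = (t_float *)(w[2]);
        int n        = (int)(w[3]);
        while (n--) *out++ = *in++;
        return (w + 4);
    }

Since squaring needs one input pointer fewer than times_perf8, sqr_perf8 is registered with 3 arguments and returns w+4 where times_perf8 returns w+5.
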
***************
*** 383,386 ****
--- 401,405 ----
  }
  
+ /* T.Grill - added optimization for equal input signals */
  static void times_dsp(t_times *x, t_signal **sp)
  {
***************
*** 388,395 ****
      if (n&7)
      	dsp_add(times_perform, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
! 	else if(SIMD_CHECK3(n,sp[0]->s_vec,sp[1]->s_vec,sp[2]->s_vec))
!     	dsp_add(times_perf_simd, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
!     else	
!     	dsp_add(times_perf8, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
  }
  
--- 407,423 ----
      if (n&7)
      	dsp_add(times_perform, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
! 	else 
! 		if(sp[0]->s_vec == sp[1]->s_vec) {
! 			if(SIMD_CHECK2(n,sp[0]->s_vec,sp[2]->s_vec))
! 		    	dsp_add(sqr_perf_simd, 3, sp[0]->s_vec, sp[2]->s_vec, n);
! 		    else	
! 		    	dsp_add(sqr_perf8, 3, sp[0]->s_vec, sp[2]->s_vec, n);
! 		}
! 		else {
! 			if(SIMD_CHECK3(n,sp[0]->s_vec,sp[1]->s_vec,sp[2]->s_vec))
! 		    	dsp_add(times_perf_simd, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
! 		    else	
! 		    	dsp_add(times_perf8, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
! 		}
  }
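
The dispatch above works because Pd hands the same signal buffer (s_vec) to both inlets when a single outlet fans into them, so a plain pointer comparison detects the squaring case and halves the number of input streams read. A scalar model of the idea (sketch, hypothetical names):

    /* sketch: pick the cheaper kernel when both inputs alias */
    static void mul_dispatch(const float *a, const float *b,
                             float *out, int n)
    {
        int i;
        if (a == b)                       /* one stream: square it */
            for (i = 0; i < n; i++) out[i] = a[i] * a[i];
        else                              /* two distinct streams */
            for (i = 0; i < n; i++) out[i] = a[i] * b[i];
    }
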
  
***************
*** 498,505 ****
  }
  
  t_int *scalarover_perform(t_int *w)
  {
      t_float *in = (t_float *)(w[1]);
!     t_float f = 1. / *(t_float *)(w[2]);
      t_float *out = (t_float *)(w[3]);
      int n = (int)(w[4]);
--- 526,535 ----
  }
  
+ /* T.Grill - added check for zero */
  t_int *scalarover_perform(t_int *w)
  {
      t_float *in = (t_float *)(w[1]);
!     t_float f = *(t_float *)(w[2]);
!     if(f) f = 1./f;
      t_float *out = (t_float *)(w[3]);
      int n = (int)(w[4]);
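
The added guard keeps a zero scalar divisor from filling the whole block with inf/NaN: the reciprocal is still computed once per block, but a zero divisor now yields a zero multiplier and hence a silent output. In isolation the pattern is just (sketch):

    /* sketch of the guard: reciprocal-or-zero, hoisted out of the loop */
    static float recip_or_zero(float f)
    {
        return (f != 0.f) ? (1.f / f) : 0.f;
    }
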

Index: d_math.c
===================================================================
RCS file: /cvsroot/pure-data/pd/src/d_math.c,v
retrieving revision 1.1.1.1.16.2
retrieving revision 1.1.1.1.16.3
diff -C2 -d -r1.1.1.1.16.2 -r1.1.1.1.16.3
*** d_math.c	28 Dec 2003 14:22:10 -0000	1.1.1.1.16.2
--- d_math.c	29 Dec 2003 02:01:57 -0000	1.1.1.1.16.3
***************
*** 48,52 ****
      	float f = *in++;
      	if (f < lo) f = lo;
!     	if (f > hi) f = hi;
      	*out++ = f;
      }
--- 48,52 ----
      	float f = *in++;
      	if (f < lo) f = lo;
!     	else if (f > hi) f = hi;
      	*out++ = f;
      }
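
Since lo <= hi in any sensible clip~ setting, the two clamps are mutually exclusive, and the else saves the second comparison whenever the first one fires. As a standalone helper (sketch):

    /* sketch: clamp with at most one taken branch per sample */
    static float clipf(float f, float lo, float hi)
    {
        if (f < lo) f = lo;
        else if (f > hi) f = hi;
        return f;
    }
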

Index: m_simd_def.h
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd_def.h,v
retrieving revision 1.1.2.2
retrieving revision 1.1.2.3
diff -C2 -d -r1.1.2.2 -r1.1.2.3
*** m_simd_def.h	28 Dec 2003 12:32:03 -0000	1.1.2.2
--- m_simd_def.h	29 Dec 2003 02:01:57 -0000	1.1.2.3
***************
*** 28,31 ****
--- 28,32 ----
  #define times_perf_simd         times_perf8
  #define scalartimes_perf_simd   scalartimes_perf8
+ #define sqr_perf_simd           sqr_perf8
  #define over_perf_simd          over_perf8
  #define scalarover_perf_simd    scalarover_perf8

Index: m_simd_sse_gcc.h
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd_sse_gcc.h,v
retrieving revision 1.1.2.2
retrieving revision 1.1.2.3
diff -C2 -d -r1.1.2.2 -r1.1.2.3
*** m_simd_sse_gcc.h	28 Dec 2003 12:32:03 -0000	1.1.2.2
--- m_simd_sse_gcc.h	29 Dec 2003 02:01:57 -0000	1.1.2.3
***************
*** 19,22 ****
--- 19,23 ----
  t_int *times_perf_sse_gcc(t_int *w);
  t_int *scalartimes_perf_sse_gcc(t_int *w);
+ t_int *sqr_perf_sse_gcc(t_int *w);
  t_int *over_perf_sse_gcc(t_int *w);
  t_int *scalarover_perf_sse_gcc(t_int *w);
***************
*** 47,50 ****
--- 48,52 ----
  #define times_perf_simd         times_perf8 /* SIMD not implemented */
  #define scalartimes_perf_simd   scalartimes_perf8 /* SIMD not implemented */
+ #define sqr_perf_simd           sqr_perf8 /* SIMD not implemented */
  #define over_perf_simd          over_perf8 /* SIMD not implemented */
  #define scalarover_perf_simd    scalarover_perf8 /* SIMD not implemented */

Index: m_simd_sse_vc.h
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd_sse_vc.h,v
retrieving revision 1.1.2.2
retrieving revision 1.1.2.3
diff -C2 -d -r1.1.2.2 -r1.1.2.3
*** m_simd_sse_vc.h	28 Dec 2003 12:32:03 -0000	1.1.2.2
--- m_simd_sse_vc.h	29 Dec 2003 02:01:57 -0000	1.1.2.3
***************
*** 19,22 ****
--- 19,23 ----
  t_int *times_perf_sse_vc(t_int *w);
  t_int *scalartimes_perf_sse_vc(t_int *w);
+ t_int *sqr_perf_sse_vc(t_int *w);
  t_int *over_perf_sse_vc(t_int *w);
  t_int *scalarover_perf_sse_vc(t_int *w);
***************
*** 47,50 ****
--- 48,52 ----
  #define times_perf_simd         times_perf_sse_vc
  #define scalartimes_perf_simd   scalartimes_perf_sse_vc
+ #define sqr_perf_simd           sqr_perf8 /* SIMD not implemented */
  #define over_perf_simd          over_perf8 /* SIMD not implemented */
  #define scalarover_perf_simd    scalarover_perf_sse_vc

Index: m_simd_ve_gcc.c
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd_ve_gcc.c,v
retrieving revision 1.1.2.2
retrieving revision 1.1.2.3
diff -C2 -d -r1.1.2.2 -r1.1.2.3
*** m_simd_ve_gcc.c	28 Dec 2003 12:32:03 -0000	1.1.2.2
--- m_simd_ve_gcc.c	29 Dec 2003 02:01:57 -0000	1.1.2.3
***************
*** 9,12 ****
--- 9,18 ----
  #if defined(__GNUC__) && defined(__POWERPC__) && defined(__ALTIVEC__)
  
+ //#define USEVECLIB
+ 
+ #ifdef USEVECLIB
+ #include <vecLib/vDSP.h>
+ #include <vecLib/vfp.h>
+ #endif
  
  /* functions for unaligned vector data - taken from http://developer.apple.com/hardware/ve/alignment.html */
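
USEVECLIB (left disabled above) would route several kernels through Apple's vecLib instead of the hand-written AltiVec loops; the legacy vDSP entry points used below (vadd, vsub, vmul, vsmul, vsq) take (pointer, stride) pairs plus an element count. A minimal sketch of the compile-time toggle, assuming the same headers:

    #ifdef USEVECLIB
    #include <vecLib/vDSP.h>
    #endif

    /* sketch: one kernel, two back ends chosen at compile time */
    static void add_block(const float *a, const float *b, float *o, int n)
    {
    #ifdef USEVECLIB
        vadd(a, 1, b, 1, o, 1, n);    /* o[i] = a[i] + b[i], unit stride */
    #else
        int i;
        for (i = 0; i < n; i++) o[i] = a[i] + b[i];
    #endif
    }
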
***************
*** 55,62 ****
  
  	for(; n--; src += 16,dst += 16) {
! 		vec_st(vec_ld( 0,src), 0,dst);
! 		vec_st(vec_ld(16,src),16,dst);
! 		vec_st(vec_ld(32,src),32,dst);
! 		vec_st(vec_ld(48,src),48,dst);
  	}
  	return w+4;
--- 61,72 ----
  
  	for(; n--; src += 16,dst += 16) {
! 		vector float a1 = vec_ld( 0,src);
! 		vector float a2 = vec_ld(16,src);
! 		vector float a3 = vec_ld(32,src);
! 		vector float a4 = vec_ld(48,src);
! 		vec_st(a1, 0,dst);
! 		vec_st(a2,16,dst);
! 		vec_st(a3,32,dst);
! 		vec_st(a4,48,dst);
  	}
  	return w+4;
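
The reshaping here (and in all the kernels below) from nested vec_st(vec_ld(...)) calls to separate load / compute / store phases doesn't change semantics; it gives the compiler room to schedule the high-latency vector loads well ahead of the arithmetic and stores that depend on them. The pattern, compacted to two vectors (sketch):

    #include <altivec.h>

    /* sketch: issue the loads first so their latency overlaps the adds */
    static void add8(const float *src1, const float *src2, float *dst)
    {
        vector float a1 = vec_ld( 0, src1), b1 = vec_ld( 0, src2);
        vector float a2 = vec_ld(16, src1), b2 = vec_ld(16, src2);
        a1 = vec_add(a1, b1);
        a2 = vec_add(a2, b2);
        vec_st(a1,  0, dst);
        vec_st(a2, 16, dst);
    }
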
***************
*** 80,83 ****
--- 90,96 ----
  t_int *plus_perf_ve_gcc(t_int *w)
  {
+ #ifdef USEVECLIB
+ 	vadd((const t_float *)w[1],1,(const t_float *)w[2],1,(t_float *)w[3],1,w[4]);
+ #else
      const t_float *src1 = (const t_float *)w[1];
      const t_float *src2 = (const t_float *)w[2];
***************
*** 86,94 ****
     
  	for(; n--; src1 += 16,src2 += 16,dst += 16) {
! 		vec_st(vec_add(vec_ld( 0,src1),vec_ld( 0,src2)), 0,dst);
! 		vec_st(vec_add(vec_ld(16,src1),vec_ld(16,src2)),16,dst);
! 		vec_st(vec_add(vec_ld(32,src1),vec_ld(32,src2)),32,dst);
! 		vec_st(vec_add(vec_ld(48,src1),vec_ld(48,src2)),48,dst);
  	}
  	return w+5;
  }
--- 99,118 ----
     
  	for(; n--; src1 += 16,src2 += 16,dst += 16) {
! 		vector float a1 = vec_ld( 0,src1),b1 = vec_ld( 0,src2);
! 		vector float a2 = vec_ld(16,src1),b2 = vec_ld(16,src2);
! 		vector float a3 = vec_ld(32,src1),b3 = vec_ld(32,src2);
! 		vector float a4 = vec_ld(48,src1),b4 = vec_ld(48,src2);
! 		
! 		a1 = vec_add(a1,b1);
! 		a2 = vec_add(a2,b2);
! 		a3 = vec_add(a3,b3);
! 		a4 = vec_add(a4,b4);
! 
! 		vec_st(a1, 0,dst);
! 		vec_st(a2,16,dst);
! 		vec_st(a3,32,dst);
! 		vec_st(a4,48,dst);
  	}
+ #endif
  	return w+5;
  }
***************
*** 102,109 ****
     
  	for(; n--; src += 16,dst += 16) {
! 		vec_st(vec_add(vec_ld( 0,src),arg), 0,dst);
! 		vec_st(vec_add(vec_ld(16,src),arg),16,dst);
! 		vec_st(vec_add(vec_ld(32,src),arg),32,dst);
! 		vec_st(vec_add(vec_ld(48,src),arg),48,dst);
  	}
  	return w+5;
--- 126,143 ----
     
  	for(; n--; src += 16,dst += 16) {
! 		vector float a1 = vec_ld( 0,src);
! 		vector float a2 = vec_ld(16,src);
! 		vector float a3 = vec_ld(32,src);
! 		vector float a4 = vec_ld(48,src);
! 		
! 		a1 = vec_add(a1,arg);
! 		a2 = vec_add(a2,arg);
! 		a3 = vec_add(a3,arg);
! 		a4 = vec_add(a4,arg);
! 
! 		vec_st(a1, 0,dst);
! 		vec_st(a2,16,dst);
! 		vec_st(a3,32,dst);
! 		vec_st(a4,48,dst);
  	}
  	return w+5;
***************
*** 112,115 ****
--- 146,152 ----
  t_int *minus_perf_ve_gcc(t_int *w)
  {
+ #ifdef USEVECLIB
+ 	vsub((const t_float *)w[1],1,(const t_float *)w[2],1,(t_float *)w[3],1,w[4]);
+ #else
      const t_float *src1 = (const t_float *)w[1];
      const t_float *src2 = (const t_float *)w[2];
***************
*** 118,126 ****
     
  	for(; n--; src1 += 16,src2 += 16,dst += 16) {
! 		vec_st(vec_sub(vec_ld( 0,src1),vec_ld( 0,src2)), 0,dst);
! 		vec_st(vec_sub(vec_ld(16,src1),vec_ld(16,src2)),16,dst);
! 		vec_st(vec_sub(vec_ld(32,src1),vec_ld(32,src2)),32,dst);
! 		vec_st(vec_sub(vec_ld(48,src1),vec_ld(48,src2)),48,dst);
  	}
  	return w+5;
  }
--- 155,174 ----
     
  	for(; n--; src1 += 16,src2 += 16,dst += 16) {
! 		vector float a1 = vec_ld( 0,src1),b1 = vec_ld( 0,src2);
! 		vector float a2 = vec_ld(16,src1),b2 = vec_ld(16,src2);
! 		vector float a3 = vec_ld(32,src1),b3 = vec_ld(32,src2);
! 		vector float a4 = vec_ld(48,src1),b4 = vec_ld(48,src2);
! 		
! 		a1 = vec_sub(a1,b1);
! 		a2 = vec_sub(a2,b2);
! 		a3 = vec_sub(a3,b3);
! 		a4 = vec_sub(a4,b4);
! 
! 		vec_st(a1, 0,dst);
! 		vec_st(a2,16,dst);
! 		vec_st(a3,32,dst);
! 		vec_st(a4,48,dst);
  	}
+ #endif
  	return w+5;
  }
***************
*** 134,141 ****
     
  	for(; n--; src += 16,dst += 16) {
! 		vec_st(vec_sub(vec_ld( 0,src),arg), 0,dst);
! 		vec_st(vec_sub(vec_ld(16,src),arg),16,dst);
! 		vec_st(vec_sub(vec_ld(32,src),arg),32,dst);
! 		vec_st(vec_sub(vec_ld(48,src),arg),48,dst);
  	}
  	return w+5;
--- 182,199 ----
     
  	for(; n--; src += 16,dst += 16) {
! 		vector float a1 = vec_ld( 0,src);
! 		vector float a2 = vec_ld(16,src);
! 		vector float a3 = vec_ld(32,src);
! 		vector float a4 = vec_ld(48,src);
! 		
! 		a1 = vec_sub(a1,arg);
! 		a2 = vec_sub(a2,arg);
! 		a3 = vec_sub(a3,arg);
! 		a4 = vec_sub(a4,arg);
! 
! 		vec_st(a1, 0,dst);
! 		vec_st(a2,16,dst);
! 		vec_st(a3,32,dst);
! 		vec_st(a4,48,dst);
  	}
  	return w+5;
***************
*** 144,147 ****
--- 202,208 ----
  t_int *times_perf_ve_gcc(t_int *w)
  {
+ #ifdef USEVECLIB
+ 	vmul((const t_float *)w[1],1,(const t_float *)w[2],1,(t_float *)w[3],1,w[4]);
+ #else
      const t_float *src1 = (const t_float *)w[1];
      const t_float *src2 = (const t_float *)w[2];
***************
*** 151,159 ****
     
  	for(; n--; src1 += 16,src2 += 16,dst += 16) {
! 		vec_st(vec_madd(vec_ld( 0,src1),vec_ld( 0,src2),zero), 0,dst);
! 		vec_st(vec_madd(vec_ld(16,src1),vec_ld(16,src2),zero),16,dst);
! 		vec_st(vec_madd(vec_ld(32,src1),vec_ld(32,src2),zero),32,dst);
! 		vec_st(vec_madd(vec_ld(48,src1),vec_ld(48,src2),zero),48,dst);
  	}
  	return w+5;
  }
--- 212,231 ----
     
  	for(; n--; src1 += 16,src2 += 16,dst += 16) {
! 		vector float a1 = vec_ld( 0,src1),b1 = vec_ld( 0,src2);
! 		vector float a2 = vec_ld(16,src1),b2 = vec_ld(16,src2);
! 		vector float a3 = vec_ld(32,src1),b3 = vec_ld(32,src2);
! 		vector float a4 = vec_ld(48,src1),b4 = vec_ld(48,src2);
! 		
! 		a1 = vec_madd(a1,b1,zero);
! 		a2 = vec_madd(a2,b2,zero);
! 		a3 = vec_madd(a3,b3,zero);
! 		a4 = vec_madd(a4,b4,zero);
! 
! 		vec_st(a1, 0,dst);
! 		vec_st(a2,16,dst);
! 		vec_st(a3,32,dst);
! 		vec_st(a4,48,dst);
  	}
+ #endif
  	return w+5;
  }
***************
*** 161,164 ****
--- 233,239 ----
  t_int *scalartimes_perf_ve_gcc(t_int *w)
  {
+ #ifdef USEVECLIB
+ 	vsmul((const t_float *)w[1],1,(t_float *)w[2],(t_float *)w[3],1,w[4]);
+ #else
      const t_float *src = (const t_float *)w[1];
  	const vector float arg = LoadValue(w[2]);
***************
*** 168,175 ****
     
  	for(; n--; src += 16,dst += 16) {
! 		vec_st(vec_madd(vec_ld( 0,src),arg,zero), 0,dst);
! 		vec_st(vec_madd(vec_ld(16,src),arg,zero),16,dst);
! 		vec_st(vec_madd(vec_ld(32,src),arg,zero),32,dst);
! 		vec_st(vec_madd(vec_ld(48,src),arg,zero),48,dst);
  	}
  	return w+5;
--- 243,401 ----
     
  	for(; n--; src += 16,dst += 16) {
! 		vector float a1 = vec_ld( 0,src);
! 		vector float a2 = vec_ld(16,src);
! 		vector float a3 = vec_ld(32,src);
! 		vector float a4 = vec_ld(48,src);
! 		
! 		a1 = vec_madd(a1,arg,zero);
! 		a2 = vec_madd(a2,arg,zero);
! 		a3 = vec_madd(a3,arg,zero);
! 		a4 = vec_madd(a4,arg,zero);
! 
! 		vec_st(a1, 0,dst);
! 		vec_st(a2,16,dst);
! 		vec_st(a3,32,dst);
! 		vec_st(a4,48,dst);
! 	}
! #endif
! 	return w+5;
! }
! 
! t_int *sqr_perf_ve_gcc(t_int *w)
! {
! #ifdef USEVECLIB
! 	vsq((const t_float *)w[1],1,(t_float *)w[2],1,w[3]);
! #else
!     const t_float *src = (const t_float *)w[1];
!     t_float *dst = (t_float *)w[2];
!     const vector float zero = (vector float)(0);
!     int n = w[3]>>4;
!    
! 	for(; n--; src += 16,dst += 16) {
! 		vector float a1 = vec_ld( 0,src);
! 		vector float a2 = vec_ld(16,src);
! 		vector float a3 = vec_ld(32,src);
! 		vector float a4 = vec_ld(48,src);
! 		
! 		a1 = vec_madd(a1,a1,zero);
! 		a2 = vec_madd(a2,a2,zero);
! 		a3 = vec_madd(a3,a3,zero);
! 		a4 = vec_madd(a4,a4,zero);
! 
! 		vec_st(a1, 0,dst);
! 		vec_st(a2,16,dst);
! 		vec_st(a3,32,dst);
! 		vec_st(a4,48,dst);
! 	}
! #endif
! 	return w+4;
! }
! 
! t_int *over_perf_ve_gcc(t_int *w)
! {
!     const t_float *src1 = (const t_float *)w[1];
!     const t_float *src2 = (const t_float *)w[2];
!     t_float *dst = (t_float *)w[3];
!     const vector float zero = (vector float)(0);
!     const vector float one = (vector float)(1);
!     int n = w[4]>>4;
!    
! 	for(; n--; src1 += 16,src2 += 16,dst += 16) {
! #ifdef USEVECLIB
! 		/* no zero checking here */
! 		vec_st(vdivf(vec_ld( 0,src1),vec_ld( 0,src2)), 0,dst);
! 		vec_st(vdivf(vec_ld(16,src1),vec_ld(16,src2)),16,dst);
! 		vec_st(vdivf(vec_ld(32,src1),vec_ld(32,src2)),32,dst);
! 		vec_st(vdivf(vec_ld(48,src1),vec_ld(48,src2)),48,dst);
! #else
! 	    vector float data1 = vec_ld( 0,src2);
! 	    vector float data2 = vec_ld(16,src2); 
! 	    vector float data3 = vec_ld(32,src2); 
! 	    vector float data4 = vec_ld(48,src2); 
! 
! 		vector unsigned char mask1 = vec_nor((vector unsigned char)vec_cmpeq(data1,zero),(vector unsigned char)zero); /* bit mask... all 0 for data = 0., all 1 else */
! 		vector unsigned char mask2 = vec_nor((vector unsigned char)vec_cmpeq(data2,zero),(vector unsigned char)zero); /* bit mask... all 0 for data = 0., all 1 else */
! 		vector unsigned char mask3 = vec_nor((vector unsigned char)vec_cmpeq(data3,zero),(vector unsigned char)zero); /* bit mask... all 0 for data = 0., all 1 else */
! 		vector unsigned char mask4 = vec_nor((vector unsigned char)vec_cmpeq(data4,zero),(vector unsigned char)zero); /* bit mask... all 0 for data = 0., all 1 else */
! 
! 		/* make estimated reciprocal and zero out NANs */
! 		vector float tmp1 = vec_re(data1);
! 		vector float tmp2 = vec_re(data2);
! 		vector float tmp3 = vec_re(data3);
! 		vector float tmp4 = vec_re(data4);
! 		
! 		tmp1 = (vector float)vec_and((vector unsigned char)tmp1,mask1); 
! 		tmp2 = (vector float)vec_and((vector unsigned char)tmp2,mask2); 
! 		tmp3 = (vector float)vec_and((vector unsigned char)tmp3,mask3); 
! 		tmp4 = (vector float)vec_and((vector unsigned char)tmp4,mask4); 
! 
! 		data1 = vec_madd( vec_nmsub( tmp1, data1, one ), tmp1, tmp1 );
! 		data2 = vec_madd( vec_nmsub( tmp2, data2, one ), tmp2, tmp2 );
! 		data3 = vec_madd( vec_nmsub( tmp3, data3, one ), tmp3, tmp3 );
! 		data4 = vec_madd( vec_nmsub( tmp4, data4, one ), tmp4, tmp4 );
! 
! 		tmp1 = vec_ld( 0,src1);
! 		tmp2 = vec_ld(16,src1);
! 		tmp3 = vec_ld(32,src1);
! 		tmp4 = vec_ld(48,src1);
! 
! 		data1 = vec_madd(tmp1,data1,zero);
! 		data2 = vec_madd(tmp2,data2,zero);
! 		data3 = vec_madd(tmp3,data3,zero);
! 		data4 = vec_madd(tmp4,data4,zero);
! 
! 		vec_st(data1, 0,dst);
! 		vec_st(data2,16,dst);
! 		vec_st(data3,32,dst);
! 		vec_st(data4,48,dst);
! #endif
! 	}
! 	return w+5;
! }
! 
! t_int *scalarover_perf_ve_gcc(t_int *w)
! {
!     t_float *dst = (t_float *)w[3];
!     const vector float zero = (vector float)(0);
!     int n = w[4]>>4;
! 
! 	if(*(t_float *)w[2]) {
! 	    const t_float *src = (const t_float *)w[1];
! #ifdef USEVECLIB
! 		float arg = *(t_float *)w[2]?1./ *(t_float *)w[2]: 0;
! 		vsmul(src,1,&arg,dst,1,w[4]);
! #else
! 		const vector float v = LoadValue(w[2]);
! 	    const vector float one = (vector float)(1);
! 
! 	    vector float estimate = vec_re(v); 
! 		vector float arg = vec_madd( vec_nmsub( estimate, v, one ), estimate, estimate );
! 
! 		for(; n--; src += 16,dst += 16) {
! 			vector float a1 = vec_ld( 0,src);
! 			vector float a2 = vec_ld(16,src);
! 			vector float a3 = vec_ld(32,src);
! 			vector float a4 = vec_ld(48,src);
! 			
! 			a1 = vec_madd(a1,arg,zero);
! 			a2 = vec_madd(a2,arg,zero);
! 			a3 = vec_madd(a3,arg,zero);
! 			a4 = vec_madd(a4,arg,zero);
! 
! 			vec_st(a1, 0,dst);
! 			vec_st(a2,16,dst);
! 			vec_st(a3,32,dst);
! 			vec_st(a4,48,dst);
! 		}
! #endif
! 	}
! 	else {
! 		/* zero all output */
! 		for(; n--; dst += 16) {
! 			vec_st(zero, 0,dst);
! 			vec_st(zero,16,dst);
! 			vec_st(zero,32,dst);
! 			vec_st(zero,48,dst);
! 		}
  	}
  	return w+5;
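
over_perf_ve_gcc and scalarover_perf_ve_gcc replace the division with vec_re's reciprocal estimate plus one Newton-Raphson step. With an estimate e0 ~ 1/d, the refinement is

    e1 = e0 + e0*(1 - d*e0) = e0*(2 - d*e0)

which is exactly vec_madd(vec_nmsub(e0, d, one), e0, e0), since vec_nmsub(a, b, c) computes c - a*b. The cmpeq/nor/and masking beforehand zeroes the estimate wherever d = 0, so zero divisors produce 0 output instead of NaN, matching the zero check in the scalar over_perform.
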
***************
*** 184,191 ****
     
  	for(; n--; src1 += 16,src2 += 16,dst += 16) {
! 		vec_st(vec_min(vec_ld( 0,src1),vec_ld( 0,src2)), 0,dst);
! 		vec_st(vec_min(vec_ld(16,src1),vec_ld(16,src2)),16,dst);
! 		vec_st(vec_min(vec_ld(32,src1),vec_ld(32,src2)),32,dst);
! 		vec_st(vec_min(vec_ld(48,src1),vec_ld(48,src2)),48,dst);
  	}
  	return w+5;
--- 410,427 ----
     
  	for(; n--; src1 += 16,src2 += 16,dst += 16) {
! 		vector float a1 = vec_ld( 0,src1),b1 = vec_ld( 0,src2);
! 		vector float a2 = vec_ld(16,src1),b2 = vec_ld(16,src2);
! 		vector float a3 = vec_ld(32,src1),b3 = vec_ld(32,src2);
! 		vector float a4 = vec_ld(48,src1),b4 = vec_ld(48,src2);
! 		
! 		a1 = vec_min(a1,b1);
! 		a2 = vec_min(a2,b2);
! 		a3 = vec_min(a3,b3);
! 		a4 = vec_min(a4,b4);
! 
! 		vec_st(a1, 0,dst);
! 		vec_st(a2,16,dst);
! 		vec_st(a3,32,dst);
! 		vec_st(a4,48,dst);
  	}
  	return w+5;
***************
*** 200,207 ****
     
  	for(; n--; src += 16,dst += 16) {
! 		vec_st(vec_min(vec_ld( 0,src),arg), 0,dst);
! 		vec_st(vec_min(vec_ld(16,src),arg),16,dst);
! 		vec_st(vec_min(vec_ld(32,src),arg),32,dst);
! 		vec_st(vec_min(vec_ld(48,src),arg),48,dst);
  	}
  	return w+5;
--- 436,453 ----
     
  	for(; n--; src += 16,dst += 16) {
! 		vector float a1 = vec_ld( 0,src);
! 		vector float a2 = vec_ld(16,src);
! 		vector float a3 = vec_ld(32,src);
! 		vector float a4 = vec_ld(48,src);
! 		
! 		a1 = vec_min(a1,arg);
! 		a2 = vec_min(a2,arg);
! 		a3 = vec_min(a3,arg);
! 		a4 = vec_min(a4,arg);
! 
! 		vec_st(a1, 0,dst);
! 		vec_st(a2,16,dst);
! 		vec_st(a3,32,dst);
! 		vec_st(a4,48,dst);
  	}
  	return w+5;
***************
*** 216,223 ****
     
  	for(; n--; src1 += 16,src2 += 16,dst += 16) {
! 		vec_st(vec_max(vec_ld( 0,src1),vec_ld( 0,src2)), 0,dst);
! 		vec_st(vec_max(vec_ld(16,src1),vec_ld(16,src2)),16,dst);
! 		vec_st(vec_max(vec_ld(32,src1),vec_ld(32,src2)),32,dst);
! 		vec_st(vec_max(vec_ld(48,src1),vec_ld(48,src2)),48,dst);
  	}
  	return w+5;
--- 462,479 ----
     
  	for(; n--; src1 += 16,src2 += 16,dst += 16) {
! 		vector float a1 = vec_ld( 0,src1),b1 = vec_ld( 0,src2);
! 		vector float a2 = vec_ld(16,src1),b2 = vec_ld(16,src2);
! 		vector float a3 = vec_ld(32,src1),b3 = vec_ld(32,src2);
! 		vector float a4 = vec_ld(48,src1),b4 = vec_ld(48,src2);
! 		
! 		a1 = vec_max(a1,b1);
! 		a2 = vec_max(a2,b2);
! 		a3 = vec_max(a3,b3);
! 		a4 = vec_max(a4,b4);
! 
! 		vec_st(a1, 0,dst);
! 		vec_st(a2,16,dst);
! 		vec_st(a3,32,dst);
! 		vec_st(a4,48,dst);
  	}
  	return w+5;
***************
*** 232,244 ****
     
  	for(; n--; src += 16,dst += 16) {
! 		vec_st(vec_max(vec_ld( 0,src),arg), 0,dst);
! 		vec_st(vec_max(vec_ld(16,src),arg),16,dst);
! 		vec_st(vec_max(vec_ld(32,src),arg),32,dst);
! 		vec_st(vec_max(vec_ld(48,src),arg),48,dst);
  	}
  	return w+5;
  }
  
! #if 0 /* doesn't work */
  t_int *sigsqrt_perf_ve_gcc(t_int *w)
  {
--- 488,586 ----
     
  	for(; n--; src += 16,dst += 16) {
! 		vector float a1 = vec_ld( 0,src);
! 		vector float a2 = vec_ld(16,src);
! 		vector float a3 = vec_ld(32,src);
! 		vector float a4 = vec_ld(48,src);
! 		
! 		a1 = vec_max(a1,arg);
! 		a2 = vec_max(a2,arg);
! 		a3 = vec_max(a3,arg);
! 		a4 = vec_max(a4,arg);
! 
! 		vec_st(a1, 0,dst);
! 		vec_st(a2,16,dst);
! 		vec_st(a3,32,dst);
! 		vec_st(a4,48,dst);
  	}
  	return w+5;
  }
  
! t_int *clip_perf_ve_gcc(t_int *w)
! {
!     const t_float *src = (const t_float *)w[1];
!     t_float *dst = (t_float *)w[2];
! 	const vector float lo = LoadValue(w[3]);
! 	const vector float hi = LoadValue(w[4]);
!     int n = w[5]>>4;
!    
! 	for(; n--; src += 16,dst += 16) {
! 		vector float data1 = vec_ld( 0,src);
! 		vector float data2 = vec_ld(16,src);
! 		vector float data3 = vec_ld(32,src);
! 		vector float data4 = vec_ld(48,src);
! 		
! 		vector unsigned char mlo1 = (vector unsigned char)vec_cmple(data1,lo); /* bit mask data <= lo */
! 		vector unsigned char mlo2 = (vector unsigned char)vec_cmple(data2,lo); /* bit mask data <= lo */
! 		vector unsigned char mlo3 = (vector unsigned char)vec_cmple(data3,lo); /* bit mask data <= lo */
! 		vector unsigned char mlo4 = (vector unsigned char)vec_cmple(data4,lo); /* bit mask data <= lo */
! 
! 		vector unsigned char mhi1 = (vector unsigned char)vec_cmpge(data1,hi); /* bit mask data >= hi */
! 		vector unsigned char mhi2 = (vector unsigned char)vec_cmpge(data2,hi); /* bit mask data >= hi */
! 		vector unsigned char mhi3 = (vector unsigned char)vec_cmpge(data3,hi); /* bit mask data >= hi */
! 		vector unsigned char mhi4 = (vector unsigned char)vec_cmpge(data4,hi); /* bit mask data >= hi */
! 
! 		data1 = (vector float)vec_and((vector unsigned char)data1,vec_nor(mlo1,mhi1));
! 		data2 = (vector float)vec_and((vector unsigned char)data2,vec_nor(mlo2,mhi2));
! 		data3 = (vector float)vec_and((vector unsigned char)data3,vec_nor(mlo3,mhi3));
! 		data4 = (vector float)vec_and((vector unsigned char)data4,vec_nor(mlo4,mhi4));
! 		
! 		mlo1 = vec_and((vector unsigned char)lo,mlo1);
! 		mlo2 = vec_and((vector unsigned char)lo,mlo2);
! 		mlo3 = vec_and((vector unsigned char)lo,mlo3);
! 		mlo4 = vec_and((vector unsigned char)lo,mlo4);
! 		
! 		mhi1 = vec_and((vector unsigned char)hi,mhi1);
! 		mhi2 = vec_and((vector unsigned char)hi,mhi2);
! 		mhi3 = vec_and((vector unsigned char)hi,mhi3);
! 		mhi4 = vec_and((vector unsigned char)hi,mhi4);
! 
! 		data1 = (vector float)vec_or(vec_or(mlo1,mhi1),(vector unsigned char)data1);
! 		data2 = (vector float)vec_or(vec_or(mlo2,mhi2),(vector unsigned char)data2);
! 		data3 = (vector float)vec_or(vec_or(mlo3,mhi3),(vector unsigned char)data3);
! 		data4 = (vector float)vec_or(vec_or(mlo4,mhi4),(vector unsigned char)data4);
! 
! 		vec_st(data1, 0,dst);
! 		vec_st(data2,16,dst);
! 		vec_st(data3,32,dst);
! 		vec_st(data4,48,dst);
! 	}
! 	return w+6;
! }
! 
! t_int *sigwrap_perf_ve_gcc(t_int *w)
! {
!     const t_float *src = (const t_float *)w[1];
!     t_float *dst = (t_float *)w[2];
!     int n = w[3]>>4;
! 
! 	for(; n--; src += 16,dst += 16) {
! 		vector float data1 = vec_ld( 0,src);
! 		vector float data2 = vec_ld(16,src);
! 		vector float data3 = vec_ld(32,src);
! 		vector float data4 = vec_ld(48,src);
! 		
! 		data1 = vec_sub(data1,vec_floor(data1));
! 		data2 = vec_sub(data2,vec_floor(data2));
! 		data3 = vec_sub(data3,vec_floor(data3));
! 		data4 = vec_sub(data4,vec_floor(data4));
! 		
! 		vec_st(data1, 0,dst);
! 		vec_st(data2,16,dst);
! 		vec_st(data3,32,dst);
! 		vec_st(data4,48,dst);
! 	}
! 	return w+4;
! }
! 
  t_int *sigsqrt_perf_ve_gcc(t_int *w)
  {
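
clip_perf_ve_gcc avoids per-sample branches by building all-ones/all-zero comparison masks and selecting, per element, exactly one of {x, lo, hi} with bitwise operations; sigwrap_perf_ve_gcc computes wrap(x) = x - floor(x) directly via vec_floor. A scalar model of the mask selection (sketch; the vector code above does the same for 16 floats per iteration):

    #include <string.h>

    /* sketch: branch-free clip, one float at a time */
    static float clip_branchfree(float x, float lo, float hi)
    {
        unsigned ux, ulo, uhi, mlo, mhi, r;
        memcpy(&ux, &x, 4); memcpy(&ulo, &lo, 4); memcpy(&uhi, &hi, 4);
        mlo = (x <= lo) ? 0xFFFFFFFFu : 0u;     /* vec_cmple(data,lo) */
        mhi = (x >= hi) ? 0xFFFFFFFFu : 0u;     /* vec_cmpge(data,hi) */
        r = (ux & ~(mlo | mhi)) | (ulo & mlo) | (uhi & mhi);
        memcpy(&x, &r, 4);
        return x;
    }
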
***************
*** 252,269 ****
  
  	for(; n--; src += 16,dst += 16) {
! 		/* http://developer.apple.com/hardware/ve/algorithms.html*/
  
! 		vector float data1 = vec_ld( 0,src),estimate1 = vec_rsqrte(data1); 
! 		vec_st(vec_madd(data1,vec_madd( vec_nmsub( data1, vec_madd( estimate1, estimate1, zero ), one ), vec_madd( estimate1, oneHalf, zero ), estimate1 ), zero), 0,dst);
! 		vector float data2 = vec_ld(16,src),estimate2 = vec_rsqrte(data2); 
! 		vec_st(vec_madd(data2,vec_madd( vec_nmsub( data2, vec_madd( estimate2, estimate2, zero ), one ), vec_madd( estimate2, oneHalf, zero ), estimate2 ), zero),16,dst);
! 		vector float data3 = vec_ld(32,src),estimate3 = vec_rsqrte(data3); 
! 		vec_st(vec_madd(data3,vec_madd( vec_nmsub( data3, vec_madd( estimate3, estimate3, zero ), one ), vec_madd( estimate3, oneHalf, zero ), estimate3 ), zero),32,dst);
! 		vector float data4 = vec_ld(48,src),estimate4 = vec_rsqrte(data4); 
! 		vec_st(vec_madd(data4,vec_madd( vec_nmsub( data4, vec_madd( estimate4, estimate4, zero ), one ), vec_madd( estimate4, oneHalf, zero ), estimate4 ), zero),48,dst);
  	}
  	return w+4;
  }
  
  t_int *sigrsqrt_perf_ve_gcc(t_int *w)
  {
--- 594,642 ----
  
  	for(; n--; src += 16,dst += 16) {
! 		/* http://developer.apple.com/hardware/ve/algorithms.html
  
! 			Just as in Miller's scalar sigsqrt_perform, 
! 			an rsqrt estimate is calculated first and then refined by one round of Newton-Raphson.
! 			Here, to avoid branching, a mask is generated that zeroes out any resulting NaNs.
! 		*/
! 		
! #ifdef USEVECLIB
! 		/* no zero checking here */
! 		vec_st(vsqrtf(vec_ld( 0,src)), 0,dst); 
! 		vec_st(vsqrtf(vec_ld(16,src)),16,dst); 
! 		vec_st(vsqrtf(vec_ld(32,src)),32,dst); 
! 		vec_st(vsqrtf(vec_ld(48,src)),48,dst); 
! #else
! 		vector float data1 = vec_ld( 0,src);
! 		vector float data2 = vec_ld(16,src);
! 		vector float data3 = vec_ld(32,src);
! 		vector float data4 = vec_ld(48,src);
! 
! 		const vector unsigned char mask1 = vec_nor((vector unsigned char)vec_cmple(data1,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
! 		const vector unsigned char mask2 = vec_nor((vector unsigned char)vec_cmple(data2,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
! 		const vector unsigned char mask3 = vec_nor((vector unsigned char)vec_cmple(data3,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
! 		const vector unsigned char mask4 = vec_nor((vector unsigned char)vec_cmple(data4,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
! 
! 		const vector float estimate1 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data1),mask1); 
! 		const vector float estimate2 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data2),mask2); 
! 		const vector float estimate3 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data3),mask3); 
! 		const vector float estimate4 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data4),mask4); 
! 
! 		/* this can still be improved.... */
! 		data1 = vec_madd(data1,vec_madd( vec_nmsub( data1, vec_madd( estimate1, estimate1, zero ), one ), vec_madd( estimate1, oneHalf, zero ), estimate1 ), zero);
! 		data2 = vec_madd(data2,vec_madd( vec_nmsub( data2, vec_madd( estimate2, estimate2, zero ), one ), vec_madd( estimate2, oneHalf, zero ), estimate2 ), zero);
! 		data3 = vec_madd(data3,vec_madd( vec_nmsub( data3, vec_madd( estimate3, estimate3, zero ), one ), vec_madd( estimate3, oneHalf, zero ), estimate3 ), zero);
! 		data4 = vec_madd(data4,vec_madd( vec_nmsub( data4, vec_madd( estimate4, estimate4, zero ), one ), vec_madd( estimate4, oneHalf, zero ), estimate4 ), zero);
! 		
! 		vec_st(data1, 0,dst);
! 		vec_st(data2,16,dst);
! 		vec_st(data3,32,dst);
! 		vec_st(data4,48,dst);
! #endif
  	}
  	return w+4;
  }
  
+ /* Attention: this differs from sigsqrt_perform, which delivers non-zero output for a zero input... I don't think the latter is intended... */
  t_int *sigrsqrt_perf_ve_gcc(t_int *w)
  {
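
Both square-root kernels share one Newton-Raphson refinement of the vec_rsqrte estimate. With e0 ~ 1/sqrt(d),

    e1 = e0 + 0.5*e0*(1 - d*e0*e0)        and        sqrt(d) ~ d*e1

so sigsqrt_perf_ve_gcc stores d*e1 while sigrsqrt_perf_ve_gcc (below) stores e1 itself. The vec_cmple mask forces the estimate to 0 for inputs <= 0, so the refinement then yields 0 instead of NaN.
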
***************
*** 277,294 ****
  
  	for(; n--; src += 16,dst += 16) {
! 		/* http://developer.apple.com/hardware/ve/algorithms.html */
  
! 		vector float data1 = vec_ld( 0,src),estimate1 = vec_rsqrte(data1); 
! 		vec_st(vec_madd( vec_nmsub( data1, vec_madd( estimate1, estimate1, zero ), one ), vec_madd( estimate1, oneHalf, zero ), estimate1 ), 0,dst);
! 		vector float data2 = vec_ld(16,src),estimate2 = vec_rsqrte(data2); 
! 		vec_st(vec_madd( vec_nmsub( data2, vec_madd( estimate2, estimate2, zero ), one ), vec_madd( estimate2, oneHalf, zero ), estimate2 ),16,dst);
! 		vector float data3 = vec_ld(32,src),estimate3 = vec_rsqrte(data3); 
! 		vec_st(vec_madd( vec_nmsub( data3, vec_madd( estimate3, estimate3, zero ), one ), vec_madd( estimate3, oneHalf, zero ), estimate3 ),32,dst);
! 		vector float data4 = vec_ld(48,src),estimate4 = vec_rsqrte(data4); 
! 		vec_st(vec_madd( vec_nmsub( data4, vec_madd( estimate4, estimate4, zero ), one ), vec_madd( estimate4, oneHalf, zero ), estimate4 ),48,dst);
  	}
  	return w+4;
  }
- #endif
  
  #endif
--- 650,700 ----
  
  	for(; n--; src += 16,dst += 16) {
! 		/* http://developer.apple.com/hardware/ve/algorithms.html
  
! 			Just as in Miller's scalar sigrsqrt_perform, 
! 			an rsqrt estimate is calculated first and then refined by one round of Newton-Raphson.
! 			Here, to avoid branching, a mask is generated that zeroes out any resulting NaNs.
! 		*/
! 
! #ifdef USEVECLIB
! 		/* no zero checking here */
! 		vec_st(vrsqrtf(vec_ld( 0,src)), 0,dst); 
! 		vec_st(vrsqrtf(vec_ld(16,src)),16,dst); 
! 		vec_st(vrsqrtf(vec_ld(32,src)),32,dst); 
! 		vec_st(vrsqrtf(vec_ld(48,src)),48,dst); 
! #else
! 		vector float data1 = vec_ld( 0,src);
! 		vector float data2 = vec_ld(16,src);
! 		vector float data3 = vec_ld(32,src);
! 		vector float data4 = vec_ld(48,src);
! 
! 		const vector unsigned char mask1 = vec_nor((vector unsigned char)vec_cmple(data1,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
! 		const vector unsigned char mask2 = vec_nor((vector unsigned char)vec_cmple(data2,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
! 		const vector unsigned char mask3 = vec_nor((vector unsigned char)vec_cmple(data3,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
! 		const vector unsigned char mask4 = vec_nor((vector unsigned char)vec_cmple(data4,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
! 
! 		const vector float estimate1 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data1),mask1); 
! 		const vector float estimate2 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data2),mask2); 
! 		const vector float estimate3 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data3),mask3); 
! 		const vector float estimate4 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data4),mask4); 
! 		
! 		data1 = vec_nmsub( data1, vec_madd( estimate1, estimate1, zero ), one );
! 		data2 = vec_nmsub( data2, vec_madd( estimate2, estimate2, zero ), one );
! 		data3 = vec_nmsub( data3, vec_madd( estimate3, estimate3, zero ), one );
! 		data4 = vec_nmsub( data4, vec_madd( estimate4, estimate4, zero ), one );
! 
! 		data1 = vec_madd( data1, vec_madd( estimate1, oneHalf, zero ), estimate1 );
! 		data2 = vec_madd( data2, vec_madd( estimate2, oneHalf, zero ), estimate2 );
! 		data3 = vec_madd( data3, vec_madd( estimate3, oneHalf, zero ), estimate3 );
! 		data4 = vec_madd( data4, vec_madd( estimate4, oneHalf, zero ), estimate4 );
! 		
! 		vec_st(data1, 0,dst);
! 		vec_st(data2,16,dst);
! 		vec_st(data3,32,dst);
! 		vec_st(data4,48,dst);
! #endif
  	}
  	return w+4;
  }
  
  #endif
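
The vec_nor(vec_cmp..(data,zero), zero) idiom used in over_perf_ve_gcc (with cmpeq) and in the square-root kernels (with cmple) just inverts the comparison result, giving an all-zero mask for the guarded inputs and all-ones otherwise; ANDing it into the estimate is the branch-free replacement for a scalar "if (f > 0)" test. In scalar terms (sketch):

    #include <string.h>

    /* sketch: zero out the estimate wherever the input is non-positive,
       so the following Newton-Raphson step yields 0.0f instead of NaN */
    static float guard_estimate(float data, float estimate)
    {
        unsigned mask = (data > 0.f) ? 0xFFFFFFFFu : 0u;  /* ~cmple */
        unsigned bits;
        memcpy(&bits, &estimate, 4);
        bits &= mask;
        memcpy(&estimate, &bits, 4);
        return estimate;
    }
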

Index: m_simd_ve_gcc.h
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd_ve_gcc.h,v
retrieving revision 1.1.2.2
retrieving revision 1.1.2.3
diff -C2 -d -r1.1.2.2 -r1.1.2.3
*** m_simd_ve_gcc.h	28 Dec 2003 12:32:03 -0000	1.1.2.2
--- m_simd_ve_gcc.h	29 Dec 2003 02:01:57 -0000	1.1.2.3
***************
*** 19,22 ****
--- 19,23 ----
  t_int *times_perf_ve_gcc(t_int *w);
  t_int *scalartimes_perf_ve_gcc(t_int *w);
+ t_int *sqr_perf_ve_gcc(t_int *w);
  t_int *over_perf_ve_gcc(t_int *w);
  t_int *scalarover_perf_ve_gcc(t_int *w);
***************
*** 47,52 ****
  #define times_perf_simd         times_perf_ve_gcc
  #define scalartimes_perf_simd   scalartimes_perf_ve_gcc
! #define over_perf_simd          over_perf8 /* SIMD not implemented */
! #define scalarover_perf_simd    scalarover_perf8 /* SIMD not implemented */
  #define min_perf_simd           min_perf_ve_gcc
  #define scalarmin_perf_simd     scalarmin_perf_ve_gcc
--- 48,54 ----
  #define times_perf_simd         times_perf_ve_gcc
  #define scalartimes_perf_simd   scalartimes_perf_ve_gcc
! #define sqr_perf_simd           sqr_perf_ve_gcc
! #define over_perf_simd          over_perf_ve_gcc
! #define scalarover_perf_simd    scalarover_perf_ve_gcc
  #define min_perf_simd           min_perf_ve_gcc
  #define scalarmin_perf_simd     scalarmin_perf_ve_gcc
***************
*** 55,62 ****
  
  /* functions in d_math.c */
! #define clip_perf_simd          clip_perform  /* SIMD not implemented */
! #define sigwrap_perf_simd       sigwrap_perform  /* SIMD not implemented */
! #define sigsqrt_perf_simd       sigsqrt_perform /* SIMD not working yet */
! #define sigrsqrt_perf_simd      sigrsqrt_perform /* SIMD not working yet */
  
  #endif /* __M_SIMD_VE_GCC_H */
--- 57,64 ----
  
  /* functions in d_math.c */
! #define clip_perf_simd          clip_perf_ve_gcc
! #define sigwrap_perf_simd       sigwrap_perf_ve_gcc
! #define sigsqrt_perf_simd       sigsqrt_perf_ve_gcc
! #define sigrsqrt_perf_simd      sigrsqrt_perf_ve_gcc
  
  #endif /* __M_SIMD_VE_GCC_H */
