[PD-cvs] pd/src d_ctl.c,1.1.1.3.2.6,1.1.1.3.2.7 d_math.c,1.1.1.1,1.1.1.1.16.1 m_simd.h,1.1.2.3,1.1.2.4 m_simd_def.h,1.1.2.1,1.1.2.2 m_simd_sse_gcc.h,1.1.2.1,1.1.2.2 m_simd_sse_vc.h,1.1.2.1,1.1.2.2 m_simd_ve_gcc.c,1.1.2.1,1.1.2.2 m_simd_ve_gcc.h,1.1.2.1,1.1.2.2

Sun Dec 28 13:32:05 CET 2003

Update of /cvsroot/pure-data/pd/src
In directory sc8-pr-cvs1:/tmp/cvs-serv10322

Modified Files:
      Tag: devel_0_37
	d_ctl.c d_math.c m_simd.h m_simd_def.h m_simd_sse_gcc.h 
	m_simd_sse_vc.h m_simd_ve_gcc.c m_simd_ve_gcc.h 
Log Message:
SIMD instructions for PPC (enable by adding -faltivec to the compiler options)


Index: d_ctl.c
===================================================================
RCS file: /cvsroot/pure-data/pd/src/d_ctl.c,v
retrieving revision 1.1.1.3.2.6
retrieving revision 1.1.1.3.2.7
diff -C2 -d -r1.1.1.3.2.6 -r1.1.1.3.2.7
*** d_ctl.c	23 Dec 2003 01:15:39 -0000	1.1.1.3.2.6
--- d_ctl.c	28 Dec 2003 12:32:03 -0000	1.1.1.3.2.7
***************
*** 69,73 ****
  static void sig_tilde_dsp(t_sig *x, t_signal **sp)
  {
!     dsp_add(sig_tilde_perform, 3, &x->x_f, sp[0]->s_vec, sp[0]->s_n);
  }
  
--- 69,75 ----
  static void sig_tilde_dsp(t_sig *x, t_signal **sp)
  {
! /*   dsp_add(sig_tilde_perform, 3, &x->x_f, sp[0]->s_vec, sp[0]->s_n); */
! 	/* T.Grill - register through dsp_add_scalarcopy instead, for the chance of getting an unrolled perform routine */
! 	dsp_add_scalarcopy(&x->x_f, sp[0]->s_vec, sp[0]->s_n);
  }
  

Index: d_math.c
===================================================================
RCS file: /cvsroot/pure-data/pd/src/d_math.c,v
retrieving revision 1.1.1.1
retrieving revision 1.1.1.1.16.1
diff -C2 -d -r1.1.1.1 -r1.1.1.1.16.1
*** d_math.c	29 Jul 2002 17:05:57 -0000	1.1.1.1
--- d_math.c	28 Dec 2003 12:32:03 -0000	1.1.1.1.16.1
***************
*** 11,14 ****
--- 11,17 ----
  #define LOGTEN 2.302585092994
  
+ /* T.Grill - use SIMD functionality */
+ #include "m_simd.h"
+ 
  /* ------------------------- clip~ -------------------------- */
  static t_class *clip_class;
***************
*** 52,56 ****
  static void clip_dsp(t_clip *x, t_signal **sp)
  {
!     dsp_add(clip_perform, 4, x, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
  }
  
--- 55,62 ----
  static void clip_dsp(t_clip *x, t_signal **sp)
  {
! 	if(SIMD_CHECK2(sp[0]->s_n,sp[0]->s_vec,sp[1]->s_vec)) /* SIMD-capable case: register the SIMD entry point */
! 		dsp_add(clip_perf_simd, 4, x, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n); /* clip_perf_simd currently aliases clip_perform, which expects (x, in, out, n) - so x must be passed here as well */
! 	else /* otherwise fall back to the plain scalar perform routine */
! 		dsp_add(clip_perform, 4, x, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
  }
  
***************
*** 151,155 ****
  static void sigrsqrt_dsp(t_sigrsqrt *x, t_signal **sp)
  {
!     dsp_add(sigrsqrt_perform, 3, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
  }
  
--- 157,164 ----
  static void sigrsqrt_dsp(t_sigrsqrt *x, t_signal **sp)
  {
! 	if(SIMD_CHECK2(sp[0]->s_n,sp[0]->s_vec,sp[1]->s_vec))
!     	dsp_add(sigrsqrt_perf_simd, 3, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
!     else
! 	    dsp_add(sigrsqrt_perform, 3, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
  }
  
***************
*** 205,209 ****
  static void sigsqrt_dsp(t_sigsqrt *x, t_signal **sp)
  {
!     dsp_add(sigsqrt_perform, 3, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
  }
  
--- 214,221 ----
  static void sigsqrt_dsp(t_sigsqrt *x, t_signal **sp)
  {
! 	if(SIMD_CHECK2(sp[0]->s_n,sp[0]->s_vec,sp[1]->s_vec))
!     	dsp_add(sigsqrt_perf_simd, 3, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
!     else
!     	dsp_add(sigsqrt_perform, 3, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
  }
  
***************
*** 251,255 ****
  static void sigwrap_dsp(t_sigwrap *x, t_signal **sp)
  {
!     dsp_add(sigwrap_perform, 3, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
  }
  
--- 263,270 ----
  static void sigwrap_dsp(t_sigwrap *x, t_signal **sp)
  {
! 	if(SIMD_CHECK2(sp[0]->s_n,sp[0]->s_vec,sp[1]->s_vec))
!     	dsp_add(sigwrap_perf_simd, 3, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
!     else
! 	    dsp_add(sigwrap_perform, 3, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
  }
  

Index: m_simd.h
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd.h,v
retrieving revision 1.1.2.3
retrieving revision 1.1.2.4
diff -C2 -d -r1.1.2.3 -r1.1.2.4
*** m_simd.h	23 Dec 2003 02:50:53 -0000	1.1.2.3
--- m_simd.h	28 Dec 2003 12:32:03 -0000	1.1.2.4
***************
*** 35,43 ****
          #include "m_simd_sse_gcc.h"
  
!     #elif defined(__GNUC__) && defined(__POWERPC__)
!         /* Altivec with GNU C */
          #define SIMD_BYTEALIGN (128/8)   /* align to 128 bits */
  
!         #include "m_simd_ve_gcc.h"      
  
      #else
--- 35,43 ----
          #include "m_simd_sse_gcc.h"
  
!     #elif defined(__GNUC__) && defined(__POWERPC__) && defined(__ALTIVEC__)
!         /* Altivec with GNU C  ( -faltivec must be given as a compiler option! ) */
          #define SIMD_BYTEALIGN (128/8)   /* align to 128 bits */
  
!         #include "m_simd_ve_gcc.h"  
  
      #else

Index: m_simd_def.h
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd_def.h,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -C2 -d -r1.1.2.1 -r1.1.2.2
*** m_simd_def.h	23 Dec 2003 01:15:39 -0000	1.1.2.1
--- m_simd_def.h	28 Dec 2003 12:32:03 -0000	1.1.2.2
***************
*** 35,37 ****
--- 35,43 ----
  #define scalarmax_perf_simd     scalarmax_perf8
  
+ /* functions in d_math.c */
+ #define clip_perf_simd          clip_perform  /* SIMD not implemented */
+ #define sigwrap_perf_simd       sigwrap_perform  /* SIMD not implemented */
+ #define sigsqrt_perf_simd       sigsqrt_perform  /* SIMD not implemented */
+ #define sigrsqrt_perf_simd      sigrsqrt_perform /* SIMD not implemented */
+ 
  #endif /* __M_SIMD_DEF_H */

Index: m_simd_sse_gcc.h
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd_sse_gcc.h,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -C2 -d -r1.1.2.1 -r1.1.2.2
*** m_simd_sse_gcc.h	23 Dec 2003 01:15:39 -0000	1.1.2.1
--- m_simd_sse_gcc.h	28 Dec 2003 12:32:03 -0000	1.1.2.2
***************
*** 25,28 ****
--- 25,32 ----
  t_int *min_perf_sse_gcc(t_int *w);
  t_int *scalarmin_perf_sse_gcc(t_int *w);
+ t_int *clip_perf_sse_gcc(t_int *w);
+ t_int *sigwrap_perf_sse_gcc(t_int *w);
+ t_int *sigsqrt_perf_sse_gcc(t_int *w);
+ t_int *sigrsqrt_perf_sse_gcc(t_int *w);
  
  
***************
*** 49,52 ****
--- 53,62 ----
  #define max_perf_simd           max_perf8 /* SIMD not implemented */
  #define scalarmax_perf_simd     scalarmax_perf8 /* SIMD not implemented */
+ 
+ /* functions in d_math.c */
+ #define clip_perf_simd          clip_perform  /* SIMD not implemented */
+ #define sigwrap_perf_simd       sigwrap_perform  /* SIMD not implemented */
+ #define sigsqrt_perf_simd       sigsqrt_perform  /* SIMD not implemented */
+ #define sigrsqrt_perf_simd      sigrsqrt_perform /* SIMD not implemented */
  
  #endif /* __M_SIMD_SSE_GCC_H */

Index: m_simd_sse_vc.h
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd_sse_vc.h,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -C2 -d -r1.1.2.1 -r1.1.2.2
*** m_simd_sse_vc.h	23 Dec 2003 01:15:39 -0000	1.1.2.1
--- m_simd_sse_vc.h	28 Dec 2003 12:32:03 -0000	1.1.2.2
***************
*** 25,28 ****
--- 25,32 ----
  t_int *min_perf_sse_vc(t_int *w);
  t_int *scalarmin_perf_sse_vc(t_int *w);
+ t_int *clip_perf_sse_vc(t_int *w);
+ t_int *sigwrap_perf_sse_vc(t_int *w);
+ t_int *sigsqrt_perf_sse_vc(t_int *w);
+ t_int *sigrsqrt_perf_sse_vc(t_int *w);
  
  
***************
*** 49,52 ****
--- 53,62 ----
  #define max_perf_simd           max_perf_sse_vc
  #define scalarmax_perf_simd     scalarmax_perf_sse_vc
+ 
+ /* functions in d_math.c */
+ #define clip_perf_simd          clip_perform  /* SIMD not implemented */
+ #define sigwrap_perf_simd       sigwrap_perform  /* SIMD not implemented */
+ #define sigsqrt_perf_simd       sigsqrt_perform  /* SIMD not implemented */
+ #define sigrsqrt_perf_simd      sigrsqrt_perform /* SIMD not implemented */
  
  #endif /* __M_SIMD_SSE_VC_H */

Index: m_simd_ve_gcc.c
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd_ve_gcc.c,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -C2 -d -r1.1.2.1 -r1.1.2.2
*** m_simd_ve_gcc.c	23 Dec 2003 01:15:39 -0000	1.1.2.1
--- m_simd_ve_gcc.c	28 Dec 2003 12:32:03 -0000	1.1.2.2
***************
*** 7,12 ****
  #include "m_simd.h"
  
! #if defined(__GNUC__) && defined(__POWERPC__)
  
  
  #endif
--- 7,294 ----
  #include "m_simd.h"
  
! #if defined(__GNUC__) && defined(__POWERPC__) && defined(__ALTIVEC__)
! 
! 
! /* functions for unaligned vector data - taken from http://developer.apple.com/hardware/ve/alignment.html */
! 
! /* T.Grill - vector-wide unaligned load via vec_perm; this first version _should_ work but doesn't, so it is disabled (NOTE(review): it yields v[0..3] instead of v[0] replicated, which the function below produces - possibly the reason; confirm) */
! #if 0
! #define LoadUnaligned(v) (vec_perm( vec_ld( 0, (const vector float *)(v) ), vec_ld( 16, (const vector float *)(v) ), vec_lvsl( 0, (float *) (v) ) ))
! #else
! /* instead take the slower second one: copy the scalar *v into a local union, load that as a vector and splat element 0 into all four lanes */
! static vector float LoadUnaligned(const float *v)
! {
! 	union tmpstruct { float f[4]; vector float vec; } tmp;
! 	tmp.f[0] = *(float *)v;
! 	return vec_splat(vec_ld(0,&tmp.vec),0);
! }
! #endif
! 
! 
! #define IsVectorAligned(where) ((((unsigned long)(where)) & (sizeof(vector float)-1)) == 0) /* nonzero iff 'where' is a multiple of sizeof(vector float); the & must be parenthesized because == binds tighter than & */
! /*
! #define LoadValue(where) (IsVectorAligned((void *)(where))?vec_splat(vec_ld(0,(vector float *)(where)),0):LoadUnaligned((vector float *)(where))) 
! */
! /* always assume unaligned */
! #define LoadValue(where) LoadUnaligned((const float *)(where))
! 
! t_int *zero_perf_ve_gcc(t_int *w)
! {
! 	const vector float zero = (vector float)(0);
!     t_float *dst = (t_float *)w[1];
!     int n = w[2]>>4;
! 
! 	for(; n--; dst += 16) {
! 		vec_st(zero, 0,dst);
! 		vec_st(zero,16,dst);
! 		vec_st(zero,32,dst);
! 		vec_st(zero,48,dst);
! 	}
!     return w+3;
! }
! 
! t_int *copy_perf_ve_gcc(t_int *w)
! {
!     const t_float *src = (const t_float *)w[1];
!     t_float *dst = (t_float *)w[2];
!     int n = w[3]>>4;
! 
! 	for(; n--; src += 16,dst += 16) {
! 		vec_st(vec_ld( 0,src), 0,dst);
! 		vec_st(vec_ld(16,src),16,dst);
! 		vec_st(vec_ld(32,src),32,dst);
! 		vec_st(vec_ld(48,src),48,dst);
! 	}
! 	return w+4;
! }
! 
! t_int *sig_tilde_perf_ve_gcc(t_int *w)
! {
! 	const vector float arg = LoadValue(w[1]);
!     t_float *dst = (t_float *)w[2];
!     int n = w[3]>>4;
! 
! 	for(; n--; dst += 16) {
! 		vec_st(arg, 0,dst);
! 		vec_st(arg,16,dst);
! 		vec_st(arg,32,dst);
! 		vec_st(arg,48,dst);
! 	}
!     return w+4;
! }
! 
! t_int *plus_perf_ve_gcc(t_int *w)
! {
!     const t_float *src1 = (const t_float *)w[1];
!     const t_float *src2 = (const t_float *)w[2];
!     t_float *dst = (t_float *)w[3];
!     int n = w[4]>>4;
!    
! 	for(; n--; src1 += 16,src2 += 16,dst += 16) {
! 		vec_st(vec_add(vec_ld( 0,src1),vec_ld( 0,src2)), 0,dst);
! 		vec_st(vec_add(vec_ld(16,src1),vec_ld(16,src2)),16,dst);
! 		vec_st(vec_add(vec_ld(32,src1),vec_ld(32,src2)),32,dst);
! 		vec_st(vec_add(vec_ld(48,src1),vec_ld(48,src2)),48,dst);
! 	}
! 	return w+5;
! }
! 
! t_int *scalarplus_perf_ve_gcc(t_int *w)
! {
!     const t_float *src = (const t_float *)w[1];
! 	const vector float arg = LoadValue(w[2]);
!     t_float *dst = (t_float *)w[3];
!     int n = w[4]>>4;
!    
! 	for(; n--; src += 16,dst += 16) {
! 		vec_st(vec_add(vec_ld( 0,src),arg), 0,dst);
! 		vec_st(vec_add(vec_ld(16,src),arg),16,dst);
! 		vec_st(vec_add(vec_ld(32,src),arg),32,dst);
! 		vec_st(vec_add(vec_ld(48,src),arg),48,dst);
! 	}
! 	return w+5;
! }
! 
! t_int *minus_perf_ve_gcc(t_int *w)
! {
!     const t_float *src1 = (const t_float *)w[1];
!     const t_float *src2 = (const t_float *)w[2];
!     t_float *dst = (t_float *)w[3];
!     int n = w[4]>>4;
!    
! 	for(; n--; src1 += 16,src2 += 16,dst += 16) {
! 		vec_st(vec_sub(vec_ld( 0,src1),vec_ld( 0,src2)), 0,dst);
! 		vec_st(vec_sub(vec_ld(16,src1),vec_ld(16,src2)),16,dst);
! 		vec_st(vec_sub(vec_ld(32,src1),vec_ld(32,src2)),32,dst);
! 		vec_st(vec_sub(vec_ld(48,src1),vec_ld(48,src2)),48,dst);
! 	}
! 	return w+5;
! }
! 
! t_int *scalarminus_perf_ve_gcc(t_int *w)
! {
!     const t_float *src = (const t_float *)w[1];
! 	const vector float arg = LoadValue(w[2]);
!     t_float *dst = (t_float *)w[3];
!     int n = w[4]>>4;
!    
! 	for(; n--; src += 16,dst += 16) {
! 		vec_st(vec_sub(vec_ld( 0,src),arg), 0,dst);
! 		vec_st(vec_sub(vec_ld(16,src),arg),16,dst);
! 		vec_st(vec_sub(vec_ld(32,src),arg),32,dst);
! 		vec_st(vec_sub(vec_ld(48,src),arg),48,dst);
! 	}
! 	return w+5;
! }
! 
! t_int *times_perf_ve_gcc(t_int *w)
! {
!     const t_float *src1 = (const t_float *)w[1];
!     const t_float *src2 = (const t_float *)w[2];
!     t_float *dst = (t_float *)w[3];
!     const vector float zero = (vector float)(0);
!     int n = w[4]>>4;
!    
! 	for(; n--; src1 += 16,src2 += 16,dst += 16) {
! 		vec_st(vec_madd(vec_ld( 0,src1),vec_ld( 0,src2),zero), 0,dst);
! 		vec_st(vec_madd(vec_ld(16,src1),vec_ld(16,src2),zero),16,dst);
! 		vec_st(vec_madd(vec_ld(32,src1),vec_ld(32,src2),zero),32,dst);
! 		vec_st(vec_madd(vec_ld(48,src1),vec_ld(48,src2),zero),48,dst);
! 	}
! 	return w+5;
! }
! 
! t_int *scalartimes_perf_ve_gcc(t_int *w)
! {
!     const t_float *src = (const t_float *)w[1];
! 	const vector float arg = LoadValue(w[2]);
!     t_float *dst = (t_float *)w[3];
!     const vector float zero = (vector float)(0);
!     int n = w[4]>>4;
!    
! 	for(; n--; src += 16,dst += 16) {
! 		vec_st(vec_madd(vec_ld( 0,src),arg,zero), 0,dst);
! 		vec_st(vec_madd(vec_ld(16,src),arg,zero),16,dst);
! 		vec_st(vec_madd(vec_ld(32,src),arg,zero),32,dst);
! 		vec_st(vec_madd(vec_ld(48,src),arg,zero),48,dst);
! 	}
! 	return w+5;
! }
! 
! t_int *min_perf_ve_gcc(t_int *w)
! {
!     const t_float *src1 = (const t_float *)w[1];
!     const t_float *src2 = (const t_float *)w[2];
!     t_float *dst = (t_float *)w[3];
!     int n = w[4]>>4;
!    
! 	for(; n--; src1 += 16,src2 += 16,dst += 16) {
! 		vec_st(vec_min(vec_ld( 0,src1),vec_ld( 0,src2)), 0,dst);
! 		vec_st(vec_min(vec_ld(16,src1),vec_ld(16,src2)),16,dst);
! 		vec_st(vec_min(vec_ld(32,src1),vec_ld(32,src2)),32,dst);
! 		vec_st(vec_min(vec_ld(48,src1),vec_ld(48,src2)),48,dst);
! 	}
! 	return w+5;
! }
! 
! t_int *scalarmin_perf_ve_gcc(t_int *w)
! {
!     const t_float *src = (const t_float *)w[1];
! 	const vector float arg = LoadValue(w[2]);
!     t_float *dst = (t_float *)w[3];
!     int n = w[4]>>4;
!    
! 	for(; n--; src += 16,dst += 16) {
! 		vec_st(vec_min(vec_ld( 0,src),arg), 0,dst);
! 		vec_st(vec_min(vec_ld(16,src),arg),16,dst);
! 		vec_st(vec_min(vec_ld(32,src),arg),32,dst);
! 		vec_st(vec_min(vec_ld(48,src),arg),48,dst);
! 	}
! 	return w+5;
! }
! 
! t_int *max_perf_ve_gcc(t_int *w)
! {
!     const t_float *src1 = (const t_float *)w[1];
!     const t_float *src2 = (const t_float *)w[2];
!     t_float *dst = (t_float *)w[3];
!     int n = w[4]>>4;
!    
! 	for(; n--; src1 += 16,src2 += 16,dst += 16) {
! 		vec_st(vec_max(vec_ld( 0,src1),vec_ld( 0,src2)), 0,dst);
! 		vec_st(vec_max(vec_ld(16,src1),vec_ld(16,src2)),16,dst);
! 		vec_st(vec_max(vec_ld(32,src1),vec_ld(32,src2)),32,dst);
! 		vec_st(vec_max(vec_ld(48,src1),vec_ld(48,src2)),48,dst);
! 	}
! 	return w+5;
! }
! 
! t_int *scalarmax_perf_ve_gcc(t_int *w)
! {
!     const t_float *src = (const t_float *)w[1];
! 	const vector float arg = LoadValue(w[2]);
!     t_float *dst = (t_float *)w[3];
!     int n = w[4]>>4;
!    
! 	for(; n--; src += 16,dst += 16) {
! 		vec_st(vec_max(vec_ld( 0,src),arg), 0,dst);
! 		vec_st(vec_max(vec_ld(16,src),arg),16,dst);
! 		vec_st(vec_max(vec_ld(32,src),arg),32,dst);
! 		vec_st(vec_max(vec_ld(48,src),arg),48,dst);
! 	}
! 	return w+5;
! }
! 
! #if 0 /* doesn't work */
! t_int *sigsqrt_perf_ve_gcc(t_int *w)
! {
!     const t_float *src = (const t_float *)w[1];
!     t_float *dst = (t_float *)w[2];
!     int n = w[3]>>4;
! 	
! 	const vector float zero = (vector float)(0);
! 	const vector float oneHalf = (vector float)(0.5);
! 	const vector float one = (vector float)(1.0);
  
+ 	for(; n--; src += 16,dst += 16) {
+ 		/* http://developer.apple.com/hardware/ve/algorithms.html*/
+ 
+ 		vector float data1 = vec_ld( 0,src),estimate1 = vec_rsqrte(data1); 
+ 		vec_st(vec_madd(data1,vec_madd( vec_nmsub( data1, vec_madd( estimate1, estimate1, zero ), one ), vec_madd( estimate1, oneHalf, zero ), estimate1 ), zero), 0,dst);
+ 		vector float data2 = vec_ld(16,src),estimate2 = vec_rsqrte(data2); 
+ 		vec_st(vec_madd(data2,vec_madd( vec_nmsub( data2, vec_madd( estimate2, estimate2, zero ), one ), vec_madd( estimate2, oneHalf, zero ), estimate2 ), zero),16,dst);
+ 		vector float data3 = vec_ld(32,src),estimate3 = vec_rsqrte(data3); 
+ 		vec_st(vec_madd(data3,vec_madd( vec_nmsub( data3, vec_madd( estimate3, estimate3, zero ), one ), vec_madd( estimate3, oneHalf, zero ), estimate3 ), zero),32,dst);
+ 		vector float data4 = vec_ld(48,src),estimate4 = vec_rsqrte(data4); 
+ 		vec_st(vec_madd(data4,vec_madd( vec_nmsub( data4, vec_madd( estimate4, estimate4, zero ), one ), vec_madd( estimate4, oneHalf, zero ), estimate4 ), zero),48,dst);
+ 	}
+ 	return w+4;
+ }
+ 
+ t_int *sigrsqrt_perf_ve_gcc(t_int *w)
+ {
+     const t_float *src = (const t_float *)w[1];
+     t_float *dst = (t_float *)w[2];
+     int n = w[3]>>4;
+ 	
+ 	const vector float zero = (vector float)(0);
+ 	const vector float oneHalf = (vector float)(0.5);
+ 	const vector float one = (vector float)(1.0);
+ 
+ 	for(; n--; src += 16,dst += 16) {
+ 		/* http://developer.apple.com/hardware/ve/algorithms.html */
+ 
+ 		vector float data1 = vec_ld( 0,src),estimate1 = vec_rsqrte(data1); 
+ 		vec_st(vec_madd( vec_nmsub( data1, vec_madd( estimate1, estimate1, zero ), one ), vec_madd( estimate1, oneHalf, zero ), estimate1 ), 0,dst);
+ 		vector float data2 = vec_ld(16,src),estimate2 = vec_rsqrte(data2); 
+ 		vec_st(vec_madd( vec_nmsub( data2, vec_madd( estimate2, estimate2, zero ), one ), vec_madd( estimate2, oneHalf, zero ), estimate2 ),16,dst);
+ 		vector float data3 = vec_ld(32,src),estimate3 = vec_rsqrte(data3); 
+ 		vec_st(vec_madd( vec_nmsub( data3, vec_madd( estimate3, estimate3, zero ), one ), vec_madd( estimate3, oneHalf, zero ), estimate3 ),32,dst);
+ 		vector float data4 = vec_ld(48,src),estimate4 = vec_rsqrte(data4); 
+ 		vec_st(vec_madd( vec_nmsub( data4, vec_madd( estimate4, estimate4, zero ), one ), vec_madd( estimate4, oneHalf, zero ), estimate4 ),48,dst);
+ 	}
+ 	return w+4;
+ }
+ #endif
  
  #endif

Index: m_simd_ve_gcc.h
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd_ve_gcc.h,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -C2 -d -r1.1.2.1 -r1.1.2.2
*** m_simd_ve_gcc.h	23 Dec 2003 01:15:39 -0000	1.1.2.1
--- m_simd_ve_gcc.h	28 Dec 2003 12:32:03 -0000	1.1.2.2
***************
*** 25,52 ****
  t_int *min_perf_ve_gcc(t_int *w);
  t_int *scalarmin_perf_ve_gcc(t_int *w);
  
  
  /* functions in d_ugen.c */
! #define zero_perf_simd          zero_perf8 /* SIMD not implemented */
  
  /* functions in d_dac.c */
! #define copy_perf_simd          copy_perf8 /* SIMD not implemented */
  
  /* functions in d_ctl.c */
! #define sig_tilde_perf_simd     sig_tilde_perf8 /* SIMD not implemented */
  
  /* functions in d_arithmetic.c */
! #define plus_perf_simd          plus_perf8 /* SIMD not implemented */
! #define scalarplus_perf_simd    scalarplus_perf8 /* SIMD not implemented */
! #define minus_perf_simd         minus_perf8 /* SIMD not implemented */
! #define scalarminus_perf_simd   scalarminus_perf8 /* SIMD not implemented */
! #define times_perf_simd         times_perf8 /* SIMD not implemented */
! #define scalartimes_perf_simd   scalartimes_perf8 /* SIMD not implemented */
  #define over_perf_simd          over_perf8 /* SIMD not implemented */
  #define scalarover_perf_simd    scalarover_perf8 /* SIMD not implemented */
! #define min_perf_simd           min_perf8 /* SIMD not implemented */
! #define scalarmin_perf_simd     scalarmin_perf8 /* SIMD not implemented */
! #define max_perf_simd           max_perf8 /* SIMD not implemented */
! #define scalarmax_perf_simd     scalarmax_perf8 /* SIMD not implemented */
  
  #endif /* __M_SIMD_VE_GCC_H */
--- 25,62 ----
  t_int *min_perf_ve_gcc(t_int *w);
  t_int *scalarmin_perf_ve_gcc(t_int *w);
+ t_int *clip_perf_ve_gcc(t_int *w);
+ t_int *sigwrap_perf_ve_gcc(t_int *w);
+ t_int *sigsqrt_perf_ve_gcc(t_int *w);
+ t_int *sigrsqrt_perf_ve_gcc(t_int *w);
  
  
  /* functions in d_ugen.c */
! #define zero_perf_simd          zero_perf_ve_gcc
  
  /* functions in d_dac.c */
! #define copy_perf_simd          copy_perf_ve_gcc
  
  /* functions in d_ctl.c */
! #define sig_tilde_perf_simd     sig_tilde_perf_ve_gcc
  
  /* functions in d_arithmetic.c */
! #define plus_perf_simd          plus_perf_ve_gcc
! #define scalarplus_perf_simd    scalarplus_perf_ve_gcc
! #define minus_perf_simd         minus_perf_ve_gcc
! #define scalarminus_perf_simd   scalarminus_perf_ve_gcc
! #define times_perf_simd         times_perf_ve_gcc
! #define scalartimes_perf_simd   scalartimes_perf_ve_gcc
  #define over_perf_simd          over_perf8 /* SIMD not implemented */
  #define scalarover_perf_simd    scalarover_perf8 /* SIMD not implemented */
! #define min_perf_simd           min_perf_ve_gcc
! #define scalarmin_perf_simd     scalarmin_perf_ve_gcc
! #define max_perf_simd           max_perf_ve_gcc
! #define scalarmax_perf_simd     scalarmax_perf_ve_gcc
! 
! /* functions in d_math.c */
! #define clip_perf_simd          clip_perform  /* SIMD not implemented */
! #define sigwrap_perf_simd       sigwrap_perform  /* SIMD not implemented */
! #define sigsqrt_perf_simd       sigsqrt_perform /* SIMD not working yet */
! #define sigrsqrt_perf_simd      sigrsqrt_perform /* SIMD not working yet */
  
  #endif /* __M_SIMD_VE_GCC_H */