[PD-cvs] pd/src d_arithmetic.c,1.1.1.1.16.4,1.1.1.1.16.5 d_math.c,1.1.1.1.16.2,1.1.1.1.16.3 m_simd_def.h,1.1.2.2,1.1.2.3 m_simd_sse_gcc.h,1.1.2.2,1.1.2.3 m_simd_sse_vc.h,1.1.2.2,1.1.2.3 m_simd_ve_gcc.c,1.1.2.2,1.1.2.3 m_simd_ve_gcc.h,1.1.2.2,1.1.2.3
xovo at users.sourceforge.net
xovo at users.sourceforge.net
Mon Dec 29 03:02:00 CET 2003
- Previous message: [PD-cvs] pd/src d_math.c,1.1.1.1.16.1,1.1.1.1.16.2
- Next message: [PD-cvs] externals/grill/py config-pd-darwin.txt,1.3,1.4 config-pd-linux.txt,1.7,1.8 makefile.pd-darwin,1.3,1.4 readme.txt,1.10,1.11
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
Update of /cvsroot/pure-data/pd/src
In directory sc8-pr-cvs1:/tmp/cvs-serv19981
Modified Files:
Tag: devel_0_37
d_arithmetic.c d_math.c m_simd_def.h m_simd_sse_gcc.h
m_simd_sse_vc.h m_simd_ve_gcc.c m_simd_ve_gcc.h
Log Message:
more PPC SIMD improvements
Index: d_arithmetic.c
===================================================================
RCS file: /cvsroot/pure-data/pd/src/d_arithmetic.c,v
retrieving revision 1.1.1.1.16.4
retrieving revision 1.1.1.1.16.5
diff -C2 -d -r1.1.1.1.16.4 -r1.1.1.1.16.5
*** d_arithmetic.c 23 Dec 2003 01:15:39 -0000 1.1.1.1.16.4
--- d_arithmetic.c 29 Dec 2003 02:01:57 -0000 1.1.1.1.16.5
***************
*** 355,358 ****
--- 355,376 ----
}
+ /* T.Grill - squaring: optimized * for equal input signals */
+ static t_int *sqr_perf8(t_int *w)
+ {
+ t_float *in = (t_float *)(w[1]);
+ t_float *out = (t_float *)(w[2]);
+ int n = (int)(w[3]);
+
+ for (; n; n -= 8, in += 8, out += 8)
+ {
+ float f0 = in[0], f1 = in[1], f2 = in[2], f3 = in[3];
+ float f4 = in[4], f5 = in[5], f6 = in[6], f7 = in[7];
+
+ out[0] = f0 * f0; out[1] = f1 * f1; out[2] = f2 * f2; out[3] = f3 * f3;
+ out[4] = f4 * f4; out[5] = f5 * f5; out[6] = f6 * f6; out[7] = f7 * f7;
+ }
+ return (w+4);
+ }
+
t_int *scalartimes_perform(t_int *w)
{
***************
*** 383,386 ****
--- 401,405 ----
}
+ /* T.Grill - added optimization for equal input signals */
static void times_dsp(t_times *x, t_signal **sp)
{
***************
*** 388,395 ****
if (n&7)
dsp_add(times_perform, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
! else if(SIMD_CHECK3(n,sp[0]->s_vec,sp[1]->s_vec,sp[2]->s_vec))
! dsp_add(times_perf_simd, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
! else
! dsp_add(times_perf8, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
}
--- 407,423 ----
if (n&7)
dsp_add(times_perform, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
! else
! if(sp[0]->s_vec == sp[1]->s_vec) {
! if(SIMD_CHECK2(n,sp[0]->s_vec,sp[2]->s_vec))
! dsp_add(sqr_perf_simd, 3, sp[0]->s_vec, sp[2]->s_vec, n);
! else
! dsp_add(sqr_perf8, 3, sp[0]->s_vec, sp[2]->s_vec, n);
! }
! else {
! if(SIMD_CHECK3(n,sp[0]->s_vec,sp[1]->s_vec,sp[2]->s_vec))
! dsp_add(times_perf_simd, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
! else
! dsp_add(times_perf8, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
! }
}
***************
*** 498,505 ****
}
t_int *scalarover_perform(t_int *w)
{
t_float *in = (t_float *)(w[1]);
! t_float f = 1. / *(t_float *)(w[2]);
t_float *out = (t_float *)(w[3]);
int n = (int)(w[4]);
--- 526,535 ----
}
+ /* T.Grill - added check for zero */
t_int *scalarover_perform(t_int *w)
{
t_float *in = (t_float *)(w[1]);
! t_float f = *(t_float *)(w[2]);
! if(f) f = 1./f;
t_float *out = (t_float *)(w[3]);
int n = (int)(w[4]);
Index: d_math.c
===================================================================
RCS file: /cvsroot/pure-data/pd/src/d_math.c,v
retrieving revision 1.1.1.1.16.2
retrieving revision 1.1.1.1.16.3
diff -C2 -d -r1.1.1.1.16.2 -r1.1.1.1.16.3
*** d_math.c 28 Dec 2003 14:22:10 -0000 1.1.1.1.16.2
--- d_math.c 29 Dec 2003 02:01:57 -0000 1.1.1.1.16.3
***************
*** 48,52 ****
float f = *in++;
if (f < lo) f = lo;
! if (f > hi) f = hi;
*out++ = f;
}
--- 48,52 ----
float f = *in++;
if (f < lo) f = lo;
! else if (f > hi) f = hi;
*out++ = f;
}
Index: m_simd_def.h
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd_def.h,v
retrieving revision 1.1.2.2
retrieving revision 1.1.2.3
diff -C2 -d -r1.1.2.2 -r1.1.2.3
*** m_simd_def.h 28 Dec 2003 12:32:03 -0000 1.1.2.2
--- m_simd_def.h 29 Dec 2003 02:01:57 -0000 1.1.2.3
***************
*** 28,31 ****
--- 28,32 ----
#define times_perf_simd times_perf8
#define scalartimes_perf_simd scalartimes_perf8
+ #define sqr_perf_simd sqr_perf8
#define over_perf_simd over_perf8
#define scalarover_perf_simd scalarover_perf8
Index: m_simd_sse_gcc.h
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd_sse_gcc.h,v
retrieving revision 1.1.2.2
retrieving revision 1.1.2.3
diff -C2 -d -r1.1.2.2 -r1.1.2.3
*** m_simd_sse_gcc.h 28 Dec 2003 12:32:03 -0000 1.1.2.2
--- m_simd_sse_gcc.h 29 Dec 2003 02:01:57 -0000 1.1.2.3
***************
*** 19,22 ****
--- 19,23 ----
t_int *times_perf_sse_gcc(t_int *w);
t_int *scalartimes_perf_sse_gcc(t_int *w);
+ t_int *sqr_perf_sse_gcc(t_int *w);
t_int *over_perf_sse_gcc(t_int *w);
t_int *scalarover_perf_sse_gcc(t_int *w);
***************
*** 47,50 ****
--- 48,52 ----
#define times_perf_simd times_perf8 /* SIMD not implemented */
#define scalartimes_perf_simd scalartimes_perf8 /* SIMD not implemented */
+ #define sqr_perf_simd sqr_perf8 /* SIMD not implemented */
#define over_perf_simd over_perf8 /* SIMD not implemented */
#define scalarover_perf_simd scalarover_perf8 /* SIMD not implemented */
Index: m_simd_sse_vc.h
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd_sse_vc.h,v
retrieving revision 1.1.2.2
retrieving revision 1.1.2.3
diff -C2 -d -r1.1.2.2 -r1.1.2.3
*** m_simd_sse_vc.h 28 Dec 2003 12:32:03 -0000 1.1.2.2
--- m_simd_sse_vc.h 29 Dec 2003 02:01:57 -0000 1.1.2.3
***************
*** 19,22 ****
--- 19,23 ----
t_int *times_perf_sse_vc(t_int *w);
t_int *scalartimes_perf_sse_vc(t_int *w);
+ t_int *sqr_perf_sse_vc(t_int *w);
t_int *over_perf_sse_vc(t_int *w);
t_int *scalarover_perf_sse_vc(t_int *w);
***************
*** 47,50 ****
--- 48,52 ----
#define times_perf_simd times_perf_sse_vc
#define scalartimes_perf_simd scalartimes_perf_sse_vc
+ #define sqr_perf_simd sqr_perf8 /* SIMD not implemented */
#define over_perf_simd over_perf8 /* SIMD not implemented */
#define scalarover_perf_simd scalarover_perf_sse_vc
Index: m_simd_ve_gcc.c
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd_ve_gcc.c,v
retrieving revision 1.1.2.2
retrieving revision 1.1.2.3
diff -C2 -d -r1.1.2.2 -r1.1.2.3
*** m_simd_ve_gcc.c 28 Dec 2003 12:32:03 -0000 1.1.2.2
--- m_simd_ve_gcc.c 29 Dec 2003 02:01:57 -0000 1.1.2.3
***************
*** 9,12 ****
--- 9,18 ----
#if defined(__GNUC__) && defined(__POWERPC__) && defined(__ALTIVEC__)
+ //#define USEVECLIB
+
+ #ifdef USEVECLIB
+ #include <vecLib/vDSP.h>
+ #include <vecLib/vfp.h>
+ #endif
/* functions for unaligned vector data - taken from http://developer.apple.com/hardware/ve/alignment.html */
***************
*** 55,62 ****
for(; n--; src += 16,dst += 16) {
! vec_st(vec_ld( 0,src), 0,dst);
! vec_st(vec_ld(16,src),16,dst);
! vec_st(vec_ld(32,src),32,dst);
! vec_st(vec_ld(48,src),48,dst);
}
return w+4;
--- 61,72 ----
for(; n--; src += 16,dst += 16) {
! vector float a1 = vec_ld( 0,src);
! vector float a2 = vec_ld(16,src);
! vector float a3 = vec_ld(32,src);
! vector float a4 = vec_ld(48,src);
! vec_st(a1, 0,dst);
! vec_st(a2,16,dst);
! vec_st(a3,32,dst);
! vec_st(a4,48,dst);
}
return w+4;
***************
*** 80,83 ****
--- 90,96 ----
t_int *plus_perf_ve_gcc(t_int *w)
{
+ #ifdef USEVECLIB
+ vadd((const t_float *)w[1],1,(const t_float *)w[2],1,(t_float *)w[3],1,w[4]);
+ #else
const t_float *src1 = (const t_float *)w[1];
const t_float *src2 = (const t_float *)w[2];
***************
*** 86,94 ****
for(; n--; src1 += 16,src2 += 16,dst += 16) {
! vec_st(vec_add(vec_ld( 0,src1),vec_ld( 0,src2)), 0,dst);
! vec_st(vec_add(vec_ld(16,src1),vec_ld(16,src2)),16,dst);
! vec_st(vec_add(vec_ld(32,src1),vec_ld(32,src2)),32,dst);
! vec_st(vec_add(vec_ld(48,src1),vec_ld(48,src2)),48,dst);
}
return w+5;
}
--- 99,118 ----
for(; n--; src1 += 16,src2 += 16,dst += 16) {
! vector float a1 = vec_ld( 0,src1),b1 = vec_ld( 0,src2);
! vector float a2 = vec_ld(16,src1),b2 = vec_ld(16,src2);
! vector float a3 = vec_ld(32,src1),b3 = vec_ld(32,src2);
! vector float a4 = vec_ld(48,src1),b4 = vec_ld(48,src2);
!
! a1 = vec_add(a1,b1);
! a2 = vec_add(a2,b2);
! a3 = vec_add(a3,b3);
! a4 = vec_add(a4,b4);
!
! vec_st(a1, 0,dst);
! vec_st(a2,16,dst);
! vec_st(a3,32,dst);
! vec_st(a4,48,dst);
}
+ #endif
return w+5;
}
***************
*** 102,109 ****
for(; n--; src += 16,dst += 16) {
! vec_st(vec_add(vec_ld( 0,src),arg), 0,dst);
! vec_st(vec_add(vec_ld(16,src),arg),16,dst);
! vec_st(vec_add(vec_ld(32,src),arg),32,dst);
! vec_st(vec_add(vec_ld(48,src),arg),48,dst);
}
return w+5;
--- 126,143 ----
for(; n--; src += 16,dst += 16) {
! vector float a1 = vec_ld( 0,src);
! vector float a2 = vec_ld(16,src);
! vector float a3 = vec_ld(32,src);
! vector float a4 = vec_ld(48,src);
!
! a1 = vec_add(a1,arg);
! a2 = vec_add(a2,arg);
! a3 = vec_add(a3,arg);
! a4 = vec_add(a4,arg);
!
! vec_st(a1, 0,dst);
! vec_st(a2,16,dst);
! vec_st(a3,32,dst);
! vec_st(a4,48,dst);
}
return w+5;
***************
*** 112,115 ****
--- 146,152 ----
t_int *minus_perf_ve_gcc(t_int *w)
{
+ #ifdef USEVECLIB
+ vsub((const t_float *)w[1],1,(const t_float *)w[2],1,(t_float *)w[3],1,w[4]);
+ #else
const t_float *src1 = (const t_float *)w[1];
const t_float *src2 = (const t_float *)w[2];
***************
*** 118,126 ****
for(; n--; src1 += 16,src2 += 16,dst += 16) {
! vec_st(vec_sub(vec_ld( 0,src1),vec_ld( 0,src2)), 0,dst);
! vec_st(vec_sub(vec_ld(16,src1),vec_ld(16,src2)),16,dst);
! vec_st(vec_sub(vec_ld(32,src1),vec_ld(32,src2)),32,dst);
! vec_st(vec_sub(vec_ld(48,src1),vec_ld(48,src2)),48,dst);
}
return w+5;
}
--- 155,174 ----
for(; n--; src1 += 16,src2 += 16,dst += 16) {
! vector float a1 = vec_ld( 0,src1),b1 = vec_ld( 0,src2);
! vector float a2 = vec_ld(16,src1),b2 = vec_ld(16,src2);
! vector float a3 = vec_ld(32,src1),b3 = vec_ld(32,src2);
! vector float a4 = vec_ld(48,src1),b4 = vec_ld(48,src2);
!
! a1 = vec_sub(a1,b1);
! a2 = vec_sub(a2,b2);
! a3 = vec_sub(a3,b3);
! a4 = vec_sub(a4,b4);
!
! vec_st(a1, 0,dst);
! vec_st(a2,16,dst);
! vec_st(a3,32,dst);
! vec_st(a4,48,dst);
}
+ #endif
return w+5;
}
***************
*** 134,141 ****
for(; n--; src += 16,dst += 16) {
! vec_st(vec_sub(vec_ld( 0,src),arg), 0,dst);
! vec_st(vec_sub(vec_ld(16,src),arg),16,dst);
! vec_st(vec_sub(vec_ld(32,src),arg),32,dst);
! vec_st(vec_sub(vec_ld(48,src),arg),48,dst);
}
return w+5;
--- 182,199 ----
for(; n--; src += 16,dst += 16) {
! vector float a1 = vec_ld( 0,src);
! vector float a2 = vec_ld(16,src);
! vector float a3 = vec_ld(32,src);
! vector float a4 = vec_ld(48,src);
!
! a1 = vec_sub(a1,arg);
! a2 = vec_sub(a2,arg);
! a3 = vec_sub(a3,arg);
! a4 = vec_sub(a4,arg);
!
! vec_st(a1, 0,dst);
! vec_st(a2,16,dst);
! vec_st(a3,32,dst);
! vec_st(a4,48,dst);
}
return w+5;
***************
*** 144,147 ****
--- 202,208 ----
t_int *times_perf_ve_gcc(t_int *w)
{
+ #ifdef USEVECLIB
+ vmul((const t_float *)w[1],1,(const t_float *)w[2],1,(t_float *)w[3],1,w[4]);
+ #else
const t_float *src1 = (const t_float *)w[1];
const t_float *src2 = (const t_float *)w[2];
***************
*** 151,159 ****
for(; n--; src1 += 16,src2 += 16,dst += 16) {
! vec_st(vec_madd(vec_ld( 0,src1),vec_ld( 0,src2),zero), 0,dst);
! vec_st(vec_madd(vec_ld(16,src1),vec_ld(16,src2),zero),16,dst);
! vec_st(vec_madd(vec_ld(32,src1),vec_ld(32,src2),zero),32,dst);
! vec_st(vec_madd(vec_ld(48,src1),vec_ld(48,src2),zero),48,dst);
}
return w+5;
}
--- 212,231 ----
for(; n--; src1 += 16,src2 += 16,dst += 16) {
! vector float a1 = vec_ld( 0,src1),b1 = vec_ld( 0,src2);
! vector float a2 = vec_ld(16,src1),b2 = vec_ld(16,src2);
! vector float a3 = vec_ld(32,src1),b3 = vec_ld(32,src2);
! vector float a4 = vec_ld(48,src1),b4 = vec_ld(48,src2);
!
! a1 = vec_madd(a1,b1,zero);
! a2 = vec_madd(a2,b2,zero);
! a3 = vec_madd(a3,b3,zero);
! a4 = vec_madd(a4,b4,zero);
!
! vec_st(a1, 0,dst);
! vec_st(a2,16,dst);
! vec_st(a3,32,dst);
! vec_st(a4,48,dst);
}
+ #endif
return w+5;
}
***************
*** 161,164 ****
--- 233,239 ----
t_int *scalartimes_perf_ve_gcc(t_int *w)
{
+ #ifdef USEVECLIB
+ vsmul((const t_float *)w[1],1,(t_float *)w[2],(t_float *)w[3],1,w[4]);
+ #else
const t_float *src = (const t_float *)w[1];
const vector float arg = LoadValue(w[2]);
***************
*** 168,175 ****
for(; n--; src += 16,dst += 16) {
! vec_st(vec_madd(vec_ld( 0,src),arg,zero), 0,dst);
! vec_st(vec_madd(vec_ld(16,src),arg,zero),16,dst);
! vec_st(vec_madd(vec_ld(32,src),arg,zero),32,dst);
! vec_st(vec_madd(vec_ld(48,src),arg,zero),48,dst);
}
return w+5;
--- 243,401 ----
for(; n--; src += 16,dst += 16) {
! vector float a1 = vec_ld( 0,src);
! vector float a2 = vec_ld(16,src);
! vector float a3 = vec_ld(32,src);
! vector float a4 = vec_ld(48,src);
!
! a1 = vec_madd(a1,arg,zero);
! a2 = vec_madd(a2,arg,zero);
! a3 = vec_madd(a3,arg,zero);
! a4 = vec_madd(a4,arg,zero);
!
! vec_st(a1, 0,dst);
! vec_st(a2,16,dst);
! vec_st(a3,32,dst);
! vec_st(a4,48,dst);
! }
! #endif
! return w+5;
! }
!
! t_int *sqr_perf_ve_gcc(t_int *w)
! {
! #ifdef USEVECLIB
! vsq((const t_float *)w[1],1,(t_float *)w[2],1,w[3]);
! #else
! const t_float *src = (const t_float *)w[1];
! t_float *dst = (t_float *)w[2];
! const vector float zero = (vector float)(0);
! int n = w[3]>>4;
!
! for(; n--; src += 16,dst += 16) {
! vector float a1 = vec_ld( 0,src);
! vector float a2 = vec_ld(16,src);
! vector float a3 = vec_ld(32,src);
! vector float a4 = vec_ld(48,src);
!
! a1 = vec_madd(a1,a1,zero);
! a2 = vec_madd(a2,a2,zero);
! a3 = vec_madd(a3,a3,zero);
! a4 = vec_madd(a4,a4,zero);
!
! vec_st(a1, 0,dst);
! vec_st(a2,16,dst);
! vec_st(a3,32,dst);
! vec_st(a4,48,dst);
! }
! #endif
! return w+4;
! }
!
! t_int *over_perf_ve_gcc(t_int *w)
! {
! const t_float *src1 = (const t_float *)w[1];
! const t_float *src2 = (const t_float *)w[2];
! t_float *dst = (t_float *)w[3];
! const vector float zero = (vector float)(0);
! const vector float one = (vector float)(1);
! int n = w[4]>>4;
!
! for(; n--; src1 += 16,src2 += 16,dst += 16) {
! #ifdef USEVECLIB
! /* no zero checking here */
! vec_st(vdivf(vec_ld( 0,src1),vec_ld( 0,src2)), 0,dst);
! vec_st(vdivf(vec_ld(16,src1),vec_ld(16,src2)),16,dst);
! vec_st(vdivf(vec_ld(32,src1),vec_ld(32,src2)),32,dst);
! vec_st(vdivf(vec_ld(48,src1),vec_ld(48,src2)),48,dst);
! #else
! vector float data1 = vec_ld( 0,src2);
! vector float data2 = vec_ld(16,src2);
! vector float data3 = vec_ld(32,src2);
! vector float data4 = vec_ld(48,src2);
!
! vector unsigned char mask1 = vec_nor((vector unsigned char)vec_cmpeq(data1,zero),(vector unsigned char)zero); /* bit mask... all 0 for data = 0., all 1 else */
! vector unsigned char mask2 = vec_nor((vector unsigned char)vec_cmpeq(data2,zero),(vector unsigned char)zero); /* bit mask... all 0 for data = 0., all 1 else */
! vector unsigned char mask3 = vec_nor((vector unsigned char)vec_cmpeq(data3,zero),(vector unsigned char)zero); /* bit mask... all 0 for data = 0., all 1 else */
! vector unsigned char mask4 = vec_nor((vector unsigned char)vec_cmpeq(data4,zero),(vector unsigned char)zero); /* bit mask... all 0 for data = 0., all 1 else */
!
! /* make estimated reciprocal and zero out NANs */
! vector float tmp1 = vec_re(data1);
! vector float tmp2 = vec_re(data2);
! vector float tmp3 = vec_re(data3);
! vector float tmp4 = vec_re(data4);
!
! tmp1 = (vector float)vec_and((vector unsigned char)tmp1,mask1);
! tmp2 = (vector float)vec_and((vector unsigned char)tmp2,mask2);
! tmp3 = (vector float)vec_and((vector unsigned char)tmp3,mask3);
! tmp4 = (vector float)vec_and((vector unsigned char)tmp4,mask4);
!
! data1 = vec_madd( vec_nmsub( tmp1, data1, one ), tmp1, tmp1 );
! data2 = vec_madd( vec_nmsub( tmp2, data2, one ), tmp2, tmp2 );
! data3 = vec_madd( vec_nmsub( tmp3, data3, one ), tmp3, tmp3 );
! data4 = vec_madd( vec_nmsub( tmp4, data4, one ), tmp4, tmp4 );
!
! tmp1 = vec_ld( 0,src1);
! tmp2 = vec_ld(16,src1);
! tmp3 = vec_ld(32,src1);
! tmp4 = vec_ld(48,src1);
!
! data1 = vec_madd(tmp1,data1,zero);
! data2 = vec_madd(tmp2,data2,zero);
! data3 = vec_madd(tmp3,data3,zero);
! data4 = vec_madd(tmp4,data4,zero);
!
! vec_st(data1, 0,dst);
! vec_st(data2,16,dst);
! vec_st(data3,32,dst);
! vec_st(data4,48,dst);
! #endif
! }
! return w+5;
! }
!
! t_int *scalarover_perf_ve_gcc(t_int *w)
! {
! t_float *dst = (t_float *)w[3];
! const vector float zero = (vector float)(0);
! int n = w[4]>>4;
!
! if(*(t_float *)w[2]) {
! const t_float *src = (const t_float *)w[1];
! #ifdef USEVECLIB
! float arg = *(t_float *)w[2]?1./ *(t_float *)w[2]: 0;
! vsmul(src,1,&arg,dst,1,w[4]);
! #else
! const vector float v = LoadValue(w[2]);
! const vector float one = (vector float)(1);
!
! vector float estimate = vec_re(v);
! vector float arg = vec_madd( vec_nmsub( estimate, v, one ), estimate, estimate );
!
! for(; n--; src += 16,dst += 16) {
! vector float a1 = vec_ld( 0,src);
! vector float a2 = vec_ld(16,src);
! vector float a3 = vec_ld(32,src);
! vector float a4 = vec_ld(48,src);
!
! a1 = vec_madd(a1,arg,zero);
! a2 = vec_madd(a2,arg,zero);
! a3 = vec_madd(a3,arg,zero);
! a4 = vec_madd(a4,arg,zero);
!
! vec_st(a1, 0,dst);
! vec_st(a2,16,dst);
! vec_st(a3,32,dst);
! vec_st(a4,48,dst);
! }
! #endif
! }
! else {
! /* zero all output */
! for(; n--; dst += 16) {
! vec_st(zero, 0,dst);
! vec_st(zero,16,dst);
! vec_st(zero,32,dst);
! vec_st(zero,48,dst);
! }
}
return w+5;
***************
*** 184,191 ****
for(; n--; src1 += 16,src2 += 16,dst += 16) {
! vec_st(vec_min(vec_ld( 0,src1),vec_ld( 0,src2)), 0,dst);
! vec_st(vec_min(vec_ld(16,src1),vec_ld(16,src2)),16,dst);
! vec_st(vec_min(vec_ld(32,src1),vec_ld(32,src2)),32,dst);
! vec_st(vec_min(vec_ld(48,src1),vec_ld(48,src2)),48,dst);
}
return w+5;
--- 410,427 ----
for(; n--; src1 += 16,src2 += 16,dst += 16) {
! vector float a1 = vec_ld( 0,src1),b1 = vec_ld( 0,src2);
! vector float a2 = vec_ld(16,src1),b2 = vec_ld(16,src2);
! vector float a3 = vec_ld(32,src1),b3 = vec_ld(32,src2);
! vector float a4 = vec_ld(48,src1),b4 = vec_ld(48,src2);
!
! a1 = vec_min(a1,b1);
! a2 = vec_min(a2,b2);
! a3 = vec_min(a3,b3);
! a4 = vec_min(a4,b4);
!
! vec_st(a1, 0,dst);
! vec_st(a2,16,dst);
! vec_st(a3,32,dst);
! vec_st(a4,48,dst);
}
return w+5;
***************
*** 200,207 ****
for(; n--; src += 16,dst += 16) {
! vec_st(vec_min(vec_ld( 0,src),arg), 0,dst);
! vec_st(vec_min(vec_ld(16,src),arg),16,dst);
! vec_st(vec_min(vec_ld(32,src),arg),32,dst);
! vec_st(vec_min(vec_ld(48,src),arg),48,dst);
}
return w+5;
--- 436,453 ----
for(; n--; src += 16,dst += 16) {
! vector float a1 = vec_ld( 0,src);
! vector float a2 = vec_ld(16,src);
! vector float a3 = vec_ld(32,src);
! vector float a4 = vec_ld(48,src);
!
! a1 = vec_min(a1,arg);
! a2 = vec_min(a2,arg);
! a3 = vec_min(a3,arg);
! a4 = vec_min(a4,arg);
!
! vec_st(a1, 0,dst);
! vec_st(a2,16,dst);
! vec_st(a3,32,dst);
! vec_st(a4,48,dst);
}
return w+5;
***************
*** 216,223 ****
for(; n--; src1 += 16,src2 += 16,dst += 16) {
! vec_st(vec_max(vec_ld( 0,src1),vec_ld( 0,src2)), 0,dst);
! vec_st(vec_max(vec_ld(16,src1),vec_ld(16,src2)),16,dst);
! vec_st(vec_max(vec_ld(32,src1),vec_ld(32,src2)),32,dst);
! vec_st(vec_max(vec_ld(48,src1),vec_ld(48,src2)),48,dst);
}
return w+5;
--- 462,479 ----
for(; n--; src1 += 16,src2 += 16,dst += 16) {
! vector float a1 = vec_ld( 0,src1),b1 = vec_ld( 0,src2);
! vector float a2 = vec_ld(16,src1),b2 = vec_ld(16,src2);
! vector float a3 = vec_ld(32,src1),b3 = vec_ld(32,src2);
! vector float a4 = vec_ld(48,src1),b4 = vec_ld(48,src2);
!
! a1 = vec_max(a1,b1);
! a2 = vec_max(a2,b2);
! a3 = vec_max(a3,b3);
! a4 = vec_max(a4,b4);
!
! vec_st(a1, 0,dst);
! vec_st(a2,16,dst);
! vec_st(a3,32,dst);
! vec_st(a4,48,dst);
}
return w+5;
***************
*** 232,244 ****
for(; n--; src += 16,dst += 16) {
! vec_st(vec_max(vec_ld( 0,src),arg), 0,dst);
! vec_st(vec_max(vec_ld(16,src),arg),16,dst);
! vec_st(vec_max(vec_ld(32,src),arg),32,dst);
! vec_st(vec_max(vec_ld(48,src),arg),48,dst);
}
return w+5;
}
! #if 0 /* doesn't work */
t_int *sigsqrt_perf_ve_gcc(t_int *w)
{
--- 488,586 ----
for(; n--; src += 16,dst += 16) {
! vector float a1 = vec_ld( 0,src);
! vector float a2 = vec_ld(16,src);
! vector float a3 = vec_ld(32,src);
! vector float a4 = vec_ld(48,src);
!
! a1 = vec_max(a1,arg);
! a2 = vec_max(a2,arg);
! a3 = vec_max(a3,arg);
! a4 = vec_max(a4,arg);
!
! vec_st(a1, 0,dst);
! vec_st(a2,16,dst);
! vec_st(a3,32,dst);
! vec_st(a4,48,dst);
}
return w+5;
}
! t_int *clip_perf_ve_gcc(t_int *w)
! {
! const t_float *src = (const t_float *)w[1];
! t_float *dst = (t_float *)w[2];
! const vector float lo = LoadValue(w[3]);
! const vector float hi = LoadValue(w[4]);
! int n = w[5]>>4;
!
! for(; n--; src += 16,dst += 16) {
! vector float data1 = vec_ld( 0,src);
! vector float data2 = vec_ld(16,src);
! vector float data3 = vec_ld(32,src);
! vector float data4 = vec_ld(48,src);
!
! vector unsigned char mlo1 = (vector unsigned char)vec_cmple(data1,lo); /* bit mask data <= lo */
! vector unsigned char mlo2 = (vector unsigned char)vec_cmple(data2,lo); /* bit mask data <= lo */
! vector unsigned char mlo3 = (vector unsigned char)vec_cmple(data3,lo); /* bit mask data <= lo */
! vector unsigned char mlo4 = (vector unsigned char)vec_cmple(data4,lo); /* bit mask data <= lo */
!
! vector unsigned char mhi1 = (vector unsigned char)vec_cmpge(data1,hi); /* bit mask data >= hi */
! vector unsigned char mhi2 = (vector unsigned char)vec_cmpge(data2,hi); /* bit mask data >= hi */
! vector unsigned char mhi3 = (vector unsigned char)vec_cmpge(data3,hi); /* bit mask data >= hi */
! vector unsigned char mhi4 = (vector unsigned char)vec_cmpge(data4,hi); /* bit mask data >= hi */
!
! data1 = (vector float)vec_and((vector unsigned char)data1,vec_nor(mlo1,mhi1));
! data2 = (vector float)vec_and((vector unsigned char)data2,vec_nor(mlo2,mhi2));
! data3 = (vector float)vec_and((vector unsigned char)data3,vec_nor(mlo3,mhi3));
! data4 = (vector float)vec_and((vector unsigned char)data4,vec_nor(mlo4,mhi4));
!
! mlo1 = vec_and((vector unsigned char)lo,mlo1);
! mlo2 = vec_and((vector unsigned char)lo,mlo2);
! mlo3 = vec_and((vector unsigned char)lo,mlo3);
! mlo4 = vec_and((vector unsigned char)lo,mlo4);
!
! mhi1 = vec_and((vector unsigned char)hi,mhi1);
! mhi2 = vec_and((vector unsigned char)hi,mhi2);
! mhi3 = vec_and((vector unsigned char)hi,mhi3);
! mhi4 = vec_and((vector unsigned char)hi,mhi4);
!
! data1 = (vector float)vec_or(vec_or(mlo1,mhi1),(vector unsigned char)data1);
! data2 = (vector float)vec_or(vec_or(mlo2,mhi2),(vector unsigned char)data2);
! data3 = (vector float)vec_or(vec_or(mlo3,mhi3),(vector unsigned char)data3);
! data4 = (vector float)vec_or(vec_or(mlo4,mhi4),(vector unsigned char)data4);
!
! vec_st(data1, 0,dst);
! vec_st(data2,16,dst);
! vec_st(data3,32,dst);
! vec_st(data4,48,dst);
! }
! return w+6;
! }
!
! t_int *sigwrap_perf_ve_gcc(t_int *w)
! {
! const t_float *src = (const t_float *)w[1];
! t_float *dst = (t_float *)w[2];
! int n = w[3]>>4;
!
! for(; n--; src += 16,dst += 16) {
! vector float data1 = vec_ld( 0,src);
! vector float data2 = vec_ld(16,src);
! vector float data3 = vec_ld(32,src);
! vector float data4 = vec_ld(48,src);
!
! data1 = vec_sub(data1,vec_floor(data1));
! data2 = vec_sub(data2,vec_floor(data2));
! data3 = vec_sub(data3,vec_floor(data3));
! data4 = vec_sub(data4,vec_floor(data4));
!
! vec_st(data1, 0,dst);
! vec_st(data2,16,dst);
! vec_st(data3,32,dst);
! vec_st(data4,48,dst);
! }
! return w+4;
! }
!
t_int *sigsqrt_perf_ve_gcc(t_int *w)
{
***************
*** 252,269 ****
for(; n--; src += 16,dst += 16) {
! /* http://developer.apple.com/hardware/ve/algorithms.html*/
! vector float data1 = vec_ld( 0,src),estimate1 = vec_rsqrte(data1);
! vec_st(vec_madd(data1,vec_madd( vec_nmsub( data1, vec_madd( estimate1, estimate1, zero ), one ), vec_madd( estimate1, oneHalf, zero ), estimate1 ), zero), 0,dst);
! vector float data2 = vec_ld(16,src),estimate2 = vec_rsqrte(data2);
! vec_st(vec_madd(data2,vec_madd( vec_nmsub( data2, vec_madd( estimate2, estimate2, zero ), one ), vec_madd( estimate2, oneHalf, zero ), estimate2 ), zero),16,dst);
! vector float data3 = vec_ld(32,src),estimate3 = vec_rsqrte(data3);
! vec_st(vec_madd(data3,vec_madd( vec_nmsub( data3, vec_madd( estimate3, estimate3, zero ), one ), vec_madd( estimate3, oneHalf, zero ), estimate3 ), zero),32,dst);
! vector float data4 = vec_ld(48,src),estimate4 = vec_rsqrte(data4);
! vec_st(vec_madd(data4,vec_madd( vec_nmsub( data4, vec_madd( estimate4, estimate4, zero ), one ), vec_madd( estimate4, oneHalf, zero ), estimate4 ), zero),48,dst);
}
return w+4;
}
t_int *sigrsqrt_perf_ve_gcc(t_int *w)
{
--- 594,642 ----
for(; n--; src += 16,dst += 16) {
! /* http://developer.apple.com/hardware/ve/algorithms.html
! Just as in Miller's scalar sigsqrt_perform,
! first a rsqrt estimate is calculated which is then refined by one round of Newton-Raphson.
! Here, to avoid branching, a mask is generated which zeroes out any NaNs that would otherwise result.
! */
!
! #ifdef USEVECLIB
! /* no zero checking here */
! vec_st(vsqrtf(vec_ld( 0,src)), 0,dst);
! vec_st(vsqrtf(vec_ld(16,src)),16,dst);
! vec_st(vsqrtf(vec_ld(32,src)),32,dst);
! vec_st(vsqrtf(vec_ld(48,src)),48,dst);
! #else
! vector float data1 = vec_ld( 0,src);
! vector float data2 = vec_ld(16,src);
! vector float data3 = vec_ld(32,src);
! vector float data4 = vec_ld(48,src);
!
! const vector unsigned char mask1 = vec_nor((vector unsigned char)vec_cmple(data1,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
! const vector unsigned char mask2 = vec_nor((vector unsigned char)vec_cmple(data2,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
! const vector unsigned char mask3 = vec_nor((vector unsigned char)vec_cmple(data3,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
! const vector unsigned char mask4 = vec_nor((vector unsigned char)vec_cmple(data4,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
!
! const vector float estimate1 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data1),mask1);
! const vector float estimate2 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data2),mask2);
! const vector float estimate3 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data3),mask3);
! const vector float estimate4 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data4),mask4);
!
! /* this can still be improved.... */
! data1 = vec_madd(data1,vec_madd( vec_nmsub( data1, vec_madd( estimate1, estimate1, zero ), one ), vec_madd( estimate1, oneHalf, zero ), estimate1 ), zero);
! data2 = vec_madd(data2,vec_madd( vec_nmsub( data2, vec_madd( estimate2, estimate2, zero ), one ), vec_madd( estimate2, oneHalf, zero ), estimate2 ), zero);
! data3 = vec_madd(data3,vec_madd( vec_nmsub( data3, vec_madd( estimate3, estimate3, zero ), one ), vec_madd( estimate3, oneHalf, zero ), estimate3 ), zero);
! data4 = vec_madd(data4,vec_madd( vec_nmsub( data4, vec_madd( estimate4, estimate4, zero ), one ), vec_madd( estimate4, oneHalf, zero ), estimate4 ), zero);
!
! vec_st(data1, 0,dst);
! vec_st(data2,16,dst);
! vec_st(data3,32,dst);
! vec_st(data4,48,dst);
! #endif
}
return w+4;
}
+ /* Attention: this differs from sigsqrt_perform, which delivers non-zero for a zero input... I don't think that behavior is intended... */
t_int *sigrsqrt_perf_ve_gcc(t_int *w)
{
***************
*** 277,294 ****
for(; n--; src += 16,dst += 16) {
! /* http://developer.apple.com/hardware/ve/algorithms.html */
! vector float data1 = vec_ld( 0,src),estimate1 = vec_rsqrte(data1);
! vec_st(vec_madd( vec_nmsub( data1, vec_madd( estimate1, estimate1, zero ), one ), vec_madd( estimate1, oneHalf, zero ), estimate1 ), 0,dst);
! vector float data2 = vec_ld(16,src),estimate2 = vec_rsqrte(data2);
! vec_st(vec_madd( vec_nmsub( data2, vec_madd( estimate2, estimate2, zero ), one ), vec_madd( estimate2, oneHalf, zero ), estimate2 ),16,dst);
! vector float data3 = vec_ld(32,src),estimate3 = vec_rsqrte(data3);
! vec_st(vec_madd( vec_nmsub( data3, vec_madd( estimate3, estimate3, zero ), one ), vec_madd( estimate3, oneHalf, zero ), estimate3 ),32,dst);
! vector float data4 = vec_ld(48,src),estimate4 = vec_rsqrte(data4);
! vec_st(vec_madd( vec_nmsub( data4, vec_madd( estimate4, estimate4, zero ), one ), vec_madd( estimate4, oneHalf, zero ), estimate4 ),48,dst);
}
return w+4;
}
- #endif
#endif
--- 650,700 ----
for(; n--; src += 16,dst += 16) {
! /* http://developer.apple.com/hardware/ve/algorithms.html
! Just as in Miller's scalar sigrsqrt_perform,
! first a rsqrt estimate is calculated which is then refined by one round of Newton-Raphson.
! Here, to avoid branching, a mask is generated which zeroes out any NaNs that would otherwise result.
! */
!
! #ifdef USEVECLIB
! /* no zero checking here */
! vec_st(vrsqrtf(vec_ld( 0,src)), 0,dst);
! vec_st(vrsqrtf(vec_ld(16,src)),16,dst);
! vec_st(vrsqrtf(vec_ld(32,src)),32,dst);
! vec_st(vrsqrtf(vec_ld(48,src)),48,dst);
! #else
! vector float data1 = vec_ld( 0,src);
! vector float data2 = vec_ld(16,src);
! vector float data3 = vec_ld(32,src);
! vector float data4 = vec_ld(48,src);
!
! const vector unsigned char mask1 = vec_nor((vector unsigned char)vec_cmple(data1,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
! const vector unsigned char mask2 = vec_nor((vector unsigned char)vec_cmple(data2,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
! const vector unsigned char mask3 = vec_nor((vector unsigned char)vec_cmple(data3,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
! const vector unsigned char mask4 = vec_nor((vector unsigned char)vec_cmple(data4,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
!
! const vector float estimate1 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data1),mask1);
! const vector float estimate2 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data2),mask2);
! const vector float estimate3 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data3),mask3);
! const vector float estimate4 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data4),mask4);
!
! data1 = vec_nmsub( data1, vec_madd( estimate1, estimate1, zero ), one );
! data2 = vec_nmsub( data2, vec_madd( estimate2, estimate2, zero ), one );
! data3 = vec_nmsub( data3, vec_madd( estimate3, estimate3, zero ), one );
! data4 = vec_nmsub( data4, vec_madd( estimate4, estimate4, zero ), one );
!
! data1 = vec_madd( data1, vec_madd( estimate1, oneHalf, zero ), estimate1 );
! data2 = vec_madd( data2, vec_madd( estimate2, oneHalf, zero ), estimate2 );
! data3 = vec_madd( data3, vec_madd( estimate3, oneHalf, zero ), estimate3 );
! data4 = vec_madd( data4, vec_madd( estimate4, oneHalf, zero ), estimate4 );
!
! vec_st(data1, 0,dst);
! vec_st(data2,16,dst);
! vec_st(data3,32,dst);
! vec_st(data4,48,dst);
! #endif
}
return w+4;
}
#endif
Index: m_simd_ve_gcc.h
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd_ve_gcc.h,v
retrieving revision 1.1.2.2
retrieving revision 1.1.2.3
diff -C2 -d -r1.1.2.2 -r1.1.2.3
*** m_simd_ve_gcc.h 28 Dec 2003 12:32:03 -0000 1.1.2.2
--- m_simd_ve_gcc.h 29 Dec 2003 02:01:57 -0000 1.1.2.3
***************
*** 19,22 ****
--- 19,23 ----
t_int *times_perf_ve_gcc(t_int *w);
t_int *scalartimes_perf_ve_gcc(t_int *w);
+ t_int *sqr_perf_ve_gcc(t_int *w);
t_int *over_perf_ve_gcc(t_int *w);
t_int *scalarover_perf_ve_gcc(t_int *w);
***************
*** 47,52 ****
#define times_perf_simd times_perf_ve_gcc
#define scalartimes_perf_simd scalartimes_perf_ve_gcc
! #define over_perf_simd over_perf8 /* SIMD not implemented */
! #define scalarover_perf_simd scalarover_perf8 /* SIMD not implemented */
#define min_perf_simd min_perf_ve_gcc
#define scalarmin_perf_simd scalarmin_perf_ve_gcc
--- 48,54 ----
#define times_perf_simd times_perf_ve_gcc
#define scalartimes_perf_simd scalartimes_perf_ve_gcc
! #define sqr_perf_simd sqr_perf_ve_gcc
! #define over_perf_simd over_perf_ve_gcc
! #define scalarover_perf_simd scalarover_perf_ve_gcc
#define min_perf_simd min_perf_ve_gcc
#define scalarmin_perf_simd scalarmin_perf_ve_gcc
***************
*** 55,62 ****
/* functions in d_math.c */
! #define clip_perf_simd clip_perform /* SIMD not implemented */
! #define sigwrap_perf_simd sigwrap_perform /* SIMD not implemented */
! #define sigsqrt_perf_simd sigsqrt_perform /* SIMD not working yet */
! #define sigrsqrt_perf_simd sigrsqrt_perform /* SIMD not working yet */
#endif /* __M_SIMD_VE_GCC_H */
--- 57,64 ----
/* functions in d_math.c */
! #define clip_perf_simd clip_perf_ve_gcc
! #define sigwrap_perf_simd sigwrap_perf_ve_gcc
! #define sigsqrt_perf_simd sigsqrt_perf_ve_gcc
! #define sigrsqrt_perf_simd sigrsqrt_perf_ve_gcc
#endif /* __M_SIMD_VE_GCC_H */
- Previous message: [PD-cvs] pd/src d_math.c,1.1.1.1.16.1,1.1.1.1.16.2
- Next message: [PD-cvs] externals/grill/py config-pd-darwin.txt,1.3,1.4 config-pd-linux.txt,1.7,1.8 makefile.pd-darwin,1.3,1.4 readme.txt,1.10,1.11
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
More information about the Pd-cvs
mailing list