[PD-cvs] pd/src m_simd.h,NONE,1.1.2.1 m_simd_def.h,NONE,1.1.2.1 m_simd_sse_gcc.c,NONE,1.1.2.1 m_simd_sse_gcc.h,NONE,1.1.2.1 m_simd_sse_vc.c,NONE,1.1.2.1 m_simd_sse_vc.h,NONE,1.1.2.1 m_simd_ve_gcc.c,NONE,1.1.2.1 m_simd_ve_gcc.h,NONE,1.1.2.1 d_arithmetic.c,1.1.1.1.16.3,1.1.1.1.16.4 d_ctl.c,1.1.1.3.2.5,1.1.1.3.2.6 d_dac.c,1.1.1.2.2.4,1.1.1.2.2.5 d_ugen.c,1.1.1.2.2.5,1.1.1.2.2.6 makefile.in,1.1.1.3.2.10,1.1.1.3.2.11

xovo at users.sourceforge.net
Tue Dec 23 02:15:42 CET 2003


Update of /cvsroot/pure-data/pd/src
In directory sc8-pr-cvs1:/tmp/cvs-serv24434

Modified Files:
      Tag: devel_0_37
	d_arithmetic.c d_ctl.c d_dac.c d_ugen.c makefile.in 
Added Files:
      Tag: devel_0_37
	m_simd.h m_simd_def.h m_simd_sse_gcc.c m_simd_sse_gcc.h 
	m_simd_sse_vc.c m_simd_sse_vc.h m_simd_ve_gcc.c 
	m_simd_ve_gcc.h 
Log Message:
made the SIMD functionality modular 
added implementation templates for all common systems
did some more optimization for SSE/VC++

--- NEW FILE: m_simd.h ---
/* 
    Definitions for SIMD functionality
    added by T.Grill
*/

#ifndef __M_SIMD_H
#define __M_SIMD_H


#ifdef DONTUSESIMD

    /* if SIMD shouldn't be used the checks will always return false */
    #define SIMD_CHKCNT(n) ( 0 )
    #define SIMD_CHKALIGN(ptr) ( 0 )

    #undef SIMD_BYTEALIGN

    #include "m_simd_def.h"

#else

    /* how many floats do we calculate in the loop of a SIMD codelet? */
    #define SIMD_BLOCK 16  /* must be a power of 2 */

    #if defined(_MSC_VER) && defined(_M_IX86)  /* Visual C++ on Intel */
        /* alignment for Intel SSE */
        #define SIMD_BYTEALIGN (128/8)   /* align to 128 bits */

        #include "m_simd_sse_vc.h"

    #elif defined(__GNUC__) && (defined(_X86_) || defined(__i386__) || defined(__i586__) || defined(__i686__))
        /* Intel SSE with GNU C */
        #define SIMD_BYTEALIGN (128/8)   /* align to 128 bits */

        #include "m_simd_sse_gnuc.h"

    #elif defined(__GNUC__) && defined(__POWERPC__)
        /* Altivec with GNU C */
        #define SIMD_BYTEALIGN (128/8)   /* align to 128 bits */

        #include "m_simd_ve_gnuc.h"      

    #else
        /* default */

        #define SIMD_BYTEALIGN (128/8)   /* assume 128 bits */
        #include "m_simd_def.h"
    #endif

    /* check if n meets the requirements for SIMD codelets */
    #define SIMD_CHKCNT(n) ( ((n)&(SIMD_BLOCK-1)) == 0 )
    /* check if a pointer is correctly aligned for SIMD codelets */
    #define SIMD_CHKALIGN(ptr) ( ((unsigned long)(ptr) & (SIMD_BYTEALIGN-1)) == 0 )

#endif

/* check n and 1 pointer at once */
#define SIMD_CHECK1(n,ptr1) ( SIMD_CHKCNT(n) && SIMD_CHKALIGN(ptr1) )
/* check n and 2 pointers at once */
#define SIMD_CHECK2(n,ptr1,ptr2) ( SIMD_CHKCNT(n) && SIMD_CHKALIGN(ptr1) && SIMD_CHKALIGN(ptr2) )
/* check n and 3 pointers at once */
#define SIMD_CHECK3(n,ptr1,ptr2,ptr3) ( SIMD_CHKCNT(n) && SIMD_CHKALIGN(ptr1) && SIMD_CHKALIGN(ptr2) && SIMD_CHKALIGN(ptr3) )


#endif /* __M_SIMD_H */

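These checks drive the run-time dispatch in the dsp_add_* routines: the SIMD codelet is only scheduled when the block length is a multiple of SIMD_BLOCK and all vectors are suitably aligned; otherwise the unrolled scalar version is used. A minimal sketch of that dispatch, mirroring the updated dsp_add_plus hunk in the d_arithmetic.c diff below (editor's illustration, not part of the header):

    void dsp_add_plus(t_sample *in1, t_sample *in2, t_sample *out, int n)
    {
        if (n&7)                                /* n not a multiple of 8 */
            dsp_add(plus_perform, 4, in1, in2, out, n);
        else if (SIMD_CHECK3(n, in1, in2, out)) /* SIMD block count and alignment OK */
            dsp_add(plus_perf_simd, 4, in1, in2, out, n);
        else                                    /* unrolled scalar version */
            dsp_add(plus_perf8, 4, in1, in2, out, n);
    }
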
--- NEW FILE: m_simd_def.h ---
/* 
    Default SIMD (non-)functionality
    added by T.Grill

    This is used when there's no implementation of SIMD code 
    for the current platform and/or compiler
*/

#ifndef __M_SIMD_DEF_H
#define __M_SIMD_DEF_H

/* These are the functions that can be coded for SIMD */

/* functions in d_ugen.c */
#define zero_perf_simd          zero_perf8

/* functions in d_dac.c */
#define copy_perf_simd          copy_perf8

/* functions in d_ctl.c */
#define sig_tilde_perf_simd     sig_tilde_perf8

/* functions in d_arithmetic.c */
#define plus_perf_simd          plus_perf8
#define scalarplus_perf_simd    scalarplus_perf8
#define minus_perf_simd         minus_perf8
#define scalarminus_perf_simd   scalarminus_perf8
#define times_perf_simd         times_perf8
#define scalartimes_perf_simd   scalartimes_perf8
#define over_perf_simd          over_perf8
#define scalarover_perf_simd    scalarover_perf8
#define min_perf_simd           min_perf8
#define scalarmin_perf_simd     scalarmin_perf8
#define max_perf_simd           max_perf8
#define scalarmax_perf_simd     scalarmax_perf8

#endif /* __M_SIMD_DEF_H */

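With these fallback mappings in place the dispatch code needs no #ifdefs: on a platform without a SIMD implementation (or with DONTUSESIMD defined, where the checks always fail anyway), a call such as

    dsp_add(plus_perf_simd, 4, in1, in2, out, n);

simply compiles to

    dsp_add(plus_perf8, 4, in1, in2, out, n);

i.e. the existing unrolled scalar perform routine is reused.
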
--- NEW FILE: m_simd_sse_gcc.c ---
/* 
    Implementation of SIMD functionality for Intel SSE with GCC compiler
    added by T.Grill
*/

#include "m_pd.h"
#include "m_simd.h"

#if defined(__GNUC__) && (defined(_X86_) || defined(__i386__) || defined(__i586__) || defined(__i686__))


#endif

--- NEW FILE: m_simd_sse_gcc.h ---
/* 
    SIMD functionality for Intel SSE with GCC compiler
    added by T.Grill
*/

#ifndef __M_SIMD_SSE_GCC_H
#define __M_SIMD_SSE_GCC_H

#include "m_pd.h"

/* SIMD functions for SSE with GCC */
t_int *zero_perf_sse_gcc(t_int *w);
t_int *copy_perf_sse_gcc(t_int *w);
t_int *sig_tilde_perf_sse_gcc(t_int *w);
t_int *plus_perf_sse_gcc(t_int *w);
t_int *scalarplus_perf_sse_gcc(t_int *w);
t_int *minus_perf_sse_gcc(t_int *w);
t_int *scalarminus_perf_sse_gcc(t_int *w);
t_int *times_perf_sse_gcc(t_int *w);
t_int *scalartimes_perf_sse_gcc(t_int *w);
t_int *over_perf_sse_gcc(t_int *w);
t_int *scalarover_perf_sse_gcc(t_int *w);
t_int *max_perf_sse_gcc(t_int *w);
t_int *scalarmax_perf_sse_gcc(t_int *w);
t_int *min_perf_sse_gcc(t_int *w);
t_int *scalarmin_perf_sse_gcc(t_int *w);


/* functions in d_ugen.c */
#define zero_perf_simd          zero_perf8 /* SIMD not implemented */

/* functions in d_dac.c */
#define copy_perf_simd          copy_perf8 /* SIMD not implemented */

/* functions in d_ctl.c */
#define sig_tilde_perf_simd     sig_tilde_perf8 /* SIMD not implemented */

/* functions in d_arithmetic.c */
#define plus_perf_simd          plus_perf8 /* SIMD not implemented */
#define scalarplus_perf_simd    scalarplus_perf8 /* SIMD not implemented */
#define minus_perf_simd         minus_perf8 /* SIMD not implemented */
#define scalarminus_perf_simd   scalarminus_perf8 /* SIMD not implemented */
#define times_perf_simd         times_perf8 /* SIMD not implemented */
#define scalartimes_perf_simd   scalartimes_perf8 /* SIMD not implemented */
#define over_perf_simd          over_perf8 /* SIMD not implemented */
#define scalarover_perf_simd    scalarover_perf8 /* SIMD not implemented */
#define min_perf_simd           min_perf8 /* SIMD not implemented */
#define scalarmin_perf_simd     scalarmin_perf8 /* SIMD not implemented */
#define max_perf_simd           max_perf8 /* SIMD not implemented */
#define scalarmax_perf_simd     scalarmax_perf8 /* SIMD not implemented */

#endif /* __M_SIMD_SSE_GCC_H */

--- NEW FILE: m_simd_sse_vc.c ---
/* 
    Implementation of SIMD functionality for Intel SSE with VC++ compiler
    added by T.Grill
*/

#include "m_pd.h"
#include "m_simd.h"

#if defined(NT) && defined(_MSC_VER)

t_int *zero_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		edx,dword ptr [esi + 1*TYPE t_int] /* out */
		/* load zero */
		xorps	xmm0,xmm0

		mov		ecx,[esi + 2*TYPE t_int] /* n */
		shr		ecx,4

		/* should we do more loop unrolling? */
loopa:
		movaps	xmmword ptr[edx],xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm0

		add		edx,16*TYPE t_float
		/* very short loop - let's assume that branch prediction does its job nicely */
		loop	loopa
	}
    return (w+3);
}
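
/* Editor's sketch (not part of the commit or the build): a plain C equivalent of
   zero_perf_sse_vc above. w[1] is the output vector and w[2] the sample count,
   which the SIMD_CHECK1 dispatch guarantees to be a positive multiple of
   SIMD_BLOCK (16); the asm writes 16 zeroed floats per loop iteration. */
static t_int *zero_perf_simd_ref(t_int *w)
{
    t_float *out = (t_float *)(w[1]);
    int n = (int)(w[2]);
    int i;
    for (i = 0; i < n; i++)
        out[i] = 0;
    return (w+3);
}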


t_int *copy_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*		prefetcht0 [ebx] */
		mov		edx,dword ptr [esi + 2*TYPE t_int] /* out */

		mov		ecx,dword ptr [esi + 3*TYPE t_int] /* n */
		shr		ecx,4

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+12*TYPE t_float] */

		movaps	xmm0,xmmword ptr[ebx]
		movaps	xmmword ptr[edx],xmm0
		movaps	xmm1,xmmword ptr[ebx+4*TYPE t_float]
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm1

/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm2,xmmword ptr[ebx+8*TYPE t_float]
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm2
		movaps	xmm3,xmmword ptr[ebx+12*TYPE t_float]
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm3

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa 
	}
    return (w+4);
}


t_int *sig_tilde_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		edx,dword ptr [esi + 2*TYPE t_int] /* out */
		/* load value ... this is not very clean.. */
		mov		eax,dword ptr [esi + 1*TYPE t_int] /* f */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0

		mov		ecx,dword ptr [esi + 3*TYPE t_int] /* n */
		shr		ecx,4

loopa:
		movaps	xmmword ptr[edx],xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm0

		add		edx,16*TYPE t_float
		loop	loopa 
	}
    return (w+4);
}
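
/* Editor's sketch (not part of the commit): a plain C equivalent of
   sig_tilde_perf_sse_vc above. w[1] points to the scalar value, which the asm
   splats across xmm0 with shufps and stores 16 samples per loop iteration. */
static t_int *sig_tilde_perf_simd_ref(t_int *w)
{
    t_float f = *(t_float *)(w[1]);
    t_float *out = (t_float *)(w[2]);
    int n = (int)(w[3]);
    int i;
    for (i = 0; i < n; i++)
        out[i] = f;
    return (w+4);
}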


t_int *plus_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*      prefetcht0 [eax] prefetch first cache line */	
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*      prefetcht0 [ebx] prefetch first cache line */	
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */
		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4 /* divide by 16 */

        xor     esi,esi /* reset index */
/*
		prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/
        movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		addps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		addps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2
/*
        prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/
		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		addps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		addps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

        add     esi,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}
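
/* Editor's sketch (not part of the commit): a plain C equivalent of
   plus_perf_sse_vc above. The asm keeps esi as a running byte offset and
   processes 16 floats per iteration with four addps instructions; the result
   is simply out[i] = in1[i] + in2[i]. */
static t_int *plus_perf_simd_ref(t_int *w)
{
    t_float *in1 = (t_float *)(w[1]);
    t_float *in2 = (t_float *)(w[2]);
    t_float *out = (t_float *)(w[3]);
    int n = (int)(w[4]);
    int i;
    for (i = 0; i < n; i++)
        out[i] = in1[i] + in2[i];
    return (w+5);
}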

t_int *scalarplus_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*      prefetcht0 [ebx] prefetch first cache line */
		
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load value ... this is not very clean.. */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* value */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		addps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		addps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*TYPE t_float] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		addps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		addps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *minus_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*      prefetcht0 [eax]    prefetch first cache line */
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */
		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

        xor     esi,esi /* reset index */
/*
		prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/

		movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		subps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		subps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2

/*
        prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/

		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		subps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		subps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

		add		esi,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *scalarminus_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load value ... this is not very clean.. */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* g */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		subps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		subps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*TYPE t_float] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		subps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		subps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *times_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*      prefetcht0 [eax]    prefetch first cache line */
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

        xor     esi,esi /* reset index */
/*
		prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/

		movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		mulps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		mulps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2

/*
		prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/

		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		mulps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		mulps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

		add		esi,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *scalartimes_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load value ... this is not very clean.. */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* g */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		mulps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		mulps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*4] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		mulps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		mulps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

/* no checking for 0 yet!! */
t_int *over_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*      prefetcht0 [eax]    prefetch first cache line */
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

        xor     esi,esi /* reset index */
/*
		prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/

		movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		divps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		divps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2

/*
		prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/

		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		divps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		divps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

		add		esi,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *scalarover_perf_sse_vc(t_int *w)
{
    static const float one = 1.f;

	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load value */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* g */
		movss	xmm1,xmmword ptr [eax]

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

/*		prefetcht0 [ebx+8*TYPE t_float] */

        /* check for zero */
		xorps	xmm0,xmm0
        comiss  xmm1,xmm0  /* compare xmm1 to 0 */
        /* if the scalar (xmm1) is zero, keep xmm0 = 0 and jump straight into the loop */
        jz      loopa

        /* else compute the reciprocal of the scalar into xmm0 */
/*        rcpps   xmm0,xmm1  ... far too imprecise!! */
        
		movss	xmm0,[one]
        divss   xmm0,xmm1  /* divide xmm0 by xmm1 */
		shufps	xmm0,xmm0,0 /* make xmm0 all the same */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		mulps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		mulps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*4] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		mulps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		mulps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}
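
/* Editor's sketch (not part of the commit): a plain C equivalent of
   scalarover_perf_sse_vc above. The per-sample division is replaced by one
   divss and a multiply; a zero scalar skips the reciprocal and leaves
   xmm0 = 0, so the output is all zeros. rcpps was rejected because its
   low-precision estimate is far too imprecise. */
static t_int *scalarover_perf_simd_ref(t_int *w)
{
    t_float *in = (t_float *)(w[1]);
    t_float g = *(t_float *)(w[2]);
    t_float *out = (t_float *)(w[3]);
    int n = (int)(w[4]);
    t_float rcp = (g != 0 ? 1.f / g : 0);
    int i;
    for (i = 0; i < n; i++)
        out[i] = in[i] * rcp;
    return (w+5);
}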

t_int *max_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*		prefetcht0 [eax] */
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*		prefetcht0 [ebx] */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

        xor     esi,esi /* reset index */
/*
        prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/
		movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		maxps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		maxps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2

/*
        prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/

		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		maxps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		maxps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

		add		esi,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *scalarmax_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*		prefetcht0 [ebx] */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load value ... this is not very clean.. */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* g */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		maxps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		maxps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*TYPE t_float] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		maxps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		maxps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *min_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*		prefetcht0 [eax] */
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*		prefetcht0 [ebx] */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

        mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

        xor     esi,esi /* reset index */
/*
        prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/
		movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		minps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		minps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2
/*
        prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/
		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		minps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		minps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

		add		esi,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *scalarmin_perf_sse_vc(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*		prefetcht0 [ebx] */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load value ... this is not very clean.. */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* g */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		minps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		minps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*TYPE t_float] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		minps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		minps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

#endif

--- NEW FILE: m_simd_sse_vc.h ---
/* 
    SIMD functionality for Intel SSE with VC++ compiler
    added by T.Grill
*/

#ifndef __M_SIMD_SSE_VC_H
#define __M_SIMD_SSE_VC_H

#include "m_pd.h"

/* SIMD functions for SSE with VC++ */
t_int *zero_perf_sse_vc(t_int *w);
t_int *copy_perf_sse_vc(t_int *w);
t_int *sig_tilde_perf_sse_vc(t_int *w);
t_int *plus_perf_sse_vc(t_int *w);
t_int *scalarplus_perf_sse_vc(t_int *w);
t_int *minus_perf_sse_vc(t_int *w);
t_int *scalarminus_perf_sse_vc(t_int *w);
t_int *times_perf_sse_vc(t_int *w);
t_int *scalartimes_perf_sse_vc(t_int *w);
t_int *over_perf_sse_vc(t_int *w);
t_int *scalarover_perf_sse_vc(t_int *w);
t_int *max_perf_sse_vc(t_int *w);
t_int *scalarmax_perf_sse_vc(t_int *w);
t_int *min_perf_sse_vc(t_int *w);
t_int *scalarmin_perf_sse_vc(t_int *w);


/* functions in d_ugen.c */
#define zero_perf_simd          zero_perf_sse_vc

/* functions in d_dac.c */
#define copy_perf_simd          copy_perf_sse_vc

/* functions in d_ctl.c */
#define sig_tilde_perf_simd     sig_tilde_perf_sse_vc

/* functions in d_arithmetic.c */
#define plus_perf_simd          plus_perf_sse_vc
#define scalarplus_perf_simd    scalarplus_perf_sse_vc
#define minus_perf_simd         minus_perf_sse_vc
#define scalarminus_perf_simd   scalarminus_perf_sse_vc
#define times_perf_simd         times_perf_sse_vc
#define scalartimes_perf_simd   scalartimes_perf_sse_vc
#define over_perf_simd          over_perf8 /* SSE version exists but lacks a zero check - use scalar fallback */
#define scalarover_perf_simd    scalarover_perf_sse_vc
#define min_perf_simd           min_perf_sse_vc
#define scalarmin_perf_simd     scalarmin_perf_sse_vc
#define max_perf_simd           max_perf_sse_vc
#define scalarmax_perf_simd     scalarmax_perf_sse_vc

#endif /* __M_SIMD_SSE_VC_H */

--- NEW FILE: m_simd_ve_gcc.c ---
/* 
    Implementation of SIMD functionality for Apple Velocity Engine (AltiVec) with GCC compiler
    added by T.Grill
*/

#include "m_pd.h"
#include "m_simd.h"

#if defined(__GNUC__) && defined(__POWERPC__)


#endif

--- NEW FILE: m_simd_ve_gcc.h ---
/* 
    SIMD functionality for Apple Velocity Engine (AltiVec) with GCC compiler
    added by T.Grill
*/

#ifndef __M_SIMD_VE_GCC_H
#define __M_SIMD_VE_GCC_H

#include "m_pd.h"

/* SIMD functions for VE with GCC */
t_int *zero_perf_ve_gcc(t_int *w);
t_int *copy_perf_ve_gcc(t_int *w);
t_int *sig_tilde_perf_ve_gcc(t_int *w);
t_int *plus_perf_ve_gcc(t_int *w);
t_int *scalarplus_perf_ve_gcc(t_int *w);
t_int *minus_perf_ve_gcc(t_int *w);
t_int *scalarminus_perf_ve_gcc(t_int *w);
t_int *times_perf_ve_gcc(t_int *w);
t_int *scalartimes_perf_ve_gcc(t_int *w);
t_int *over_perf_ve_gcc(t_int *w);
t_int *scalarover_perf_ve_gcc(t_int *w);
t_int *max_perf_ve_gcc(t_int *w);
t_int *scalarmax_perf_ve_gcc(t_int *w);
t_int *min_perf_ve_gcc(t_int *w);
t_int *scalarmin_perf_ve_gcc(t_int *w);


/* functions in d_ugen.c */
#define zero_perf_simd          zero_perf8 /* SIMD not implemented */

/* functions in d_dac.c */
#define copy_perf_simd          copy_perf8 /* SIMD not implemented */

/* functions in d_ctl.c */
#define sig_tilde_perf_simd     sig_tilde_perf8 /* SIMD not implemented */

/* functions in d_arithmetic.c */
#define plus_perf_simd          plus_perf8 /* SIMD not implemented */
#define scalarplus_perf_simd    scalarplus_perf8 /* SIMD not implemented */
#define minus_perf_simd         minus_perf8 /* SIMD not implemented */
#define scalarminus_perf_simd   scalarminus_perf8 /* SIMD not implemented */
#define times_perf_simd         times_perf8 /* SIMD not implemented */
#define scalartimes_perf_simd   scalartimes_perf8 /* SIMD not implemented */
#define over_perf_simd          over_perf8 /* SIMD not implemented */
#define scalarover_perf_simd    scalarover_perf8 /* SIMD not implemented */
#define min_perf_simd           min_perf8 /* SIMD not implemented */
#define scalarmin_perf_simd     scalarmin_perf8 /* SIMD not implemented */
#define max_perf_simd           max_perf8 /* SIMD not implemented */
#define scalarmax_perf_simd     scalarmax_perf8 /* SIMD not implemented */

#endif /* __M_SIMD_VE_GCC_H */

Index: d_arithmetic.c
===================================================================
RCS file: /cvsroot/pure-data/pd/src/d_arithmetic.c,v
retrieving revision 1.1.1.1.16.3
retrieving revision 1.1.1.1.16.4
diff -C2 -d -r1.1.1.1.16.3 -r1.1.1.1.16.4
*** d_arithmetic.c	6 Aug 2003 15:55:10 -0000	1.1.1.1.16.3
--- d_arithmetic.c	23 Dec 2003 01:15:39 -0000	1.1.1.1.16.4
***************
*** 11,14 ****
--- 11,16 ----
  #include "m_pd.h"
  
+ #include "m_simd.h"
+ 
  /* ----------------------------- plus ----------------------------- */
  static t_class *plus_class, *scalarplus_class;
***************
*** 80,143 ****
  }
  
- #if defined(_MSC_VER) && defined(USESIMD)
- t_int *plus_perf_simd(t_int *w)
- {
-     t_float *in1 = (t_float *)(w[1]);
-     t_float *in2 = (t_float *)(w[2]);
-     t_float *out = (t_float *)(w[3]);
-     int n = (int)(w[4]);
- 
- 	__asm {
- 		mov		eax,dword ptr [in1]
- 		/* prefetch first cache line */
- 		prefetcht0 [eax]
- 		mov		ebx,dword ptr [in2]
- 		/* prefetch first cache line */
- 		prefetcht0 [ebx]
- 		mov		edx,dword ptr [out]
- 		mov		ecx,[n]
- 		shr		ecx,4
- 
- 		prefetcht0 [eax+8*4]
- 		prefetcht0 [ebx+8*4]
- 
- loopa:
- 		/* prefetch the following cache lines */
- //		prefetcht0 [eax+16*4]
- //		prefetcht0 [ebx+16*4]
- 
- 		movaps	xmm0,xmmword ptr[eax]
- 		movaps	xmm1,xmmword ptr[ebx]
- 		addps	xmm0,xmm1
- 		movaps	xmmword ptr[edx],xmm0
- 
- 		movaps	xmm2,xmmword ptr[eax+4*4]
- 		movaps	xmm3,xmmword ptr[ebx+4*4]
- 		addps	xmm2,xmm3
- 		movaps	xmmword ptr[edx+4*4],xmm2
- 
- 		/* prefetch the following cache lines */
- //		prefetcht0 [eax+24*4]
- //		prefetcht0 [ebx+24*4]
- 
- 		movaps	xmm4,xmmword ptr[eax+8*4]
- 		movaps	xmm5,xmmword ptr[ebx+8*4]
- 		addps	xmm4,xmm5
- 		movaps	xmmword ptr[edx+8*4],xmm4
- 
- 		movaps	xmm6,xmmword ptr[eax+12*4]
- 		movaps	xmm7,xmmword ptr[ebx+12*4]
- 		addps	xmm6,xmm7
- 		movaps	xmmword ptr[edx+12*4],xmm6
- 
- 		add		eax,16*4
- 		add		ebx,16*4
- 		add		edx,16*4
- 		loop	loopa 
- 	}
-     return (w+5);
- }
- #endif
- 
  t_int *scalarplus_perform(t_int *w)
  {
--- 82,85 ----
***************
*** 168,239 ****
  }
  
- #if defined(_MSC_VER) && defined(USESIMD)
- t_int *scalarplus_perf_simd(t_int *w)
- {
-     t_float *in = (t_float *)(w[1]);
-     t_float g = *(t_float *)(w[2]);
-     t_float *out = (t_float *)(w[3]);
-     int n = (int)(w[4]);
- 
- 	__asm {
- 		mov		ebx,dword ptr [in]
- 		/* prefetch first cache line */
- 		prefetcht0 [ebx]
- 		mov		edx,dword ptr [out]
- 
- 		/* load value ... this is not very clean.. */
- 		movss	xmm0,xmmword ptr [g]
- 		shufps	xmm0,xmm0,0
- 
- 		mov		ecx,[n]
- 		shr		ecx,4
- 
- 		prefetcht0 [ebx+8*4]
- 
- loopa:
- 		/* prefetch the following cache line */
- //		prefetcht0 [ebx+16*4]
- 
- 		movaps	xmm1,xmmword ptr[ebx]
- 		addps	xmm1,xmm0
- 		movaps	xmmword ptr[edx],xmm1
- 
- 		movaps	xmm2,xmmword ptr[ebx+4*4]
- 		addps	xmm2,xmm0
- 		movaps	xmmword ptr[edx+4*4],xmm2
- 
- 		/* prefetch the following cache line */
- //		prefetcht0 [ebx+24*4]
- 
- 		movaps	xmm3,xmmword ptr[ebx+8*4]
- 		addps	xmm3,xmm0
- 		movaps	xmmword ptr[edx+8*4],xmm3
- 
- 		movaps	xmm4,xmmword ptr[ebx+12*4]
- 		addps	xmm4,xmm0
- 		movaps	xmmword ptr[edx+12*4],xmm4
- 
- 		add		ebx,16*4
- 		add		edx,16*4
- 		loop	loopa 
- 	}
-     return (w+5);
- }
- #endif
- 
  void dsp_add_plus(t_sample *in1, t_sample *in2, t_sample *out, int n)
  {
!     if (n&7)
      	dsp_add(plus_perform, 4, in1, in2, out, n);
! #if defined(_MSC_VER) && defined(USESIMD)
! 	else if(
! 		(n&15) == 0 &&
! 		/* check for alignment */
! 		((unsigned long)in1&15) == 0 &&
! 		((unsigned long)in2&15) == 0 &&
! 		((unsigned long)out&15) == 0 
! 	)
      	dsp_add(plus_perf_simd, 4, in1, in2, out, n);
- #endif
      else	
      	dsp_add(plus_perf8, 4, in1, in2, out, n);
--- 110,119 ----
  }
  
  void dsp_add_plus(t_sample *in1, t_sample *in2, t_sample *out, int n)
  {
!     if(n&7)
      	dsp_add(plus_perform, 4, in1, in2, out, n);
! 	else if(SIMD_CHECK3(n,in1,in2,out))
      	dsp_add(plus_perf_simd, 4, in1, in2, out, n);
      else	
      	dsp_add(plus_perf8, 4, in1, in2, out, n);
***************
*** 247,262 ****
  static void scalarplus_dsp(t_scalarplus *x, t_signal **sp)
  {
! 	int n = sp[0]->s_n;
!     if (n&7)
      	dsp_add(scalarplus_perform, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
! #if defined(_MSC_VER) && defined(USESIMD)
! 	else if(
! 		(n&15) == 0 &&
! 		/* check for alignment */
! 		((unsigned long)sp[0]->s_vec&15) == 0 &&
! 		((unsigned long)sp[1]->s_vec&15) == 0 
! 	)
      	dsp_add(scalarplus_perf_simd, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
- #endif
      else	
      	dsp_add(scalarplus_perf8, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
--- 127,135 ----
  static void scalarplus_dsp(t_scalarplus *x, t_signal **sp)
  {
! 	const int n = sp[0]->s_n;
!     if(n&7)
      	dsp_add(scalarplus_perform, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
! 	else if(SIMD_CHECK2(n,sp[0]->s_vec,sp[1]->s_vec))
      	dsp_add(scalarplus_perf_simd, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
      else	
      	dsp_add(scalarplus_perf8, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
***************
*** 347,408 ****
  }
  
- #if defined(_MSC_VER) && defined(USESIMD)
- t_int *minus_perf_simd(t_int *w)
- {
-     t_float *in1 = (t_float *)(w[1]);
-     t_float *in2 = (t_float *)(w[2]);
-     t_float *out = (t_float *)(w[3]);
-     int n = (int)(w[4]);
- 
- 	__asm {
- 		mov		eax,dword ptr [in1]
- 		/* prefetch first cache line */
- 		prefetcht0 [eax]
- 		mov		ebx,dword ptr [in2]
- 		/* prefetch first cache line */
- 		prefetcht0 [ebx]
- 		mov		edx,dword ptr [out]
- 		mov		ecx,[n]
- 		shr		ecx,4
- 
- 		prefetcht0 [eax+8*4]
- 		prefetcht0 [ebx+8*4]
- 
- loopa:
- 		/* prefetch the following cache lines */
- //		prefetcht0 [eax+16*4]
- //		prefetcht0 [ebx+16*4]
- 
- 		movaps	xmm0,xmmword ptr[eax]
- 		movaps	xmm1,xmmword ptr[ebx]
- 		subps	xmm0,xmm1
- 		movaps	xmmword ptr[edx],xmm0
- 		movaps	xmm2,xmmword ptr[eax+4*4]
- 		movaps	xmm3,xmmword ptr[ebx+4*4]
- 		subps	xmm2,xmm3
- 		movaps	xmmword ptr[edx+4*4],xmm2
- 
- 		/* prefetch the following cache lines */
- //		prefetcht0 [eax+24*4]
- //		prefetcht0 [ebx+24*4]
- 
- 		movaps	xmm4,xmmword ptr[eax+8*4]
- 		movaps	xmm5,xmmword ptr[ebx+8*4]
- 		subps	xmm4,xmm5
- 		movaps	xmmword ptr[edx+8*4],xmm4
- 		movaps	xmm6,xmmword ptr[eax+12*4]
- 		movaps	xmm7,xmmword ptr[ebx+12*4]
- 		subps	xmm6,xmm7
- 		movaps	xmmword ptr[edx+12*4],xmm6
- 
- 		add		eax,16*4
- 		add		ebx,16*4
- 		add		edx,16*4
- 		loop	loopa 
- 	}
-     return (w+5);
- }
- #endif
- 
  t_int *scalarminus_perform(t_int *w)
  {
--- 220,223 ----
***************
*** 433,506 ****
  }
  
- #if defined(_MSC_VER) && defined(USESIMD)
- t_int *scalarminus_perf_simd(t_int *w)
- {
-     t_float *in = (t_float *)(w[1]);
-     t_float g = *(t_float *)(w[2]);
-     t_float *out = (t_float *)(w[3]);
-     int n = (int)(w[4]);
- 
- 	__asm {
- 		mov		ebx,dword ptr [in]
- 		/* prefetch first cache line */
- 		prefetcht0 [ebx]
- 
- 		mov		edx,dword ptr [out]
- 
- 		/* load value ... this is not very clean.. */
- 		movss	xmm0,xmmword ptr [g]
- 		shufps	xmm0,xmm0,0
- 
- 		mov		ecx,[n]
- 		shr		ecx,4
- 
- 		prefetcht0 [ebx+8*4]
- 
- loopa:
- 		/* prefetch the following cache line */
- //		prefetcht0 [ebx+16*4]
- 
- 		movaps	xmm1,xmmword ptr[ebx]
- 		subps	xmm1,xmm0
- 		movaps	xmmword ptr[edx],xmm1
- 
- 		movaps	xmm2,xmmword ptr[ebx+4*4]
- 		subps	xmm2,xmm0
- 		movaps	xmmword ptr[edx+4*4],xmm2
- 
- 		/* prefetch the following cache line */
- //		prefetcht0 [ebx+24*4]
- 
- 		movaps	xmm3,xmmword ptr[ebx+8*4]
- 		subps	xmm3,xmm0
- 		movaps	xmmword ptr[edx+8*4],xmm3
- 
- 		movaps	xmm4,xmmword ptr[ebx+12*4]
- 		subps	xmm4,xmm0
- 		movaps	xmmword ptr[edx+12*4],xmm4
- 
- 		add		ebx,16*4
- 		add		edx,16*4
- 		loop	loopa 
- 	}
-     return (w+5);
- }
- #endif
- 
  static void minus_dsp(t_minus *x, t_signal **sp)
  {
! 	int n = sp[0]->s_n;
!     if (n&7)
      	dsp_add(minus_perform, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
! #if defined(_MSC_VER) && defined(USESIMD)
! 	else if(
! 		(n&15) == 0 &&
! 		/* check for alignment */
! 		((unsigned long)sp[0]->s_vec&15) == 0 &&
! 		((unsigned long)sp[1]->s_vec&15) == 0 &&
! 		((unsigned long)sp[2]->s_vec&15) == 0 
! 	)
      	dsp_add(minus_perf_simd, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
- #endif
      else	
      	dsp_add(minus_perf8, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
--- 248,258 ----
  }
  
  static void minus_dsp(t_minus *x, t_signal **sp)
  {
! 	const int n = sp[0]->s_n;
!     if(n&7)
      	dsp_add(minus_perform, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
! 	else if(SIMD_CHECK3(n,sp[0]->s_vec,sp[1]->s_vec,sp[2]->s_vec))
      	dsp_add(minus_perf_simd, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
      else	
      	dsp_add(minus_perf8, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
***************
*** 509,524 ****
  static void scalarminus_dsp(t_scalarminus *x, t_signal **sp)
  {
! 	int n = sp[0]->s_n;
!     if (n&7)
      	dsp_add(scalarminus_perform, 4, sp[0]->s_vec, &x->x_g,sp[1]->s_vec, n);
! #if defined(_MSC_VER) && defined(USESIMD)
! 	else if(
! 		(n&15) == 0 &&
! 		/* check for alignment */
! 		((unsigned long)sp[0]->s_vec&15) == 0 &&
! 		((unsigned long)sp[1]->s_vec&15) == 0 
! 	)
      	dsp_add(scalarminus_perf_simd, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
- #endif
      else	
      	dsp_add(scalarminus_perf8, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
--- 261,269 ----
  static void scalarminus_dsp(t_scalarminus *x, t_signal **sp)
  {
! 	const int n = sp[0]->s_n;
!     if(n&7)
      	dsp_add(scalarminus_perform, 4, sp[0]->s_vec, &x->x_g,sp[1]->s_vec, n);
! 	else if(SIMD_CHECK2(n,sp[0]->s_vec,sp[1]->s_vec))
      	dsp_add(scalarminus_perf_simd, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
      else	
      	dsp_add(scalarminus_perf8, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
***************
*** 610,670 ****
  }
  
- #if defined(_MSC_VER) && defined(USESIMD)
- t_int *times_perf_simd(t_int *w)
- {
-     t_float *in1 = (t_float *)(w[1]);
-     t_float *in2 = (t_float *)(w[2]);
-     t_float *out = (t_float *)(w[3]);
-     int n = (int)(w[4]);
- 
- 	__asm {
- 		mov		eax,dword ptr [in1]
- 		/* prefetch first cache line */
- 		prefetcht0 [eax]
- 		mov		ebx,dword ptr [in2]
- 		/* prefetch first cache line */
- 		prefetcht0 [ebx]
- 		mov		edx,dword ptr [out]
- 		mov		ecx,[n]
- 		shr		ecx,4
- 
- 		prefetcht0 [eax+8*4]
- 		prefetcht0 [ebx+8*4]
- 
- loopa:
- 		/* prefetch the following cache lines */
- //		prefetcht0 [eax+16*4]
- //		prefetcht0 [ebx+16*4]
- 
- 		movaps	xmm0,xmmword ptr[eax]
- 		movaps	xmm1,xmmword ptr[ebx]
- 		mulps	xmm0,xmm1
- 		movaps	xmmword ptr[edx],xmm0
- 		movaps	xmm2,xmmword ptr[eax+4*4]
- 		movaps	xmm3,xmmword ptr[ebx+4*4]
- 		mulps	xmm2,xmm3
- 		movaps	xmmword ptr[edx+4*4],xmm2
- 
- //		prefetcht0 [eax+24*4]
- //		prefetcht0 [ebx+24*4]
- 
- 		movaps	xmm4,xmmword ptr[eax+8*4]
- 		movaps	xmm5,xmmword ptr[ebx+8*4]
- 		mulps	xmm4,xmm5
- 		movaps	xmmword ptr[edx+8*4],xmm4
- 		movaps	xmm6,xmmword ptr[eax+12*4]
- 		movaps	xmm7,xmmword ptr[ebx+12*4]
- 		mulps	xmm6,xmm7
- 		movaps	xmmword ptr[edx+12*4],xmm6
- 
- 		add		eax,16*4
- 		add		ebx,16*4
- 		add		edx,16*4
- 		loop	loopa 
- 	}
-     return (w+5);
- }
- #endif
- 
  t_int *scalartimes_perform(t_int *w)
  {
--- 355,358 ----
***************
*** 695,767 ****
  }
  
- #if defined(_MSC_VER) && defined(USESIMD)
- t_int *scalartimes_perf_simd(t_int *w)
- {
-     t_float *in = (t_float *)(w[1]);
-     t_float g = *(t_float *)(w[2]);
-     t_float *out = (t_float *)(w[3]);
-     int n = (int)(w[4]);
- 
- 	__asm {
- 		mov		ebx,dword ptr [in]
- 		/* prefetch first cache line */
- 		prefetcht0 [ebx]
- 		mov		edx,dword ptr [out]
- 
- 		/* load value ... this is not very clean.. */
- 		movss	xmm0,xmmword ptr [g]
- 		shufps	xmm0,xmm0,0
- 
- 		mov		ecx,[n]
- 		shr		ecx,4
- 
- 		prefetcht0 [ebx+8*4]
- 
- loopa:
- 		/* prefetch the following cache line */
- //		prefetcht0 [ebx+16*4]
- 
- 		movaps	xmm1,xmmword ptr[ebx]
- 		mulps	xmm1,xmm0
- 		movaps	xmmword ptr[edx],xmm1
- 
- 		movaps	xmm2,xmmword ptr[ebx+4*4]
- 		mulps	xmm2,xmm0
- 		movaps	xmmword ptr[edx+4*4],xmm2
- 
- 		/* prefetch the following cache line */
- //		prefetcht0 [ebx+24*4]
- 
- 		movaps	xmm3,xmmword ptr[ebx+8*4]
- 		mulps	xmm3,xmm0
- 		movaps	xmmword ptr[edx+8*4],xmm3
- 
- 		movaps	xmm4,xmmword ptr[ebx+12*4]
- 		mulps	xmm4,xmm0
- 		movaps	xmmword ptr[edx+12*4],xmm4
- 
- 		add		ebx,16*4
- 		add		edx,16*4
- 		loop	loopa 
- 	}
-     return (w+5);
- }
- #endif
- 
  static void times_dsp(t_times *x, t_signal **sp)
  {
! 	int n = sp[0]->s_n;
      if (n&7)
      	dsp_add(times_perform, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
! #if defined(_MSC_VER) && defined(USESIMD)
! 	else if(
! 		(n&15) == 0 &&
! 		/* check for alignment */
! 		((unsigned long)sp[0]->s_vec&15) == 0 &&
! 		((unsigned long)sp[1]->s_vec&15) == 0 &&
! 		((unsigned long)sp[2]->s_vec&15) == 0 
! 	)
      	dsp_add(times_perf_simd, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
- #endif
      else	
      	dsp_add(times_perf8, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
--- 383,393 ----
  }
  
  static void times_dsp(t_times *x, t_signal **sp)
  {
! 	const int n = sp[0]->s_n;
      if (n&7)
      	dsp_add(times_perform, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
! 	else if(SIMD_CHECK3(n,sp[0]->s_vec,sp[1]->s_vec,sp[2]->s_vec))
      	dsp_add(times_perf_simd, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
      else	
      	dsp_add(times_perf8, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
***************
*** 770,785 ****
  static void scalartimes_dsp(t_scalartimes *x, t_signal **sp)
  {
! 	int n = sp[0]->s_n;
      if (n&7)
      	dsp_add(scalartimes_perform, 4, sp[0]->s_vec, &x->x_g,sp[1]->s_vec, n);
! #if defined(_MSC_VER) && defined(USESIMD)
! 	else if(
! 		(n&15) == 0 &&
! 		/* check for alignment */
! 		((unsigned long)sp[0]->s_vec&15) == 0 &&
! 		((unsigned long)sp[1]->s_vec&15) == 0 
! 	)
      	dsp_add(scalartimes_perf_simd, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
- #endif
      else	
      	dsp_add(scalartimes_perf8, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
--- 396,404 ----
  static void scalartimes_dsp(t_scalartimes *x, t_signal **sp)
  {
! 	const int n = sp[0]->s_n;
      if (n&7)
      	dsp_add(scalartimes_perform, 4, sp[0]->s_vec, &x->x_g,sp[1]->s_vec, n);
! 	else if(SIMD_CHECK2(n,sp[0]->s_vec,sp[1]->s_vec))
      	dsp_add(scalartimes_perf_simd, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
      else	
      	dsp_add(scalartimes_perf8, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
***************
*** 909,928 ****
  static void over_dsp(t_over *x, t_signal **sp)
  {
!     if (sp[0]->s_n&7)
!     	dsp_add(over_perform, 4,
! 	    sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, sp[0]->s_n);
      else	
!     	dsp_add(over_perf8, 4,
! 	    sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, sp[0]->s_n);
  }
  
  static void scalarover_dsp(t_scalarover *x, t_signal **sp)
  {
!     if (sp[0]->s_n&7)
!     	dsp_add(scalarover_perform, 4, sp[0]->s_vec, &x->x_g,
! 	    sp[1]->s_vec, sp[0]->s_n);
      else	
!     	dsp_add(scalarover_perf8, 4, sp[0]->s_vec, &x->x_g,
! 	    sp[1]->s_vec, sp[0]->s_n);
  }
  
--- 528,549 ----
  static void over_dsp(t_over *x, t_signal **sp)
  {
! 	const int n = sp[0]->s_n;
!     if (n&7)
!     	dsp_add(over_perform, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
! 	else if(SIMD_CHECK3(n,sp[0]->s_vec,sp[1]->s_vec,sp[2]->s_vec))
!     	dsp_add(over_perf_simd, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
      else	
!     	dsp_add(over_perf8, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
  }
  
  static void scalarover_dsp(t_scalarover *x, t_signal **sp)
  {
! 	const int n = sp[0]->s_n;
!     if (n&7)
!     	dsp_add(scalarover_perform, 4, sp[0]->s_vec, &x->x_g,sp[1]->s_vec, n);
! 	else if(SIMD_CHECK2(n,sp[0]->s_vec,sp[1]->s_vec))
!     	dsp_add(scalarover_perf_simd, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
      else	
!     	dsp_add(scalarover_perf8, 4, sp[0]->s_vec, &x->x_g,sp[1]->s_vec, n);
  }
  
***************
*** 1051,1070 ****
  static void max_dsp(t_max *x, t_signal **sp)
  {
!     if (sp[0]->s_n&7)
!     	dsp_add(max_perform, 4,
! 	    sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, sp[0]->s_n);
      else	
!     	dsp_add(max_perf8, 4,
! 	    sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, sp[0]->s_n);
  }
  
  static void scalarmax_dsp(t_scalarmax *x, t_signal **sp)
  {
!     if (sp[0]->s_n&7)
!     	dsp_add(scalarmax_perform, 4, sp[0]->s_vec, &x->x_g,
! 	    sp[1]->s_vec, sp[0]->s_n);
      else	
!     	dsp_add(scalarmax_perf8, 4, sp[0]->s_vec, &x->x_g,
! 	    sp[1]->s_vec, sp[0]->s_n);
  }
  
--- 672,693 ----
  static void max_dsp(t_max *x, t_signal **sp)
  {
!     const int n = sp[0]->s_n;
!     if(n&7)
!     	dsp_add(max_perform, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
! 	else if(SIMD_CHECK3(n,sp[0]->s_vec,sp[1]->s_vec,sp[2]->s_vec))
!     	dsp_add(max_perf_simd, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
      else	
!     	dsp_add(max_perf8, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
  }
  
  static void scalarmax_dsp(t_scalarmax *x, t_signal **sp)
  {
! 	const int n = sp[0]->s_n;
!     if (n&7)
!     	dsp_add(scalarmax_perform, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
! 	else if(SIMD_CHECK2(n,sp[0]->s_vec,sp[1]->s_vec))
!     	dsp_add(scalarmax_perf_simd, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
      else	
!     	dsp_add(scalarmax_perf8, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
  }
  
***************
*** 1193,1212 ****
  static void min_dsp(t_min *x, t_signal **sp)
  {
!     if (sp[0]->s_n&7)
!     	dsp_add(min_perform, 4,
! 	    sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, sp[0]->s_n);
      else	
!     	dsp_add(min_perf8, 4,
! 	    sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, sp[0]->s_n);
  }
  
  static void scalarmin_dsp(t_scalarmin *x, t_signal **sp)
  {
!     if (sp[0]->s_n&7)
!     	dsp_add(scalarmin_perform, 4, sp[0]->s_vec, &x->x_g,
! 	    sp[1]->s_vec, sp[0]->s_n);
      else	
!     	dsp_add(scalarmin_perf8, 4, sp[0]->s_vec, &x->x_g,
! 	    sp[1]->s_vec, sp[0]->s_n);
  }
  
--- 816,837 ----
  static void min_dsp(t_min *x, t_signal **sp)
  {
!     const int n = sp[0]->s_n;
!     if(n&7)
!     	dsp_add(min_perform, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
! 	else if(SIMD_CHECK3(n,sp[0]->s_vec,sp[1]->s_vec,sp[2]->s_vec))
!     	dsp_add(min_perf_simd, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
      else	
!     	dsp_add(min_perf8, 4, sp[0]->s_vec, sp[1]->s_vec, sp[2]->s_vec, n);
  }
  
  static void scalarmin_dsp(t_scalarmin *x, t_signal **sp)
  {
! 	const int n = sp[0]->s_n;
!     if (n&7)
!     	dsp_add(scalarmin_perform, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
! 	else if(SIMD_CHECK2(n,sp[0]->s_vec,sp[1]->s_vec))
!     	dsp_add(scalarmin_perf_simd, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
      else	
!     	dsp_add(scalarmin_perf8, 4, sp[0]->s_vec, &x->x_g, sp[1]->s_vec, n);
  }
  

Index: d_ctl.c
===================================================================
RCS file: /cvsroot/pure-data/pd/src/d_ctl.c,v
retrieving revision 1.1.1.3.2.5
retrieving revision 1.1.1.3.2.6
diff -C2 -d -r1.1.1.3.2.5 -r1.1.1.3.2.6
*** d_ctl.c	23 Sep 2003 01:04:33 -0000	1.1.1.3.2.5
--- d_ctl.c	23 Dec 2003 01:15:39 -0000	1.1.1.3.2.6
***************
*** 10,13 ****
--- 10,16 ----
  #include "math.h"
  
+ /* T.Grill - include SIMD functionality */
+ #include "m_simd.h"
+ 
  /* -------------------------- sig~ ------------------------------ */
  static t_class *sig_tilde_class;
***************
*** 49,92 ****
  }
  
- #if defined(_MSC_VER) && defined(USESIMD)
- static t_int *sig_tilde_perf_simd(t_int *w)
- {
-     t_float f = *(t_float *)(w[1]);
-     t_float *out = (t_float *)(w[2]);
-     int n = (int)(w[3]);
-     
- 	__asm {
- 		mov		edx,dword ptr [out]
- 		/* load value ... this is not very clean.. */
- 		movss	xmm0,xmmword ptr [f]
- 		shufps	xmm0,xmm0,0
- 		mov		ecx,[n]
- 		shr		ecx,4
- 
- loopa:
- 		movaps	xmmword ptr[edx],xmm0
- 		movaps	xmmword ptr[edx+4*4],xmm0
- 		movaps	xmmword ptr[edx+8*4],xmm0
- 		movaps	xmmword ptr[edx+12*4],xmm0
- 
- 		add		edx,16*4
- 		loop	loopa 
- 	}
-     return (w+4);
- }
- #endif
- 
  void dsp_add_scalarcopy(t_sample *in, t_sample *out, int n)
  {
      if (n&7)
      	dsp_add(sig_tilde_perform, 3, in, out, n);
! #if defined(_MSC_VER) && defined(USESIMD)
! 	else if(
! 		(n&15) == 0 &&
! 		/* check for alignment */
! 		((unsigned long)out&15) == 0 
! 	)
      	dsp_add(sig_tilde_perf_simd, 3, in, out, n);
- #endif
      else	
      	dsp_add(sig_tilde_perf8, 3, in, out, n);
--- 52,61 ----
  }
  
  void dsp_add_scalarcopy(t_sample *in, t_sample *out, int n)
  {
      if (n&7)
      	dsp_add(sig_tilde_perform, 3, in, out, n);
! 	else if(SIMD_CHECK1(n,out))
      	dsp_add(sig_tilde_perf_simd, 3, in, out, n);
      else	
      	dsp_add(sig_tilde_perf8, 3, in, out, n);

Index: d_dac.c
===================================================================
RCS file: /cvsroot/pure-data/pd/src/d_dac.c,v
retrieving revision 1.1.1.2.2.4
retrieving revision 1.1.1.2.2.5
diff -C2 -d -r1.1.1.2.2.4 -r1.1.1.2.2.5
*** d_dac.c	6 Aug 2003 15:55:10 -0000	1.1.1.2.2.4
--- d_dac.c	23 Dec 2003 01:15:39 -0000	1.1.1.2.2.5
***************
*** 9,12 ****
--- 9,15 ----
  #include "s_stuff.h"
  
+ /* T.Grill - include SIMD functionality */
+ #include "m_simd.h"
+ 
  /* ----------------------------- dac~ --------------------------- */
  static t_class *dac_class;
***************
*** 140,197 ****
  }
  
- #if defined(_MSC_VER) && defined(USESIMD)
- t_int *copy_perf_simd(t_int *w)
- {
-     t_float *in1 = (t_float *)(w[1]);
-     t_float *out = (t_float *)(w[2]);
-     int n = (int)(w[3]);
-     
- 	__asm {
- 		mov		ebx,dword ptr [in1]
- 		/* prefetch first cache line */
- 		prefetcht0 [ebx]
- 		mov		edx,dword ptr [out]
- 		mov		ecx,[n]
- 		shr		ecx,4
- 
- 		prefetcht0 [ebx+8*4]
- 
- loopa:
- 		/* prefetch the following cache line */
- //		prefetcht0 [ebx+12*4]
- 
- 		movaps	xmm0,xmmword ptr[ebx]
- 		movaps	xmmword ptr[edx],xmm0
- 		movaps	xmm1,xmmword ptr[ebx+4*4]
- 		movaps	xmmword ptr[edx+4*4],xmm1
- 
- //		prefetcht0 [ebx+16*4]
- 
- 		movaps	xmm2,xmmword ptr[ebx+8*4]
- 		movaps	xmmword ptr[edx+8*4],xmm2
- 		movaps	xmm3,xmmword ptr[ebx+12*4]
- 		movaps	xmmword ptr[edx+12*4],xmm3
- 
- 		add		ebx,16*4
- 		add		edx,16*4
- 		loop	loopa 
- 	}
-     return (w+4);
- }
- #endif
- 
  void dsp_add_copy(t_sample *in, t_sample *out, int n)
  {
      if (n&7)
      	dsp_add(copy_perform, 3, in, out, n);
! #if defined(_MSC_VER) && defined(USESIMD)
! 	else if(
! 		(n&15) == 0 &&
! 		/* check for alignment */
! 		((unsigned long)in&15) == 0 &&
! 		((unsigned long)out&15) == 0 
! 	)
      	dsp_add(copy_perf_simd, 3, in, out, n);
- #endif
      else	
      	dsp_add(copy_perf8, 3, in, out, n);
--- 143,152 ----
  }
  
  void dsp_add_copy(t_sample *in, t_sample *out, int n)
  {
      if (n&7)
      	dsp_add(copy_perform, 3, in, out, n);
! 	else if(SIMD_CHECK2(n,in,out))
      	dsp_add(copy_perf_simd, 3, in, out, n);
      else	
      	dsp_add(copy_perf8, 3, in, out, n);

Index: d_ugen.c
===================================================================
RCS file: /cvsroot/pure-data/pd/src/d_ugen.c,v
retrieving revision 1.1.1.2.2.5
retrieving revision 1.1.1.2.2.6
diff -C2 -d -r1.1.1.2.2.5 -r1.1.1.2.2.6
*** d_ugen.c	6 Aug 2003 15:23:55 -0000	1.1.1.2.2.5
--- d_ugen.c	23 Dec 2003 01:15:39 -0000	1.1.1.2.2.6
***************
*** 22,31 ****
   */
  
- 
- /* T.Grill - bit alignment for signal vectors (must be a multiple of 8!) */
- /* if undefined no alignment occurs */
- #define VECTORALIGNMENT 128
- 
- 
  #include "m_pd.h"
  #include "m_imp.h"
--- 22,25 ----
***************
*** 33,36 ****
--- 27,43 ----
  #include <stdarg.h>
  
+ 
+ /* T.Grill - include SIMD functionality */
+ #include "m_simd.h"
+ 
+ /* T.Grill - bit alignment for signal vectors (must be a multiple of 8!) */
+ /* if undefined no alignment occurs */
+ #ifdef SIMD_BYTEALIGN
+     #define VECTORALIGNMENT (SIMD_BYTEALIGN*8)
+ #else
+     #define VECTORALIGNMENT 128
+ #endif
+ 
+ 
  extern t_class *vinlet_class, *voutlet_class, *canvas_class;
  t_sample *obj_findsignalscalar(t_object *x, int m);
***************
*** 76,119 ****
  }
  
- #if defined(_MSC_VER) && defined(USESIMD)
- t_int *zero_perf_simd(t_int *w)
- {
-     t_float *out = (t_float *)(w[1]);
-     int n = (int)(w[2]);
- 
- 	__asm {
- 		mov		edx,dword ptr [out]
- 		/* load zero */
- 		xorps	xmm0,xmm0
- 		mov		ecx,[n]
- 		shr		ecx,4
- 
- 		/* should we do more loop unrolling? */
- loopa:
- 		movaps	xmmword ptr[edx],xmm0
- 		movaps	xmmword ptr[edx+4*4],xmm0
- 		movaps	xmmword ptr[edx+8*4],xmm0
- 		movaps	xmmword ptr[edx+12*4],xmm0
- 
- 		add		edx,16*4
- 		/* very short loop - let's assume that branch prediction does its job nicely */
- 		loop	loopa
- 	}
-     return (w+3);
- }
- #endif
- 
  void dsp_add_zero(t_sample *out, int n)
  {
      if (n&7)
      	dsp_add(zero_perform, 2, out, n);
! #if defined(_MSC_VER) && defined(USESIMD)
! 	else if(
! 		(n&15) == 0 &&
! 		/* check for alignment */
! 		((unsigned long)out&15) == 0
! 	)
      	dsp_add(zero_perf_simd, 2, out, n);
- #endif
      else	
      	dsp_add(zero_perf8, 2, out, n);
--- 83,92 ----
  }
  
  void dsp_add_zero(t_sample *out, int n)
  {
      if (n&7)
      	dsp_add(zero_perform, 2, out, n);
! 	else if(SIMD_CHECK1(n,out))
      	dsp_add(zero_perf_simd, 2, out, n);
      else	
      	dsp_add(zero_perf8, 2, out, n);

Index: makefile.in
===================================================================
RCS file: /cvsroot/pure-data/pd/src/makefile.in,v
retrieving revision 1.1.1.3.2.10
retrieving revision 1.1.1.3.2.11
diff -C2 -d -r1.1.1.3.2.10 -r1.1.1.3.2.11
*** makefile.in	24 Oct 2003 05:01:38 -0000	1.1.1.3.2.10
--- makefile.in	23 Dec 2003 01:15:39 -0000	1.1.1.3.2.11
***************
*** 44,47 ****
--- 44,48 ----
      x_arithmetic.c x_connective.c x_interface.c x_midi.c x_misc.c \
      x_time.c x_acoustics.c x_net.c x_qlist.c x_gui.c d_soundfile.c \
+     m_simd_sse_vc.c m_simd_sse_gcc.c m_simd_ve_gcc.c \
      $(SYSSRC)
  

