[PD-cvs] pd/src s_midi_alsa.c,NONE,1.1.2.1 s_audio_asio.cpp,NONE,1.1.4.1 m_simd.c,NONE,1.1.4.1 m_simd.h,NONE,1.1.4.1 m_simd_def.h,NONE,1.1.4.1 m_simd_sse_gcc.c,NONE,1.1.4.1 m_simd_sse_gcc.h,NONE,1.1.4.1 m_simd_sse_vc.c,NONE,1.1.4.1 m_simd_sse_vc.h,NONE,1.1.4.1 m_simd_ve_gcc.c,NONE,1.1.4.1 m_simd_ve_gcc.h,NONE,1.1.4.1

Tim Blechmann timblech at users.sourceforge.net
Fri Nov 5 14:33:22 CET 2004


Update of /cvsroot/pure-data/pd/src
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20791

Added Files:
      Tag: devel_0_38
	s_midi_alsa.c s_audio_asio.cpp m_simd.c m_simd.h m_simd_def.h 
	m_simd_sse_gcc.c m_simd_sse_gcc.h m_simd_sse_vc.c 
	m_simd_sse_vc.h m_simd_ve_gcc.c m_simd_ve_gcc.h 
Log Message:
adding new files

--- NEW FILE: s_audio_asio.cpp ---
/* Copyright (c) 2004, Tim Blechmann and others
 * For information on usage and redistribution, and for a DISCLAIMER OF ALL
 * WARRANTIES, see the file, "LICENSE.txt" in this distribution.  */

/* native ASIO interface for Windows and Mac OS X,
 * adapted from hostsample.cpp (ASIO SDK)
 */

#ifdef USEAPI_ASIO

#ifdef MSW
#include "windows.h" /* for application window handle */
#define IEEE754_64FLOAT 1
#endif

#include "m_pd.h"
extern "C" {
#include "s_stuff.h"
}
#include "asio.h"     /* steinberg's header file */
#include "asiodrivers.h" /* ASIODrivers class */
#include "asiosys.h"
#include "pthread.h"
#include "stdio.h" /* for sprintf */

#define ASIODEBUG

/* public function prototypes */
extern "C" void asio_open_audio(int naudioindev, int *audioindev, int nchindev, 
					 int *chindev, int naudiooutdev, int *audiooutdev,
					 int nchoutdev, int *choutdev, int srate);
extern "C" void asio_close_audio(void);
extern "C" void asio_getdevs(char *indevlist, int *nindevs,
				  char *outdevlist, int *noutdevs, int *canmulti, 
				  int maxndev, int devdescsize);
extern "C" int asio_send_dacs(void);

/* asio callback prototypes */
void asio_bufferSwitch(long db_idx, ASIOBool directprocess);
void asio_sampleRateDidChange(ASIOSampleRate srate);
long asio_messages(long selector, long value, void* message, double* opt);
ASIOTime *asio_bufferSwitchTimeInfo(ASIOTime *params, long db_idx, 
									ASIOBool directprocess);

/* sample converting helper functions:
 * - global send / receive functions
 * - sample conversion functions (adapted from ASIOConvertSamples.cpp) */
void asio_convert_and_send (t_sample* source, void* dest, 
							ASIOSampleType format, long asio_bufsize);
void asio_convert_and_receive (void* source, t_sample* dest, 
							   ASIOSampleType format, long asio_bufsize);
void float32toInt16(float* inbuffer, void* outbuffer, long frames);
void Int16tofloat32(void* inbuffer, float* outbuffer, long frames);
void float32toInt24(float* inbuffer, void* outbuffer, long frames);
void Int24tofloat32(void* inbuffer, float* outbuffer, long frames);
void float32toInt32(float* inbuffer, void* outbuffer, long frames);
void Int32tofloat32(void* inbuffer, float* outbuffer, long frames);

/* some local helper functions */
inline void prepare_asio_drivernames(void);

/* system dependent helper functions */
static unsigned long get_sys_reference_time(void);

/* global storage */
ASIODriverInfo * asio_driver = NULL;
ASIOBufferInfo * asio_bufferinfo;
ASIOChannelInfo* asio_channelinfo;
AsioTimeInfo   * asio_timerinfo;
ASIOCallbacks    asio_callbacks;
extern AsioDrivers * asioDrivers; /* declared in asiodrivers.cpp */

char ** asio_drivernames = NULL;

ASIOSampleRate asio_srate;
long asio_inchannels;
long asio_outchannels;

long asio_minbufsize;
long asio_maxbufsize;
long asio_prefbufsize;
long asio_granularity;
unsigned char asio_useoutputready;
long asio_inputlatency;
long asio_outputlatency;

long asio_bufsize;

unsigned long sys_reftime;

/* ringbuffer stuff */
t_sample ** asio_ringbuffer;                   /* ringbuffers */
static int asio_ringbuffer_inoffset;     /* ringbuffer(in) pointer offset for dac */
static int asio_ringbuffer_outoffset;    /* ringbuffer(out) pointer offset */
static int asio_ringbuffer_length;       /* ringbuffer length in samples */

/* we hope to remove this once callback-based dsp scheduling is in place */
static pthread_mutex_t asio_ringbuf_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t asio_ringbuf_cond = PTHREAD_COND_INITIALIZER;

/* definitions from s_audio.c ... it should be safe to use them */
#define DEVDESCSIZE   80
#define MAXNDEV   20

/* open asio interface */
/* todo: some more error messages */
void asio_open_audio(int naudioindev, int *audioindev, int nchindev, 
					 int *chindev, int naudiooutdev, int *audiooutdev,
					 int nchoutdev, int *choutdev, int srate)
{
	ASIOError status;
	ASIOBufferInfo * buffers;
	int i;
	int channels;

#ifdef IEEE754_64FLOAT
	asio_srate=(ASIOSampleRate)srate;
#else
	sprintf((char *)&asio_srate,"%d",srate);
#endif
	

	/* make sure storage for the asio driver names is allocated */
	prepare_asio_drivernames();
	
	/* load the driver  */
	if (!asioDrivers)
		asioDrivers = new AsioDrivers();
	
	/* if a driver is already running, close it first */
	if (asio_driver)
		asio_close_audio();

	asioDrivers->getDriverNames(asio_drivernames,MAXNDEV);

	asioDrivers->loadDriver(asio_drivernames[*audioindev]);



	/* initialize ASIO */
	asio_driver = (ASIODriverInfo*) getbytes (sizeof(ASIODriverInfo));
	asio_driver->asioVersion = 2; /* i hope we are compatible with asio 2 */
	
#ifdef MSW
	asio_driver->sysRef = GetDesktopWindow();
#else
	asio_driver->sysRef = 0;
#endif
	
	status = ASIOInit(asio_driver);
	
#ifdef ASIODEBUG
	post("sysRef: %x", asio_driver->sysRef);
	post("asioversion: %d", asio_driver->asioVersion);
	post("driverversion: %d", asio_driver->driverVersion);
	post("name: %s", asio_driver->name);
	if(status) post("error: %s", asio_driver->errorMessage);
#endif

	switch (status)
	{
	case ASE_NotPresent:
		error("ASIO: ASE_NotPresent");
		freebytes(asio_driver, sizeof (ASIODriverInfo));
		asio_driver = NULL;
		return;
 	case ASE_NoMemory:
		error("ASIO: ASE_NoMemory");
		freebytes(asio_driver, sizeof (ASIODriverInfo));
		asio_driver = NULL;
		return;
 	case ASE_HWMalfunction:
		error("ASIO: ASE_HWMalfunction");
		freebytes(asio_driver, sizeof (ASIODriverInfo));
		asio_driver = NULL;
		return;
	}
	post("ASIO initialized successfully");
	


	/* query driver */
	ASIOGetChannels(&asio_inchannels, &asio_outchannels);
	post ("ASIOGetChannels\tinputs: %d, outputs: %d", asio_inchannels,
		  asio_outchannels);

	/* tb: clamp the requested channel counts to what the asio hardware provides */
	sys_inchannels = *chindev <= asio_inchannels ? *chindev : asio_inchannels;
	sys_outchannels = *choutdev <= asio_outchannels ? *choutdev : asio_outchannels;
	channels = sys_inchannels + sys_outchannels;
	

	ASIOGetBufferSize(&asio_minbufsize, &asio_maxbufsize, &asio_prefbufsize,
					  &asio_granularity);
	post ("ASIOGetBufferSize\tmin: %d, max: %d, preferred: %d, granularity: "
		  "%d", asio_minbufsize, asio_maxbufsize, asio_prefbufsize,
		  asio_granularity);

	/* todo: for now the buffer size is fixed to the driver's preferred size */
	asio_bufsize = asio_prefbufsize; 

	/* set sample rate */
	if (ASIOCanSampleRate( asio_srate ) != ASE_OK)
	{
		error ("Samplerate not supported, using default");
#ifdef IEEE754_64FLOAT
		asio_srate = (ASIOSampleRate)44100.0;
#else
		sprintf(&asio_srate,"%d",44100);
#endif
		srate=44100;
	}
	ASIOSetSampleRate( asio_srate );
	post ("ASIOSetSampleRate\t %d", srate);
	
	if (ASIOOutputReady() == ASE_OK)
		asio_useoutputready = 1;
	else
		asio_useoutputready = 0;


	/* set callbacks */
	asio_callbacks.bufferSwitch = &asio_bufferSwitch;
	asio_callbacks.sampleRateDidChange = &asio_sampleRateDidChange;
	asio_callbacks.asioMessage = &asio_messages;
	asio_callbacks.bufferSwitchTimeInfo = &asio_bufferSwitchTimeInfo;


	/* prepare, create and set up buffers */
	asio_bufferinfo  = (ASIOBufferInfo*) getbytes (channels * sizeof (ASIOBufferInfo));
	asio_channelinfo = (ASIOChannelInfo*)  getbytes(channels * sizeof (ASIOChannelInfo));
	if (!(asio_bufferinfo && asio_channelinfo))
	{
		error("ASIO: couldn't allocate buffer or channel info");
		if (asio_bufferinfo)
			freebytes(asio_bufferinfo, channels * sizeof (ASIOBufferInfo));
		if (asio_channelinfo)
			freebytes(asio_channelinfo, channels * sizeof (ASIOChannelInfo));
		return;
	}

	/* output channels come first in the buffer/channel info arrays,
	 * followed by the input channels */
	for (i = 0; i != sys_inchannels + sys_outchannels; ++i)
	{
		if (i < sys_outchannels)
		{
			asio_bufferinfo[i].isInput = ASIOFalse;
			asio_bufferinfo[i].channelNum = i;
			asio_bufferinfo[i].buffers[0] = asio_bufferinfo[i].buffers[1] = 0;
		}
		else
		{
			asio_bufferinfo[i].isInput = ASIOTrue;
			asio_bufferinfo[i].channelNum = i - sys_outchannels;
			asio_bufferinfo[i].buffers[0] = asio_bufferinfo[i].buffers[1] = 0;
		}
	}
	
	if (ASIOCreateBuffers(asio_bufferinfo, sys_inchannels + sys_outchannels,
						  asio_bufsize, &asio_callbacks)	
		== ASE_OK)
	{
		post("ASIO: buffers allocated");
	}
	else
	{
		error("ASIO: couldn't allocate buffers");
		return;
	}

	for (i = 0; i != sys_inchannels + sys_outchannels; ++i)
	{
		asio_channelinfo[i].channel = asio_bufferinfo[i].channelNum;
		asio_channelinfo[i].isInput = asio_bufferinfo[i].isInput;
		ASIOGetChannelInfo(&asio_channelinfo[i]);
	}


	/* get latencies */
	ASIOGetLatencies(&asio_inputlatency, &asio_outputlatency);
#ifdef ASIODEBUG
	post("ASIO: input latency: %d, output latency: %d",asio_inputlatency, 
		 asio_outputlatency);
#endif


	/* calculate ringbuffer length */
	asio_ringbuffer_length = asio_bufsize * DEFDACBLKSIZE;
	
	/* a roundabout way to find the least common multiple, but it
	 * works, since DEFDACBLKSIZE and asio_bufsize are powers of two */
	while ( !(asio_ringbuffer_length % DEFDACBLKSIZE) && 
			!(asio_ringbuffer_length % asio_bufsize))
	{
		asio_ringbuffer_length /= 2;
	}
	asio_ringbuffer_length *= 2;

#ifdef ASIODEBUG
	post("ASIO: ringbuffer size: %d",asio_ringbuffer_length);
#endif
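
	/* worked example of the computation above (values are illustrative):
	 * with DEFDACBLKSIZE = 64 and asio_bufsize = 256 we start from
	 * 64 * 256 = 16384 and halve while both divisors still fit:
	 * 16384 -> 8192 -> ... -> 256 -> 128; the final doubling yields
	 * 256 = lcm(64, 256). for two powers of two the lcm is simply the
	 * larger one. */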
	
	
	/* allocate ringbuffer */
	asio_ringbuffer = (t_sample**) getbytes (channels * sizeof (t_sample*));
	for (i = 0; i != channels; ++i)
	{	
		asio_ringbuffer[i] = (t_sample*)getbytes(asio_ringbuffer_length * sizeof (t_sample));
		if (!asio_ringbuffer[i])
			error("ASIO: couldn't allocate ASIO ringbuffer");
		memset(asio_ringbuffer[i], 0, asio_ringbuffer_length * sizeof (t_sample));
	}
	

	/* initialize ringbuffer stuff */
	asio_ringbuffer_inoffset = asio_ringbuffer_outoffset = 0;

	if (ASIOStart() == ASE_OK)
	{
		post("ASIO: started");
	}
	else
		post("ASIO: couldn't start");
	
	return;
}



/* stop asio, free buffers and close asio interface */
void asio_close_audio(void)
{
	/* must match the channel count used for allocation in asio_open_audio */
	int channels = sys_inchannels + sys_outchannels;
	int i;

	pthread_cond_broadcast(&asio_ringbuf_cond);

	ASIOStop();

	if (asio_driver)
	{
		for (i = 0; i != channels; i++)
			freebytes(asio_ringbuffer[i], asio_ringbuffer_length * sizeof (t_sample));
		freebytes(asio_ringbuffer, channels * sizeof (t_sample *));
		freebytes(asio_bufferinfo, channels * sizeof (ASIOBufferInfo));
		freebytes(asio_channelinfo, channels * sizeof (ASIOChannelInfo));
		ASIODisposeBuffers();
		
		asio_ringbuffer = NULL;
		asio_bufferinfo = NULL;
		asio_channelinfo = NULL;
		
		ASIOExit();
		freebytes(asio_driver, sizeof (ASIODriverInfo));
		asio_driver = NULL;
	}
	return;
}


void asio_getdevs(char *indevlist, int *nindevs,
				  char *outdevlist, int *noutdevs, int *canmulti, 
				  int maxndev, int devdescsize)
{
	prepare_asio_drivernames();

	if (!asioDrivers)
		asioDrivers = new AsioDrivers();

	*canmulti = 0; /* we will only support one asio device */
	*nindevs = *noutdevs = (int)asioDrivers->getDriverNames(asio_drivernames,
															maxndev);
	
	for(int i = 0; i!= *nindevs; ++i)
	{
		sprintf(indevlist  + i * devdescsize, "%s", asio_drivernames[i]);
		sprintf(outdevlist + i * devdescsize, "%s", asio_drivernames[i]);
	}
}



/* called on every dac~ send
 * todo: 
 * - use vectorized functions 
 * - function pointer to avoid segfaults */      
int asio_send_dacs(void)
{
	t_sample * sp; /* sample pointer */
	int i, j;
	double timenow;
	double timeref = sys_getrealtime();
	/* this check must also run in release builds: without a running
	 * driver the ringbuffers below are NULL */
	if (!asio_driver)
	{
		error("ASIO not running");
		return SENDDACS_NO;
	}



	/* send sound to ringbuffer */
	sp = sys_soundout;
	for (i = 0; i < sys_outchannels; i++)
	{
		memcpy(asio_ringbuffer[i] + asio_ringbuffer_inoffset, sp,
			   DEFDACBLKSIZE*sizeof(t_sample));
		memset(sp, 0, DEFDACBLKSIZE*sizeof(t_sample));
		sp+=DEFDACBLKSIZE;
	}
	
	/* get sound from ringbuffer; the input ringbuffers follow the output
	 * ones, and i == sys_outchannels after the loop above */
	sp = sys_soundin;
	for (j = 0; j < sys_inchannels; j++)
 	{
		memcpy(sp, asio_ringbuffer[i+j] + asio_ringbuffer_inoffset,
			   DEFDACBLKSIZE*sizeof(t_sample));
		sp+=DEFDACBLKSIZE;
	}
	
	asio_ringbuffer_inoffset += DEFDACBLKSIZE;

	
	if (asio_ringbuffer_inoffset >= asio_ringbuffer_outoffset + asio_bufsize)
	{
		pthread_mutex_lock(&asio_ringbuf_mutex);
		pthread_cond_wait(&asio_ringbuf_cond, &asio_ringbuf_mutex);
		pthread_mutex_unlock(&asio_ringbuf_mutex);
		if (asio_ringbuffer_inoffset == asio_ringbuffer_length)
		{
			asio_ringbuffer_outoffset = 0;
			asio_ringbuffer_inoffset = 0;
		}
		else
			asio_ringbuffer_outoffset += asio_bufsize;
	}
	
	if ((timenow = sys_getrealtime()) - timeref > 0.002)
	{
		return SENDDACS_SLEPT;
	}
	return SENDDACS_YES;
}
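
/* timing sketch for the handshake above (illustrative values): with
 * DEFDACBLKSIZE = 64 and asio_bufsize = 256, four dac~ blocks fill one
 * asio buffer; the fourth call finds inoffset == outoffset + 256 and
 * blocks on the condition variable until asio_bufferSwitchTimeInfo()
 * has consumed the buffer and broadcast the condition. */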


/* buffer switch callback */
void asio_bufferSwitch(long db_idx, ASIOBool directprocess)
{
	ASIOTime time;
	memset (&time, 0, sizeof (time));
	/* todo: do we need to synchronize with other media ??? */
	asio_bufferSwitchTimeInfo(&time, db_idx, directprocess);
}

/* sample rate change callback */
void asio_sampleRateDidChange(ASIOSampleRate srate)
{
	asio_srate = srate;
#ifdef ASIODEBUG
	post("sample rate changed");
#endif
}

/* asio messaging callback */
long asio_messages(long selector, long value, void* message, double* opt)
{
	/* todo */
	return 0L;
}

ASIOTime *asio_bufferSwitchTimeInfo(ASIOTime *params, long db_idx, 
									ASIOBool directprocess)
{
	long i;



	// todo: store the timeInfo for later use


	/* todo: i'm not sure if we'll have to synchronize with other media ...
	 * probably yes ... */
		
	/* 	sys_reftime = get_sys_reference_time(); */

	/* perform the processing 
	 * todo: improve input latency
	 */
	for (i = 0; i < asio_outchannels + asio_inchannels; i++)
	{
		if (asio_bufferinfo[i].isInput != ASIOTrue)
		{
			asio_convert_and_send(asio_ringbuffer[i]+asio_ringbuffer_outoffset,
								  (void*) asio_bufferinfo[i].buffers[db_idx],
								  asio_channelinfo[i].type, asio_bufsize);
		}
		else /* these are the input channels */
		{
			asio_convert_and_receive((void*)asio_bufferinfo[i].buffers[db_idx],
									 asio_ringbuffer[i]+asio_ringbuffer_outoffset,
									 asio_channelinfo[i].type, asio_bufsize);
		}
			
	}
	pthread_cond_broadcast(&asio_ringbuf_cond);

	if(asio_useoutputready)
		ASIOOutputReady();
	


	return 0L; /* todo: return proper time info */
}


/* get system reference time on both platforms */
static unsigned long get_sys_reference_time()
{	
#if WINDOWS
	return timeGetTime();
#elif MAC
	static const double twoRaisedTo32 = 4294967296.;
	UnsignedWide ys;
	Microseconds(&ys);
	double r = ((double)ys.hi * twoRaisedTo32 + (double)ys.lo);
	return (unsigned long)(r / 1000.);
#else
	return 0; /* unknown platform */
#endif
}

/* sample converting helper functions */
void asio_convert_and_send(t_sample* source, void* dest, ASIOSampleType format, long bufsize)
{

#ifdef ASIODEBUG
/* 	post("ASIO: Sample Type %d", format); */
#endif
	switch (format)
	{
	case ASIOSTInt16LSB:
		/* e.g. m audio quattro */
		float32toInt16(source, dest, bufsize);		
		break;
	case ASIOSTFloat32LSB:		// IEEE 754 32 bit float, as found on Intel x86 architecture
		memcpy (dest, source, bufsize * sizeof (float)); /* check */
		break;
	case ASIOSTInt24LSB:		// used for 20 bits as well
		float32toInt24(source, dest, bufsize);		
		break;
	case ASIOSTInt32LSB:
		float32toInt32(source, dest, bufsize);		
		break;

	case ASIOSTFloat64LSB: 		// IEEE 754 64 bit double float, as found on Intel x86 architecture

		// these are 32 bit data buffers with different alignment of the
		// data inside; 32 bit PCI bus systems can use them more easily
	case ASIOSTInt32LSB16:		// 32 bit data with 16 bit alignment
	case ASIOSTInt32LSB18:		// 32 bit data with 18 bit alignment
	case ASIOSTInt32LSB20:		// 32 bit data with 20 bit alignment
	case ASIOSTInt32LSB24:		// 32 bit data with 24 bit alignment

	case ASIOSTInt16MSB:

	case ASIOSTInt24MSB:		// used for 20 bits as well

	case ASIOSTInt32MSB:

	case ASIOSTFloat32MSB:		// IEEE 754 32 bit float, big-endian

	case ASIOSTFloat64MSB: 		// IEEE 754 64 bit double float, big-endian
		// these are 32 bit data buffers with different alignment of the
		// data inside; 32 bit PCI bus systems can use them more easily
	case ASIOSTInt32MSB16:		// 32 bit data with 16 bit alignment
	case ASIOSTInt32MSB18:		// 32 bit data with 18 bit alignment
	case ASIOSTInt32MSB20:		// 32 bit data with 20 bit alignment
	case ASIOSTInt32MSB24:		// 32 bit data with 24 bit alignment
        {
            static int written = 0;
            if(written < 3) {
                post("Output sample Type %d not supported, yet!!!",format);
                ++written;
            }
        }
	}
}

void asio_convert_and_receive (void* source, t_sample* dest, ASIOSampleType format, long bufsize)
{
#ifdef ASIODEBUG
/* 	post("ASIO: Sample Type %d", format); */
#endif

	switch (format)
	{
	case ASIOSTInt16LSB:
		Int16tofloat32(source, dest, bufsize);		
		break;
	case ASIOSTFloat32LSB:		// IEEE 754 32 bit float, as found on Intel x86 architecture
		memcpy (dest, source, bufsize * sizeof (float)); /* check */
		break;
	case ASIOSTInt24LSB:		// used for 20 bits as well
		Int24tofloat32(source, dest, bufsize);		
		break;
	case ASIOSTInt32LSB:
		Int32tofloat32(source, dest, bufsize);		
		break;

	case ASIOSTFloat64LSB: 		// IEEE 754 64 bit double float, as found on Intel x86 architecture

		// these are 32 bit data buffers with different alignment of the
		// data inside; 32 bit PCI bus systems can use them more easily
	case ASIOSTInt32LSB16:		// 32 bit data with 16 bit alignment
	case ASIOSTInt32LSB18:		// 32 bit data with 18 bit alignment
	case ASIOSTInt32LSB20:		// 32 bit data with 20 bit alignment
	case ASIOSTInt32LSB24:		// 32 bit data with 24 bit alignment

	case ASIOSTInt16MSB:
	case ASIOSTInt24MSB:		// used for 20 bits as well
	case ASIOSTInt32MSB:
	case ASIOSTFloat32MSB:		// IEEE 754 32 bit float, big-endian
	case ASIOSTFloat64MSB: 		// IEEE 754 64 bit double float, big-endian

		// these are 32 bit data buffers with different alignment of the
		// data inside; 32 bit PCI bus systems can use them more easily
	case ASIOSTInt32MSB16:		// 32 bit data with 16 bit alignment
	case ASIOSTInt32MSB18:		// 32 bit data with 18 bit alignment
	case ASIOSTInt32MSB20:		// 32 bit data with 20 bit alignment
	case ASIOSTInt32MSB24:		// 32 bit data with 24 bit alignment
        {
            static int written = 0;
            if(written < 3) {
                post("Input sample Type %d not supported, yet!!!",format);
                ++written;
            }
        }
	}

}

/* sample conversion functions */

#define SCALE_INT16 32767.f       /* (- (expt 2 15) 1) */
#define SCALE_INT24 8388607.f     /* (- (expt 2 23) 1) */
#define SCALE_INT32 2147483647.f  /* (- (expt 2 31) 1) */

void float32toInt16(float* inbuffer, void* outbuffer, long frames)
{
	short* out = (short*)outbuffer;
	while (frames--)
	{
		*out++ = (short)(*inbuffer++ * SCALE_INT16);
	}
}

void Int16tofloat32(void* inbuffer, float* outbuffer, long frames)
{
	short* in = (short*)inbuffer;
	while (frames--)
	{
		*outbuffer++ = (float)(*in++ * (1.f / SCALE_INT16));
	}
}

void float32toInt24(float* inbuffer, void* outbuffer, long frames)
{
	/* writes a full 32 bit int per frame, although ASIOSTInt24LSB
	 * buffers are packed 3 bytes per sample -- check */
	int* out = (int*)outbuffer;
	while (frames--)
	{
		*out++ = (int)(*inbuffer++ * SCALE_INT24);
	}
}

void Int24tofloat32(void* inbuffer, float* outbuffer, long frames)
{
	int* in = (int*)inbuffer;
	while (frames--)
	{
		*outbuffer++ = (float)(*in++ * (1.f / SCALE_INT24));
	}
}

void float32toInt32(float* inbuffer, void* outbuffer, long frames)
{
	long* out = (long*)outbuffer;
	while (frames--)
	{
		*out++ = (long)(*inbuffer++ * SCALE_INT32);
	}
}

void Int32tofloat32(void* inbuffer, float* outbuffer, long frames)
{
	long* in = (long*)inbuffer;
	while (frames--)
	{
		*outbuffer++ = (float)(*in++ * (1.f / SCALE_INT32));
	}
}
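
/* a minimal round-trip sketch (not compiled in) showing the scaling
 * convention: a float in [-1,1] survives int16 conversion up to the
 * expected quantization error of about 1/32767. the test function and
 * its buffer contents are illustrative only. */
#if 0
static void test_int16_roundtrip(void)
{
	float in[8] = {0.f, 0.5f, -0.5f, 1.f, -1.f, 0.25f, -0.25f, 0.1f};
	short raw[8];
	float out[8];
	float32toInt16(in, raw, 8);
	Int16tofloat32(raw, out, 8);
	/* now |in[i] - out[i]| <= 1.f / SCALE_INT16 for all i */
}
#endif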

/* some local helper functions */
inline void prepare_asio_drivernames(void)
{
	if (asio_drivernames == NULL)
	{
		asio_drivernames = (char**)getbytes(MAXNDEV * sizeof(char*));
		for (int i = 0; i!= MAXNDEV; ++i)
		{
			asio_drivernames[i] = (char*)getbytes (32 * sizeof(char));
		}
	}
	return;
}


#endif /* USEAPI_ASIO */

--- NEW FILE: m_simd.c ---
/* 
    Implementation of general vectorized functions
    added by T.Grill
*/

#include "m_pd.h"
#include "m_simd.h"

void zerovec_8(t_float *dst,int n)
{
    for(n >>= 3; n--; dst += 8) {
        dst[0] = dst[1] = dst[2] = dst[3] = dst[4] = dst[5] = dst[6] = dst[7] = 0;
    }
}

void setvec_8(t_float *dst,t_float v,int n)
{
    for(n >>= 3; n--; dst += 8) {
        dst[0] = dst[1] = dst[2] = dst[3] = dst[4] = dst[5] = dst[6] = dst[7] = v;
    }
}

void copyvec_8(t_float *dst,const t_float *src,int n)
{
    for(n >>= 3; n--; src += 8,dst += 8) {
        dst[0] = src[0],dst[1] = src[1],dst[2] = src[2],dst[3] = src[3];
        dst[4] = src[4],dst[5] = src[5],dst[6] = src[6],dst[7] = src[7];
    }
}

void addvec_8(t_float *dst,const t_float *src,int n)
{
    for(n >>= 3; n--; src += 8,dst += 8) {
        dst[0] += src[0],dst[1] += src[1],dst[2] += src[2],dst[3] += src[3];
        dst[4] += src[4],dst[5] += src[5],dst[6] += src[6],dst[7] += src[7];
    }
}

void testcopyvec_8(t_float *dst,const t_float *src,int n)
{
    while(n--) {
        *(dst++) = (PD_BIGORSMALL(*src) ? 0 : *src); src++;
	}
}

void testaddvec_8(t_float *dst,const t_float *src,int n)
{
    while(n--) {
        *(dst++) += (PD_BIGORSMALL(*src) ? 0 : *src); src++;
	}
}

int simd_check1(t_int n, t_float* ptr1)
{
	return SIMD_CHECK1(n,ptr1);
}

int simd_check2(t_int n, t_float* ptr1, t_float* ptr2)
{
	return SIMD_CHECK2(n,ptr1,ptr2);
}

int simd_check3(t_int n, t_float* ptr1, t_float* ptr2, t_float* ptr3)
{
	return SIMD_CHECK3(n,ptr1,ptr2,ptr3);
}
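
/* typical use of the checks: a perform routine dispatches to the SIMD
 * version only when the block length and pointer alignment requirements
 * hold. this dispatcher is a hypothetical sketch, not part of the API: */
#if 0
static void copyblock(t_float *dst, const t_float *src, int n)
{
	if (simd_check2(n, dst, (t_float *)src))
		copyvec_simd(dst, src, n);   /* vectorized path */
	else
		copyvec_8(dst, src, n);      /* scalar fallback */
}
#endif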



#ifdef DONTUSESIMD
int simd_runtime_check()
{
	return 0;
}

/* tb: wrapper for simd functions */
void zerovec_simd(t_float *dst,int n)
{
	zerovec_8(dst,n);
}

void setvec_simd(t_float *dst,t_float v,int n)
{
	setvec_8(dst,v,n);
}

void copyvec_simd(t_float *dst,const t_float *src,int n)
{
	copyvec_8(dst,src,n);
}

void addvec_simd(t_float *dst,const t_float *src,int n)
{
	addvec_8(dst,src,n);
}

void testcopyvec_simd(t_float *dst,const t_float *src,int n)
{
	testcopyvec_8(dst,src,n);
}

void testaddvec_simd(t_float *dst,const t_float *src,int n)
{
	testaddvec_8(dst,src,n);
}

#endif /* DONTUSESIMD */

--- NEW FILE: m_simd_sse_gcc.c ---
/* 
    Implementation of SIMD functionality for Intel SSE with GCC compiler
    added by T.Grill
*/

#include "m_pd.h"
#include "m_simd.h"

#if defined(__GNUC__) && (defined(_X86_) || defined(__i386__) || defined(__i586__) || defined(__i686__)) && !(defined DONTUSESIMD)


/* TB: adapted from thomas' vc routines */

/* dst is assumed to be aligned */
void zerovec_simd(t_float *dst,int n)
{
    asm(
		".set T_FLOAT,4                            \n" /* sizeof(t_float) */
		"xorps     %%xmm0, %%xmm0                  \n" /* zero value */
		"shr       $4, %0                          \n"
		
		/* should we do more loop unrolling? */
		/* *dst = 0 */
		"1:                                        \n"
		"movaps    %%xmm0, (%1)                    \n"
		"movaps    %%xmm0, 4*T_FLOAT(%1)           \n"
		"movaps    %%xmm0, 8*T_FLOAT(%1)           \n"
		"movaps    %%xmm0, 12*T_FLOAT(%1)          \n"
		
		"addl      $16*T_FLOAT,%1                  \n"
		"loop      1b                              \n"
		:
		:"c"(n),"r"(dst)
		:"%xmm0");
}
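
/* for readers not fluent in AT&T inline assembly, a rough equivalent of
 * the loop above using SSE intrinsics (function name hypothetical;
 * assumes <xmmintrin.h>, 16-byte aligned dst and n being a multiple of
 * 16 -- illustrative only, not compiled in): */
#if 0
#include <xmmintrin.h>
static void zerovec_sse_intrinsics(t_float *dst, int n)
{
    __m128 zero = _mm_setzero_ps();            /* xorps %xmm0,%xmm0 */
    for (n >>= 4; n--; dst += 16) {            /* 16 floats per iteration */
        _mm_store_ps(dst,      zero);
        _mm_store_ps(dst + 4,  zero);
        _mm_store_ps(dst + 8,  zero);
        _mm_store_ps(dst + 12, zero);
    }
}
#endif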

/* dst is assumed to be aligned */
void setvec_simd(t_float *dst,t_float v,int n)
{
    asm(
		".set T_FLOAT,4                            \n" /* sizeof(t_float) */
		"shufps    $0,%2,%2                        \n" /* load value */
		"shr       $4,%0                           \n"
		
		/* should we do more loop unrolling? */
		/* *dst = v */
		"1:                                        \n"
		"movaps    %2, (%1)                        \n"
		"movaps    %2, 4*T_FLOAT(%1)               \n"
		"movaps    %2, 8*T_FLOAT(%1)               \n"
		"movaps    %2, 12*T_FLOAT(%1)              \n"
		
		"addl      $16*T_FLOAT,%1                  \n"
		"loop      1b                              \n"
		:
		:"c"(n),"r"(dst),"x"((t_float)v)
		);
}


/* dst and src are assumed to be aligned */
void copyvec_simd(t_float *dst,const t_float *src,int n)
{
    asm(
		".set T_FLOAT,4                            \n" /* sizeof(t_float) */
		"shr       $4, %0                          \n"
		
		/* loop: *dst = *src */
		"1:                                        \n"
		"movaps    (%1), %%xmm0                    \n"
		"movaps    4*T_FLOAT(%1), %%xmm1           \n"
		"movaps    8*T_FLOAT(%1), %%xmm2           \n"
		"movaps    12*T_FLOAT(%1), %%xmm3          \n"
		"movaps    %%xmm0, (%2)                    \n"
		"movaps    %%xmm1, 4*T_FLOAT(%2)           \n"
		"movaps    %%xmm2, 8*T_FLOAT(%2)           \n"
		"movaps    %%xmm3, 12*T_FLOAT(%2)          \n"
		
		
		"addl      $16*T_FLOAT,%1                  \n"
		"addl      $16*T_FLOAT,%2                  \n"
		"loop      1b                              \n"
		:
		:"c"(n),"r"(src),"r"(dst)
		:"%xmm0","%xmm1","%xmm2","%xmm3");
}


/* dst and src are assumed to be aligned */
void addvec_simd(t_float *dst,const t_float *src,int n)
{
    asm(
		".set T_FLOAT,4                            \n" /* sizeof(t_float) */
		"shr       $4, %0                          \n"
		
		/* loop: *dst += *src */
		"1:                                        \n"
		"movaps    (%2,%3),%%xmm0                  \n"
		"movaps    (%1,%3),%%xmm1                  \n"
		"addps     %%xmm0,%%xmm1                   \n"
		"movaps    %%xmm1,(%2,%3)                  \n"
		
		"movaps    4*T_FLOAT(%2,%3),%%xmm0         \n"
		"movaps    4*T_FLOAT(%1,%3),%%xmm1         \n"
		"addps     %%xmm0,%%xmm1                   \n"
		"movaps    %%xmm1,4*T_FLOAT(%2,%3)         \n"
		
		"movaps    8*T_FLOAT(%2,%3),%%xmm0         \n"
		"movaps    8*T_FLOAT(%1,%3),%%xmm1         \n"
		"addps     %%xmm0,%%xmm1                   \n"
		"movaps    %%xmm1,8*T_FLOAT(%2,%3)         \n"
		
		"movaps    12*T_FLOAT(%2,%3),%%xmm0        \n"
		"movaps    12*T_FLOAT(%1,%3),%%xmm1        \n"
		"addps     %%xmm0,%%xmm1                   \n"
		"movaps    %%xmm1,12*T_FLOAT(%2,%3)        \n"
		
		"addl      $16*T_FLOAT,%3                  \n"
		"loop      1b                              \n"
		:
		: "c"(n),"r"(src),"r"(dst),"r"(0)
		: "%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%xmm5","%xmm6","%xmm7");
}

/* dst is assumed to be aligned */
void testvec_simd(t_float *dst,t_float v,int n)
{
    asm(
		".set T_FLOAT,4                            \n" /* sizeof(t_float) */
		"shufps    $0,%2,%2                        \n" /* load value */
		"shr       $4,%0                           \n"
		
		/* should we do more loop unrolling? */
		/* *dst = v */
		"1:                                        \n"
		"movaps    %2, (%1)                        \n"
		"movaps    %2, 4*T_FLOAT(%1)               \n"
		"movaps    %2, 8*T_FLOAT(%1)               \n"
		"movaps    %2, 12*T_FLOAT(%1)              \n"
		
		"addl      $16*T_FLOAT,%1                  \n"
		"loop      1b                              \n"
		:
		:"c"(n),"r"(dst),"x"((t_float)v)
		);
}


/*
 * if we switch on DAZ, we shouldn't have problems with denormals 
 * any more ... tb
 */
void testcopyvec_simd(t_float *dst,const t_float *src,int n)
{
#if 0 //def DAZ
	copyvec_simd(dst,src,n);
#else
	testcopyvec_8(dst,src,n); /* SIMD not implemented */
#endif
}

void testaddvec_simd(t_float *dst,const t_float *src,int n)
{
#if 0 //def DAZ
	addvec_simd(dst,src,n);
#else
	testaddvec_8(dst,src,n); /* SIMD not implemented */
#endif
}
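
/* sketch of how DAZ could be switched on via the SSE control register
 * (MXCSR): bit 6 (0x0040) is denormals-are-zero, bit 15 (0x8000) is
 * flush-to-zero. note that DAZ is not supported by every SSE CPU, so a
 * real implementation would have to probe for it first. illustrative: */
#if 0
#include <xmmintrin.h>
static void enable_daz_ftz(void)
{
	_mm_setcsr(_mm_getcsr() | 0x8040); /* set FTZ and DAZ */
}
#endif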


t_int *zero_perf_simd(t_int *w)
{
    zerovec_simd((t_float *)w[1],w[2]);
    return w+3;
}

t_int *copy_perf_simd(t_int *w)
{
    copyvec_simd((t_float *)w[2],(const t_float *)w[1],w[3]);
    return w+4;
}

t_int *sig_tilde_perf_simd(t_int *w)
{
    setvec_simd((t_float *)w[2],*(const t_float *)w[1],w[3]);
    return w+4;
}


t_int *plus_perf_simd (t_int * w)
{
    asm(
	".set T_FLOAT,4                            \n"
	
	"shrl      $4, %3                          \n" /* divide by 16 */
    
	/* loop: *out = *in1 + *in2 */
	"1:                                        \n"
	"movaps    (%0,%4), %%xmm0                 \n"
	"movaps    (%1,%4), %%xmm1                 \n"
	"addps     %%xmm1, %%xmm0                  \n"
	"movaps    %%xmm0, (%2,%4)                 \n"
	
	"movaps    4*T_FLOAT(%0,%4), %%xmm2        \n"
	"movaps    4*T_FLOAT(%1,%4), %%xmm3        \n"
	"addps     %%xmm3, %%xmm2                  \n"
	"movaps    %%xmm2, 4*T_FLOAT(%2,%4)        \n"
	
	"movaps    8*T_FLOAT(%0,%4), %%xmm4        \n"
	"movaps    8*T_FLOAT(%1,%4), %%xmm5        \n"
	"addps     %%xmm5, %%xmm4                  \n"
	"movaps    %%xmm4, 8*T_FLOAT(%2,%4)        \n"
	
	"movaps    12*T_FLOAT(%0,%4), %%xmm6       \n"
	"movaps    12*T_FLOAT(%1,%4), %%xmm7       \n"
	"addps     %%xmm7, %%xmm6                  \n"
	"movaps    %%xmm6, 12*T_FLOAT(%2,%4)       \n"
	
	"addl      $16*T_FLOAT, %4                 \n"
	"loop      1b                              \n"
	:
	/* in1, in2, out, n */
	:"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4]),"r"(0)
	:"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%xmm5","%xmm6","%xmm7"
	);
    return w+5;
}


t_int *scalarplus_perf_simd(t_int *w)
{
    asm(
	".set T_FLOAT,4                            \n"
	
        "shufps    $0, %1, %1                      \n"
	"shrl      $4, %3                          \n" /* divide by 16 */

	/* loop: *out = *in + value */
	"1:                                        \n"
	"movaps    (%0), %%xmm1                    \n"
	"addps     %1, %%xmm1                      \n"
	"movaps    %%xmm1, (%2)                    \n"
    
	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
	"addps     %1, %%xmm2                      \n"
	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
	
	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
	"addps     %1, %%xmm3                      \n"
	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
	
	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
	"addps     %1, %%xmm4                      \n"
	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
	
	"addl      $16*T_FLOAT, %0                 \n"
	"addl      $16*T_FLOAT, %2                 \n"
	"loop      1b                              \n"
	:
	/* in, value, out, n */
	:"r"(w[1]),"x"((t_float)w[2]),"r"(w[3]),"c"(w[4])
	:"%xmm1","%xmm2","%xmm3","%xmm4"
	);
    return w+5;
}

t_int *minus_perf_simd(t_int *w)
{
    asm(
	".set T_FLOAT,4                            \n"
	
	"shrl      $4, %3                          \n" /* divide by 16 */
	
	/* loop: *out = *in1 - *in2 */
	"1:                                        \n"
	"movaps    (%0,%4), %%xmm0                 \n"
	"movaps    (%1,%4), %%xmm1                 \n"
	"subps     %%xmm1, %%xmm0                  \n"
	"movaps    %%xmm0, (%2,%4)                 \n"
	
	"movaps    4*T_FLOAT(%0,%4), %%xmm2        \n"
	"movaps    4*T_FLOAT(%1,%4), %%xmm3        \n"
	"subps     %%xmm3, %%xmm2                  \n"
	"movaps    %%xmm2, 4*T_FLOAT(%2,%4)        \n"
	
	"movaps    8*T_FLOAT(%0,%4), %%xmm4        \n"
	"movaps    8*T_FLOAT(%1,%4), %%xmm5        \n"
	"subps     %%xmm5, %%xmm4                  \n"
	"movaps    %%xmm4, 8*T_FLOAT(%2,%4)        \n"
	
	"movaps    12*T_FLOAT(%0,%4), %%xmm6       \n"
	"movaps    12*T_FLOAT(%1,%4), %%xmm7       \n"
	"subps     %%xmm7, %%xmm6                  \n"
	"movaps    %%xmm6, 12*T_FLOAT(%2,%4)       \n"
	
	"addl      $16*T_FLOAT, %4                 \n"
	"loop      1b                              \n"
	:
	/* in1, in2, out, n */
	:"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4]),"r"(0)
	:"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%xmm5","%xmm6","%xmm7"
	);
    return w+5;
}

t_int* scalarminus_perf_simd(t_int *w)
{
    asm(
	".set T_FLOAT,4                            \n"
	
        "shufps    $0, %1, %1                      \n"
	"shrl      $4, %3                          \n" /* divide by 16 */

	/* loop: *out = *in - value */
	"1:                                        \n"
	"movaps    (%0), %%xmm1                    \n"
	"subps     %1, %%xmm1                      \n"
	"movaps    %%xmm1, (%2)                    \n"
    
	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
	"subps     %1, %%xmm2                      \n"
	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
	
	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
	"subps     %1, %%xmm3                      \n"
	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
	
	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
	"subps     %1, %%xmm4                      \n"
	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
	
	"addl      $16*T_FLOAT, %0                 \n"
	"addl      $16*T_FLOAT, %2                 \n"
	"loop      1b                              \n"
	:
	/* in, value, out, n */
	:"r"(w[1]),"x"((t_float)w[2]),"r"(w[3]),"c"(w[4])
	:"%xmm1","%xmm2","%xmm3","%xmm4"
	);
    return w+5;
}


t_int *times_perf_simd(t_int *w)
{
    asm(
	".set T_FLOAT,4                            \n"
	
	"shrl      $4, %3                          \n" /* divide by 16 */
	
	/* loop: *out = *in1 * *in2 */
	"1:                                        \n"
	"movaps    (%0,%4), %%xmm0                 \n"
	"movaps    (%1,%4), %%xmm1                 \n"
	"mulps     %%xmm1, %%xmm0                  \n"
	"movaps    %%xmm0, (%2,%4)                 \n"
	
	"movaps    4*T_FLOAT(%0,%4), %%xmm2        \n"
	"movaps    4*T_FLOAT(%1,%4), %%xmm3        \n"
	"mulps     %%xmm3, %%xmm2                  \n"
	"movaps    %%xmm2, 4*T_FLOAT(%2,%4)        \n"
	
	"movaps    8*T_FLOAT(%0,%4), %%xmm4        \n"
	"movaps    8*T_FLOAT(%1,%4), %%xmm5        \n"
	"mulps     %%xmm5, %%xmm4                  \n"
	"movaps    %%xmm4, 8*T_FLOAT(%2,%4)        \n"
	
	"movaps    12*T_FLOAT(%0,%4), %%xmm6       \n"
	"movaps    12*T_FLOAT(%1,%4), %%xmm7       \n"
	"mulps     %%xmm7, %%xmm6                  \n"
	"movaps    %%xmm6, 12*T_FLOAT(%2,%4)       \n"
	
	"addl      $16*T_FLOAT, %4                 \n"
	"loop      1b                              \n"
	:
	/* in1, in2, out, n */
	:"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4]),"r"(0)
	:"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%xmm5","%xmm6","%xmm7"
	);
    return w+5;
}

t_int* scalartimes_perf_simd(t_int *w)
{
    asm(
	".set T_FLOAT,4                            \n"
	
        "shufps    $0, %1, %1                      \n"
	"shrl      $4, %3                          \n" /* divide by 16 */

	/* loop: *out = *in * value */
	"1:                                        \n"
	"movaps    (%0), %%xmm1                    \n"
	"mulps     %1, %%xmm1                      \n"
	"movaps    %%xmm1, (%2)                    \n"
    
	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
	"mulps     %1, %%xmm2                      \n"
	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
	
	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
	"mulps     %1, %%xmm3                      \n"
	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
	
	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
	"mulps     %1, %%xmm4                      \n"
	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
	
	"addl      $16*T_FLOAT, %0                 \n"
	"addl      $16*T_FLOAT, %2                 \n"
	"loop      1b                              \n"
	:
	/* in, value, out, n */
	:"r"(w[1]),"x"((t_float)w[2]),"r"(w[3]),"c"(w[4])
	:"%xmm1","%xmm2","%xmm3","%xmm4"
	);
    return w+5;
}

t_int *sqr_perf_simd(t_int *w)
{
    asm(
	".set T_FLOAT,4                            \n"
	
	"shrl      $4, %2                          \n" /* divide by 16 */
	
	/* loop: *out = *in * *in */
	"1:                                        \n"
	"movaps    (%0,%3), %%xmm0                 \n"
	"mulps     %%xmm0, %%xmm0                  \n"
	"movaps    %%xmm0, (%1)                    \n"
    
	"movaps    4*T_FLOAT(%0,%3), %%xmm1        \n"
	"mulps     %%xmm1, %%xmm1                  \n"
	"movaps    %%xmm1, 4*T_FLOAT(%1)           \n"
	
	"movaps    8*T_FLOAT(%0,%3), %%xmm2        \n"
	"mulps     %%xmm2, %%xmm2                  \n"
	"movaps    %%xmm2, 8*T_FLOAT(%1)           \n"
	
	"movaps    12*T_FLOAT(%0,%3), %%xmm3       \n"
	"mulps     %%xmm3, %%xmm3                  \n"
	"movaps    %%xmm3, 12*T_FLOAT(%1)          \n"
	
	"addl      $16*T_FLOAT, %3                 \n"
	"loop      1b                              \n"
	:
	/* in, out, n */
	:"r"(w[1]),"r"(w[2]),"c"(w[3]),"r"(0)
	:"%xmm0","%xmm1","%xmm2","%xmm3"
	);
    return w+4;
}


t_int* over_perf_simd(t_int * w)
{
    asm(
	".set T_FLOAT,4                            \n"
	
	"shrl      $4, %3                          \n" /* divide by 16 */
	
	/* loop: *out = *in1 / *in2 */
	"1:                                        \n"
	"movaps    (%0,%4), %%xmm0                 \n"
	"movaps    (%1,%4), %%xmm1                 \n"
	"divps     %%xmm1, %%xmm0                  \n"
	"movaps    %%xmm0, (%2,%4)                 \n"
	
	"movaps    4*T_FLOAT(%0,%4), %%xmm2        \n"
	"movaps    4*T_FLOAT(%1,%4), %%xmm3        \n"
	"divps     %%xmm3, %%xmm2                  \n"
	"movaps    %%xmm2, 4*T_FLOAT(%2,%4)        \n"
	
	"movaps    8*T_FLOAT(%0,%4), %%xmm4        \n"
	"movaps    8*T_FLOAT(%1,%4), %%xmm5        \n"
	"divps     %%xmm5, %%xmm4                  \n"
	"movaps    %%xmm4, 8*T_FLOAT(%2,%4)        \n"
	
	"movaps    12*T_FLOAT(%0,%4), %%xmm6       \n"
	"movaps    12*T_FLOAT(%1,%4), %%xmm7       \n"
	"divps     %%xmm7, %%xmm6                  \n"
	"movaps    %%xmm6, 12*T_FLOAT(%2,%4)       \n"
	
	"addl      $16*T_FLOAT, %4                 \n"
	"loop      1b                              \n"
	:
	/* in1, in2, out, n */
	:"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4]),"r"(0)
	:"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%xmm5","%xmm6","%xmm7"
	);
    return w+5;
}

t_int* scalarover_perf_simd(t_int *w)
{
    asm(
	".set T_FLOAT,4                            \n"
	
        "shufps    $0, %1, %1                      \n"
	"shrl      $4, %3                          \n" /* divide by 16 */

	/* loop: *out = *in / value */
	"1:                                        \n"
	"movaps    (%0), %%xmm1                    \n"
	"divps     %1, %%xmm1                      \n"
	"movaps    %%xmm1, (%2)                    \n"
    
	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
	"divps     %1, %%xmm2                      \n"
	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
	
	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
	"divps     %1, %%xmm3                      \n"
	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
	
	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
	"divps     %1, %%xmm4                      \n"
	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
	
	"addl      $16*T_FLOAT, %0                 \n"
	"addl      $16*T_FLOAT, %2                 \n"
	"loop      1b                              \n"
	:
	/* in, value, out, n */
	:"r"(w[1]),"x"((t_float)w[2]),"r"(w[3]),"c"(w[4])
	:"%xmm1","%xmm2","%xmm3","%xmm4"
	);
    return w+5;
}


t_int* min_perf_simd(t_int * w)
{
    asm(
	".set T_FLOAT,4                            \n"
	
	"shrl      $4, %3                          \n" /* divide by 16 */
	
	/* loop: *out = min (*in1, *in2) */
	"1:                                        \n"
	"movaps    (%0,%4), %%xmm0                 \n"
	"movaps    (%1,%4), %%xmm1                 \n"
	"minps     %%xmm1, %%xmm0                  \n"
	"movaps    %%xmm0, (%2,%4)                 \n"
	
	"movaps    4*T_FLOAT(%0,%4), %%xmm2        \n"
	"movaps    4*T_FLOAT(%1,%4), %%xmm3        \n"
	"minps     %%xmm3, %%xmm2                  \n"
	"movaps    %%xmm2, 4*T_FLOAT(%2,%4)        \n"
	
	"movaps    8*T_FLOAT(%0,%4), %%xmm4        \n"
	"movaps    8*T_FLOAT(%1,%4), %%xmm5        \n"
	"minps     %%xmm5, %%xmm4                  \n"
	"movaps    %%xmm4, 8*T_FLOAT(%2,%4)        \n"
	
	"movaps    12*T_FLOAT(%0,%4), %%xmm6       \n"
	"movaps    12*T_FLOAT(%1,%4), %%xmm7       \n"
	"minps     %%xmm7, %%xmm6                  \n"
	"movaps    %%xmm6, 12*T_FLOAT(%2,%4)       \n"
	
	"addl      $16*T_FLOAT, %4                 \n"
	"loop      1b                              \n"
	:
	/* in1, in2, out, n */
	:"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4]),"r"(0)
	:"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%xmm5","%xmm6","%xmm7"
	);
    return w+5;
}


t_int* scalarmin_perf_simd(t_int *w)
{
    asm(
	".set T_FLOAT,4                            \n"
	
        "shufps    $0, %1, %1                      \n"
	"shrl      $4, %3                          \n" /* divide by 16 */

	/* loop: *out = min(*in, value) */
	"1:                                        \n"
	"movaps    (%0), %%xmm1                    \n"
	"minps     %1, %%xmm1                      \n"
	"movaps    %%xmm1, (%2)                    \n"
    
	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
	"minps     %1, %%xmm2                      \n"
	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
	
	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
	"minps     %1, %%xmm3                      \n"
	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
	
	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
	"minps     %1, %%xmm4                      \n"
	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
	
	"addl      $16*T_FLOAT, %0                 \n"
	"addl      $16*T_FLOAT, %2                 \n"
	"loop      1b                              \n"
	:
	/* in, value, out, n */
	:"r"(w[1]),"x"((t_float)w[2]),"r"(w[3]),"c"(w[4])
	:"%xmm1","%xmm2","%xmm3","%xmm4"
	);
    return w+5;
}


t_int* max_perf_simd(t_int * w)
{
    asm(
	".set T_FLOAT,4                            \n"
	
	"shrl      $4, %3                          \n" /* divide by 16 */
	
	/* loop: *out = max (*in1, *in2) */
	"1:                                        \n"
	"movaps    (%0,%4), %%xmm0                 \n"
	"movaps    (%1,%4), %%xmm1                 \n"
	"maxps     %%xmm1, %%xmm0                  \n"
	"movaps    %%xmm0, (%2,%4)                 \n"
	
	"movaps    4*T_FLOAT(%0,%4), %%xmm2        \n"
	"movaps    4*T_FLOAT(%1,%4), %%xmm3        \n"
	"maxps     %%xmm3, %%xmm2                  \n"
	"movaps    %%xmm2, 4*T_FLOAT(%2,%4)        \n"
	
	"movaps    8*T_FLOAT(%0,%4), %%xmm4        \n"
	"movaps    8*T_FLOAT(%1,%4), %%xmm5        \n"
	"maxps     %%xmm5, %%xmm4                  \n"
	"movaps    %%xmm4, 8*T_FLOAT(%2,%4)        \n"
	
	"movaps    12*T_FLOAT(%0,%4), %%xmm6       \n"
	"movaps    12*T_FLOAT(%1,%4), %%xmm7       \n"
	"maxps     %%xmm7, %%xmm6                  \n"
	"movaps    %%xmm6, 12*T_FLOAT(%2,%4)       \n"
	
	"addl      $16*T_FLOAT, %4                 \n"
	"loop      1b                              \n"
	:
	/* in1, in2, out, n */
	:"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4]),"r"(0)
	:"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%xmm5","%xmm6","%xmm7"
	);
    return w+5;
}


t_int* scalarmax_perf_simd(t_int *w)
{
    asm(
	".set T_FLOAT,4                            \n"
	
    "shufps    $0, %1, %1                      \n"
	"shrl      $4, %3                          \n" /* divide by 16 */

	/* loop: *out = max(*in, value) */
	"1:                                        \n"
	"movaps    (%0), %%xmm1                    \n"
	"maxps     %1, %%xmm1                      \n"
	"movaps    %%xmm1, (%2)                    \n"
    
	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
	"maxps     %1, %%xmm2                      \n"
	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
	
	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
	"maxps     %1, %%xmm3                      \n"
	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
	
	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
	"maxps     %1, %%xmm4                      \n"
	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
	
	"addl      $16*T_FLOAT, %0                 \n"
	"addl      $16*T_FLOAT, %2                 \n"
	"loop      1b                              \n"
	:
	/* in, value, out, n */
	:"r"(w[1]),"x"((t_float)w[2]),"r"(w[3]),"c"(w[4])
	:"%xmm1","%xmm2","%xmm3","%xmm4"
	);
    return w+5;
}

/* TB: runtime check for SSE: CPUID feature bit 25 of edx */
int simd_runtime_check()
{
    unsigned int eax, edx;
    __asm__("cpuid" : "=a"(eax),"=d"(edx) : "a" (1): "bx", "cx");
    return (0x2000000 & edx); /* 0x2000000 == 1 << 25 */
}


#endif


--- NEW FILE: m_simd_ve_gcc.c ---
/* 
    Implementation of SIMD functionality for Apple Velocity Engine (AltiVec) with GCC compiler
    added by T.Grill
*/

#include "m_pd.h"
#include "m_simd.h"

#if defined(__GNUC__) && defined(__POWERPC__) && defined(__ALTIVEC__)

//#define USEVECLIB

#ifdef USEVECLIB
#include <vecLib/vDSP.h>
#include <vecLib/vfp.h>
#endif

/* functions for unaligned vector data - taken from http://developer.apple.com/hardware/ve/alignment.html */

/* T.Grill - this first version _should_ work! but it doesn't... */
#if 0
#define LoadUnaligned(v) (vec_perm( vec_ld( 0, (const vector float *)(v) ), vec_ld( 16, (const vector float *)(v) ), vec_lvsl( 0, (float *) (v) ) ))
#else
/* instead take the slower second one */
static vector float LoadUnaligned(const float *v)
{
	/* load a single float and splat it into all four vector elements */
	union tmpstruct { float f[4]; vector float vec; } tmp;
	tmp.f[0] = *v;
	return vec_splat(vec_ld(0,&tmp.vec),0);
}
#endif


#define IsVectorAligned(where) (((unsigned long)(where)&(sizeof(vector float)-1)) == 0)
/*
#define LoadValue(where) (IsVectorAligned((void *)(where))?vec_splat(vec_ld(0,(vector float *)(where)),0):LoadUnaligned((vector float *)(where))) 
*/
/* always assume unaligned */
#define LoadValue(where) LoadUnaligned((const float *)(where))
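
/* e.g. (illustrative): given float f = 2.f, LoadValue(&f) yields the
 * vector (2,2,2,2), regardless of the alignment of &f */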

void zerovec_simd(t_float *dst,int n)
{
	const vector float zero = (vector float)(0);
	for(n >>= 4; n--; dst += 16) {
		vec_st(zero, 0,dst);
		vec_st(zero,16,dst);
		vec_st(zero,32,dst);
		vec_st(zero,48,dst);
	}
}

void setvec_simd(t_float *dst,t_float v,int n)
{
	const vector float arg = LoadValue(&v);
	for(n >>= 4; n--; dst += 16) {
		vec_st(arg, 0,dst);
		vec_st(arg,16,dst);
		vec_st(arg,32,dst);
		vec_st(arg,48,dst);
	}
}

void copyvec_simd(t_float *dst,const t_float *src,int n)
{
	for(n >>= 4; n--; src += 16,dst += 16) {
		vector float a1 = vec_ld( 0,src);
		vector float a2 = vec_ld(16,src);
		vector float a3 = vec_ld(32,src);
		vector float a4 = vec_ld(48,src);
		vec_st(a1, 0,dst);
		vec_st(a2,16,dst);
		vec_st(a3,32,dst);
		vec_st(a4,48,dst);
	}
}

void addvec_simd(t_float *dst,const t_float *src,int n)
{
#ifdef USEVECLIB
	vadd(dst,1,src,1,dst,1,n);
#else
	for(n >>= 4; n--; src += 16,dst += 16) {
		vector float a1 = vec_ld( 0,dst),b1 = vec_ld( 0,src);
		vector float a2 = vec_ld(16,dst),b2 = vec_ld(16,src);
		vector float a3 = vec_ld(32,dst),b3 = vec_ld(32,src);
		vector float a4 = vec_ld(48,dst),b4 = vec_ld(48,src);
		
		a1 = vec_add(a1,b1);
		a2 = vec_add(a2,b2);
		a3 = vec_add(a3,b3);
		a4 = vec_add(a4,b4);

		vec_st(a1, 0,dst);
		vec_st(a2,16,dst);
		vec_st(a3,32,dst);
		vec_st(a4,48,dst);
	}
#endif
}

/* no bad float testing for PPC! */
void testcopyvec_simd(t_float *dst,const t_float *src,int n)
{
	copyvec_simd(dst,src,n);
}

void testaddvec_simd(t_float *dst,const t_float *src,int n)
{
	addvec_simd(dst,src,n);
}

t_int *zero_perf_simd(t_int *w)
{
    zerovec_simd((t_float *)w[1],w[2]);
    return w+3;
}

t_int *copy_perf_simd(t_int *w)
{
    copyvec_simd((t_float *)w[2],(const t_float *)w[1],w[3]);
	return w+4;
}

t_int *sig_tilde_perf_simd(t_int *w)
{
    setvec_simd((t_float *)w[2],*(const t_float *)w[1],w[3]);
	return w+4;
}

t_int *plus_perf_simd(t_int *w)
{
#ifdef USEVECLIB
	vadd((const t_float *)w[1],1,(const t_float *)w[2],1,(t_float *)w[3],1,w[4]);
#else
    const t_float *src1 = (const t_float *)w[1];
    const t_float *src2 = (const t_float *)w[2];
    t_float *dst = (t_float *)w[3];
    int n = w[4]>>4;
   
	for(; n--; src1 += 16,src2 += 16,dst += 16) {
		vector float a1 = vec_ld( 0,src1),b1 = vec_ld( 0,src2);
		vector float a2 = vec_ld(16,src1),b2 = vec_ld(16,src2);
		vector float a3 = vec_ld(32,src1),b3 = vec_ld(32,src2);
		vector float a4 = vec_ld(48,src1),b4 = vec_ld(48,src2);
		
		a1 = vec_add(a1,b1);
		a2 = vec_add(a2,b2);
		a3 = vec_add(a3,b3);
		a4 = vec_add(a4,b4);

		vec_st(a1, 0,dst);
		vec_st(a2,16,dst);
		vec_st(a3,32,dst);
		vec_st(a4,48,dst);
	}
#endif
	return w+5;
}

t_int *scalarplus_perf_simd(t_int *w)
{
    const t_float *src = (const t_float *)w[1];
	const vector float arg = LoadValue(w[2]);
    t_float *dst = (t_float *)w[3];
    int n = w[4]>>4;
   
	for(; n--; src += 16,dst += 16) {
		vector float a1 = vec_ld( 0,src);
		vector float a2 = vec_ld(16,src);
		vector float a3 = vec_ld(32,src);
		vector float a4 = vec_ld(48,src);
		
		a1 = vec_add(a1,arg);
		a2 = vec_add(a2,arg);
		a3 = vec_add(a3,arg);
		a4 = vec_add(a4,arg);

		vec_st(a1, 0,dst);
		vec_st(a2,16,dst);
		vec_st(a3,32,dst);
		vec_st(a4,48,dst);
	}
	return w+5;
}

t_int *minus_perf_simd(t_int *w)
{
#if 0 //def USEVECLIB
    /* vsub is buggy for some OSX versions! */
	vsub((const t_float *)w[1],1,(const t_float *)w[2],1,(t_float *)w[3],1,w[4]);
#else
    const t_float *src1 = (const t_float *)w[1];
    const t_float *src2 = (const t_float *)w[2];
    t_float *dst = (t_float *)w[3];
    int n = w[4]>>4;
   
	for(; n--; src1 += 16,src2 += 16,dst += 16) {
		vector float a1 = vec_ld( 0,src1),b1 = vec_ld( 0,src2);
		vector float a2 = vec_ld(16,src1),b2 = vec_ld(16,src2);
		vector float a3 = vec_ld(32,src1),b3 = vec_ld(32,src2);
		vector float a4 = vec_ld(48,src1),b4 = vec_ld(48,src2);
		
		a1 = vec_sub(a1,b1);
		a2 = vec_sub(a2,b2);
		a3 = vec_sub(a3,b3);
		a4 = vec_sub(a4,b4);

		vec_st(a1, 0,dst);
		vec_st(a2,16,dst);
		vec_st(a3,32,dst);
		vec_st(a4,48,dst);
	}
#endif
	return w+5;
}

t_int *scalarminus_perf_simd(t_int *w)
{
    const t_float *src = (const t_float *)w[1];
	const vector float arg = LoadValue(w[2]);
    t_float *dst = (t_float *)w[3];
    int n = w[4]>>4;
   
	for(; n--; src += 16,dst += 16) {
		vector float a1 = vec_ld( 0,src);
		vector float a2 = vec_ld(16,src);
		vector float a3 = vec_ld(32,src);
		vector float a4 = vec_ld(48,src);
		
		a1 = vec_sub(a1,arg);
		a2 = vec_sub(a2,arg);
		a3 = vec_sub(a3,arg);
		a4 = vec_sub(a4,arg);

		vec_st(a1, 0,dst);
		vec_st(a2,16,dst);
		vec_st(a3,32,dst);
		vec_st(a4,48,dst);
	}
	return w+5;
}

t_int *times_perf_simd(t_int *w)
{
#ifdef USEVECLIB
	vmul((const t_float *)w[1],1,(const t_float *)w[2],1,(t_float *)w[3],1,w[4]);
#else
    const t_float *src1 = (const t_float *)w[1];
    const t_float *src2 = (const t_float *)w[2];
    t_float *dst = (t_float *)w[3];
    const vector float zero = (vector float)(0);
    int n = w[4]>>4;
   
	for(; n--; src1 += 16,src2 += 16,dst += 16) {
		vector float a1 = vec_ld( 0,src1),b1 = vec_ld( 0,src2);
		vector float a2 = vec_ld(16,src1),b2 = vec_ld(16,src2);
		vector float a3 = vec_ld(32,src1),b3 = vec_ld(32,src2);
		vector float a4 = vec_ld(48,src1),b4 = vec_ld(48,src2);
		
		a1 = vec_madd(a1,b1,zero);
		a2 = vec_madd(a2,b2,zero);
		a3 = vec_madd(a3,b3,zero);
		a4 = vec_madd(a4,b4,zero);

		vec_st(a1, 0,dst);
		vec_st(a2,16,dst);
		vec_st(a3,32,dst);
		vec_st(a4,48,dst);
	}
#endif
	return w+5;
}

t_int *scalartimes_perf_simd(t_int *w)
{
#ifdef USEVECLIB
	vsmul((const t_float *)w[1],1,(t_float *)w[2],(t_float *)w[3],1,w[4]);
#else
    const t_float *src = (const t_float *)w[1];
	const vector float arg = LoadValue(w[2]);
    t_float *dst = (t_float *)w[3];
    const vector float zero = (vector float)(0);
    int n = w[4]>>4;
   
	for(; n--; src += 16,dst += 16) {
		vector float a1 = vec_ld( 0,src);
		vector float a2 = vec_ld(16,src);
		vector float a3 = vec_ld(32,src);
		vector float a4 = vec_ld(48,src);
		
		a1 = vec_madd(a1,arg,zero);
		a2 = vec_madd(a2,arg,zero);
		a3 = vec_madd(a3,arg,zero);
		a4 = vec_madd(a4,arg,zero);

		vec_st(a1, 0,dst);
		vec_st(a2,16,dst);
		vec_st(a3,32,dst);
		vec_st(a4,48,dst);
	}
#endif
	return w+5;
}

t_int *sqr_perf_simd(t_int *w)
{
#ifdef USEVECLIB
	vsq((const t_float *)w[1],1,(t_float *)w[2],1,w[3]);
#else
    const t_float *src = (const t_float *)w[1];
    t_float *dst = (t_float *)w[2];
    const vector float zero = (vector float)(0);
    int n = w[3]>>4;
   
	for(; n--; src += 16,dst += 16) {
		vector float a1 = vec_ld( 0,src);
		vector float a2 = vec_ld(16,src);
		vector float a3 = vec_ld(32,src);
		vector float a4 = vec_ld(48,src);
		
		a1 = vec_madd(a1,a1,zero);
		a2 = vec_madd(a2,a2,zero);
		a3 = vec_madd(a3,a3,zero);
		a4 = vec_madd(a4,a4,zero);

		vec_st(a1, 0,dst);
		vec_st(a2,16,dst);
		vec_st(a3,32,dst);
		vec_st(a4,48,dst);
	}
#endif
	return w+4;
}

t_int *over_perf_simd(t_int *w)
{
    const t_float *src1 = (const t_float *)w[1];
    const t_float *src2 = (const t_float *)w[2];
    t_float *dst = (t_float *)w[3];
    const vector float zero = (vector float)(0);
    const vector float one = (vector float)(1);
    int n = w[4]>>4;
   
	for(; n--; src1 += 16,src2 += 16,dst += 16) {
#ifdef USEVECLIB
		/* no zero checking here */
		vec_st(vdivf(vec_ld( 0,src1),vec_ld( 0,src2)), 0,dst);
		vec_st(vdivf(vec_ld(16,src1),vec_ld(16,src2)),16,dst);
		vec_st(vdivf(vec_ld(32,src1),vec_ld(32,src2)),32,dst);
		vec_st(vdivf(vec_ld(48,src1),vec_ld(48,src2)),48,dst);
#else
	    vector float data1 = vec_ld( 0,src2);
	    vector float data2 = vec_ld(16,src2); 
	    vector float data3 = vec_ld(32,src2); 
	    vector float data4 = vec_ld(48,src2); 

		/* bit masks: all 0 where data == 0., all 1 elsewhere */
		vector unsigned char mask1 = vec_nor((vector unsigned char)vec_cmpeq(data1,zero),(vector unsigned char)zero);
		vector unsigned char mask2 = vec_nor((vector unsigned char)vec_cmpeq(data2,zero),(vector unsigned char)zero);
		vector unsigned char mask3 = vec_nor((vector unsigned char)vec_cmpeq(data3,zero),(vector unsigned char)zero);
		vector unsigned char mask4 = vec_nor((vector unsigned char)vec_cmpeq(data4,zero),(vector unsigned char)zero);

		/* make estimated reciprocal and zero out NANs */
		vector float tmp1 = vec_re(data1);
		vector float tmp2 = vec_re(data2);
		vector float tmp3 = vec_re(data3);
		vector float tmp4 = vec_re(data4);
		
		tmp1 = (vector float)vec_and((vector unsigned char)tmp1,mask1); 
		tmp2 = (vector float)vec_and((vector unsigned char)tmp2,mask2); 
		tmp3 = (vector float)vec_and((vector unsigned char)tmp3,mask3); 
		tmp4 = (vector float)vec_and((vector unsigned char)tmp4,mask4); 

		data1 = vec_madd( vec_nmsub( tmp1, data1, one ), tmp1, tmp1 );
		data2 = vec_madd( vec_nmsub( tmp2, data2, one ), tmp2, tmp2 );
		data3 = vec_madd( vec_nmsub( tmp3, data3, one ), tmp3, tmp3 );
		data4 = vec_madd( vec_nmsub( tmp4, data4, one ), tmp4, tmp4 );

		tmp1 = vec_ld( 0,src1);
		tmp2 = vec_ld(16,src1);
		tmp3 = vec_ld(32,src1);
		tmp4 = vec_ld(48,src1);

		data1 = vec_madd(tmp1,data1,zero);
		data2 = vec_madd(tmp2,data2,zero);
		data3 = vec_madd(tmp3,data3,zero);
		data4 = vec_madd(tmp4,data4,zero);

		vec_st(data1, 0,dst);
		vec_st(data2,16,dst);
		vec_st(data3,32,dst);
		vec_st(data4,48,dst);
#endif
	}
	return w+5;
}
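
/* the vec_re/vec_nmsub/vec_madd sequence above is one Newton-Raphson
 * refinement of AltiVec's ~12-bit reciprocal estimate. in scalar form
 * (illustrative):
 *
 *     r0 = estimate of 1/d;           vec_re(d)
 *     r1 = r0 + r0*(1 - d*r0);        vec_madd(vec_nmsub(r0,d,one),r0,r0)
 *
 * each iteration roughly doubles the number of correct bits, so one
 * step brings the estimate close to full single precision. */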

t_int *scalarover_perf_simd(t_int *w)
{
    t_float *dst = (t_float *)w[3];
    const vector float zero = (vector float)(0);
    int n = w[4]>>4;

	if(*(t_float *)w[2]) {
	    const t_float *src = (const t_float *)w[1];
#ifdef USEVECLIB
		float arg = 1.f / *(t_float *)w[2]; /* nonzero, tested above */
		vsmul(src,1,&arg,dst,1,w[4]);
#else
		const vector float v = LoadValue(w[2]);
	    const vector float one = (vector float)(1);

	    vector float estimate = vec_re(v); 
		vector float arg = vec_madd( vec_nmsub( estimate, v, one ), estimate, estimate );

		for(; n--; src += 16,dst += 16) {
			vector float a1 = vec_ld( 0,src);
			vector float a2 = vec_ld(16,src);
			vector float a3 = vec_ld(32,src);
			vector float a4 = vec_ld(48,src);
			
			a1 = vec_madd(a1,arg,zero);
			a2 = vec_madd(a2,arg,zero);
			a3 = vec_madd(a3,arg,zero);
			a4 = vec_madd(a4,arg,zero);

			vec_st(a1, 0,dst);
			vec_st(a2,16,dst);
			vec_st(a3,32,dst);
			vec_st(a4,48,dst);
		}
#endif
	}
	else {
		/* zero all output */
		for(; n--; dst += 16) {
			vec_st(zero, 0,dst);
			vec_st(zero,16,dst);
			vec_st(zero,32,dst);
			vec_st(zero,48,dst);
		}
	}
	return w+5;
}

t_int *min_perf_simd(t_int *w)
{
    const t_float *src1 = (const t_float *)w[1];
    const t_float *src2 = (const t_float *)w[2];
    t_float *dst = (t_float *)w[3];
    int n = w[4]>>4;
   
	for(; n--; src1 += 16,src2 += 16,dst += 16) {
		vector float a1 = vec_ld( 0,src1),b1 = vec_ld( 0,src2);
		vector float a2 = vec_ld(16,src1),b2 = vec_ld(16,src2);
		vector float a3 = vec_ld(32,src1),b3 = vec_ld(32,src2);
		vector float a4 = vec_ld(48,src1),b4 = vec_ld(48,src2);
		
		a1 = vec_min(a1,b1);
		a2 = vec_min(a2,b2);
		a3 = vec_min(a3,b3);
		a4 = vec_min(a4,b4);

		vec_st(a1, 0,dst);
		vec_st(a2,16,dst);
		vec_st(a3,32,dst);
		vec_st(a4,48,dst);
	}
	return w+5;
}

t_int *scalarmin_perf_simd(t_int *w)
{
    const t_float *src = (const t_float *)w[1];
	const vector float arg = LoadValue(w[2]);
    t_float *dst = (t_float *)w[3];
    int n = w[4]>>4;
   
	for(; n--; src += 16,dst += 16) {
		vector float a1 = vec_ld( 0,src);
		vector float a2 = vec_ld(16,src);
		vector float a3 = vec_ld(32,src);
		vector float a4 = vec_ld(48,src);
		
		a1 = vec_min(a1,arg);
		a2 = vec_min(a2,arg);
		a3 = vec_min(a3,arg);
		a4 = vec_min(a4,arg);

		vec_st(a1, 0,dst);
		vec_st(a2,16,dst);
		vec_st(a3,32,dst);
		vec_st(a4,48,dst);
	}
	return w+5;
}

t_int *max_perf_simd(t_int *w)
{
    const t_float *src1 = (const t_float *)w[1];
    const t_float *src2 = (const t_float *)w[2];
    t_float *dst = (t_float *)w[3];
    int n = w[4]>>4;
   
	for(; n--; src1 += 16,src2 += 16,dst += 16) {
		vector float a1 = vec_ld( 0,src1),b1 = vec_ld( 0,src2);
		vector float a2 = vec_ld(16,src1),b2 = vec_ld(16,src2);
		vector float a3 = vec_ld(32,src1),b3 = vec_ld(32,src2);
		vector float a4 = vec_ld(48,src1),b4 = vec_ld(48,src2);
		
		a1 = vec_max(a1,b1);
		a2 = vec_max(a2,b2);
		a3 = vec_max(a3,b3);
		a4 = vec_max(a4,b4);

		vec_st(a1, 0,dst);
		vec_st(a2,16,dst);
		vec_st(a3,32,dst);
		vec_st(a4,48,dst);
	}
	return w+5;
}

t_int *scalarmax_perf_simd(t_int *w)
{
    const t_float *src = (const t_float *)w[1];
	const vector float arg = LoadValue(w[2]);
    t_float *dst = (t_float *)w[3];
    int n = w[4]>>4;
   
	for(; n--; src += 16,dst += 16) {
		vector float a1 = vec_ld( 0,src);
		vector float a2 = vec_ld(16,src);
		vector float a3 = vec_ld(32,src);
		vector float a4 = vec_ld(48,src);
		
		a1 = vec_max(a1,arg);
		a2 = vec_max(a2,arg);
		a3 = vec_max(a3,arg);
		a4 = vec_max(a4,arg);

		vec_st(a1, 0,dst);
		vec_st(a2,16,dst);
		vec_st(a3,32,dst);
		vec_st(a4,48,dst);
	}
	return w+5;
}

t_int *clip_perf_simd(t_int *w)
{
    const t_float *src = (const t_float *)w[1];
    t_float *dst = (t_float *)w[2];
	const vector float lo = LoadValue(w[3]);
	const vector float hi = LoadValue(w[4]);
    int n = w[5]>>4;
   
	for(; n--; src += 16,dst += 16) {
		vector float data1 = vec_ld( 0,src);
		vector float data2 = vec_ld(16,src);
		vector float data3 = vec_ld(32,src);
		vector float data4 = vec_ld(48,src);
		
		vector unsigned char mlo1 = (vector unsigned char)vec_cmple(data1,lo); /* bit mask data <= lo */
		vector unsigned char mlo2 = (vector unsigned char)vec_cmple(data2,lo); /* bit mask data <= lo */
		vector unsigned char mlo3 = (vector unsigned char)vec_cmple(data3,lo); /* bit mask data <= lo */
		vector unsigned char mlo4 = (vector unsigned char)vec_cmple(data4,lo); /* bit mask data <= lo */

		vector unsigned char mhi1 = (vector unsigned char)vec_cmpge(data1,hi); /* bit mask data >= hi */
		vector unsigned char mhi2 = (vector unsigned char)vec_cmpge(data2,hi); /* bit mask data >= hi */
		vector unsigned char mhi3 = (vector unsigned char)vec_cmpge(data3,hi); /* bit mask data >= hi */
		vector unsigned char mhi4 = (vector unsigned char)vec_cmpge(data4,hi); /* bit mask data >= hi */

		data1 = (vector float)vec_and((vector unsigned char)data1,vec_nor(mlo1,mhi1));
		data2 = (vector float)vec_and((vector unsigned char)data2,vec_nor(mlo2,mhi2));
		data3 = (vector float)vec_and((vector unsigned char)data3,vec_nor(mlo3,mhi3));
		data4 = (vector float)vec_and((vector unsigned char)data4,vec_nor(mlo4,mhi4));
		
		mlo1 = vec_and((vector unsigned char)lo,mlo1);
		mlo2 = vec_and((vector unsigned char)lo,mlo2);
		mlo3 = vec_and((vector unsigned char)lo,mlo3);
		mlo4 = vec_and((vector unsigned char)lo,mlo4);
		
		mhi1 = vec_and((vector unsigned char)hi,mhi1);
		mhi2 = vec_and((vector unsigned char)hi,mhi2);
		mhi3 = vec_and((vector unsigned char)hi,mhi3);
		mhi4 = vec_and((vector unsigned char)hi,mhi4);

		data1 = (vector float)vec_or(vec_or(mlo1,mhi1),(vector unsigned char)data1);
		data2 = (vector float)vec_or(vec_or(mlo2,mhi2),(vector unsigned char)data2);
		data3 = (vector float)vec_or(vec_or(mlo3,mhi3),(vector unsigned char)data3);
		data4 = (vector float)vec_or(vec_or(mlo4,mhi4),(vector unsigned char)data4);

		vec_st(data1, 0,dst);
		vec_st(data2,16,dst);
		vec_st(data3,32,dst);
		vec_st(data4,48,dst);
	}
	return w+6;
}
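
/* The branchless select above combines three masked terms; in boolean
   form (an illustrative note, not in the original file):
       out = (data & ~(mlo|mhi)) | (lo & mlo) | (hi & mhi)
   where mlo = (data <= lo) and mhi = (data >= hi) are all-ones/all-zeros
   lane masks, so every output lane is exactly one of data, lo or hi. */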

t_int *sigwrap_perf_simd(t_int *w)
{
    const t_float *src = (const t_float *)w[1];
    t_float *dst = (t_float *)w[2];
    int n = w[3]>>4;

	for(; n--; src += 16,dst += 16) {
		vector float data1 = vec_ld( 0,src);
		vector float data2 = vec_ld(16,src);
		vector float data3 = vec_ld(32,src);
		vector float data4 = vec_ld(48,src);
		
		data1 = vec_sub(data1,vec_floor(data1));
		data2 = vec_sub(data2,vec_floor(data2));
		data3 = vec_sub(data3,vec_floor(data3));
		data4 = vec_sub(data4,vec_floor(data4));
		
		vec_st(data1, 0,dst);
		vec_st(data2,16,dst);
		vec_st(data3,32,dst);
		vec_st(data4,48,dst);
	}
	return w+4;
}
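
/* Scalar equivalent of the wrap above (illustrative, not in the original
   file): dst[i] = src[i] - floorf(src[i]), i.e. the fractional part,
   which maps every input into the interval [0,1). */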

t_int *sigsqrt_perf_simd(t_int *w)
{
    const t_float *src = (const t_float *)w[1];
    t_float *dst = (t_float *)w[2];
    int n = w[3]>>4;
	
	const vector float zero = (vector float)(0);
	const vector float oneHalf = (vector float)(0.5);
	const vector float one = (vector float)(1.0);

	for(; n--; src += 16,dst += 16) {
		/* http://developer.apple.com/hardware/ve/algorithms.html

			Just as in Miller's scalar sigsqrt_perform, an rsqrt estimate
			is computed first and then refined by one Newton-Raphson step.
			To avoid branching, a bit mask zeroes out any NaNs that
			non-positive inputs would otherwise produce.
		*/
		
#ifdef USEVECLIB
		/* no zero checking here */
		vec_st(vsqrtf(vec_ld( 0,src)), 0,dst); 
		vec_st(vsqrtf(vec_ld(16,src)),16,dst); 
		vec_st(vsqrtf(vec_ld(32,src)),32,dst); 
		vec_st(vsqrtf(vec_ld(48,src)),48,dst); 
#else
		vector float data1 = vec_ld( 0,src);
		vector float data2 = vec_ld(16,src);
		vector float data3 = vec_ld(32,src);
		vector float data4 = vec_ld(48,src);

		const vector unsigned char mask1 = vec_nor((vector unsigned char)vec_cmple(data1,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
		const vector unsigned char mask2 = vec_nor((vector unsigned char)vec_cmple(data2,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
		const vector unsigned char mask3 = vec_nor((vector unsigned char)vec_cmple(data3,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
		const vector unsigned char mask4 = vec_nor((vector unsigned char)vec_cmple(data4,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */

		const vector float estimate1 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data1),mask1); 
		const vector float estimate2 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data2),mask2); 
		const vector float estimate3 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data3),mask3); 
		const vector float estimate4 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data4),mask4); 

		/* this can still be improved.... */
		data1 = vec_madd(data1,vec_madd( vec_nmsub( data1, vec_madd( estimate1, estimate1, zero ), one ), vec_madd( estimate1, oneHalf, zero ), estimate1 ), zero);
		data2 = vec_madd(data2,vec_madd( vec_nmsub( data2, vec_madd( estimate2, estimate2, zero ), one ), vec_madd( estimate2, oneHalf, zero ), estimate2 ), zero);
		data3 = vec_madd(data3,vec_madd( vec_nmsub( data3, vec_madd( estimate3, estimate3, zero ), one ), vec_madd( estimate3, oneHalf, zero ), estimate3 ), zero);
		data4 = vec_madd(data4,vec_madd( vec_nmsub( data4, vec_madd( estimate4, estimate4, zero ), one ), vec_madd( estimate4, oneHalf, zero ), estimate4 ), zero);
		
		vec_st(data1, 0,dst);
		vec_st(data2,16,dst);
		vec_st(data3,32,dst);
		vec_st(data4,48,dst);
#endif
	}
	return w+4;
}
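
/* Scalar sketch of the computation above (an illustrative addition, not
   in the original file): with e ~ 1/sqrt(x) from vec_rsqrte(), one
   Newton-Raphson step gives e' = e + 0.5*e*(1 - x*e*e), and the result
   is sqrt(x) = x * e'; the mask forces e = 0, and hence output 0, for
   x <= 0. */
#if 0
#include <math.h>
static float sigsqrt_scalar_sketch(float x)
{
    float e = (x > 0.0f) ? 1.0f / sqrtf(x) : 0.0f; /* masked rsqrt estimate */
    e = e + 0.5f * e * (1.0f - x * e * e);         /* one refinement step */
    return x * e;                                  /* sqrt(x) = x * rsqrt(x) */
}
#endif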

/* Note: this differs from sigsqrt_perform, which delivers a non-zero result for a zero input; that behavior is probably unintended. */
t_int *sigrsqrt_perf_simd(t_int *w)
{
    const t_float *src = (const t_float *)w[1];
    t_float *dst = (t_float *)w[2];
    int n = w[3]>>4;
	
	const vector float zero = (vector float)(0);
	const vector float oneHalf = (vector float)(0.5);
	const vector float one = (vector float)(1.0);

	for(; n--; src += 16,dst += 16) {
		/* http://developer.apple.com/hardware/ve/algorithms.html

			Just as in Miller's scalar sigrsqrt_perform, an rsqrt estimate
			is computed first and then refined by one Newton-Raphson step.
			To avoid branching, a bit mask zeroes out any NaNs that
			non-positive inputs would otherwise produce.
		*/

#ifdef USEVECLIB
		/* no zero checking here */
		vec_st(vrsqrtf(vec_ld( 0,src)), 0,dst); 
		vec_st(vrsqrtf(vec_ld(16,src)),16,dst); 
		vec_st(vrsqrtf(vec_ld(32,src)),32,dst); 
		vec_st(vrsqrtf(vec_ld(48,src)),48,dst); 
#else
		vector float data1 = vec_ld( 0,src);
		vector float data2 = vec_ld(16,src);
		vector float data3 = vec_ld(32,src);
		vector float data4 = vec_ld(48,src);

		const vector unsigned char mask1 = vec_nor((vector unsigned char)vec_cmple(data1,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
		const vector unsigned char mask2 = vec_nor((vector unsigned char)vec_cmple(data2,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
		const vector unsigned char mask3 = vec_nor((vector unsigned char)vec_cmple(data3,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
		const vector unsigned char mask4 = vec_nor((vector unsigned char)vec_cmple(data4,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */

		const vector float estimate1 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data1),mask1); 
		const vector float estimate2 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data2),mask2); 
		const vector float estimate3 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data3),mask3); 
		const vector float estimate4 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data4),mask4); 
		
		data1 = vec_nmsub( data1, vec_madd( estimate1, estimate1, zero ), one );
		data2 = vec_nmsub( data2, vec_madd( estimate2, estimate2, zero ), one );
		data3 = vec_nmsub( data3, vec_madd( estimate3, estimate3, zero ), one );
		data4 = vec_nmsub( data4, vec_madd( estimate4, estimate4, zero ), one );

		data1 = vec_madd( data1, vec_madd( estimate1, oneHalf, zero ), estimate1 );
		data2 = vec_madd( data2, vec_madd( estimate2, oneHalf, zero ), estimate2 );
		data3 = vec_madd( data3, vec_madd( estimate3, oneHalf, zero ), estimate3 );
		data4 = vec_madd( data4, vec_madd( estimate4, oneHalf, zero ), estimate4 );
		
		vec_st(data1, 0,dst);
		vec_st(data2,16,dst);
		vec_st(data3,32,dst);
		vec_st(data4,48,dst);
#endif
	}
	return w+4;
}
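
/* The refinement here is the same Newton-Raphson step as in
   sigsqrt_perf_simd above, only without the final multiply by x: the
   refined estimate e' = e + 0.5*e*(1 - x*e*e) is stored directly. */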

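/* this code is only compiled for AltiVec targets, so the capability is
   guaranteed at compile time and the runtime check can simply succeed */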
int simd_runtime_check()
{
	return 1;
}


#endif

--- NEW FILE: m_simd.h ---
/* 
    Definitions for SIMD functionality
    added by T.Grill
*/

#ifndef __M_SIMD_H
#define __M_SIMD_H

/* general vector functions */
void zerovec_8(t_float *dst,int n);
void setvec_8(t_float *dst,t_float v,int n);
void copyvec_8(t_float *dst,const t_float *src,int n);
void addvec_8(t_float *dst,const t_float *src,int n);
void testcopyvec_8(t_float *dst,const t_float *src,int n);
void testaddvec_8(t_float *dst,const t_float *src,int n);

#ifdef DONTUSESIMD

    /* if SIMD shouldn't be used, the checks always return false */
    #define SIMD_CHKCNT(n) ( 0 )
    #define SIMD_CHKALIGN(ptr) ( 0 )

    #undef SIMD_BYTEALIGN

    #include "m_simd_def.h"

#else

    /* how many floats do we calculate in the loop of a SIMD codelet? */
    #define SIMD_BLOCK 16  /* must be a power of 2 */

    #if defined(_MSC_VER) && defined(_M_IX86)  /* Visual C++ on Intel */
        /* alignment for Intel SSE */
        #define SIMD_BYTEALIGN (128/8)   /* align to 128 bits */

        #include "m_simd_sse_vc.h"

    #elif defined(__GNUC__)  && (defined(_X86_) || defined(__i386__) || defined(__i586__) || defined(__i686__) )
        /* Intel SSE with GNU C */
        #define SIMD_BYTEALIGN (128/8)   /* align to 128 bits */

        #include "m_simd_sse_gcc.h"

    #elif defined(__GNUC__) && defined(__POWERPC__) && defined(__ALTIVEC__)
        /* Altivec with GNU C  ( -faltivec must be given as a compiler option! ) */
        #define SIMD_BYTEALIGN (128/8)   /* align to 128 bits */

        #include "m_simd_ve_gcc.h"  

    #else
        /* default */

        #define SIMD_BYTEALIGN (128/8)   /* assume 128 bits */
        #include "m_simd_def.h"
    #endif

    /* check if n meets the requirements for SIMD codelets */
    #define SIMD_CHKCNT(n) ( ((n)&(SIMD_BLOCK-1)) == 0 )
    /* check if a pointer is correctly aligned for SIMD codelets */
    #define SIMD_CHKALIGN(ptr) ( ((unsigned long)(ptr) & (SIMD_BYTEALIGN-1)) == 0 )

#endif

/* runtime check for the SIMD capability of the executing CPU,
   implemented per platform */
int simd_runtime_check(void);

/* check n and 1 pointer at once */
#define SIMD_CHECK1(n,ptr1) ( SIMD_CHKCNT(n) && SIMD_CHKALIGN(ptr1) && simd_runtime_check())
/* check n and 2 pointers at once */
#define SIMD_CHECK2(n,ptr1,ptr2) ( SIMD_CHKCNT(n) && SIMD_CHKALIGN(ptr1) && SIMD_CHKALIGN(ptr2) && simd_runtime_check() )
/* check n and 3 pointers at once */
#define SIMD_CHECK3(n,ptr1,ptr2,ptr3) ( SIMD_CHKCNT(n) && SIMD_CHKALIGN(ptr1) && SIMD_CHKALIGN(ptr2) && SIMD_CHKALIGN(ptr3) && simd_runtime_check() )
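
/* Typical usage (an illustrative sketch, not from the original sources):
   a dsp routine registers the SIMD codelet only when block size and
   alignment permit, e.g. with the plus~ performers from d_arithmetic.c:

       if (SIMD_CHECK3(n, in1, in2, out))
           dsp_add(plus_perf_simd, 4, in1, in2, out, n);
       else
           dsp_add(plus_perform, 4, in1, in2, out, n);
*/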


/* T.Grill - bit alignment for signal vectors (must be a multiple of 8!) */
/* if undefined no alignment occurs */
#ifdef SIMD_BYTEALIGN
    #define VECTORALIGNMENT (SIMD_BYTEALIGN*8)
#else
    #define VECTORALIGNMENT 128
#endif

#endif /* __M_SIMD_H */

--- NEW FILE: s_midi_alsa.c ---
/* Copyright (c) 1997-1999 Guenter Geiger, Miller Puckette, Larry Troxler,
* Winfried Ritsch, Karl MacMillan, and others.
* For information on usage and redistribution, and for a DISCLAIMER OF ALL
* WARRANTIES, see the file, "LICENSE.txt," in this distribution.  */

/* MIDI I/O for Linux using ALSA */

#include <stdio.h>
#ifdef UNISTD
#include <unistd.h>
#endif
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <alsa/asoundlib.h>
#include "m_pd.h"
#include "s_stuff.h"

static int alsa_nmidiin;
static int alsa_midiinfd[MAXMIDIINDEV];
static int alsa_nmidiout;
static int alsa_midioutfd[MAXMIDIOUTDEV];

static snd_seq_t *midi_handle;

static int alsa_port;
static int alsa_initialized;
static void alsa_midiout(int fd, int n)
{
    /*snd_midi_event_encode_byte();
    char b = n;
    if ((write(fd, (char *) &b, 1)) != 1)
        perror("midi write");*/
}

#define O_MIDIFLAG O_NDELAY

unsigned short CombineBytes(unsigned char First, unsigned char Second)
{
    unsigned short _14bit;
    _14bit = (unsigned short)Second;
    _14bit <<= 7;
    _14bit |= (unsigned short)First;
    return(_14bit);
}
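
/* example (illustrative): a centered pitch wheel sends LSB 0x00 and
   MSB 0x40, and CombineBytes(0x00, 0x40) == 0x2000 (8192), the midpoint
   of the 14-bit range 0..16383 */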

void sys_do_open_midi(int nmidiin, int *midiinvec,
    int nmidiout, int *midioutvec)
{
    char portname[50];
    int err = 0;
    int client;
    int i;
    snd_seq_client_info_t *alsainfo;

    if (nmidiin <= 0 && nmidiout <= 0)
        return;
    if (nmidiin > 0 && nmidiout > 0)
        err = snd_seq_open(&midi_handle, "default", SND_SEQ_OPEN_DUPLEX, 0);
    else if (nmidiin > 0)
        err = snd_seq_open(&midi_handle, "default", SND_SEQ_OPEN_INPUT, 0);
    else
        err = snd_seq_open(&midi_handle, "default", SND_SEQ_OPEN_OUTPUT, 0);

    if (err != 0)
    {
        sys_setalarm(1000000);
        post("couldn't open alsa sequencer");
        return;
    }
    /* writable ports receive MIDI from other clients: these are Pd's inputs */
    for (i = 0; i < nmidiin; i++)
    {
        sprintf(portname, "Pure Data Midi-In %d", i+1);
        alsa_port = snd_seq_create_simple_port(midi_handle, portname,
            SND_SEQ_PORT_CAP_WRITE | SND_SEQ_PORT_CAP_SUBS_WRITE,
            SND_SEQ_PORT_TYPE_APPLICATION);
        alsa_midiinfd[i] = alsa_port;
    }
    /* readable ports send MIDI to other clients: these are Pd's outputs */
    for (i = 0; i < nmidiout; i++)
    {
        sprintf(portname, "Pure Data Midi-Out %d", i+1);
        alsa_port = snd_seq_create_simple_port(midi_handle, portname,
            SND_SEQ_PORT_CAP_READ | SND_SEQ_PORT_CAP_SUBS_READ,
            SND_SEQ_PORT_TYPE_APPLICATION);
        alsa_midioutfd[i] = alsa_port;
    }

    /* (a single duplex port with SND_SEQ_PORT_CAP_DUPLEX |
       SND_SEQ_PORT_CAP_SUBS_WRITE | SND_SEQ_PORT_CAP_SUBS_READ
       was tried here originally) */
    if (alsa_port < 0)
    {
        sys_setalarm(1000000);
        post("couldn't open alsa MIDI port");
        return;
    }
    snd_seq_client_info_malloc(&alsainfo);
    snd_seq_get_client_info(midi_handle, alsainfo);
    snd_seq_client_info_set_name(alsainfo, "Pure Data");
    client = snd_seq_client_info_get_client(alsainfo);
    snd_seq_set_client_info(midi_handle, alsainfo);
    snd_seq_client_info_free(alsainfo);
    post("Opened Alsa Client %d in:%d out:%d", client, nmidiin, nmidiout);
    sys_setalarm(0);
    alsa_nmidiin = nmidiin;
    alsa_nmidiout = nmidiout;
    alsa_initialized = 1;
}

#define md_msglen(x) (((x)<0xC0)?2:((x)<0xE0)?1:((x)<0xF0)?2:\
    ((x)==0xF2)?2:((x)<0xF4)?1:0)
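
/* md_msglen maps a status byte to its number of data bytes: messages
   below 0xC0 (note and controller messages) take 2, program change and
   channel pressure (0xC0-0xDF) take 1, pitch bend (0xE0-0xEF) takes 2,
   song position (0xF2) takes 2, 0xF0/0xF1/0xF3 take 1, and the remaining
   system messages take 0 */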

void sys_putmidimess(int portno, int a, int b, int c)
{
    int channel;
    snd_seq_event_t ev;
    snd_seq_ev_clear(&ev);
    if (portno >= 0 && portno < alsa_nmidiout)
    {
        if (a >= 224)	/* pitchbend */
        {
            channel = a-224;
            /* ALSA expects a signed bend value: shift the 14-bit
               0..16383 range down to -8192..8191 */
            snd_seq_ev_set_pitchbend(&ev,channel,CombineBytes(b,c)-8192);
        }
        else if (a >= 208)	/* touch */
        {
            channel = a-208;
            snd_seq_ev_set_chanpress(&ev,channel,b);
        }
        else if (a >= 192)	/* program */
        {
            channel = a-192;
            snd_seq_ev_set_pgmchange(&ev,channel,b);
        }
        else if (a >= 176)	/* controller */
        {
            channel = a-176;
            snd_seq_ev_set_controller(&ev,channel,b,c);
        }
        else if (a >= 160)	/* polytouch */
        {
            channel = a-160;
            snd_seq_ev_set_keypress(&ev,channel,b,c);
        }
        else if (a >= 144)	/* note */
        {
            channel = a-144;
            if (c)
                snd_seq_ev_set_noteon(&ev,channel,b,c);
            else
                snd_seq_ev_set_noteoff(&ev,channel,b,c);
        }
        snd_seq_ev_set_direct(&ev);
        snd_seq_ev_set_subs(&ev);
        snd_seq_ev_set_source(&ev,alsa_midioutfd[portno]);
        snd_seq_event_output_direct(midi_handle,&ev);
    }
    //post("%d %d %d\n",a,b,c);
}

void sys_putmidibyte(int portno, int byte)
{
/*    snd_midi_event_t *dev;
    snd_seq_event_t ev;
    snd_seq_ev_clear(&ev);
    dev = (snd_midi_event_t*)malloc(4);
    if (portno >= 0 && portno < alsa_nmidiout)
    {
        //alsa_midiout(alsa_midioutfd[portno], byte);       
        snd_midi_event_encode_byte(dev, byte,&ev);
        snd_seq_ev_set_direct(&ev);
        snd_seq_ev_set_subs(&ev);
        snd_seq_ev_set_source(&ev,alsa_port);
        snd_seq_event_output_direct(midi_handle,&ev);
    }*/
}

#if 0   /* this is the "select" version which doesn't work with OSS
        driver for emu10k1 (it doesn't implement select.) */
void sys_poll_midi(void)
{
    int i, throttle = 100;
    struct timeval timout;
    int did = 1, maxfd = 0;
    while (did)
    {
        fd_set readset, writeset, exceptset;
        did = 0;
        if (throttle-- < 0)
            break;
        timout.tv_sec = 0;
        timout.tv_usec = 0;

        FD_ZERO(&writeset);
        FD_ZERO(&readset);
        FD_ZERO(&exceptset);
        for (i = 0; i < alsa_nmidiin; i++)
        {
            if (alsa_midiinfd[i] > maxfd)
                maxfd = alsa_midiinfd[i];
            FD_SET(alsa_midiinfd[i], &readset);
        }
        select(maxfd+1, &readset, &writeset, &exceptset, &timout);
        for (i = 0; i < alsa_nmidiin; i++)
            if (FD_ISSET(alsa_midiinfd[i], &readset))
        {
            char c;
            int ret = read(alsa_midiinfd[i], &c, 1);
            if (ret <= 0)
                fprintf(stderr, "Midi read error\n");
            else sys_midibytein(i, (c & 0xff));
            did = 1;
        }
    }
}
#else 

    /* this version uses the asynchronous "read()" ... */
void sys_poll_midi(void)
{
   unsigned char buf[20];
   int count, alsa_source;
   int i;
   snd_midi_event_t *dev;
   snd_seq_event_t *midievent = NULL;
   snd_midi_event_new(20,&dev);
   snd_midi_event_init(dev);
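   /* note: creating and freeing the event parser on every poll is
      wasteful; a single parser allocated once at startup would do */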
   count = snd_seq_event_input_pending(midi_handle,1);
   if (count != 0)
   	count = snd_seq_event_input(midi_handle,&midievent);
   if (midievent != NULL)
   {
       count = snd_midi_event_decode(dev,buf,20,midievent);
       alsa_source = midievent->dest.port;
       for(i=0;i<count;i++)
           sys_midibytein(alsa_source, (buf[i] & 0xff));
       //post("received %d midi bytes\n",count);
   }
   snd_midi_event_free(dev);
}
#endif

void sys_close_midi()
{
    /*int i;
    for (i = 0; i < alsa_nmidiin; i++)
        close(alsa_midiinfd[i]);
    for (i = 0; i < alsa_nmidiout; i++)
        close(alsa_midioutfd[i]);*/
    alsa_nmidiin = alsa_nmidiout = 0;
    if (midi_handle)
        snd_seq_close(midi_handle);
}

#define NSEARCH 10
static int alsa_nmidiindevs, alsa_nmidioutdevs, alsa_initted;
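/* note: alsa_nmidiindevs and alsa_nmidioutdevs are never filled in yet,
   so midi_getdevs below always reports zero devices; enumerating the
   available sequencer ports remains to be implemented */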

void midi_alsa_init(void)     
{
    if (alsa_initted)
        return;
    alsa_initted = 1;
}

void midi_getdevs(char *indevlist, int *nindevs,
    char *outdevlist, int *noutdevs, int maxndev, int devdescsize)
{
    int i, ndev;
    if ((ndev = alsa_nmidiindevs) > maxndev)
        ndev = maxndev;
    for (i = 0; i < ndev; i++)
        sprintf(indevlist + i * devdescsize, "ALSA MIDI device #%d", i+1);
    *nindevs = ndev;

    if ((ndev = alsa_nmidioutdevs) > maxndev)
        ndev = maxndev;
    for (i = 0; i < ndev; i++)
        sprintf(outdevlist + i * devdescsize, "ALSA MIDI device #%d", i+1);
    *noutdevs = ndev;
}

--- NEW FILE: m_simd_sse_vc.c ---
/* 
    Implementation of SIMD functionality for Intel SSE with VC++ compiler
    added by T.Grill
*/

#include "m_pd.h"
#include "m_simd.h"

#if defined(NT) && defined(_MSC_VER) && !(defined DONTUSESIMD)
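
/* note on the inline assembly below: MASM's "TYPE t_float" operator
   evaluates to sizeof(t_float), so "16*TYPE t_float" advances a pointer
   by 16 floats (64 bytes) per loop iteration */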

/* dst is assumed to be aligned */
void zerovec_simd(t_float *dst,int n)
{
	__asm {
		mov		edx,dword ptr [dst] /* out */
		xorps   xmm0,xmm0 /* zero value */

		mov		ecx,[n] /* n */
		shr		ecx,4

		/* should we do more loop unrolling? */
loopa:
		movaps	xmmword ptr[edx],xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm0

		add		edx,16*TYPE t_float
		/* very short loop - let's assume that branch prediction does its job nicely */
		loop	loopa
	}
}

/* dst is assumed to be aligned */
void setvec_simd(t_float *dst,t_float v,int n)
{
	__asm {
		mov		edx,dword ptr [dst] /* out */

        /* load the scalar and broadcast it to all four lanes (movss + shufps) */
		movss	xmm0,xmmword ptr [v]
		shufps	xmm0,xmm0,0

		mov		ecx,[n] /* n */
		shr		ecx,4

		/* should we do more loop unrolling? */
loopa:
		movaps	xmmword ptr[edx],xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm0

		add		edx,16*TYPE t_float
		/* very short loop - let's assume that branch prediction does its job nicely */
		loop	loopa
	}
}

/* dst and src are assumed to be aligned */
void copyvec_simd(t_float *dst,const t_float *src,int n)
{
	__asm {
		mov		ebx,dword ptr [src] /* in1 */
/*		prefetcht0 [ebx] */
		mov		edx,dword ptr [dst] /* out */

		mov		ecx,dword ptr [n] /* n */
		shr		ecx,4

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+12*TYPE t_float] */

		movaps	xmm0,xmmword ptr[ebx]
		movaps	xmmword ptr[edx],xmm0
		movaps	xmm1,xmmword ptr[ebx+4*TYPE t_float]
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm1

/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm2,xmmword ptr[ebx+8*TYPE t_float]
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm2
		movaps	xmm3,xmmword ptr[ebx+12*TYPE t_float]
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm3

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa 
	}
}

void addvec_simd(t_float *dst,const t_float *src,int n)
{
	__asm {
		mov		eax,dword ptr [src] /* in1 */
/*      prefetcht0 [eax] prefetch first cache line */	
		mov		edx,dword ptr [dst] /* out */
		mov		ecx,dword ptr [n] /* n */
		shr		ecx,4 /* divide by 16 */

        xor     esi,esi /* reset index */
/*
		prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/
        movaps	xmm0,xmmword ptr[edx+esi]
		movaps	xmm1,xmmword ptr[eax+esi]
		addps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[edx+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[eax+esi+4*TYPE t_float]
		addps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2
/*
        prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/
		movaps	xmm4,xmmword ptr[edx+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[eax+esi+8*TYPE t_float]
		addps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[edx+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[eax+esi+12*TYPE t_float]
		addps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

        add     esi,16*TYPE t_float
		loop	loopa 
	}
}

void testcopyvec_simd(t_float *dst,const t_float *src,int n)
{
	testcopyvec_8(dst,src,n);
}

void testaddvec_simd(t_float *dst,const t_float *src,int n)
{
	testaddvec_8(dst,src,n);
}

t_int *zero_perf_simd(t_int *w)
{
    zerovec_simd((t_float *)w[1],w[2]);
    return w+3;
}

t_int *copy_perf_simd(t_int *w)
{
    copyvec_simd((t_float *)w[2],(const t_float *)w[1],w[3]);
    return w+4;
}

t_int *sig_tilde_perf_simd(t_int *w)
{
    setvec_simd((t_float *)w[2],*(const t_float *)w[1],w[3]);
    return w+4;
}


t_int *plus_perf_simd(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*      prefetcht0 [eax] prefetch first cache line */	
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*      prefetcht0 [ebx] prefetch first cache line */	
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */
		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4 /* divide by 16 */

        xor     esi,esi /* reset index */
/*
		prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/
        movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		addps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		addps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2
/*
        prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/
		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		addps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		addps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

        add     esi,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *scalarplus_perf_simd(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*      prefetcht0 [ebx] prefetch first cache line */
		
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load the scalar and broadcast it to all four lanes (movss + shufps) */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* value */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		addps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		addps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*TYPE t_float] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		addps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		addps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *minus_perf_simd(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*      prefetcht0 [eax]    prefetch first cache line */
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */
		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

        xor     esi,esi /* reset index */
/*
		prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/

		movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		subps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		subps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2

/*
        prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/

		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		subps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		subps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

		add		esi,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *scalarminus_perf_simd(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load the scalar and broadcast it to all four lanes (movss + shufps) */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* g */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		subps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		subps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*TYPE t_float] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		subps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		subps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *times_perf_simd(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*      prefetcht0 [eax]    prefetch first cache line */
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

        xor     esi,esi /* reset index */
/*
		prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/

		movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		mulps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		mulps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2

/*
		prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/

		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		mulps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		mulps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

		add		esi,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *scalartimes_perf_simd(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load the scalar and broadcast it to all four lanes (movss + shufps) */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* g */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		mulps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		mulps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*4] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		mulps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		mulps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *sqr_perf_simd(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 2*TYPE t_int] /* out */

		mov		ecx,dword ptr [esi + 3*TYPE t_int] /* n */
		shr		ecx,4

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm0,xmmword ptr[ebx]
		mulps	xmm0,xmm0
		movaps	xmmword ptr[edx],xmm0

		movaps	xmm1,xmmword ptr[ebx+4*TYPE t_float]
		mulps	xmm1,xmm1
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm1

/*		prefetcht0 [ebx+24*4] */

		movaps	xmm2,xmmword ptr[ebx+8*TYPE t_float]
		mulps	xmm2,xmm2
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm2

		movaps	xmm3,xmmword ptr[ebx+12*TYPE t_float]
		mulps	xmm3,xmm3
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm3

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa 
	}
    return (w+4);
}

/* no checking for 0 yet!! */
t_int *over_perf_simd(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*      prefetcht0 [eax]    prefetch first cache line */
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

        xor     esi,esi /* reset index */
/*
		prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/

		movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		divps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		divps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2

/*
		prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/

		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		divps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		divps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

		add		esi,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *scalarover_perf_simd(t_int *w)
{
    static const float one = 1.f;

	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*      prefetcht0 [ebx]    prefetch first cache line */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load value */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* g */
		movss	xmm1,xmmword ptr [eax]

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

/*		prefetcht0 [ebx+8*TYPE t_float] */

        /* check for zero */
		xorps	xmm0,xmm0
        comiss  xmm1,xmm0  /* compare xmm1 to 0 */
        /* if xmm1 is zero (and also xmm0!) -> goto loopa */
        jz      loopa

        /* else, invert xmm0 */
/*        rcpps   xmm0,xmm0  ... far too imprecise!! */
        
		movss	xmm0,[one]
        divss   xmm0,xmm1  /* divide xmm0 by xmm1 */
		shufps	xmm0,xmm0,0 /* make xmm0 all the same */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		mulps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		mulps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*4] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		mulps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		mulps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *max_perf_simd(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*		prefetcht0 [eax] */
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*		prefetcht0 [ebx] */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

        xor     esi,esi /* reset index */
/*
        prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/
		movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		maxps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		maxps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2

/*
        prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/

		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		maxps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		maxps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

		add		esi,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *scalarmax_perf_simd(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*		prefetcht0 [ebx] */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load the scalar and broadcast it to all four lanes (movss + shufps) */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* g */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		maxps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		maxps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*TYPE t_float] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		maxps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		maxps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *min_perf_simd(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/*		prefetcht0 [eax] */
		mov		ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/*		prefetcht0 [ebx] */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

        mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

        xor     esi,esi /* reset index */
/*
        prefetcht0 [eax+8*TYPE t_float]
		prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
        prefetcht0 [eax+16*TYPE t_float]
		prefetcht0 [ebx+16*TYPE t_float]
*/
		movaps	xmm0,xmmword ptr[eax+esi]
		movaps	xmm1,xmmword ptr[ebx+esi]
		minps	xmm0,xmm1
		movaps	xmmword ptr[edx+esi],xmm0

		movaps	xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
		movaps	xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
		minps	xmm2,xmm3
		movaps	xmmword ptr[edx+esi+4*TYPE t_float],xmm2
/*
        prefetcht0 [eax+24*TYPE t_float]
		prefetcht0 [ebx+24*TYPE t_float]
*/
		movaps	xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
		movaps	xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
		minps	xmm4,xmm5
		movaps	xmmword ptr[edx+esi+8*TYPE t_float],xmm4

		movaps	xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
		movaps	xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
		minps	xmm6,xmm7
		movaps	xmmword ptr[edx+esi+12*TYPE t_float],xmm6

		add		esi,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

t_int *scalarmin_perf_simd(t_int *w)
{
	__asm {
		mov		esi,dword ptr [w]

		mov		ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/*		prefetcht0 [ebx] */
		mov		edx,dword ptr [esi + 3*TYPE t_int] /* out */

		/* load the scalar and broadcast it to all four lanes (movss + shufps) */
		mov		eax,dword ptr [esi + 2*TYPE t_int] /* g */
		movss	xmm0,xmmword ptr [eax]
		shufps	xmm0,xmm0,0

		mov		ecx,dword ptr [esi + 4*TYPE t_int] /* n */
		shr		ecx,4

/*		prefetcht0 [ebx+8*TYPE t_float] */

loopa:
/*		prefetcht0 [ebx+16*TYPE t_float] */

		movaps	xmm1,xmmword ptr[ebx]
		minps	xmm1,xmm0
		movaps	xmmword ptr[edx],xmm1

		movaps	xmm2,xmmword ptr[ebx+4*TYPE t_float]
		minps	xmm2,xmm0
		movaps	xmmword ptr[edx+4*TYPE t_float],xmm2

/*		prefetcht0 [ebx+24*TYPE t_float] */

		movaps	xmm3,xmmword ptr[ebx+8*TYPE t_float]
		minps	xmm3,xmm0
		movaps	xmmword ptr[edx+8*TYPE t_float],xmm3

		movaps	xmm4,xmmword ptr[ebx+12*TYPE t_float]
		minps	xmm4,xmm0
		movaps	xmmword ptr[edx+12*TYPE t_float],xmm4

		add		ebx,16*TYPE t_float
		add		edx,16*TYPE t_float
		loop	loopa 
	}
    return (w+5);
}

/* TB: runtime check for SSE: CPUID function 1 returns the feature flags
   in EDX, where bit 25 (0x2000000) signals SSE support */
int simd_runtime_check()
{
    unsigned int redx;
    __asm
	{
		mov		eax, 1
		cpuid
	    mov		[redx],edx     
	}
    return (0x2000000 & redx);
}

#endif


--- NEW FILE: m_simd_sse_gcc.h ---
/* 
    SIMD functionality for Intel SSE with GCC compiler
    added by T.Grill
*/

#ifndef __M_SIMD_SSE_GCC_H
#define __M_SIMD_SSE_GCC_H

#include "m_pd.h"

/* SIMD functions for SSE with gcc */

/* functions in d_ugen.c */
t_int *zero_perf_simd(t_int *w);

/* functions in d_dac.c */
t_int *copy_perf_simd(t_int *w);

/* functions in d_ctl.c */
t_int *sig_tilde_perf_simd(t_int *w);

/* functions in d_arithmetic.c */
t_int *plus_perf_simd(t_int *w);
t_int *scalarplus_perf_simd(t_int *w);
t_int *minus_perf_simd(t_int *w);
t_int *scalarminus_perf_simd(t_int *w);
t_int *times_perf_simd(t_int *w);
t_int *scalartimes_perf_simd(t_int *w);
t_int *sqr_perf_simd(t_int *w);
t_int *over_perf_simd(t_int *w);
t_int *scalarover_perf_simd(t_int *w);
t_int *max_perf_simd(t_int *w);
t_int *scalarmax_perf_simd(t_int *w);
t_int *min_perf_simd(t_int *w);
t_int *scalarmin_perf_simd(t_int *w);

/* functions in d_math.c */
#define clip_perf_simd             clip_perform     /* SIMD not implemented */
#define sigwrap_perf_simd          sigwrap_perform  /* SIMD not implemented */
#define sigsqrt_perf_simd          sigsqrt_perform  /* SIMD not implemented */
#define sigrsqrt_perf_simd         sigrsqrt_perform /* SIMD not implemented */

#endif /* __M_SIMD_SSE_GCC_H */

--- NEW FILE: m_simd_sse_vc.h ---
/* 
    SIMD functionality for Intel SSE with VC++ compiler
    added by T.Grill
*/

#ifndef __M_SIMD_SSE_VC_H
#define __M_SIMD_SSE_VC_H

#include "m_pd.h"

/* SIMD functions for SSE with VC++ */

/* functions in d_ugen.c */
t_int *zero_perf_simd(t_int *w);

/* functions in d_dac.c */
t_int *copy_perf_simd(t_int *w);

/* functions in d_ctl.c */
t_int *sig_tilde_perf_simd(t_int *w);

/* functions in d_arithmetic.c */
t_int *plus_perf_simd(t_int *w);
t_int *scalarplus_perf_simd(t_int *w);
t_int *minus_perf_simd(t_int *w);
t_int *scalarminus_perf_simd(t_int *w);
t_int *times_perf_simd(t_int *w);
t_int *scalartimes_perf_simd(t_int *w);
t_int *sqr_perf_simd(t_int *w);
t_int *over_perf_simd(t_int *w);
t_int *scalarover_perf_simd(t_int *w);
t_int *max_perf_simd(t_int *w);
t_int *scalarmax_perf_simd(t_int *w);
t_int *min_perf_simd(t_int *w);
t_int *scalarmin_perf_simd(t_int *w);

/* functions in d_math.c */
#define clip_perf_simd          clip_perform  /* SIMD not implemented */
#define sigwrap_perf_simd       sigwrap_perform  /* SIMD not implemented */
#define sigsqrt_perf_simd       sigsqrt_perform  /* SIMD not implemented */
#define sigrsqrt_perf_simd      sigrsqrt_perform /* SIMD not implemented */

#endif /* __M_SIMD_SSE_VC_H */

--- NEW FILE: m_simd_def.h ---
/* 
    Default SIMD (non-)functionality
    added by T.Grill

    This is used when there's no implementation of SIMD code 
    for the current platform and/or compiler
*/

#ifndef __M_SIMD_DEF_H
#define __M_SIMD_DEF_H

/* These are the functions that can be coded for SIMD */

/* functions in d_ugen.c */
#define zero_perf_simd          zero_perf8

/* functions in d_dac.c */
#define copy_perf_simd          copy_perf8

/* functions in d_ctl.c */
#define sig_tilde_perf_simd     sig_tilde_perf8

/* functions in d_arithmetic.c */
#define plus_perf_simd          plus_perf8
#define scalarplus_perf_simd    scalarplus_perf8
#define minus_perf_simd         minus_perf8
#define scalarminus_perf_simd   scalarminus_perf8
#define times_perf_simd         times_perf8
#define scalartimes_perf_simd   scalartimes_perf8
#define sqr_perf_simd           sqr_perf8
#define over_perf_simd          over_perf8
#define scalarover_perf_simd    scalarover_perf8
#define min_perf_simd           min_perf8
#define scalarmin_perf_simd     scalarmin_perf8
#define max_perf_simd           max_perf8
#define scalarmax_perf_simd     scalarmax_perf8

/* functions in d_math.c */
#define clip_perf_simd          clip_perform  /* SIMD not implemented */
#define sigwrap_perf_simd       sigwrap_perform  /* SIMD not implemented */
#define sigsqrt_perf_simd       sigsqrt_perform  /* SIMD not implemented */
#define sigrsqrt_perf_simd      sigrsqrt_perform /* SIMD not implemented */

#endif /* __M_SIMD_DEF_H */

--- NEW FILE: m_simd_ve_gcc.h ---
/* 
    SIMD functionality for Apple Velocity Engine (AltiVec) with GCC compiler
    added by T.Grill
*/

#ifndef __M_SIMD_VE_GCC_H
#define __M_SIMD_VE_GCC_H

#include "m_pd.h"

/* SIMD functions for VE with GCC */

/* functions in d_ugen.c */
t_int *zero_perf_simd(t_int *w);

/* functions in d_dac.c */
t_int *copy_perf_simd(t_int *w);

/* functions in d_ctl.c */
t_int *sig_tilde_perf_simd(t_int *w);

/* functions in d_arithmetic.c */
t_int *plus_perf_simd(t_int *w);
t_int *scalarplus_perf_simd(t_int *w);
t_int *minus_perf_simd(t_int *w);
t_int *scalarminus_perf_simd(t_int *w);
t_int *times_perf_simd(t_int *w);
t_int *scalartimes_perf_simd(t_int *w);
t_int *sqr_perf_simd(t_int *w);
t_int *over_perf_simd(t_int *w);
t_int *scalarover_perf_simd(t_int *w);
t_int *max_perf_simd(t_int *w);
t_int *scalarmax_perf_simd(t_int *w);
t_int *min_perf_simd(t_int *w);
t_int *scalarmin_perf_simd(t_int *w);

/* functions in d_math.c */
t_int *clip_perf_simd(t_int *w);
t_int *sigwrap_perf_simd(t_int *w);
t_int *sigsqrt_perf_simd(t_int *w);
t_int *sigrsqrt_perf_simd(t_int *w);

#endif /* __M_SIMD_VE_GCC_H */




