[PD-cvs] pd/src s_midi_alsa.c,NONE,1.1.2.1 s_audio_asio.cpp,NONE,1.1.4.1 m_simd.c,NONE,1.1.4.1 m_simd.h,NONE,1.1.4.1 m_simd_def.h,NONE,1.1.4.1 m_simd_sse_gcc.c,NONE,1.1.4.1 m_simd_sse_gcc.h,NONE,1.1.4.1 m_simd_sse_vc.c,NONE,1.1.4.1 m_simd_sse_vc.h,NONE,1.1.4.1 m_simd_ve_gcc.c,NONE,1.1.4.1 m_simd_ve_gcc.h,NONE,1.1.4.1
Tim Blechmann
timblech at users.sourceforge.net
Fri Nov 5 14:33:22 CET 2004
- Previous message: [PD-cvs] pd/src configure,1.4,NONE
- Next message: [PD-cvs] pd/src d_arithmetic.c,1.2,1.2.4.1 d_array.c,1.3,1.3.4.1 d_ctl.c,1.3,1.3.4.1 d_dac.c,1.3,1.3.4.1
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
Update of /cvsroot/pure-data/pd/src
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20791
Added Files:
Tag: devel_0_38
s_midi_alsa.c s_audio_asio.cpp m_simd.c m_simd.h m_simd_def.h
m_simd_sse_gcc.c m_simd_sse_gcc.h m_simd_sse_vc.c
m_simd_sse_vc.h m_simd_ve_gcc.c m_simd_ve_gcc.h
Log Message:
adding new files
--- NEW FILE: s_audio_asio.cpp ---
/* Copyright (c) 2004, Tim Blechmann and others
* For information on usage and redistribution, and for a DISCLAIMER OF ALL
* WARRANTIES, see the file, "LICENSE.txt" in this distribution. */
/* native ASIO interface for windows and mac osx
* adapted from hostsample.cpp (ASIO SDK)
*/
#ifdef USEAPI_ASIO
#ifdef MSW
#include "windows.h" /* for application window handle */
#define IEEE754_64FLOAT 1
#endif
#include "m_pd.h"
extern "C" {
#include "s_stuff.h"
}
#include "asio.h" /* steinberg's header file */
#include "asiodrivers.h" /* ASIODrivers class */
#include "asiosys.h"
#include "pthread.h"
#include "stdio.h" /* for sprintf */
#define ASIODEBUG
/* public function prototypes */
extern "C" void asio_open_audio(int naudioindev, int *audioindev, int nchindev,
int *chindev, int naudiooutdev, int *audiooutdev,
int nchoutdev, int *choutdev, int srate);
extern "C" void asio_close_audio(void);
extern "C" void asio_getdevs(char *indevlist, int *nindevs,
char *outdevlist, int *noutdevs, int *canmulti,
int maxndev, int devdescsize);
extern "C" int asio_send_dacs(void);
/* asio callback prototypes */
void asio_bufferSwitch(long db_idx, ASIOBool directprocess);
void asio_sampleRateDidChange(ASIOSampleRate srate);
long asio_messages(long selector, long value, void* message, double* opt);
ASIOTime *asio_bufferSwitchTimeInfo(ASIOTime *params, long db_idx,
ASIOBool directprocess);
/* sample converting helper functions:
* - global send / receive functions
* - sample conversion functions (adapted from ASIOConvertSamples.cpp */
void asio_convert_and_send (t_sample* source, void* dest,
ASIOSampleType format, long asio_bufsize);
void asio_convert_and_receive (void* source, t_sample* dest,
ASIOSampleType format, long asio_bufsize);
void float32toInt16(float* inbuffer, void* outbuffer, long frames);
void Int16tofloat32(void* inbuffer, float* outbuffer, long frames);
void float32toInt24(float* inbuffer, void* outbuffer, long frames);
void Int24tofloat32(void* inbuffer, float* outbuffer, long frames);
void float32toInt32(float* inbuffer, void* outbuffer, long frames);
void Int32tofloat32(void* inbuffer, float* outbuffer, long frames);
/* some local helper functions */
inline void prepare_asio_drivernames(void);
/* system dependent helper functions */
static unsigned long get_sys_reference_time(void);
/* global storage */
ASIODriverInfo * asio_driver = NULL;
ASIOBufferInfo * asio_bufferinfo;
ASIOChannelInfo* asio_channelinfo;
AsioTimeInfo * asio_timerinfo;
ASIOCallbacks asio_callbacks;
extern AsioDrivers * asioDrivers; /* declared in asiodrivers.cpp */
char ** asio_drivernames = NULL;
ASIOSampleRate asio_srate;
long asio_inchannels;
long asio_outchannels;
long asio_minbufsize;
long asio_maxbufsize;
long asio_prefbufsize;
long asio_granularity;
unsigned char asio_useoutputready;
long asio_inputlatency;
long asio_outputlatency;
long asio_bufsize;
unsigned long sys_reftime;
/* ringbuffer stuff */
t_sample ** asio_ringbuffer; /* ringbuffers */
static int asio_ringbuffer_inoffset; /* ringbuffer(in) pointer offset for dac */
static int asio_ringbuffer_outoffset; /* ringbuffer(out) pointer offset */
static int asio_ringbuffer_length; /* latency - hardware latency in samples*/
/* i hope we can remove this to use callback based dsp scheduling */
static pthread_mutex_t asio_ringbuf_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t asio_ringbuf_cond = PTHREAD_COND_INITIALIZER;
/* definitions from s_audio.c ... it should be safe to use them */
#define DEVDESCSIZE 80
#define MAXNDEV 20
/* open asio interface */
/* todo: some more error messages */
/* Open and start the ASIO driver selected by *audioindev: query channel
 * counts, buffer sizes and latencies, negotiate the sample rate, install
 * the callbacks, create the ASIO double-buffers and the transfer
 * ringbuffer, then call ASIOStart().  On driver-init failure asio_driver
 * is freed and reset to NULL so the rest of the module can tell ASIO is
 * down.
 * NOTE(review): only *audioindev selects the driver -- the output-device
 * arguments (and naudioindev/nchindev/naudiooutdev/nchoutdev) are unused
 * here; verify this matches the s_audio.c caller's expectations. */
void asio_open_audio(int naudioindev, int *audioindev, int nchindev,
    int *chindev, int naudiooutdev, int *audiooutdev,
    int nchoutdev, int *choutdev, int srate)
{
    ASIOError status;
    ASIOBufferInfo * buffers;   /* unused local */
    int i;
    int channels;
    /* ASIOSampleRate is a plain double when IEEE754_64FLOAT is set,
     * otherwise an opaque byte structure filled textually */
#ifdef IEEE754_64FLOAT
    asio_srate=(ASIOSampleRate)srate;
#else
    sprintf(asio_srate,"%d",srate);
#endif
    /* check, if we use the first asio device */
    prepare_asio_drivernames();
    /* load the driver */
    if (!asioDrivers)
        asioDrivers = new AsioDrivers();
    /* check, if the driver is still running */
    if (asio_driver)
        asio_close_audio();
    asioDrivers->getDriverNames(asio_drivernames,MAXNDEV);
    asioDrivers->loadDriver(asio_drivernames[*audioindev]);
    /* initialize ASIO */
    asio_driver = (ASIODriverInfo*) getbytes (sizeof(ASIODriverInfo));
    asio_driver->asioVersion = 2; /* i hope we are compatible with asio 2 */
#ifdef MSW
    /* the driver may open configuration dialogs on this window handle */
    asio_driver->sysRef = GetDesktopWindow();
#else
    asio_driver->sysRef = 0;
#endif
    status = ASIOInit(asio_driver);
#ifdef ASIODEBUG
    post("sysRef: %x", asio_driver->sysRef);
    post("asioversion: %d", asio_driver->asioVersion);
    post("driverversion: %d", asio_driver->driverVersion);
    post("name: %s", asio_driver->name);
    if(status) post("error: %s", asio_driver->errorMessage);
#endif
    /* any init error: report, release the driver info and bail out */
    switch (status)
    {
    case ASE_NotPresent:
        error("ASIO: ASE_NotPresent");
        freebytes(asio_driver, sizeof (ASIODriverInfo));
        asio_driver = NULL;
        return;
    case ASE_NoMemory:
        error("ASIO: ASE_NoMemory");
        freebytes(asio_driver, sizeof (ASIODriverInfo));
        asio_driver = NULL;
        return;
    case ASE_HWMalfunction:
        error("ASIO: ASE_HWMalfunction");
        freebytes(asio_driver, sizeof (ASIODriverInfo));
        asio_driver = NULL;
        return;
    }
    post("ASIO initialized successfully");
    /* query driver */
    ASIOGetChannels(&asio_inchannels, &asio_outchannels);
    post ("ASIOGetChannels\tinputs: %d, outputs: %d", asio_inchannels,
        asio_outchannels);
    /* tb: todo: channel count hardcoded to asio hardware */
    /* clamp Pd's channel counts to what the hardware offers */
    sys_inchannels = *chindev <= asio_inchannels ? *chindev : asio_inchannels;
    sys_outchannels = *choutdev <= asio_outchannels ? *choutdev : asio_outchannels;
    channels = sys_inchannels + sys_outchannels;
    ASIOGetBufferSize(&asio_minbufsize, &asio_maxbufsize, &asio_prefbufsize,
        &asio_granularity);
    post ("ASIOGetBufferSize\tmin: %d, max: %d, preferred: %d, granularity: "
        "%d", asio_minbufsize, asio_maxbufsize, asio_prefbufsize,
        asio_granularity);
    /* todo: buffer size hardcoded to asio hardware */
    asio_bufsize = asio_prefbufsize;
    /* set sample rate */
    if (ASIOCanSampleRate( asio_srate ) != ASE_OK)
    {
        error ("Samplerate not supported, using default");
#ifdef IEEE754_64FLOAT
        asio_srate = (ASIOSampleRate)44100.0;
#else
        /* NOTE(review): asio_srate is passed with '&' here but without it
         * in the analogous sprintf above -- one of the two must be wrong;
         * verify against the non-IEEE754 ASIOSampleRate typedef */
        sprintf(&asio_srate,"%d",44100);
#endif
        srate=44100;
    }
    ASIOSetSampleRate( asio_srate );
    post ("ASIOSetSampleRate\t %d", srate);
    /* remember whether the driver wants ASIOOutputReady() notification
     * after each buffer switch */
    if (ASIOOutputReady() == ASE_OK)
        asio_useoutputready = 1;
    else
        asio_useoutputready = 0;
    /* set callbacks */
    asio_callbacks.bufferSwitch = &asio_bufferSwitch;
    asio_callbacks.sampleRateDidChange = &asio_sampleRateDidChange;
    asio_callbacks.asioMessage = &asio_messages;
    asio_callbacks.bufferSwitchTimeInfo = &asio_bufferSwitchTimeInfo;
    /* prepare, create and set up buffers */
    asio_bufferinfo = (ASIOBufferInfo*) getbytes (channels * sizeof (ASIOBufferInfo));
    asio_channelinfo = (ASIOChannelInfo*) getbytes(channels * sizeof (ASIOChannelInfo));
    if (!(asio_bufferinfo && asio_channelinfo))
    {
        error("ASIO: couldn't allocate buffer or channel info");
        if (asio_bufferinfo)
            freebytes(asio_bufferinfo, channels * sizeof (ASIOBufferInfo));
        if (asio_channelinfo)
            freebytes(asio_channelinfo, channels * sizeof (ASIOChannelInfo));
        return;
    }
    /* output channels come first in asio_bufferinfo, inputs follow */
    for (i = 0; i != sys_inchannels + sys_outchannels; ++i)
    {
        if (i < sys_outchannels)
        {
            asio_bufferinfo[i].isInput = ASIOFalse;
            asio_bufferinfo[i].channelNum = i;
            asio_bufferinfo[i].buffers[0] = asio_bufferinfo[i].buffers[1] = 0;
        }
        else
        {
            /* NOTE(review): the input entries' buffers[] are not zeroed,
             * unlike the outputs -- presumably harmless because
             * ASIOCreateBuffers() fills them in; confirm */
            asio_bufferinfo[i].isInput = ASIOTrue;
            asio_bufferinfo[i].channelNum = i - sys_outchannels;
        }
    }
    if (ASIOCreateBuffers(asio_bufferinfo, sys_inchannels + sys_outchannels,
        asio_bufsize, &asio_callbacks)
        == ASE_OK)
    {
        post("ASIO: buffers allocated");
    }
    else
    {
        error("ASIO: couldn't allocate buffers");
        return;
    }
    /* fetch per-channel format info; the sample type is used later by
     * the conversion routines in the buffer-switch callback */
    for (i = 0; i != sys_inchannels + sys_outchannels; ++i)
    {
        asio_channelinfo[i].channel = asio_bufferinfo[i].channelNum;
        asio_channelinfo[i].isInput = asio_bufferinfo[i].isInput;
        ASIOGetChannelInfo(&asio_channelinfo[i]);
    }
    /* get latencies */
    ASIOGetLatencies(&asio_inputlatency, &asio_outputlatency);
#ifdef ASIODEBUG
    post("ASIO: input latency: %d, output latency: %d",asio_inputlatency,
        asio_outputlatency);
#endif
    /* calculate ringbuffer length */
    asio_ringbuffer_length = asio_bufsize * DEFDACBLKSIZE;
    /* a strange way to find the least common multiple,
     * but works, since DEFDACBLKSIZE (expt 2 x) */
    while ( !(asio_ringbuffer_length % DEFDACBLKSIZE) &&
        !(asio_ringbuffer_length % asio_bufsize))
    {
        asio_ringbuffer_length /= 2;
    }
    asio_ringbuffer_length *= 2;
#ifdef ASIODEBUG
    post("ASIO: ringbuffer size: %d",asio_ringbuffer_length);
#endif
    /* allocate ringbuffer: one zero-filled channel buffer per channel,
     * outputs first, then inputs */
    asio_ringbuffer = (t_sample**) getbytes (channels * sizeof (t_sample*));
    for (i = 0; i != channels; ++i)
    {
        asio_ringbuffer[i] = (t_sample*)getbytes(asio_ringbuffer_length * sizeof (t_sample));
        if (!asio_ringbuffer[i])
            error("ASIO: couldn't allocate ASIO ringbuffer");
        memset(asio_ringbuffer[i], 0, asio_ringbuffer_length * sizeof (t_sample));
    }
    /* initialize ringbuffer stuff */
    asio_ringbuffer_inoffset = asio_ringbuffer_outoffset = 0;
    if (ASIOStart() == ASE_OK)
    {
        post("ASIO: started");
    }
    else
        post("ASIO: couldn't start");
    return;
}
/* stop asio, free buffers and close asio interface */
void asio_close_audio(void)
{
ASIOError status;
int channels = asio_inchannels + asio_outchannels;
int i;
pthread_cond_broadcast(&asio_ringbuf_cond);
ASIOStop();
if (asio_driver)
{
for (i = 0; i != channels; i++)
freebytes(asio_ringbuffer[i], asio_ringbuffer_length * sizeof (t_sample));
freebytes(asio_ringbuffer, channels * sizeof (t_sample *));
freebytes(asio_bufferinfo, channels * sizeof (ASIOBufferInfo));
freebytes(asio_channelinfo, channels * sizeof (ASIOChannelInfo));
ASIODisposeBuffers();
asio_ringbuffer = NULL;
asio_bufferinfo = NULL;
asio_channelinfo = NULL;
ASIOExit();
freebytes(asio_driver, sizeof (ASIODriverInfo));
asio_driver = NULL;
}
return;
}
void asio_getdevs(char *indevlist, int *nindevs,
char *outdevlist, int *noutdevs, int *canmulti,
int maxndev, int devdescsize)
{
prepare_asio_drivernames();
*canmulti = 0; /* we will only support one asio device */
*nindevs = *noutdevs = (int)asioDrivers->getDriverNames(asio_drivernames,
maxndev);
for(int i = 0; i!= *nindevs; ++i)
{
sprintf(indevlist + i * devdescsize, "%s", asio_drivernames[i]);
sprintf(outdevlist + i * devdescsize, "%s", asio_drivernames[i]);
}
}
/* called on every dac~ send
* todo:
* - use vectorized functions
* - function pointer to avoid segfaults */
int asio_send_dacs(void)
{
t_sample * sp; /* sample pointer */
int i, j;
int timenow;
int timeref = sys_getrealtime();
#ifdef ASIODEBUG
if (!asio_driver)
{
error("ASIO not running");
return SENDDACS_NO;
}
#endif
/* send sound to ringbuffer */
sp = sys_soundout;
for (i = 0; i < sys_outchannels; i++)
{
memcpy(asio_ringbuffer[i] + asio_ringbuffer_inoffset, sp,
DEFDACBLKSIZE*sizeof(t_sample));
memset(sp, 0, DEFDACBLKSIZE*sizeof(t_sample));
sp+=DEFDACBLKSIZE;
}
/* get sound from ringbuffer */
sp = sys_soundin;
for (j = 0; j < sys_inchannels; j++)
{
memcpy(sp, asio_ringbuffer[i+j] + asio_ringbuffer_inoffset,
DEFDACBLKSIZE*sizeof(t_sample));
sp+=DEFDACBLKSIZE;
}
asio_ringbuffer_inoffset += DEFDACBLKSIZE;
if (asio_ringbuffer_inoffset >= asio_ringbuffer_outoffset + asio_bufsize)
{
pthread_cond_wait(&asio_ringbuf_cond, &asio_ringbuf_mutex);
if (asio_ringbuffer_inoffset == asio_ringbuffer_length)
{
asio_ringbuffer_outoffset = 0;
asio_ringbuffer_inoffset = 0;
}
else
asio_ringbuffer_outoffset += asio_bufsize;
}
if ((timenow = sys_getrealtime()) - timeref > 0.002)
{
return SENDDACS_SLEPT;
}
return SENDDACS_YES;
}
/* buffer switch callback */
void asio_bufferSwitch(long db_idx, ASIOBool directprocess)
{
ASIOTime time;
memset (&time, 0, sizeof (time));
/* todo: do we need to syncronize with other media ??? */
asio_bufferSwitchTimeInfo(&time, db_idx, directprocess);
}
/* sample rate change callback */
/* ASIO callback: the driver changed the sample rate behind our back;
 * record the new rate in the global.
 * NOTE(review): Pd's scheduler is not informed of the change here --
 * verify whether that is required. */
void asio_sampleRateDidChange(ASIOSampleRate srate)
{
    asio_srate = srate;
#ifdef ASIODEBUG
    post("sample rate changed");
#endif
}
/* asio messaging callback */
/* ASIO host message callback.  No selectors are handled yet, so every
 * query is answered with 0 ("not supported" / no information). */
long asio_messages(long selector, long value, void* message, double* opt)
{
    /* todo: handle kAsioResetRequest, kAsioResyncRequest, ... */
    return 0L;
}
/* Main ASIO processing callback: for every active channel, convert one
 * hardware buffer between the ASIO double-buffer (index db_idx) and the
 * transfer ringbuffer, then wake asio_send_dacs().  Channels
 * 0..sys_outchannels-1 are outputs, the remainder inputs, matching the
 * layout built in asio_open_audio().  Returns 0 instead of a filled-in
 * ASIOTime (still marked todo upstream).
 * Fix: the loop previously ran over the hardware channel counts
 * (asio_inchannels + asio_outchannels) although asio_bufferinfo,
 * asio_channelinfo and asio_ringbuffer are sized for
 * sys_inchannels + sys_outchannels, reading past the end of all three
 * arrays whenever Pd uses fewer channels than the hardware offers. */
ASIOTime *asio_bufferSwitchTimeInfo(ASIOTime *params, long db_idx,
    ASIOBool directprocess)
{
    long i;
    // todo: store the timeInfo for later use
    /* todo: i'm not sure if we'll have to synchronize with other media ...
     * probably yes ... */
    /* sys_reftime = get_sys_reference_time(); */
    /* perform the processing
     * todo: improve input latency
     */
    for (i = 0; i < sys_outchannels + sys_inchannels; i++)
    {
        if (asio_bufferinfo[i].isInput != ASIOTrue)
        {
            /* output: ringbuffer -> hardware */
            asio_convert_and_send(asio_ringbuffer[i]+asio_ringbuffer_outoffset,
                (void*) asio_bufferinfo[i].buffers[db_idx],
                asio_channelinfo[i].type, asio_bufsize);
        }
        else /* these are the input channels */
        {
            asio_convert_and_receive((void*)asio_bufferinfo[i].buffers[db_idx],
                asio_ringbuffer[i]+asio_ringbuffer_outoffset,
                asio_channelinfo[i].type, asio_bufsize);
        }
    }
    /* wake up asio_send_dacs() */
    pthread_cond_broadcast(&asio_ringbuf_cond);
    if(asio_useoutputready)
        ASIOOutputReady();
    return 0L; /* time info!!! */
}
/* get system reference time on both platforms */
/* Return a millisecond reference time on the supported platforms.
 * Fix: when neither WINDOWS nor MAC is defined the function previously
 * fell off the end without a return statement (undefined behavior);
 * it now returns 0 in that case. */
static unsigned long get_sys_reference_time(void)
{
#if WINDOWS
    return timeGetTime();   /* milliseconds since system start */
#elif MAC
    static const double twoRaisedTo32 = 4294967296.;
    UnsignedWide ys;
    Microseconds(&ys);
    double r = ((double)ys.hi * twoRaisedTo32 + (double)ys.lo);
    return (unsigned long)(r / 1000.);  /* microseconds -> milliseconds */
#else
    return 0;   /* no reference clock available on this platform */
#endif
}
/* sample converting helper functions */
/* Convert one channel of t_sample data ('source', 'bufsize' frames)
 * into the driver's native sample format 'format' and write it to the
 * hardware buffer 'dest'.  Only the little-endian 16/24/32-bit integer
 * and 32-bit float formats are implemented; every other format prints
 * a rate-limited warning and leaves 'dest' untouched. */
void asio_convert_and_send(t_sample* source, void* dest, ASIOSampleType format, long bufsize)
{
#ifdef ASIODEBUG
    /* post("ASIO: Sample Type %d", format); */
#endif
    switch (format)
    {
    case ASIOSTInt16LSB:
        /* e.g. m audio quattro */
        float32toInt16(source, dest, bufsize);
        break;
    case ASIOSTFloat32LSB: // IEEE 754 32 bit float, as found on Intel x86 architecture
        /* same layout as t_sample -- straight copy */
        memcpy (dest, source, bufsize * sizeof (float)); /* check */
        break;
    case ASIOSTInt24LSB: // used for 20 bits as well
        float32toInt24(source, dest, bufsize);
        break;
    case ASIOSTInt32LSB:
        float32toInt32(source, dest, bufsize);
        break;
    case ASIOSTFloat64LSB: // IEEE 754 64 bit double float, as found on Intel x86 architecture
    // these are used for 32 bit data buffer, with different alignment of the data inside
    // 32 bit PCI bus systems can more easily used with these
    case ASIOSTInt32LSB16: // 32 bit data with 18 bit alignment
    case ASIOSTInt32LSB18: // 32 bit data with 18 bit alignment
    case ASIOSTInt32LSB20: // 32 bit data with 20 bit alignment
    case ASIOSTInt32LSB24: // 32 bit data with 24 bit alignment
    case ASIOSTInt16MSB:
    case ASIOSTInt24MSB: // used for 20 bits as well
    case ASIOSTInt32MSB:
    case ASIOSTFloat32MSB: // IEEE 754 32 bit float, as found on Intel x86 architecture
    case ASIOSTFloat64MSB: // IEEE 754 64 bit double float, as found on Intel x86 architecture
    // these are used for 32 bit data buffer, with different alignment of the data inside
    // 32 bit PCI bus systems can more easily used with these
    case ASIOSTInt32MSB16: // 32 bit data with 18 bit alignment
    case ASIOSTInt32MSB18: // 32 bit data with 18 bit alignment
    case ASIOSTInt32MSB20: // 32 bit data with 20 bit alignment
    case ASIOSTInt32MSB24: // 32 bit data with 24 bit alignment
    {
        /* warn only three times so the audio callback isn't flooded */
        static int written = 0;
        if(written < 3) {
            post("Output sample Type %d not supported, yet!!!",format);
            ++written;
        }
    }
    }
}
/* Convert one channel of hardware input ('source', 'bufsize' frames in
 * the driver's native format) to t_sample data in 'dest'.  Mirror image
 * of asio_convert_and_send(): only little-endian 16/24/32-bit integer
 * and 32-bit float formats are implemented; other formats print a
 * rate-limited warning and leave 'dest' untouched. */
void asio_convert_and_receive (void* source, t_sample* dest, ASIOSampleType format, long bufsize)
{
#ifdef ASIODEBUG
    /* post("ASIO: Sample Type %d", format); */
#endif
    switch (format)
    {
    case ASIOSTInt16LSB:
        Int16tofloat32(source, dest, bufsize);
        break;
    case ASIOSTFloat32LSB: // IEEE 754 32 bit float, as found on Intel x86 architecture
        /* same layout as t_sample -- straight copy */
        memcpy (dest, source, bufsize * sizeof (float)); /* check */
        break;
    case ASIOSTInt24LSB: // used for 20 bits as well
        Int24tofloat32(source, dest, bufsize);
        break;
    case ASIOSTInt32LSB:
        Int32tofloat32(source, dest, bufsize);
        break;
    case ASIOSTFloat64LSB: // IEEE 754 64 bit double float, as found on Intel x86 architecture
    // these are used for 32 bit data buffer, with different alignment of the data inside
    // 32 bit PCI bus systems can more easily used with these
    case ASIOSTInt32LSB16: // 32 bit data with 18 bit alignment
    case ASIOSTInt32LSB18: // 32 bit data with 18 bit alignment
    case ASIOSTInt32LSB20: // 32 bit data with 20 bit alignment
    case ASIOSTInt32LSB24: // 32 bit data with 24 bit alignment
    case ASIOSTInt16MSB:
    case ASIOSTInt24MSB: // used for 20 bits as well
    case ASIOSTInt32MSB:
    case ASIOSTFloat32MSB: // IEEE 754 32 bit float, as found on Intel x86 architecture
    case ASIOSTFloat64MSB: // IEEE 754 64 bit double float, as found on Intel x86 architecture
    // these are used for 32 bit data buffer, with different alignment of the data inside
    // 32 bit PCI bus systems can more easily used with these
    case ASIOSTInt32MSB16: // 32 bit data with 18 bit alignment
    case ASIOSTInt32MSB18: // 32 bit data with 18 bit alignment
    case ASIOSTInt32MSB20: // 32 bit data with 20 bit alignment
    case ASIOSTInt32MSB24: // 32 bit data with 24 bit alignment
    {
        /* warn only three times so the audio callback isn't flooded */
        static int written = 0;
        if(written < 3) {
            post("Input sample Type %d not supported, yet!!!",format);
            ++written;
        }
    }
    }
}
/* sample conversion functions */
#define SCALE_INT16 32767.f /* (- (expt 2 15) 1) */
#define SCALE_INT24 8388607.f /* (- (expt 2 23) 1) */
#define SCALE_INT32 2147483647.f /* (- (expt 2 31) 1) */
/* Convert 'frames' 32-bit float samples in [-1,1] to 16-bit signed
 * integers, scaling by 32767 (SCALE_INT16 = 2^15 - 1). */
void float32toInt16(float* inbuffer, void* outbuffer, long frames)
{
    short* dst = (short*)outbuffer;
    long i;
    for (i = 0; i < frames; i++)
        dst[i] = (short)(inbuffer[i] * 32767.f);
}
/* Convert 'frames' 16-bit signed integer samples to 32-bit floats in
 * [-1,1], scaling by 1/32767 (1/SCALE_INT16). */
void Int16tofloat32(void* inbuffer, float* outbuffer, long frames)
{
    short* src = (short*)inbuffer;
    long i;
    for (i = 0; i < frames; i++)
        outbuffer[i] = (float)(src[i] * (1.f / 32767.f));
}
/* Convert 'frames' 32-bit float samples to 24-bit integers stored in
 * 32-bit words, scaling by 8388607 (SCALE_INT24 = 2^23 - 1).
 * Fix: the input pointer was never advanced, so the first input sample
 * was converted over and over for the whole buffer.
 * NOTE(review): each sample is written as a 4-byte int; a driver using
 * packed 3-byte ASIOSTInt24LSB frames would need 3-byte stores --
 * verify against the actual buffer layout. */
void float32toInt24(float* inbuffer, void* outbuffer, long frames)
{
    int* out = (int*)outbuffer;
    while (frames--)
    {
        *out++ = (int)(*inbuffer++ * 8388607.f);
    }
}
/* Convert 'frames' 24-bit integer samples (stored in 32-bit words) to
 * 32-bit floats in [-1,1], scaling by 1/8388607 (1/SCALE_INT24). */
void Int24tofloat32(void* inbuffer, float* outbuffer, long frames)
{
    int* src = (int*)inbuffer;
    long i;
    for (i = 0; i < frames; i++)
        outbuffer[i] = (float)(src[i] * (1.f / 8388607.f));
}
/* Convert 'frames' 32-bit float samples to 32-bit signed integers,
 * scaling by 2147483647 (SCALE_INT32 = 2^31 - 1).
 * Fix: the input pointer was never advanced, so the first input sample
 * was converted over and over for the whole buffer.
 * NOTE(review): values at +/-1.0 can overflow the int range after the
 * float rounding of the scale constant -- confirm the driver clips. */
void float32toInt32(float* inbuffer, void* outbuffer, long frames)
{
    long* out = (long*)outbuffer;
    while (frames--)
    {
        *out++ = (long)(*inbuffer++ * 2147483647.f);
    }
}
/* Convert 'frames' 32-bit signed integer samples to 32-bit floats in
 * [-1,1], scaling by 1/2147483647 (1/SCALE_INT32). */
void Int32tofloat32(void* inbuffer, float* outbuffer, long frames)
{
    long* src = (long*)inbuffer;
    long i;
    for (i = 0; i < frames; i++)
        outbuffer[i] = (float)(src[i] * (1.f / 2147483647.f));
}
/* some local helper functions */
/* Lazily allocate the global table of ASIO driver-name buffers
 * (MAXNDEV entries of 32 chars each, the ASIO driver-name length).
 * Subsequent calls are no-ops. */
inline void prepare_asio_drivernames(void)
{
    if (asio_drivernames != NULL)
        return;
    asio_drivernames = (char**)getbytes(MAXNDEV * sizeof(char*));
    for (int slot = 0; slot < MAXNDEV; ++slot)
        asio_drivernames[slot] = (char*)getbytes(32 * sizeof(char));
}
#endif /* USEAPI_ASIO */
--- NEW FILE: m_simd.c ---
/*
Implementation of general vectorized functions
added by T.Grill
*/
#include "m_pd.h"
#include "m_simd.h"
/* Zero out n samples (n assumed non-negative), processed in chunks of
 * eight; a remainder beyond the last full chunk is left untouched,
 * exactly like the unrolled original. */
void zerovec_8(t_float *dst,int n)
{
    int i, total = (n >> 3) << 3;   /* largest multiple of 8 <= n */
    for (i = 0; i < total; i++)
        dst[i] = 0;
}
/* Fill n samples with the constant v (n assumed non-negative),
 * processed in chunks of eight; a remainder beyond the last full chunk
 * is left untouched, exactly like the unrolled original. */
void setvec_8(t_float *dst,t_float v,int n)
{
    int i, total = (n >> 3) << 3;   /* largest multiple of 8 <= n */
    for (i = 0; i < total; i++)
        dst[i] = v;
}
/* Copy n samples from src to dst (n assumed non-negative), processed
 * in chunks of eight; a remainder beyond the last full chunk is left
 * untouched, exactly like the unrolled original. */
void copyvec_8(t_float *dst,const t_float *src,int n)
{
    int i, total = (n >> 3) << 3;   /* largest multiple of 8 <= n */
    for (i = 0; i < total; i++)
        dst[i] = src[i];
}
/* Accumulate n samples of src into dst (n assumed non-negative),
 * processed in chunks of eight; a remainder beyond the last full chunk
 * is left untouched, exactly like the unrolled original. */
void addvec_8(t_float *dst,const t_float *src,int n)
{
    int i, total = (n >> 3) << 3;   /* largest multiple of 8 <= n */
    for (i = 0; i < total; i++)
        dst[i] += src[i];
}
/* Copy n samples from src to dst, replacing denormal/huge values
 * (as classified by PD_BIGORSMALL) with zero. */
void testcopyvec_8(t_float *dst,const t_float *src,int n)
{
    int i;
    for (i = 0; i < n; i++)
        dst[i] = PD_BIGORSMALL(src[i]) ? 0 : src[i];
}
/* Accumulate n samples of src into dst, treating denormal/huge values
 * (as classified by PD_BIGORSMALL) as zero. */
void testaddvec_8(t_float *dst,const t_float *src,int n)
{
    int i;
    for (i = 0; i < n; i++)
        dst[i] += PD_BIGORSMALL(src[i]) ? 0 : src[i];
}
/* Runtime wrapper around the SIMD_CHECK1 macro (m_simd.h): nonzero when
 * vector length n and ptr1 satisfy the SIMD codepath's requirements. */
int simd_check1(t_int n, t_float* ptr1)
{
    return SIMD_CHECK1(n,ptr1);
}
/* As simd_check1, for two vectors. */
int simd_check2(t_int n, t_float* ptr1, t_float* ptr2)
{
    return SIMD_CHECK2(n,ptr1,ptr2);
}
/* As simd_check1, for three vectors. */
int simd_check3(t_int n, t_float* ptr1, t_float* ptr2, t_float* ptr3)
{
    return SIMD_CHECK3(n,ptr1,ptr2,ptr3);
}
#ifdef DONTUSESIMD
/* SIMD disabled at compile time: report "no SIMD available" and route
 * every *_simd entry point to its scalar *_8 counterpart. */
int simd_runtime_check()
{
    return 0;
}
/* tb: wrapper for simd functions */
void zerovec_simd(t_float *dst,int n)
{
    zerovec_8(dst,n);
}
void setvec_simd(t_float *dst,t_float v,int n)
{
    setvec_8(dst,v,n);
}
void copyvec_simd(t_float *dst,const t_float *src,int n)
{
    copyvec_8(dst,src,n);
}
void addvec_simd(t_float *dst,const t_float *src,int n)
{
    addvec_8(dst,src,n);
}
void testcopyvec_simd(t_float *dst,const t_float *src,int n)
{
    testcopyvec_8(dst,src,n);
}
void testaddvec_simd(t_float *dst,const t_float *src,int n)
{
    testaddvec_8(dst,src,n);
}
#endif /* DONTUSESIMD */
--- NEW FILE: m_simd_sse_gcc.c ---
/*
Implementation of SIMD functionality for Intel SSE with GCC compiler
added by T.Grill
*/
#include "m_pd.h"
#include "m_simd.h"
#if defined(__GNUC__) && (defined(_X86_) || defined(__i386__) || defined(__i586__) || defined(__i686__)) && !(defined DONTUSESIMD)
/* TB: adapted from thomas' vc routines */
/* dst is assumed to be aligned */
/* SSE: set n floats to zero.  dst must be 16-byte aligned and n a
 * positive multiple of 16 (the count is divided by 16; each iteration
 * stores four xmm-wide zero vectors).
 * NOTE(review): the asm modifies its input operands (%0 via shr, %1 via
 * addl) without "+" read-write constraints -- accepted by the era's
 * gcc, but formally invalid; confirm before changing compilers. */
void zerovec_simd(t_float *dst,int n)
{
    asm(
    ".set T_FLOAT,4 \n" /* sizeof(t_float) */
    "xorps %%xmm0, %%xmm0 \n" /* zero value */
    "shr $4, %0 \n"
    /* should we do more loop unrolling? */
    /* *dst = 0 */
    "1: \n"
    "movaps %%xmm0, (%1) \n"
    "movaps %%xmm0, 4*T_FLOAT(%1) \n"
    "movaps %%xmm0, 8*T_FLOAT(%1) \n"
    "movaps %%xmm0, 12*T_FLOAT(%1) \n"
    "addl $16*T_FLOAT,%1 \n"
    "loop 1b \n"
    :
    :"c"(n),"r"(dst)
    :"%xmm0");
}
/* dst is assumed to be aligned */
/* SSE: fill n floats with the constant v.  dst must be 16-byte aligned
 * and n a positive multiple of 16; shufps broadcasts v to all four
 * lanes of its xmm register first. */
void setvec_simd(t_float *dst,t_float v,int n)
{
    asm(
    ".set T_FLOAT,4 \n" /* sizeof(t_float) */
    "shufps $0,%2,%2 \n" /* load value */
    "shr $4,%0 \n"
    /* should we do more loop unrolling? */
    /* *dst = v */
    "1: \n"
    "movaps %2, (%1) \n"
    "movaps %2, 4*T_FLOAT(%1) \n"
    "movaps %2, 8*T_FLOAT(%1) \n"
    "movaps %2, 12*T_FLOAT(%1) \n"
    "addl $16*T_FLOAT,%1 \n"
    "loop 1b \n"
    :
    :"c"(n),"r"(dst),"x"((t_float)v)
    );
}
/* dst and src are assumed to be aligned */
/* SSE: copy n floats from src to dst.  Both pointers must be 16-byte
 * aligned and n a positive multiple of 16; four vectors are moved per
 * iteration through xmm0-xmm3. */
void copyvec_simd(t_float *dst,const t_float *src,int n)
{
    asm(
    ".set T_FLOAT,4 \n" /* sizeof(t_float) */
    "shr $4, %0 \n"
    /* loop: *dst = *src */
    "1: \n"
    "movaps (%1), %%xmm0 \n"
    "movaps 4*T_FLOAT(%1), %%xmm1 \n"
    "movaps 8*T_FLOAT(%1), %%xmm2 \n"
    "movaps 12*T_FLOAT(%1), %%xmm3 \n"
    "movaps %%xmm0, (%2) \n"
    "movaps %%xmm1, 4*T_FLOAT(%2) \n"
    "movaps %%xmm2, 8*T_FLOAT(%2) \n"
    "movaps %%xmm3, 12*T_FLOAT(%2) \n"
    "addl $16*T_FLOAT,%1 \n"
    "addl $16*T_FLOAT,%2 \n"
    "loop 1b \n"
    :
    :"c"(n),"r"(src),"r"(dst)
    :"%xmm0","%xmm1","%xmm2","%xmm3");
}
/* dst and src are assumed to be aligned */
/* SSE: dst[i] += src[i] for n floats.  Both pointers must be 16-byte
 * aligned and n a positive multiple of 16; %3 is a zero-initialized
 * index register advanced by 64 bytes per iteration. */
void addvec_simd(t_float *dst,const t_float *src,int n)
{
    asm(
    ".set T_FLOAT,4 \n" /* sizeof(t_float) */
    "shr $4, %0 \n"
    /* loop: *dst += *src */
    "1: \n"
    "movaps (%2,%3),%%xmm0 \n"
    "movaps (%1,%3),%%xmm1 \n"
    "addps %%xmm0,%%xmm1 \n"
    "movaps %%xmm1,(%2,%3) \n"
    "movaps 4*T_FLOAT(%2,%3),%%xmm0 \n"
    "movaps 4*T_FLOAT(%1,%3),%%xmm1 \n"
    "addps %%xmm0,%%xmm1 \n"
    "movaps %%xmm1,4*T_FLOAT(%2,%3) \n"
    "movaps 8*T_FLOAT(%2,%3),%%xmm0 \n"
    "movaps 8*T_FLOAT(%1,%3),%%xmm1 \n"
    "addps %%xmm0,%%xmm1 \n"
    "movaps %%xmm1,8*T_FLOAT(%2,%3) \n"
    "movaps 12*T_FLOAT(%2,%3),%%xmm0 \n"
    "movaps 12*T_FLOAT(%1,%3),%%xmm1 \n"
    "addps %%xmm0,%%xmm1 \n"
    "movaps %%xmm1,12*T_FLOAT(%2,%3) \n"
    "addl $16*T_FLOAT,%3 \n"
    "loop 1b \n"
    :
    : "c"(n),"r"(src),"r"(dst),"r"(0)
    : "%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%xmm5","%xmm6","%xmm7");
}
/* dst is assumed to be aligned */
/* SSE: fill n floats with v -- the body is identical to setvec_simd();
 * presumably a placeholder until a real denormal-test kernel exists.
 * dst must be 16-byte aligned and n a positive multiple of 16. */
void testvec_simd(t_float *dst,t_float v,int n)
{
    asm(
    ".set T_FLOAT,4 \n" /* sizeof(t_float) */
    "shufps $0,%2,%2 \n" /* load value */
    "shr $4,%0 \n"
    /* should we do more loop unrolling? */
    /* *dst = v */
    "1: \n"
    "movaps %2, (%1) \n"
    "movaps %2, 4*T_FLOAT(%1) \n"
    "movaps %2, 8*T_FLOAT(%1) \n"
    "movaps %2, 12*T_FLOAT(%1) \n"
    "addl $16*T_FLOAT,%1 \n"
    "loop 1b \n"
    :
    :"c"(n),"r"(dst),"x"((t_float)v)
    );
}
/*
 * if we switch on DAZ, we shouldn't have problems with denormals
 * any more ... tb
 */
/* Denormal-stripping copy: the SSE path (valid only with DAZ enabled)
 * is compiled out via "#if 0", so this falls back to the scalar C
 * version testcopyvec_8(). */
void testcopyvec_simd(t_float *dst,const t_float *src,int n)
{
#if 0 //def DAZ
    copyvec_simd(dst,src,n);
#else
    testcopyvec_8(dst,src,n); /* SIMD not implemented */
#endif
}
/* Denormal-stripping accumulate: likewise falls back to the scalar C
 * version testaddvec_8() while the SSE path is disabled. */
void testaddvec_simd(t_float *dst,const t_float *src,int n)
{
#if 0 //DAZ
    addvec_simd(dst,src,n);
#else
    testaddvec_8(dst,src,n); /* SIMD not implemented */
#endif
}
/* Pd perform routine: zero the signal vector at w[1], length w[2]. */
t_int *zero_perf_simd(t_int *w)
{
    zerovec_simd((t_float *)w[1],w[2]);
    return w+3;
}
/* Pd perform routine: copy signal vector w[1] to w[2], length w[3]. */
t_int *copy_perf_simd(t_int *w)
{
    copyvec_simd((t_float *)w[2],(const t_float *)w[1],w[3]);
    return w+4;
}
/* Pd perform routine for sig~: fill w[2] (length w[3]) with the scalar
 * value stored at the address w[1]. */
t_int *sig_tilde_perf_simd(t_int *w)
{
    setvec_simd((t_float *)w[2],*(const t_float *)w[1],w[3]);
    return w+4;
}
/* Pd perform routine, SSE: out[i] = in1[i] + in2[i].
 * w layout: w[1]=in1, w[2]=in2, w[3]=out, w[4]=n (multiple of 16);
 * all vectors 16-byte aligned.  %4 is a zero-initialized index register
 * advanced by 64 bytes per iteration. */
t_int *plus_perf_simd (t_int * w)
{
    asm(
    ".set T_FLOAT,4 \n"
    "shrl $4, %3 \n" /* divide by 16 */
    /* loop: *out = *in1 + *in2 */
    "1: \n"
    "movaps (%0,%4), %%xmm0 \n"
    "movaps (%1,%4), %%xmm1 \n"
    "addps %%xmm1, %%xmm0 \n"
    "movaps %%xmm0, (%2,%4) \n"
    "movaps 4*T_FLOAT(%0,%4), %%xmm2 \n"
    "movaps 4*T_FLOAT(%1,%4), %%xmm3 \n"
    "addps %%xmm3, %%xmm2 \n"
    "movaps %%xmm2, 4*T_FLOAT(%2,%4) \n"
    "movaps 8*T_FLOAT(%0,%4), %%xmm4 \n"
    "movaps 8*T_FLOAT(%1,%4), %%xmm5 \n"
    "addps %%xmm5, %%xmm4 \n"
    "movaps %%xmm4, 8*T_FLOAT(%2,%4) \n"
    "movaps 12*T_FLOAT(%0,%4), %%xmm6 \n"
    "movaps 12*T_FLOAT(%1,%4), %%xmm7 \n"
    "addps %%xmm7, %%xmm6 \n"
    "movaps %%xmm6, 12*T_FLOAT(%2,%4) \n"
    "addl $16*T_FLOAT, %4 \n"
    "loop 1b \n"
    :
    /* in1, in2, out, n */
    :"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4]),"r"(0)
    :"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%xmm5","%xmm6","%xmm7"
    );
    return w+5;
}
/* Pd perform routine, SSE: out[i] = in[i] + value.
 * w layout: w[1]=in, w[2]=pointer to the scalar t_float, w[3]=out,
 * w[4]=n (multiple of 16); vectors 16-byte aligned.
 * Fix: w[2] holds the ADDRESS of the scalar and must be dereferenced
 * (as sig_tilde_perf_simd does) -- the old "x"((t_float)w[2]) operand
 * converted the pointer value itself to float, adding garbage. */
t_int *scalarplus_perf_simd(t_int *w)
{
    asm(
    ".set T_FLOAT,4 \n"
    "shufps $0, %1, %1 \n"
    "shrl $4, %3 \n" /* divide by 16 */
    /* loop: *out = *in + value */
    "1: \n"
    "movaps (%0), %%xmm1 \n"
    "addps %1, %%xmm1 \n"
    "movaps %%xmm1, (%2) \n"
    "movaps 4*T_FLOAT(%0), %%xmm2 \n"
    "addps %1, %%xmm2 \n"
    "movaps %%xmm2, 4*T_FLOAT(%2) \n"
    "movaps 8*T_FLOAT(%0), %%xmm3 \n"
    "addps %1, %%xmm3 \n"
    "movaps %%xmm3, 8*T_FLOAT(%2) \n"
    "movaps 12*T_FLOAT(%0), %%xmm4 \n"
    "addps %1, %%xmm4 \n"
    "movaps %%xmm4, 12*T_FLOAT(%2) \n"
    "addl $16*T_FLOAT, %0 \n"
    "addl $16*T_FLOAT, %2 \n"
    "loop 1b \n"
    :
    /* in, value, out, n */
    :"r"(w[1]),"x"(*(t_float *)w[2]),"r"(w[3]),"c"(w[4])
    :"%xmm1","%xmm2","%xmm3","%xmm4"
    );
    return w+5;
}
/* Pd perform routine, SSE: out[i] = in1[i] - in2[i].
 * w layout: w[1]=in1, w[2]=in2, w[3]=out, w[4]=n (multiple of 16);
 * all vectors 16-byte aligned. */
t_int *minus_perf_simd(t_int *w)
{
    asm(
    ".set T_FLOAT,4 \n"
    "shrl $4, %3 \n" /* divide by 16 */
    /* loop: *out = *in1 - *in2 */
    "1: \n"
    "movaps (%0,%4), %%xmm0 \n"
    "movaps (%1,%4), %%xmm1 \n"
    "subps %%xmm1, %%xmm0 \n"
    "movaps %%xmm0, (%2,%4) \n"
    "movaps 4*T_FLOAT(%0,%4), %%xmm2 \n"
    "movaps 4*T_FLOAT(%1,%4), %%xmm3 \n"
    "subps %%xmm3, %%xmm2 \n"
    "movaps %%xmm2, 4*T_FLOAT(%2,%4) \n"
    "movaps 8*T_FLOAT(%0,%4), %%xmm4 \n"
    "movaps 8*T_FLOAT(%1,%4), %%xmm5 \n"
    "subps %%xmm5, %%xmm4 \n"
    "movaps %%xmm4, 8*T_FLOAT(%2,%4) \n"
    "movaps 12*T_FLOAT(%0,%4), %%xmm6 \n"
    "movaps 12*T_FLOAT(%1,%4), %%xmm7 \n"
    "subps %%xmm7, %%xmm6 \n"
    "movaps %%xmm6, 12*T_FLOAT(%2,%4) \n"
    "addl $16*T_FLOAT, %4 \n"
    "loop 1b \n"
    :
    /* in1, in2, out, n */
    :"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4]),"r"(0)
    :"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%xmm5","%xmm6","%xmm7"
    );
    return w+5;
}
/* Pd perform routine, SSE: out[i] = in[i] - value.
 * w layout: w[1]=in, w[2]=pointer to the scalar t_float, w[3]=out,
 * w[4]=n (multiple of 16); vectors 16-byte aligned.
 * Fix: w[2] holds the ADDRESS of the scalar and must be dereferenced
 * -- the old "x"((t_float)w[2]) operand converted the pointer value
 * itself to float, subtracting garbage. */
t_int* scalarminus_perf_simd(t_int *w)
{
    asm(
    ".set T_FLOAT,4 \n"
    "shufps $0, %1, %1 \n"
    "shrl $4, %3 \n" /* divide by 16 */
    /* loop: *out = *in - value */
    "1: \n"
    "movaps (%0), %%xmm1 \n"
    "subps %1, %%xmm1 \n"
    "movaps %%xmm1, (%2) \n"
    "movaps 4*T_FLOAT(%0), %%xmm2 \n"
    "subps %1, %%xmm2 \n"
    "movaps %%xmm2, 4*T_FLOAT(%2) \n"
    "movaps 8*T_FLOAT(%0), %%xmm3 \n"
    "subps %1, %%xmm3 \n"
    "movaps %%xmm3, 8*T_FLOAT(%2) \n"
    "movaps 12*T_FLOAT(%0), %%xmm4 \n"
    "subps %1, %%xmm4 \n"
    "movaps %%xmm4, 12*T_FLOAT(%2) \n"
    "addl $16*T_FLOAT, %0 \n"
    "addl $16*T_FLOAT, %2 \n"
    "loop 1b \n"
    :
    /* in, value, out, n */
    :"r"(w[1]),"x"(*(t_float *)w[2]),"r"(w[3]),"c"(w[4])
    :"%xmm1","%xmm2","%xmm3","%xmm4"
    );
    return w+5;
}
/* Pd perform routine, SSE: out[i] = in1[i] * in2[i].
 * w layout: w[1]=in1, w[2]=in2, w[3]=out, w[4]=n (multiple of 16);
 * all vectors 16-byte aligned. */
t_int *times_perf_simd(t_int *w)
{
    asm(
    ".set T_FLOAT,4 \n"
    "shrl $4, %3 \n" /* divide by 16 */
    /* loop: *out = *in1 * *in2 */
    "1: \n"
    "movaps (%0,%4), %%xmm0 \n"
    "movaps (%1,%4), %%xmm1 \n"
    "mulps %%xmm1, %%xmm0 \n"
    "movaps %%xmm0, (%2,%4) \n"
    "movaps 4*T_FLOAT(%0,%4), %%xmm2 \n"
    "movaps 4*T_FLOAT(%1,%4), %%xmm3 \n"
    "mulps %%xmm3, %%xmm2 \n"
    "movaps %%xmm2, 4*T_FLOAT(%2,%4) \n"
    "movaps 8*T_FLOAT(%0,%4), %%xmm4 \n"
    "movaps 8*T_FLOAT(%1,%4), %%xmm5 \n"
    "mulps %%xmm5, %%xmm4 \n"
    "movaps %%xmm4, 8*T_FLOAT(%2,%4) \n"
    "movaps 12*T_FLOAT(%0,%4), %%xmm6 \n"
    "movaps 12*T_FLOAT(%1,%4), %%xmm7 \n"
    "mulps %%xmm7, %%xmm6 \n"
    "movaps %%xmm6, 12*T_FLOAT(%2,%4) \n"
    "addl $16*T_FLOAT, %4 \n"
    "loop 1b \n"
    :
    /* in1, in2, out, n */
    :"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4]),"r"(0)
    :"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%xmm5","%xmm6","%xmm7"
    );
    return w+5;
}
/* Pd perform routine, SSE: out[i] = in[i] * value.
 * w layout: w[1]=in, w[2]=pointer to the scalar t_float, w[3]=out,
 * w[4]=n (multiple of 16); vectors 16-byte aligned.
 * Fix: w[2] holds the ADDRESS of the scalar and must be dereferenced
 * -- the old "x"((t_float)w[2]) operand converted the pointer value
 * itself to float, multiplying by garbage. */
t_int* scalartimes_perf_simd(t_int *w)
{
    asm(
    ".set T_FLOAT,4 \n"
    "shufps $0, %1, %1 \n"
    "shrl $4, %3 \n" /* divide by 16 */
    /* loop: *out = *in * value */
    "1: \n"
    "movaps (%0), %%xmm1 \n"
    "mulps %1, %%xmm1 \n"
    "movaps %%xmm1, (%2) \n"
    "movaps 4*T_FLOAT(%0), %%xmm2 \n"
    "mulps %1, %%xmm2 \n"
    "movaps %%xmm2, 4*T_FLOAT(%2) \n"
    "movaps 8*T_FLOAT(%0), %%xmm3 \n"
    "mulps %1, %%xmm3 \n"
    "movaps %%xmm3, 8*T_FLOAT(%2) \n"
    "movaps 12*T_FLOAT(%0), %%xmm4 \n"
    "mulps %1, %%xmm4 \n"
    "movaps %%xmm4, 12*T_FLOAT(%2) \n"
    "addl $16*T_FLOAT, %0 \n"
    "addl $16*T_FLOAT, %2 \n"
    "loop 1b \n"
    :
    /* in, value, out, n */
    :"r"(w[1]),"x"(*(t_float *)w[2]),"r"(w[3]),"c"(w[4])
    :"%xmm1","%xmm2","%xmm3","%xmm4"
    );
    return w+5;
}
/* Pd perform routine, SSE: out[i] = in[i] * in[i].
 * w layout: w[1]=in, w[2]=out, w[3]=n (multiple of 16); vectors
 * 16-byte aligned.
 * Fix: the stores used the bare output register "(%1)" while only the
 * index %3 was advanced, so every iteration overwrote the first 16
 * output floats -- the stores are now indexed "(%1,%3)" like the
 * loads. */
t_int *sqr_perf_simd(t_int *w)
{
    asm(
    ".set T_FLOAT,4 \n"
    "shrl $4, %2 \n" /* divide by 16 */
    /* loop: *out = *in * *in */
    "1: \n"
    "movaps (%0,%3), %%xmm0 \n"
    "mulps %%xmm0, %%xmm0 \n"
    "movaps %%xmm0, (%1,%3) \n"
    "movaps 4*T_FLOAT(%0,%3), %%xmm1 \n"
    "mulps %%xmm1, %%xmm1 \n"
    "movaps %%xmm1, 4*T_FLOAT(%1,%3) \n"
    "movaps 8*T_FLOAT(%0,%3), %%xmm2 \n"
    "mulps %%xmm2, %%xmm2 \n"
    "movaps %%xmm2, 8*T_FLOAT(%1,%3) \n"
    "movaps 12*T_FLOAT(%0,%3), %%xmm3 \n"
    "mulps %%xmm3, %%xmm3 \n"
    "movaps %%xmm3, 12*T_FLOAT(%1,%3) \n"
    "addl $16*T_FLOAT, %3 \n"
    "loop 1b \n"
    :
    /* in, out, n */
    :"r"(w[1]),"r"(w[2]),"c"(w[3]),"r"(0)
    :"%xmm0","%xmm1","%xmm2","%xmm3"
    );
    return w+4;
}
/* Pd perform routine, SSE: out[i] = in1[i] / in2[i].
 * w layout: w[1]=in1, w[2]=in2, w[3]=out, w[4]=n (multiple of 16);
 * all vectors 16-byte aligned.  No zero-divisor guard -- divps yields
 * inf/nan in that case. */
t_int* over_perf_simd(t_int * w)
{
    asm(
    ".set T_FLOAT,4 \n"
    "shrl $4, %3 \n" /* divide by 16 */
    /* loop: *out = *in1 / *in2 */
    "1: \n"
    "movaps (%0,%4), %%xmm0 \n"
    "movaps (%1,%4), %%xmm1 \n"
    "divps %%xmm1, %%xmm0 \n"
    "movaps %%xmm0, (%2,%4) \n"
    "movaps 4*T_FLOAT(%0,%4), %%xmm2 \n"
    "movaps 4*T_FLOAT(%1,%4), %%xmm3 \n"
    "divps %%xmm3, %%xmm2 \n"
    "movaps %%xmm2, 4*T_FLOAT(%2,%4) \n"
    "movaps 8*T_FLOAT(%0,%4), %%xmm4 \n"
    "movaps 8*T_FLOAT(%1,%4), %%xmm5 \n"
    "divps %%xmm5, %%xmm4 \n"
    "movaps %%xmm4, 8*T_FLOAT(%2,%4) \n"
    "movaps 12*T_FLOAT(%0,%4), %%xmm6 \n"
    "movaps 12*T_FLOAT(%1,%4), %%xmm7 \n"
    "divps %%xmm7, %%xmm6 \n"
    "movaps %%xmm6, 12*T_FLOAT(%2,%4) \n"
    "addl $16*T_FLOAT, %4 \n"
    "loop 1b \n"
    :
    /* in1, in2, out, n */
    :"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4]),"r"(0)
    :"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%xmm5","%xmm6","%xmm7"
    );
    return w+5;
}
/* Pd perform routine, SSE: out[i] = in[i] / value.
 * w layout: w[1]=in, w[2]=pointer to the scalar t_float, w[3]=out,
 * w[4]=n (multiple of 16); vectors 16-byte aligned.
 * Fix: w[2] holds the ADDRESS of the scalar and must be dereferenced
 * -- the old "x"((t_float)w[2]) operand converted the pointer value
 * itself to float, dividing by garbage. */
t_int* scalarover_perf_simd(t_int *w)
{
    asm(
    ".set T_FLOAT,4 \n"
    "shufps $0, %1, %1 \n"
    "shrl $4, %3 \n" /* divide by 16 */
    /* loop: *out = *in / value */
    "1: \n"
    "movaps (%0), %%xmm1 \n"
    "divps %1, %%xmm1 \n"
    "movaps %%xmm1, (%2) \n"
    "movaps 4*T_FLOAT(%0), %%xmm2 \n"
    "divps %1, %%xmm2 \n"
    "movaps %%xmm2, 4*T_FLOAT(%2) \n"
    "movaps 8*T_FLOAT(%0), %%xmm3 \n"
    "divps %1, %%xmm3 \n"
    "movaps %%xmm3, 8*T_FLOAT(%2) \n"
    "movaps 12*T_FLOAT(%0), %%xmm4 \n"
    "divps %1, %%xmm4 \n"
    "movaps %%xmm4, 12*T_FLOAT(%2) \n"
    "addl $16*T_FLOAT, %0 \n"
    "addl $16*T_FLOAT, %2 \n"
    "loop 1b \n"
    :
    /* in, value, out, n */
    :"r"(w[1]),"x"(*(t_float *)w[2]),"r"(w[3]),"c"(w[4])
    :"%xmm1","%xmm2","%xmm3","%xmm4"
    );
    return w+5;
}
/* Pd perform routine, SSE: out[i] = min(in1[i], in2[i]).
 * w layout: w[1]=in1, w[2]=in2, w[3]=out, w[4]=n (multiple of 16);
 * all vectors 16-byte aligned. */
t_int* min_perf_simd(t_int * w)
{
    asm(
    ".set T_FLOAT,4 \n"
    "shrl $4, %3 \n" /* divide by 16 */
    /* loop: *out = min (*in1, *in2) */
    "1: \n"
    "movaps (%0,%4), %%xmm0 \n"
    "movaps (%1,%4), %%xmm1 \n"
    "minps %%xmm1, %%xmm0 \n"
    "movaps %%xmm0, (%2,%4) \n"
    "movaps 4*T_FLOAT(%0,%4), %%xmm2 \n"
    "movaps 4*T_FLOAT(%1,%4), %%xmm3 \n"
    "minps %%xmm3, %%xmm2 \n"
    "movaps %%xmm2, 4*T_FLOAT(%2,%4) \n"
    "movaps 8*T_FLOAT(%0,%4), %%xmm4 \n"
    "movaps 8*T_FLOAT(%1,%4), %%xmm5 \n"
    "minps %%xmm5, %%xmm4 \n"
    "movaps %%xmm4, 8*T_FLOAT(%2,%4) \n"
    "movaps 12*T_FLOAT(%0,%4), %%xmm6 \n"
    "movaps 12*T_FLOAT(%1,%4), %%xmm7 \n"
    "minps %%xmm7, %%xmm6 \n"
    "movaps %%xmm6, 12*T_FLOAT(%2,%4) \n"
    "addl $16*T_FLOAT, %4 \n"
    "loop 1b \n"
    :
    /* in1, in2, out, n */
    :"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4]),"r"(0)
    :"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%xmm5","%xmm6","%xmm7"
    );
    return w+5;
}
/* Pd perform routine, SSE: out[i] = min(in[i], value).
 * w layout: w[1]=in, w[2]=pointer to the scalar t_float, w[3]=out,
 * w[4]=n (multiple of 16); vectors 16-byte aligned.
 * Fix: w[2] holds the ADDRESS of the scalar and must be dereferenced
 * -- the old "x"((t_float)w[2]) operand converted the pointer value
 * itself to float, comparing against garbage. */
t_int* scalarmin_perf_simd(t_int *w)
{
    asm(
    ".set T_FLOAT,4 \n"
    "shufps $0, %1, %1 \n"
    "shrl $4, %3 \n" /* divide by 16 */
    /* loop: *out = min(*in, value) */
    "1: \n"
    "movaps (%0), %%xmm1 \n"
    "minps %1, %%xmm1 \n"
    "movaps %%xmm1, (%2) \n"
    "movaps 4*T_FLOAT(%0), %%xmm2 \n"
    "minps %1, %%xmm2 \n"
    "movaps %%xmm2, 4*T_FLOAT(%2) \n"
    "movaps 8*T_FLOAT(%0), %%xmm3 \n"
    "minps %1, %%xmm3 \n"
    "movaps %%xmm3, 8*T_FLOAT(%2) \n"
    "movaps 12*T_FLOAT(%0), %%xmm4 \n"
    "minps %1, %%xmm4 \n"
    "movaps %%xmm4, 12*T_FLOAT(%2) \n"
    "addl $16*T_FLOAT, %0 \n"
    "addl $16*T_FLOAT, %2 \n"
    "loop 1b \n"
    :
    /* in, value, out, n */
    :"r"(w[1]),"x"(*(t_float *)w[2]),"r"(w[3]),"c"(w[4])
    :"%xmm1","%xmm2","%xmm3","%xmm4"
    );
    return w+5;
}
/* Pd perform routine, SSE: out[i] = max(in1[i], in2[i]).
 * w layout: w[1]=in1, w[2]=in2, w[3]=out, w[4]=n (multiple of 16);
 * all vectors 16-byte aligned. */
t_int* max_perf_simd(t_int * w)
{
    asm(
    ".set T_FLOAT,4 \n"
    "shrl $4, %3 \n" /* divide by 16 */
    /* loop: *out = max (*in1, *in2) */
    "1: \n"
    "movaps (%0,%4), %%xmm0 \n"
    "movaps (%1,%4), %%xmm1 \n"
    "maxps %%xmm1, %%xmm0 \n"
    "movaps %%xmm0, (%2,%4) \n"
    "movaps 4*T_FLOAT(%0,%4), %%xmm2 \n"
    "movaps 4*T_FLOAT(%1,%4), %%xmm3 \n"
    "maxps %%xmm3, %%xmm2 \n"
    "movaps %%xmm2, 4*T_FLOAT(%2,%4) \n"
    "movaps 8*T_FLOAT(%0,%4), %%xmm4 \n"
    "movaps 8*T_FLOAT(%1,%4), %%xmm5 \n"
    "maxps %%xmm5, %%xmm4 \n"
    "movaps %%xmm4, 8*T_FLOAT(%2,%4) \n"
    "movaps 12*T_FLOAT(%0,%4), %%xmm6 \n"
    "movaps 12*T_FLOAT(%1,%4), %%xmm7 \n"
    "maxps %%xmm7, %%xmm6 \n"
    "movaps %%xmm6, 12*T_FLOAT(%2,%4) \n"
    "addl $16*T_FLOAT, %4 \n"
    "loop 1b \n"
    :
    /* in1, in2, out, n */
    :"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4]),"r"(0)
    :"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%xmm5","%xmm6","%xmm7"
    );
    return w+5;
}
/* Pd perform routine, SSE: out[i] = max(in[i], value).
 * w layout: w[1]=in, w[2]=pointer to the scalar t_float, w[3]=out,
 * w[4]=n (multiple of 16); vectors 16-byte aligned.
 * Fix: w[2] holds the ADDRESS of the scalar and must be dereferenced
 * -- the old "x"((t_float)w[2]) operand converted the pointer value
 * itself to float, comparing against garbage. */
t_int* scalarmax_perf_simd(t_int *w)
{
    asm(
    ".set T_FLOAT,4 \n"
    "shufps $0, %1, %1 \n"
    "shrl $4, %3 \n" /* divide by 16 */
    /* loop: *out = max(*in, value) */
    "1: \n"
    "movaps (%0), %%xmm1 \n"
    "maxps %1, %%xmm1 \n"
    "movaps %%xmm1, (%2) \n"
    "movaps 4*T_FLOAT(%0), %%xmm2 \n"
    "maxps %1, %%xmm2 \n"
    "movaps %%xmm2, 4*T_FLOAT(%2) \n"
    "movaps 8*T_FLOAT(%0), %%xmm3 \n"
    "maxps %1, %%xmm3 \n"
    "movaps %%xmm3, 8*T_FLOAT(%2) \n"
    "movaps 12*T_FLOAT(%0), %%xmm4 \n"
    "maxps %1, %%xmm4 \n"
    "movaps %%xmm4, 12*T_FLOAT(%2) \n"
    "addl $16*T_FLOAT, %0 \n"
    "addl $16*T_FLOAT, %2 \n"
    "loop 1b \n"
    :
    /* in, value, out, n */
    :"r"(w[1]),"x"(*(t_float *)w[2]),"r"(w[3]),"c"(w[4])
    :"%xmm1","%xmm2","%xmm3","%xmm4"
    );
    return w+5;
}
/* TB: runtime check */
/* Return nonzero when the CPU reports SSE support: CPUID function 1,
 * EDX bit 25 (0x2000000).  ebx/ecx are listed as clobbers because
 * cpuid overwrites them. */
int simd_runtime_check()
{
    unsigned int eax, edx;
    __asm__("cpuid" : "=a"(eax),"=d"(edx) : "a" (1): "bx", "cx");
    return (0x2000000 & edx);
}
#endif
--- NEW FILE: m_simd_ve_gcc.c ---
/*
Implementation of SIMD functionality for Apple Velocity Engine (AltiVec) with GCC compiler
added by T.Grill
*/
#include "m_pd.h"
#include "m_simd.h"
#if defined(__GNUC__) && defined(__POWERPC__) && defined(__ALTIVEC__)
//#define USEVECLIB
#ifdef USEVECLIB
#include <vecLib/vDSP.h>
#include <vecLib/vfp.h>
#endif
/* functions for unaligned vector data - taken from http://developer.apple.com/hardware/ve/alignment.html */
/* T.Grill - this first version _should_ work! but it doesn't... */
#if 0
#define LoadUnaligned(v) (vec_perm( vec_ld( 0, (const vector float *)(v) ), vec_ld( 16, (const vector float *)(v) ), vec_lvsl( 0, (float *) (v) ) ))
#else
/* instead take the slower second one */
/* Load one float from an arbitrarily aligned address and splat it into
 * all four lanes of an AltiVec vector. The scalar is first copied into
 * a naturally aligned union so that vec_ld is legal, then lane 0 is
 * replicated with vec_splat. (Despite the name this is a scalar splat,
 * not a full unaligned vector load -- see the disabled vec_perm
 * version above.) */
static vector float LoadUnaligned(const float *v)
{
union tmpstruct { float f[4]; vector float vec; } tmp;
tmp.f[0] = *(float *)v;
return vec_splat(vec_ld(0,&tmp.vec),0);
}
#endif
/* True if 'where' lies on a 16-byte (vector float) boundary.
   Fix: '==' binds tighter than '&', so the original expression parsed as
   (addr & ((sizeof(vector float)-1) == 0)), i.e. always 0; the masking
   must be parenthesized before the comparison. */
#define IsVectorAligned(where) (((unsigned long)(where)&(sizeof(vector float)-1)) == 0)
/*
#define LoadValue(where) (IsVectorAligned((void *)(where))?vec_splat(vec_ld(0,(vector float *)(where)),0):LoadUnaligned((vector float *)(where)))
*/
/* always assume unaligned */
#define LoadValue(where) LoadUnaligned((const float *)(where))
/* Clear a signal vector to all zeros.
   dst must be 16-byte aligned and n a multiple of 16. */
void zerovec_simd(t_float *dst,int n)
{
    const vector float vzero = (vector float)(0);
    int blocks = n >> 4;           /* 4 vectors = 16 floats per pass */
    while(blocks--) {
        vec_st(vzero, 0,dst);
        vec_st(vzero,16,dst);
        vec_st(vzero,32,dst);
        vec_st(vzero,48,dst);
        dst += 16;
    }
}
/* Fill a signal vector with the constant value v.
   dst must be 16-byte aligned and n a multiple of 16. */
void setvec_simd(t_float *dst,t_float v,int n)
{
    const vector float val = LoadValue(&v);   /* v splat into all lanes */
    int blocks = n >> 4;
    while(blocks--) {
        vec_st(val, 0,dst);
        vec_st(val,16,dst);
        vec_st(val,32,dst);
        vec_st(val,48,dst);
        dst += 16;
    }
}
/* Copy a signal vector.
   Both pointers must be 16-byte aligned and n a multiple of 16. */
void copyvec_simd(t_float *dst,const t_float *src,int n)
{
    int blocks = n >> 4;
    while(blocks--) {
        const vector float v0 = vec_ld( 0,src);
        const vector float v1 = vec_ld(16,src);
        const vector float v2 = vec_ld(32,src);
        const vector float v3 = vec_ld(48,src);
        vec_st(v0, 0,dst);
        vec_st(v1,16,dst);
        vec_st(v2,32,dst);
        vec_st(v3,48,dst);
        src += 16;
        dst += 16;
    }
}
/* dst[i] += src[i], element-wise.
   Both pointers must be 16-byte aligned and n a multiple of 16. */
void addvec_simd(t_float *dst,const t_float *src,int n)
{
#ifdef USEVECLIB
    vadd(dst,1,src,1,dst,1,n);
#else
    int blocks = n >> 4;
    while(blocks--) {
        vector float d0 = vec_ld( 0,dst),s0 = vec_ld( 0,src);
        vector float d1 = vec_ld(16,dst),s1 = vec_ld(16,src);
        vector float d2 = vec_ld(32,dst),s2 = vec_ld(32,src);
        vector float d3 = vec_ld(48,dst),s3 = vec_ld(48,src);
        vec_st(vec_add(d0,s0), 0,dst);
        vec_st(vec_add(d1,s1),16,dst);
        vec_st(vec_add(d2,s2),32,dst);
        vec_st(vec_add(d3,s3),48,dst);
        src += 16;
        dst += 16;
    }
#endif
}
/* no bad float testing for PPC! -- denormal/NaN scrubbing is an x86
   concern, so the test* variants simply forward to the plain codelets. */
void testcopyvec_simd(t_float *dst,const t_float *src,int n)
{
copyvec_simd(dst,src,n);
}
void testaddvec_simd(t_float *dst,const t_float *src,int n)
{
addvec_simd(dst,src,n);
}
/* dsp-chain perform wrappers: w[] holds the arguments as packed by
   dsp_add (signal pointers first, then the block size). */
t_int *zero_perf_simd(t_int *w)
{
zerovec_simd((t_float *)w[1],w[2]);
return w+3;
}
t_int *copy_perf_simd(t_int *w)
{
copyvec_simd((t_float *)w[2],(const t_float *)w[1],w[3]);
return w+4;
}
/* sig~: fill the output vector with the scalar *(t_float *)w[1] */
t_int *sig_tilde_perf_simd(t_int *w)
{
setvec_simd((t_float *)w[2],*(const t_float *)w[1],w[3]);
return w+4;
}
/* +~: out[i] = in1[i] + in2[i].
   w[1]=in1, w[2]=in2, w[3]=out, w[4]=n (multiple of 16, all aligned). */
t_int *plus_perf_simd(t_int *w)
{
#ifdef USEVECLIB
    vadd((const t_float *)w[1],1,(const t_float *)w[2],1,(t_float *)w[3],1,w[4]);
#else
    const t_float *in1 = (const t_float *)w[1];
    const t_float *in2 = (const t_float *)w[2];
    t_float *out = (t_float *)w[3];
    int blocks = w[4] >> 4;
    while(blocks--) {
        vector float x0 = vec_ld( 0,in1),y0 = vec_ld( 0,in2);
        vector float x1 = vec_ld(16,in1),y1 = vec_ld(16,in2);
        vector float x2 = vec_ld(32,in1),y2 = vec_ld(32,in2);
        vector float x3 = vec_ld(48,in1),y3 = vec_ld(48,in2);
        vec_st(vec_add(x0,y0), 0,out);
        vec_st(vec_add(x1,y1),16,out);
        vec_st(vec_add(x2,y2),32,out);
        vec_st(vec_add(x3,y3),48,out);
        in1 += 16; in2 += 16; out += 16;
    }
#endif
    return w+5;
}
/* +~ with a scalar: out[i] = in[i] + value.
   w[1]=in, w[2]=pointer to the scalar, w[3]=out, w[4]=n. */
t_int *scalarplus_perf_simd(t_int *w)
{
    const t_float *in = (const t_float *)w[1];
    const vector float val = LoadValue(w[2]);
    t_float *out = (t_float *)w[3];
    int blocks = w[4] >> 4;
    while(blocks--) {
        vector float x0 = vec_ld( 0,in);
        vector float x1 = vec_ld(16,in);
        vector float x2 = vec_ld(32,in);
        vector float x3 = vec_ld(48,in);
        vec_st(vec_add(x0,val), 0,out);
        vec_st(vec_add(x1,val),16,out);
        vec_st(vec_add(x2,val),32,out);
        vec_st(vec_add(x3,val),48,out);
        in += 16; out += 16;
    }
    return w+5;
}
/* -~: out[i] = in1[i] - in2[i]. */
t_int *minus_perf_simd(t_int *w)
{
#if 0 //def USEVECLIB
/* vsub is buggy for some OSX versions! */
vsub((const t_float *)w[1],1,(const t_float *)w[2],1,(t_float *)w[3],1,w[4]);
#else
    const t_float *in1 = (const t_float *)w[1];
    const t_float *in2 = (const t_float *)w[2];
    t_float *out = (t_float *)w[3];
    int blocks = w[4] >> 4;
    while(blocks--) {
        vector float x0 = vec_ld( 0,in1),y0 = vec_ld( 0,in2);
        vector float x1 = vec_ld(16,in1),y1 = vec_ld(16,in2);
        vector float x2 = vec_ld(32,in1),y2 = vec_ld(32,in2);
        vector float x3 = vec_ld(48,in1),y3 = vec_ld(48,in2);
        vec_st(vec_sub(x0,y0), 0,out);
        vec_st(vec_sub(x1,y1),16,out);
        vec_st(vec_sub(x2,y2),32,out);
        vec_st(vec_sub(x3,y3),48,out);
        in1 += 16; in2 += 16; out += 16;
    }
#endif
    return w+5;
}
/* -~ with a scalar: out[i] = in[i] - value. */
t_int *scalarminus_perf_simd(t_int *w)
{
    const t_float *in = (const t_float *)w[1];
    const vector float val = LoadValue(w[2]);
    t_float *out = (t_float *)w[3];
    int blocks = w[4] >> 4;
    while(blocks--) {
        vector float x0 = vec_ld( 0,in);
        vector float x1 = vec_ld(16,in);
        vector float x2 = vec_ld(32,in);
        vector float x3 = vec_ld(48,in);
        vec_st(vec_sub(x0,val), 0,out);
        vec_st(vec_sub(x1,val),16,out);
        vec_st(vec_sub(x2,val),32,out);
        vec_st(vec_sub(x3,val),48,out);
        in += 16; out += 16;
    }
    return w+5;
}
/* *~: out[i] = in1[i] * in2[i].
   AltiVec has no plain multiply, so vec_madd with a zero addend is used. */
t_int *times_perf_simd(t_int *w)
{
#ifdef USEVECLIB
    vmul((const t_float *)w[1],1,(const t_float *)w[2],1,(t_float *)w[3],1,w[4]);
#else
    const t_float *in1 = (const t_float *)w[1];
    const t_float *in2 = (const t_float *)w[2];
    t_float *out = (t_float *)w[3];
    const vector float vzero = (vector float)(0);
    int blocks = w[4] >> 4;
    while(blocks--) {
        vector float x0 = vec_ld( 0,in1),y0 = vec_ld( 0,in2);
        vector float x1 = vec_ld(16,in1),y1 = vec_ld(16,in2);
        vector float x2 = vec_ld(32,in1),y2 = vec_ld(32,in2);
        vector float x3 = vec_ld(48,in1),y3 = vec_ld(48,in2);
        vec_st(vec_madd(x0,y0,vzero), 0,out);
        vec_st(vec_madd(x1,y1,vzero),16,out);
        vec_st(vec_madd(x2,y2,vzero),32,out);
        vec_st(vec_madd(x3,y3,vzero),48,out);
        in1 += 16; in2 += 16; out += 16;
    }
#endif
    return w+5;
}
/* *~ with a scalar: out[i] = in[i] * value. */
t_int *scalartimes_perf_simd(t_int *w)
{
#ifdef USEVECLIB
    vsmul((const t_float *)w[1],1,(t_float *)w[2],(t_float *)w[3],1,w[4]);
#else
    const t_float *in = (const t_float *)w[1];
    const vector float val = LoadValue(w[2]);
    t_float *out = (t_float *)w[3];
    const vector float vzero = (vector float)(0);
    int blocks = w[4] >> 4;
    while(blocks--) {
        vector float x0 = vec_ld( 0,in);
        vector float x1 = vec_ld(16,in);
        vector float x2 = vec_ld(32,in);
        vector float x3 = vec_ld(48,in);
        vec_st(vec_madd(x0,val,vzero), 0,out);
        vec_st(vec_madd(x1,val,vzero),16,out);
        vec_st(vec_madd(x2,val,vzero),32,out);
        vec_st(vec_madd(x3,val,vzero),48,out);
        in += 16; out += 16;
    }
#endif
    return w+5;
}
/* sqr~: out[i] = in[i] * in[i].
   w[1]=in, w[2]=out, w[3]=n. */
t_int *sqr_perf_simd(t_int *w)
{
#ifdef USEVECLIB
    vsq((const t_float *)w[1],1,(t_float *)w[2],1,w[3]);
#else
    const t_float *in = (const t_float *)w[1];
    t_float *out = (t_float *)w[2];
    const vector float vzero = (vector float)(0);
    int blocks = w[3] >> 4;
    while(blocks--) {
        vector float x0 = vec_ld( 0,in);
        vector float x1 = vec_ld(16,in);
        vector float x2 = vec_ld(32,in);
        vector float x3 = vec_ld(48,in);
        vec_st(vec_madd(x0,x0,vzero), 0,out);
        vec_st(vec_madd(x1,x1,vzero),16,out);
        vec_st(vec_madd(x2,x2,vzero),32,out);
        vec_st(vec_madd(x3,x3,vzero),48,out);
        in += 16; out += 16;
    }
#endif
    return w+4;
}
/* /~: out[i] = in1[i] / in2[i] with division-by-zero yielding 0.
 * w[1]=numerator, w[2]=denominator, w[3]=out, w[4]=n (multiple of 16,
 * all pointers 16-byte aligned).
 * A reciprocal estimate (vec_re) is refined by one Newton-Raphson step;
 * a comparison-derived bit mask zeroes the estimate wherever the
 * denominator is exactly 0 so no NANs reach the output. */
t_int *over_perf_simd(t_int *w)
{
const t_float *src1 = (const t_float *)w[1];
const t_float *src2 = (const t_float *)w[2];
t_float *dst = (t_float *)w[3];
const vector float zero = (vector float)(0);
const vector float one = (vector float)(1);
int n = w[4]>>4;
for(; n--; src1 += 16,src2 += 16,dst += 16) {
#ifdef USEVECLIB
/* no zero checking here */
vec_st(vdivf(vec_ld( 0,src1),vec_ld( 0,src2)), 0,dst);
vec_st(vdivf(vec_ld(16,src1),vec_ld(16,src2)),16,dst);
vec_st(vdivf(vec_ld(32,src1),vec_ld(32,src2)),32,dst);
vec_st(vdivf(vec_ld(48,src1),vec_ld(48,src2)),48,dst);
#else
vector float data1 = vec_ld( 0,src2);
vector float data2 = vec_ld(16,src2);
vector float data3 = vec_ld(32,src2);
vector float data4 = vec_ld(48,src2);
vector unsigned char mask1 = vec_nor((vector unsigned char)vec_cmpeq(data1,zero),(vector unsigned char)zero); /* bit mask... all 0 for data = 0., all 1 else */
vector unsigned char mask2 = vec_nor((vector unsigned char)vec_cmpeq(data2,zero),(vector unsigned char)zero); /* bit mask... all 0 for data = 0., all 1 else */
vector unsigned char mask3 = vec_nor((vector unsigned char)vec_cmpeq(data3,zero),(vector unsigned char)zero); /* bit mask... all 0 for data = 0., all 1 else */
vector unsigned char mask4 = vec_nor((vector unsigned char)vec_cmpeq(data4,zero),(vector unsigned char)zero); /* bit mask... all 0 for data = 0., all 1 else */
/* make estimated reciprocal and zero out NANs */
vector float tmp1 = vec_re(data1);
vector float tmp2 = vec_re(data2);
vector float tmp3 = vec_re(data3);
vector float tmp4 = vec_re(data4);
tmp1 = (vector float)vec_and((vector unsigned char)tmp1,mask1);
tmp2 = (vector float)vec_and((vector unsigned char)tmp2,mask2);
tmp3 = (vector float)vec_and((vector unsigned char)tmp3,mask3);
tmp4 = (vector float)vec_and((vector unsigned char)tmp4,mask4);
/* one Newton-Raphson refinement step: r' = r*(1 - d*r) + r */
data1 = vec_madd( vec_nmsub( tmp1, data1, one ), tmp1, tmp1 );
data2 = vec_madd( vec_nmsub( tmp2, data2, one ), tmp2, tmp2 );
data3 = vec_madd( vec_nmsub( tmp3, data3, one ), tmp3, tmp3 );
data4 = vec_madd( vec_nmsub( tmp4, data4, one ), tmp4, tmp4 );
/* multiply the numerator by the refined reciprocal */
tmp1 = vec_ld( 0,src1);
tmp2 = vec_ld(16,src1);
tmp3 = vec_ld(32,src1);
tmp4 = vec_ld(48,src1);
data1 = vec_madd(tmp1,data1,zero);
data2 = vec_madd(tmp2,data2,zero);
data3 = vec_madd(tmp3,data3,zero);
data4 = vec_madd(tmp4,data4,zero);
vec_st(data1, 0,dst);
vec_st(data2,16,dst);
vec_st(data3,32,dst);
vec_st(data4,48,dst);
#endif
}
return w+5;
}
/* /~ with a scalar divisor: out[i] = in[i] / value, or all zeros when
 * value == 0. The reciprocal of the (splatted) scalar is computed once
 * outside the loop via vec_re plus one Newton-Raphson refinement, then
 * applied as a multiply. */
t_int *scalarover_perf_simd(t_int *w)
{
t_float *dst = (t_float *)w[3];
const vector float zero = (vector float)(0);
int n = w[4]>>4;
if(*(t_float *)w[2]) {
const t_float *src = (const t_float *)w[1];
#ifdef USEVECLIB
float arg = *(t_float *)w[2]?1./ *(t_float *)w[2]: 0;
vsmul(src,1,&arg,dst,1,w[4]);
#else
const vector float v = LoadValue(w[2]);
const vector float one = (vector float)(1);
vector float estimate = vec_re(v);
/* one Newton-Raphson step refines the reciprocal estimate */
vector float arg = vec_madd( vec_nmsub( estimate, v, one ), estimate, estimate );
for(; n--; src += 16,dst += 16) {
vector float a1 = vec_ld( 0,src);
vector float a2 = vec_ld(16,src);
vector float a3 = vec_ld(32,src);
vector float a4 = vec_ld(48,src);
a1 = vec_madd(a1,arg,zero);
a2 = vec_madd(a2,arg,zero);
a3 = vec_madd(a3,arg,zero);
a4 = vec_madd(a4,arg,zero);
vec_st(a1, 0,dst);
vec_st(a2,16,dst);
vec_st(a3,32,dst);
vec_st(a4,48,dst);
}
#endif
}
else {
/* zero all output */
for(; n--; dst += 16) {
vec_st(zero, 0,dst);
vec_st(zero,16,dst);
vec_st(zero,32,dst);
vec_st(zero,48,dst);
}
}
return w+5;
}
/* min~: out[i] = min(in1[i], in2[i]). */
t_int *min_perf_simd(t_int *w)
{
    const t_float *in1 = (const t_float *)w[1];
    const t_float *in2 = (const t_float *)w[2];
    t_float *out = (t_float *)w[3];
    int blocks = w[4] >> 4;
    while(blocks--) {
        vector float x0 = vec_ld( 0,in1),y0 = vec_ld( 0,in2);
        vector float x1 = vec_ld(16,in1),y1 = vec_ld(16,in2);
        vector float x2 = vec_ld(32,in1),y2 = vec_ld(32,in2);
        vector float x3 = vec_ld(48,in1),y3 = vec_ld(48,in2);
        vec_st(vec_min(x0,y0), 0,out);
        vec_st(vec_min(x1,y1),16,out);
        vec_st(vec_min(x2,y2),32,out);
        vec_st(vec_min(x3,y3),48,out);
        in1 += 16; in2 += 16; out += 16;
    }
    return w+5;
}
/* min~ with a scalar: out[i] = min(in[i], value). */
t_int *scalarmin_perf_simd(t_int *w)
{
    const t_float *in = (const t_float *)w[1];
    const vector float val = LoadValue(w[2]);
    t_float *out = (t_float *)w[3];
    int blocks = w[4] >> 4;
    while(blocks--) {
        vector float x0 = vec_ld( 0,in);
        vector float x1 = vec_ld(16,in);
        vector float x2 = vec_ld(32,in);
        vector float x3 = vec_ld(48,in);
        vec_st(vec_min(x0,val), 0,out);
        vec_st(vec_min(x1,val),16,out);
        vec_st(vec_min(x2,val),32,out);
        vec_st(vec_min(x3,val),48,out);
        in += 16; out += 16;
    }
    return w+5;
}
/* max~: out[i] = max(in1[i], in2[i]). */
t_int *max_perf_simd(t_int *w)
{
    const t_float *in1 = (const t_float *)w[1];
    const t_float *in2 = (const t_float *)w[2];
    t_float *out = (t_float *)w[3];
    int blocks = w[4] >> 4;
    while(blocks--) {
        vector float x0 = vec_ld( 0,in1),y0 = vec_ld( 0,in2);
        vector float x1 = vec_ld(16,in1),y1 = vec_ld(16,in2);
        vector float x2 = vec_ld(32,in1),y2 = vec_ld(32,in2);
        vector float x3 = vec_ld(48,in1),y3 = vec_ld(48,in2);
        vec_st(vec_max(x0,y0), 0,out);
        vec_st(vec_max(x1,y1),16,out);
        vec_st(vec_max(x2,y2),32,out);
        vec_st(vec_max(x3,y3),48,out);
        in1 += 16; in2 += 16; out += 16;
    }
    return w+5;
}
/* max~ with a scalar: out[i] = max(in[i], value). */
t_int *scalarmax_perf_simd(t_int *w)
{
    const t_float *in = (const t_float *)w[1];
    const vector float val = LoadValue(w[2]);
    t_float *out = (t_float *)w[3];
    int blocks = w[4] >> 4;
    while(blocks--) {
        vector float x0 = vec_ld( 0,in);
        vector float x1 = vec_ld(16,in);
        vector float x2 = vec_ld(32,in);
        vector float x3 = vec_ld(48,in);
        vec_st(vec_max(x0,val), 0,out);
        vec_st(vec_max(x1,val),16,out);
        vec_st(vec_max(x2,val),32,out);
        vec_st(vec_max(x3,val),48,out);
        in += 16; out += 16;
    }
    return w+5;
}
/* clip~: out[i] = lo if in[i] <= lo, hi if in[i] >= hi, else in[i].
 * w[1]=in, w[2]=out, w[3]=&lo, w[4]=&hi, w[5]=n.
 * Branchless selection: comparison masks (all-ones/all-zeros per lane)
 * are used to blank the out-of-range lanes of the input and then OR in
 * the lo/hi bound values for exactly those lanes. */
t_int *clip_perf_simd(t_int *w)
{
const t_float *src = (const t_float *)w[1];
t_float *dst = (t_float *)w[2];
const vector float lo = LoadValue(w[3]);
const vector float hi = LoadValue(w[4]);
int n = w[5]>>4;
for(; n--; src += 16,dst += 16) {
vector float data1 = vec_ld( 0,src);
vector float data2 = vec_ld(16,src);
vector float data3 = vec_ld(32,src);
vector float data4 = vec_ld(48,src);
vector unsigned char mlo1 = (vector unsigned char)vec_cmple(data1,lo); /* bit mask data <= lo */
vector unsigned char mlo2 = (vector unsigned char)vec_cmple(data2,lo); /* bit mask data <= lo */
vector unsigned char mlo3 = (vector unsigned char)vec_cmple(data3,lo); /* bit mask data <= lo */
vector unsigned char mlo4 = (vector unsigned char)vec_cmple(data4,lo); /* bit mask data <= lo */
vector unsigned char mhi1 = (vector unsigned char)vec_cmpge(data1,hi); /* bit mask data >= hi */
vector unsigned char mhi2 = (vector unsigned char)vec_cmpge(data2,hi); /* bit mask data >= hi */
vector unsigned char mhi3 = (vector unsigned char)vec_cmpge(data3,hi); /* bit mask data >= hi */
vector unsigned char mhi4 = (vector unsigned char)vec_cmpge(data4,hi); /* bit mask data >= hi */
/* keep only the in-range lanes of the input */
data1 = (vector float)vec_and((vector unsigned char)data1,vec_nor(mlo1,mhi1));
data2 = (vector float)vec_and((vector unsigned char)data2,vec_nor(mlo2,mhi2));
data3 = (vector float)vec_and((vector unsigned char)data3,vec_nor(mlo3,mhi3));
data4 = (vector float)vec_and((vector unsigned char)data4,vec_nor(mlo4,mhi4));
/* select the bound values for the clipped lanes */
mlo1 = vec_and((vector unsigned char)lo,mlo1);
mlo2 = vec_and((vector unsigned char)lo,mlo2);
mlo3 = vec_and((vector unsigned char)lo,mlo3);
mlo4 = vec_and((vector unsigned char)lo,mlo4);
mhi1 = vec_and((vector unsigned char)hi,mhi1);
mhi2 = vec_and((vector unsigned char)hi,mhi2);
mhi3 = vec_and((vector unsigned char)hi,mhi3);
mhi4 = vec_and((vector unsigned char)hi,mhi4);
/* merge pass-through lanes and clipped lanes */
data1 = (vector float)vec_or(vec_or(mlo1,mhi1),(vector unsigned char)data1);
data2 = (vector float)vec_or(vec_or(mlo2,mhi2),(vector unsigned char)data2);
data3 = (vector float)vec_or(vec_or(mlo3,mhi3),(vector unsigned char)data3);
data4 = (vector float)vec_or(vec_or(mlo4,mhi4),(vector unsigned char)data4);
vec_st(data1, 0,dst);
vec_st(data2,16,dst);
vec_st(data3,32,dst);
vec_st(data4,48,dst);
}
return w+6;
}
/* wrap~: out[i] = in[i] - floor(in[i]), the fractional part in [0,1). */
t_int *sigwrap_perf_simd(t_int *w)
{
    const t_float *in = (const t_float *)w[1];
    t_float *out = (t_float *)w[2];
    int blocks = w[3] >> 4;
    while(blocks--) {
        vector float x0 = vec_ld( 0,in);
        vector float x1 = vec_ld(16,in);
        vector float x2 = vec_ld(32,in);
        vector float x3 = vec_ld(48,in);
        vec_st(vec_sub(x0,vec_floor(x0)), 0,out);
        vec_st(vec_sub(x1,vec_floor(x1)),16,out);
        vec_st(vec_sub(x2,vec_floor(x2)),32,out);
        vec_st(vec_sub(x3,vec_floor(x3)),48,out);
        in += 16; out += 16;
    }
    return w+4;
}
/* sqrt~: out[i] = sqrt(in[i]) for in[i] > 0, else 0.
 * sqrt(x) is computed as x * rsqrt(x): a vec_rsqrte estimate refined by
 * one Newton-Raphson step, with a comparison mask zeroing lanes whose
 * input is <= 0 so no NANs are produced. */
t_int *sigsqrt_perf_simd(t_int *w)
{
const t_float *src = (const t_float *)w[1];
t_float *dst = (t_float *)w[2];
int n = w[3]>>4;
const vector float zero = (vector float)(0);
const vector float oneHalf = (vector float)(0.5);
const vector float one = (vector float)(1.0);
for(; n--; src += 16,dst += 16) {
/* http://developer.apple.com/hardware/ve/algorithms.html
Just as in Miller's scalar sigsqrt_perform,
first a rsqrt estimate is calculated which is then refined by one round of Newton-Raphson.
Here, to avoid branching a mask is generated which zeroes out eventual resulting NANs.
*/
#ifdef USEVECLIB
/* no zero checking here */
vec_st(vsqrtf(vec_ld( 0,src)), 0,dst);
vec_st(vsqrtf(vec_ld(16,src)),16,dst);
vec_st(vsqrtf(vec_ld(32,src)),32,dst);
vec_st(vsqrtf(vec_ld(48,src)),48,dst);
#else
vector float data1 = vec_ld( 0,src);
vector float data2 = vec_ld(16,src);
vector float data3 = vec_ld(32,src);
vector float data4 = vec_ld(48,src);
const vector unsigned char mask1 = vec_nor((vector unsigned char)vec_cmple(data1,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
const vector unsigned char mask2 = vec_nor((vector unsigned char)vec_cmple(data2,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
const vector unsigned char mask3 = vec_nor((vector unsigned char)vec_cmple(data3,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
const vector unsigned char mask4 = vec_nor((vector unsigned char)vec_cmple(data4,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
const vector float estimate1 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data1),mask1);
const vector float estimate2 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data2),mask2);
const vector float estimate3 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data3),mask3);
const vector float estimate4 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data4),mask4);
/* this can still be improved.... */
/* sqrt(x) = x * (refined rsqrt estimate) */
data1 = vec_madd(data1,vec_madd( vec_nmsub( data1, vec_madd( estimate1, estimate1, zero ), one ), vec_madd( estimate1, oneHalf, zero ), estimate1 ), zero);
data2 = vec_madd(data2,vec_madd( vec_nmsub( data2, vec_madd( estimate2, estimate2, zero ), one ), vec_madd( estimate2, oneHalf, zero ), estimate2 ), zero);
data3 = vec_madd(data3,vec_madd( vec_nmsub( data3, vec_madd( estimate3, estimate3, zero ), one ), vec_madd( estimate3, oneHalf, zero ), estimate3 ), zero);
data4 = vec_madd(data4,vec_madd( vec_nmsub( data4, vec_madd( estimate4, estimate4, zero ), one ), vec_madd( estimate4, oneHalf, zero ), estimate4 ), zero);
vec_st(data1, 0,dst);
vec_st(data2,16,dst);
vec_st(data3,32,dst);
vec_st(data4,48,dst);
#endif
}
return w+4;
}
/* Attention: there's a difference to sigsqrt_perform which delivers non-zero for a zero input... i don't think the latter is intended... */
/* rsqrt~: out[i] = 1/sqrt(in[i]) for in[i] > 0, else 0.
 * vec_rsqrte estimate refined by one Newton-Raphson step; a comparison
 * mask zeroes lanes whose input is <= 0 to avoid NANs. */
t_int *sigrsqrt_perf_simd(t_int *w)
{
const t_float *src = (const t_float *)w[1];
t_float *dst = (t_float *)w[2];
int n = w[3]>>4;
const vector float zero = (vector float)(0);
const vector float oneHalf = (vector float)(0.5);
const vector float one = (vector float)(1.0);
for(; n--; src += 16,dst += 16) {
/* http://developer.apple.com/hardware/ve/algorithms.html
Just as in Miller's scalar sigrsqrt_perform,
first a rsqrt estimate is calculated which is then refined by one round of Newton-Raphson.
Here, to avoid branching a mask is generated which zeroes out eventual resulting NANs.
*/
#ifdef USEVECLIB
/* no zero checking here */
vec_st(vrsqrtf(vec_ld( 0,src)), 0,dst);
vec_st(vrsqrtf(vec_ld(16,src)),16,dst);
vec_st(vrsqrtf(vec_ld(32,src)),32,dst);
vec_st(vrsqrtf(vec_ld(48,src)),48,dst);
#else
vector float data1 = vec_ld( 0,src);
vector float data2 = vec_ld(16,src);
vector float data3 = vec_ld(32,src);
vector float data4 = vec_ld(48,src);
const vector unsigned char mask1 = vec_nor((vector unsigned char)vec_cmple(data1,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
const vector unsigned char mask2 = vec_nor((vector unsigned char)vec_cmple(data2,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
const vector unsigned char mask3 = vec_nor((vector unsigned char)vec_cmple(data3,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
const vector unsigned char mask4 = vec_nor((vector unsigned char)vec_cmple(data4,zero),(vector unsigned char)zero); /* bit mask... all 0 for data <= 0., all 1 else */
const vector float estimate1 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data1),mask1);
const vector float estimate2 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data2),mask2);
const vector float estimate3 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data3),mask3);
const vector float estimate4 = (vector float)vec_and((vector unsigned char)vec_rsqrte(data4),mask4);
/* Newton-Raphson: e' = e + e*(1 - x*e*e)/2, computed in two stages */
data1 = vec_nmsub( data1, vec_madd( estimate1, estimate1, zero ), one );
data2 = vec_nmsub( data2, vec_madd( estimate2, estimate2, zero ), one );
data3 = vec_nmsub( data3, vec_madd( estimate3, estimate3, zero ), one );
data4 = vec_nmsub( data4, vec_madd( estimate4, estimate4, zero ), one );
data1 = vec_madd( data1, vec_madd( estimate1, oneHalf, zero ), estimate1 );
data2 = vec_madd( data2, vec_madd( estimate2, oneHalf, zero ), estimate2 );
data3 = vec_madd( data3, vec_madd( estimate3, oneHalf, zero ), estimate3 );
data4 = vec_madd( data4, vec_madd( estimate4, oneHalf, zero ), estimate4 );
vec_st(data1, 0,dst);
vec_st(data2,16,dst);
vec_st(data3,32,dst);
vec_st(data4,48,dst);
#endif
}
return w+4;
}
/* AltiVec availability is established at compile time by the __ALTIVEC__
 * guard above, so the runtime check always succeeds here. */
int simd_runtime_check()
{
return 1;
}
#endif
--- NEW FILE: m_simd.h ---
/*
Definitions for SIMD functionality
added by T.Grill
*/
#ifndef __M_SIMD_H
#define __M_SIMD_H
/* general vector functions */
void zerovec_8(t_float *dst,int n);
void setvec_8(t_float *dst,t_float v,int n);
void copyvec_8(t_float *dst,const t_float *src,int n);
void addvec_8(t_float *dst,const t_float *src,int n);
void testcopyvec_8(t_float *dst,const t_float *src,int n);
void testaddvec_8(t_float *dst,const t_float *src,int n);
#ifdef DONTUSESIMD
/* if SIMD shouldn't be used the checks will always return false */
#define SIMD_CHKCNT(n) ( 0 )
#define SIMD_CHKALIGN(ptr) ( 0 )
#undef SIMD_BYTEALIGN
#include "m_simd_def.h"
#else
/* how many floats do we calculate in the loop of a SIMD codelet? */
#define SIMD_BLOCK 16 /* must be a power of 2 */
#if defined(_MSC_VER) && defined(_M_IX86) /* Visual C++ on Intel */
/* alignment for Intel SSE */
#define SIMD_BYTEALIGN (128/8) /* align to 128 bits */
#include "m_simd_sse_vc.h"
#elif defined(__GNUC__) && (defined(_X86_) || defined(__i386__) || defined(__i586__) || defined(__i686__) )
/* Intel SSE with GNU C */
#define SIMD_BYTEALIGN (128/8) /* align to 128 bits */
#include "m_simd_sse_gcc.h"
#elif defined(__GNUC__) && defined(__POWERPC__) && defined(__ALTIVEC__)
/* Altivec with GNU C ( -faltivec must be given as a compiler option! ) */
#define SIMD_BYTEALIGN (128/8) /* align to 128 bits */
#include "m_simd_ve_gcc.h"
#else
/* default */
#define SIMD_BYTEALIGN (128/8) /* assume 128 bits */
#include "m_simd_def.h"
#endif
/* check if n meets the requirements for SIMD codelets */
#define SIMD_CHKCNT(n) ( ((n)&(SIMD_BLOCK-1)) == 0 )
/* check if a pointer is correctly aligned for SIMD codelets */
#define SIMD_CHKALIGN(ptr) ( ((unsigned long)(ptr) & (SIMD_BYTEALIGN-1)) == 0 )
#endif
/* check n and 1 pointer at once */
#define SIMD_CHECK1(n,ptr1) ( SIMD_CHKCNT(n) && SIMD_CHKALIGN(ptr1) && simd_runtime_check())
/* check n and 2 pointers at once */
#define SIMD_CHECK2(n,ptr1,ptr2) ( SIMD_CHKCNT(n) && SIMD_CHKALIGN(ptr1) && SIMD_CHKALIGN(ptr2) && simd_runtime_check() )
/* check n and 3 pointers at once */
#define SIMD_CHECK3(n,ptr1,ptr2,ptr3) ( SIMD_CHKCNT(n) && SIMD_CHKALIGN(ptr1) && SIMD_CHKALIGN(ptr2) && SIMD_CHKALIGN(ptr3) && simd_runtime_check() )
/* T.Grill - bit alignment for signal vectors (must be a multiple of 8!) */
/* if undefined no alignment occurs */
#ifdef SIMD_BYTEALIGN
#define VECTORALIGNMENT (SIMD_BYTEALIGN*8)
#else
#define VECTORALIGNMENT 128
#endif
#endif /* __M_SIMD_H */
--- NEW FILE: s_midi_alsa.c ---
/* Copyright (c) 1997-1999 Guenter Geiger, Miller Puckette, Larry Troxler,
* Winfried Ritsch, Karl MacMillan, and others.
* For information on usage and redistribution, and for a DISCLAIMER OF ALL
* WARRANTIES, see the file, "LICENSE.txt," in this distribution. */
/* MIDI I/O for Linux using ALSA */
#include <stdio.h>
#ifdef UNISTD
#include <unistd.h>
#endif
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <alsa/asoundlib.h>
#include "m_pd.h"
#include "s_stuff.h"
static int alsa_nmidiin;
static int alsa_midiinfd[MAXMIDIINDEV];
static int alsa_nmidiout;
static int alsa_midioutfd[MAXMIDIOUTDEV];
static snd_seq_t *midi_handle;
static int alsa_port;
static int alsa_initialized;
/* Stub: raw-byte MIDI output is not implemented for the ALSA sequencer
 * backend; the old fd-based write is kept commented out for reference. */
static void alsa_midiout(int fd, int n)
{
/*snd_midi_event_encode_byte();
char b = n;
if ((write(fd, (char *) &b, 1)) != 1)
perror("midi write");*/
}
#define O_MIDIFLAG O_NDELAY
/* Combine two 7-bit MIDI data bytes (First = LSB, Second = MSB) into a
 * single 14-bit value, as used e.g. for pitch-bend messages. */
unsigned short CombineBytes(unsigned char First, unsigned char Second)
{
    return (unsigned short)(((unsigned short)Second << 7) | (unsigned short)First);
}
void sys_do_open_midi(int nmidiin, int *midiinvec,
int nmidiout, int *midioutvec)
{
char portname[50];
int err;
int client;
int i;
if (nmidiin>0 && nmidiout>0)
err = snd_seq_open(&midi_handle,"default",SND_SEQ_OPEN_DUPLEX,0);
else if (nmidiin > 0)
err = snd_seq_open(&midi_handle,"default",SND_SEQ_OPEN_INPUT,0);
else if (nmidiout > 0)
err = snd_seq_open(&midi_handle,"default",SND_SEQ_OPEN_OUTPUT,0);
if (err!=0)
{
sys_setalarm(1000000);
post("couldn't open alsa sequencer");
return;
}
for (i=0;i<nmidiout;i++)
{
sprintf(portname,"Pure Data Midi-In %d",i+1);
alsa_port = snd_seq_create_simple_port(midi_handle,portname,SND_SEQ_PORT_CAP_WRITE |SND_SEQ_PORT_CAP_SUBS_WRITE , SND_SEQ_PORT_TYPE_APPLICATION);
alsa_midiinfd[i] = alsa_port;
}
for (i=0;i<nmidiin;i++)
{
sprintf(portname,"Pure Data Midi-Out %d",i+1);
alsa_port = snd_seq_create_simple_port(midi_handle,portname, SND_SEQ_PORT_CAP_SUBS_READ | SND_SEQ_PORT_CAP_READ, SND_SEQ_PORT_TYPE_APPLICATION);
alsa_midioutfd[i] = alsa_port;
}
// alsa_port = snd_seq_create_simple_port(midi_handle,portname,SND_SEQ_PORT_CAP_DUPLEX |SND_SEQ_PORT_CAP_SUBS_WRITE | SND_SEQ_PORT_CAP_SUBS_READ , SND_SEQ_PORT_TYPE_APPLICATION);
//alsa_port = snd_seq_create_simple_port(midi_handle,portname,SND_SEQ_PORT_CAP_READ | SND_SEQ_PORT_CAP_SUBS_READ , SND_SEQ_PORT_TYPE_APPLICATION);
if (alsa_port < 0)
{
sys_setalarm(1000000);
post("couldn't open alsa MIDI output device");
return;
}
snd_seq_client_info_t *alsainfo;
snd_seq_client_info_malloc(&alsainfo);
snd_seq_get_client_info(midi_handle,alsainfo);
snd_seq_client_info_set_name(alsainfo,"Pure Data");
client = snd_seq_client_info_get_client(alsainfo);
snd_seq_set_client_info(midi_handle,alsainfo);
post("Opened Alsa Client %d in:%d out:%d",client,nmidiin,nmidiout);
sys_setalarm(0);
alsa_nmidiout = nmidiout;
alsa_initialized = nmidiin;
}
#define md_msglen(x) (((x)<0xC0)?2:((x)<0xE0)?1:((x)<0xF0)?2:\
((x)==0xF2)?2:((x)<0xF4)?1:0)
/* Send a MIDI channel message to ALSA output port portno.
 * a is the status byte (including channel), b and c the data bytes.
 *
 * Fixes relative to the original: the per-call malloc(30) of an unused
 * snd_midi_event_t (leaked on every message) and the unused local 'aa'
 * are removed, and unrecognized status bytes (< 0x90) are dropped
 * instead of emitting a cleared (empty) sequencer event. */
void sys_putmidimess(int portno, int a, int b, int c)
{
    int channel;
    snd_seq_event_t ev;

    if (portno < 0 || portno >= alsa_nmidiout)
        return;
    snd_seq_ev_clear(&ev);
    if (a >= 224)           /* pitch bend */
    {
        channel = a - 224;
        /* NOTE(review): CombineBytes yields 0..16383; check whether the
           ALSA pitchbend event expects the -8192..8191 range instead */
        snd_seq_ev_set_pitchbend(&ev, channel, CombineBytes(b, c));
    }
    else if (a >= 208)      /* channel aftertouch */
    {
        channel = a - 208;
        snd_seq_ev_set_chanpress(&ev, channel, b);
    }
    else if (a >= 192)      /* program change */
    {
        channel = a - 192;
        snd_seq_ev_set_pgmchange(&ev, channel, b);
    }
    else if (a >= 176)      /* control change */
    {
        channel = a - 176;
        snd_seq_ev_set_controller(&ev, channel, b, c);
    }
    else if (a >= 160)      /* polyphonic aftertouch */
    {
        channel = a - 160;
        snd_seq_ev_set_keypress(&ev, channel, b, c);
    }
    else if (a >= 144)      /* note on / note off (velocity 0) */
    {
        channel = a - 144;
        if (c)
            snd_seq_ev_set_noteon(&ev, channel, b, c);
        else
            snd_seq_ev_set_noteoff(&ev, channel, b, c);
    }
    else
        return;             /* unsupported status byte: drop the message */
    snd_seq_ev_set_direct(&ev);
    snd_seq_ev_set_subs(&ev);
    /* NOTE(review): always sends from port 0, not alsa_midioutfd[portno];
       kept as-is because the port arrays may not cover portno -- verify */
    snd_seq_ev_set_source(&ev, alsa_midioutfd[0]);
    snd_seq_event_output_direct(midi_handle, &ev);
}
/* Stub: sending single raw MIDI bytes (sysex etc.) is not implemented
 * for the ALSA sequencer backend; the sketched implementation is kept
 * commented out for reference. */
void sys_putmidibyte(int portno, int byte)
{
/* snd_midi_event_t *dev;
snd_seq_event_t ev;
snd_seq_ev_clear(&ev);
dev = (snd_midi_event_t*)malloc(4);
if (portno >= 0 && portno < alsa_nmidiout)
{
//alsa_midiout(alsa_midioutfd[portno], byte);
snd_midi_event_encode_byte(dev, byte,&ev);
snd_seq_ev_set_direct(&ev);
snd_seq_ev_set_subs(&ev);
snd_seq_ev_set_source(&ev,alsa_port);
snd_seq_event_output_direct(midi_handle,&ev);
}*/
}
#if 0 /* this is the "select" version which doesn't work with OSS
driver for emu10k1 (it doesn't implement select.) */
/* select()-based MIDI input polling on raw file descriptors.
 * This whole branch is compiled out (#if 0 above): it predates the
 * sequencer-based version below and fails with drivers that don't
 * implement select (see the comment on the #if). Drains up to
 * 'throttle' bytes per call to avoid starving the scheduler. */
void sys_poll_midi(void)
{
int i, throttle = 100;
struct timeval timout;
int did = 1, maxfd = 0;
while (did)
{
fd_set readset, writeset, exceptset;
did = 0;
if (throttle-- < 0)
break;
timout.tv_sec = 0;
timout.tv_usec = 0;
FD_ZERO(&writeset);
FD_ZERO(&readset);
FD_ZERO(&exceptset);
for (i = 0; i < alsa_nmidiin; i++)
{
if (alsa_midiinfd[i] > maxfd)
maxfd = alsa_midiinfd[i];
FD_SET(alsa_midiinfd[i], &readset);
}
select(maxfd+1, &readset, &writeset, &exceptset, &timout);
for (i = 0; i < alsa_nmidiin; i++)
if (FD_ISSET(alsa_midiinfd[i], &readset))
{
char c;
int ret = read(alsa_midiinfd[i], &c, 1);
if (ret <= 0)
fprintf(stderr, "Midi read error\n");
else sys_midibytein(i, (c & 0xff));
did = 1;
}
}
}
#else
/* this version uses the asynchronous "read()" ... */
/* Poll the ALSA sequencer for incoming MIDI, decode at most one event
 * into raw bytes and feed them to sys_midibytein.
 *
 * Fix relative to the original: the return value of snd_midi_event_new
 * was ignored, so on allocation failure the uninitialized 'dev' was
 * passed to snd_midi_event_init (crash). */
void sys_poll_midi(void)
{
    char buf[20];
    int count, alsa_source;
    int i;
    snd_midi_event_t *dev = NULL;
    snd_seq_event_t *midievent = NULL;

    if (snd_midi_event_new(20, &dev) != 0)
        return;                          /* decoder allocation failed */
    snd_midi_event_init(dev);
    count = snd_seq_event_input_pending(midi_handle, 1);
    if (count != 0)
        count = snd_seq_event_input(midi_handle, &midievent);
    if (midievent != NULL)
    {
        count = snd_midi_event_decode(dev, buf, 20, midievent);
        /* NOTE(review): 'dest.port' is our own (receiving) port here,
           used as the logical input index -- verify this mapping */
        alsa_source = midievent->dest.port;
        for (i = 0; i < count; i++)      /* count < 0 on decode error: loop skipped */
            sys_midibytein(alsa_source, (buf[i] & 0xff));
    }
    snd_midi_event_free(dev);
}
#endif
void sys_close_midi()
{
/*int i;
for (i = 0; i < alsa_nmidiin; i++)
close(alsa_midiinfd[i]);
for (i = 0; i < alsa_nmidiout; i++)
close(alsa_midioutfd[i]);*/
alsa_nmidiin = alsa_nmidiout = 0;
snd_seq_close(midi_handle);
}
#define NSEARCH 10
static int alsa_nmidiindevs, alsa_nmidioutdevs, alsa_initted;
/* One-time initialisation of the ALSA MIDI backend (idempotent; the
 * flag is all there currently is to set up). Unused local 'i' removed. */
void midi_alsa_init(void)
{
    if (alsa_initted)
        return;
    alsa_initted = 1;
}
/* Report available MIDI devices: writes up to maxndev device-name
 * strings of devdescsize bytes each into indevlist/outdevlist and
 * stores the counts in *nindevs/*noutdevs.
 * Fix relative to the original: the labels said "OSS MIDI device" --
 * a copy-paste from the OSS backend -- in this ALSA implementation. */
void midi_getdevs(char *indevlist, int *nindevs,
char *outdevlist, int *noutdevs, int maxndev, int devdescsize)
{
    int i, ndev;
    if ((ndev = alsa_nmidiindevs) > maxndev)
        ndev = maxndev;
    for (i = 0; i < ndev; i++)
        sprintf(indevlist + i * devdescsize, "ALSA MIDI device #%d", i+1);
    *nindevs = ndev;
    if ((ndev = alsa_nmidioutdevs) > maxndev)
        ndev = maxndev;
    for (i = 0; i < ndev; i++)
        sprintf(outdevlist + i * devdescsize, "ALSA MIDI device #%d", i+1);
    *noutdevs = ndev;
}
--- NEW FILE: m_simd_sse_vc.c ---
/*
Implementation of SIMD functionality for Intel SSE with VC++ compiler
added by T.Grill
*/
#include "m_pd.h"
#include "m_simd.h"
#if defined(NT) && defined(_MSC_VER) && !(defined DONTUSESIMD)
/* Zero a signal vector with SSE (VC++ inline assembly).
 * dst is assumed to be 16-byte aligned and n a multiple of 16; four xmm
 * stores (16 floats) per iteration, counted down in ecx.
 * NOTE(review): 32-bit x86 only (32-bit registers hold pointers). */
void zerovec_simd(t_float *dst,int n)
{
__asm {
mov edx,dword ptr [dst] /* out */
xorps xmm0,xmm0 /* zero value */
mov ecx,[n] /* n */
shr ecx,4 /* ecx = number of 16-float blocks */
/* should we do more loop unrolling? */
loopa:
movaps xmmword ptr[edx],xmm0
movaps xmmword ptr[edx+4*TYPE t_float],xmm0
movaps xmmword ptr[edx+8*TYPE t_float],xmm0
movaps xmmword ptr[edx+12*TYPE t_float],xmm0
add edx,16*TYPE t_float
/* very short loop - let's assume that branch prediction does its job nicely */
loop loopa
}
}
/* Fill a signal vector with the constant v using SSE (VC++ inline asm).
 * dst is assumed to be 16-byte aligned and n a multiple of 16; v is
 * loaded into lane 0 and broadcast to all four lanes with shufps. */
void setvec_simd(t_float *dst,t_float v,int n)
{
__asm {
mov edx,dword ptr [dst] /* out */
/* load value ... this is not very clean.. */
movss xmm0,xmmword ptr [v]
shufps xmm0,xmm0,0 /* broadcast v to all four lanes */
mov ecx,[n] /* n */
shr ecx,4 /* ecx = number of 16-float blocks */
/* should we do more loop unrolling? */
loopa:
movaps xmmword ptr[edx],xmm0
movaps xmmword ptr[edx+4*TYPE t_float],xmm0
movaps xmmword ptr[edx+8*TYPE t_float],xmm0
movaps xmmword ptr[edx+12*TYPE t_float],xmm0
add edx,16*TYPE t_float
/* very short loop - let's assume that branch prediction does its job nicely */
loop loopa
}
}
/* dst and src are assumed to be aligned */
/* copyvec_simd: copy n floats from src to dst with SSE, 16 per
   iteration.  Both pointers must be 16-byte aligned and n a multiple
   of 16 (n >= 16, or the `loop` counter wraps). */
void copyvec_simd(t_float *dst,const t_float *src,int n)
{
__asm {
mov ebx,dword ptr [src] /* in1 */
/* prefetcht0 [ebx] */
mov edx,dword ptr [dst] /* out */
mov ecx,dword ptr [n] /* n */
shr ecx,4
/* prefetcht0 [ebx+8*TYPE t_float] */
loopa:
/* prefetcht0 [ebx+12*TYPE t_float] */
movaps xmm0,xmmword ptr[ebx]
movaps xmmword ptr[edx],xmm0
movaps xmm1,xmmword ptr[ebx+4*TYPE t_float]
movaps xmmword ptr[edx+4*TYPE t_float],xmm1
/* prefetcht0 [ebx+16*TYPE t_float] */
movaps xmm2,xmmword ptr[ebx+8*TYPE t_float]
movaps xmmword ptr[edx+8*TYPE t_float],xmm2
movaps xmm3,xmmword ptr[ebx+12*TYPE t_float]
movaps xmmword ptr[edx+12*TYPE t_float],xmm3
add ebx,16*TYPE t_float
add edx,16*TYPE t_float
loop loopa
}
}
/* addvec_simd: in-place accumulate, dst[i] += src[i] for i in [0,n),
   16 samples per iteration.  Both vectors must be 16-byte aligned and
   n a multiple of 16 (n >= 16, or the `loop` counter wraps). */
void addvec_simd(t_float *dst,const t_float *src,int n)
{
__asm {
mov eax,dword ptr [src] /* in1 */
/* prefetcht0 [eax] prefetch first cache line */
mov edx,dword ptr [dst] /* out */
mov ecx,dword ptr [n] /* n */
shr ecx,4 /* divide by 16 */
xor esi,esi /* reset index */
/*
prefetcht0 [eax+8*TYPE t_float]
prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
prefetcht0 [eax+16*TYPE t_float]
prefetcht0 [ebx+16*TYPE t_float]
*/
movaps xmm0,xmmword ptr[edx+esi]
movaps xmm1,xmmword ptr[eax+esi]
addps xmm0,xmm1
movaps xmmword ptr[edx+esi],xmm0
movaps xmm2,xmmword ptr[edx+esi+4*TYPE t_float]
movaps xmm3,xmmword ptr[eax+esi+4*TYPE t_float]
addps xmm2,xmm3
movaps xmmword ptr[edx+esi+4*TYPE t_float],xmm2
/*
prefetcht0 [eax+24*TYPE t_float]
prefetcht0 [ebx+24*TYPE t_float]
*/
movaps xmm4,xmmword ptr[edx+esi+8*TYPE t_float]
movaps xmm5,xmmword ptr[eax+esi+8*TYPE t_float]
addps xmm4,xmm5
movaps xmmword ptr[edx+esi+8*TYPE t_float],xmm4
movaps xmm6,xmmword ptr[edx+esi+12*TYPE t_float]
movaps xmm7,xmmword ptr[eax+esi+12*TYPE t_float]
addps xmm6,xmm7
movaps xmmword ptr[edx+esi+12*TYPE t_float],xmm6
add esi,16*TYPE t_float
loop loopa
}
}
/* "testing" copy (presumably filtering out denormals/NaNs as the
   scalar testcopyvec does - confirm against the generic routine):
   no SSE version yet, delegate to the 8-fold unrolled C routine. */
void testcopyvec_simd(t_float *dst,const t_float *src,int n)
{
testcopyvec_8(dst,src,n);
}
/* "testing" accumulate: no SSE version yet, delegate to the 8-fold
   unrolled C routine. */
void testaddvec_simd(t_float *dst,const t_float *src,int n)
{
testaddvec_8(dst,src,n);
}
/* DSP perform routine: clear an aligned signal vector via the SSE
 * zerovec_simd.  w[1] = vector, w[2] = sample count. */
t_int *zero_perf_simd(t_int *w)
{
    t_float *vec = (t_float *)w[1];
    int len = (int)w[2];
    zerovec_simd(vec, len);
    return w + 3;
}
/* DSP perform routine: copy one aligned signal vector to another via
 * the SSE copyvec_simd.  w[1] = source, w[2] = destination, w[3] = n. */
t_int *copy_perf_simd(t_int *w)
{
    const t_float *src = (const t_float *)w[1];
    t_float *dst = (t_float *)w[2];
    int len = (int)w[3];
    copyvec_simd(dst, src, len);
    return w + 4;
}
/* DSP perform routine for sig~: fill the output vector with the
 * current scalar value via the SSE setvec_simd.
 * w[1] = pointer to the scalar, w[2] = output vector, w[3] = n. */
t_int *sig_tilde_perf_simd(t_int *w)
{
    const t_float *val = (const t_float *)w[1];
    t_float *out = (t_float *)w[2];
    int len = (int)w[3];
    setvec_simd(out, *val, len);
    return w + 4;
}
/* perform routine for vector + vector, SSE version.
   w[1]=in1, w[2]=in2, w[3]=out, w[4]=n; 16 samples per iteration.
   All three vectors must be 16-byte aligned and n a multiple of 16
   (n >= 16, or the `loop` counter wraps). */
t_int *plus_perf_simd(t_int *w)
{
__asm {
mov esi,dword ptr [w]
mov eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/* prefetcht0 [eax] prefetch first cache line */
mov ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/* prefetcht0 [ebx] prefetch first cache line */
mov edx,dword ptr [esi + 3*TYPE t_int] /* out */
mov ecx,dword ptr [esi + 4*TYPE t_int] /* n */
shr ecx,4 /* divide by 16 */
xor esi,esi /* reset index */
/*
prefetcht0 [eax+8*TYPE t_float]
prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
prefetcht0 [eax+16*TYPE t_float]
prefetcht0 [ebx+16*TYPE t_float]
*/
movaps xmm0,xmmword ptr[eax+esi]
movaps xmm1,xmmword ptr[ebx+esi]
addps xmm0,xmm1
movaps xmmword ptr[edx+esi],xmm0
movaps xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
movaps xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
addps xmm2,xmm3
movaps xmmword ptr[edx+esi+4*TYPE t_float],xmm2
/*
prefetcht0 [eax+24*TYPE t_float]
prefetcht0 [ebx+24*TYPE t_float]
*/
movaps xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
movaps xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
addps xmm4,xmm5
movaps xmmword ptr[edx+esi+8*TYPE t_float],xmm4
movaps xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
movaps xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
addps xmm6,xmm7
movaps xmmword ptr[edx+esi+12*TYPE t_float],xmm6
add esi,16*TYPE t_float
loop loopa
}
return (w+5);
}
/* perform routine for vector + scalar, SSE version.
   w[1]=in, w[2]=pointer to scalar, w[3]=out, w[4]=n; 16 samples per
   iteration.  Vectors must be 16-byte aligned and n a multiple of 16
   (n >= 16, or the `loop` counter wraps). */
t_int *scalarplus_perf_simd(t_int *w)
{
__asm {
mov esi,dword ptr [w]
mov ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/* prefetcht0 [ebx] prefetch first cache line */
mov edx,dword ptr [esi + 3*TYPE t_int] /* out */
/* load value ... this is not very clean.. */
mov eax,dword ptr [esi + 2*TYPE t_int] /* value */
movss xmm0,xmmword ptr [eax]
shufps xmm0,xmm0,0 /* broadcast scalar to all four lanes */
mov ecx,dword ptr [esi + 4*TYPE t_int] /* n */
shr ecx,4
/* prefetcht0 [ebx+8*TYPE t_float] */
loopa:
/* prefetcht0 [ebx+16*TYPE t_float] */
movaps xmm1,xmmword ptr[ebx]
addps xmm1,xmm0
movaps xmmword ptr[edx],xmm1
movaps xmm2,xmmword ptr[ebx+4*TYPE t_float]
addps xmm2,xmm0
movaps xmmword ptr[edx+4*TYPE t_float],xmm2
/* prefetcht0 [ebx+24*TYPE t_float] */
movaps xmm3,xmmword ptr[ebx+8*TYPE t_float]
addps xmm3,xmm0
movaps xmmword ptr[edx+8*TYPE t_float],xmm3
movaps xmm4,xmmword ptr[ebx+12*TYPE t_float]
addps xmm4,xmm0
movaps xmmword ptr[edx+12*TYPE t_float],xmm4
add ebx,16*TYPE t_float
add edx,16*TYPE t_float
loop loopa
}
return (w+5);
}
/* perform routine for vector - vector, SSE version.
   w[1]=in1, w[2]=in2, w[3]=out, w[4]=n; out[i] = in1[i] - in2[i],
   16 samples per iteration.  All vectors must be 16-byte aligned and
   n a multiple of 16 (n >= 16, or the `loop` counter wraps). */
t_int *minus_perf_simd(t_int *w)
{
__asm {
mov esi,dword ptr [w]
mov eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/* prefetcht0 [eax] prefetch first cache line */
mov ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/* prefetcht0 [ebx] prefetch first cache line */
mov edx,dword ptr [esi + 3*TYPE t_int] /* out */
mov ecx,dword ptr [esi + 4*TYPE t_int] /* n */
shr ecx,4
xor esi,esi /* reset index */
/*
prefetcht0 [eax+8*TYPE t_float]
prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
prefetcht0 [eax+16*TYPE t_float]
prefetcht0 [ebx+16*TYPE t_float]
*/
movaps xmm0,xmmword ptr[eax+esi]
movaps xmm1,xmmword ptr[ebx+esi]
subps xmm0,xmm1
movaps xmmword ptr[edx+esi],xmm0
movaps xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
movaps xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
subps xmm2,xmm3
movaps xmmword ptr[edx+esi+4*TYPE t_float],xmm2
/*
prefetcht0 [eax+24*TYPE t_float]
prefetcht0 [ebx+24*TYPE t_float]
*/
movaps xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
movaps xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
subps xmm4,xmm5
movaps xmmword ptr[edx+esi+8*TYPE t_float],xmm4
movaps xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
movaps xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
subps xmm6,xmm7
movaps xmmword ptr[edx+esi+12*TYPE t_float],xmm6
add esi,16*TYPE t_float
loop loopa
}
return (w+5);
}
/* perform routine for vector - scalar, SSE version.
   w[1]=in, w[2]=pointer to scalar, w[3]=out, w[4]=n;
   out[i] = in[i] - scalar, 16 samples per iteration.  Vectors must be
   16-byte aligned and n a multiple of 16 (n >= 16, or the `loop`
   counter wraps). */
t_int *scalarminus_perf_simd(t_int *w)
{
__asm {
mov esi,dword ptr [w]
mov ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/* prefetcht0 [ebx] prefetch first cache line */
mov edx,dword ptr [esi + 3*TYPE t_int] /* out */
/* load value ... this is not very clean.. */
mov eax,dword ptr [esi + 2*TYPE t_int] /* g */
movss xmm0,xmmword ptr [eax]
shufps xmm0,xmm0,0 /* broadcast scalar to all four lanes */
mov ecx,dword ptr [esi + 4*TYPE t_int] /* n */
shr ecx,4
/* prefetcht0 [ebx+8*TYPE t_float] */
loopa:
/* prefetcht0 [ebx+16*TYPE t_float] */
movaps xmm1,xmmword ptr[ebx]
subps xmm1,xmm0
movaps xmmword ptr[edx],xmm1
movaps xmm2,xmmword ptr[ebx+4*TYPE t_float]
subps xmm2,xmm0
movaps xmmword ptr[edx+4*TYPE t_float],xmm2
/* prefetcht0 [ebx+24*TYPE t_float] */
movaps xmm3,xmmword ptr[ebx+8*TYPE t_float]
subps xmm3,xmm0
movaps xmmword ptr[edx+8*TYPE t_float],xmm3
movaps xmm4,xmmword ptr[ebx+12*TYPE t_float]
subps xmm4,xmm0
movaps xmmword ptr[edx+12*TYPE t_float],xmm4
add ebx,16*TYPE t_float
add edx,16*TYPE t_float
loop loopa
}
return (w+5);
}
/* perform routine for vector * vector, SSE version.
   w[1]=in1, w[2]=in2, w[3]=out, w[4]=n; 16 samples per iteration.
   All vectors must be 16-byte aligned and n a multiple of 16
   (n >= 16, or the `loop` counter wraps). */
t_int *times_perf_simd(t_int *w)
{
__asm {
mov esi,dword ptr [w]
mov eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/* prefetcht0 [eax] prefetch first cache line */
mov ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/* prefetcht0 [ebx] prefetch first cache line */
mov edx,dword ptr [esi + 3*TYPE t_int] /* out */
mov ecx,dword ptr [esi + 4*TYPE t_int] /* n */
shr ecx,4
xor esi,esi /* reset index */
/*
prefetcht0 [eax+8*TYPE t_float]
prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
prefetcht0 [eax+16*TYPE t_float]
prefetcht0 [ebx+16*TYPE t_float]
*/
movaps xmm0,xmmword ptr[eax+esi]
movaps xmm1,xmmword ptr[ebx+esi]
mulps xmm0,xmm1
movaps xmmword ptr[edx+esi],xmm0
movaps xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
movaps xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
mulps xmm2,xmm3
movaps xmmword ptr[edx+esi+4*TYPE t_float],xmm2
/*
prefetcht0 [eax+24*TYPE t_float]
prefetcht0 [ebx+24*TYPE t_float]
*/
movaps xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
movaps xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
mulps xmm4,xmm5
movaps xmmword ptr[edx+esi+8*TYPE t_float],xmm4
movaps xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
movaps xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
mulps xmm6,xmm7
movaps xmmword ptr[edx+esi+12*TYPE t_float],xmm6
add esi,16*TYPE t_float
loop loopa
}
return (w+5);
}
/* perform routine for vector * scalar, SSE version.
   w[1]=in, w[2]=pointer to scalar, w[3]=out, w[4]=n; 16 samples per
   iteration.  Vectors must be 16-byte aligned and n a multiple of 16
   (n >= 16, or the `loop` counter wraps). */
t_int *scalartimes_perf_simd(t_int *w)
{
__asm {
mov esi,dword ptr [w]
mov ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/* prefetcht0 [ebx] prefetch first cache line */
mov edx,dword ptr [esi + 3*TYPE t_int] /* out */
/* load value ... this is not very clean.. */
mov eax,dword ptr [esi + 2*TYPE t_int] /* g */
movss xmm0,xmmword ptr [eax]
shufps xmm0,xmm0,0 /* broadcast scalar to all four lanes */
mov ecx,dword ptr [esi + 4*TYPE t_int] /* n */
shr ecx,4
/* prefetcht0 [ebx+8*TYPE t_float] */
loopa:
/* prefetcht0 [ebx+16*TYPE t_float] */
movaps xmm1,xmmword ptr[ebx]
mulps xmm1,xmm0
movaps xmmword ptr[edx],xmm1
movaps xmm2,xmmword ptr[ebx+4*TYPE t_float]
mulps xmm2,xmm0
movaps xmmword ptr[edx+4*TYPE t_float],xmm2
/* prefetcht0 [ebx+24*4] */
movaps xmm3,xmmword ptr[ebx+8*TYPE t_float]
mulps xmm3,xmm0
movaps xmmword ptr[edx+8*TYPE t_float],xmm3
movaps xmm4,xmmword ptr[ebx+12*TYPE t_float]
mulps xmm4,xmm0
movaps xmmword ptr[edx+12*TYPE t_float],xmm4
add ebx,16*TYPE t_float
add edx,16*TYPE t_float
loop loopa
}
return (w+5);
}
/* perform routine for squaring a vector, SSE version.
   w[1]=in, w[2]=out, w[3]=n; out[i] = in[i]*in[i], 16 samples per
   iteration.  Vectors must be 16-byte aligned and n a multiple of 16
   (n >= 16, or the `loop` counter wraps). */
t_int *sqr_perf_simd(t_int *w)
{
__asm {
mov esi,dword ptr [w]
mov ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/* prefetcht0 [ebx] prefetch first cache line */
mov edx,dword ptr [esi + 2*TYPE t_int] /* out */
mov ecx,dword ptr [esi + 3*TYPE t_int] /* n */
shr ecx,4
/* prefetcht0 [ebx+8*TYPE t_float] */
loopa:
/* prefetcht0 [ebx+16*TYPE t_float] */
movaps xmm0,xmmword ptr[ebx]
mulps xmm0,xmm0
movaps xmmword ptr[edx],xmm0
movaps xmm1,xmmword ptr[ebx+4*TYPE t_float]
mulps xmm1,xmm1
movaps xmmword ptr[edx+4*TYPE t_float],xmm1
/* prefetcht0 [ebx+24*4] */
movaps xmm2,xmmword ptr[ebx+8*TYPE t_float]
mulps xmm2,xmm2
movaps xmmword ptr[edx+8*TYPE t_float],xmm2
movaps xmm3,xmmword ptr[ebx+12*TYPE t_float]
mulps xmm3,xmm3
movaps xmmword ptr[edx+12*TYPE t_float],xmm3
add ebx,16*TYPE t_float
add edx,16*TYPE t_float
loop loopa
}
return (w+4);
}
/* no checking for 0 yet!! */
/* perform routine for vector / vector, SSE version.
   w[1]=in1, w[2]=in2, w[3]=out, w[4]=n; out[i] = in1[i] / in2[i],
   16 samples per iteration.  A zero divisor yields SSE inf/NaN (see
   the note above).  All vectors must be 16-byte aligned and n a
   multiple of 16 (n >= 16, or the `loop` counter wraps). */
t_int *over_perf_simd(t_int *w)
{
__asm {
mov esi,dword ptr [w]
mov eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/* prefetcht0 [eax] prefetch first cache line */
mov ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/* prefetcht0 [ebx] prefetch first cache line */
mov edx,dword ptr [esi + 3*TYPE t_int] /* out */
mov ecx,dword ptr [esi + 4*TYPE t_int] /* n */
shr ecx,4
xor esi,esi /* reset index */
/*
prefetcht0 [eax+8*TYPE t_float]
prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
prefetcht0 [eax+16*TYPE t_float]
prefetcht0 [ebx+16*TYPE t_float]
*/
movaps xmm0,xmmword ptr[eax+esi]
movaps xmm1,xmmword ptr[ebx+esi]
divps xmm0,xmm1
movaps xmmword ptr[edx+esi],xmm0
movaps xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
movaps xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
divps xmm2,xmm3
movaps xmmword ptr[edx+esi+4*TYPE t_float],xmm2
/*
prefetcht0 [eax+24*TYPE t_float]
prefetcht0 [ebx+24*TYPE t_float]
*/
movaps xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
movaps xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
divps xmm4,xmm5
movaps xmmword ptr[edx+esi+8*TYPE t_float],xmm4
movaps xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
movaps xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
divps xmm6,xmm7
movaps xmmword ptr[edx+esi+12*TYPE t_float],xmm6
add esi,16*TYPE t_float
loop loopa
}
return (w+5);
}
/* perform routine for vector / scalar, SSE version.
   w[1]=in, w[2]=pointer to scalar, w[3]=out, w[4]=n.
   Computes the reciprocal once with a full-precision divss, then
   multiplies; if the scalar is exactly 0 the reciprocal stage is
   skipped and xmm0 stays 0, so the whole output becomes 0 (rather
   than inf).  Vectors must be 16-byte aligned and n a multiple of 16
   (n >= 16, or the `loop` counter wraps). */
t_int *scalarover_perf_simd(t_int *w)
{
static const float one = 1.f;
__asm {
mov esi,dword ptr [w]
mov ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/* prefetcht0 [ebx] prefetch first cache line */
mov edx,dword ptr [esi + 3*TYPE t_int] /* out */
/* load value */
mov eax,dword ptr [esi + 2*TYPE t_int] /* g */
movss xmm1,xmmword ptr [eax]
mov ecx,dword ptr [esi + 4*TYPE t_int] /* n */
shr ecx,4
/* prefetcht0 [ebx+8*TYPE t_float] */
/* check for zero */
xorps xmm0,xmm0
comiss xmm1,xmm0 /* compare xmm1 to 0 */
/* if xmm1 is zero (and also xmm0!) -> goto loopa */
jz loopa
/* else, invert xmm0 */
/* rcpps xmm0,xmm0 ... far too unprecise!! */
movss xmm0,[one]
divss xmm0,xmm1 /* divide xmm0 by xmm1 */
shufps xmm0,xmm0,0 /* make xmm0 all the same */
loopa:
/* prefetcht0 [ebx+16*TYPE t_float] */
movaps xmm1,xmmword ptr[ebx]
mulps xmm1,xmm0
movaps xmmword ptr[edx],xmm1
movaps xmm2,xmmword ptr[ebx+4*TYPE t_float]
mulps xmm2,xmm0
movaps xmmword ptr[edx+4*TYPE t_float],xmm2
/* prefetcht0 [ebx+24*4] */
movaps xmm3,xmmword ptr[ebx+8*TYPE t_float]
mulps xmm3,xmm0
movaps xmmword ptr[edx+8*TYPE t_float],xmm3
movaps xmm4,xmmword ptr[ebx+12*TYPE t_float]
mulps xmm4,xmm0
movaps xmmword ptr[edx+12*TYPE t_float],xmm4
add ebx,16*TYPE t_float
add edx,16*TYPE t_float
loop loopa
}
return (w+5);
}
/* perform routine for element-wise max of two vectors, SSE version.
   w[1]=in1, w[2]=in2, w[3]=out, w[4]=n; 16 samples per iteration.
   All vectors must be 16-byte aligned and n a multiple of 16
   (n >= 16, or the `loop` counter wraps). */
t_int *max_perf_simd(t_int *w)
{
__asm {
mov esi,dword ptr [w]
mov eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/* prefetcht0 [eax] */
mov ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/* prefetcht0 [ebx] */
mov edx,dword ptr [esi + 3*TYPE t_int] /* out */
mov ecx,dword ptr [esi + 4*TYPE t_int] /* n */
shr ecx,4
xor esi,esi /* reset index */
/*
prefetcht0 [eax+8*TYPE t_float]
prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
prefetcht0 [eax+16*TYPE t_float]
prefetcht0 [ebx+16*TYPE t_float]
*/
movaps xmm0,xmmword ptr[eax+esi]
movaps xmm1,xmmword ptr[ebx+esi]
maxps xmm0,xmm1
movaps xmmword ptr[edx+esi],xmm0
movaps xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
movaps xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
maxps xmm2,xmm3
movaps xmmword ptr[edx+esi+4*TYPE t_float],xmm2
/*
prefetcht0 [eax+24*TYPE t_float]
prefetcht0 [ebx+24*TYPE t_float]
*/
movaps xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
movaps xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
maxps xmm4,xmm5
movaps xmmword ptr[edx+esi+8*TYPE t_float],xmm4
movaps xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
movaps xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
maxps xmm6,xmm7
movaps xmmword ptr[edx+esi+12*TYPE t_float],xmm6
add esi,16*TYPE t_float
loop loopa
}
return (w+5);
}
/* perform routine for element-wise max against a scalar, SSE version.
   w[1]=in, w[2]=pointer to scalar, w[3]=out, w[4]=n; 16 samples per
   iteration.  Vectors must be 16-byte aligned and n a multiple of 16
   (n >= 16, or the `loop` counter wraps). */
t_int *scalarmax_perf_simd(t_int *w)
{
__asm {
mov esi,dword ptr [w]
mov ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/* prefetcht0 [ebx] */
mov edx,dword ptr [esi + 3*TYPE t_int] /* out */
/* load value ... this is not very clean.. */
mov eax,dword ptr [esi + 2*TYPE t_int] /* g */
movss xmm0,xmmword ptr [eax]
shufps xmm0,xmm0,0 /* broadcast scalar to all four lanes */
mov ecx,dword ptr [esi + 4*TYPE t_int] /* n */
shr ecx,4
/* prefetcht0 [ebx+8*TYPE t_float] */
loopa:
/* prefetcht0 [ebx+16*TYPE t_float] */
movaps xmm1,xmmword ptr[ebx]
maxps xmm1,xmm0
movaps xmmword ptr[edx],xmm1
movaps xmm2,xmmword ptr[ebx+4*TYPE t_float]
maxps xmm2,xmm0
movaps xmmword ptr[edx+4*TYPE t_float],xmm2
/* prefetcht0 [ebx+24*TYPE t_float] */
movaps xmm3,xmmword ptr[ebx+8*TYPE t_float]
maxps xmm3,xmm0
movaps xmmword ptr[edx+8*TYPE t_float],xmm3
movaps xmm4,xmmword ptr[ebx+12*TYPE t_float]
maxps xmm4,xmm0
movaps xmmword ptr[edx+12*TYPE t_float],xmm4
add ebx,16*TYPE t_float
add edx,16*TYPE t_float
loop loopa
}
return (w+5);
}
/* perform routine for element-wise min of two vectors, SSE version.
   w[1]=in1, w[2]=in2, w[3]=out, w[4]=n; 16 samples per iteration.
   All vectors must be 16-byte aligned and n a multiple of 16
   (n >= 16, or the `loop` counter wraps). */
t_int *min_perf_simd(t_int *w)
{
__asm {
mov esi,dword ptr [w]
mov eax,dword ptr [esi + 1*TYPE t_int] /* in1 */
/* prefetcht0 [eax] */
mov ebx,dword ptr [esi + 2*TYPE t_int] /* in2 */
/* prefetcht0 [ebx] */
mov edx,dword ptr [esi + 3*TYPE t_int] /* out */
mov ecx,dword ptr [esi + 4*TYPE t_int] /* n */
shr ecx,4
xor esi,esi /* reset index */
/*
prefetcht0 [eax+8*TYPE t_float]
prefetcht0 [ebx+8*TYPE t_float]
*/
loopa:
/*
prefetcht0 [eax+16*TYPE t_float]
prefetcht0 [ebx+16*TYPE t_float]
*/
movaps xmm0,xmmword ptr[eax+esi]
movaps xmm1,xmmword ptr[ebx+esi]
minps xmm0,xmm1
movaps xmmword ptr[edx+esi],xmm0
movaps xmm2,xmmword ptr[eax+esi+4*TYPE t_float]
movaps xmm3,xmmword ptr[ebx+esi+4*TYPE t_float]
minps xmm2,xmm3
movaps xmmword ptr[edx+esi+4*TYPE t_float],xmm2
/*
prefetcht0 [eax+24*TYPE t_float]
prefetcht0 [ebx+24*TYPE t_float]
*/
movaps xmm4,xmmword ptr[eax+esi+8*TYPE t_float]
movaps xmm5,xmmword ptr[ebx+esi+8*TYPE t_float]
minps xmm4,xmm5
movaps xmmword ptr[edx+esi+8*TYPE t_float],xmm4
movaps xmm6,xmmword ptr[eax+esi+12*TYPE t_float]
movaps xmm7,xmmword ptr[ebx+esi+12*TYPE t_float]
minps xmm6,xmm7
movaps xmmword ptr[edx+esi+12*TYPE t_float],xmm6
add esi,16*TYPE t_float
loop loopa
}
return (w+5);
}
/* perform routine for element-wise min against a scalar, SSE version.
   w[1]=in, w[2]=pointer to scalar, w[3]=out, w[4]=n; 16 samples per
   iteration.  Vectors must be 16-byte aligned and n a multiple of 16
   (n >= 16, or the `loop` counter wraps). */
t_int *scalarmin_perf_simd(t_int *w)
{
__asm {
mov esi,dword ptr [w]
mov ebx,dword ptr [esi + 1*TYPE t_int] /* in */
/* prefetcht0 [ebx] */
mov edx,dword ptr [esi + 3*TYPE t_int] /* out */
/* load value ... this is not very clean.. */
mov eax,dword ptr [esi + 2*TYPE t_int] /* g */
movss xmm0,xmmword ptr [eax]
shufps xmm0,xmm0,0 /* broadcast scalar to all four lanes */
mov ecx,dword ptr [esi + 4*TYPE t_int] /* n */
shr ecx,4
/* prefetcht0 [ebx+8*TYPE t_float] */
loopa:
/* prefetcht0 [ebx+16*TYPE t_float] */
movaps xmm1,xmmword ptr[ebx]
minps xmm1,xmm0
movaps xmmword ptr[edx],xmm1
movaps xmm2,xmmword ptr[ebx+4*TYPE t_float]
minps xmm2,xmm0
movaps xmmword ptr[edx+4*TYPE t_float],xmm2
/* prefetcht0 [ebx+24*TYPE t_float] */
movaps xmm3,xmmword ptr[ebx+8*TYPE t_float]
minps xmm3,xmm0
movaps xmmword ptr[edx+8*TYPE t_float],xmm3
movaps xmm4,xmmword ptr[ebx+12*TYPE t_float]
minps xmm4,xmm0
movaps xmmword ptr[edx+12*TYPE t_float],xmm4
add ebx,16*TYPE t_float
add edx,16*TYPE t_float
loop loopa
}
return (w+5);
}
/* TB: runtime check */
/* simd_runtime_check: return nonzero if the CPU supports SSE.
   Executes CPUID leaf 1 and tests bit 25 (0x2000000) of EDX, the SSE
   feature flag.  NOTE(review): assumes the CPUID instruction itself
   exists - true for any CPU of the SSE era. */
int simd_runtime_check()
{
unsigned int redx;
__asm
{
mov eax, 1
cpuid
mov [redx],edx
}
return (0x2000000 & redx);
}
#endif
--- NEW FILE: m_simd_sse_gcc.h ---
/*
SIMD functionality for Intel SSE with GCC compiler
added by T.Grill
*/

#ifndef __M_SIMD_SSE_GCC_H
#define __M_SIMD_SSE_GCC_H

#include "m_pd.h"

/* SIMD perform routines for SSE with gcc; each is a drop-in
   replacement for the corresponding scalar perform routine. */

/* functions in d_ugen.c */
t_int *zero_perf_simd(t_int *w);

/* functions in d_dac.c */
t_int *copy_perf_simd(t_int *w);

/* functions in d_ctl.c */
t_int *sig_tilde_perf_simd(t_int *w);

/* functions in d_arithmetic.c */
t_int *plus_perf_simd(t_int *w);
t_int *scalarplus_perf_simd(t_int *w);
t_int *minus_perf_simd(t_int *w);
t_int *scalarminus_perf_simd(t_int *w);
t_int *times_perf_simd(t_int *w);
t_int *scalartimes_perf_simd(t_int *w);
t_int *sqr_perf_simd(t_int *w);
t_int *over_perf_simd(t_int *w);
t_int *scalarover_perf_simd(t_int *w);
t_int *max_perf_simd(t_int *w);
t_int *scalarmax_perf_simd(t_int *w);
t_int *min_perf_simd(t_int *w);
t_int *scalarmin_perf_simd(t_int *w);

/* functions in d_math.c: no SSE implementation yet - map the _simd
   names onto the generic perform routines.  (Prototypes for these
   four names were removed: they declared functions that are never
   defined, and any later use of the names is rewritten by these
   macros anyway.) */
#define clip_perf_simd clip_perform /* SIMD not implemented */
#define sigwrap_perf_simd sigwrap_perform /* SIMD not implemented */
#define sigsqrt_perf_simd sigsqrt_perform /* SIMD not implemented */
#define sigrsqrt_perf_simd sigrsqrt_perform /* SIMD not implemented */

#endif /* __M_SIMD_SSE_GCC_H */
--- NEW FILE: m_simd_sse_vc.h ---
/*
SIMD functionality for Intel SSE with VC++ compiler
added by T.Grill
*/

#ifndef __M_SIMD_SSE_VC_H
#define __M_SIMD_SSE_VC_H

#include "m_pd.h"

/* SIMD perform routines for SSE with VC++; each is a drop-in
   replacement for the corresponding scalar perform routine. */

/* functions in d_ugen.c */
t_int *zero_perf_simd(t_int *w);

/* functions in d_dac.c */
t_int *copy_perf_simd(t_int *w);

/* functions in d_ctl.c */
t_int *sig_tilde_perf_simd(t_int *w);

/* functions in d_arithmetic.c */
t_int *plus_perf_simd(t_int *w);
t_int *scalarplus_perf_simd(t_int *w);
t_int *minus_perf_simd(t_int *w);
t_int *scalarminus_perf_simd(t_int *w);
t_int *times_perf_simd(t_int *w);
t_int *scalartimes_perf_simd(t_int *w);
t_int *sqr_perf_simd(t_int *w);
t_int *over_perf_simd(t_int *w);
t_int *scalarover_perf_simd(t_int *w);
t_int *max_perf_simd(t_int *w);
t_int *scalarmax_perf_simd(t_int *w);
t_int *min_perf_simd(t_int *w);
t_int *scalarmin_perf_simd(t_int *w);

/* functions in d_math.c: no SSE implementation yet - map the _simd
   names onto the generic perform routines.  (Prototypes for these
   four names were removed: they declared functions that are never
   defined, and any later use of the names is rewritten by these
   macros anyway.) */
#define clip_perf_simd clip_perform /* SIMD not implemented */
#define sigwrap_perf_simd sigwrap_perform /* SIMD not implemented */
#define sigsqrt_perf_simd sigsqrt_perform /* SIMD not implemented */
#define sigrsqrt_perf_simd sigrsqrt_perform /* SIMD not implemented */

#endif /* __M_SIMD_SSE_VC_H */
--- NEW FILE: m_simd_def.h ---
/*
Default SIMD (non-)functionality
added by T.Grill
This is used when there's no implementation of SIMD code
for the current platform and/or compiler
*/
#ifndef __M_SIMD_DEF_H
#define __M_SIMD_DEF_H
/* These are the functions that can be coded for SIMD */
/* Every _simd name is mapped onto the generic 8-fold unrolled (or
   plain) perform routine, so code can call the _simd names
   unconditionally regardless of platform support. */
/* functions in d_ugen.c */
#define zero_perf_simd zero_perf8
/* functions in d_dac.c */
#define copy_perf_simd copy_perf8
/* functions in d_ctl.c */
#define sig_tilde_perf_simd sig_tilde_perf8
/* functions in d_arithmetic.c */
#define plus_perf_simd plus_perf8
#define scalarplus_perf_simd scalarplus_perf8
#define minus_perf_simd minus_perf8
#define scalarminus_perf_simd scalarminus_perf8
#define times_perf_simd times_perf8
#define scalartimes_perf_simd scalartimes_perf8
#define sqr_perf_simd sqr_perf8
#define over_perf_simd over_perf8
#define scalarover_perf_simd scalarover_perf8
#define min_perf_simd min_perf8
#define scalarmin_perf_simd scalarmin_perf8
#define max_perf_simd max_perf8
#define scalarmax_perf_simd scalarmax_perf8
/* functions in d_math.c */
#define clip_perf_simd clip_perform /* SIMD not implemented */
#define sigwrap_perf_simd sigwrap_perform /* SIMD not implemented */
#define sigsqrt_perf_simd sigsqrt_perform /* SIMD not implemented */
#define sigrsqrt_perf_simd sigrsqrt_perform /* SIMD not implemented */
#endif /* __M_SIMD_DEF_H */
--- NEW FILE: m_simd_ve_gcc.h ---
/*
SIMD functionality for Apple Velocity Engine (AltiVec) with GCC compiler
added by T.Grill
*/
#ifndef __M_SIMD_VE_GCC_H
#define __M_SIMD_VE_GCC_H
#include "m_pd.h"
/* SIMD functions for VE with GCC */
/* Each prototype is a drop-in replacement for the corresponding
   scalar perform routine.  Unlike the SSE headers, the d_math.c
   routines are declared as real functions here, so AltiVec builds
   presumably provide implementations for them - confirm in the
   matching m_simd_ve_gcc.c. */
/* functions in d_ugen.c */
t_int *zero_perf_simd(t_int *w);
/* functions in d_dac.c */
t_int *copy_perf_simd(t_int *w);
/* functions in d_ctl.c */
t_int *sig_tilde_perf_simd(t_int *w);
/* functions in d_arithmetic.c */
t_int *plus_perf_simd(t_int *w);
t_int *scalarplus_perf_simd(t_int *w);
t_int *minus_perf_simd(t_int *w);
t_int *scalarminus_perf_simd(t_int *w);
t_int *times_perf_simd(t_int *w);
t_int *scalartimes_perf_simd(t_int *w);
t_int *sqr_perf_simd(t_int *w);
t_int *over_perf_simd(t_int *w);
t_int *scalarover_perf_simd(t_int *w);
t_int *max_perf_simd(t_int *w);
t_int *scalarmax_perf_simd(t_int *w);
t_int *min_perf_simd(t_int *w);
t_int *scalarmin_perf_simd(t_int *w);
/* functions in d_math.c */
t_int *clip_perf_simd(t_int *w);
t_int *sigwrap_perf_simd(t_int *w);
t_int *sigsqrt_perf_simd(t_int *w);
t_int *sigrsqrt_perf_simd(t_int *w);
#endif /* __M_SIMD_VE_GCC_H */
- Previous message: [PD-cvs] pd/src configure,1.4,NONE
- Next message: [PD-cvs] pd/src d_arithmetic.c,1.2,1.2.4.1 d_array.c,1.3,1.3.4.1 d_ctl.c,1.3,1.3.4.1 d_dac.c,1.3,1.3.4.1
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
More information about the Pd-cvs
mailing list