[PD-cvs] externals/pdp/system/mmx Makefile, 1.3, 1.4 pdp_mmx_test.c, 1.2, 1.3 pixel_add_s16.s, 1.2, 1.3 pixel_biquad_dirI_s16.s, 1.2, 1.3 pixel_biquad_s16.s, 1.2, 1.3 pixel_ca_s1.s, 1.2, 1.3 pixel_cascade_s16.s, 1.2, 1.3 pixel_cheby_s16.s, 1.2, 1.3 pixel_conv_hor_s16.s, 1.2, 1.3 pixel_conv_ver_s16.s, 1.2, 1.3 pixel_crot_s16.s, 1.2, 1.3 pixel_gain.s, 1.2, 1.3 pixel_gain_s16.s, 1.2, 1.3 pixel_mix_s16.s, 1.2, 1.3 pixel_mul_s16.s, 1.2, 1.3 pixel_pack_s16u8.s, 1.2, 1.3 pixel_rand_s16.s, 1.2, 1.3 pixel_randmix_s16.s, 1.2, 1.3 pixel_resample_s16.s, 1.2, 1.3 pixel_s1.s, 1.2, 1.3 pixel_unpack_u8s16.s, 1.2, 1.3
Hans-Christoph Steiner
eighthave at users.sourceforge.net
Fri Dec 16 02:05:42 CET 2005
- Previous message: [PD-cvs] externals/pdp/system/image Makefile, 1.2, 1.3 pdp_imageproc_common.c, 1.2, 1.3 pdp_imageproc_mmx.c, 1.2, 1.3 pdp_imageproc_portable.c, 1.2, 1.3 pdp_llconv.c, 1.2, 1.3 pdp_llconv_mmx.c, 1.2, 1.3 pdp_llconv_portable.c, 1.2, 1.3 pdp_resample.c, 1.2, 1.3
- Next message: [PD-cvs] externals/pdp/system/type Makefile, 1.2, 1.3 pdp_bitmap.c, 1.2, 1.3 pdp_image.c, 1.2, 1.3 pdp_matrix.c, 1.2, 1.3
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
Update of /cvsroot/pure-data/externals/pdp/system/mmx
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv6756/system/mmx
Added Files:
Makefile pdp_mmx_test.c pixel_add_s16.s
pixel_biquad_dirI_s16.s pixel_biquad_s16.s pixel_ca_s1.s
pixel_cascade_s16.s pixel_cheby_s16.s pixel_conv_hor_s16.s
pixel_conv_ver_s16.s pixel_crot_s16.s pixel_gain.s
pixel_gain_s16.s pixel_mix_s16.s pixel_mul_s16.s
pixel_pack_s16u8.s pixel_rand_s16.s pixel_randmix_s16.s
pixel_resample_s16.s pixel_s1.s pixel_unpack_u8s16.s
Log Message:
checking in pdp 0.12.4 from http://zwizwa.fartit.com/pd/pdp/pdp-0.12.4.tar.gz
--- NEW FILE: pixel_resample_s16.s ---
#interpolation data:
#* 4 vectors: neighbourhood for samples (TL, TR, BL, BR)
#* 2 vectors: fractional part (unsigned)
#* 2 vectors: addresses of pixel blocks
#coord conversion data:
#1 vector: 32bit splatted address
#1 vector: 16bit splatted w-1
#1 vector: 16bit splatted h-1
#1 vector: 16bit splatted w (reuse w-1 with add?)
#1 dword: 32 bit line offset
#coord generation data: several vectors for parameter update stuff..
#coordinate systems: 16 bit virtual coordinates (signed, center relative)
#* 2 vectors: virtual coordinates
#(evt tussenstap + conversie naar 16 bit virtual)
#step 1: generate virtual coords
#step 2: virtual coords -> block addresses + fractional addresses
#* mulhigh: real coords (x,y) (center relative)
#* add center -> unsigned (top left relative)
#* mullow: fractional part (x_frac, y_frac)
#* mulhigh, mullow, pack 32bit: y_offset
#* pack 32bit: x_offset
#* add, shift, add start address: real addresses
#step3: data fetch using generated addresses:
# this step would be much simpler in 4x16bit rgba. life's a bitch..
#step 4: bilinear interpolation
#step 5: store
# this can be simplified by doing 32 bit unaligned moves
# and vector unpacking on the data
# cooked image data structure: byte offsets into the temp/algo data block
# that %esi points at in getpixelsbilin below.
# pixel environment temp storage: the four 16-bit neighbourhood samples
# (top-left, top-right, bottom-left, bottom-right) for each of the 4
# pixels processed in parallel.
TL1 = 0x00
TL2 = 0x02
TL3 = 0x04
TL4 = 0x06
TR1 = 0x08
TR2 = 0x0A
TR3 = 0x0C
TR4 = 0x0E
BL1 = 0x10
BL2 = 0x12
BL3 = 0x14
BL4 = 0x16
BR1 = 0x18
BR2 = 0x1A
BR3 = 0x1C
BR4 = 0x1E
# addresses of pixel blocks (one 32 bit pointer per pixel: top-left sample)
ADDRESS1 = 0x20
ADDRESS2 = 0x24
ADDRESS3 = 0x28
ADDRESS4 = 0x2C
# second env + address buffer (testing: not used)
SECONDBUFFER = 0x30
# 32bit splatted bitmap address (same pointer in both dwords of the vector)
V2PLANEADDRESS = 0x60
# 16bit splatted image constants (same value in all 4 words of each vector)
V4TWOWIDTHM1 = 0x68
V4TWOHEIGHTM1 = 0x70
V4LINEOFFSET = 0x78
# data struct size; the source image struct is appended directly after this
RESAMPLEDATASIZE = 0x80
# interpolation routine: bilinear fetch + blend of 4 pixels in parallel (MMX)
# input:  %mm0, %mm1  4 x 16bit unsigned top left relative virtual x and y coordinates
#         %esi        temp & algo data structure (offsets defined above)
# output: %mm0        4 interpolated signed 16 bit pixels
# clobbers: %eax, %ebx, %ecx, %edx, %edi, flags, and all mm registers
getpixelsbilin: psrlw $1, %mm0 # convert to range 0->0x7fff [0,0.5[
psrlw $1, %mm1
movq %mm0, %mm2
movq %mm1, %mm3
movq V4TWOWIDTHM1(%esi), %mm4 # 2 * (width - 1)
movq V4TWOHEIGHTM1(%esi), %mm5 # 2 * (height - 1)
pmulhw %mm5, %mm3 # mm3 == y coord (topleft relative), integer part
pmulhw %mm4, %mm2 # mm2 == x coord (topleft relative), integer part
pmullw %mm5, %mm1 # mm1 == y frac (unsigned)
pmullw %mm4, %mm0 # mm0 == x frac (unsigned)
movq %mm3, %mm5 # copy y coord
pmullw V4LINEOFFSET(%esi), %mm3 # low part of line offset (y * words per line)
pmulhw V4LINEOFFSET(%esi), %mm5 # high part of line offset
movq %mm2, %mm7 # copy x coord vector
pxor %mm4, %mm4
punpcklwd %mm4, %mm2 # zero-extend x: low part in %mm2
punpckhwd %mm4, %mm7 # high part in %mm7
movq %mm3, %mm6 # copy
punpcklwd %mm5, %mm3 # combine lo/hi products: low part in %mm3
punpckhwd %mm5, %mm6 # high part in %mm6
paddd %mm2, %mm3 # word offset = y * linewidth + x
paddd %mm7, %mm6
pslld $1, %mm3 # convert word offsets to byte addresses
pslld $1, %mm6
paddd V2PLANEADDRESS(%esi), %mm3 # add pixel plane base address
paddd V2PLANEADDRESS(%esi), %mm6
movq %mm3, ADDRESS1(%esi) # store the 4 top-left sample addresses
movq %mm6, ADDRESS3(%esi)
pcmpeqw %mm2, %mm2 # all ones
movq %mm0, %mm4 # copy x frac
movq %mm1, %mm5 # copy y frac
pxor %mm2, %mm4 # compute complement (approx negative: ~f == 1-f up to 1 lsb)
pxor %mm2, %mm5
psrlw $1, %mm0 # shift right: 0.5 * (frac x)
psrlw $1, %mm1 # shift right: 0.5 * (frac y)
psrlw $1, %mm4 # shift right: 0.5 * (1 - frac x)
psrlw $1, %mm5 # shift right: 0.5 * (1 - frac y)
movq %mm0, %mm2 # copy of frac x
movq %mm4, %mm3 # copy of (1-frac x)
# fetch data: gather the 2x2 neighbourhood of each of the 4 pixels into
# the temp struct, via the general purpose registers (no MMX gather).
#jmp skipfetch # seems the fetch is the real killer. try to optimize this
# using 32 bit accesses & shifts
# the src image data struct is padded to the cooked data struct:
# RESAMPLEDATASIZE(%esi) is the source image LINEOFFSET field.
movl RESAMPLEDATASIZE(%esi), %edi
shll $1, %edi # line offset in bytes (one line down)
movl ADDRESS1(%esi), %ecx
movl ADDRESS2(%esi), %edx
movw (%ecx), %ax
movw (%edx), %bx
movw %ax, TL1(%esi)
movw %bx, TL2(%esi)
movw 2(%ecx), %ax
movw 2(%edx), %bx
movw %ax, TR1(%esi)
movw %bx, TR2(%esi)
addl %edi, %ecx # move both pointers one line down
addl %edi, %edx
movw (%ecx), %ax
movw (%edx), %bx
movw %ax, BL1(%esi)
movw %bx, BL2(%esi)
movw 2(%ecx), %ax
movw 2(%edx), %bx
movw %ax, BR1(%esi)
movw %bx, BR2(%esi)
movl ADDRESS3(%esi), %ecx # same for pixels 3 and 4
movl ADDRESS4(%esi), %edx
movw (%ecx), %ax
movw (%edx), %bx
movw %ax, TL3(%esi)
movw %bx, TL4(%esi)
movw 2(%ecx), %ax
movw 2(%edx), %bx
movw %ax, TR3(%esi)
movw %bx, TR4(%esi)
addl %edi, %ecx
addl %edi, %edx
movw (%ecx), %ax
movw (%edx), %bx
movw %ax, BL3(%esi)
movw %bx, BL4(%esi)
movw 2(%ecx), %ax
movw 2(%edx), %bx
movw %ax, BR3(%esi)
movw %bx, BR4(%esi)
skipfetch:
pmulhw TL1(%esi), %mm4 # bilinear interpolation: blend columns first
pmulhw TR1(%esi), %mm0
pmulhw BL1(%esi), %mm3
pmulhw BR1(%esi), %mm2
paddw %mm4, %mm0 # mm0 == top row blended in x
paddw %mm3, %mm2 # mm2 == bottom row blended in x
pmulhw %mm5, %mm0 # blend rows: top * (1-frac y)
pmulhw %mm1, %mm2 # bottom * frac y
paddw %mm2, %mm0
psllw $2, %mm0 # compensate for gain reduction of the halved weights
ret
// linear mapping data struct (byte offsets; each field is a 64 bit vector)
ROWSTATEX = 0x0
ROWSTATEY = 0x8
COLSTATEX = 0x10
COLSTATEY = 0x18
ROWINCX = 0x20
ROWINCY = 0x28
COLINCX = 0x30
COLINCY = 0x38
// image data struct (byte offsets)
LINEOFFSET = 0x0
IMAGEADDRESS = 0x4
WIDTH = 0x8
HEIGHT = 0xC
IMAGEDATASIZE = 0x10
# pixel_resample_linmap_s16(void *x)
.globl pixel_resample_linmap_s16
# restored "@function": the list archive mangled "@" into " at "
.type pixel_resample_linmap_s16, @function
SOURCEIMAGE = RESAMPLEDATASIZE
DESTIMAGE = SOURCEIMAGE + IMAGEDATASIZE
LINMAPDATA = DESTIMAGE + IMAGEDATASIZE
# pixel_resample_linmap_s16(void *x)
# cdecl, i386.  %esi holds the combined data struct: cooked temp data,
# source image struct, dest image struct, then the linear-map state.
# Walks the destination image, generating source coordinates with a
# linear map (row increments inside a line, column increments per line)
# and filling each 4-pixel vector via getpixelsbilin.
#
# NOTE(review): the two label names read swapped w.r.t. their roles, but
# the control flow is correct: each pixel-quad iteration jumps back to
# linmap_looprow to reload the row state it just stored, and the per-line
# branch enters at linmap_loopcol with %mm0/%mm1 already holding the new
# line's start coordinates (the freshly incremented column state).
pixel_resample_linmap_s16:
pushl %ebp
movl %esp, %ebp
pushl %esi
pushl %edi
pushl %ebx
movl 8(%ebp), %esi # get data struct
movl DESTIMAGE+HEIGHT(%esi), %edx # image height = line counter
movl DESTIMAGE+IMAGEADDRESS(%esi), %edi # dest image address
movl DESTIMAGE+WIDTH(%esi), %ecx # image width
shrl $2, %ecx # vector count (4 pixels per vector)
.align 16
linmap_looprow:
movq LINMAPDATA+ROWSTATEX(%esi), %mm0 # get current coordinates (16.16 fixed point)
movq LINMAPDATA+ROWSTATEY(%esi), %mm1
linmap_loopcol:
movq %mm0, %mm4 # copy
movq %mm1, %mm5
paddd LINMAPDATA+ROWINCX(%esi), %mm4 # increment: coords of pixels 2,3
paddd LINMAPDATA+ROWINCY(%esi), %mm5
movq %mm4, %mm6 # copy
movq %mm5, %mm7
paddd LINMAPDATA+ROWINCX(%esi), %mm6 # increment: start of next quad
paddd LINMAPDATA+ROWINCY(%esi), %mm7
movq %mm6, LINMAPDATA+ROWSTATEX(%esi) # store next state (reloaded at linmap_looprow)
movq %mm7, LINMAPDATA+ROWSTATEY(%esi)
psrad $16, %mm0 # round to 16 bit
psrad $16, %mm1
psrad $16, %mm4
psrad $16, %mm5
packssdw %mm4, %mm0 # pack new coordinates
packssdw %mm5, %mm1
push %ecx # getpixelsbilin clobbers ecx/edx/edi
push %edx
push %edi
call getpixelsbilin # do interpolation
pop %edi
pop %edx
pop %ecx
movq %mm0, (%edi) # store 4 pixels
addl $0x8, %edi # point to next 4 pixels
decl %ecx # dec vector-in-line counter
jnz linmap_looprow
movq LINMAPDATA+COLSTATEX(%esi), %mm0 # get column state vector (start of line)
movq LINMAPDATA+COLSTATEY(%esi), %mm1
movl DESTIMAGE+WIDTH(%esi), %ecx # image width
shrl $2, %ecx # vector count, reset for next line
paddd LINMAPDATA+COLINCX(%esi), %mm0 # increment to next line's start coords
paddd LINMAPDATA+COLINCY(%esi), %mm1
movq %mm0, LINMAPDATA+COLSTATEX(%esi) # store
movq %mm1, LINMAPDATA+COLSTATEY(%esi)
decl %edx # dec line counter
jnz linmap_loopcol
emms
popl %ebx
popl %edi
popl %esi
leave
ret
--- NEW FILE: pixel_gain_s16.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp at zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
.globl pixel_gain_s16
# restored "@function": the list archive mangled "@" into " at "
.type pixel_gain_s16, @function
# gain is integer, shift count is down
# void pixel_gain_s16(int *buf, int nb_8pixel_vectors, short int gain[4], unsigned long long *shift)
# In-place gain: each 16 bit pixel is multiplied by its lane's gain word
# (full 32 bit product via pmulhw/pmullw), arithmetically shifted right
# by *shift, then packed back with signed saturation.
# NOTE(review): despite the name nb_8pixel_vectors, each loop iteration
# consumes one 8-byte vector == 4 s16 pixels — confirm against callers.
pixel_gain_s16:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
movl 20(%ebp), %edi
movq (%edi), %mm6 # get shift vector
movl 16(%ebp), %edi
movq (%edi), %mm7 # get gain vector
movl 8(%ebp), %esi # input array
movl 12(%ebp), %ecx # vector count
.align 16
.loop_gain:
movq (%esi), %mm0 # load 4 pixels from memory
movq %mm0, %mm1
pmulhw %mm7, %mm1 # apply gain (s15.0) fixed point, high word
pmullw %mm7, %mm0 # low word
movq %mm0, %mm2 # copy
movq %mm1, %mm3
punpcklwd %mm1, %mm0 # rebuild 32 bit products, lsw components
punpckhwd %mm3, %mm2 # msw components
psrad %mm6, %mm0 # apply signed shift
psrad %mm6, %mm2
packssdw %mm2, %mm0 # pack result & saturate to s16
movq %mm0, (%esi) # store result in place
addl $8, %esi # increment source pointer
decl %ecx
jnz .loop_gain # loop
emms
pop %edi
pop %esi
leave
ret
--- NEW FILE: pixel_cheby_s16.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp at zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
.globl pixel_cheby_s16_3plus
# restored "@function": the list archive mangled "@" into " at "
.type pixel_cheby_s16_3plus, @function
# void pixel_cheby_s16_3plus(int *buf, int nb_8pixel_vectors, int norder_plus_1, short int *coefs)
# coefs are s2.13 fixed point (-4->4)
# Evaluates a Chebyshev polynomial sum(a_n T_n(x)) per pixel, in place,
# using the recurrence T_n = 2 x T_(n-1) - T_(n-2).  All intermediate
# products are kept scaled down (pmulhw halves; /4 staging) and the final
# three paddsw double the accumulator to undo the 0.125 net factor.
# Requires norder_plus_1 >= 3 (the "_3plus" suffix): the inner loop body
# runs at least once before the %edx end-pointer check.
pixel_cheby_s16_3plus:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
push %edx
movl 8(%ebp), %esi # input array
movl 12(%ebp), %ecx # vector count
movl 16(%ebp), %eax # get order+1
shll $3, %eax # 8 bytes per coef vector
movl 20(%ebp), %edx
addl %eax, %edx # edx = coef end address
# jmp skip
.align 16
.loop_cheby:
movl 20(%ebp), %edi # get coefs
movq (%esi), %mm0 # load 4 pixels from memory (mm0 = x)
pcmpeqw %mm2, %mm2
movq %mm0, %mm1 # mm1 (T_n-1) <- x
psrlw $1, %mm2 # mm2 (T_n-2) <- 1 (0x7fff)
movq (%edi), %mm4 # mm4 (acc) == a0
psraw $1, %mm4 # mm4 == a0/2
movq %mm0, %mm5 # mm5 (intermediate)
pmulhw 8(%edi), %mm5 # mm5 == (x * a1)/2
paddsw %mm5, %mm4 # acc = (a0 + a1 x)/2
addl $16, %edi # first two coefs consumed
.loop_cheby_inner:
movq %mm1, %mm3 # mm3 == T_n-1
psraw $2, %mm2 # mm2 == T_n-2 / 4
pmulhw %mm0, %mm3 # mm3 == (2 x T_n-1) / 4
psubsw %mm2, %mm3 # mm3 == (2 x T_n-1 - T_n-2) / 4
paddsw %mm3, %mm3
paddsw %mm3, %mm3 # mm3 == T_n (scale restored)
movq %mm1, %mm2 # mm2 == new T_n-2
movq %mm3, %mm1 # mm1 == new T_n-1
pmulhw (%edi), %mm3 # mm3 = a_n * T_n / 2
paddsw %mm3, %mm4 # accumulate
addl $8, %edi
cmpl %edx, %edi # run until coef end address
jne .loop_cheby_inner
paddsw %mm4, %mm4 # compensate for 0.125 factor
paddsw %mm4, %mm4
paddsw %mm4, %mm4
movq %mm4, (%esi) # store result in memory
addl $8, %esi # increment source/dest pointer
decl %ecx
jnz .loop_cheby # loop
skip:
emms
pop %edx
pop %edi
pop %esi
leave
ret
--- NEW FILE: pixel_cascade_s16.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp at zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# TODO: COUPLED CASCADE SECOND ORDER SECTION
#
# s1[k] = ar * s1[k-1] + ai * s2[k-1] + x[k]
# s2[k] = ar * s2[k-1] - ai * s1[k-1]
# y[k] = c0 * x[k] + c1 * s1[k-1] + c2 * s2[k-1]
# MACRO: coupled
#
# computes one coupled (normal form) second order section
#
# input: %mm0 == input x[k]
# %mm1 == state 1
# %mm2 == state 2
# (%esi) == section coefs (ar ai c0 c1 c2) in s0.15
# output: %mm0 == output y[k]
# %mm1 == state 1
# %mm2 == state 2
# clobbers: %mm3-%mm6
# (fixed: the original used the nonexistent mnemonic "pmovq" for the
# four register copies; the MMX move is "movq")
.macro coupled
movq %mm1, %mm3 # mm3 == s1[k-1]
movq %mm1, %mm4 # mm4 == s1[k-1]
movq %mm2, %mm5 # mm5 == s2[k-1]
movq %mm2, %mm6 # mm6 == s2[k-1]
pmulhw (%esi), %mm1 # mm1 == s1[k-1] * ar
pmulhw 8(%esi), %mm3 # mm3 == s1[k-1] * ai
pmulhw 24(%esi), %mm4 # mm4 == s1[k-1] * c1
pmulhw (%esi), %mm2 # mm2 == s2[k-1] * ar
pmulhw 8(%esi), %mm5 # mm5 == s2[k-1] * ai
pmulhw 32(%esi), %mm6 # mm6 == s2[k-1] * c2
paddw %mm5, %mm1 # mm1 == s1[k-1] * ar + s2[k-1] * ai
psubw %mm3, %mm2 # mm2 == s2[k-1] * ar - s1[k-1] * ai == s2[k]
paddw %mm0, %mm1 # mm1 == s1[k]
pmulhw 16(%esi), %mm0 # mm0 == x[k] * c0
paddw %mm6, %mm4 # mm4 == s1[k-1] * c1 + s2[k-1] * c2
paddw %mm4, %mm0 # mm0 == y[k]
.endm
# in order to use the 4 line parallel cascade routine on horizontal
# lines, we need to reorder (rotate or transpose) the matrix, since
# images are scanline encoded, and we want to work in parallell
# on 4 lines.
#
# since the 4 lines are independent, it doesn't matter in which order
# the vector elements are present.
#
# this allows us to use the same routine for left->right and right->left
# processing.
#
# some comments on the non-abelean group of square isometries consisting of
# (I) identity
# (H) horizontal axis mirror
# (V) vertical axis mirror
# (T) transpose (diagonal axis mirror)
# (A) antitranspose (antidiagonal axis mirror)
# (R1) 90deg anticlockwize rotation
# (R2) 180deg rotation
# (R3) 90deg clockwize rotation
#
#
# we basicly have two options: (R1,R3) or (T,A)
# we opt for T and A because they are self inverting, which improves locality
#
# use antitranspose for right to left an transpose
# for left to right (little endian)
# antitranspose 4x4: mirror a 4x4 block of 16 bit words along the antidiagonal
# input
# %mm3 == {d0 d1 d2 d3}
# %mm2 == {c0 c1 c2 c3}
# %mm1 == {b0 b1 b2 b3}
# %mm0 == {a0 a1 a2 a3}
# output
# %mm3 == {a3 b3 c3 d3}
# %mm2 == {a2 b2 c2 d2}
# %mm1 == {a1 b1 c1 d1}
# %mm0 == {a0 b0 c0 d0}
# clobbers: %mm4-%mm7
# (fixed: removed the stray ':' after the macro name, which gas rejects)
.macro antitranspose_4x4
movq %mm3, %mm4
punpcklwd %mm1, %mm4 # mm4 <- {b2 d2 b3 d3}
movq %mm3, %mm5
punpckhwd %mm1, %mm5 # mm5 <- {b0 d0 b1 d1}
movq %mm2, %mm6
punpcklwd %mm0, %mm6 # mm6 <- {a2 c2 a3 c3}
movq %mm2, %mm7
punpckhwd %mm0, %mm7 # mm7 <- {a0 c0 a1 c1}
movq %mm4, %mm3
punpcklwd %mm6, %mm3 # mm3 <- {a3 b3 c3 d3}
movq %mm4, %mm2
punpckhwd %mm6, %mm2 # mm2 <- {a2 b2 c2 d2}
movq %mm5, %mm1
punpcklwd %mm7, %mm1 # mm1 <- {a1 b1 c1 d1}
movq %mm5, %mm0
punpckhwd %mm7, %mm0 # mm0 <- {a0 b0 c0 d0}
.endm
# transpose 4x4: mirror a 4x4 block of 16 bit words along the diagonal
# input
# %mm3 == {d3 d2 d1 d0}
# %mm2 == {c3 c2 c1 c0}
# %mm1 == {b3 b2 b1 b0}
# %mm0 == {a3 a2 a1 a0}
# output
# %mm3 == {d3 c3 b3 a3}
# %mm2 == {d2 c2 b2 a2}
# %mm1 == {d1 c1 b1 a1}
# %mm0 == {d0 c0 b0 a0}
# clobbers: %mm4-%mm7
# (fixed: removed the stray ':' after the macro name, which gas rejects)
.macro transpose_4x4
movq %mm0, %mm4
punpcklwd %mm2, %mm4 # mm4 <- {c1 a1 c0 a0}
movq %mm0, %mm5
punpckhwd %mm2, %mm5 # mm5 <- {c3 a3 c2 a2}
movq %mm1, %mm6
punpcklwd %mm3, %mm6 # mm6 <- {d1 b1 d0 b0}
movq %mm1, %mm7
punpckhwd %mm3, %mm7 # mm7 <- {d3 b3 d2 b2}
movq %mm4, %mm0
punpcklwd %mm6, %mm0 # mm0 <- {d0 c0 b0 a0}
movq %mm4, %mm1
punpckhwd %mm6, %mm1 # mm1 <- {d1 c1 b1 a1}
movq %mm5, %mm2
punpcklwd %mm7, %mm2 # mm2 <- {d2 c2 b2 a2}
movq %mm5, %mm3
punpckhwd %mm7, %mm3 # mm3 <- {d3 c3 b3 a3}
.endm
.globl pixel_cascade_vertb_s16
# restored "@function": the list archive mangled "@" into " at "
.type pixel_cascade_vertb_s16, @function
# pixel_cascade_vertb_s16(char *pixel_array, int nb_4x4_blocks, int linewidth, short int coef[20], short int state[8])
# Runs the 'coupled' section top->bottom down 4 columns in parallel,
# 4 lines per loop iteration, carrying the filter state in %mm1/%mm2
# and writing results back in place.
pixel_cascade_vertb_s16:
pushl %ebp
movl %esp, %ebp
push %ebx
push %esi
push %edi
movl 8(%ebp), %ebx # pixel array offset
movl 12(%ebp), %ecx # nb of 4x4 pixblocks
movl 16(%ebp), %edx # line width (pixels)
movl 20(%ebp), %esi # coefs
movl 24(%ebp), %edi # state
shll $1, %edx # short int addressing: line stride in bytes
subl %edx, %ebx # start one line up: loop pre-increments
movq 0(%edi), %mm1 # s1[k-1]
movq 8(%edi), %mm2 # s2[k-1]
.align 16
.cascade_vertb_line_loop:
movq (%ebx,%edx,1), %mm3 # load next 4 pixels
movq %mm3, %mm0
addl %edx, %ebx
coupled
movq %mm0, (%ebx) # store filtered pixels in place
movq (%ebx,%edx,1), %mm3
movq %mm3, %mm0
addl %edx, %ebx
coupled
movq %mm0, (%ebx)
movq (%ebx,%edx,1), %mm3
movq %mm3, %mm0
addl %edx, %ebx
coupled
movq %mm0, (%ebx)
movq (%ebx,%edx,1), %mm3
movq %mm3, %mm0
addl %edx, %ebx
coupled
movq %mm0, (%ebx)
decl %ecx
jnz .cascade_vertb_line_loop
movq %mm1, 0(%edi) # save s1[k-1] for the next call
movq %mm2, 8(%edi) # save s2[k-1]
emms
pop %edi
pop %esi
pop %ebx
leave
ret
.globl pixel_cascade_horlr_s16
# restored "@function": the list archive mangled "@" into " at "
.type pixel_cascade_horlr_s16, @function
# pixel_cascade_horlr_s16(char *pixel_array, int nb_4x4_blocks, int linewidth, short int coef[20], short int state[8])
# Left->right variant: transposes each 4x4 block in registers, runs the
# coupled section 4 times, transposes back and stores.
# NOTE(review): the two state loads at the top of the loop body are
# immediately overwritten by the pixel loads that follow, so the state
# read from (%edi) never reaches 'coupled' — looks like a bug in this
# untested routine; left as-is pending confirmation against pdp upstream.
pixel_cascade_horlr_s16:
pushl %ebp
movl %esp, %ebp
push %ebx
push %esi
push %edi
movl 8(%ebp), %ebx # pixel array offset
movl 12(%ebp), %ecx # nb of 4x4 pixblocks
movl 16(%ebp), %edx # line width (pixels)
movl 20(%ebp), %esi # coefs
movl 24(%ebp), %edi # state
shll $1, %edx # short int addressing: line stride in bytes
movl %edx, %eax
shll $1, %eax
addl %edx, %eax # eax = 3 * edx
.align 16
.cascade_horlr_line_loop:
movq (%edi), %mm1 # load state (see NOTE above: dead loads)
movq 8(%edi), %mm2
movq (%ebx), %mm0 # load 4x4 block
movq (%ebx,%edx,1), %mm1
movq (%ebx,%edx,2), %mm2
movq (%ebx,%eax,1), %mm3
transpose_4x4
movq %mm1, (%ebx,%edx,1) # spill rows 1-3; row 0 is consumed right away
movq %mm2, (%ebx,%edx,2)
movq %mm3, (%ebx,%eax,1)
coupled
movq %mm0, (%ebx)
movq (%ebx,%edx,1), %mm3
movq %mm3, %mm0
coupled
movq %mm0, (%ebx, %edx,1)
movq (%ebx,%edx,2), %mm3
movq %mm3, %mm0
coupled
movq %mm0, (%ebx, %edx,2)
movq (%ebx,%eax,1), %mm3
movq %mm3, %mm0
coupled
movq %mm1, 0(%edi) # save s1[k-1]
movq %mm2, 8(%edi) # save s2[k-1]
movq %mm0, %mm3 # last output joins the stored rows
movq (%ebx), %mm0
movq (%ebx,%edx,1), %mm1
movq (%ebx,%edx,2), %mm2
transpose_4x4
movq %mm0, (%ebx) # store block back in scanline order
movq %mm1, (%ebx,%edx,1)
movq %mm2, (%ebx,%edx,2)
movq %mm3, (%ebx,%eax,1)
addl $8, %ebx # next 4x4 block to the right
decl %ecx
jnz .cascade_horlr_line_loop
emms
pop %edi
pop %esi
pop %ebx
leave
ret
--- NEW FILE: pixel_rand_s16.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp at zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
.globl pixel_rand_s16
# restored "@function": the list archive mangled "@" into " at "
.type pixel_rand_s16, @function
# mmx 16 bit pseudo random noise generator
# void pixel_rand_s16(int *dst, int nb_4pixel_vectors, short int random_seed[4])
# Fills dst with noise from 4 independent 16-bit Fibonacci LFSRs (one per
# word lane; feedback xor of bits 15, 14, 12 and 3), updating the seeds
# in place so successive calls continue the sequence.
pixel_rand_s16:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
movl 16(%ebp), %esi # int16[4] array of random seeds
movl 8(%ebp), %edi # dst array
movl 12(%ebp), %ecx # vector count
movq (%esi), %mm6 # mm6 == current seed vector
pcmpeqw %mm3, %mm3
psrlw $15, %mm3 # get bit mask 4 times 0x0001
.align 16
.loop_rand:
# prefetch 128(%esi)
movq %mm6, %mm4 # get random vector
psrlw $15, %mm4 # get first tap (bit 15)
movq %mm6, %mm5
psrlw $14, %mm5 # get second tap (bit 14)
pxor %mm5, %mm4
movq %mm6, %mm5
psrlw $12, %mm5 # get third tap (bit 12)
pxor %mm5, %mm4
movq %mm6, %mm5
psrlw $3, %mm5 # get fourth tap (bit 3)
pxor %mm5, %mm4
psllw $1, %mm6 # shift left original random vector
pand %mm3, %mm4 # isolate new feedback bit
por %mm4, %mm6 # combine into new random vector
movq %mm6, (%edi)
addl $8, %edi
decl %ecx
jnz .loop_rand # loop
movq %mm6, (%esi) # store random seeds for the next call
emms
pop %edi
pop %esi
leave
ret
--- NEW FILE: pixel_conv_ver_s16.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp at zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
#TODO: fix out of bound access in conv_ver and conv_hor
# intermediate function: one vertical 3-tap convolution step
# input in register:
# %mm0: top 4 pixels
# %mm1: middle 4 pixels
# %mm2: bottom 4 pixels
# %mm5: top 4 pixel mask (filter coef, s0.15)
# %mm6: middle 4 pixel mask (filter coef, s0.15)
# %mm7: bottom 4 pixel mask (filter coef, s0.15)
# return in register:
# %mm0: middle 4 pixels result
# clobbers %mm3, %mm4; %mm1/%mm2 are preserved for the caller's shift-down
.conv_ver_4_pixels:
.align 16
# compute quadruplet: result = c_top*top + c_mid*mid + c_bot*bottom
# each pmulhw yields (pixel*coef)/2 for s0.15 coefs; psllw $1 restores scale
# get top pixel
pmulhw %mm5, %mm0
psllw $1, %mm0
# get middle pixel
movq %mm1, %mm4
pmulhw %mm6, %mm4
psllw $1, %mm4
paddsw %mm4, %mm0
# get bottom pixel
movq %mm2, %mm3
pmulhw %mm7, %mm3
psllw $1, %mm3 # restore scale after pmulhw
paddsw %mm3, %mm0
ret
.globl pixel_conv_ver_s16
# restored "@function": the list archive mangled "@" into " at "
.type pixel_conv_ver_s16, @function
# pixel_conv_ver_s16(short int *pixel_array, int nb_4_pixel_vectors, int row_byte_size, short int border[4])
# vertical 3-tap pixel convolution, in place, sliding a 3-line window
# (%mm0/%mm1/%mm2) down one column of 4-pixel vectors.  NOT TESTED.
# NOTE(review): the code also reads a 5th argument at 24(%ebp) (the 3
# coef vectors); the commented prototype above lists only 4 — confirm
# the real signature against the C callers.
pixel_conv_ver_s16:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
movl 8(%ebp), %esi # pixel array offset
movl 12(%ebp), %ecx # nb of 4 pixel vectors in a column (at least 2)
movl 16(%ebp), %edx # rowsize in bytes
movl 24(%ebp), %edi # mask (coef) vectors
movq (%edi), %mm5
movq 8(%edi), %mm6
movq 16(%edi), %mm7
movl 20(%ebp), %edi # edge vector
shll $1, %edx
decl %ecx # loop has a terminator stub
decl %ecx # loop has another terminator stub
movq (%edi), %mm0 # init regs (top edge: border value above first line)
movq (%esi), %mm1
movq (%esi,%edx,1), %mm2
jmp .conv_line_loop
.align 16
.conv_line_loop:
call .conv_ver_4_pixels # compute conv
movq %mm0, (%esi) # store result
movq %mm1, %mm0 # shift window down: mm0 <- prev (%esi)
movq %mm2, %mm1 # mm1 <- (%esi,%edx,1)
movq (%esi,%edx,2), %mm2 # mm2 <- (%esi,%edx,2)
addl %edx, %esi # increase pointer
decl %ecx
jnz .conv_line_loop
call .conv_ver_4_pixels # compute conv for second-to-last vector
movq %mm0, (%esi) # store result
movq %mm1, %mm0 # mm0 <- prev (%esi)
movq %mm2, %mm1 # mm1 <- (%esi,%edx,1)
movq (%edi), %mm2 # bottom edge: substitute border vector
addl %edx, %esi # increase pointer
call .conv_ver_4_pixels # compute last vector
movq %mm0, (%esi) # store it
emms
pop %edi
pop %esi
leave
ret
--- NEW FILE: pixel_biquad_dirI_s16.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp at zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# TODO MOVE TO DIRECT FORM II
# y[k] = b0 * x[k] + u1[k-1]
# u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
# u2[k] = b2 * x[k] - a2 * y[k]
# NOTE(review): the equations above describe transposed direct form II,
# but the code below computes plain direct form II:
#   u[k] = x[k] + a1 u[k-1] + a2 u[k-2];  y[k] = b0 u[k] + b1 u[k-1] + b2 u[k-2]
# with a prescale of x by the gain word at -8(%esi) — confirm coef layout.
# input in register:
# %mm0-mm3: input 4x4 pixels {x0 x1 x2 x3} (4 columns in parallel)
# %esi: coef memory (a1, a2, b0, b1, b2), s0.15; all products are value/2
# %edi: state memory (u1, u2)
# return in register:
# %mm0-mm3: 4x4 pixels result
# clobbers %mm4-%mm7; updates the state at (%edi)
.biquad_4x4_pixels:
.align 16
# prescale input by the gain vector stored just below the coefs
movq -8(%esi), %mm4
pmulhw %mm4, %mm0
pmulhw %mm4, %mm1
pmulhw %mm4, %mm2
pmulhw %mm4, %mm3
psllw $1, %mm0
psllw $1, %mm1
psllw $1, %mm2
psllw $1, %mm3
# first vector
movq 0(%edi), %mm4 # mm4 <- u[-1]
movq 8(%edi), %mm5 # mm5 <- u[-2]
movq %mm4, %mm6
movq %mm5, %mm7
pmulhw 0(%esi), %mm6 # multiply by a1
pmulhw 8(%esi), %mm7 # multiply by a2
paddsw %mm6, %mm0 # accumulate
paddsw %mm7, %mm0 # accumulate
paddsw %mm0, %mm0 # scale by 2 (since all fixed point muls are x*y/2)
movq %mm0, %mm6 # mm6 <- u[0]
movq %mm4, %mm7 # mm7 <- u[-1]
pmulhw 16(%esi), %mm0 # multiply by b0
pmulhw 24(%esi), %mm4 # multiply by b1
pmulhw 32(%esi), %mm5 # multiply by b2
paddsw %mm4, %mm0 # accumulate
paddsw %mm5, %mm0 # accumulate
# mm0 is result 0
# second vector (same pattern, state now in mm6/mm7)
movq %mm6, %mm4 # mm4 <- u[0]
movq %mm7, %mm5 # mm5 <- u[-1]
pmulhw 0(%esi), %mm6 # multiply by a1
pmulhw 8(%esi), %mm7 # multiply by a2
paddsw %mm6, %mm1 # accumulate
paddsw %mm7, %mm1 # accumulate
paddsw %mm1, %mm1 # scale by 2
movq %mm1, %mm6 # mm6 <- u[1]
movq %mm4, %mm7 # mm7 <- u[0]
pmulhw 16(%esi), %mm1 # multiply by b0
pmulhw 24(%esi), %mm4 # multiply by b1
pmulhw 32(%esi), %mm5 # multiply by b2
paddsw %mm4, %mm1 # accumulate
paddsw %mm5, %mm1 # accumulate
# mm1 is result 1
# third vector
movq %mm6, %mm4 # mm4 <- u[1]
movq %mm7, %mm5 # mm5 <- u[0]
pmulhw 0(%esi), %mm6 # multiply by a1
pmulhw 8(%esi), %mm7 # multiply by a2
paddsw %mm6, %mm2 # accumulate
paddsw %mm7, %mm2 # accumulate
paddsw %mm2, %mm2 # scale by 2
movq %mm2, %mm6 # mm6 <- u[2]
movq %mm4, %mm7 # mm7 <- u[1]
pmulhw 16(%esi), %mm2 # multiply by b0
pmulhw 24(%esi), %mm4 # multiply by b1
pmulhw 32(%esi), %mm5 # multiply by b2
paddsw %mm4, %mm2 # accumulate
paddsw %mm5, %mm2 # accumulate
# mm2 is result 2
# fourth vector: state is written back for the next 4x4 block
movq %mm6, %mm4 # mm4 <- u[2]
movq %mm7, %mm5 # mm5 <- u[1]
pmulhw 0(%esi), %mm6 # multiply by a1
pmulhw 8(%esi), %mm7 # multiply by a2
paddsw %mm6, %mm3 # accumulate
paddsw %mm7, %mm3 # accumulate
paddsw %mm3, %mm3 # scale by 2
movq %mm3, 0(%edi) # store u[3]
movq %mm4, 8(%edi) # store u[2]
pmulhw 16(%esi), %mm3 # multiply by b0
pmulhw 24(%esi), %mm4 # multiply by b1
pmulhw 32(%esi), %mm5 # multiply by b2
paddsw %mm4, %mm3 # accumulate
paddsw %mm5, %mm3 # accumulate
# mm3 is result 3
ret
# in order to use the 4 line parallel biquad routine on horizontal
# lines, we need to reorder (rotate or transpose) the matrix, since
# images are scanline encoded, and we want to work in parallell
# on 4 lines.
#
# since the 4 lines are independent, it doesn't matter in which order
# the vector elements are present.
#
# this allows us to use the same routine for left->right and right->left
# processing.
#
# some comments on the non-abelean group of square isometries consisting of
# (I) identity
# (H) horizontal axis mirror
# (V) vertical axis mirror
# (T) transpose (diagonal axis mirror)
# (A) antitranspose (antidiagonal axis mirror)
# (R1) 90deg anticlockwize rotation
# (R2) 180deg rotation
# (R3) 90deg clockwize rotation
#
#
# we basicly have two options: (R1,R3) or (T,A)
# we opt for T and A because they are self inverting, which improves locality
#
# use antitranspose for right to left an transpose
# for left to right (little endian)
# antitranspose 4x4 (subroutine form): mirror a 4x4 block of 16 bit words
# along the antidiagonal.
# input
# %mm3 == {d0 d1 d2 d3}
# %mm2 == {c0 c1 c2 c3}
# %mm1 == {b0 b1 b2 b3}
# %mm0 == {a0 a1 a2 a3}
# output
# %mm3 == {a3 b3 c3 d3}
# %mm2 == {a2 b2 c2 d2}
# %mm1 == {a1 b1 c1 d1}
# %mm0 == {a0 b0 c0 d0}
# clobbers %mm4-%mm7
.antitranspose_4x4:
.align 16
movq %mm3, %mm4
punpcklwd %mm1, %mm4 # mm4 <- {b2 d2 b3 d3}
movq %mm3, %mm5
punpckhwd %mm1, %mm5 # mm5 <- {b0 d0 b1 d1}
movq %mm2, %mm6
punpcklwd %mm0, %mm6 # mm6 <- {a2 c2 a3 c3}
movq %mm2, %mm7
punpckhwd %mm0, %mm7 # mm7 <- {a0 c0 a1 c1}
movq %mm4, %mm3
punpcklwd %mm6, %mm3 # mm3 <- {a3 b3 c3 d3}
movq %mm4, %mm2
punpckhwd %mm6, %mm2 # mm2 <- {a2 b2 c2 d2}
movq %mm5, %mm1
punpcklwd %mm7, %mm1 # mm1 <- {a1 b1 c1 d1}
movq %mm5, %mm0
punpckhwd %mm7, %mm0 # mm0 <- {a0 b0 c0 d0}
ret
# transpose 4x4 (subroutine form): mirror a 4x4 block of 16 bit words
# along the diagonal.
# input
# %mm3 == {d3 d2 d1 d0}
# %mm2 == {c3 c2 c1 c0}
# %mm1 == {b3 b2 b1 b0}
# %mm0 == {a3 a2 a1 a0}
# output
# %mm3 == {d3 c3 b3 a3}
# %mm2 == {d2 c2 b2 a2}
# %mm1 == {d1 c1 b1 a1}
# %mm0 == {d0 c0 b0 a0}
# clobbers %mm4-%mm7
.transpose_4x4:
.align 16
movq %mm0, %mm4
punpcklwd %mm2, %mm4 # mm4 <- {c1 a1 c0 a0}
movq %mm0, %mm5
punpckhwd %mm2, %mm5 # mm5 <- {c3 a3 c2 a2}
movq %mm1, %mm6
punpcklwd %mm3, %mm6 # mm6 <- {d1 b1 d0 b0}
movq %mm1, %mm7
punpckhwd %mm3, %mm7 # mm7 <- {d3 b3 d2 b2}
movq %mm4, %mm0
punpcklwd %mm6, %mm0 # mm0 <- {d0 c0 b0 a0}
movq %mm4, %mm1
punpckhwd %mm6, %mm1 # mm1 <- {d1 c1 b1 a1}
movq %mm5, %mm2
punpcklwd %mm7, %mm2 # mm2 <- {d2 c2 b2 a2}
movq %mm5, %mm3
punpckhwd %mm7, %mm3 # mm3 <- {d3 c3 b3 a3}
ret
.globl pixel_biquad_vertb_s16
# restored "@function": the list archive mangled "@" into " at "
.type pixel_biquad_vertb_s16, @function
# pixel_biquad_vertb_s16(char *pixel_array, int nb_4x4_blocks, int linewidth, short int coef[20], short int state[8])
# Runs the biquad top->bottom over 4 columns in parallel, one 4x4 block
# (4 scanlines) per call to .biquad_4x4_pixels, in place.
pixel_biquad_vertb_s16:
pushl %ebp
movl %esp, %ebp
push %ebx
push %esi
push %edi
movl 8(%ebp), %ebx # pixel array offset
movl 12(%ebp), %ecx # nb of 4x4 pixblocks
movl 16(%ebp), %edx # line width (pixels)
movl 20(%ebp), %esi # coefs
movl 24(%ebp), %edi # state
shll $1, %edx # short int addressing: line stride in bytes
movl %edx, %eax
shll $1, %eax
addl %edx, %eax # eax = 3 * edx
.align 16
.biquad_vertb_line_loop:
movq (%ebx), %mm0 # load 4 lines
movq (%ebx,%edx,1), %mm1
movq (%ebx,%edx,2), %mm2
movq (%ebx,%eax,1), %mm3
call .biquad_4x4_pixels
movq %mm0, (%ebx) # store 4 filtered lines
movq %mm1, (%ebx,%edx,1)
movq %mm2, (%ebx,%edx,2)
movq %mm3, (%ebx,%eax,1)
addl %edx, %ebx # advance 4 lines down
addl %eax, %ebx
decl %ecx
jnz .biquad_vertb_line_loop
emms
pop %edi
pop %esi
pop %ebx
leave
ret
.globl pixel_biquad_horlr_s16
# restored "@function": the list archive mangled "@" into " at "
.type pixel_biquad_horlr_s16, @function
# pixel_biquad_horlr_s16(char *pixel_array, int nb_4x4_blocks, int linewidth, short int coef[20], short int state[8])
# Left->right horizontal variant: transposes each 4x4 block in registers,
# filters it with the vertical routine's kernel, transposes back.
pixel_biquad_horlr_s16:
pushl %ebp
movl %esp, %ebp
push %ebx
push %esi
push %edi
movl 8(%ebp), %ebx # pixel array offset
movl 12(%ebp), %ecx # nb of 4x4 pixblocks
movl 16(%ebp), %edx # line width (pixels)
movl 20(%ebp), %esi # coefs
movl 24(%ebp), %edi # state
shll $1, %edx # short int addressing: line stride in bytes
movl %edx, %eax
shll $1, %eax
addl %edx, %eax # eax = 3 * edx
.align 16
.biquad_horlr_line_loop:
movq (%ebx), %mm0 # load 4x4 block
movq (%ebx,%edx,1), %mm1
movq (%ebx,%edx,2), %mm2
movq (%ebx,%eax,1), %mm3
call .transpose_4x4 # rows <-> columns
call .biquad_4x4_pixels
call .transpose_4x4 # back to scanline order
movq %mm0, (%ebx)
movq %mm1, (%ebx,%edx,1)
movq %mm2, (%ebx,%edx,2)
movq %mm3, (%ebx,%eax,1)
addl $8, %ebx # next 4x4 block to the right
decl %ecx
jnz .biquad_horlr_line_loop
emms
pop %edi
pop %esi
pop %ebx
leave
ret
--- NEW FILE: Makefile ---
include ../../Makefile.config

# object files built from the mmx assembly routines
# (recipe lines below start with a hard TAB, as make requires;
# the mailing-list archive had stripped the indentation)
OBJ = \
	pixel_pack_s16u8.o \
	pixel_unpack_u8s16.o \
	pixel_add_s16.o \
	pixel_mul_s16.o \
	pixel_mix_s16.o \
	pixel_randmix_s16.o \
	pixel_conv_hor_s16.o \
	pixel_conv_ver_s16.o \
	pixel_biquad_s16.o \
	pixel_ca_s1.o \
	pixel_rand_s16.o \
	pixel_crot_s16.o \
	pixel_gain_s16.o \
	pixel_resample_s16.o \
	pixel_cheby_s16.o

all: $(OBJ)

test: pdp_mmx_test.o $(OBJ)
	gcc -o pdp_mmx_test pdp_mmx_test.o $(OBJ) -g

clean:
	rm -f *.o
	rm -f *~
	rm -f pdp_mmx.a
	rm -f pdp_mmx_test
--- NEW FILE: pixel_add_s16.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp at zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
.globl pixel_add_s16
.type pixel_add_s16, at function
# saturated add of two s16 pixel arrays, accumulated in place into left
# void pixel_add_s16(int *left, int *right, int nb_4pixel_vectors)
# cdecl: %esi/%edi saved; clobbers %ecx, mm0-mm1 (emms at exit).
pixel_add_s16:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
movl 8(%ebp), %edi # left array (also destination)
movl 12(%ebp), %esi # right array
movl 16(%ebp), %ecx # number of 4-pixel (64 bit) vectors
.align 16
.loop_mix:
# prefetch 128(%esi)
movq (%esi), %mm1 # load right 4 pixels from memory
movq (%edi), %mm0 # load 4 left pixels from memory
paddsw %mm1, %mm0 # add with signed saturation
movq %mm0, (%edi) # store result into left array
addl $8, %esi
addl $8, %edi
decl %ecx
jnz .loop_mix # loop
emms
pop %edi
pop %esi
leave
ret
--- NEW FILE: pixel_randmix_s16.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp at zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
.globl pixel_randmix_s16
.type pixel_randmix_s16, at function
# random per-pixel mix: each output pixel is selected from the left or
# right array depending on a pseudo-random word compared to a threshold;
# the random vector is advanced with an LFSR-style feedback shift and the
# updated seeds are written back, so the generator persists across calls.
# void pixel_randmix_s16(int *left, int *right, int nb_4pixel_vectors, short int random_seed[4], short int threshold[4])
pixel_randmix_s16:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
movl 20(%ebp), %edi # int16[4] array of random seeds
movq (%edi), %mm6
movl 24(%ebp), %edi # int16[4] array of thresholds
movq (%edi), %mm7
movl 8(%ebp), %edi # left array (also destination)
movl 12(%ebp), %esi # right array
movl 16(%ebp), %ecx # pixel count
pcmpeqw %mm3, %mm3
psrlw $15, %mm3 # get bit mask 4 times 0x0001
.align 16
.loop_randmix:
# prefetch 128(%esi)
movq (%esi), %mm1 # load right 4 pixels from memory
movq (%edi), %mm0 # load 4 left pixels from memory
movq %mm6, %mm2 # get random vector
pcmpgtw %mm7, %mm2 # compare random vector with threshold
movq %mm2, %mm5
pand %mm0, %mm2 # take left pixels where random > threshold
pandn %mm1, %mm5 # take right pixels elsewhere
por %mm2, %mm5 # merge the two selections
movq %mm5, (%edi) # store pixels
movq %mm6, %mm4 # get random vector
psrlw $15, %mm4 # feedback tap at bit 15
movq %mm6, %mm5
psrlw $14, %mm5 # feedback tap at bit 14
pxor %mm5, %mm4
movq %mm6, %mm5
psrlw $12, %mm5 # feedback tap at bit 12
pxor %mm5, %mm4
movq %mm6, %mm5
psrlw $3, %mm5 # feedback tap at bit 3
pxor %mm5, %mm4
psllw $1, %mm6 # shift left original random vector
pand %mm3, %mm4 # isolate new feedback bit
por %mm4, %mm6 # combine into new random vector
addl $8, %esi
addl $8, %edi
decl %ecx
jnz .loop_randmix # loop
movl 20(%ebp), %edi # int16[4] array of random seeds
movq %mm6, (%edi) # store random seeds (persist rng state)
emms
pop %edi
pop %esi
leave
ret
--- NEW FILE: pixel_unpack_u8s16.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp at zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
.globl pixel_unpack_u8s16_y
.type pixel_unpack_u8s16_y, at function
# unpack unsigned 8 bit luma pixels to signed 16 bit:
# each u8 lands in the high byte, then one right shift clears the sign bit
# (gain code kept disabled below)
# void pixel_unpack_u8s16_y(char *input, char *output, int32 nb_pixels_div8)
pixel_unpack_u8s16_y:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
# movl 20(%ebp), %edi # int16[4] array of gains
# movq (%edi), %mm7 # get gain array
movl 8(%ebp), %esi # input uint8 pixel array
movl 12(%ebp), %edi # output sint16 pixel array
movl 16(%ebp), %ecx # nb of elements div 8
.align 16
.loop_unpack_y:
movq (%esi), %mm5 # load 8 pixels from memory
pxor %mm0, %mm0 # zero mm0 - mm1
pxor %mm1, %mm1
punpcklbw %mm5, %mm0 # unpack 1st 4 pixels into high bytes
punpckhbw %mm5, %mm1 # unpack 2nd 4 pixels
psrlw $0x1, %mm0 # shift right to clear sign bit (9.7 fixed point)
psrlw $0x1, %mm1
# pmulhw %mm7, %mm0 # apply gain
# pmulhw %mm7, %mm1
# paddsw %mm0, %mm0 # correct factor 2
# paddsw %mm1, %mm1
movq %mm0, (%edi) # store
movq %mm1, 8(%edi)
addl $8, %esi # increment source pointer
addl $16, %edi # increment dest pointer
decl %ecx
jnz .loop_unpack_y # loop
emms
pop %edi
pop %esi
leave
ret
.globl pixel_unpack_u8s16_uv
.type pixel_unpack_u8s16_uv, at function
# unpack unsigned 8 bit chroma pixels to signed 16 bit:
# Cr/Cb are offset by 128, so the sign bit is flipped instead of shifted
# void pixel_unpack_u8s16_uv(char *input, char *output, int32 nb_pixels_div8)
pixel_unpack_u8s16_uv:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
# movl 20(%ebp), %edi # int16[4] array of gains
# movq (%edi), %mm7 # get gain array
movl 8(%ebp), %esi # input uint8 pixel array
movl 12(%ebp), %edi # output sint16 pixel array
movl 16(%ebp), %ecx # nb of elements div 8
pcmpeqw %mm6, %mm6
psllw $15, %mm6 # mm6 <- 4 times 0x8000 (sign-bit mask)
.align 16
.loop_unpack_uv:
movq (%esi), %mm5 # load 8 pixels from memory
pxor %mm0, %mm0 # zero mm0 - mm1
pxor %mm1, %mm1
punpcklbw %mm5, %mm0 # unpack 1st 4 pixels into high bytes
punpckhbw %mm5, %mm1 # unpack 2nd 4 pixels
pxor %mm6, %mm0 # flip sign bit (Cr and Cb are offset by 128)
pxor %mm6, %mm1
# pmulhw %mm7, %mm0 # apply gain
# pmulhw %mm7, %mm1
# paddsw %mm0, %mm0 # correct factor 2
# paddsw %mm1, %mm1
movq %mm0, (%edi) # store
movq %mm1, 8(%edi)
addl $8, %esi # increment source pointer
addl $16, %edi # increment dest pointer
decl %ecx
jnz .loop_unpack_uv # loop
emms
pop %edi
pop %esi
leave
ret
--- NEW FILE: pixel_pack_s16u8.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp at zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
.globl pixel_pack_s16u8_y
.type pixel_pack_s16u8_y, at function
# pack signed 16 bit luma pixels to unsigned 8 bit:
# arithmetic shift down, then pack with unsigned saturation
# (gain / clamping variants kept disabled below)
# void pixel_pack_s16u8_y(int *input, int *output, int nb_8pixel_vectors)
pixel_pack_s16u8_y:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
# movl 20(%ebp), %edi # int16[4] array of gains
# movq (%edi), %mm7 # get gain array
# psllw $1, %mm7 # adjust for shifted sign bit
movl 8(%ebp), %esi # input array
movl 12(%ebp), %edi # output array
movl 16(%ebp), %ecx # pixel count
pxor %mm6, %mm6 # zero vector (only used by disabled code below)
.align 16
.loop_pack_y:
# prefetch 128(%esi)
movq (%esi), %mm0 # load 4 pixels from memory
# pmulhw %mm7, %mm0 # apply gain
movq 8(%esi), %mm1 # load 4 pixels from memory
# pmulhw %mm7, %mm1 # apply gain
# movq %mm0, %mm2
# pcmpgtw %mm6, %mm2 # mm2 > 0 ? 0xffff : 0
# pand %mm2, %mm0
# movq %mm1, %mm3
# pcmpgtw %mm6, %mm3 # mm3 > 0 ? 0xffff : 0
# pand %mm3, %mm1
# psllw $1, %mm0 # shift out sign bit
# psllw $1, %mm1 # shift out sign bit
psraw $7, %mm0 # shift to lsb
psraw $7, %mm1 # shift to lsb
packuswb %mm1, %mm0 # pack & saturate to 8bit vector
movq %mm0, (%edi) # store result in memory
addl $16, %esi # increment source pointer
addl $8, %edi # increment dest pointer
decl %ecx
jnz .loop_pack_y # loop
emms
pop %edi
pop %esi
leave
ret
.globl pixel_pack_s16u8_uv
.type pixel_pack_s16u8_uv, at function
# pack signed 16 bit chroma pixels to unsigned 8 bit:
# signed pack, then flip sign bits to re-apply the 128 chroma offset
# void pixel_pack_s16u8_uv(int *input, int *output, int nb_8pixel_vectors)
pixel_pack_s16u8_uv:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
# movl 20(%ebp), %edi # int16[4] array of gains
# movq (%edi), %mm7 # get gain array
movl 8(%ebp), %esi # input array
movl 12(%ebp), %edi # output array
movl 16(%ebp), %ecx # pixel count
pcmpeqw %mm6, %mm6
psllw $15, %mm6
movq %mm6, %mm5
psrlw $8, %mm5
por %mm5, %mm6 # mm6 <- 8 times 0x80
.align 16
.loop_pack_uv:
# prefetch 128(%esi)
movq (%esi), %mm0 # load 4 pixels from memory
# pmulhw %mm7, %mm0 # apply gain
movq 8(%esi), %mm1 # load 4 pixels from memory
# pmulhw %mm7, %mm1 # apply gain
psraw $8, %mm0 # shift to msb
psraw $8, %mm1
packsswb %mm1, %mm0 # pack & saturate to 8bit vector
pxor %mm6, %mm0 # flip sign bits
movq %mm0, (%edi) # store result in memory
addl $16, %esi # increment source pointer
addl $8, %edi # increment dest pointer
decl %ecx
jnz .loop_pack_uv # loop
emms
pop %edi
pop %esi
leave
ret
--- NEW FILE: pixel_mix_s16.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp at zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
.globl pixel_mix_s16
.type pixel_mix_s16, at function
# weighted mix (crossfade) of two s16 pixel arrays, in place into left:
# left = 2*(left*gain_left + right*gain_right) with signed saturation
# void pixel_mix_s16(int *left, int *right, int nb_4pixel_vectors,
# short int gain_left[4], short int gain_right[4])
pixel_mix_s16:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
movl 20(%ebp), %edi # int16[4] array of gains
movq (%edi), %mm6 # get left gain array
movl 24(%ebp), %edi # int16[4] array of gains
movq (%edi), %mm7 # get right gain array
movl 8(%ebp), %edi # left array (also destination)
movl 12(%ebp), %esi # right array
movl 16(%ebp), %ecx # pixel count
.align 16
.loop_mix:
# prefetch 128(%esi)
movq (%esi), %mm1 # load right 4 pixels from memory
pmulhw %mm7, %mm1 # apply right gain
movq (%edi), %mm0 # load 4 left pixels from memory
pmulhw %mm6, %mm0 # apply left gain
# pslaw $1, %mm1 # shift left ((s).15 x (s).15 -> (s0).14))
# pslaw $1, %mm0
paddsw %mm0, %mm0 # no arithmetic shift-left insn, so use saturating add
paddsw %mm1, %mm1
paddsw %mm1, %mm0 # mix
movq %mm0, (%edi)
addl $8, %esi
addl $8, %edi
decl %ecx
jnz .loop_mix # loop
emms
pop %edi
pop %esi
leave
ret
--- NEW FILE: pixel_ca_s1.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp at zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# this file contains assembler routines for 2D 1 bit cellular automata
# processing. it is organized around a feeder kernel and a
# stack based bit processor (virtual forth machine)
#
# the feeder kernel is responsible for loading/storing CA cells
# from/to memory. data in memory is organized as a scanline
# encoded toroidal bitplane (lsb = left). to simplify the kernel, the top
# left corner of the rectangular grid of pixels will shift down
# every processing step.
#
# the stack machine has the following architecture:
# CA stack: %esi, TOS: %mm0 (32x2 pixels. lsw = top row)
# CA horizon: %mm4-%mm7 (64x4 pixels. %mm4 = top row)
#
# the stack size / organization is not known to the stack machine.
# it can be thought of as operating on a 3x3 cell neighbourhood.
# the only purpose of the forth program is to determine the CA local update rule.
#
# the machine is supposed to be very minimal. no looping control.
# no addressing modes. no conditional code (hey, this is an experiment!)
# so recursion is not allowed (no way to stop it)
# there are 9 words to load the cell neighbourhood on the stack.
# the rest is just logic and stack manips.
# this file contains pure asm macros. it is to be included before assembly
# after scaforth.pl has processed the .scaf file
# *************************** CA CELL ACCESS MACROS *****************************
# fetchTL - fetchBR
# shift / load rectangle macros:
# shift rectangle horizontal
# result is in reg1
# combine the 64x2 bit pair (reg1 = left, reg2 = right) into one 64-bit
# square shifted horizontally by \count bits; result is left in reg1.
# clobbers reg2.
.macro shift reg1 reg2 count
psllq $(32+\count), \reg1
psrlq $(32-\count), \reg2
psrlq $32, \reg1
psllq $32, \reg2
por \reg2, \reg1
.endm
# copy the top two horizon rows (%mm4/%mm5) into reg1/reg2
.macro ldtop reg1 reg2
movq %mm4, \reg1
movq %mm5, \reg2
.endm
# copy the center two horizon rows (%mm5/%mm6) into reg1/reg2
.macro ldcenter reg1 reg2
movq %mm5, \reg1
movq %mm6, \reg2
.endm
# copy the bottom two horizon rows (%mm6/%mm7) into reg1/reg2
.macro ldbottom reg1 reg2
movq %mm6, \reg1
movq %mm7, \reg2
.endm
# fetch from top row
# each fetch loads one shifted neighbourhood square into TOS (%mm0);
# %mm1 is used as scratch by the shift macro.
# fetch the top left square
.macro fetchTL
ldtop %mm0, %mm1
shift %mm0, %mm1, -1
.endm
# fetch the top mid square
.macro fetchTM
ldtop %mm0, %mm1
shift %mm0, %mm1, 0
.endm
# fetch the top right square
.macro fetchTR
ldtop %mm0, %mm1
shift %mm0, %mm1, 1
.endm
# fetch from center row
# fetch the mid left square
.macro fetchML
ldcenter %mm0, %mm1
shift %mm0, %mm1, -1
.endm
# fetch the mid mid square
.macro fetchMM
ldcenter %mm0, %mm1
shift %mm0, %mm1, 0
.endm
# fetch the mid right square
.macro fetchMR
ldcenter %mm0, %mm1
shift %mm0, %mm1, 1
.endm
# fetch from bottom row
# fetch the bottom left square
.macro fetchBL
ldbottom %mm0, %mm1
shift %mm0, %mm1, -1
.endm
# fetch the bottom mid square
.macro fetchBM
ldbottom %mm0, %mm1
shift %mm0, %mm1, 0
.endm
# fetch the bottom right square
.macro fetchBR
ldbottom %mm0, %mm1
shift %mm0, %mm1, 1
.endm
# *************************** CA STACK MANIP MACROS *****************************
# dup drop dropdup swap nip dropover
# stack pointer is %esi (grows downward), TOS is cached in %mm0
.macro dup
lea -8(%esi), %esi # push: make room below, spill TOS
movq %mm0, (%esi)
.endm
.macro drop
movq (%esi), %mm0 # pop: reload TOS from memory
lea 8(%esi), %esi
.endm
.macro dropdup
movq (%esi), %mm0 # drop then dup: overwrite TOS with NOS
.endm
.macro swap
movq (%esi), %mm1 # exchange TOS and NOS
movq %mm0, (%esi)
movq %mm1, %mm0
.endm
.macro nip
lea 8(%esi), %esi # discard NOS, keep TOS
.endm
.macro dropover
movq 8(%esi), %mm0 # drop then over: TOS <- third element
.endm
# *************************** CA BOOLEAN LOGIC MACROS *****************************
# overxor
.macro overxor
pxor (%esi), %mm0 # TOS <- TOS xor NOS (NOS stays on stack)
.endm
--- NEW FILE: pixel_biquad_s16.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp at zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# DIRECT FORM II BIQUAD
#
# y[k] = b0 * x[k] + u1[k-1]
# u1[k] = b1 * x[k] + u2[k-1] - a1 * y[k]
# u2[k] = b2 * x[k] - a2 * y[k]
# MACRO: df2 <reg>
#
# computes a direct form 2 biquad
# does not use {mm0-mm3}\<inreg>
#
# input: <reg> == input
# %mm4 == state 1
# %mm5 == state 2
# (%esi) == biquad coefs (-a1 -a2 b0 b1 b2) in s1.14
# output: <reg> == output
# %mm4 == state 1
# %mm5 == state 2
# one direct-form-II biquad step on 4 parallel s16 samples (see header above);
# only y[k] is scaled back (two saturating doublings = x4) for the s1.14 coefs
.macro df2 reg
movq \reg, %mm6 # mm6 == x[k]
movq \reg, %mm7 # mm7 == x[k]
pmulhw 16(%esi), %mm6 # mm6 == x[k] * b0
pmulhw 24(%esi), %mm7 # mm7 == x[k] * b1
paddw %mm4, %mm6 # mm6 == x[k] * b0 + u1[k-1] == y[k]
paddw %mm5, %mm7 # mm7 == x[k] * b1 + u2[k-1]
paddsw %mm6, %mm6 # compensate for mul = x*y/4 (coefs are s1.14 fixed point)
paddsw %mm6, %mm6 # paddsw ensures saturation
movq \reg, %mm5 # mm5 == x[k]
movq %mm6, %mm4 # mm4 == y[k]
movq %mm6, \reg # reg == y[k] --------------------
pmulhw 0(%esi), %mm4 # mm4 == y[k] * (-a1)
pmulhw 8(%esi), %mm6 # mm6 == y[k] * (-a2)
pmulhw 32(%esi), %mm5 # mm5 == x[k] * b2
paddw %mm7, %mm4 # mm4 == u1[k] --------------------
paddw %mm6, %mm5 # mm5 == u2[k] --------------------
.endm
# input in register:
# %mm0-mm3: input 4x4 pixels {x0 x1 x2 x3}
# %esi: coef memory (-a1, -a2, b0, b1, b2) in s1.14
# %edi: state memory (u1, u2)
# return in register:
# %mm0-mm4: 4x4 pixels result
# filter a 4x4 pixel block: load state from (%edi), run df2 on each of the
# four sample vectors in sequence, store the updated state back
.macro biquad_4x4_pixels
.align 16
movq 0(%edi), %mm4 # get state u1
movq 8(%edi), %mm5 # get state u2
df2 %mm0 # compute 4 biquads per vector
df2 %mm1
df2 %mm2
df2 %mm3
movq %mm4, 0(%edi) # store state
movq %mm5, 8(%edi)
.endm
# in order to use the 4 line parallel biquad routine on horizontal
# lines, we need to reorder (rotate or transpose) the matrix, since
# images are scanline encoded, and we want to work in parallel
# on 4 lines.
#
# since the 4 lines are independent, it doesn't matter in which order
# the vector elements are present.
#
# this allows us to use the same routine for left->right and right->left
# processing.
#
# some comments on the non-abelian group of square isometries consisting of
# (I) identity
# (H) horizontal axis mirror
# (V) vertical axis mirror
# (T) transpose (diagonal axis mirror)
# (A) antitranspose (antidiagonal axis mirror)
# (R1) 90deg anticlockwise rotation
# (R2) 180deg rotation
# (R3) 90deg clockwise rotation
#
#
# we basically have two options: (R1,R3) or (T,A)
# we opt for T and A because they are self inverting, which improves locality
#
# use antitranspose for right to left and transpose
# for left to right (little endian)
# antitranspose 4x4
# input
# %mm3 == {d0 d1 d2 d3}
# %mm2 == {c0 c1 c2 c3}
# %mm1 == {b0 b1 b2 b3}
# %mm0 == {a0 a1 a2 a3}
# output
# %mm3 == {a3 b3 c3 d3}
# %mm2 == {a2 b2 c2 d2}
# %mm1 == {a1 b1 c1 d1}
# %mm0 == {a0 b0 c0 d0}
# NOTE(review): trailing ':' after the macro name below — old gas tolerated
# this; verify it still assembles with current binutils.
.macro antitranspose_4x4:
movq %mm3, %mm4
punpcklwd %mm1, %mm4 # mm4 <- {b2 d2 b3 d3}
movq %mm3, %mm5
punpckhwd %mm1, %mm5 # mm5 <- {b0 d0 b1 d1}
movq %mm2, %mm6
punpcklwd %mm0, %mm6 # mm6 <- {a2 c2 a3 c3}
movq %mm2, %mm7
punpckhwd %mm0, %mm7 # mm7 <- {a0 c0 a1 c1}
movq %mm4, %mm3
punpcklwd %mm6, %mm3 # mm3 <- {a3 b3 c3 d3}
movq %mm4, %mm2
punpckhwd %mm6, %mm2 # mm2 <- {a2 b2 c2 d2}
movq %mm5, %mm1
punpcklwd %mm7, %mm1 # mm1 <- {a1 b1 c1 d1}
movq %mm5, %mm0
punpckhwd %mm7, %mm0 # mm0 <- {a0 b0 c0 d0}
.endm
# transpose 4x4
# input
# %mm3 == {d3 d2 d1 d0}
# %mm2 == {c3 c2 c1 c0}
# %mm1 == {b3 b2 b1 b0}
# %mm0 == {a3 a2 a1 a0}
# output
# %mm3 == {d3 c3 b3 a3}
# %mm2 == {d2 c2 b2 a2}
# %mm1 == {d1 c1 b1 a1}
# %mm0 == {d0 c0 b0 a0}
# NOTE(review): trailing ':' after the macro name below — old gas tolerated
# this; verify it still assembles with current binutils.
.macro transpose_4x4:
movq %mm0, %mm4
punpcklwd %mm2, %mm4 # mm4 <- {c1 a1 c0 a0}
movq %mm0, %mm5
punpckhwd %mm2, %mm5 # mm5 <- {c3 a3 c2 a2}
movq %mm1, %mm6
punpcklwd %mm3, %mm6 # mm6 <- {d1 b1 d0 b0}
movq %mm1, %mm7
punpckhwd %mm3, %mm7 # mm7 <- {d3 b3 d2 b2}
movq %mm4, %mm0
punpcklwd %mm6, %mm0 # mm0 <- {d0 c0 b0 a0}
movq %mm4, %mm1
punpckhwd %mm6, %mm1 # mm1 <- {d1 c1 b1 a1}
movq %mm5, %mm2
punpcklwd %mm7, %mm2 # mm2 <- {d2 c2 b2 a2}
movq %mm5, %mm3
punpckhwd %mm7, %mm3 # mm3 <- {d3 c3 b3 a3}
.endm
.globl pixel_biquad_vertb_s16
.type pixel_biquad_vertb_s16, at function
# pixel_biquad_vertbr_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
#
# Vertical (top-to-bottom) biquad pass over 4 pixel columns using the
# biquad_4x4_pixels macro; filter state carried across blocks in (%edi).
# cdecl: %ebx/%esi/%edi saved; clobbers %eax, %ecx, %edx, mm0-mm7 (emms at exit).
pixel_biquad_vertb_s16:
pushl %ebp
movl %esp, %ebp
push %ebx
push %esi
push %edi
movl 8(%ebp), %ebx # pixel array offset
movl 12(%ebp), %ecx # nb of 4x4 pixblocks
movl 16(%ebp), %edx # line width in pixels
movl 20(%ebp), %esi # coefs
movl 24(%ebp), %edi # state
shll $1, %edx # short int addressing: edx = row stride in bytes
movl %edx, %eax
shll $1, %eax
addl %edx, %eax # eax = 3 * edx (byte offset of 4th row)
.align 16
.biquad_vertb_line_loop:
movq (%ebx), %mm0 # load a 4x4 block, one quad per row
movq (%ebx,%edx,1), %mm1
movq (%ebx,%edx,2), %mm2
movq (%ebx,%eax,1), %mm3
biquad_4x4_pixels
movq %mm0, (%ebx) # store filtered rows back in place
movq %mm1, (%ebx,%edx,1)
movq %mm2, (%ebx,%edx,2)
movq %mm3, (%ebx,%eax,1)
addl %edx, %ebx # advance 4 rows down (edx + 3*edx)
addl %eax, %ebx
decl %ecx
jnz .biquad_vertb_line_loop
emms
pop %edi
pop %esi
pop %ebx
leave
ret
.globl pixel_biquad_verbt_s16
.type pixel_biquad_verbt_s16, at function
# pixel_biquad_vertbt_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
#
# Vertical bottom-to-top biquad pass: starts at the last 4x4 block and
# walks upward, loading rows in reverse order (mm3..mm0) so the filter
# still sees samples in processing order.
pixel_biquad_verbt_s16:
pushl %ebp
movl %esp, %ebp
push %ebx
push %esi
push %edi
movl 8(%ebp), %ebx # pixel array offset
movl 12(%ebp), %ecx # nb of 4x4 pixblocks
movl 16(%ebp), %eax # line width in pixels
shll $3, %eax # 4 line byte spacing (4 rows * 2 bytes/pixel)
decl %ecx
mul %ecx # eax = block stride * (nb_blocks - 1); clobbers %edx
incl %ecx
addl %eax, %ebx # ebx points to last pixblock
movl 16(%ebp), %edx # line width in pixels
movl 20(%ebp), %esi # coefs
movl 24(%ebp), %edi # state
shll $1, %edx # short int addressing: edx = row stride in bytes
movl %edx, %eax
shll $1, %eax
addl %edx, %eax # eax = 3 * edx (byte offset of 4th row)
.align 16
.biquad_verbt_line_loop:
movq (%ebx), %mm3 # load rows reversed: bottom-most row first
movq (%ebx,%edx,1), %mm2
movq (%ebx,%edx,2), %mm1
movq (%ebx,%eax,1), %mm0
biquad_4x4_pixels
movq %mm3, (%ebx) # store back in the same reversed order
movq %mm2, (%ebx,%edx,1)
movq %mm1, (%ebx,%edx,2)
movq %mm0, (%ebx,%eax,1)
subl %edx, %ebx # step 4 rows up
subl %eax, %ebx
decl %ecx
jnz .biquad_verbt_line_loop
emms
pop %edi
pop %esi
pop %ebx
leave
ret
.globl pixel_biquad_horlr_s16
.type pixel_biquad_horlr_s16, at function
# pixel_biquad_hor_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
#
# Horizontal left-to-right biquad pass over 4 rows: each 4x4 block is
# transposed, filtered, and transposed back (transpose is self-inverse).
pixel_biquad_horlr_s16:
pushl %ebp
movl %esp, %ebp
push %ebx
push %esi
push %edi
movl 8(%ebp), %ebx # pixel array offset
movl 12(%ebp), %ecx # nb of 4x4 pixblocks
movl 16(%ebp), %edx # line width in pixels
movl 20(%ebp), %esi # coefs
movl 24(%ebp), %edi # state
shll $1, %edx # short int addressing: edx = row stride in bytes
movl %edx, %eax
shll $1, %eax
addl %edx, %eax # eax = 3 * edx (byte offset of 4th row)
.align 16
.biquad_horlr_line_loop:
movq (%ebx), %mm0 # load a 4x4 block, one quad per row
movq (%ebx,%edx,1), %mm1
movq (%ebx,%edx,2), %mm2
movq (%ebx,%eax,1), %mm3
transpose_4x4
biquad_4x4_pixels
transpose_4x4
movq %mm0, (%ebx)
movq %mm1, (%ebx,%edx,1)
movq %mm2, (%ebx,%edx,2)
movq %mm3, (%ebx,%eax,1)
addl $8, %ebx # advance 4 pixels (8 bytes) to the right
decl %ecx
jnz .biquad_horlr_line_loop
emms
pop %edi
pop %esi
pop %ebx
leave
ret
.globl pixel_biquad_horrl_s16
.type pixel_biquad_horrl_s16, at function
# pixel_biquad_horrl_s16(char *pixel_array, int nb_rows, int linewidth, short int coef[20], short int state[8])
#
# Horizontal right-to-left biquad pass: starts at the last 4x4 block and
# walks leftward, using the antitranspose (also self-inverse) so samples
# are presented to the filter in right-to-left order.
pixel_biquad_horrl_s16:
pushl %ebp
movl %esp, %ebp
push %ebx
push %esi
push %edi
movl 8(%ebp), %ebx # pixel array offset
movl 12(%ebp), %ecx # nb of 4x4 pixblocks
movl 16(%ebp), %edx # line width in pixels
movl %ecx, %eax
decl %eax
shll $3, %eax # (nb_blocks - 1) * 8 bytes
addl %eax, %ebx # ebx points to last pixblock
movl 20(%ebp), %esi # coefs
movl 24(%ebp), %edi # state
shll $1, %edx # short int addressing: edx = row stride in bytes
movl %edx, %eax
shll $1, %eax
addl %edx, %eax # eax = 3 * edx (byte offset of 4th row)
.align 16
.biquad_horrl_line_loop:
movq (%ebx), %mm0 # load a 4x4 block, one quad per row
movq (%ebx,%edx,1), %mm1
movq (%ebx,%edx,2), %mm2
movq (%ebx,%eax,1), %mm3
antitranspose_4x4
biquad_4x4_pixels
antitranspose_4x4
movq %mm0, (%ebx)
movq %mm1, (%ebx,%edx,1)
movq %mm2, (%ebx,%edx,2)
movq %mm3, (%ebx,%eax,1)
subl $8, %ebx # step 4 pixels (8 bytes) to the left
decl %ecx
jnz .biquad_horrl_line_loop
emms
pop %edi
pop %esi
pop %ebx
leave
ret
.globl pixel_biquad_time_s16
.type pixel_biquad_time_s16, at function
# pixel_biquad_time_s16(short int *pixel_array, short int *s1, short int *s2, short int *coefs, int nb_4_pix_vectors)
#
# Temporal biquad: filters each pixel along time, with per-pixel state
# arrays s1/s2 (one s16 per pixel) updated in place on every call/frame.
pixel_biquad_time_s16:
pushl %ebp
movl %esp, %ebp
push %ebx
push %esi
push %edi
movl 8(%ebp), %ebx # pixel array offset
movl 12(%ebp), %edx # state 1 array
movl 16(%ebp), %edi # state 2 array
movl 20(%ebp), %esi # coefs
movl 24(%ebp), %ecx # nb of 4 pixel vectors
.align 16
.biquad_time_loop:
movq (%ebx), %mm0 # get input
movq (%edx), %mm4 # get state 1
movq (%edi), %mm5 # get state 2
df2 %mm0 # compute direct form 2
movq %mm0, (%ebx) # write output
movq %mm5, (%edi) # write state 2
movq %mm4, (%edx) # write state 1
addl $8, %ebx
addl $8, %edi
addl $8, %edx
decl %ecx
jnz .biquad_time_loop
emms
pop %edi
pop %esi
pop %ebx
leave
ret
--- NEW FILE: pixel_conv_hor_s16.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp at zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# intermediate function (local helper, not exported):
# 3-tap horizontal convolution of one 4-pixel vector, using the previous
# and next vectors for the edge taps
# input in register:
# %mm0: left 4 pixels
# %mm1: middle 4 pixels
# %mm2: right 4 pixels
# %mm5: left 4 pixel masks (tap weights)
# %mm6: middle 4 pixel masks
# %mm7: right 4 pixel masks
# return in register:
# %mm0: middle 4 pixels result
# clobbers: %mm3, %mm4
.conv_hor_4_pixels:
.align 16
# compute quadruplet
# get left pixels
psrlq $48, %mm0 # shift word 3 to word 0
movq %mm1, %mm4
psllq $16, %mm4 # shift word 0,1,2 to 1,2,3
por %mm4, %mm0 # combine: pixels shifted one left
pmulhw %mm5, %mm0 # weight (pmulhw halves, so ...
psllw $1, %mm0 # ... restore scale with shift left)
# get middle pixels
movq %mm1, %mm4
pmulhw %mm6, %mm4
psllw $1, %mm4
paddsw %mm4, %mm0
# get right pixels
movq %mm2, %mm3
psllq $48, %mm3 # shift word 0 to word 3
movq %mm1, %mm4
psrlq $16, %mm4 # shift word 1,2,3 to 0,1,2
por %mm4, %mm3 # combine: pixels shifted one right
pmulhw %mm7, %mm3
psllw $1, %mm3
paddsw %mm3, %mm0 # accumulate
ret
.globl pixel_conv_hor_s16
.type pixel_conv_hor_s16, at function
# pixel_conv_hor_s16(short int *pixel_array, int nb_4_pixel_vectors, short int border[4], short int mask[12])
# horizontal pixel convolution with 3 tap-weight vectors; border[] supplies
# the out-of-range neighbours at both line ends
# NOT TESTED (per original author)
# NOTE(review): the count comment below says "8 pixel vectors" while the
# parameter name says 4-pixel vectors — confirm against callers.
pixel_conv_hor_s16:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
movl 8(%ebp), %esi # pixel array offset
movl 12(%ebp), %ecx # nb of 8 pixel vectors in a row (at least 2)
movl 20(%ebp), %edi # mask vector
movq (%edi), %mm5 # left tap weights
movq 8(%edi), %mm6 # middle tap weights
movq 16(%edi), %mm7 # right tap weights
movl 16(%ebp), %edi # boundary pixel vector
movq (%edi), %mm0 # init regs (left edge, border acts as left neighbour)
movq (%esi), %mm1
movq 8(%esi), %mm2
decl %ecx # loop has 2 terminator stubs
decl %ecx # todo: handle if ecx < 3
jmp .conv_line_loop
.align 16
.conv_line_loop:
call .conv_hor_4_pixels # compute conv
movq %mm0, (%esi) # store result
movq %mm1, %mm0 # mm0 <- prev (%esi)
movq %mm2, %mm1 # mm1 <- 8(%esi)
movq 16(%esi), %mm2 # mm2 <- 16(%esi)
addl $8, %esi # increase pointer
decl %ecx
jnz .conv_line_loop
call .conv_hor_4_pixels # second-to-last vector
movq %mm0, (%esi) # store result
movq %mm1, %mm0 # mm0 <- prev (%esi)
movq %mm2, %mm1 # mm1 <- 8(%esi)
movq (%edi), %mm2 # mm2 <- border (right edge)
call .conv_hor_4_pixels # compute last vector
movq %mm0, 8(%esi) # store it
emms
pop %edi
pop %esi
leave
ret
--- NEW FILE: pixel_crot_s16.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp at zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
.globl pixel_crot3d_s16
.type pixel_crot3d_s16, at function
# 3 dimensional colour space rotation
# 3x3 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
# the three colour planes are stored contiguously, plane stride = 8 * count bytes
# void pixel_crot3d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
pixel_crot3d_s16:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
movl 8(%ebp), %esi # input array
movl 12(%ebp), %ecx # pixel count
movl 16(%ebp), %edi # rotation matrix
movl %ecx, %edx
shll $3, %edx # %edx = plane spacing in bytes
.align 16
.loop_crot3d:
movq (%esi), %mm0 # get 1st component
movq (%esi,%edx,1), %mm6 # get 2nd component
movq (%esi,%edx,2), %mm7 # get 3rd component
movq %mm0, %mm1 # copy 1st component
movq %mm0, %mm2
pmulhw (%edi), %mm0 # mul first column
pmulhw 8(%edi), %mm1
pmulhw 16(%edi), %mm2
movq %mm6, %mm5 # copy 2nd component
movq %mm6, %mm3
pmulhw 24(%edi), %mm6 # mul second column
pmulhw 32(%edi), %mm5
pmulhw 40(%edi), %mm3
paddsw %mm6, %mm0 # accumulate
paddsw %mm5, %mm1
paddsw %mm3, %mm2
movq %mm7, %mm4 # copy 3rd component
movq %mm7, %mm6
pmulhw 48(%edi), %mm4 # mul third column
pmulhw 56(%edi), %mm6
pmulhw 64(%edi), %mm7
paddsw %mm4, %mm0 # accumulate
paddsw %mm6, %mm1
paddsw %mm7, %mm2
paddsw %mm0, %mm0 # double (fixed point normalization)
paddsw %mm1, %mm1
paddsw %mm2, %mm2
movq %mm0, (%esi) # store all three planes
movq %mm1, (%esi, %edx, 1)
movq %mm2, (%esi, %edx, 2)
addl $8, %esi # increment source pointer
decl %ecx
jnz .loop_crot3d # loop
emms
pop %edi
pop %esi
leave
ret
.globl pixel_crot2d_s16
.type pixel_crot2d_s16, at function
# 2 dimensional colour space rotation
# 2x2 matrix is column encoded, each coefficient is a 4x16 bit fixed point vector
# the two colour planes are stored contiguously, plane stride = 8 * count bytes
# void pixel_crot2d_s16(int *buf, int nb_4pixel_vectors_per_plane, short int *matrix)
pixel_crot2d_s16:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
movl 8(%ebp), %esi # input array
movl 12(%ebp), %ecx # pixel count
movl 16(%ebp), %edi # rotation matrix
movl %ecx, %edx
shll $3, %edx # %edx = plane spacing in bytes
.align 16
.loop_crot2d:
movq (%esi), %mm0 # get 1st component
movq (%esi,%edx,1), %mm2 # get 2nd component
movq %mm0, %mm1 # copy 1st component
movq %mm2, %mm3 # copy 2nd component
pmulhw (%edi), %mm0 # mul first column
pmulhw 8(%edi), %mm1
pmulhw 16(%edi), %mm2 # mul second column
pmulhw 24(%edi), %mm3
paddsw %mm2, %mm0 # accumulate
paddsw %mm3, %mm1
paddsw %mm0, %mm0 # fixed point gain correction
paddsw %mm1, %mm1
movq %mm0, (%esi) # store both planes
movq %mm1, (%esi, %edx, 1)
addl $8, %esi # increment source pointer
decl %ecx
jnz .loop_crot2d # loop
emms
pop %edi
pop %esi
leave
ret
--- NEW FILE: pixel_s1.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp at zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# this file contains ops for binary image processing
# 8x8 bit tile encoded
# low byte = bottom row
# low bit = right column
# %mm7 = scratch reg for all macros
# ************ load mask *******************
# compute bit masks for rows and columns
# %mm7: scratch reg
# load mask top
# set the top \count rows of the 8x8 tile in reg (all bits in those rows)
.macro ldmt count reg
pcmpeqb \reg, \reg
psllq $(64-(\count<<3)), \reg
.endm
# load mask bottom
.macro ldmb count reg
pcmpeqb \reg, \reg
psrlq $(64-(\count<<3)), \reg
.endm
# load mask top and bottom
.macro ldmtb count regt regb
ldmb \count, \regb
ldmt \count, \regt
.endm
# load mask right: \count rightmost columns set in every row
.macro ldmr count reg
pcmpeqb %mm7, %mm7
psrlw $(16-\count), %mm7
movq %mm7, \reg
psllq $8, %mm7 # replicate the per-word mask to the odd bytes
por %mm7, \reg
.endm
# load mask left: \count leftmost columns set in every row
.macro ldml count reg
pcmpeqb %mm7, %mm7
psllw $(16-\count), %mm7
movq %mm7, \reg
psrlq $8, %mm7 # replicate the per-word mask to the even bytes
por %mm7, \reg
.endm
# load mask left and right
.macro ldmlr count regl regr
pcmpeqb %mm7, %mm7
psllw $(16-\count), %mm7
movq %mm7, \regl
psrlq $8, %mm7
por %mm7, \regl
movq \regl, \regr # right mask derived from the left one
psrlq $(8-\count), \regr
.endm
# ************* shift square **********
# shifts a square in reg, fills with zeros
# shift square top
.macro sst count reg
psllq $(\count<<3), \reg
.endm
# shift square bottom
.macro ssb count reg
psrlq $(\count<<3), \reg
.endm
# not tested
# shift square left (per-row shift; clobbers %mm7)
.macro ssl count reg
movq \reg, %mm7
pcmpeqb \reg, \reg
psllw $(16-\count), \reg
psrlw $8, \reg
pandn %mm7, \reg # mask off bits that would cross a row boundary
psllw $(\count), \reg
.endm
# shift square right (per-row shift; clobbers %mm7)
.macro ssr count reg
movq \reg, %mm7
pcmpeqb \reg, \reg
psrlw $(16-\count), \reg
psllw $8, \reg
pandn %mm7, \reg # mask off bits that would cross a row boundary
psrlw $(\count), \reg
.endm
# ********** combine square *************
# combines 2 squares into reg (clobbers the other operand register)
# combine right
.macro csr count regr reg
ssl \count, \reg
ssr (8-\count), \regr
por \regr, \reg
.endm
# combine left
.macro csl count regl reg
ssr \count, \reg
ssl (8-\count), \regl
por \regl, \reg
.endm
# combine top
.macro cst count regt reg
ssb \count, \reg
sst (8-\count), \regt
por \regt, \reg
.endm
# combine bottom
.macro csb count regb reg
sst \count, \reg
ssb (8-\count), \regb
por \regb, \reg
.endm
# ********** load combine square *************
# loads combined square using mask (clobbers %mm7)
# load combined square left
# mask should be count bits set right (i.e. 0x01)
.macro lcsml count mask source sourcel dstreg
movq \mask, \dstreg
movq \mask, %mm7
pandn \source, \dstreg # unmasked columns from source
pand \sourcel, %mm7 # masked columns from the left neighbour
psrlq $(\count), \dstreg
psllq $(8-\count), %mm7
por %mm7, \dstreg
.endm
.globl pixel_test_s1
.type pixel_test_s1, at function
# test entry point for the s1 macros above: loads one combined 8x8 square
# from source/source+8 and stores it to dest
# void pixel_test_s1(void *dest, void *source, int nb_squares, int spacing)
# (nb_squares and spacing are loaded but currently unused)
pixel_test_s1:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
movl 8(%ebp), %edi # dest
movl 12(%ebp), %esi # source
movl 16(%ebp), %ecx # count (unused)
movl 20(%ebp), %edx # row distance (unused)
ldmr 1, %mm6 # mask: rightmost column of each row
lcsml 1, %mm6, (%esi), 8(%esi), %mm0 # combined load, shifted left by 1
movq %mm0, (%edi)
# movq (%esi), %mm0
# movq 8(%esi), %mm1
# csl 4, %mm1, %mm0
# movq %mm0, (%edi)
emms
pop %edi
pop %esi
leave
ret
--- NEW FILE: pdp_mmx_test.c ---
#include "pdp_mmx.h"
/* float -> signed 16 bit fixed point, scale 2*256 = 512 (9 fractional
   bits -- TODO confirm intended format); unused in this driver */
#define FP(x) ((short int)(((float)(x) * 2 * 256.0f)))
#define nbp 256 /* pixel count constant (unused in this driver) */
/* one mmx vector (4 x s16) each, all lanes = 0x0100.  NOTE(review):
   the names look like biquad coefficients (a1,a2,b0..b2), filter
   state (u1,u2) and input samples (x0..x3) for the pixel_biquad
   routines -- presumably leftovers; verify against pixel_biquad_s16.s */
short int a1[4] = {0x0100,0x0100,0x0100,0x0100};
short int a2[4] = {0x0100,0x0100,0x0100,0x0100};
short int b0[4] = {0x0100,0x0100,0x0100,0x0100};
short int b1[4] = {0x0100,0x0100,0x0100,0x0100};
short int b2[4] = {0x0100,0x0100,0x0100,0x0100};
short int u1[4] = {0x0100,0x0100,0x0100,0x0100};
short int u2[4] = {0x0100,0x0100,0x0100,0x0100};
short int x0[4] = {0x0100,0x0100,0x0100,0x0100};
short int x1[4] = {0x0100,0x0100,0x0100,0x0100};
short int x2[4] = {0x0100,0x0100,0x0100,0x0100};
short int x3[4] = {0x0100,0x0100,0x0100,0x0100};
/* print one pixel cell: "x " when the bit is set, ". " otherwise */
void print_pixel(unsigned int i)
{
    printf(i ? "x " : ". ");
}
/* terminate a printed row with a newline */
void print_line(void)
{
    putchar('\n');
}
/* print an 8x8 one-bit square: c[7] is the top row, c[0] the bottom;
   bit 7 (msb) of each byte is the leftmost column */
void print_square(unsigned char *c)
{
    int row, col;

    for (row = 7; row >= 0; row--) {
        for (col = 0; col < 8; col++)
            print_pixel(c[row] & (0x80u >> col));
        printf("\n");
    }
}
/* test driver: prints the two 8x8 input squares and the square
 * produced by the pixel_test_s1 asm routine.  returns 0. */
int main(void)
{
    /* two squares, one byte per row; the negative initializers wrap
       modulo 256 (e.g. -1 -> 0xff), giving a dense second square */
    unsigned char src[16] = {1, 2, 3, 4, 5, 6, 7, 8,
                             -1, -2, -3, -4, -5, -6, -7, -8};
    unsigned char dst[8];

    print_square(src);      /* left/first input square */
    print_line();
    print_square(src + 8);  /* right/second input square */
    print_line();

    pixel_test_s1(dst, src, 1, 1);  /* combine via the mmx macros */
    print_square(dst);
    print_line();
    return 0;
}
--- NEW FILE: pixel_mul_s16.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
.globl pixel_mul_s16
.type pixel_mul_s16,@function
# pixelwise fixed point multiply
# void pixel_mul_s16(int *left, int *right, int nb_4pixel_vectors)
# i386 cdecl.  left[i] = (left[i] * right[i]) >> 15 for each s16
# pixel, i.e. a signed 0.15 fixed point product; the result replaces
# the left array.  nb_4pixel_vectors counts 4-pixel (64 bit) vectors.
pixel_mul_s16:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
movl 8(%ebp), %edi	# left array (source and destination)
movl 12(%ebp), %esi	# right array
movl 16(%ebp), %ecx	# vector count
testl %ecx, %ecx	# guard zero count: decl/jnz would otherwise
jz .mul_done		# wrap and run ~2^32 iterations
.align 16
.loop_mul:
# prefetch 128(%esi)	# (disabled prefetch hint kept from original)
movq (%esi), %mm1	# load right 4 pixels from memory
movq (%edi), %mm0	# load 4 left pixels from memory
pmulhw %mm1, %mm0	# high word of product: (l*r) >> 16
psllw $1, %mm0		# fixed point correction: -> (l*r) >> 15
movq %mm0, (%edi)	# store back into left array
addl $8, %esi
addl $8, %edi
decl %ecx
jnz .loop_mul		# loop over all vectors
.mul_done:
emms			# reset mmx/fpu state for the caller
pop %edi
pop %esi
leave
ret
--- NEW FILE: pixel_gain.s ---
# Pure Data Packet mmx routine.
# Copyright (c) by Tom Schouten <pdp@zzz.kotnet.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
.globl pixel_gain
.type pixel_gain,@function
# mmx rgba pixel gain (in place)
# void pixel_gain(unsigned char *pixelarray, int nbpixels, short int *rgba_gain)
# i386 cdecl.  gains are 7.9 fixed point, one s16 per channel (rgba).
# 4 rgba pixels (16 bytes) are processed per iteration, so nbpixels is
# rounded down to a multiple of 4; a remainder is left untouched.
pixel_gain:
pushl %ebp
movl %esp, %ebp
push %esi
push %edi
movl 8(%ebp), %esi	# pixel array offset
movl 12(%ebp), %ecx	# nb of elements
movl 16(%ebp), %edi	# int16[4] array of gains
prefetch (%esi)		# NOTE(review): `prefetch` is 3dnow! (amd only);
			# intel would need sse prefetcht0 -- confirm target cpu
emms
sarl $2, %ecx		# process 4 pixels per loop iteration
jz .exit		# nothing to do (negative counts are NOT caught)
movq (%edi), %mm7	# read gain array from memory
jmp .loop_gain		# hop over the .align padding bytes
.align 16
.loop_gain:
prefetch 128(%esi)	# 3dnow! prefetch, see note above
movq (%esi), %mm5	# load pixel 1-2 from memory
movq 8(%esi), %mm6	# load pixel 3-4 from memory
pxor %mm0, %mm0		# zero mm0 - mm3
pxor %mm1, %mm1
pxor %mm2, %mm2
pxor %mm3, %mm3
punpcklbw %mm5, %mm0	# unpack 1st pixel into 8.8 bit ints
punpckhbw %mm5, %mm1	# unpack 2nd
punpcklbw %mm6, %mm2	# unpack 3rd
punpckhbw %mm6, %mm3	# unpack 4th
psrlw $0x1, %mm0	# shift right to clear sign bit: 9.7
psrlw $0x1, %mm1
psrlw $0x1, %mm2
psrlw $0x1, %mm3
pmulhw %mm7, %mm0	# multiply 1st pixel: 9.7 * 7.9 -> 16.0
pmulhw %mm7, %mm1	# multiply 2nd
pmulhw %mm7, %mm2	# multiply 3rd
pmulhw %mm7, %mm3	# multiply 4th
packuswb %mm1, %mm0	# pack & saturate pixels 1-2 to 8 bit
movq %mm0, (%esi)	# store result in place
packuswb %mm3, %mm2	# pack & saturate pixels 3-4 to 8 bit
movq %mm2, 8(%esi)	# store result in place
addl $16, %esi		# advance to next 4 pixels
decl %ecx
jnz .loop_gain		# loop
.exit:
emms			# reset mmx/fpu state for the caller
pop %edi
pop %esi
leave
ret
- Previous message: [PD-cvs] externals/pdp/system/image Makefile, 1.2, 1.3 pdp_imageproc_common.c, 1.2, 1.3 pdp_imageproc_mmx.c, 1.2, 1.3 pdp_imageproc_portable.c, 1.2, 1.3 pdp_llconv.c, 1.2, 1.3 pdp_llconv_mmx.c, 1.2, 1.3 pdp_llconv_portable.c, 1.2, 1.3 pdp_resample.c, 1.2, 1.3
- Next message: [PD-cvs] externals/pdp/system/type Makefile, 1.2, 1.3 pdp_bitmap.c, 1.2, 1.3 pdp_image.c, 1.2, 1.3 pdp_matrix.c, 1.2, 1.3
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
More information about the Pd-cvs
mailing list