[PD-cvs] pd/src m_simd_sse_gcc.c,1.1.4.11,1.1.4.12

Tim Blechmann timblech at users.sourceforge.net
Sun Jan 23 22:13:13 CET 2005


Update of /cvsroot/pure-data/pd/src
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv4633

Modified Files:
      Tag: devel_0_38
	m_simd_sse_gcc.c 
Log Message:
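Changed the inline asm calls to pass scalar values by pointer and load them into explicit %xmm registers (so the file should compile without the -msse flag), and commented out the sumvec_simd routine.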


Index: m_simd_sse_gcc.c
===================================================================
RCS file: /cvsroot/pure-data/pd/src/Attic/m_simd_sse_gcc.c,v
retrieving revision 1.1.4.11
retrieving revision 1.1.4.12
diff -C2 -d -r1.1.4.11 -r1.1.4.12
*** m_simd_sse_gcc.c	18 Jan 2005 15:31:04 -0000	1.1.4.11
--- m_simd_sse_gcc.c	23 Jan 2005 21:13:11 -0000	1.1.4.12
***************
*** 40,44 ****
      asm(
  		".set T_FLOAT,4                            \n" /* sizeof(t_float) */
! 		"shufps    $0,%2,%2                        \n" /* load value */
  		"shr       $4,%0                           \n"
  		
--- 40,45 ----
      asm(
  		".set T_FLOAT,4                            \n" /* sizeof(t_float) */
! 		"movss     (%2),%%xmm0                     \n"
! 		"shufps    $0,%%xmm0,%%xmm0                \n" /* load value */
  		"shr       $4,%0                           \n"
  		
***************
*** 46,59 ****
  		/* *dst = v */
  		"1:                                        \n"
! 		"movaps    %2, (%1)                        \n"
! 		"movaps    %2, 4*T_FLOAT(%1)               \n"
! 		"movaps    %2, 8*T_FLOAT(%1)               \n"
! 		"movaps    %2, 12*T_FLOAT(%1)              \n"
  		
  		"addl      $16*T_FLOAT,%1                  \n"
  		"loop      1b                              \n"
  		:
! 		:"c"(n),"r"(dst),"x"((t_float)v)
! 		);
  }
  
--- 47,60 ----
  		/* *dst = v */
  		"1:                                        \n"
! 		"movaps    %%xmm0, (%1)                    \n"
! 		"movaps    %%xmm0, 4*T_FLOAT(%1)           \n"
! 		"movaps    %%xmm0, 8*T_FLOAT(%1)           \n"
! 		"movaps    %%xmm0, 12*T_FLOAT(%1)          \n"
  		
  		"addl      $16*T_FLOAT,%1                  \n"
  		"loop      1b                              \n"
  		:
! 		:"c"(n),"r"(dst),"r"(&v)
! 		:"%xmm0");
  }
  
***************
*** 366,370 ****
  	".set T_FLOAT,4                            \n"
  	
! 	"shufps    $0, %1, %1                      \n"
  	"shrl      $4, %3                          \n" /* divide by 16 */
  
--- 367,372 ----
  	".set T_FLOAT,4                            \n"
  	
! 	"movss     (%1), %%xmm0                    \n"
! 	"shufps    $0, %%xmm0, %%xmm0              \n"
  	"shrl      $4, %3                          \n" /* divide by 16 */
  
***************
*** 372,388 ****
  	"1:                                        \n"
  	"movaps    (%0), %%xmm1                    \n"
! 	"addps     %1, %%xmm1                      \n"
  	"movaps    %%xmm1, (%2)                    \n"
      
  	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
! 	"addps     %1, %%xmm2                      \n"
  	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
  	
  	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
! 	"addps     %1, %%xmm3                      \n"
  	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
  	
  	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
! 	"addps     %1, %%xmm4                      \n"
  	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
  	
--- 374,390 ----
  	"1:                                        \n"
  	"movaps    (%0), %%xmm1                    \n"
! 	"addps     %%xmm0, %%xmm1                  \n"
  	"movaps    %%xmm1, (%2)                    \n"
      
  	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
! 	"addps     %%xmm0, %%xmm2                  \n"
  	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
  	
  	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
! 	"addps     %%xmm0, %%xmm3                  \n"
  	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
  	
  	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
! 	"addps     %%xmm0, %%xmm4                  \n"
  	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
  	
***************
*** 392,397 ****
  	:
  	/* in, value, out, n */
! 	:"r"(w[1]),"x"(*(t_float*)w[2]),"r"(w[3]),"c"(w[4])
! 	:"%xmm1","%xmm2","%xmm3","%xmm4"
  	);
      return w+5;
--- 394,399 ----
  	:
  	/* in, value, out, n */
! 	:"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4])
! 	:"%xmm0", "%xmm1","%xmm2","%xmm3","%xmm4"
  	);
      return w+5;
***************
*** 442,446 ****
  	".set T_FLOAT,4                            \n"
  	
!         "shufps    $0, %1, %1                      \n"
  	"shrl      $4, %3                          \n" /* divide by 16 */
  
--- 444,449 ----
  	".set T_FLOAT,4                            \n"
  	
! 	"movss     (%1), %%xmm0                    \n"
! 	"shufps    $0, %%xmm0, %%xmm0              \n"
  	"shrl      $4, %3                          \n" /* divide by 16 */
  
***************
*** 448,464 ****
  	"1:                                        \n"
  	"movaps    (%0), %%xmm1                    \n"
! 	"subps     %1, %%xmm1                      \n"
  	"movaps    %%xmm1, (%2)                    \n"
      
  	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
! 	"subps     %1, %%xmm2                      \n"
  	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
  	
  	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
! 	"subps     %1, %%xmm3                      \n"
  	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
  	
  	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
! 	"subps     %1, %%xmm4                      \n"
  	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
  	
--- 451,467 ----
  	"1:                                        \n"
  	"movaps    (%0), %%xmm1                    \n"
! 	"subps     %%xmm0, %%xmm1                  \n"
  	"movaps    %%xmm1, (%2)                    \n"
      
  	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
! 	"subps     %%xmm0, %%xmm2                  \n"
  	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
  	
  	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
! 	"subps     %%xmm0, %%xmm3                  \n"
  	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
  	
  	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
! 	"subps     %%xmm0, %%xmm4                  \n"
  	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
  	
***************
*** 468,473 ****
  	:
  	/* in, value, out, n */
! 	:"r"(w[1]),"x"(*(t_float*)w[2]),"r"(w[3]),"c"(w[4])
! 	:"%xmm1","%xmm2","%xmm3","%xmm4"
  	);
      return w+5;
--- 471,476 ----
  	:
  	/* in, value, out, n */
! 	:"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4])
! 	:"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4"
  	);
      return w+5;
***************
*** 519,523 ****
  	".set T_FLOAT,4                            \n"
  	
!  	"shufps    $0, %1, %1                      \n" 
  	"shrl      $4, %3                          \n" /* divide by 16 */
  
--- 522,527 ----
  	".set T_FLOAT,4                            \n"
  	
! 	"movss     (%1), %%xmm0                    \n"
! 	"shufps    $0, %%xmm0, %%xmm0              \n"
  	"shrl      $4, %3                          \n" /* divide by 16 */
  
***************
*** 525,541 ****
  	"1:                                        \n"
  	"movaps    (%0), %%xmm1                    \n"
! 	"mulps     %1, %%xmm1                      \n"
  	"movaps    %%xmm1, (%2)                    \n"
      
  	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
! 	"mulps     %1, %%xmm2                      \n"
  	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
  	
  	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
! 	"mulps     %1, %%xmm3                      \n"
  	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
  	
  	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
! 	"mulps     %1, %%xmm4                      \n"
  	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
  	
--- 529,545 ----
  	"1:                                        \n"
  	"movaps    (%0), %%xmm1                    \n"
! 	"mulps     %%xmm0, %%xmm1                  \n"
  	"movaps    %%xmm1, (%2)                    \n"
      
  	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
! 	"mulps     %%xmm0, %%xmm2                  \n"
  	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
  	
  	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
! 	"mulps     %%xmm0, %%xmm3                  \n"
  	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
  	
  	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
! 	"mulps     %%xmm0, %%xmm4                  \n"
  	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
  	
***************
*** 545,550 ****
  	:
  	/* in, value, out, n */
! 	:"r"(w[1]),"x"(*(t_float*)w[2]),"r"(w[3]),"c"(w[4])
! 	:"%xmm1","%xmm2","%xmm3","%xmm4"
  	);
      return w+5;
--- 549,554 ----
  	:
  	/* in, value, out, n */
! 	:"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4])
! 	:"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4"
  	);
      return w+5;
***************
*** 632,636 ****
  	".set T_FLOAT,4                            \n"
  	
!         "shufps    $0, %1, %1                      \n"
  	"shrl      $4, %3                          \n" /* divide by 16 */
  
--- 636,641 ----
  	".set T_FLOAT,4                            \n"
  	
! 	"movss     (%1), %%xmm0                    \n"
! 	"shufps    $0, %%xmm0, %%xmm0              \n"
  	"shrl      $4, %3                          \n" /* divide by 16 */
  
***************
*** 638,654 ****
  	"1:                                        \n"
  	"movaps    (%0), %%xmm1                    \n"
! 	"divps     %1, %%xmm1                      \n"
  	"movaps    %%xmm1, (%2)                    \n"
      
  	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
! 	"divps     %1, %%xmm2                      \n"
  	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
  	
  	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
! 	"divps     %1, %%xmm3                      \n"
  	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
  	
  	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
! 	"divps     %1, %%xmm4                      \n"
  	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
  	
--- 643,659 ----
  	"1:                                        \n"
  	"movaps    (%0), %%xmm1                    \n"
! 	"divps     %%xmm0, %%xmm1                  \n"
  	"movaps    %%xmm1, (%2)                    \n"
      
  	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
! 	"divps     %%xmm0, %%xmm2                  \n"
  	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
  	
  	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
! 	"divps     %%xmm0, %%xmm3                  \n"
  	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
  	
  	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
! 	"divps     %%xmm0, %%xmm4                  \n"
  	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
  	
***************
*** 658,663 ****
  	:
  	/* in, value, out, n */
! 	:"r"(w[1]),"x"(*(t_float*)w[2]),"r"(w[3]),"c"(w[4])
! 	:"%xmm1","%xmm2","%xmm3","%xmm4"
  	);
      return w+5;
--- 663,668 ----
  	:
  	/* in, value, out, n */
! 	:"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4])
! 	:"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4"
  	);
      return w+5;
***************
*** 710,714 ****
  	".set T_FLOAT,4                            \n"
  	
!         "shufps    $0, %1, %1                      \n"
  	"shrl      $4, %3                          \n" /* divide by 16 */
  
--- 715,720 ----
  	".set T_FLOAT,4                            \n"
  	
! 	"movss     (%1), %%xmm0                    \n"
! 	"shufps    $0, %%xmm0, %%xmm0              \n"
  	"shrl      $4, %3                          \n" /* divide by 16 */
  
***************
*** 716,732 ****
  	"1:                                        \n"
  	"movaps    (%0), %%xmm1                    \n"
! 	"minps     %1, %%xmm1                      \n"
  	"movaps    %%xmm1, (%2)                    \n"
      
  	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
! 	"minps     %1, %%xmm2                      \n"
  	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
  	
  	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
! 	"minps     %1, %%xmm3                      \n"
  	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
  	
  	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
! 	"minps     %1, %%xmm4                      \n"
  	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
  	
--- 722,738 ----
  	"1:                                        \n"
  	"movaps    (%0), %%xmm1                    \n"
! 	"minps     %%xmm0, %%xmm1                  \n"
  	"movaps    %%xmm1, (%2)                    \n"
      
  	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
! 	"minps     %%xmm0, %%xmm2                  \n"
  	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
  	
  	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
! 	"minps     %%xmm0, %%xmm3                  \n"
  	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
  	
  	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
! 	"minps     %%xmm0, %%xmm4                  \n"
  	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
  	
***************
*** 736,741 ****
  	:
  	/* in, value, out, n */
! 	:"r"(w[1]),"x"(*(t_float*)w[2]),"r"(w[3]),"c"(w[4])
! 	:"%xmm1","%xmm2","%xmm3","%xmm4"
  	);
      return w+5;
--- 742,747 ----
  	:
  	/* in, value, out, n */
! 	:"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4])
! 	:"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4"
  	);
      return w+5;
***************
*** 788,792 ****
  	".set T_FLOAT,4                            \n"
  	
!     "shufps    $0, %1, %1                      \n"
  	"shrl      $4, %3                          \n" /* divide by 16 */
  
--- 794,799 ----
  	".set T_FLOAT,4                            \n"
  	
! 	"movss     (%1), %%xmm0                    \n"
! 	"shufps    $0, %%xmm0, %%xmm0              \n"
  	"shrl      $4, %3                          \n" /* divide by 16 */
  
***************
*** 794,810 ****
  	"1:                                        \n"
  	"movaps    (%0), %%xmm1                    \n"
! 	"maxps     %1, %%xmm1                      \n"
  	"movaps    %%xmm1, (%2)                    \n"
      
  	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
! 	"maxps     %1, %%xmm2                      \n"
  	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
  	
  	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
! 	"maxps     %1, %%xmm3                      \n"
  	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
  	
  	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
! 	"maxps     %1, %%xmm4                      \n"
  	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
  	
--- 801,817 ----
  	"1:                                        \n"
  	"movaps    (%0), %%xmm1                    \n"
! 	"maxps     %%xmm0, %%xmm1                  \n"
  	"movaps    %%xmm1, (%2)                    \n"
      
  	"movaps    4*T_FLOAT(%0), %%xmm2           \n"
! 	"maxps     %%xmm0, %%xmm2                  \n"
  	"movaps    %%xmm2, 4*T_FLOAT(%2)           \n"
  	
  	"movaps    8*T_FLOAT(%0), %%xmm3           \n"
! 	"maxps     %%xmm0, %%xmm3                  \n"
  	"movaps    %%xmm3, 8*T_FLOAT(%2)           \n"
  	
  	"movaps    12*T_FLOAT(%0), %%xmm4          \n"
! 	"maxps     %%xmm0, %%xmm4                  \n"
  	"movaps    %%xmm4, 12*T_FLOAT(%2)          \n"
  	
***************
*** 814,819 ****
  	:
  	/* in, value, out, n */
! 	:"r"(w[1]),"x"(*(t_float*)w[2]),"r"(w[3]),"c"(w[4])
! 	:"%xmm1","%xmm2","%xmm3","%xmm4"
  	);
      return w+5;
--- 821,826 ----
  	:
  	/* in, value, out, n */
! 	:"r"(w[1]),"r"(w[2]),"r"(w[3]),"c"(w[4])
! 	:"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4"
  	);
      return w+5;
***************
*** 825,830 ****
  	".set T_FLOAT,4                            \n"
  	
!     "shufps    $0, %2, %2                      \n" /* lo */
!     "shufps    $0, %3, %3                      \n" /* hi */
  	"shrl      $4, %4                          \n" /* divide by 16 */
  
--- 832,840 ----
  	".set T_FLOAT,4                            \n"
  	
! 	"movss     (%2), %%xmm0                    \n" /* lo */
! 	"shufps    $0, %%xmm0, %%xmm0              \n"
! 	"movss     (%3), %%xmm1                    \n" /* hi */
! 	"shufps    $0, %%xmm1, %%xmm1              \n"
! 
  	"shrl      $4, %4                          \n" /* divide by 16 */
  
***************
*** 832,852 ****
  	"1:                                        \n"
  	"movaps    (%0), %%xmm2                    \n"
! 	"maxps     %2, %%xmm2                      \n"
! 	"minps     %3, %%xmm2                      \n"
  	"movaps    %%xmm2, (%1)                    \n"
      
  	"movaps    4*T_FLOAT(%0), %%xmm3           \n"
! 	"maxps     %2, %%xmm3                      \n"
! 	"minps     %3, %%xmm3                      \n"
  	"movaps    %%xmm3, 4*T_FLOAT(%1)           \n"
  	
  	"movaps    8*T_FLOAT(%0), %%xmm4           \n"
! 	"maxps     %2, %%xmm4                      \n"
! 	"minps     %3, %%xmm4                      \n"
  	"movaps    %%xmm4, 8*T_FLOAT(%1)           \n"
  
  	"movaps    12*T_FLOAT(%0), %%xmm5          \n"
! 	"maxps     %2, %%xmm5                      \n"
! 	"minps     %3, %%xmm5                      \n"
  	"movaps    %%xmm5, 12*T_FLOAT(%1)          \n"
  	
--- 842,862 ----
  	"1:                                        \n"
  	"movaps    (%0), %%xmm2                    \n"
! 	"maxps     %%xmm0, %%xmm2                  \n"
! 	"minps     %%xmm1, %%xmm2                  \n"
  	"movaps    %%xmm2, (%1)                    \n"
      
  	"movaps    4*T_FLOAT(%0), %%xmm3           \n"
! 	"maxps     %%xmm0, %%xmm3                  \n"
! 	"minps     %%xmm1, %%xmm3                  \n"
  	"movaps    %%xmm3, 4*T_FLOAT(%1)           \n"
  	
  	"movaps    8*T_FLOAT(%0), %%xmm4           \n"
! 	"maxps     %%xmm0, %%xmm4                  \n"
! 	"minps     %%xmm1, %%xmm4                  \n"
  	"movaps    %%xmm4, 8*T_FLOAT(%1)           \n"
  
  	"movaps    12*T_FLOAT(%0), %%xmm5          \n"
! 	"maxps     %%xmm0, %%xmm5                  \n"
! 	"minps     %%xmm1, %%xmm5                  \n"
  	"movaps    %%xmm5, 12*T_FLOAT(%1)          \n"
  	
***************
*** 856,861 ****
  	:
  	/* in, out, lo, hi, n */
! 	:"r"(w[1]),"r"(w[2]),"x"(*(t_float*)w[3]),"x"(*(t_float*)w[4]),"c"(w[5])
! 	:"%xmm2","%xmm3","%xmm4","%xmm5"
  	);
      return w+6;
--- 866,871 ----
  	:
  	/* in, out, lo, hi, n */
! 	:"r"(w[1]),"r"(w[2]),"r"(w[3]),"r"(w[4]),"c"(w[5])
! 	:"%xmm0","%xmm1","%xmm2","%xmm3","%xmm4","%xmm5"
  	);
      return w+6;
***************
*** 945,991 ****
  }
  
! float sumvec_simd(t_float* in, t_int n)
! {
! 	float ret;
!     asm(
! 		".set T_FLOAT,4                            \n"
  		
! 		"shrl      $4, %2                          \n" /* divide by 16 */
! 		"xorps     %%xmm4, %%xmm4                  \n" /* zero values */
! 		"xorps     %%xmm5, %%xmm5                  \n"
! 		"xorps     %%xmm6, %%xmm6                  \n"
! 		"xorps     %0, %0                          \n"
  
  		
! 		"1:                                        \n"
! 		"movaps    (%1), %%xmm0                    \n"
! 		"movaps    4*T_FLOAT(%1), %%xmm1           \n"
! 		"movaps    8*T_FLOAT(%1), %%xmm2           \n"
! 		"movaps    12*T_FLOAT(%1), %%xmm3          \n"
  
! 		"addps     %%xmm0,%%xmm4                   \n"
! 		"addps     %%xmm1,%%xmm4                   \n"
! 		"addps     %%xmm2,%%xmm4                   \n"
! 		"addps     %%xmm3,%%xmm4                   \n"
  
! 		"addl      $16*T_FLOAT,%1                  \n"
! 		"loop      1b                              \n"
  
! 		"movhlps   %%xmm4, %%xmm5                  \n"
! 		"movups   %%xmm4, %%xmm6                   \n"
! 		"movups   %%xmm5, %0                       \n"
! 		"shufps    $81, %%xmm6, %%xmm6             \n"
! 		"shufps    $81, %0, %0                     \n"
  
! 		"addss     %%xmm4, %%xmm5                  \n"
! 		"addss     %%xmm5, %%xmm6                  \n"
! 		"addss     %%xmm6, %0                      \n"
  
  
! 		:"=x"(ret)
! 		:"r"(in),"c"(n)
! 		:"%xmm0","%xmm1","%xmm2","%xmm3", "%xmm4","%xmm5","%xmm6");
! 	return ret;
! }
  		
  
--- 955,1002 ----
  }
  
! /* tb: is this needed? */
! /* float sumvec_simd(t_float* in, t_int n) */
! /* { */
! /* 	float ret; */
! /*     asm( */
! /* 		".set T_FLOAT,4                            \n" */
  		
! /* 		"shrl      $4, %2                          \n" /\* divide by 16 *\/ */
! /* 		"xorps     %%xmm4, %%xmm4                  \n" /\* zero values *\/ */
! /* 		"xorps     %%xmm5, %%xmm5                  \n" */
! /* 		"xorps     %%xmm6, %%xmm6                  \n" */
! /* 		"xorps     %0, %0                          \n" */
  
  		
! /* 		"1:                                        \n" */
! /* 		"movaps    (%1), %%xmm0                    \n" */
! /* 		"movaps    4*T_FLOAT(%1), %%xmm1           \n" */
! /* 		"movaps    8*T_FLOAT(%1), %%xmm2           \n" */
! /* 		"movaps    12*T_FLOAT(%1), %%xmm3          \n" */
  
! /* 		"addps     %%xmm0,%%xmm4                   \n" */
! /* 		"addps     %%xmm1,%%xmm4                   \n" */
! /* 		"addps     %%xmm2,%%xmm4                   \n" */
! /* 		"addps     %%xmm3,%%xmm4                   \n" */
  
! /* 		"addl      $16*T_FLOAT,%1                  \n" */
! /* 		"loop      1b                              \n" */
  
! /* 		"movhlps   %%xmm4, %%xmm5                  \n" */
! /* 		"movups   %%xmm4, %%xmm6                   \n" */
! /* 		"movups   %%xmm5, %0                       \n" */
! /* 		"shufps    $81, %%xmm6, %%xmm6             \n" */
! /* 		"shufps    $81, %0, %0                     \n" */
  
! /* 		"addss     %%xmm4, %%xmm5                  \n" */
! /* 		"addss     %%xmm5, %%xmm6                  \n" */
! /* 		"addss     %%xmm6, %0                      \n" */
  
  
! /* 		:"=x"(ret) */
! /* 		:"r"(in),"c"(n) */
! /* 		:"%xmm0","%xmm1","%xmm2","%xmm3", "%xmm4","%xmm5","%xmm6"); */
! /* 	return ret; */
! /* } */
  		
  
***************
*** 1001,1005 ****
  		"xorps     %%xmm3, %%xmm3                  \n"
  		"xorps     %%xmm4, %%xmm4                  \n"
! 		"xorps     %0, %0                          \n"
  		
  		"1:                                        \n"
--- 1012,1017 ----
  		"xorps     %%xmm3, %%xmm3                  \n"
  		"xorps     %%xmm4, %%xmm4                  \n"
! 		"xorps     %%xmm5, %%xmm5                  \n"
! 
  		
  		"1:                                        \n"
***************
*** 1042,1056 ****
  		"movhlps   %%xmm2, %%xmm3                  \n" /* unpack xmm0 */
  		"movups    %%xmm2, %%xmm4                  \n"
! 		"movups    %%xmm3, %0                      \n"
  		"shufps    $81, %%xmm4, %%xmm4             \n"
! 		"shufps    $81, %0, %0                     \n"
  
  		"addss     %%xmm2, %%xmm3                  \n"
  		"addss     %%xmm3, %%xmm4                  \n"
! 		"addss     %%xmm4, %0                      \n"
! 
  
! 		:"=x"(ret)
! 		:"r"(in),"c"(n), "r"(hp)
  		:"%xmm0","%xmm1","%xmm2","%xmm3", "%xmm4", "%xmm5");
  	return ret;
--- 1054,1069 ----
  		"movhlps   %%xmm2, %%xmm3                  \n" /* unpack xmm0 */
  		"movups    %%xmm2, %%xmm4                  \n"
! 		"movups    %%xmm3, %%xmm5                  \n"
  		"shufps    $81, %%xmm4, %%xmm4             \n"
! 		"shufps    $81, %%xmm5, %%xmm5             \n"
  
  		"addss     %%xmm2, %%xmm3                  \n"
  		"addss     %%xmm3, %%xmm4                  \n"
! 		"addss     %%xmm4, %%xmm5                  \n"
! 		
! 		"movss     %%xmm5, (%0)                    \n"
  
! 		:
! 		:"r"(&ret),"r"(in),"c"(n), "r"(hp)
  		:"%xmm0","%xmm1","%xmm2","%xmm3", "%xmm4", "%xmm5");
  	return ret;
***************
*** 1074,1101 ****
  		"shrl      $4, %2                          \n" /* divide by 16 */
  		"movaps    (2b), %%xmm0                    \n"
! 		"shufps    $0, %0, %0                      \n"
  
  		"1:                                        \n"
  		"movaps    (%1), %%xmm1                    \n"
  		"andps     %%xmm0, %%xmm1                  \n"
! 		"maxps     %%xmm1, %0                      \n"
  
  		"movaps    4*T_FLOAT(%1), %%xmm1           \n"
  		"andps     %%xmm0, %%xmm1                  \n"
! 		"maxps     %%xmm1, %0                      \n"
  
  		"movaps    8*T_FLOAT(%1), %%xmm1           \n"
  		"andps     %%xmm0, %%xmm1                  \n"
! 		"maxps     %%xmm1, %0                      \n"
  
  		"movaps    12*T_FLOAT(%1), %%xmm1          \n"
  		"andps     %%xmm0, %%xmm1                  \n"
! 		"maxps     %%xmm1, %0                      \n"
  
  		"addl      $16*T_FLOAT, %1                 \n"
  		"loop      1b                              \n"
  		
! 		"movhlps   %0, %%xmm2                      \n"
! 		"movaps    %0, %%xmm3                      \n"
  		"movaps    %%xmm2, %%xmm4                  \n"
  		"shufps    $81, %%xmm3, %%xmm3             \n"
--- 1087,1116 ----
  		"shrl      $4, %2                          \n" /* divide by 16 */
  		"movaps    (2b), %%xmm0                    \n"
! 
! 		"movss     (%0), %%xmm5                    \n" /* cur_max */
! 		"shufps    $0, %%xmm5, %%xmm5              \n"
  
  		"1:                                        \n"
  		"movaps    (%1), %%xmm1                    \n"
  		"andps     %%xmm0, %%xmm1                  \n"
! 		"maxps     %%xmm1, %%xmm5                  \n"
  
  		"movaps    4*T_FLOAT(%1), %%xmm1           \n"
  		"andps     %%xmm0, %%xmm1                  \n"
! 		"maxps     %%xmm1, %%xmm5                  \n"
  
  		"movaps    8*T_FLOAT(%1), %%xmm1           \n"
  		"andps     %%xmm0, %%xmm1                  \n"
! 		"maxps     %%xmm1, %%xmm5                  \n"
  
  		"movaps    12*T_FLOAT(%1), %%xmm1          \n"
  		"andps     %%xmm0, %%xmm1                  \n"
! 		"maxps     %%xmm1, %%xmm5                  \n"
  
  		"addl      $16*T_FLOAT, %1                 \n"
  		"loop      1b                              \n"
  		
! 		"movhlps   %%xmm5, %%xmm2                  \n"
! 		"movaps    %%xmm5, %%xmm3                  \n"
  		"movaps    %%xmm2, %%xmm4                  \n"
  		"shufps    $81, %%xmm3, %%xmm3             \n"
***************
*** 1104,1111 ****
  		"maxss     %%xmm2, %%xmm3                  \n"
  		"maxss     %%xmm3, %%xmm4                  \n"
! 		"maxss     %%xmm4, %0                      \n"
  
! 		:"=x"(cur_max)
! 		:"r"(vec),"c"(n), "0"(cur_max)
  		:"%xmm0","%xmm1","%xmm2","%xmm3", "%xmm4", "%xmm5");
  
--- 1119,1128 ----
  		"maxss     %%xmm2, %%xmm3                  \n"
  		"maxss     %%xmm3, %%xmm4                  \n"
! 		"maxss     %%xmm4, %%xmm5                  \n"
  
! 		"movss     %%xmm5, (%0)                    \n"
! 
! 		:
! 		:"r"(&cur_max), "r"(vec),"c"(n)
  		:"%xmm0","%xmm1","%xmm2","%xmm3", "%xmm4", "%xmm5");
  





More information about the Pd-cvs mailing list