« Return to Thread: [PATCH] simplify greedy2frame deinterlacer a bit

[PATCH] simplify greedy2frame deinterlacer a bit

by Roland Scheidegger :: Rate this Message:

| View in Thread

This patch cuts roughly 10% of the instructions (with SSE); the results
should be identical.
I'm not sure why it was that complicated in the first place. The
simplification is possible because the code gave a score of 1 to each of
the top and bottom comparisons and a score of 2 to the middle one, and
weaved when all scores added together were more than 2. This is equivalent
to weaving when (cmp(m) AND (cmp(b) OR cmp(t))), which is a much better
match for the available hardware instructions. This also reduces the number
of constant loads a lot, and the patch moves some memory loads up a bit,
which can never hurt.
It doesn't make much of a performance difference (the code is memory
bandwidth bound), but it looks nicer nonetheless.

Roland

[greedy2frame.diff]

diff -r 23f62fa05c72 src/post/deinterlace/plugins/greedy2frame_template.c
--- a/src/post/deinterlace/plugins/greedy2frame_template.c Wed Dec 21 21:23:21 2011 +0100
+++ b/src/post/deinterlace/plugins/greedy2frame_template.c Tue Apr 10 01:57:17 2012 +0200
@@ -85,18 +85,9 @@
 */
 
 
-/* debugging feature */
-/* output the value of mm4 at this point which is pink where we will weave */
-/* and green were we are going to bob */
-/* uncomment next line to see this */
-/* #define CHECK_BOBWEAVE */
-
 #if !defined(MASKS_DEFINED)
 #define MASKS_DEFINED
-  static const int64_t __attribute__((__used__)) YMask    = 0x00ff00ff00ff00ffll;
   static const int64_t __attribute__((__used__)) Mask = 0x7f7f7f7f7f7f7f7fll;
-  static const int64_t __attribute__((__used__)) DwordOne = 0x0000000100000001ll;
-  static const int64_t __attribute__((__used__)) DwordTwo = 0x0000000200000002ll;
   static int64_t qwGreedyTwoFrameThreshold;
 #endif
 
@@ -183,14 +174,16 @@
           asm volatile(
        /* Figure out what to do with the scanline above the one we just copied.
         * See above for a description of the algorithm.
- */
+        * weave if (weave(M) AND (weave(T) OR weave(B)))
+        */
             ".align 8 \n\t"
-            "movq %4, %%mm6 \n\t"
-
             "movq %0, %%mm1 \n\t"     // T1
             "movq %1, %%mm0 \n\t"     // M1
             "movq %2, %%mm3 \n\t"     // B1
             "movq %3, %%mm2 \n\t"     // M0
+
+            "movq %4, %%mm6 \n\t"     // Mask
+
             : /* no output */
             : "m" (*T1), "m" (*M1),
               "m" (*B1), "m" (*M0), "m" (Mask) );
@@ -223,78 +216,70 @@
   * movement
   */
 #if defined(IS_SSE)
-            "movq    %%mm0, %%mm4 \n\t"
-            "movq    %%mm2, %%mm5 \n\t"
-            "psubusb %%mm2, %%mm4 \n\t"
-            "psubusb %%mm0, %%mm5 \n\t"
-            "por     %%mm5, %%mm4 \n\t"
-            "psrlw   $1, %%mm4 \n\t"
-            "pavgb   %%mm2, %%mm0 \n\t"
-            "pand    %%mm6, %%mm4 \n\t"
+            "movq    %%mm0, %%mm4 \n\t"
+            "movq    %%mm2, %%mm5 \n\t"
+            "psubusb %%mm2, %%mm4 \n\t"
+            "psubusb %%mm0, %%mm5 \n\t"
+            "por     %%mm5, %%mm4 \n\t"
+            "pavgb   %%mm2, %%mm0 \n\t"
 #elif defined(IS_3DNOW)
-            "movq    %%mm0, %%mm4 \n\t"
-            "movq    %%mm2, %%mm5 \n\t"
-            "psubusb %%mm2, %%mm4 \n\t"
-            "psubusb %%mm0, %%mm5 \n\t"
-            "por     %%mm5, %%mm4 \n\t"
-            "psrlw   $1, %%mm4 \n\t"
-            "pavgusb %%mm2, %%mm0 \n\t"
-            "pand    %%mm6, %%mm4 \n\t"
+            "movq    %%mm0, %%mm4 \n\t"
+            "movq    %%mm2, %%mm5 \n\t"
+            "psubusb %%mm2, %%mm4 \n\t"
+            "psubusb %%mm0, %%mm5 \n\t"
+            "por     %%mm5, %%mm4 \n\t"
+            "pavgusb %%mm2, %%mm0 \n\t"
 #else
-            "movq    %%mm0, %%mm4 \n\t"
-            "psubusb %%mm2, %%mm4 \n\t"
-            "psubusb %%mm0, %%mm2 \n\t"
-            "por     %%mm2, %%mm4 \n\t"
-            "psrlw   $1, %%mm4 \n\t"
-            "pand    %%mm6, %%mm4 \n\t"
+            "movq    %%mm0, %%mm4 \n\t"
+            "psubusb %%mm2, %%mm4 \n\t"
+            "psubusb %%mm0, %%mm2 \n\t"
+            "por     %%mm2, %%mm4 \n\t"
 #endif
 
-            /* if |M1-M0| > Threshold we want dword worth of twos */
-            "pcmpgtb %3, %%mm4 \n\t"
-            "pand    %4, %%mm4 \n\t" /* get rid of sign bit */
-            "pcmpgtd %5, %%mm4 \n\t" /* do we want to bob */
-            "pandn   %6, %%mm4 \n\t"
-
             "movq    %1, %%mm2 \n\t" /* mm2 = T0 */
 
+            /* if |M1-M0| > Threshold we want 0 else dword minus one */
+            "psrlw   $1, %%mm4 \n\t"
+            "pand    %%mm6, %%mm4 \n\t"
+            "pxor    %%mm5, %%mm5 \n\t" // zero
+            "pcmpgtb %3, %%mm4 \n\t"
+            "pcmpeqd %%mm5, %%mm4 \n\t" /* do we want to bob */
+
             /* calculate |T1-T0| put result in mm5 */
-            "movq    %%mm2, %%mm5 \n\t"
-            "psubusb %%mm1, %%mm5 \n\t"
-            "psubusb %%mm2, %%mm1 \n\t"
-            "por     %%mm1, %%mm5 \n\t"
+            "movq    %%mm2, %%mm5 \n\t"
+            "psubusb %%mm1, %%mm5 \n\t"
+            "psubusb %%mm2, %%mm1 \n\t"
+            "por     %%mm1, %%mm5 \n\t"
+
+            "movq    %2, %%mm2 \n\t" /* mm2 = B0 */
+
+            /* if |T1-T0| > Threshold we want 0 else dword minus one */
             "psrlw   $1, %%mm5 \n\t"
-            "pand    %%mm6, %%mm5 \n\t"
+            "pand    %%mm6, %%mm5 \n\t"
+            "pxor    %%mm1, %%mm1 \n\t" // zero
+            "pcmpgtb %3, %%mm5 \n\t"
+            "pcmpeqd %%mm1, %%mm5 \n\t"
 
-            /* if |T1-T0| > Threshold we want dword worth of ones */
-            "pcmpgtb %3, %%mm5 \n\t"
-            "pand    %%mm6, %%mm5 \n\t" /* get rid of sign bit */
+            /* calculate |B1-B0| put result in mm1 */
+            "movq    %%mm2, %%mm1 \n\t"
+            "psubusb %%mm3, %%mm1 \n\t"
+            "psubusb %%mm2, %%mm3 \n\t"
+            "por     %%mm3, %%mm1 \n\t"
 
-            "pcmpgtd %5, %%mm5 \n\t"
-            "pandn   %5, %%mm5 \n\t"
-            "paddd   %%mm5, %%mm4 \n\t"
+            /* if |B1-B0| > Threshold we want 0 else dword minus one */
+            "psrlw   $1, %%mm1 \n\t"
+            "pand    %%mm6, %%mm1 \n\t"
+            "pxor    %%mm3, %%mm3 \n\t" // zero
+            "pcmpgtb %3, %%mm1 \n\t"
+            "pcmpeqd %%mm3, %%mm1 \n\t"
 
-            "movq    %2, %%mm2 \n\t"     /* B0 */
-
-            /* calculate |B1-B0| put result in mm5 */
-            "movq    %%mm2, %%mm5 \n\t"
-            "psubusb %%mm3, %%mm5 \n\t"
-            "psubusb %%mm2, %%mm3 \n\t"
-            "por     %%mm3, %%mm5 \n\t"
-            "psrlw   $1, %%mm5 \n\t"
-            "pand    %%mm6, %%mm5 \n\t"
-
-            /* if |B1-B0| > Threshold we want dword worth of ones */
-            "pcmpgtb %3, %%mm5 \n\t"
-            "pand    %%mm6, %%mm5 \n\t"     /* get rid of any sign bit */
-            "pcmpgtd %5, %%mm5 \n\t"
-            "pandn   %5, %%mm5 \n\t"
-            "paddd   %%mm5, %%mm4 \n\t"
-
-            "pcmpgtd %6, %%mm4 \n\t"
+            "por     %%mm1, %%mm5 \n\t"
+            "pand    %%mm5, %%mm4 \n\t"
 
 /* debugging feature
  * output the value of mm4 at this point which is pink where we will weave
- * and green were we are going to bob                                      */
+ * and green where we are going to bob
+ */
 #ifdef CHECK_BOBWEAVE
 #ifdef IS_SSE
             "movntq %%mm4, %0 \n\t"
@@ -303,11 +288,10 @@
 #endif
 #else
 
-            "movq    %%mm4, %%mm5 \n\t"
-         /* mm4 now is 1 where we want to weave and 0 where we want to bob */
-            "pand    %%mm0, %%mm4 \n\t"
-            "pandn   %%mm7, %%mm5 \n\t"
-            "por     %%mm5, %%mm4 \n\t"
+            /* mm4 now is 1 where we want to weave and 0 where we want to bob */
+            "pand    %%mm4, %%mm0 \n\t"
+            "pandn   %%mm7, %%mm4 \n\t"
+            "por     %%mm0, %%mm4 \n\t"
 #ifdef IS_SSE
             "movntq %%mm4, %0 \n\t"
 #else
@@ -316,7 +300,7 @@
 #endif
 
           : "=m" (*Dest2)
-          : "m" (*T0), "m" (*B0), "m" (qwGreedyTwoFrameThreshold), "m" (Mask), "m" (DwordOne), "m" (DwordTwo) );
+          : "m" (*T0), "m" (*B0), "m" (qwGreedyTwoFrameThreshold) );
 
           /* Advance to the next set of pixels. */
           T1 += 8;


------------------------------------------------------------------------------
Better than sec? Nothing is better than sec when it comes to
monitoring Big Data applications. Try Boundary one-second
resolution app monitoring today. Free.
http://p.sf.net/sfu/Boundary-dev2dev
_______________________________________________
xine-devel mailing list
xine-devel@...
https://lists.sourceforge.net/lists/listinfo/xine-devel

 « Return to Thread: [PATCH] simplify greedy2frame deinterlacer a bit