[PATCH] pixman: C fast path for add_1000_1000 and over_n_1_8888

View: New views
5 Messages — Rating Filter:   Alert me  

[PATCH] pixman: C fast path for add_1000_1000 and over_n_1_8888

by Siarhei Siamashka :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

Hello,

These two fast path functions dealing with 1-bit data are needed
to improve performance of xfce4 terminal. Some other applications
may potentially benefit too.

The patches are also available here:
http://cgit.freedesktop.org/~siamashka/pixman/log/?h=1bit-for-master

--
Best regards,
Siarhei Siamashka

From 07d3c5924e6a1196ce1025461084b6400110cc8f Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@...>
Date: Fri, 23 Oct 2009 20:56:30 +0300
Subject: [PATCH] blitters-test updated to also randomly generate mask_x/mask_y

---
 test/blitters-test.c |   10 ++++++++--
 1 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/test/blitters-test.c b/test/blitters-test.c
index b8b6eba..bba6b1e 100644
--- a/test/blitters-test.c
+++ b/test/blitters-test.c
@@ -473,6 +473,7 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
     int src_stride, dst_stride;
     int src_x, src_y;
     int dst_x, dst_y;
+    int mask_x, mask_y;
     int w, h;
     int op;
     pixman_format_code_t src_fmt, dst_fmt, mask_fmt;
@@ -516,6 +517,8 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
 
     mask_img = NULL;
     mask_fmt = -1;
+    mask_x = 0;
+    mask_y = 0;
 
     if (lcg_rand_n (2))
     {
@@ -534,6 +537,9 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
 
  if (lcg_rand_n (2))
     pixman_image_set_component_alpha (mask_img, 1);
+
+ mask_x = lcg_rand_n (pixman_image_get_width (mask_img));
+ mask_y = lcg_rand_n (pixman_image_get_height (mask_img));
     }
 
     src_width = pixman_image_get_width (src_img);
@@ -568,7 +574,7 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
     }
 
     pixman_image_composite (op, src_img, mask_img, dst_img,
-    src_x, src_y, src_x, src_y, dst_x, dst_y, w, h);
+    src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h);
 
     if (verbose)
     {
@@ -641,7 +647,7 @@ main (int argc, char *argv[])
     /* Predefined value for running with all the fastpath functions
        disabled. It needs to be updated every time when changes are
        introduced to this program or behavior of pixman changes! */
-    if (crc == 0x481369DE)
+    if (crc == 0x1911E2C3)
     {
  printf ("blitters test passed\n");
     }
--
1.5.4.3


From 5f2a77a0339ff70e1384c866b4e404dfb00784eb Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@...>
Date: Mon, 26 Oct 2009 01:56:55 +0200
Subject: [PATCH] C fast path for add_1000_1000 and over_n_1_8888

These two fast path functions dealing with 1-bit data are needed
to improve performance of xfce4 terminal. Some other applications
may potentially benefit too.
---
 pixman/pixman-fast-path.c |  126 +++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 126 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index c053229..871a9f8 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -1025,6 +1025,129 @@ fast_composite_add_n_8_8 (pixman_implementation_t *imp,
     }
 }
 
+#ifdef WORDS_BIGENDIAN
+
+#define CREATE_BITMASK(n) (0x80000000 >> (n))
+#define UPDATE_BITMASK(n) ((n) >> 1)
+
+#else
+
+#define CREATE_BITMASK(n) (1 << (n))
+#define UPDATE_BITMASK(n) ((n) << 1)
+
+#endif
+
+#define TEST_BIT(p, n) \
+ (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
+#define SET_BIT(p, n) \
+ do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0);
+
+static void
+fast_composite_add_1000_1000 (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              pixman_image_t *         src_image,
+                              pixman_image_t *         mask_image,
+                              pixman_image_t *         dst_image,
+                              int32_t                  src_x,
+                              int32_t                  src_y,
+                              int32_t                  mask_x,
+                              int32_t                  mask_y,
+                              int32_t                  dest_x,
+                              int32_t                  dest_y,
+                              int32_t                  width,
+                              int32_t                  height)
+{
+    uint32_t     *dst_line, *dst;
+    uint32_t     *src_line, *src;
+    int           dst_stride, src_stride;
+    int32_t       w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
+                           src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dst_image, 0, dest_y, uint32_t,
+                           dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+    /*
+     * TODO: improve performance by processing uint32_t data instead
+     *       of individual bits
+     */
+    if (TEST_BIT (src, src_x + w))
+ SET_BIT (dst, dest_x + w);
+ }
+    }
+}
+
+static void
+fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              pixman_image_t *         src_image,
+                              pixman_image_t *         mask_image,
+                              pixman_image_t *         dst_image,
+                              int32_t                  src_x,
+                              int32_t                  src_y,
+                              int32_t                  mask_x,
+                              int32_t                  mask_y,
+                              int32_t                  dest_x,
+                              int32_t                  dest_y,
+                              int32_t                  width,
+                              int32_t                  height)
+{
+    uint32_t     src;
+    uint32_t    *dst, *dst_line;
+    uint32_t    *mask, *mask_line;
+    int          mask_stride, dst_stride;
+    uint32_t     bitcache, bitmask;
+    int32_t      w;
+
+    if (width <= 0)
+ return;
+
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+    if (src == 0)
+ return;
+
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t,
+                           dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+                           mask_stride, mask_line, 1);
+    mask_line += mask_x >> 5;
+
+    while (height--)
+    {
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+ w = width;
+
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (mask_x & 31);
+
+ while (w--)
+ {
+    if (bitmask == 0)
+    {
+ bitcache = *mask++;
+ bitmask = CREATE_BITMASK (0);
+    }
+    if (bitcache & bitmask)
+ *dst = over (src, *dst);
+    bitmask = UPDATE_BITMASK (bitmask);
+    dst++;
+ }
+    }
+}
+
 /*
  * Simple bitblt
  */
@@ -1107,6 +1230,8 @@ static const pixman_fast_path_t c_fast_paths[] =
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, fast_composite_over_n_8_8888, 0 },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, fast_composite_over_n_8_8888, 0 },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, fast_composite_over_n_8_8888, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a1,       PIXMAN_a8r8g8b8, fast_composite_over_n_1_8888, 0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a1,       PIXMAN_x8r8g8b8, fast_composite_over_n_1_8888, 0 },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, fast_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, fast_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   fast_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
@@ -1126,6 +1251,7 @@ static const pixman_fast_path_t c_fast_paths[] =
     { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_a8r8g8b8, fast_composite_add_8888_8888,   0 },
     { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_a8b8g8r8, fast_composite_add_8888_8888,   0 },
     { PIXMAN_OP_ADD, PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       fast_composite_add_8000_8000,   0 },
+    { PIXMAN_OP_ADD, PIXMAN_a1,        PIXMAN_null,     PIXMAN_a1,       fast_composite_add_1000_1000,   0 },
     { PIXMAN_OP_ADD, PIXMAN_solid,     PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, fast_composite_add_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
     { PIXMAN_OP_ADD, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8,       fast_composite_add_n_8_8,    0 },
     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_null,     PIXMAN_a8r8g8b8, fast_composite_solid_fill, 0 },
--
1.5.4.3


_______________________________________________
cairo mailing list
cairo@...
http://lists.cairographics.org/mailman/listinfo/cairo

Re: [PATCH] pixman: C fast path for add_1000_1000 and over_n_1_8888

by Soeren Sandmann-2 :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

Hi,

> These two fast path functions dealing with 1-bit data are needed
> to improve performance of xfce4 terminal. Some other applications
> may potentially benefit too.

A couple of minor comments:

Can we get the two fast paths enabled in two separate commits to
facilitate bisecting?

> +static void
> +fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
> +                              pixman_op_t              op,
> +
> + [...]
> +    if (bitcache & bitmask)
> + *dst = over (src, *dst);

It would likely be a big win to check whether the alpha channel of the
source is 0xFF (which it almost always is) and, if so, simply write out
the source. That would completely avoid reading the destination from
memory.

> +    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a1,       PIXMAN_a8r8g8b8, fast_composite_over_n_1_8888, 0 },
> +    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a1,       PIXMAN_x8r8g8b8, fast_composite_over_n_1_8888, 0 },

I think this one can be enabled for destinations of type x8b8g8r8 and
a8b8g8r8 as well, since pixman_image_get_solid() takes care of the
swapping.

Other than those things, it looks good to me.


Thanks,
Soren
_______________________________________________
cairo mailing list
cairo@...
http://lists.cairographics.org/mailman/listinfo/cairo

Re: [PATCH] pixman: C fast path for add_1000_1000 and over_n_1_8888

by Siarhei Siamashka :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

On Monday 26 October 2009, Soeren Sandmann wrote:

> Hi,
>
> > These two fast path functions dealing with 1-bit data are needed
> > to improve performance of xfce4 terminal. Some other applications
> > may potentially benefit too.
>
> A couple of minor comments:
>
> Can we get the two fast paths enabled in two separate commits to
> facilitate bisecting?
>
> > +static void
> > +fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
> > +                              pixman_op_t              op,
> > +
> > + [...]
> > +    if (bitcache & bitmask)
> > + *dst = over (src, *dst);
>
> It would likely be a big win to check whether the alpha channel of the
> source is 0xFF (which it almost always is) and, if so, simply write out
> the source. That would completely avoid reading the destination from
> memory.

A good catch.

> > +    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a1,       PIXMAN_a8r8g8b8, fast_composite_over_n_1_8888, 0 },
> > +    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a1,       PIXMAN_x8r8g8b8, fast_composite_over_n_1_8888, 0 },
>
> I think this one can be enabled for destinations of type x8b8g8r8 and
> a8b8g8r8 as well, since pixman_image_get_solid() takes care of the
> swapping.
>
> Other than those things, it looks good to me.

Thanks for the review.

Patches corrected according to your comments, tested on both big and little
endian systems and pushed to master.

--
Best regards,
Siarhei Siamashka


_______________________________________________
cairo mailing list
cairo@...
http://lists.cairographics.org/mailman/listinfo/cairo

signature.asc (196 bytes) Download Attachment

Re: [PATCH] pixman: C fast path for add_1000_1000 and over_n_1_8888

by Chris Wilson-11 :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

Hi Siarhei,
        I was just worrying about the absence of such paths from the
current set of cairo-traces. The only attempts I've made at capturing a
wide range of fonts and languages are the gnome-terminal and original
firefox traces. I suspect that these and my fontsets do not accurately
reflect your usage at all (and so my profiling is woefully myopic).
xfce4-terminal with a recent vte will use cairo for its rendering so
should generate a good trace, as will firefox and other gtk+
applications. Could you record some sample cairo-traces so that we can
see how much impact the addition of pixman fast paths makes to your
workflow, and so that we do not neglect you when developing the other
backends as well?

Thanks,
-ickle
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
cairo mailing list
cairo@...
http://lists.cairographics.org/mailman/listinfo/cairo

Re: [PATCH] pixman: C fast path for add_1000_1000 and over_n_1_8888

by Siarhei Siamashka :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

On Sunday 08 November 2009, Chris Wilson wrote:

> Hi Siarhei,
> I was just worrying about the absence of such paths from the
> current set of cairo-traces. The only attempts I've made at capturing a
> wide range of fonts and languages are the gnome-terminal and original
> firefox traces. I suspect that these and my fontsets do not accurately
> reflect your usage at all (and so my profiling is woefully myopic).
> xfce4-terminal with a recent vte will use cairo for its rendering so
> should generate a good trace, as will firefox and other gtk+
> applications. Could you record some sample cairo-traces so that we can
> see how much impact the addition of pixman fast paths makes to your
> workflow, and so that we do not neglect you when developing the other
> backends as well?

The use of these functions actually depends on the font. I noticed that
performance was quite bad when using bitmap fonts such as terminus:
http://www.is-vn.bg/hamster

It's probably not a very important case for most users, though I myself
prefer to use bitmap fonts in terminals. But it just shows exceptionally
bad performance here unless pixman has the needed fast path functions.

Here is the trace (scrolling 'man gcc' in xfce4-terminal with terminus font,
16bpp desktop, ARM cpu):
http://people.freedesktop.org/~siamashka/files/20091109/Terminal.30676.lzma

Actually, after upgrading cairo and some of the other libraries, I now get
slightly different behavior from what I saw before. This is a log from
oprofile for the Xorg process with current pixman git:

samples  %        image name               symbol name
13296    29.1528  libpixman-1.so.0.17.1    combine_over_u
6452     14.1466  libpixman-1.so.0.17.1    fetch_scanline_r5g6b5
5516     12.0944  libpixman-1.so.0.17.1    fetch_scanline_a1
2273      4.9838  libpixman-1.so.0.17.1    store_scanline_r5g6b5
1741      3.8173  libpixman-1.so.0.17.1    fast_composite_add_1000_1000
1718      3.7669  libc-2.9.so              memcpy
1176      2.5785  libpixman-1.so.0.17.1    arm_neon_fill
1114      2.4426  vmlinux                  __memzero
951       2.0852  libpixman-1.so.0.17.1    bits_image_fetch_solid_32
640       1.4033  libpixman-1.so.0.17.1    _pixman_run_fast_path
513       1.1248  libc-2.9.so              _int_malloc
447       0.9801  libpixman-1.so.0.17.1    _pixman_bits_image_setup_raw_accessors
377       0.8266  libc-2.9.so              malloc
350       0.7674  libfb.so                 image_from_pict
321       0.7038  libc-2.9.so              _int_free
307       0.6731  vmlinux                  __do_softirq
293       0.6424  Xorg                     miGlyphs
270       0.5920  Xorg                     CompositePicture
210       0.4604  libc-2.9.so              free
204       0.4473  libfb.so                 fbComposite

It clearly shows that 'over_n_1_0565' is now also badly needed for this use
case. Earlier only 'over_n_1_8888' was called and then the result was
converted to 0565 as an additional step (which was bad in itself, but was
a separate problem that seems to be solved now).

Still, the 'over_n_1_8888' fast path is also useful for 32bpp desktops, such
as the PS3, which I'm using for testing big-endian compatibility.

I'll post some more benchmarks for this 1-bit stuff later.

--
Best regards,
Siarhei Siamashka


_______________________________________________
cairo mailing list
cairo@...
http://lists.cairographics.org/mailman/listinfo/cairo

signature.asc (196 bytes) Download Attachment