|
View:
New views
4 Messages
—
Rating Filter:
Alert me
|
|
|
|
|
|
Re: A small optimization to gradient_linear_color
Hi Vinnie,
I'm repeating myself here, but some time ago I wrote MMX- and SSE2-optimized interpolation between two colors based on linear increments instead of division or multiplication. Currently I'm prepared to rewrite it for my new library, so it can give you some hints. Currently the best way to do color interpolation (I think) is to use fixed point (ideally 9.23 - one bit for sign, this is important) and an incrementing mechanism (incrementing means that you only increment some values for each pixel; there is no multiplication). To emulate this: ... operator[n] ( m_c1.gradient(m_c2, double(v) * m_mult ); ) you need to store the precomputed gradient in an array, so you can write ... operator[n] ( return m_table[n]; ) I don't know if you are interested in this approach, but it's really fast. (I was this in concepts, continuing:) ... ) My code looks like this: static void FOG_FASTCALL gradient_gradient_argb32_SSE2(uint8_t* dst, uint32_t c0, uint32_t c1, sysint_t w, sysint_t x1, sysint_t x2) { uint8_t* dstCur = dst; // Sanity checks. FOG_ASSERT(w >= 0 && x1 <= x2); sysint_t xw = (x2 - x1); if (xw == 0) return; // Width is decreased by 1 to fit our gradient schema that first and last // points in interpolation are always equal to c0 and c1 recpectively. if (w) w--; // 0op counter. sysint_t i; // Fill c0 before gradient start. if (x1 < 0) { i = fog_min((sysint_t)0, x2) - x1; xw -= i; x1 = 0; do { set4(dstCur, c0); dstCur += 4; } while (--i); if (xw == 0) return; } // Fill c0 to c1 using linear interpolation. 
if (x1 < w) { __m128i xmmz; __m128i xmm0, xmm1; __m128i xmm2, xmm3; __m128i xmm4, xmm5; FOG_DECLARE_ALIGNED_VARIABLE(sse2_t, tmpARGB, 16); xmmz = _mm_setzero_si128(); xmm0 = _mm_cvtsi32_si128((int)c0); // xmm0 = [ ARGB] c0 xmm1 = _mm_cvtsi32_si128((int)c1); // xmm1 = [ ARGB] c1 xmm0 = _mm_unpacklo_epi8(xmm0, xmmz); // xmm0 = [ 0A0R0G0B] c0 xmm1 = _mm_unpacklo_epi8(xmm1, xmmz); // xmm1 = [ 0A0R0G0B] c1 xmm0 = _mm_unpacklo_epi16(xmmz, xmm0); // xmm0 = [0A000R000G000B00] c0 xmm1 = _mm_unpacklo_epi16(xmmz, xmm1); // xmm1 = [0A000R000G000B00] c1 xmm1 = _mm_sub_epi32(xmm1, xmm0); // xmm1 = difference // Divide. tmpARGB.m128i = xmm1; // copy xmm1 to temporary buffer tmpARGB.sd[0] /= (int32_t)w; tmpARGB.sd[1] /= (int32_t)w; tmpARGB.sd[2] /= (int32_t)w; tmpARGB.sd[3] /= (int32_t)w; xmm1 = tmpARGB.m128i; // xmm1 = increment // Offset interpolation to x1. tmpARGB.sd[0] *= (int32_t)x1; tmpARGB.sd[1] *= (int32_t)x1; tmpARGB.sd[2] *= (int32_t)x1; tmpARGB.sd[3] *= (int32_t)x1; xmm0 = _mm_add_epi32(xmm0, tmpARGB.m128i); // xmm0 = c0 + offset // Align. while (((sysuint_t)dstCur & 15) != 0) { xmm2 = xmm0; // xmm2 = [xAxxxRxxxGxxxBxx] xmm0 = _mm_add_epi32(xmm0, xmm1); // xmm0 += xmm1 xmm2 = _mm_packus_epi16(xmm2, xmm2); // xmm2 = [AxRxGxBxAxRxGxBx] xmm2 = _mm_srai_epi16(xmm2, 8); // xmm2 = [0A0R0G0B0A0R0G0B] xmm2 = _mm_packs_epi16(xmm2, xmm2); // xmm2 = [ARGBARGBARGBARGB] ((int *)dstCur)[0] = _mm_cvtsi128_si32(xmm2); dstCur += 4; if (--i == 0) return; } // 0op: 4 pixels at time. 
while ((i -= 4) >= 0) { xmm2 = xmm0; // xmm2 = [xAxxxRxxxGxxxBxx] xmm0 = _mm_add_epi32(xmm0, xmm1); // xmm0 += xmm1 xmm3 = xmm0; // xmm3 = [xAxxxRxxxGxxxBxx] xmm0 = _mm_add_epi32(xmm0, xmm1); // xmm0 += xmm1 xmm4 = xmm0; // xmm4 = [xAxxxRxxxGxxxBxx] xmm0 = _mm_add_epi32(xmm0, xmm1); // xmm0 += xmm1 xmm5 = xmm0; // xmm5 = [xAxxxRxxxGxxxBxx] xmm0 = _mm_add_epi32(xmm0, xmm1); // xmm0 += xmm1 xmm2 = _mm_packus_epi16(xmm2, xmm3); // xmm2 = [AxRxGxBxAxRxGxBx] xmm4 = _mm_packus_epi16(xmm4, xmm5); // xmm4 = [AxRxGxBxAxRxGxBx] xmm2 = _mm_srai_epi16(xmm2, 8); // xmm2 = [0A0R0G0B0A0R0G0B] xmm4 = _mm_srai_epi16(xmm4, 8); // xmm4 = [0A0R0G0B0A0R0G0B] xmm2 = _mm_packs_epi16(xmm2, xmm4); // xmm2 = [ARGBARGBARGBARGB] _mm_store_si128((__m128i *)dstCur, xmm2); dstCur += 16; i += 4; } i += 4; // Tail. while (i) { xmm2 = xmm0; // xmm2 = [xAxxxRxxxGxxxBxx] xmm0 = _mm_add_epi32(xmm0, xmm1); // xmm0 += xmm1 xmm2 = _mm_packus_epi16(xmm2, xmm2); // xmm2 = [AxRxGxBxAxRxGxBx] xmm2 = _mm_srai_epi16(xmm2, 8); // xmm2 = [0A0R0G0B0A0R0G0B] xmm2 = _mm_packs_epi16(xmm2, xmm2); // xmm2 = [ARGBARGBARGBARGB] ((int *)dstCur)[0] = _mm_cvtsi128_si32(xmm2); dstCur += 4; i--; } if (x1 == x2) return; } // Fill c1 after gradient end. i = x2 - x1; do { set4(dstCur, c1); dstCur += 4; } while (--i); } Currently this code calculates with 4x16.16 fixed point variables per one SSE2 register and main loop produces 4 pixels per time. Full source code is here: http://code.google.com/p/fog/source/browse/trunk/Fog/Fog/Graphics/Raster_SSE2_p.cpp . Cheers - Petr 2009/4/28 Vinnie <thevinn@...>: > > I found something, there's a per-pixel divide that can be replaced with a precalculated multiply. I got a little performance boost out of it since I am doing tons of gradients. Here's my version of the class. Note that my sources are based on the second to last release (i.e. 
the non GPLed one): > > template<class ColorT> > struct gradient_linear_color > { > typedef ColorT color_type; > > gradient_linear_color() {} > gradient_linear_color(const color_type& c1, const color_type& c2, > unsigned size = 256) : > m_c1(c1), m_c2(c2), m_size(size) > // VFALCO 5/28/09 > ,m_mult(1/(double(size)-1)) > // VFALCO > {} > > unsigned size() const { return m_size; } > color_type operator [] (unsigned v) const > { > // VFALCO 5/28/09 > //return m_c1.gradient(m_c2, double(v) / double(m_size - 1)); > return m_c1.gradient(m_c2, double(v) * m_mult ); > // VFALCO > } > > void colors(const color_type& c1, const color_type& c2, unsigned size = 256) > { > m_c1 = c1; > m_c2 = c2; > m_size = size; > // VFALCO 5/28/09 > m_mult=1/(double(size)-1); > // VFALCO > } > > color_type m_c1; > color_type m_c2; > unsigned m_size; > // VFALCO 5/28/09 > double m_mult; > // VFALCO > }; > > > > ------------------------------------------------------------------------------ > Register Now & Save for Velocity, the Web Performance & Operations > Conference from O'Reilly Media. Velocity features a full day of > expert-led, hands-on workshops and two days of sessions from industry > leaders in dedicated Performance & Operations tracks. Use code vel09scf > and Save an extra 15% before 5/3. http://p.sf.net/sfu/velocityconf > _______________________________________________ > Vector-agg-general mailing list > Vector-agg-general@... > https://lists.sourceforge.net/lists/listinfo/vector-agg-general > ------------------------------------------------------------------------------ The NEW KODAK i700 Series Scanners deliver under ANY circumstances! Your production scanning environment may not be a perfect world - but thanks to Kodak, there's a perfect scanner to get the job done! With the NEW KODAK i700 Series Scanner you'll get full speed at 300 dpi even with all image processing features enabled. 
http://p.sf.net/sfu/kodak-com _______________________________________________ Vector-agg-general mailing list Vector-agg-general@... https://lists.sourceforge.net/lists/listinfo/vector-agg-general |
|
|
|
|
|
Re: A small optimization to gradient_linear_color
Hi Vinnie,
If you are using only 1-pixel-wide lines, I think that Wu's algorithm can be really fast compared to antigrain. Remember that antigrain produces scanline and span objects, so it first needs to serialize data that is rendered later. The SSE2 code I posted here contains bugs; I recommend extracting it from the SVN repository I also posted (all of them are fixed there, I hope). > agg::rasterizer_cells_aa<>::line() > > This function looks complex and it seems to do a lot of work. What does it do? What is the meaning of the parameter list (x1, y1, x2, y2) ? I don't know exactly, but I think that it processes one line from the list of lines that was generated from curves (the line is the main rasterizer object). Debug builds are slow, but Release builds should be fast (faster than cairo or GDI+). You can also try switching the compiler to use SSE2; this should also help. - Petr 2009/5/9 Vinnie <thevinn@...>: > >> From: Petr Kobalíček <kobalicek.petr@...> >> I'm repeating here, but some time ago I wrote MMX and >> SSE2 optimized interpolation between two colors based on >> linear increments > > Those are some nice optimizations! The problem is that my app is spending by far, the majority of its time in: > > agg::rasterizer_cells_aa<>::line() > > This function looks complex and it seems to do a lot of work. What does it do? What is the meaning of the parameter list (x1, y1, x2, y2) ? > > ------------------------------------------------------------------------------ > The NEW KODAK i700 Series Scanners deliver under ANY circumstances! Your > production scanning environment may not be a perfect world - but thanks to > Kodak, there's a perfect scanner to get the job done! With the NEW KODAK i700 > Series Scanner you'll get full speed at 300 dpi even with all image > processing features enabled. http://p.sf.net/sfu/kodak-com > _______________________________________________ > Vector-agg-general mailing list > Vector-agg-general@... 
> https://lists.sourceforge.net/lists/listinfo/vector-agg-general > ------------------------------------------------------------------------------ The NEW KODAK i700 Series Scanners deliver under ANY circumstances! Your production scanning environment may not be a perfect world - but thanks to Kodak, there's a perfect scanner to get the job done! With the NEW KODAK i700 Series Scanner you'll get full speed at 300 dpi even with all image processing features enabled. http://p.sf.net/sfu/kodak-com _______________________________________________ Vector-agg-general mailing list Vector-agg-general@... https://lists.sourceforge.net/lists/listinfo/vector-agg-general |
| Free embeddable forum powered by Nabble | Forum Help |