Hi Vinnie,
I'm repeating myself here, but some time ago I wrote MMX- and SSE2-optimized
interpolation between two colors based on linear increments instead of
division or multiplication. I'm prepared to rewrite it for my new
library, so it may give you some hints.
Currently the best way to do color interpolation (I think) is to use
fixed point (ideally 9.23 - one bit for the sign, this is important) and
an incrementing mechanism (incrementing means that you only add a
precomputed step to some values for each pixel; there is no
per-pixel multiplication).
To emulate this:
... operator[n] ( m_c1.gradient(m_c2, double(v) * m_mult ); )
you need to store the precomputed gradient in an array, so you can write
... operator[n] ( return m_table[n]; )
I don't know if you are interested in this approach, but it's really fast.
(I was this in concepts, continuing:) ... )
My code looks like this:
// Packs one pixel held as 4x16.16 fixed point down to ARGB32 and stores it.
static inline void gradient_store1_argb32_SSE2(uint8_t* dst, __m128i pix)
{
  pix = _mm_packus_epi16(pix, pix);   // pix = [AxRxGxBxAxRxGxBx]
  // The arithmetic shift sign-extends channel bytes >= 0x80; the signed pack
  // below maps those small negative words back to the same byte value.
  pix = _mm_srai_epi16(pix, 8);       // pix = [0A0R0G0B0A0R0G0B]
  pix = _mm_packs_epi16(pix, pix);    // pix = [ARGBARGBARGBARGB]
  ((int *)dst)[0] = _mm_cvtsi128_si32(pix);
}

// Renders positions [x1, x2) of a linear c0 -> c1 gradient into dst as
// ARGB32 pixels, using SSE2.
//
//   dst - destination buffer; exactly (x2 - x1) 4-byte pixels are written.
//         It must be 4-byte aligned: the alignment loop advances in 4-byte
//         steps before switching to aligned 16-byte stores.
//   c0  - color written for every position before the gradient start (< 0)
//         and used as the interpolation start.
//   c1  - color written for every position at or after the gradient end.
//   w   - gradient width in pixels (>= 0).
//   x1  - first position to render; may be negative.
//   x2  - one past the last position to render (x1 <= x2).
//
// Each channel is kept as a 16.16 fixed point value (four channels per SSE2
// register) and advanced by a precomputed per-pixel increment, so the pixel
// loops contain no multiplication or division.
static void FOG_FASTCALL gradient_gradient_argb32_SSE2(uint8_t* dst,
  uint32_t c0, uint32_t c1, sysint_t w, sysint_t x1, sysint_t x2)
{
  uint8_t* dstCur = dst;

  // Sanity checks.
  FOG_ASSERT(w >= 0 && x1 <= x2);

  sysint_t xw = (x2 - x1);
  if (xw == 0) return;

  // Width is decreased by 1 to fit our gradient schema that first and last
  // points in interpolation are always equal to c0 and c1 respectively.
  if (w) w--;

  // Loop counter.
  sysint_t i;

  // Fill c0 before gradient start.
  if (x1 < 0)
  {
    i = fog_min((sysint_t)0, x2) - x1;

    xw -= i;
    x1 = 0;

    do { set4(dstCur, c0); dstCur += 4; } while (--i);
    if (xw == 0) return;
  }

  // Fill c0 to c1 using linear interpolation.
  if (x1 < w)
  {
    __m128i xmmz;
    __m128i xmm0, xmm1;
    __m128i xmm2, xmm3;
    __m128i xmm4, xmm5;

    FOG_DECLARE_ALIGNED_VARIABLE(sse2_t, tmpARGB, 16);

    xmmz = _mm_setzero_si128();
    xmm0 = _mm_cvtsi32_si128((int)c0);        // xmm0 = [            ARGB] c0
    xmm1 = _mm_cvtsi32_si128((int)c1);        // xmm1 = [            ARGB] c1

    xmm0 = _mm_unpacklo_epi8(xmm0, xmmz);     // xmm0 = [        0A0R0G0B] c0
    xmm1 = _mm_unpacklo_epi8(xmm1, xmmz);     // xmm1 = [        0A0R0G0B] c1

    xmm0 = _mm_unpacklo_epi16(xmmz, xmm0);    // xmm0 = [0A000R000G000B00] c0
    xmm1 = _mm_unpacklo_epi16(xmmz, xmm1);    // xmm1 = [0A000R000G000B00] c1

    xmm1 = _mm_sub_epi32(xmm1, xmm0);         // xmm1 = difference

    // Divide. w is non-zero here because 0 <= x1 < w.
    tmpARGB.m128i = xmm1;                     // copy xmm1 to temporary buffer
    tmpARGB.sd[0] /= (int32_t)w;
    tmpARGB.sd[1] /= (int32_t)w;
    tmpARGB.sd[2] /= (int32_t)w;
    tmpARGB.sd[3] /= (int32_t)w;
    xmm1 = tmpARGB.m128i;                     // xmm1 = increment

    // Offset interpolation to x1.
    tmpARGB.sd[0] *= (int32_t)x1;
    tmpARGB.sd[1] *= (int32_t)x1;
    tmpARGB.sd[2] *= (int32_t)x1;
    tmpARGB.sd[3] *= (int32_t)x1;
    xmm0 = _mm_add_epi32(xmm0, tmpARGB.m128i); // xmm0 = c0 + offset

    // Number of interpolated pixels: from x1 up to the gradient end or the
    // end of the span, whichever comes first. Always > 0 here (x1 < w and
    // x1 < x2). Advancing x1 keeps the c1 fill below in sync.
    i = fog_min(w, x2) - x1;
    x1 += i;

    // Align: single pixels until dstCur is 16-byte aligned or the run ends.
    while (i != 0 && ((sysuint_t)dstCur & 15) != 0)
    {
      gradient_store1_argb32_SSE2(dstCur, xmm0);
      xmm0 = _mm_add_epi32(xmm0, xmm1);       // xmm0 += xmm1
      dstCur += 4;
      i--;
    }

    // Loop: 4 pixels at a time.
    while (i >= 4)
    {
      xmm2 = xmm0;                            // xmm2 = [xAxxxRxxxGxxxBxx]
      xmm0 = _mm_add_epi32(xmm0, xmm1);       // xmm0 += xmm1
      xmm3 = xmm0;                            // xmm3 = [xAxxxRxxxGxxxBxx]
      xmm0 = _mm_add_epi32(xmm0, xmm1);       // xmm0 += xmm1
      xmm4 = xmm0;                            // xmm4 = [xAxxxRxxxGxxxBxx]
      xmm0 = _mm_add_epi32(xmm0, xmm1);       // xmm0 += xmm1
      xmm5 = xmm0;                            // xmm5 = [xAxxxRxxxGxxxBxx]
      xmm0 = _mm_add_epi32(xmm0, xmm1);       // xmm0 += xmm1

      xmm2 = _mm_packus_epi16(xmm2, xmm3);    // xmm2 = [AxRxGxBxAxRxGxBx]
      xmm4 = _mm_packus_epi16(xmm4, xmm5);    // xmm4 = [AxRxGxBxAxRxGxBx]

      xmm2 = _mm_srai_epi16(xmm2, 8);         // xmm2 = [0A0R0G0B0A0R0G0B]
      xmm4 = _mm_srai_epi16(xmm4, 8);         // xmm4 = [0A0R0G0B0A0R0G0B]

      xmm2 = _mm_packs_epi16(xmm2, xmm4);     // xmm2 = [ARGBARGBARGBARGB]

      _mm_store_si128((__m128i *)dstCur, xmm2);

      dstCur += 16;
      i -= 4;
    }

    // Tail: remaining 0-3 pixels.
    while (i)
    {
      gradient_store1_argb32_SSE2(dstCur, xmm0);
      xmm0 = _mm_add_epi32(xmm0, xmm1);       // xmm0 += xmm1
      dstCur += 4;
      i--;
    }

    if (x1 == x2) return;
  }

  // Fill c1 after gradient end. x1 < x2 is guaranteed at this point, so the
  // do/while never starts with a zero count.
  i = x2 - x1;
  do { set4(dstCur, c1); dstCur += 4; } while (--i);
}
Currently this code works with four 16.16 fixed-point values per
SSE2 register, and the main loop produces 4 pixels at a time. The full
source code is here:
http://code.google.com/p/fog/source/browse/trunk/Fog/Fog/Graphics/Raster_SSE2_p.cpp.
Cheers
- Petr
2009/4/28 Vinnie <
thevinn@...>:
>
> I found something, there's a per-pixel divide that can be replaced with a precalculated multiply. I got a little performance boost out of it since I am doing tons of gradients. Here's my version of the class. Note that my sources are based on the second to last release (i.e. the non GPLed one):
>
> template<class ColorT>
> struct gradient_linear_color
> {
> typedef ColorT color_type;
>
> gradient_linear_color() {}
> gradient_linear_color(const color_type& c1, const color_type& c2,
> unsigned size = 256) :
> m_c1(c1), m_c2(c2), m_size(size)
> // VFALCO 5/28/09
> ,m_mult(1/(double(size)-1))
> // VFALCO
> {}
>
> unsigned size() const { return m_size; }
> color_type operator [] (unsigned v) const
> {
> // VFALCO 5/28/09
> //return m_c1.gradient(m_c2, double(v) / double(m_size - 1));
> return m_c1.gradient(m_c2, double(v) * m_mult );
> // VFALCO
> }
>
> void colors(const color_type& c1, const color_type& c2, unsigned size = 256)
> {
> m_c1 = c1;
> m_c2 = c2;
> m_size = size;
> // VFALCO 5/28/09
> m_mult=1/(double(size)-1);
> // VFALCO
> }
>
> color_type m_c1;
> color_type m_c2;
> unsigned m_size;
> // VFALCO 5/28/09
> double m_mult;
> // VFALCO
> };
>
>
>
> ------------------------------------------------------------------------------
> Register Now & Save for Velocity, the Web Performance & Operations
> Conference from O'Reilly Media. Velocity features a full day of
> expert-led, hands-on workshops and two days of sessions from industry
> leaders in dedicated Performance & Operations tracks. Use code vel09scf
> and Save an extra 15% before 5/3.
http://p.sf.net/sfu/velocityconf> _______________________________________________
> Vector-agg-general mailing list
>
Vector-agg-general@...
>
https://lists.sourceforge.net/lists/listinfo/vector-agg-general>
------------------------------------------------------------------------------
The NEW KODAK i700 Series Scanners deliver under ANY circumstances! Your
production scanning environment may not be a perfect world - but thanks to
Kodak, there's a perfect scanner to get the job done! With the NEW KODAK i700
Series Scanner you'll get full speed at 300 dpi even with all image
processing features enabled.
http://p.sf.net/sfu/kodak-com_______________________________________________
Vector-agg-general mailing list
Vector-agg-general@...
https://lists.sourceforge.net/lists/listinfo/vector-agg-general