Hi Lorne,
you don't need BlitJit for this :) I wrote SSE2 code for the Fog library
some time ago, and I think it can be easily extracted and wrapped
for your needs.
Look at this file (NOTE: IT'S BIG):
http://code.google.com/p/fog/source/browse/trunk/Fog/Fog/Graphics/Raster/Raster_SSE2.cpp
Some extracted functions are here:
// Broadcasts 16-bit lane 3 of the low 64 bits of src0 into all four low
// lanes of dst0. The high 64 bits are copied from src0 unchanged.
//
// NOTE(review): presumably lane 3 holds the alpha of a single pixel that
// was unpacked to 16 bits per channel -- confirm against the callers.
static FOG_INLINE void pix_expand_alpha_1x1W(
  __m128i& dst0, const __m128i& src0)
{
  // Shuffle immediate that selects lane 3 for every output lane.
  enum { kBroadcastLane3 = _MM_SHUFFLE(3, 3, 3, 3) };

  dst0 = _mm_shufflelo_epi16(src0, kBroadcastLane3);
}
// Broadcasts lane 3 of each 64-bit half of src0 across the four 16-bit
// lanes of that same half and stores the result in dst0.
//
// NOTE(review): presumably each half is one pixel unpacked to 16 bits per
// channel with alpha in lane 3 -- confirm against the callers.
static FOG_INLINE void pix_expand_alpha_1x2W(
  __m128i& dst0, const __m128i& src0)
{
  // Shuffle immediate that selects lane 3 for every output lane.
  enum { kBroadcastLane3 = _MM_SHUFFLE(3, 3, 3, 3) };

  // Low half first; the high half passes through this step untouched.
  const __m128i lowExpanded = _mm_shufflelo_epi16(src0, kBroadcastLane3);

  // Then the high half; the already-expanded low half is preserved.
  dst0 = _mm_shufflehi_epi16(lowExpanded, kBroadcastLane3);
}
// Two-register variant of the lane-3 broadcast: for each of src0 and src1,
// lane 3 of every 64-bit half is replicated across the four 16-bit lanes
// of that half. Results go to dst0 and dst1 respectively.
//
// NOTE(review): presumably each register carries two pixels unpacked to
// 16 bits per channel with alpha in lane 3 -- confirm against the callers.
static FOG_INLINE void pix_expand_alpha_2x2W(
  __m128i& dst0, const __m128i& src0,
  __m128i& dst1, const __m128i& src1)
{
  // Shuffle immediate that selects lane 3 for every output lane.
  enum { kBroadcastLane3 = _MM_SHUFFLE(3, 3, 3, 3) };

  // The low and high shuffles touch disjoint halves, so each register can
  // be finished in a single expression.
  dst0 = _mm_shufflehi_epi16(
    _mm_shufflelo_epi16(src0, kBroadcastLane3), kBroadcastLane3);
  dst1 = _mm_shufflehi_epi16(
    _mm_shufflelo_epi16(src1, kBroadcastLane3), kBroadcastLane3);
}
// Per unsigned 16-bit lane computes
//   dst0 = ((data0 * alpha0 + bias) * scale) >> 16
// where bias is Mask0080008000800080 and scale is Mask0101010101010101
// (both defined elsewhere).
//
// NOTE(review): going by their names the constants are 0x0080 and 0x0101
// per lane, which would make this the usual rounded (x * a) / 255 for
// 8-bit values held in 16-bit lanes -- confirm against their definitions.
static FOG_INLINE void pix_multiply_1x1W(
  __m128i& dst0, const __m128i& data0, const __m128i& alpha0)
{
  // Low 16 bits of each lane-wise product.
  const __m128i product = _mm_mullo_epi16(data0, alpha0);

  // Unsigned saturating add of the bias.
  const __m128i biased = _mm_adds_epu16(product, Mask0080008000800080);

  // High 16 bits of (biased * scale), i.e. the ">> 16" step.
  dst0 = _mm_mulhi_epu16(biased, Mask0101010101010101);
}
// Two-register variant of the lane-wise multiply. For each pair
// (data0, alpha0) and (data1, alpha1), per unsigned 16-bit lane:
//   dst = ((data * alpha + bias) * scale) >> 16
// where bias is Mask0080008000800080 and scale is Mask0101010101010101
// (both defined elsewhere).
//
// NOTE(review): going by their names the constants are 0x0080 and 0x0101
// per lane (rounded division by 255) -- confirm against their definitions.
static FOG_INLINE void pix_multiply_2x2W(
  __m128i& dst0, const __m128i& data0, const __m128i& alpha0,
  __m128i& dst1, const __m128i& data1, const __m128i& alpha1)
{
  // All inputs are consumed into locals before either output is written.
  const __m128i biased0 = _mm_adds_epu16(
    _mm_mullo_epi16(data0, alpha0), Mask0080008000800080);
  const __m128i biased1 = _mm_adds_epu16(
    _mm_mullo_epi16(data1, alpha1), Mask0080008000800080);

  // Keep the high 16 bits of (biased * scale) for each lane.
  dst0 = _mm_mulhi_epu16(biased0, Mask0101010101010101);
  dst1 = _mm_mulhi_epu16(biased1, Mask0101010101010101);
}
// ORs Mask00FF000000000000 (defined elsewhere) into dst0 in place.
//
// NOTE(review): going by the constant's name this forces the 16-bit lane
// that holds alpha to 0x00FF and leaves the colour lanes alone -- confirm
// against the mask definition.
static FOG_INLINE void pix_fill_alpha_1x1W(
  __m128i& dst0)
{
  const __m128i current = dst0;
  dst0 = _mm_or_si128(current, Mask00FF000000000000);
}
// ORs Mask00FF000000000000 (defined elsewhere) into both dst0 and dst1,
// in place.
//
// NOTE(review): going by the constant's name this forces the alpha lane(s)
// to 0x00FF; whether the mask covers both 64-bit halves of a register is
// not visible here -- confirm against the mask definition.
static FOG_INLINE void pix_fill_alpha_2x2W(
  __m128i& dst0,
  __m128i& dst1)
{
  const __m128i first = dst0;
  dst0 = _mm_or_si128(first, Mask00FF000000000000);

  const __m128i second = dst1;
  dst1 = _mm_or_si128(second, Mask00FF000000000000);
}
// Premultiplies src0 by its own alpha and writes the result to dst0.
//
// Steps:
//   1. build a per-lane multiplier by expanding the alpha of src0,
//   2. run pix_fill_alpha_1x1W on that multiplier (presumably so the alpha
//      lane is scaled by 0xFF and comes out unchanged -- confirm),
//   3. multiply src0 by the multiplier lane-wise.
static FOG_INLINE void pix_premultiply_1x1W(
  __m128i& dst0, const __m128i& src0)
{
  __m128i multiplier;

  pix_expand_alpha_1x1W(multiplier, src0);
  pix_fill_alpha_1x1W(multiplier);

  pix_multiply_1x1W(dst0, src0, multiplier);
}
// Two-register variant of premultiply: src0 and src1 are each multiplied
// by their own expanded alpha; the results are written to dst0 and dst1.
//
// Steps:
//   1. build per-lane multipliers by expanding the alpha of both sources,
//   2. run pix_fill_alpha_2x2W on the multipliers (presumably so the alpha
//      lanes are scaled by 0xFF and come out unchanged -- confirm),
//   3. multiply each source by its multiplier lane-wise.
static FOG_INLINE void pix_premultiply_2x2W(
  __m128i& dst0, const __m128i& src0,
  __m128i& dst1, const __m128i& src1)
{
  __m128i multiplier0, multiplier1;

  pix_expand_alpha_2x2W(multiplier0, src0, multiplier1, src1);
  pix_fill_alpha_2x2W(multiplier0, multiplier1);

  pix_multiply_2x2W(dst0, src0, multiplier0, dst1, src1, multiplier1);
}
The code can be simplified to a single function; I split it into several
because they are shared by many other functions and blitters, which keeps
the code clean. Yesterday I started blogging about the Fog library, so
anyone interested can go here:
http://twopixels.blogspot.com/
Cheers
- Petr
2009/6/25 Lorne Laliberte <lorne@...>:
> Hello!
>
> Has anyone ever attempted to speed up multiplier_rgba::premultiply()?
>
> I'm wondering whether it would see any possible benefit from MMX or SSE2.
>
> (Petr, is this something that you've covered with blitjit yet?)
>
> Lorne Laliberte
> Senior Software Developer, Indigo Rose Software
>
> ------------------------------------------------------------------------------
> _______________________________________________
> Vector-agg-general mailing list
> Vector-agg-general@...
> https://lists.sourceforge.net/lists/listinfo/vector-agg-general
>
------------------------------------------------------------------------------
_______________________________________________
Vector-agg-general mailing list
Vector-agg-general@...
https://lists.sourceforge.net/lists/listinfo/vector-agg-general