About asmjit experance over the weekend

View: New views
3 Messages — Rating Filter:   Alert me  

Parent Message unknown About asmjit experance over the weekend

by Mike T. :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

I tried out asmjit this weekend, and was impressed with it.  It is easy to use
and gives decent performance gains.

I tested it by creating a clear buffer call dynamically that was specific
for my viewport size.  I put the viewport parameters and pointers into the
JIT'd assembler, and saw a 10% boost in performance over my existing
assembly code.

Of course, when the viewport size changes I need to rebuild the routine -
but that is human controlled.

I imagine that creating transform functions which have the matrix embedded
in the function would also gain a fair amount.

The other ability I like is that I can detect the availability of
MMX/SSE/SSE2 at runtime and generate functions that are tailored to the CPU.

Some example code:

#if defined(_USE_JIT)
    //  Test of JIT function use.
    //
    //  On first use this emits a clear-back-buffer routine specialised for the
    //  current m_bbuffer / m_dwScanLen / m_cy values (they are baked into the
    //  generated code as immediates), stores it in m_JIT_clear_bb_32, and then
    //  calls it with the background colour.  The routine must be rebuilt
    //  whenever any of those values change.
    //
    //  NOTE(review): both loops are emitted in do/while form, so the generated
    //  code assumes m_cy > 0 and at least one store per scanline
    //  (m_dwScanLen >= 8 on the MMX path, >= 4 on the x86 path) - confirm
    //  against the callers.
    #pragma warning(push)                   // restore the caller's warning state on exit
    #pragma warning(disable : 4311 4312)
    using namespace AsmJit;

    //  JIT function - use fastcall, first 2 args in ecx, edx
    typedef void (__fastcall *JIT_clear_bb)(DWORD dwColor);
    if(m_JIT_clear_bb_32.empty())
        {
        // X86 assembler
        X86     a;

        // Prolog
        //  ebx and edi are both modified below and must be preserved for the
        //  caller; esi is never touched, so it does not need saving.
        a.push(ebp);
        a.mov(ebp, esp);
        a.push(ebx);
        a.push(edi);

        if(haveSSECPU())
            {
            //  MMX version is 20% faster than x86 version
            //  NOTE(review): the gate is haveSSECPU() but this path only emits
            //  MMX instructions - confirm the intended CPU check.
            Label   L_Loop_Next_Line;
            Label   L_Loop_Line;
            Label   L_Loop_Pixel;

            //  Code
            a.mov(eax,ecx);     // color in eax
            a.xor_(ecx,ecx);    // line counter
            //  mm1 = color replicated into both dwords of the qword
            a.movd(mm1,eax);
            a.movd(mm2,eax);
            a.psllq(mm1,imm(32));
            a.por(mm1,mm2);

            a.bind(&L_Loop_Line);
            a.mov(ebx,imm((SysUInt)(m_dwScanLen>>3)));  // qword stores per line
            a.mov(edi,imm((SysUInt)m_bbuffer));
            a.mov(edi,dword_ptr(edi,ecx,2));            // edi = dword at m_bbuffer + ecx*4

            a.bind(&L_Loop_Pixel);
            a.movq(dword_ptr(edi,0),mm1);
            a.dec(ebx);
            a.j(C_ZERO, &L_Loop_Next_Line);
            a.add(edi,8);
            a.jmp(&L_Loop_Pixel);

            a.bind(&L_Loop_Next_Line);
            if(m_dwScanLen%8)
                {
                //  The loop exits without advancing edi, so edi still points at
                //  the last qword written; the leftover dword sits right after it.
                a.mov(dword_ptr(edi,8),eax);
                }
            a.inc(ecx);
            a.cmp(ecx,imm(m_cy));
            a.j(C_NOT_EQUAL,&L_Loop_Line);

            a.emms();           // leave MMX state so the FPU is usable again
            }
        else
            {
            //  x86 version
            Label   L_Loop_Next_Line;
            Label   L_Loop_Line;
            Label   L_Loop_Pixel;

            //  Code
            a.mov(eax,ecx);     // color in eax
            a.xor_(ecx,ecx);    // line counter

            a.bind(&L_Loop_Line);
            a.mov(ebx,imm((SysUInt)(m_dwScanLen>>2)));  // dword stores per line
            a.mov(edi,imm((SysUInt)m_bbuffer));
            a.mov(edi,dword_ptr(edi,ecx,2));            // edi = dword at m_bbuffer + ecx*4

            a.bind(&L_Loop_Pixel);
            a.mov(dword_ptr(edi,0),eax);
            a.dec(ebx);
            a.j(C_ZERO, &L_Loop_Next_Line);
            a.add(edi,4);
            a.jmp(&L_Loop_Pixel);

            a.bind(&L_Loop_Next_Line);
            a.inc(ecx);
            a.cmp(ecx,imm(m_cy));
            a.j(C_NOT_EQUAL,&L_Loop_Line);
            }

        // Epilog - pops mirror the prolog pushes in reverse order
        a.pop(edi);
        a.pop(ebx);
        a.mov(esp, ebp);
        a.pop(ebp);

        a.ret(0);   // fastcall, no args

        // Allocate execution enabled memory
        m_JIT_clear_bb_32.build(a);
        }

    reinterpret_cast<JIT_clear_bb>(m_JIT_clear_bb_32.ptr())(m_dwBGColor);

    #pragma warning(pop)
#endif

I made a simple class to manage the JIT Data, still a work in progress:

//
//  Wrapper for JIT methods
//
struct JIT_METHOD
    {
    void            *m_pJIT;
    AsmJit::SysUInt m_cbJIT;

    JIT_METHOD() :
        m_pJIT(NULL),
        m_cbJIT(0x0)
        {
        }

    ~JIT_METHOD()
        {
        clear();
        }

    void clear()
        {
        if(m_pJIT)
            AsmJit::VM::free(m_pJIT, m_cbJIT);

        m_pJIT  = NULL;
        m_cbJIT = 0x0;
        }

    void build(AsmJit::X86 & a)
        {
        m_pJIT = AsmJit::VM::alloc(a.codeSize(), &m_cbJIT, /*canExecute*/
true);
        assert(m_pJIT);
        memcpy(m_pJIT, a.pData, a.codeSize());
        }

    void *append(AsmJit::X86 & a)
        {
        AsmJit::SysUInt cbOldJIT        = m_cbJIT;
        AsmJit::SysUInt cbAlign         = (m_cbJIT % 16);       //  align on
16 byte boundry
        AsmJit::SysUInt cbOldJITAlign   = m_cbJIT + cbAlign;
        void            *pOrg           = m_pJIT;
        byte            *pNew           =
(byte*)AsmJit::VM::alloc(a.codeSize()+cbOldJITAlign, &m_cbJIT,
/*canExecute*/ true);

        memcpy(pNew, pOrg, cbOldJIT);
        memset(pNew+cbOldJIT, 0, cbAlign);  // clear buffer bytes
        memcpy(pNew+cbOldJITAlign, a.pData, a.codeSize());

        if(m_pJIT)
            AsmJit::VM::free(m_pJIT, cbOldJIT);

        m_pJIT = (void*)pNew;

        return (void*)(pNew+cbOldJITAlign); // return appended method ptr -
note when this JIT is cleared, all the functions are cleared!
        }

    __inline bool empty()
        {
        return 0x0 == m_cbJIT;
        }

    //  Get JIT function pointer
    __inline void *ptr()
        {
        return m_pJIT;
        }

    //  Get JIT function pointer with offset
    __inline void *ptr(AsmJit::SysUInt offset)
        {
        assert(offset<m_cbJIT);
        return (void*)(((byte*)m_pJIT)+offset);
        }
    };



------------------------------------------------------------------------------
Create and Deploy Rich Internet Apps outside the browser with Adobe(R)AIR(TM)
software. With Adobe AIR, Ajax developers can use existing skills and code to
build responsive, highly engaging applications that combine the power of local
resources and data with the reach of the web. Download the Adobe AIR SDK and
Ajax docs to start building applications today-http://p.sf.net/sfu/adobe-com
_______________________________________________
Vector-agg-general mailing list
Vector-agg-general@...
https://lists.sourceforge.net/lists/listinfo/vector-agg-general

Re: About asmjit experance over the weekend

by Petr Kobalíček :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

Hi mike,

Thanks for your interest in the project. I have created the asmjit-dev
group on Google Groups for AsmJit development, so you can join and we can
discuss everything there (http://groups.google.com/group/asmjit-dev/).

I also have some notes about your code:

1)  Instead of push/pop esi use ebx (must be preserved, esi is not used)
2)  This code:
  a.movd(mm1,eax);
  a.movd(mm2,eax);
  a.psllq(mm1,imm(32));
  a.por(mm1,mm2);
can be rewritten as:
  a.movd(mm1, eax);
  a.punpckldq(mm1, mm1); // this is a good trick to copy the low 4 bytes
                         // into the high ones

So, new mailing list about AsmJit is asmjit-dev@....

Cheers
- Petr

2009/2/3 Mike Tajmajer <mike@...>:

> I tried out asmjit this weekend, and was impressed with it.  Easy to use and
> get decent performance gains.
>
> I tested it by creating a clear buffer call dynamically that was specific
> for my viewport size.  I put the viewport parameters and pointers into the
> JIT'd assembler, and saw a 10% boost in performance over my existing
> assembly code.
>
> Of course, when the viewport size changes I need to rebuild the routine -
> but that is human controlled.
>
> I imagine that creating transform functions which have the matrix embedded
> in the function would also gain a fair amount.
>
> The other ability I like is that I can detect the availability of
> BBX/SSE/SSE2 at runtime and generate functions that are tailored to the CPU.
>
> Some example code:
>
> #if defined(_USE_JIT)
>    //  Test of JIT function use
>    #pragma warning(disable : 4311 4312)
>    using namespace AsmJit;
>
>    //  JIT function - use fastcall, first 2 args in ecx, edx
>    typedef void (__fastcall *JIT_clear_bb)(DWORD dwColor);
>    if(m_JIT_clear_bb_32.empty())
>        {
>        // X86 assembler
>        X86     a;
>
>        // Prolog
>        a.push(ebp);
>        a.mov(ebp, esp);
>        a.push(esi);
>        a.push(edi);
>
>        if(haveSSECPU())
>            {
>            //  MMX version is 20% faster then x86 version
>            Label   L_Loop_Next_Line;
>            Label   L_Loop_Line;
>            Label   L_Loop_Pixel;
>
>            //  Code
>            a.mov(eax,ecx);     // color in eax
>            a.xor_(ecx,ecx);    // line counter
>            a.movd(mm1,eax);
>            a.movd(mm2,eax);
>            a.psllq(mm1,imm(32));
>            a.por(mm1,mm2);
>
>            a.bind(&L_Loop_Line);
>            a.mov(ebx,(SysUInt)(m_dwScanLen>>3));
>            a.mov(edi,imm((SysUInt)m_bbuffer));
>            a.mov(edi,dword_ptr(edi,ecx,2));
>
>            a.bind(&L_Loop_Pixel);
>            a.movq(dword_ptr(edi,0),mm1);
>            a.dec(ebx);
>            a.j(C_ZERO, &L_Loop_Next_Line);
>            a.add(edi,8);
>            a.jmp(&L_Loop_Pixel);
>
>            a.bind(&L_Loop_Next_Line);
>            if(m_dwScanLen%8)
>                {
>                a.mov(dword_ptr(edi,0),eax);
>                }
>            a.inc(ecx);
>            a.cmp(ecx,imm(m_cy));
>            a.j(C_NOT_EQUAL,&L_Loop_Line);
>
>            a.emms();
>            }
>        else
>            {
>            //  x86 version
>            Label   L_Loop_Next_Line;
>            Label   L_Loop_Line;
>            Label   L_Loop_Pixel;
>
>            //  Code
>            a.mov(eax,ecx);     // color in eax
>            a.xor_(ecx,ecx);    // line counter
>
>            a.bind(&L_Loop_Line);
>            a.mov(ebx,imm((SysUInt)(m_dwScanLen>>2)));
>            a.mov(edi,imm((SysUInt)m_bbuffer));
>            a.mov(edi,dword_ptr(edi,ecx,2));
>
>            a.bind(&L_Loop_Pixel);
>            a.mov(dword_ptr(edi,0),eax);
>            a.dec(ebx);
>            a.j(C_ZERO, &L_Loop_Next_Line);
>            a.add(edi,4);
>            a.jmp(&L_Loop_Pixel);
>
>            a.bind(&L_Loop_Next_Line);
>            a.inc(ecx);
>            a.cmp(ecx,imm(m_cy));
>            a.j(C_NOT_EQUAL,&L_Loop_Line);
>            }
>
>        // Epilog
>        a.pop(edi);
>        a.pop(esi);
>        a.mov(esp, ebp);
>        a.pop(ebp);
>
>        a.ret(0);   // fastcall, no args
>
>        // Allocate execution enabled memory
>        m_JIT_clear_bb_32.build(a);
>        }
>
>    reinterpret_cast<JIT_clear_bb>(m_JIT_clear_bb_32.ptr())(m_dwBGColor);
>
>    #pragma warning(default : 4311 4312)
> #endif
>
> I made a simple class to manage the JIT Data, still a work in progress:
>
> //
> //  Wrapper for JIT methods
> //
> struct JIT_METHOD
>    {
>    void            *m_pJIT;
>    AsmJit::SysUInt m_cbJIT;
>
>    JIT_METHOD() :
>        m_pJIT(NULL),
>        m_cbJIT(0x0)
>        {
>        }
>
>    ~JIT_METHOD()
>        {
>        clear();
>        }
>
>    void clear()
>        {
>        if(m_pJIT)
>            AsmJit::VM::free(m_pJIT, m_cbJIT);
>
>        m_pJIT  = NULL;
>        m_cbJIT = 0x0;
>        }
>
>    void build(AsmJit::X86 & a)
>        {
>        m_pJIT = AsmJit::VM::alloc(a.codeSize(), &m_cbJIT, /*canExecute*/
> true);
>        assert(m_pJIT);
>        memcpy(m_pJIT, a.pData, a.codeSize());
>        }
>
>    void *append(AsmJit::X86 & a)
>        {
>        AsmJit::SysUInt cbOldJIT        = m_cbJIT;
>        AsmJit::SysUInt cbAlign         = (m_cbJIT % 16);       //  align on
> 16 byte boundry
>        AsmJit::SysUInt cbOldJITAlign   = m_cbJIT + cbAlign;
>        void            *pOrg           = m_pJIT;
>        byte            *pNew           =
> (byte*)AsmJit::VM::alloc(a.codeSize()+cbOldJITAlign, &m_cbJIT,
> /*canExecute*/ true);
>
>        memcpy(pNew, pOrg, cbOldJIT);
>        memset(pNew+cbOldJIT, 0, cbAlign);  // clear buffer bytes
>        memcpy(pNew+cbOldJITAlign, a.pData, a.codeSize());
>
>        if(m_pJIT)
>            AsmJit::VM::free(m_pJIT, cbOldJIT);
>
>        m_pJIT = (void*)pNew;
>
>        return (void*)(pNew+cbOldJITAlign); // return appended method ptr -
> note when this JIT is cleared, all the functions are cleared!
>        }
>
>    __inline bool empty()
>        {
>        return 0x0 == m_cbJIT;
>        }
>
>    //  Get JIT function pointer
>    __inline void *ptr()
>        {
>        return m_pJIT;
>        }
>
>    //  Get JIT function pointer with offset
>    __inline void *ptr(AsmJit::SysUInt offset)
>        {
>        assert(offset<m_cbJIT);
>        return (void*)(((byte*)m_pJIT)+offset);
>        }
>    };
>
>
>
> ------------------------------------------------------------------------------
> Create and Deploy Rich Internet Apps outside the browser with Adobe(R)AIR(TM)
> software. With Adobe AIR, Ajax developers can use existing skills and code to
> build responsive, highly engaging applications that combine the power of local
> resources and data with the reach of the web. Download the Adobe AIR SDK and
> Ajax docs to start building applications today-http://p.sf.net/sfu/adobe-com
> _______________________________________________
> Vector-agg-general mailing list
> Vector-agg-general@...
> https://lists.sourceforge.net/lists/listinfo/vector-agg-general
>

------------------------------------------------------------------------------
Create and Deploy Rich Internet Apps outside the browser with Adobe(R)AIR(TM)
software. With Adobe AIR, Ajax developers can use existing skills and code to
build responsive, highly engaging applications that combine the power of local
resources and data with the reach of the web. Download the Adobe AIR SDK and
Ajax docs to start building applications today-http://p.sf.net/sfu/adobe-com
_______________________________________________
Vector-agg-general mailing list
Vector-agg-general@...
https://lists.sourceforge.net/lists/listinfo/vector-agg-general

Re: About asmjit experance over the weekend

by Mike T. :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

Hi Petr,

I did some timing with rdtsc and using punpckldq is actually slower than
shifting and or'ing.

Thanks for pointing out esi!  I really don't need any Prolog for this
function :-)

>
> Hi mike,
>
> thanks for your interest about project. I have created
> asmjit-dev group in google for AsmJit development, so you can
> join and we can discuss everything there
> (http://groups.google.com/group/asmjit-dev/).
>
> I have also some notes about your code:
>
> 1)  Instead of push/pop esi use ebx (must be preserved, esi
> is not used)
> 2)  This code:
>   a.movd(mm1,eax);
>   a.movd(mm2,eax);
>   a.psllq(mm1,imm(32));
>   a.por(mm1,mm2);
> can be rewritten as:
>   a.movd(mm1, eax);
>   a.punpckldq(mm1, mm1); // this is good thick to unpack low
> 4 bytes to higher ones
>
> So, new mailing list about AsmJit is asmjit-dev@....
>
> Cheers
> - Petr



------------------------------------------------------------------------------
Create and Deploy Rich Internet Apps outside the browser with Adobe(R)AIR(TM)
software. With Adobe AIR, Ajax developers can use existing skills and code to
build responsive, highly engaging applications that combine the power of local
resources and data with the reach of the web. Download the Adobe AIR SDK and
Ajax docs to start building applications today-http://p.sf.net/sfu/adobe-com
_______________________________________________
Vector-agg-general mailing list
Vector-agg-general@...
https://lists.sourceforge.net/lists/listinfo/vector-agg-general