neon

How do Android programs make use of NEON SIMD?

I've been reading up a little on CPU features and stumbled upon NEON.

From what I've read, it looks like NEON requires code written specifically for it. Is that completely true, or can CPUs with this feature still find ways to utilize it and speed up media processing in some applications even when there is no NEON-specific code?


Source: (StackOverflow)

SIMD optimization of OpenCV(cvtColor) using ARM NEON intrinsics

I'm working on a SIMD optimization of BGR to grayscale conversion which is equivalent to OpenCV's cvtColor() function. There is an Intel SSE version of this function and I'm referring to it. (What I'm doing is basically converting SSE code to NEON code.)

I've almost finished writing the code, and can compile it with g++, but I can't get the proper output. Does anyone have any ideas what the error could be?

What I'm getting (incorrect):

my output

What I should be getting:

properly converted output

Here's my code:

#include <opencv/cv.hpp>
#include <opencv/highgui.h>
#include <arm_neon.h>
//#include <iostream>

using namespace std;
//using namespace cv;

#define int8x16_to_8x8x2(v) ((int8x8x2_t) { vget_low_s8(v), vget_high_s8(v) })

void cvtBGR2GrayNEON(cv::Mat& src, cv::Mat& dest)
{
  const int size = src.size().area()*src.channels();
  uchar* s = src.ptr<uchar>(0);
  uchar* d = dest.ptr<uchar>(0);

  const int8x16_t mask1 = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
  const int8x16_t smask1 = {6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15};
  const int8x16_t ssmask1 = {11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10};

  const int8x16_t mask2 = {0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13};
  const int8x16_t ssmask2 = {0,1,2,3,4,11,12,13,14,15,5,6,7,8,9,10};

  const int8x16_t bmask1 = {255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0};
  const int8x16_t bmask2 = {255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0};
  const int8x16_t bmask3 = {255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0};
  const int8x16_t bmask4 = {255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0};

  const int shift = 8;
  const int amp = 1<<shift;

  const int16_t _R_ = (int16_t)(amp*0.299);
  const int16_t _G_ = (int16_t)(amp*0.587);
  const int16_t _B_ = (int16_t)(amp*0.114);
  const int16x8_t R = vdupq_n_s16(_R_);
  const int16x8_t G = vdupq_n_s16(_G_);
  const int16x8_t B = vdupq_n_s16(_B_);
  const int8x16_t zero = vdupq_n_s8(0);

  for(int i = 0; i < size; i += 48)
    {
      int8x16_t a = vld1q_s8((int8_t *) s + i);
      int8x16_t b = vld1q_s8((int8_t *) s + i + 16);
      int8x16_t c = vld1q_s8((int8_t *) s + i + 32);

      a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a),vget_low_s8(mask1)),vtbl2_s8(int8x16_to_8x8x2(a),vget_high_s8(mask1)));
      b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(mask2)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(mask2)));
      c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(mask2)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(mask2)));

      //BBBBBB
      const int8x16_t aaaa = vbslq_s8(c, vbslq_s8(b, a, bmask1), bmask2);

      a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(a), vget_high_s8(smask1)));
      b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(smask1)));
      c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(smask1)));

      //GGGGGG
      const int8x16_t bbbb = vbslq_s8(c, vbslq_s8(b, a, bmask3), bmask2);

      a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a), vget_low_s8(ssmask1)), vtbl2_s8(int8x16_to_8x8x2(a), vget_high_s8(ssmask1)));
      c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(ssmask1)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(ssmask1)));
      b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(ssmask2)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(ssmask2)));

      //RRRRRR
      const int8x16_t cccc = vbslq_s8(c, vbslq_s8(b, a, bmask3), bmask4);

      /*
      int8x8x2_t a1 = vzip_s8(vget_high_s8(aaaa), vget_high_s8(zero));
      int8x8x2_t a2 = vzip_s8(vget_low_s8(aaaa), vget_low_s8(zero));
      */

      int8x16_t a1 = aaaa;
      int8x16_t a2 = zero;
      int8x16x2_t temp1 =  vzipq_s8(a1, a2);
      a1 = temp1.val[0];
      a2 = temp1.val[1];
      int16x8_t aa1 = vmulq_s16((int16x8_t)a2, B);
      int16x8_t aa2 = vmulq_s16((int16x8_t)a1, B);

      int8x16_t b1 = bbbb;
      int8x16_t b2 = zero;
      int8x16x2_t temp2 =  vzipq_s8(b1, b2);
      b1 = temp2.val[0];
      b2 = temp2.val[1];
      int16x8_t bb1 = vmulq_s16((int16x8_t)b2, G);
      int16x8_t bb2 = vmulq_s16((int16x8_t)b1, G);

      int8x16_t c1 = cccc;
      int8x16_t c2 = zero;
      int8x16x2_t temp3 =  vzipq_s8(c1, c2);
      c1 = temp3.val[0];
      c2 = temp3.val[1];
      int16x8_t cc1 = vmulq_s16((int16x8_t)c2, R);
      int16x8_t cc2 = vmulq_s16((int16x8_t)c1, R);

      aa1 = vaddq_s16(aa1, bb1);
      aa1 = vaddq_s16(aa1, cc1);
      aa2 = vaddq_s16(aa2, bb2);
      aa2 = vaddq_s16(aa2, cc2);

      const int shift1 = 8;
      aa1 = vshrq_n_s16(aa1, shift1);
      aa2 = vshrq_n_s16(aa2, shift1);

      uint8x8_t aaa1 = vqmovun_s16(aa1);
      uint8x8_t aaa2 = vqmovun_s16(aa2);

      uint8x16_t result = vcombine_u8(aaa1, aaa2);

      vst1q_u8((uint8_t *)(d), result);

      d+=16;
    }    
}

int main() 
{
  cv::Mat src = cv::imread("Lenna.bmp");
  cv::Mat dest(src.rows, src.cols, CV_8UC1);

  cvtBGR2GrayNEON(src, dest);

  cv::imwrite("grey.jpg", dest);

  return 0;
}

Here is equivalent SSE code (from here):

void cvtBGR2GraySSEShort(Mat& src, Mat& dest)
{
    const int size = src.size().area()*src.channels();
    uchar* s = src.ptr<uchar>(0);
    uchar* d = dest.ptr<uchar>(0);

    //data structure
    //BGR BGR BGR BGR BGR B
    //GR BGR BGR BGR BGR BG
    //R BGR BGR BGR BGR BGR
    //shuffle to BBBBBBGGGGGRRRRR
    const __m128i mask1 = _mm_setr_epi8(0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14);
    const __m128i smask1 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15);
    const __m128i ssmask1 = _mm_setr_epi8(11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10);

    //shuffle to GGGGGGBBBBBRRRRR
    const __m128i mask2 = _mm_setr_epi8(0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13);
    //const __m128i smask2 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15);same as smask1
    const __m128i ssmask2 = _mm_setr_epi8(0,1,2,3,4,11,12,13,14,15,5,6,7,8,9,10);

    //shuffle to RRRRRRGGGGGBBBBB
    //__m128i mask3 = _mm_setr_epi8(0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13);//same as mask2
    //const __m128i smask3 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,6,7,8,9,10);//same as smask1
    //const __m128i ssmask3 = _mm_setr_epi8(11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10);//same as ssmask1

    //blend mask
    const __m128i bmask1 = _mm_setr_epi8
        (255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0);

    const __m128i bmask2 = _mm_setr_epi8
        (255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0);

    const __m128i bmask3 = _mm_setr_epi8
        (255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0);

    const __m128i bmask4 = _mm_setr_epi8
        (255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0);  

    const int shift = 8;
    const int amp = 1<<shift;
    const int _R_=(int)(amp*0.299);
    const int _G_=(int)(amp*0.587);
    const int _B_=(int)(amp*0.114);
    const __m128i R = _mm_set1_epi16(_R_);
    const __m128i G = _mm_set1_epi16(_G_);
    const __m128i B = _mm_set1_epi16(_B_);
    const __m128i zero = _mm_setzero_si128();   

    for(int i=0;i<size;i+=48)
    {
        __m128i a = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i)),mask1);
        __m128i b = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i+16)),mask2);
        __m128i c = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i+32)),mask2);
        const __m128i aaaa = _mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask1),bmask2);

        a = _mm_shuffle_epi8(a,smask1);
        b = _mm_shuffle_epi8(b,smask1);
        c = _mm_shuffle_epi8(c,smask1);
        const __m128i bbbb =_mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask3),bmask2);

        a = _mm_shuffle_epi8(a,ssmask1);
        c = _mm_shuffle_epi8(c,ssmask1);
        b = _mm_shuffle_epi8(b,ssmask2);
        const __m128i cccc =_mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask3),bmask4);

        __m128i a1 = _mm_unpackhi_epi8(aaaa,zero);
        __m128i a2 = _mm_unpacklo_epi8(aaaa,zero);
        a1 = _mm_mullo_epi16(a1,B);
        a2 = _mm_mullo_epi16(a2,B);
        __m128i b1 = _mm_unpackhi_epi8(bbbb,zero);
        __m128i b2 = _mm_unpacklo_epi8(bbbb,zero);
        b1 = _mm_mullo_epi16(b1,G);
        b2 = _mm_mullo_epi16(b2,G);

        __m128i c1 = _mm_unpackhi_epi8(cccc,zero);
        __m128i c2 = _mm_unpacklo_epi8(cccc,zero);
        c1 = _mm_mullo_epi16(c1,R);
        c2 = _mm_mullo_epi16(c2,R);

        a1 = _mm_add_epi16(a1,b1);
        a1 = _mm_add_epi16(a1,c1);
        a2 = _mm_add_epi16(a2,b2);
        a2 = _mm_add_epi16(a2,c2);

        a1 = _mm_srli_epi16(a1,8);
        a2 = _mm_srli_epi16(a2,8);

        a = _mm_packus_epi16(a1,a2);

        _mm_stream_si128((__m128i*)(d),a);
        d+=16;
    } 
}
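
As a point of comparison, the same BGR-to-grayscale weighting can be written far more compactly with NEON de-interleaving loads (vld3), which sidesteps the table-lookup shuffles entirely. Here is a minimal sketch of that approach; it is not a fix for the code above, the function name is made up, and it assumes the pixel count is a multiple of 8:

#include <arm_neon.h>
#include <stdint.h>

void bgr2gray_neon_sketch(const uint8_t *src, uint8_t *dst, int pixels)
{
  // Q8 fixed-point weights: gray = 0.114*B + 0.587*G + 0.299*R (weights sum to 256)
  const uint8x8_t wB = vdup_n_u8(29);   // round(0.114 * 256)
  const uint8x8_t wG = vdup_n_u8(150);  // round(0.587 * 256)
  const uint8x8_t wR = vdup_n_u8(77);   // round(0.299 * 256)

  for (int i = 0; i + 8 <= pixels; i += 8)
    {
      uint8x8x3_t bgr = vld3_u8(src + 3 * i);      // de-interleave 8 BGR pixels
      uint16x8_t acc = vmull_u8(bgr.val[0], wB);   // B * wB
      acc = vmlal_u8(acc, bgr.val[1], wG);         // + G * wG
      acc = vmlal_u8(acc, bgr.val[2], wR);         // + R * wR
      vst1_u8(dst + i, vshrn_n_u16(acc, 8));       // >> 8 and narrow to u8
    }
}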

Source: (StackOverflow)


Summing 3 lanes in a NEON float32x4_t

I'm vectorizing an inner loop with ARM NEON intrinsics (llvm, iOS). I'm generally using float32x4_ts. My computation finishes with the need to sum three of the four floats in this vector.

I can drop back to C floats at this point and use vst1q_f32 to get the four values out and add up the three I need. But I figure it may be more efficient if there's a way to do it directly on the vector in an instruction or two and then just grab a single-lane result, but I couldn't figure out any clear path to doing this.

I'm new to NEON programming, and the existing "documentation" is pretty horrific. Any ideas? Thanks!
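
For what it's worth, here is a minimal sketch of one way to do it with ARMv7 NEON intrinsics (the helper name is made up): zero the unwanted lane, then fold the vector with pairwise adds and extract lane 0.

#include <arm_neon.h>

static inline float sum3_lanes(float32x4_t v)
{
    v = vsetq_lane_f32(0.0f, v, 3);                 // clear lane 3
    float32x2_t s = vadd_f32(vget_low_f32(v),       // (v0+v2, v1+v3)
                             vget_high_f32(v));
    s = vpadd_f32(s, s);                            // (v0+v1+v2+v3, ...)
    return vget_lane_f32(s, 0);
}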


Source: (StackOverflow)

Compacting data in buffer from 16 bit per element to 12 bits

I'm wondering if there is any chance to improve the performance of this compaction. The idea is to saturate values higher than 4095 and place each value 12 bits apart in a new contiguous buffer, like this:

Concept:

Convert:

Input buffer: [0.0][0.1][0.2] ... [0.15] | [1.0][1.1][1.2] ... [1.15] | [2.0][2.1][2.2] ... [2.15] etc ...

to:

Output buffer: [0.0][0.1][0.2] ... [0.11] | [1.0][1.1][1.2] ... [1.11] | [2.0][2.1][2.2] ... [2.11] etc ...

The input and output buffers are defined as:

uint16_t input[76800] (its size is 153600 bytes)

uint24_t output[38400] (its size is 115200 bytes)

So I have reduced the data size by 1/4. This computation costs ~1 ms on a Cortex-A9 at 792 MHz with 2 cores. I have to perform this "compression" because I transfer about 18 MB/s over Ethernet, which gives me huge overhead. I've tested various compression algorithms such as Snappy and LZ4, and none of them came close to the ~1 ms achieved with saturation and bit shifting.

I've written the following code:

#pragma pack(push, 1)
typedef struct {
        union {
                struct {
                        uint32_t value0_24x1:24;
                };
                struct {
                        uint32_t value0_12x1:12;
                        uint32_t value1_12x1:12;
                };
                struct {
                        uint32_t value0_8x1:8;
                        uint32_t value1_8x1:8;
                        uint32_t value3_8x1:8;
                };
        };
} uint24_t;
#pragma pack(pop)


static inline uint32_t __attribute__((always_inline)) saturate(uint32_t value)
{
        register uint32_t result;

        asm volatile("usat %0, %2, %1 \n\t"                     \
                : [result] "=r" (result)                        \
                : [value] "r" (value), [saturate] "I" (12)      \
                :                                               \
                );

        return result;
}

void __attribute__((noinline, used)) compact(const uint16_t *input, uint24_t *output, uint32_t elements)
{
#if 0
        /* More readable, but slower */
        for (uint32_t i = 0; i < elements; ++i) {
                output->value0_12x1 = saturate(*input++);
                (output++)->value1_12x1 = saturate(*input++);
        }
#else
        /* Alternative - less readable but faster */
        for (uint32_t i = 0; i < elements; ++i, input += 2)
                (output++)->value0_24x1 = saturate(*input) | ((uint32_t)saturate(*(input+1))) << 12;
#endif
}

static uint16_t buffer_in[76800] = {0};
static uint24_t buffer_out[38400] = {0};

int main()
{
    /* Dividing by 2 because we process two input values in a single loop inside compact() */
    compact(buffer_in, buffer_out, sizeof(buffer_in) / sizeof(buffer_in[0]) / 2);

    return 0;
}

And its assembly:

00008664 <compact>:
    8664:   e92d4010    push    {r4, lr}
    8668:   e3a03000    mov r3, #0
    866c:   ea00000c    b   86a4 <compact+0x40>
    8670:   e1d040b0    ldrh    r4, [r0]
    8674:   e6ec4014    usat    r4, #12, r4
    8678:   e1d0c0b2    ldrh    ip, [r0, #2]
    867c:   e6ecc01c    usat    ip, #12, ip
    8680:   e184c60c    orr ip, r4, ip, lsl #12
    8684:   e2833001    add r3, r3, #1
    8688:   e2800004    add r0, r0, #4
    868c:   e5c1c000    strb    ip, [r1]
    8690:   e7e7445c    ubfx    r4, ip, #8, #8
    8694:   e7e7c85c    ubfx    ip, ip, #16, #8
    8698:   e5c14001    strb    r4, [r1, #1]
    869c:   e5c1c002    strb    ip, [r1, #2]
    86a0:   e2811003    add r1, r1, #3
    86a4:   e1530002    cmp r3, r2
    86a8:   1afffff0    bne 8670 <compact+0xc>
    86ac:   e8bd8010    pop {r4, pc}

Compiled using GCC 4.6.3 with the following CFLAGS:

-Os (-O2 and -O3 do not give any noticeable improvements)

-march=armv7-a -mcpu=cortex-a9 -mtune=cortex-a9

-marm -mfloat-abi=softfp -mfpu=neon -funsafe-math-optimizations

Benchmarking has shown that we're using ~10.3 cycles per data conversion.

The questions are:

  1. Can I use NEON to improve the performance?
  2. Can someone give me some hints regarding NEON? Which intrinsics should I use?

A code example would be very welcome, because I'm a complete noob when it comes to NEON.
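
Here is a minimal sketch of a NEON starting point, under a few assumptions (little-endian layout, an element count that is a multiple of 8, and illustrative names): it saturates 8 values at a time with vminq_u16 and packs each pair into the low 24 bits of a 32-bit lane, leaving the final 3-byte stores scalar.

#include <arm_neon.h>
#include <stdint.h>

void compact_neon(const uint16_t *input, uint8_t *output, uint32_t elements)
{
        const uint16x8_t max12 = vdupq_n_u16(4095);

        for (uint32_t i = 0; i < elements; i += 8) {
                uint16x8_t v = vld1q_u16(input + i);
                v = vminq_u16(v, max12);                 /* saturate to 12 bits */

                /* Each u32 lane holds lo | hi<<16; turn it into lo | hi<<12. */
                uint32x4_t lane = vreinterpretq_u32_u16(v);
                uint32x4_t lo = vandq_u32(lane, vdupq_n_u32(0x00000FFF));
                uint32x4_t hi = vandq_u32(vshrq_n_u32(lane, 4),
                                          vdupq_n_u32(0x00FFF000));
                uint32x4_t packed = vorrq_u32(lo, hi);   /* 24 useful bits per lane */

                /* Store 3 bytes per lane (scalar tail; could be vectorized further). */
                uint32_t tmp[4];
                vst1q_u32(tmp, packed);
                for (int k = 0; k < 4; ++k) {
                        *output++ = (uint8_t)(tmp[k]);
                        *output++ = (uint8_t)(tmp[k] >> 8);
                        *output++ = (uint8_t)(tmp[k] >> 16);
                }
        }
}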


Source: (StackOverflow)

How to use the multiply and accumulate intrinsics in ARM Cortex-a8?

How do I use the multiply-accumulate intrinsics provided by GCC?

float32x4_t vmlaq_f32 (float32x4_t , float32x4_t , float32x4_t);

Can anyone explain what the three parameters I have to pass to this function are? I mean, which are the source and destination operands, and what does the function return?

Help!!!
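
For reference, a minimal sketch of how this intrinsic is typically used (the function and variable names are made up): vmlaq_f32(a, b, c) computes a + b * c lane by lane, so the first argument is the accumulator and the return value is the updated accumulator.

#include <arm_neon.h>

float32x4_t fused_step(float32x4_t acc, float32x4_t v, float32x4_t w)
{
    /* acc[i] = acc[i] + v[i] * w[i] for each of the four lanes */
    return vmlaq_f32(acc, v, w);
}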


Source: (StackOverflow)

Test NEON-optimized cv::threshold() on mobile device [closed]

I have been writing some optimizations for the OpenCV's threshold function, for ARM devices (mobile phones). It should be working on both Android and iPhone.

However, I do not have a device to test it on, so I am looking for volunteers to give me a little help. If that motivates you more, I am planning to send it to OpenCV to be integrated into the main repository.

I would be interested in code correctness, and if it happens to work as intended, some statistics for original/optimized performance. Do not forget to look at all scenarios.

So, here is the code. To run it, paste it into opencv/modules/imgproc/src/thresh.cpp at line 228 (as of 2.4.2), just below the SSE block, and recompile OpenCV.

Also, add this line at the top of the file

#include <arm_neon.h>

Main code body:

#define CV_USE_NEON 1
#if CV_USE_NEON
    //if( checkHardwareSupport(CV_CPU_ARM_NEON) )
    if( true )
    {
        uint8x16_t thresh_u = vdupq_n_u8(thresh);
        uint8x16_t maxval_ = vdupq_n_u8(maxval);

        j_scalar = roi.width & -8;

        for( i = 0; i < roi.height; i++ )
        {
            const uchar* src = (const uchar*)(_src.data + _src.step*i);
            uchar* dst = (uchar*)(_dst.data + _dst.step*i);

            switch( type )
            {
            case THRESH_BINARY:
                for( j = 0; j <= roi.width - 32; j += 32 )
                {
                    uint8x16_t v0, v1;
                    v0 = vld1q_u8 ( src + j );
                    v1 = vld1q_u8 ( src + j + 16 );
                    v0 = vcgtq_u8 ( v0, thresh_u );
                    v1 = vcgtq_u8 ( v1, thresh_u );
                    v0 = vandq_u8 ( v0, maxval_ );
                    v1 = vandq_u8 ( v1, maxval_ );
                    vst1q_u8 ( dst + j, v0 );
                    vst1q_u8 ( dst + j + 16, v1 );
                }


                for( ; j <= roi.width - 8; j += 8 )
                {
                    uint8x8_t v2;
                    v2 = vld1_u8( src + j );
                    v2 = vcgt_u8 ( v2, vget_low_u8 ( thresh_u ) );
                    v2 = vand_u8 ( v2, vget_low_u8 ( maxval_ ) );
                    vst1_u8 ( dst + j, v2 );                    
                }
                break;

            case THRESH_BINARY_INV:         
                for( j = 0; j <= roi.width - 32; j += 32 )
                {
                    uint8x16_t v0, v1;
                    v0 = vld1q_u8 ( src + j );
                    v1 = vld1q_u8 ( src + j + 16 );
                    v0 = vcleq_u8 ( v0, thresh_u );
                    v1 = vcleq_u8 ( v1, thresh_u );
                    v0 = vandq_u8 ( v0, maxval_ );
                    v1 = vandq_u8 ( v1, maxval_ );
                    vst1q_u8 ( dst + j, v0 );
                    vst1q_u8 ( dst + j + 16, v1 );
                }


                for( ; j <= roi.width - 8; j += 8 )
                {
                    uint8x8_t v2;
                    v2 = vld1_u8( src + j );
                    v2 = vcle_u8 ( v2, vget_low_u8 ( thresh_u ) );
                    v2 = vand_u8 ( v2, vget_low_u8 ( maxval_ ) );
                    vst1_u8 ( dst + j, v2 );                    
                }
                break;

            case THRESH_TRUNC:
                for( j = 0; j <= roi.width - 32; j += 32 )
                {
                    uint8x16_t v0, v1;
                    v0 = vld1q_u8 ( src + j );
                    v1 = vld1q_u8 ( src + j + 16 );
                    v0 = vminq_u8 ( v0, thresh_u );
                    v1 = vminq_u8 ( v1, thresh_u );                 
                    vst1q_u8 ( dst + j, v0 );
                    vst1q_u8 ( dst + j + 16, v1 );
                }


                for( ; j <= roi.width - 8; j += 8 )
                {
                    uint8x8_t v2;
                    v2 = vld1_u8( src + j );
                    v2 = vmin_u8  ( v2, vget_low_u8 ( thresh_u ) );
                    vst1_u8 ( dst + j, v2 );                    
                }
                break;

            case THRESH_TOZERO:         
                for( j = 0; j <= roi.width - 32; j += 32 )
                {
                    uint8x16_t v0, v1;
                    v0 = vld1q_u8 ( src + j );
                    v1 = vld1q_u8 ( src + j + 16 );             
                    v0 = vandq_u8 ( vcgtq_u8 ( v0, thresh_u ), vmaxq_u8 ( v0, thresh_u ) );
                    v1 = vandq_u8 ( vcgtq_u8 ( v1, thresh_u ), vmaxq_u8 ( v1, thresh_u ) );
                    vst1q_u8 ( dst + j, v0 );
                    vst1q_u8 ( dst + j + 16, v1 );
                }


                for( ; j <= roi.width - 8; j += 8 )
                {
                    uint8x8_t v2;
                    v2 = vld1_u8 ( src + j );                    
                    v2 = vand_u8 ( vcgt_u8 ( v2, vget_low_u8(thresh_u) ), vmax_u8 ( v2, vget_low_u8(thresh_u) ) );
                    vst1_u8 ( dst + j, v2 );                    
                }
                break;

            case THRESH_TOZERO_INV:
                for( j = 0; j <= roi.width - 32; j += 32 )
                {
                    uint8x16_t v0, v1;
                    v0 = vld1q_u8 ( src + j );
                    v1 = vld1q_u8 ( src + j + 16 );             
                    v0 = vandq_u8 ( vcleq_u8 ( v0, thresh_u ), vminq_u8 ( v0, thresh_u ) );
                    v1 = vandq_u8 ( vcleq_u8 ( v1, thresh_u ), vminq_u8 ( v1, thresh_u ) );
                    vst1q_u8 ( dst + j, v0 );
                    vst1q_u8 ( dst + j + 16, v1 );
                }


                for( ; j <= roi.width - 8; j += 8 )
                {
                    uint8x8_t v2;
                    v2 = vld1_u8 ( src + j );                    
                    v2 = vand_u8 ( vcle_u8 ( v2, vget_low_u8(thresh_u) ), vmin_u8 ( v2, vget_low_u8(thresh_u) ) );
                    vst1_u8 ( dst + j, v2 );                    
                }
                break;
            }
        }
    }
#endif

Source: (StackOverflow)

C vs assembler vs NEON performance

I am working on an iPhone application that does real-time image processing. One of the earliest steps in its pipeline is to convert a BGRA image to greyscale. I tried several different methods and the difference in timing results is far greater than I had imagined possible. First I tried using C. I approximate the conversion to luminosity by computing (B + 2*G + R) / 4:

void BGRA_To_Byte(Image<BGRA> &imBGRA, Image<byte> &imByte)
{
uchar *pIn = (uchar*) imBGRA.data;
uchar *pLimit = pIn + imBGRA.MemSize();

uchar *pOut = imByte.data;
for(; pIn < pLimit; pIn+=16)   // Does four pixels at a time
{
    unsigned int sumA = pIn[0] + 2 * pIn[1] + pIn[2];
    pOut[0] = sumA / 4;
    unsigned int sumB = pIn[4] + 2 * pIn[5] + pIn[6];
    pOut[1] = sumB / 4;
    unsigned int sumC = pIn[8] + 2 * pIn[9] + pIn[10];
    pOut[2] = sumC / 4;
    unsigned int sumD = pIn[12] + 2 * pIn[13] + pIn[14];
    pOut[3] = sumD / 4;
    pOut +=4;
}       
}

This code takes 55 ms to convert a 352x288 image. I then found some assembler code that does essentially the same thing

void BGRA_To_Byte(Image<BGRA> &imBGRA, Image<byte> &imByte)
{
uchar *pIn = (uchar*) imBGRA.data;
uchar *pLimit = pIn + imBGRA.MemSize();

unsigned int *pOut = (unsigned int*) imByte.data;

for(; pIn < pLimit; pIn+=16)   // Does four pixels at a time
{
  register unsigned int nBGRA1 asm("r4");
  register unsigned int nBGRA2 asm("r5");
  unsigned int nZero=0;
  unsigned int nSum1;
  unsigned int nSum2;
  unsigned int nPacked1;
  asm volatile(

               "ldrd %[nBGRA1], %[nBGRA2], [ %[pIn], #0]       \n"   // Load in two BGRA words
               "usad8 %[nSum1], %[nBGRA1], %[nZero]  \n"  // Add R+G+B+A 
               "usad8 %[nSum2], %[nBGRA2], %[nZero]  \n"  // Add R+G+B+A 
               "uxtab %[nSum1], %[nSum1], %[nBGRA1], ROR #8    \n"   // Add G again
               "uxtab %[nSum2], %[nSum2], %[nBGRA2], ROR #8    \n"   // Add G again
               "mov %[nPacked1], %[nSum1], LSR #2 \n"    // Init packed word   
               "mov %[nSum2], %[nSum2], LSR #2 \n"   // Div by four
               "add %[nPacked1], %[nPacked1], %[nSum2], LSL #8 \n"   // Add to packed word                 

               "ldrd %[nBGRA1], %[nBGRA2], [ %[pIn], #8]       \n"   // Load in two more BGRA words
               "usad8 %[nSum1], %[nBGRA1], %[nZero]  \n"  // Add R+G+B+A 
               "usad8 %[nSum2], %[nBGRA2], %[nZero]  \n"  // Add R+G+B+A 
               "uxtab %[nSum1], %[nSum1], %[nBGRA1], ROR #8    \n"   // Add G again
               "uxtab %[nSum2], %[nSum2], %[nBGRA2], ROR #8    \n"   // Add G again
               "mov %[nSum1], %[nSum1], LSR #2 \n"   // Div by four
               "add %[nPacked1], %[nPacked1], %[nSum1], LSL #16 \n"   // Add to packed word
               "mov %[nSum2], %[nSum2], LSR #2 \n"   // Div by four
               "add %[nPacked1], %[nPacked1], %[nSum2], LSL #24 \n"   // Add to packed word                 

               ///////////
               ////////////

               : [pIn]"+r" (pIn), 
         [nBGRA1]"+r"(nBGRA1),
         [nBGRA2]"+r"(nBGRA2),
         [nZero]"+r"(nZero),
         [nSum1]"+r"(nSum1),
         [nSum2]"+r"(nSum2),
         [nPacked1]"+r"(nPacked1)
               :
               : "cc"  );
  *pOut = nPacked1;
  pOut++;
 }
 }

This function converts the same image in 12ms, almost 5X faster! I have not programmed in assembler before but I assumed that it would not be this much faster than C for such a simple operation. Inspired by this success I continued searching and discovered a NEON conversion example here.

void greyScaleNEON(uchar* output_data, uchar* input_data, int tot_pixels)
{
__asm__ volatile("lsr          %2, %2, #3      \n"
                 "# build the three constants: \n"
                 "mov         r4, #28          \n" // Blue channel multiplier
                 "mov         r5, #151         \n" // Green channel multiplier
                 "mov         r6, #77          \n" // Red channel multiplier
                 "vdup.8      d4, r4           \n"
                 "vdup.8      d5, r5           \n"
                 "vdup.8      d6, r6           \n"
                 "0:                           \n"
                 "# load 8 pixels:             \n"
                 "vld4.8      {d0-d3}, [%1]!   \n"
                 "# do the weight average:     \n"
                 "vmull.u8    q7, d0, d4       \n"
                 "vmlal.u8    q7, d1, d5       \n"
                 "vmlal.u8    q7, d2, d6       \n"
                 "# shift and store:           \n"
                 "vshrn.u16   d7, q7, #8       \n" // Divide q3 by 256 and store in the d7
                 "vst1.8      {d7}, [%0]!      \n"
                 "subs        %2, %2, #1       \n" // Decrement iteration count
                 "bne         0b            \n" // Repeat unil iteration count is not zero
                 :
                 :  "r"(output_data),           
                 "r"(input_data),           
                 "r"(tot_pixels)        
                 : "r4", "r5", "r6"
                 );
}

The timing results were hard to believe. It converts the same image in 1 ms. 12X faster than assembler and an astounding 55X faster than C. I had no idea that such performance gains were possible. In light of this I have a few questions. First off, am I doing something terribly wrong in the C code? I still find it hard to believe that it is so slow. Second, if these results are at all accurate, in what kinds of situations can I expect to see these gains? You can probably imagine how excited I am at the prospect of making other parts of my pipeline run 55X faster. Should I be learning assembler/NEON and using them inside any loop that takes an appreciable amount of time?

Update 1: I have posted the assembler output from my C function in a text file at http://temp-share.com/show/f3Yg87jQn (it was far too large to include directly here).

Timing is done using OpenCV functions.

double duration = static_cast<double>(cv::getTickCount()); 
//function call 
duration = static_cast<double>(cv::getTickCount())-duration;
duration /= cv::getTickFrequency();
//duration is now the elapsed time in seconds; multiply by 1000 for ms

Results

I tested several of the suggested improvements. First, as recommended by Viktor, I reordered the inner loop to put all the fetches first. The inner loop then looked like this:

for(; pIn < pLimit; pIn+=16)   // Does four pixels at a time
{     
  //Jul 16, 2012 MR: Read and writes collected
  sumA = pIn[0] + 2 * pIn[1] + pIn[2];
  sumB = pIn[4] + 2 * pIn[5] + pIn[6];
  sumC = pIn[8] + 2 * pIn[9] + pIn[10];
  sumD = pIn[12] + 2 * pIn[13] + pIn[14];
  pOut +=4;
  pOut[0] = sumA / 4;
  pOut[1] = sumB / 4;
  pOut[2] = sumC / 4;
  pOut[3] = sumD / 4;
}

This change brought processing time down to 53 ms, an improvement of 2 ms. Next, as recommended by Viktor, I changed my function to fetch as uints. The inner loop then looked like this:

unsigned int* in_int = (unsigned int*) original.data;
unsigned int* end = (unsigned int*) in_int + out_length;
uchar* out = temp.data;

for(; in_int < end; in_int+=4)   // Does four pixels at a time
{
    unsigned int pixelA = in_int[0];
    unsigned int pixelB = in_int[1];
    unsigned int pixelC = in_int[2];
    unsigned int pixelD = in_int[3];

    uchar* byteA = (uchar*)&pixelA;
    uchar* byteB = (uchar*)&pixelB;
    uchar* byteC = (uchar*)&pixelC;
    uchar* byteD = (uchar*)&pixelD;         

    unsigned int sumA = byteA[0] + 2 * byteA[1] + byteA[2];
    unsigned int sumB = byteB[0] + 2 * byteB[1] + byteB[2];
    unsigned int sumC = byteC[0] + 2 * byteC[1] + byteC[2];
    unsigned int sumD = byteD[0] + 2 * byteD[1] + byteD[2];

    out[0] = sumA / 4;
    out[1] = sumB / 4;
    out[2] = sumC / 4;
    out[3] = sumD / 4;
    out +=4;
    }

This modification had a dramatic effect, dropping processing time to 14 ms, a drop of 39 ms (75%). This last result is very close to the assembler performance of 11 ms. The final optimization, as recommended by rob, was to include the __restrict keyword. I added it in front of every pointer declaration, changing the following lines:

__restrict unsigned int* in_int = (unsigned int*) original.data;
unsigned int* end = (unsigned int*) in_int + out_length;
__restrict uchar* out = temp.data;  
...
__restrict uchar* byteA = (uchar*)&pixelA;
__restrict uchar* byteB = (uchar*)&pixelB;
__restrict uchar* byteC = (uchar*)&pixelC;
__restrict uchar* byteD = (uchar*)&pixelD;  
...     

These changes had no measurable effect on processing time. Thank you for all your help, I will be paying much closer attention to memory management in the future.


Source: (StackOverflow)

Common SIMD techniques

Where can I find information about common SIMD tricks? I know my instruction set and how to write non-tricky SIMD code, but I also know that SIMD is now much more powerful than that: it can handle complex, conditional, branchless code.
For example (ARMv6), the following sequence of instructions sets each byte of Rd equal to the unsigned minimum of the corresponding bytes of Ra and Rb:

USUB8 Rd, Ra, Rb
SEL Rd, Rb, Ra
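
For comparison, the same per-byte unsigned minimum is a single operation with NEON intrinsics; a minimal sketch:

#include <arm_neon.h>

uint8x16_t bytewise_min(uint8x16_t a, uint8x16_t b)
{
    return vminq_u8(a, b);   /* each output byte = min of the corresponding input bytes */
}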

Links to tutorials and uncommon SIMD techniques are good too :) ARMv6 is the most interesting for me, but x86 (SSE, ...), NEON (in ARMv7) and others are good too.


Source: (StackOverflow)

ARM Cortex-A8: What's the difference between VFP and NEON?

In the ARM Cortex-A8 processor, I understand what NEON is: it is a SIMD co-processor.

But does the VFP (Vector Floating Point) unit, which is also a co-processor, work as a SIMD processor? If so, which one is better to use?

I have read a few links, such as:

  1. Link1

  2. Link2.

But it is not really clear what they mean. They say that VFP was never intended to be used for SIMD, but on Wikipedia I read the following: "The VFP architecture also supports execution of short vector instructions but these operate on each vector element sequentially and thus do not offer the performance of true SIMD (Single Instruction Multiple Data) parallelism."

It is not clear what to believe; can anyone elaborate more on this topic?


Source: (StackOverflow)

Android ARMv6/v7 and VFP/NEON

I would like to understand more about the CPUs used in Android phones. The reason is that we are building a C library for which we can set certain CPU/math-processor architecture flags.

  1. So far we have found that all Android device CPUs are ARM designs and are either ARMv6 (older, low-end devices: Huawei, ZTE, small SE models) or ARMv7 (Honeycomb tablets and all more expensive devices, almost all with WVGA resolution or higher). I have checked ~20 devices and all have a processor of one of these types. Is that correct? Are there some others?

  2. Now, when it comes to multimedia and mathematical operations, I think two units are important: the VFP for floating-point arithmetic and the SIMD unit, NEON. After testing the above-mentioned group of devices I have found that almost all devices support VFP, while many do not support NEON. Any comments on that?

  3. I do not know exactly what the difference between ARMv6 and ARMv7 is (besides speed in general). We are building a multimedia C library which has a couple of build flags. My question is how to target the largest number of devices on the one hand, and how to let users of the better devices exploit their hardware on the other. My proposal is to prepare 3 distinct builds: ARMv6/VFP, ARMv7/VFP and ARMv7/VFP/NEON (see the flag sketch after this list). Other proposals?

  4. The ARMv6/VFP build should, I think, run on all configurations except devices that are missing the VFP (e.g. the old HTC Wildfire); those will remain unsupported.
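
A rough sketch of the GCC flag sets such a three-way split typically uses (the exact tuning options are an assumption and depend on the toolchain):

ARMv6/VFP:       -march=armv6 -mfpu=vfp -mfloat-abi=softfp
ARMv7/VFP:       -march=armv7-a -mfpu=vfpv3-d16 -mfloat-abi=softfp
ARMv7/VFP/NEON:  -march=armv7-a -mfpu=neon -mfloat-abi=softfp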

Is this a good approach? Any comments are welcome.

Regards, STeN


Source: (StackOverflow)

Why ARM NEON not faster than plain C++?

Here is a C++ code:

#define ARR_SIZE_TEST ( 8 * 1024 * 1024 )

void cpp_tst_add( unsigned* x, unsigned* y )
{
    for ( register int i = 0; i < ARR_SIZE_TEST; ++i )
    {
        x[ i ] = x[ i ] + y[ i ];
    }
}

Here is a neon version:

void neon_assm_tst_add( unsigned* x, unsigned* y )
{
    register unsigned i = ARR_SIZE_TEST >> 2;

    __asm__ __volatile__
    (
        ".loop1:                            \n\t"

        "vld1.32   {q0}, [%[x]]             \n\t"
        "vld1.32   {q1}, [%[y]]!            \n\t"

        "vadd.i32  q0 ,q0, q1               \n\t"
        "vst1.32   {q0}, [%[x]]!            \n\t"

        "subs     %[i], %[i], $1            \n\t"
        "bne      .loop1                    \n\t"

        : [x]"+r"(x), [y]"+r"(y), [i]"+r"(i)
        :
        : "memory"
    );
}
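
For readers who prefer intrinsics, here is a minimal sketch of the equivalent loop (it is not one of the benchmarked variants, and the element count is passed explicitly):

#include <arm_neon.h>
#include <stdint.h>

void neon_intrinsics_tst_add( uint32_t* x, const uint32_t* y, uint32_t count )
{
    /* Assumes count is a multiple of 4, as ARR_SIZE_TEST above is. */
    for ( uint32_t i = 0; i < count; i += 4 )
    {
        uint32x4_t a = vld1q_u32( x + i );
        uint32x4_t b = vld1q_u32( y + i );
        vst1q_u32( x + i, vaddq_u32( a, b ) );
    }
}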

Test function:

void bench_simple_types_test( )
{
    unsigned* a = new unsigned [ ARR_SIZE_TEST ];
    unsigned* b = new unsigned [ ARR_SIZE_TEST ];

    neon_tst_add( a, b );
    neon_assm_tst_add( a, b );
}

I have tested both variants and here is a report:

add, unsigned, C++       : 176 ms
add, unsigned, neon asm  : 185 ms // SLOW!!!

I also tested other types:

add, float,    C++       : 571 ms
add, float,    neon asm  : 184 ms // FASTER X3!

THE QUESTION: Why is NEON slower with 32-bit integer types?

I used the latest version of GCC from the Android NDK. NEON optimization flags were turned on. Here is the disassembled C++ version:

                 MOVS            R3, #0
                 PUSH            {R4}

 loc_8
                 LDR             R4, [R0,R3]
                 LDR             R2, [R1,R3]
                 ADDS            R2, R4, R2
                 STR             R2, [R0,R3]
                 ADDS            R3, #4
                 CMP.W           R3, #0x2000000
                 BNE             loc_8
                 POP             {R4}
                 BX              LR

Here is the disassembled NEON version:

                 MOV.W           R3, #0x200000
.loop1
                 VLD1.32         {D0-D1}, [R0]
                 VLD1.32         {D2-D3}, [R1]!
                 VADD.I32        Q0, Q0, Q1
                 VST1.32         {D0-D1}, [R0]!
                 SUBS            R3, #1
                 BNE             .loop1
                 BX              LR

Here are all the benchmark results:

add, char,     C++       : 83  ms
add, char,     neon asm  : 46  ms FASTER x2

add, short,    C++       : 114 ms
add, short,    neon asm  : 92  ms FASTER x1.25

add, unsigned, C++       : 176 ms
add, unsigned, neon asm  : 184 ms SLOWER!!!

add, float,    C++       : 571 ms
add, float,    neon asm  : 184 ms FASTER x3

add, double,   C++       : 533 ms
add, double,   neon asm  : 420 ms FASTER x1.25

THE QUESTION: Why is NEON slower with 32-bit integer types?


Source: (StackOverflow)

Fast sine/cosine for ARMv7+NEON: looking for testers…

Could somebody with access to an iPhone 3GS or a Pandora please test the following assembly routine I just wrote?

It is supposed to compute sines and cosines really really fast on the NEON vector FPU. I know it compiles fine, but without adequate hardware I can't test it. If you could just compute a few sines and cosines and compare the results with those of sinf() and cosf() it would really help.

Thanks!

#include <math.h>

/// Computes the sine and cosine of two angles
/// in: angles = Two angles, expressed in radians, in the [-PI,PI] range.
/// out: results = vector containing [sin(angles[0]),cos(angles[0]),sin(angles[1]),cos(angles[1])]
static inline void vsincos(const float angles[2], float results[4]) {
    static const float constants[]  = { 
    /* q1 */  0,                M_PI_2,           0,                M_PI_2,
    /* q2 */  M_PI,             M_PI,             M_PI,             M_PI,
    /* q3 */  4.f/M_PI,         4.f/M_PI,         4.f/M_PI,         4.f/M_PI,
    /* q4 */ -4.f/(M_PI*M_PI), -4.f/(M_PI*M_PI), -4.f/(M_PI*M_PI), -4.f/(M_PI*M_PI),
    /* q5 */  2.f,              2.f,              2.f,              2.f,
    /* q6 */  .225f,            .225f,            .225f,            .225f
    };  
    asm volatile(
        // Load q0 with [angle1,angle1,angle2,angle2]
        "vldmia %1, { d3 }\n\t"
        "vdup.f32 d0, d3[0]\n\t"
        "vdup.f32 d1, d3[1]\n\t"
        // Load q1-q6 with constants
        "vldmia %2, { q1-q6 }\n\t"
        // Cos(x) = Sin(x+PI/2), so
        // q0 = [angle1, angle1+PI/2, angle2, angle2+PI/2]
        "vadd.f32 q0,q0,q1\n\t"
        // if angle1+PI/2>PI, subtract 2*PI
        // q0-=(q0>PI)?2*PI:0
        "vcge.f32 q1,q0,q2\n\t"
        "vand.f32 q1,q1,q2\n\t"
        "vmls.f32 q0,q1,q5\n\t"
        // q0=(4/PI)*q0 - q0*abs(q0)*4/(PI*PI)
        "vabs.f32 q1,q0\n\t"
        "vmul.f32 q1,q0,q1\n\t"
        "vmul.f32 q0,q0,q3\n\t"
        "vmul.f32 q1,q1,q4\n\t"
        "vadd.f32 q0,q0,q1\n\t"
        // q0+=.225*(q0*abs(q0) - q0)
        "vabs.f32 q1,q0\n\t"
        "vmul.f32 q1,q0,q1\n\t"
        "vsub.f32 q1,q0\n\t"
        "vmla.f32 q0,q1,q6\n\t"
        "vstmia %0, { q0 }\n\t"
        :: "r"(results), "r"(angles), "r"(constants)
        : "memory","cc","q0","q1","q2","q3","q4","q5","q6"
    );  
}
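
A minimal sketch of the kind of test harness being asked for, assuming the vsincos() above is in the same file and with arbitrarily chosen angles: call it on a couple of inputs and print the results next to sinf()/cosf().

#include <math.h>
#include <stdio.h>

int main(void)
{
    float angles[2] = { 0.5f, -2.0f };   /* test angles in the [-PI,PI] range */
    float results[4];

    vsincos(angles, results);

    printf("sin(%f): neon=%f libm=%f\n", angles[0], results[0], sinf(angles[0]));
    printf("cos(%f): neon=%f libm=%f\n", angles[0], results[1], cosf(angles[0]));
    printf("sin(%f): neon=%f libm=%f\n", angles[1], results[2], sinf(angles[1]));
    printf("cos(%f): neon=%f libm=%f\n", angles[1], results[3], cosf(angles[1]));
    return 0;
}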

Source: (StackOverflow)

What is Neon with respect to Android?

I'm a beginner in Android. My friend heard about "NEON", so I searched Google and found this.

From that, it seems NEON is related to multimedia on Android, or on all mobile OSes. Is that right? Please tell me more about it.


Source: (StackOverflow)

Is there a good reference for ARM Neon intrinsics?

The ARM reference manual doesn't go into much detail on the individual instructions ( http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0348b/BABIIBBG.html ). Is there something that's a little more detailed?


Source: (StackOverflow)

Fast 4x4 Matrix Multiplication in C

I am trying to find an optimized C or assembler implementation of a function that multiplies two 4x4 matrices. The platform is an ARMv6- or ARMv7-based iPhone or iPod.

Currently, I am using a fairly standard approach - just a little loop-unrolled.

#define O(y,x) (y + (x<<2))

static inline void Matrix4x4MultiplyBy4x4 (float *src1, float *src2, float *dest)
{
    *(dest+O(0,0)) = (*(src1+O(0,0)) * *(src2+O(0,0))) + (*(src1+O(0,1)) * *(src2+O(1,0))) + (*(src1+O(0,2)) * *(src2+O(2,0))) + (*(src1+O(0,3)) * *(src2+O(3,0)));	
    *(dest+O(0,1)) = (*(src1+O(0,0)) * *(src2+O(0,1))) + (*(src1+O(0,1)) * *(src2+O(1,1))) + (*(src1+O(0,2)) * *(src2+O(2,1))) + (*(src1+O(0,3)) * *(src2+O(3,1)));	
    *(dest+O(0,2)) = (*(src1+O(0,0)) * *(src2+O(0,2))) + (*(src1+O(0,1)) * *(src2+O(1,2))) + (*(src1+O(0,2)) * *(src2+O(2,2))) + (*(src1+O(0,3)) * *(src2+O(3,2)));	
    *(dest+O(0,3)) = (*(src1+O(0,0)) * *(src2+O(0,3))) + (*(src1+O(0,1)) * *(src2+O(1,3))) + (*(src1+O(0,2)) * *(src2+O(2,3))) + (*(src1+O(0,3)) * *(src2+O(3,3)));	
    *(dest+O(1,0)) = (*(src1+O(1,0)) * *(src2+O(0,0))) + (*(src1+O(1,1)) * *(src2+O(1,0))) + (*(src1+O(1,2)) * *(src2+O(2,0))) + (*(src1+O(1,3)) * *(src2+O(3,0)));	
    *(dest+O(1,1)) = (*(src1+O(1,0)) * *(src2+O(0,1))) + (*(src1+O(1,1)) * *(src2+O(1,1))) + (*(src1+O(1,2)) * *(src2+O(2,1))) + (*(src1+O(1,3)) * *(src2+O(3,1)));	
    *(dest+O(1,2)) = (*(src1+O(1,0)) * *(src2+O(0,2))) + (*(src1+O(1,1)) * *(src2+O(1,2))) + (*(src1+O(1,2)) * *(src2+O(2,2))) + (*(src1+O(1,3)) * *(src2+O(3,2)));	
    *(dest+O(1,3)) = (*(src1+O(1,0)) * *(src2+O(0,3))) + (*(src1+O(1,1)) * *(src2+O(1,3))) + (*(src1+O(1,2)) * *(src2+O(2,3))) + (*(src1+O(1,3)) * *(src2+O(3,3)));	
    *(dest+O(2,0)) = (*(src1+O(2,0)) * *(src2+O(0,0))) + (*(src1+O(2,1)) * *(src2+O(1,0))) + (*(src1+O(2,2)) * *(src2+O(2,0))) + (*(src1+O(2,3)) * *(src2+O(3,0)));	
    *(dest+O(2,1)) = (*(src1+O(2,0)) * *(src2+O(0,1))) + (*(src1+O(2,1)) * *(src2+O(1,1))) + (*(src1+O(2,2)) * *(src2+O(2,1))) + (*(src1+O(2,3)) * *(src2+O(3,1)));	
    *(dest+O(2,2)) = (*(src1+O(2,0)) * *(src2+O(0,2))) + (*(src1+O(2,1)) * *(src2+O(1,2))) + (*(src1+O(2,2)) * *(src2+O(2,2))) + (*(src1+O(2,3)) * *(src2+O(3,2)));	
    *(dest+O(2,3)) = (*(src1+O(2,0)) * *(src2+O(0,3))) + (*(src1+O(2,1)) * *(src2+O(1,3))) + (*(src1+O(2,2)) * *(src2+O(2,3))) + (*(src1+O(2,3)) * *(src2+O(3,3)));	
    *(dest+O(3,0)) = (*(src1+O(3,0)) * *(src2+O(0,0))) + (*(src1+O(3,1)) * *(src2+O(1,0))) + (*(src1+O(3,2)) * *(src2+O(2,0))) + (*(src1+O(3,3)) * *(src2+O(3,0)));	
    *(dest+O(3,1)) = (*(src1+O(3,0)) * *(src2+O(0,1))) + (*(src1+O(3,1)) * *(src2+O(1,1))) + (*(src1+O(3,2)) * *(src2+O(2,1))) + (*(src1+O(3,3)) * *(src2+O(3,1)));	
    *(dest+O(3,2)) = (*(src1+O(3,0)) * *(src2+O(0,2))) + (*(src1+O(3,1)) * *(src2+O(1,2))) + (*(src1+O(3,2)) * *(src2+O(2,2))) + (*(src1+O(3,3)) * *(src2+O(3,2)));	
    *(dest+O(3,3)) = (*(src1+O(3,0)) * *(src2+O(0,3))) + (*(src1+O(3,1)) * *(src2+O(1,3))) + (*(src1+O(3,2)) * *(src2+O(2,3))) + (*(src1+O(3,3)) * *(src2+O(3,3)));	
};
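
For the NEON-capable (ARMv7) devices, a minimal intrinsics sketch of the same column-major multiply could look roughly like this (names are illustrative, not a drop-in replacement or measured variant): each destination column is built by scaling the four columns of src1 by the lanes of the corresponding src2 column.

#include <arm_neon.h>

static inline void Matrix4x4MultiplyBy4x4_NEON (const float *src1, const float *src2, float *dest)
{
    float32x4_t a0 = vld1q_f32(src1 + 0);     // column 0 of src1
    float32x4_t a1 = vld1q_f32(src1 + 4);     // column 1
    float32x4_t a2 = vld1q_f32(src1 + 8);     // column 2
    float32x4_t a3 = vld1q_f32(src1 + 12);    // column 3

    for (int col = 0; col < 4; ++col)
    {
        float32x4_t b = vld1q_f32(src2 + 4 * col);               // column of src2
        float32x4_t r = vmulq_lane_f32(a0, vget_low_f32(b), 0);  // a0 * b[0]
        r = vmlaq_lane_f32(r, a1, vget_low_f32(b), 1);           // + a1 * b[1]
        r = vmlaq_lane_f32(r, a2, vget_high_f32(b), 0);          // + a2 * b[2]
        r = vmlaq_lane_f32(r, a3, vget_high_f32(b), 1);          // + a3 * b[3]
        vst1q_f32(dest + 4 * col, r);
    }
}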

Would I benefit from using the Strassen- or the Coppersmith–Winograd algorithm?


Source: (StackOverflow)