Back to Mvision

整数

CNN/HighPerformanceComputing/各种指令集.md

latest10.2 KB
Original Source

整数

加法

8位 加法

 |1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|
 总共是128位的向量。
 按照8位分割,可以有 16个操作数。

c语言格式

c
SIMD指令          Code代码                          Input输入            Output输出

 ======8-bit unsigned/signed (unless specified otherwise)============
MMX(64-bit)    d = _m_paddb(a, b)                 __m64 a, b;           __m64 d;   
               d = _mm_add_pi8(a, b)

SSE2(128-bit)  d = _mm_add_epi8(a, b)             __m128i a, b;         __m128i d;
AVX2(256-bit)  d = _mm256_add_epi8(a, b)          __m256i a, b;         __m256i d;
NEON (64-bit)  d = vadd_u8(a, b)                  uint8x8_t a, b;       uint8x8_t d;
NEON (128-bit) d = vaddq_u8(a, b)                 uint8x16_t a, b;      uint8x16_t d;
AltiVec/VMX    d = vec_add(a, b)
               d = vec_vaddubm(a, b)              vector uint8_t a, b;  vector uint8_t d;
VSX1           d = vec_add(a, b)
               d = vec_vaddubm(a, b)              vector uint8_t a, b;  vector uint8_t d;
==================8-bit signed============================
NEON (64-bit)   d = vadd_s8(a, b)                  int8x8_t a, b;        int8x8_t d;
NEON (128-bit)  d = vaddq_s8(a, b)                 int8x16_t a, b;       int8x16_t d;
AltiVec/VMX/VSX d = vec_add(a, b)
                d = vec_vaddubm(a, b)              vector int8_t a, b;   vector int8_t d;

汇编格式

asm
SIMD指令          Code代码              Description描述            Notes笔记
======8-bit unsigned/signed (unless specified otherwise)========
MMX(64-bit)    paddb mm1, mm2         Adds mm1, mm2,
                                      result in mm1             mm1, mm2 are 64-bit MMX registers
SSE2(128-bit)  paddb xmm1, xmm2       Adds xmm1, xmm2,
                                      result in xmm1            xmm1, xmm2 are SSE 128-bit registers
AVX(128-bit) vpaddb xmm1, xmm2, xmm3  Adds xmm2, xmm3,
                                      result in xmm1            xmm1, xmm2, xmm3 are AVX 128-bit registers
AVX2(256-bit) vpaddb ymm1, ymm2, ymm3 Adds ymm2, ymm3,
                                      result in ymm1            ymm1, ymm2, ymm3 are AVX 256-bit registers
NEON (64-bit) vadd.i8 d1, d2, d3      Adds d2, d3, 
                                      result in d1              d1,d2,d3 are 64-bit NEON registers
NEON (128-bit) vaddq.i8 q1, q2, q3    Adds q2, q3, 
                                      result in q1              q1,q2,q3 are 128-bit NEON registers
ARMv8 NEON (64-bit) 
            add Vd.8B, Vn.8B, Vm.8B   Adds Vn.8B, Vm.8B,
                                      result in Vd.8B           Vn.8B, Vm.8B, Vd.8B are 64-bit NEON registers
ARMv8 NEON (128-bit) 
          add Vd.16B, Vn.16B, Vm.16B  Adds Vn.16B, Vm.16B,
                                      result in Vd.16B          Vn.16B, Vm.16B, Vd.16B are 128-bit NEON registers
AltiVec/VMX/VSX vaddubm vD, vA, vB    Adds vA, vB, 
                                      result in vD              vA, vB, vD are VMX 128-bit integer registers

示例 c语言格式

c
// SSE (128-bit) 8*16=128bits
__m128i a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
__m128i b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__m128i d = _mm_add_epi8(a, b);

// NEON (128-bit)
uint8x16_t a = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
uint8x16_t b = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
uint8x16_t d = vaddq_u8(a, b);

// AltiVec/VMX
vector uint8_t a = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
vector uint8_t b = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
vector uint8_t d = vec_add(a, b);

16位 加法

 |1   |2   |3   |4   |5   |6   |7   |8   | 
 总共是128位的向量。
 按照16位分割,可以有 8个操作数。

c语言格式

c
SIMD                   Code                         Input               Output
======16-bit unsigned/signed (unless specified otherwise)=====================
MMX(64-bit)    d = _mm_add_pi16(a, b)                __m64 a, b;           __m64 d;
SSE2(128-bit)  d = _mm_add_epi16(a, b)               __m128i a, b;         __m128i d;
AVX2(256-bit)  d = _mm256_add_epi16(a, b)            __m256i a, b;         __m256i d;
NEON (64-bit)  d = vadd_u16(a, b)                    uint16x4_t a, b;      uint16x4_t d;
NEON (128-bit) d = vaddq_u16(a, b)                   uint16x8_t a, b;      uint16x8_t d;
AltiVec/VMX    d = vec_add(a, b)
               d = vec_vadduhm(a, b)                 vector uint16_t a, b; vector uint16_t d;
VSX2           d = vec_add(a, b)
               d = vec_vadduhm(a, b)                 vector uint16_t a, b; vector uint16_t d;
==========16-bit signed========================================================
NEON (64-bit)   d = vadd_s16(a, b)                   int16x4_t a, b;       int16x4_t d;
NEON (128-bit)  d = vaddq_s16(a, b)                  int16x8_t a, b;       int16x8_t d;
AltiVec/VMX/VSX d = vec_add(a, b)
                d = vec_vadduhm(a, b)                vector int16_t a, b;  vector int16_t d;

汇编格式

asm
SIMD                 Code              Description             Notes
=============16-bit unsigned/signed (unless specified otherwise)==================
MMX(64-bit)   paddw mm1, mm2           Adds mm1, mm2, 
                                       result in mm1        mm1, mm2 are 64-bit MMX registers
SSE2(128-bit) paddw xmm1, xmm2         Adds xmm1, xmm2,
                                       result in xmm1       xmm1, xmm2 are SSE 128-bit registers
AVX(128-bit)  vpaddw xmm1, xmm2, xmm3  Adds xmm2, xmm3,
                                       result in xmm1       xmm1, xmm2, xmm3 are AVX 128-bit registers
AVX2(256-bit) vpaddw ymm1, ymm2, ymm3  Adds ymm2, ymm3,
                                       result in ymm1       ymm1, ymm2, ymm3 are AVX 256-bit registers
NEON (64-bit) vadd.i16 d1, d2, d3      Adds d2, d3, 
                                       result in d1         d1,d2,d3 are 64-bit NEON registers
NEON (128-bit) vaddq.i16 q1, q2, q3    Adds q2, q3, 
                                       result in q1         q1,q2,q3 are 128-bit NEON registers
ARMv8 NEON (64-bit) 
            add Vd.4H, Vn.4H, Vm.4H    Adds Vn.4H, Vm.4H,
                                       result in Vd.4H      Vn.4H, Vm.4H, Vd.4H are 64-bit NEON registers
ARMv8 NEON (128-bit) 
            add Vd.8H, Vn.8H, Vm.8H    Adds Vn.8H, Vm.8H,
                                       result in Vd.8H      Vn.8H, Vm.8H, Vd.8H  are 128-bit NEON registers
AltiVec/VMX 
            vadduhm vD, vA, vB         Adds vA, vB,
                                       result in vD         vA, vB, vD are normal VMX 128-bit registers

示例 c语言格式

c
// SSE (128-bit)========
__m128i a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
__m128i b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
__m128i d = _mm_add_epi16(a, b);
// NEON (128-bit)=======
uint16x8_t a = {0, 1, 2, 3, 4, 5, 6, 7};
uint16x8_t b = {7, 6, 5, 4, 3, 2, 1, 0};
uint16x8_t d = vaddq_u16(a, b);
// AltiVec/VMX==========
vector uint16_t a = {0, 1, 2, 3, 4, 5, 6, 7};
vector uint16_t b = {7, 6, 5, 4, 3, 2, 1, 0};
vector uint16_t d = vec_add(a, b);

32位 加法

 |1          |2         |3        |4        |  
 总共是128位的向量。
 按照32位分割,可以有 4个操作数。

c语言格式

c
SIMD                Code                    Input                  Output
=======32-bit unsigned/signed (unless specified otherwise)===============
MMX(64-bit)     d = _mm_add_pi32(a, b)      __m64 a, b;            __m64 d;
SSE2(128-bit)   d = _mm_add_epi32(a, b)     __m128i a, b;          __m128i d;
AVX2(256-bit)   d = _mm256_add_epi32(a, b)  __m256i a, b;          __m256i d;
NEON (64-bit)   d = vadd_u32(a, b)          uint32x2_t a, b;       uint32x2_t d;
NEON (128-bit)  d = vaddq_u32(a, b)         uint32x4_t a, b;       uint32x4_t d;
AltiVec/VMX     d = vec_add(a, b)
                d = vec_vadduwm(a, b)       vector uint32_t a, b;  vector uint32_t d;
=========32-bit signed====================================================
NEON (64-bit)   d = vadd_s32(a, b)          int32x2_t a, b;        int32x2_t d;
NEON (128-bit)  d = vaddq_s32(a, b)         int32x4_t a, b;        int32x4_t d;
AltiVec/VMX     d = vec_add(a, b)
                d = vec_vadduwm(a, b)       vector int32_t a, b;   vector int32_t d;

汇编格式

asm
SIMD                 Code              Description             Notes
=============32-bit unsigned/signed (unless specified otherwise)==================
MMX(64-bit)        paddd mm1, mm2     Adds mm1, mm2, 
                                      result in mm1       mm1, mm2 are 64-bit MMX registers
SSE2(128-bit)      paddd xmm1, xmm2   Adds xmm1, xmm2,
                                      result in xmm1      xmm1, xmm2 are SSE 128-bit registers
AVX(128-bit) vpaddd xmm1, xmm2, xmm3  Adds xmm2, xmm3,
                                      result in xmm1      xmm1, xmm2, xmm3 are AVX 128-bit registers
AVX2(256-bit) vpaddd ymm1, ymm2, ymm3 Adds ymm2, ymm3,
                                      result in ymm1      ymm1, ymm2, ymm3 are AVX 256-bit registers
NEON (64-bit) vadd.i32 d1, d2, d3     Adds d2, d3, 
                                      result in d1        d1,d2,d3 are 64-bit NEON registers
NEON (128-bit) vaddq.i32 q1, q2, q3   Adds q2, q3, 
                                      result in q1        q1,q2,q3 are 128-bit NEON registers
ARMv8 NEON (64-bit) 
           add Vd.2S, Vn.2S, Vm.2S    Adds Vn.2S, Vm.2S,
                                      result in Vd.2S     Vn.2S, Vm.2S, Vd.2S are 64-bit NEON registers
ARMv8 NEON (128-bit) 
           add Vd.4S, Vn.4S, Vm.4S    Adds Vn.4S, Vm.4S,
                                      result in Vd.4S     Vn.4S, Vm.4S, Vd.4S are 128-bit NEON registers
AltiVec/VMX/VSX 
           vadduwm vD, vA, vB         Adds vA, vB, 
                                      result in vD        vA, vB, vD are normal VMX 128-bit registers

示例 c语言格式

c
// SSE (128-bit)===========
__m128i a = _mm_set_epi32(0, 1, 2, 3);
__m128i b = _mm_set_epi32(3, 2, 1, 0);
__m128i d = _mm_add_epi32(a, b);
//NEON (128-bit)==========
uint32x4_t a = {0, 1, 2, 3};
uint32x4_t b = {3, 2, 1, 0};
uint32x4_t d = vaddq_u32(a, b);
// AltiVec/VMX============
vector uint32_t a = {0, 1, 2, 3};
vector uint32_t b = {3, 2, 1, 0};
vector uint32_t d = vec_add(a, b);