These functions are specified by Intel and are now supported by many compilers.
Use these functions as an alternative to VectorC's vectorizer, but remember that
your code will require an MMX processor and must be compiled for an MMX target processor.
Function
|
MMX instruction
|
Comments
|
__m64 _m_paddb (__m64, __m64) |
paddb
|
vector addition |
__m64 _m_paddw (__m64, __m64) |
paddw |
__m64 _m_paddd (__m64, __m64) |
paddd |
__m64 _m_psubb (__m64, __m64) |
psubb |
vector subtraction |
__m64 _m_psubw (__m64, __m64) |
psubw |
__m64 _m_psubd (__m64, __m64) |
psubd |
__m64 _m_pand (__m64, __m64) |
pand |
bitwise logical operations performed on 64-bit values |
__m64 _m_pandn (__m64, __m64) |
pandn |
__m64 _m_por (__m64, __m64) |
por |
__m64 _m_pxor (__m64, __m64) |
pxor |
__m64 _m_paddsb (__m64, __m64) |
paddsb |
saturated arithmetic |
__m64 _m_paddb (__m64, __m64) |
paddb |
__m64 _m_psubsb (__m64, __m64) |
psubsb |
__m64 _m_paddusb (__m64, __m64) |
paddusb |
__m64 _m_psubusb (__m64, __m64) |
psubusb |
__m64 _m_paddsw (__m64, __m64) |
paddsw |
__m64 _m_psubsw (__m64, __m64) |
psubsw |
__m64 _m_paddusw (__m64, __m64) |
paddusw |
__m64 _m_psubusw (__m64, __m64) |
psubusw |
__m64 _m_packsswb (__m64, __m64) |
packsswb |
convert from a large component type to a smaller component
type with saturation |
__m64 _m_packssdw (__m64, __m64) |
packssdw |
__m64 _m_packuswb (__m64, __m64) |
packuswb |
__m64 _m_punpcklbw (__m64, __m64) |
punpcklbw |
interleave components from the low (punpckl*) or high (punpckh*) halves of two vectors |
__m64 _m_punpcklwd (__m64, __m64) |
punpcklwd |
__m64 _m_punpckldq (__m64, __m64) |
punpckldq |
__m64 _m_punpckhbw (__m64, __m64) |
punpckhbw |
__m64 _m_punpckhwd (__m64, __m64) |
punpckhwd |
__m64 _m_punpckhdq (__m64, __m64) |
punpckhdq |
__m64 _m_pmullw (__m64, __m64) |
pmullw |
multiply 16-bit by 16-bit, keeping the low 16 bits of each result |
__m64 _m_pmulhw (__m64, __m64) |
pmulhw |
multiply 16-bit by 16-bit with 32-bit result, then shift right
arithmetic by 16 |
__m64 _m_pmaddwd (__m64, __m64) |
pmaddwd |
multiply 16-bit by 16-bit to 32-bit results, then add adjacent pairs
of products (lower 2 and higher 2) into 2 32-bit results |
__m64 _m_psllw (__m64, __m64) |
psllw |
shift instructions |
__m64 _m_pslld (__m64, __m64) |
pslld |
__m64 _m_psllq (__m64, __m64) |
psllq |
__m64 _m_psraw (__m64, __m64) |
psraw |
__m64 _m_psrad (__m64, __m64) |
psrad |
__m64 _m_psrlw (__m64, __m64) |
psrlw |
__m64 _m_psrld (__m64, __m64) |
psrld |
__m64 _m_psrlq (__m64, __m64) |
psrlq |
__m64 _m_pcmpeqb (__m64, __m64) |
pcmpeqb |
vector comparison instructions. Can be used with "pand",
"pandn" and "por" to create a vector conditional move |
__m64 _m_pcmpeqw (__m64, __m64) |
pcmpeqw |
__m64 _m_pcmpeqd (__m64, __m64) |
pcmpeqd |
__m64 _m_pcmpgtb (__m64, __m64) |
pcmpgtb |
__m64 _m_pcmpgtw (__m64, __m64) |
pcmpgtw |
__m64 _m_pcmpgtd (__m64, __m64) |
pcmpgtd |
__m64 _m_psllwi (__m64, int) |
psllw |
Alternative form of shift instructions using an int
shift value |
__m64 _m_pslldi (__m64, int) |
pslld |
__m64 _m_psllqi (__m64, int) |
psllq |
__m64 _m_psrawi (__m64, int) |
psraw |
__m64 _m_psradi (__m64, int) |
psrad |
__m64 _m_psrlwi (__m64, int) |
psrlw |
__m64 _m_psrldi (__m64, int) |
psrld |
__m64 _m_psrlqi (__m64, int) |
psrlq |
__m64 _m_from_int (int) |
movd |
Create a 64-bit MMX value from an integer - the high 32 bits are
zero |
int _m_to_int (__m64) |
movd |
Extract the low 32 bits from an MMX value |
void _m_empty (void) |
emms |
Ignored by VectorC - the placement of emms instructions is done
automatically |
__m64 _mm_set_pi32 (int, int) |
|
Create a 64-bit MMX vector from components. Last value
is in the lowest position in the vector |
__m64 _mm_set_pi16 (short, short, short, short) |
|
__m64 _mm_set_pi8 (char, char, char, char, char, char, char,
char) |
|
__m64 _mm_set1_pi32 (int) |
|
Create a 64-bit MMX vector from a single component
The component is duplicated |
__m64 _mm_set1_pi16 (short) |
|
__m64 _mm_set1_pi8 (char) |
|
__m64 _mm_setr_pi32 (int, int) |
|
Create a 64-bit MMX vector from components. First
value is in the lowest position in the vector |
__m64 _mm_setr_pi16 (short, short, short, short) |
|
__m64 _mm_setr_pi8 (char, char, char, char, char, char, char,
char) |
|
These functions are specified by Intel and are now supported by many compilers.
Use these functions as an alternative to VectorC's vectorizer, but remember that
your code will require an SSE processor and must be compiled for an SSE target processor.
Function
|
SSE Instruction
|
Comments
|
__m64 _m_pmulhuw (__m64, __m64) |
pmulhuw |
multiply 16-bit unsigned by 16-bit unsigned to 32-bit temporary,
shift right 16 |
__m64 _m_pmaxsw (__m64, __m64) |
pmaxsw |
maximum signed 16-bit |
__m64 _m_pminsw (__m64, __m64) |
pminsw |
minimum signed 16-bit |
__m64 _m_pmaxub (__m64, __m64) |
pmaxub |
maximum unsigned 8-bit |
__m64 _m_pminub (__m64, __m64) |
pminub |
minimum unsigned 8-bit |
__m64 _m_pavgb (__m64, __m64) |
pavgb |
average unsigned 8-bit |
__m64 _m_pavgw (__m64, __m64) |
pavgw |
average unsigned 16-bit |
__m64 _m_psadbw (__m64, __m64) |
psadbw |
sum of absolute differences |
__m64 _m_pshufw (__m64, const int) |
pshufw |
shuffle 16-bit words |
int _m_pmovmskb (__m64) |
pmovmskb |
returns an 8-bit mask made from the most significant bit of each byte in a vector of 8 8-bit integers |
void _mm_stream_pi (_m64 *,_m64) |
movntq |
store without going through cache |
int _m_pextrw (_m64, const int) |
pextrw |
extract 16-bit value from vector |
_m64 _m_pinsrw (_m64, int, const int) |
pinsrw |
put a 16-bit value into a vector |
_m128 _mm_set_ps (float, float, float, float) |
|
creates a float vector from 4 floats. Last value is in the lowest
position in the vector |
_m128 _mm_setr_ps (float, float, float, float) |
|
creates a float vector from 4 floats. First value is in the lowest
position in the vector |
_m128 _mm_set_ps1 (float) |
|
creates a float vector from a float - all components have the
same value |
_m128 _mm_set_ss (float) |
|
creates a float vector from a float - the 3 highest components
are zero |
_m128 _mm_cvt_si2ss (_m128, int) |
cvtsi2ss |
convert integer to float |
_m128 _mm_sqrt_ss (_m128) |
sqrtss |
square root |
_m128 _mm_rsqrt_ss (_m128) |
rsqrtss |
reciprocal square root approximate |
_m128 _mm_rcp_ss (_m128) |
rcpss |
reciprocal approximate |
_m128 _mm_sqrt_ps (_m128) |
sqrtps |
square root |
_m128 _mm_rsqrt_ps (_m128) |
rsqrtps |
reciprocal square root approximate |
_m128 _mm_rcp_ps (_m128) |
rcpps |
reciprocal approximate |
_m128 _mm_loadu_ps (_m128 *) |
movups |
un-aligned load |
void _mm_storeu_ps (_m128 *, _m128) |
movups |
un-aligned store |
_m128 _mm_load_ps (_m128 *) |
movaps |
aligned load |
_m128 _mm_loadr_ps (_m128 *) |
movaps,
shufps
|
aligned load and reverse order of vector |
void _mm_prefetch (void *, const int) |
prefetcht0
prefetcht1
prefetcht2
prefetchnta |
prefetch data into cache |
void _mm_stream_ps (_m128 *, _m128) |
movntps |
store without going through cache |
_m128 _mm_load_ss (float *) |
movss |
load float |
_m128 _mm_load_ps1 (float *) |
movss
shufps |
load float and duplicate into vector |
void _mm_store_ps (_m128 *, _m128) |
movaps |
store float vector aligned |
void _mm_store_ps1 (_m128 *, _m128) |
shufps
movaps |
store the lowest float duplicated into all 4 components, aligned |
void _mm_storer_ps (_m128 *, _m128) |
shufps
movaps |
store float vector aligned after reversing components |
void _mm_store_ss (float *, _m128) |
movss |
store float |
_m128 _mm_add_ps (_m128, _m128) |
addps |
vector arithmetic |
_m128 _mm_sub_ps (_m128, _m128) |
subps |
_m128 _mm_mul_ps (_m128, _m128) |
mulps |
_m128 _mm_div_ps (_m128, _m128) |
divps |
_m128 _mm_add_ss (_m128, _m128) |
addss |
float arithmetic |
_m128 _mm_mul_ss (_m128, _m128) |
mulss |
_m128 _mm_div_ss (_m128, _m128) |
divss |
_m128 _mm_and_ps (_m128, _m128) |
andps |
bitwise logical operations |
_m128 _mm_andnot_ps (_m128, _m128) |
andnps |
_m128 _mm_xor_ps (_m128, _m128) |
xorps |
_m128 _mm_or_ps (_m128, _m128) |
orps |
_m128 _mm_min_ss (_m128, _m128) |
minss |
minimum float |
_m128 _mm_max_ss (_m128, _m128) |
maxss |
maximum float |
_m128 _mm_cmpeq_ss (_m128, _m128) |
cmpss |
comparisons |
_m128 _mm_cmpneq_ss (_m128, _m128) |
_m128 _mm_cmplt_ss (_m128, _m128) |
_m128 _mm_cmpgt_ss (_m128, _m128) |
_m128 _mm_cmple_ss (_m128, _m128) |
_m128 _mm_cmpge_ss (_m128, _m128) |
_m128 _mm_cmpnlt_ss (_m128, _m128) |
_m128 _mm_cmpnle_ss (_m128, _m128) |
_m128 _mm_cmpngt_ss (_m128, _m128) |
_m128 _mm_cmpnge_ss (_m128, _m128) |
_m128 _mm_min_ps (_m128, _m128) |
minps |
minimum float vector |
_m128 _mm_max_ps (_m128, _m128) |
maxps |
maximum float vector |
_m128 _mm_cmpeq_ps (_m128, _m128) |
cmpps |
comparison of float vectors |
_m128 _mm_cmpneq_ps (_m128, _m128) |
_m128 _mm_cmplt_ps (_m128, _m128) |
_m128 _mm_cmpnlt_ps (_m128, _m128) |
_m128 _mm_cmpnle_ps (_m128, _m128) |
_m128 _mm_cmpnle_ps (_m128, _m128) |
_m128 _mm_cmpngt_ps (_m128, _m128) |
_m128 _mm_cmpnge_ps (_m128, _m128) |
_m128 _mm_cmpgt_ps (_m128, _m128) |
_m128 _mm_cmple_ps (_m128, _m128) |
_m128 _mm_cmpge_ps (_m128, _m128) |
_m128 _mm_cmpord_ps (_m128, _m128) |
_m128 _mm_cmpunord_ps (_m128, _m128) |
_m128 _mm_unpackhi_ps (_m128, _m128) |
unpckhps |
interleave the 2 high floats of two vectors |
_m128 _mm_unpacklo_ps (_m128, _m128) |
unpcklps |
interleave the 2 low floats of two vectors |
_m128 _mm_movehl_ps (_m128, _m128) |
movhlps |
move high to low |
_m128 _mm_movelh_ps (_m128, _m128) |
movlhps |
move low to high |
_m128 _mm_move_ss (_m128, _m128) |
movss |
move single floating-point value into the lowest position of a vector
register - the 3 high values are taken from the first operand |
int _mm_comieq_ss (_m128, _m128) |
comiss |
floating-point comparisons |
int _mm_comilt_ss (_m128, _m128) |
int _mm_comieq_ss (_m128, _m128) |
int _mm_comineq_ss (_m128, _m128) |
int _mm_comile_ss (_m128, _m128) |
int _mm_comigt_ss (_m128, _m128) |
int _mm_comige_ss (_m128, _m128) |
int _mm_ucomieq_ss (_m128, _m128) |
ucomiss |
int _mm_ucomilt_ss (_m128, _m128) |
int _mm_ucomineq_ss (_m128, _m128) |
int _mm_ucomile_ss (_m128, _m128) |
int _mm_ucomigt_ss (_m128, _m128) |
int _mm_ucomige_ss (_m128, _m128) |
_m128 _mm_loadh_pi (_m128, _m64 *) |
movhps |
load high 64-bits |
_m128 _mm_loadl_pi (_m128, _m64 *) |
movlps |
load low 64-bits |
void _mm_storeh_pi (_m64 *, _m128) |
movhps |
store high 64-bits |
void _mm_storel_pi (_m64 *, _m128) |
movlps |
store low 64-bits |
int _mm_cvt_ss2si (_m128) |
cvtss2si |
convert float to int |
int _mm_cvtt_ss2si (_m128) |
cvttss2si |
convert float to int with truncation |
_m128 _mm_cvt_si2ss (_m128, int) |
cvtsi2ss |
convert int to float |
_m128 _mm_cvt_pi2ps (_m128, _m64) |
cvtpi2ps |
convert int vector to float vector |
_m128 _mm_shuffle_ps (_m128, _m128, const int) |
shufps |
shuffle |
void _mm_sfence (void) |
sfence |
store fence |
int _mm_getcsr (void) |
|
get SSE control register |
void _mm_setcsr (int) |
|
set SSE control register |
_m128 _mm_setzero_ps (void) |
xorps |
set an SSE register to 0 |