setting the speed for the future of games programming

INTRINSIC FUNCTIONS

Intrinsic functions are functions that are handled directly by the compiler, instead of generating a call. VectorC will only recognise a function as an intrinsic function if it is correctly prototyped.

Standard Intrinsic Functions

`int abs (int)`	returns the absolute value of an integer
`double fabs (double)`	returns the absolute value of a floating-point value. With the single-precision command-line option, this function returns a float if the parameter and expression are single precision.
`float fabs (float)`	If you declare this prototype, "fabs" will always be single precision
`double sqrt (double)`	returns the square root of a floating-point value. With the single-precision command-line option, this function returns a float if the parameter and expression are single precision. This function can also have `__hint__((precision(12)))` applied to it - see hints.
`__alloca (int)`	Allocates space on the stack. Usually, there will be a macro called "alloca" which calls this intrinsic function.
`memcpy (void *, void *, unsigned int)`	Copies a section of memory.
`double cos (double)`
`double sin (double)`

VectorC-Specific Intrinsic Functions

int __cpu_supported (void) returns 1 if the program is being run on a processor that supports the target processor selected when compiled. This should be called right at the start of the program before the processor has had a chance to execute any possibly unsupported instructions. Use for giving a message like: "This program will not run on this processor".

MMX Intrinsic Functions

These functions are specified by Intel and are now supported by many compilers. Use these functions as an alternative to VectorC's vectorizer, but remember that your code will require an MMX processor and to be compiled for an MMX target processor.

`Function`	`MMX instruction`	Comments
`__m64 _m_paddb (__m64, __m64)`	`paddb`	vector addition
`__m64 _m_paddw (__m64, __m64)`	`paddw`
`__m64 _m_paddd (__m64, __m64)`	`paddd`
`__m64 _m_psubb (__m64, __m64)`	`psubb`	vector subtraction
`__m64 _m_psubw (__m64, __m64)`	`psubw`
`__m64 _m_psubd (__m64, __m64)`	`psubd`
`__m64 _m_pand (__m64, __m64)`	`pand`	bitwise logical operations performed on 64-bit values
`__m64 _m_pandn (__m64, __m64)`	`pandn`
`__m64 _m_por (__m64, __m64)`	`por`
`__m64 _m_pxor (__m64, __m64)`	`pxor`
`__m64 _m_paddsb (__m64, __m64)`	`paddsb`	saturated arithmetic
`__m64 _m_paddb (__m64, __m64)`	`paddb`
`__m64 _m_psubsb (__m64, __m64)`	`psubsb`
`__m64 _m_paddusb (__m64, __m64)`	`paddusb`
`__m64 _m_psubusb (__m64, __m64)`	`psubusb`
`__m64 _m_paddsw (__m64, __m64)`	`paddsw`
`__m64 _m_psubsw (__m64, __m64)`	`psubsw`
`__m64 _m_paddusw (__m64, __m64)`	`paddusw`
`__m64 _m_psubusw (__m64, __m64)`	`psubusw`
`__m64 _m_packsswb (__m64, __m64)`	`packsswb`	convert from a large component type to a smaller component type with saturation
`__m64 _m_packssdw (__m64, __m64)`	`packssdw`
`__m64 _m_packuswb (__m64, __m64)`	`packuswb`
`__m64 _m_punpcklbw (__m64, __m64)`	`punpcklbw`
`__m64 _m_punpcklwd (__m64, __m64)`	`punpcklwd`
`__m64 _m_punpckldq (__m64, __m64)`	`punpckldq`
`__m64 _m_punpckhbw (__m64, __m64)`	`punpckhbw`
`__m64 _m_punpckhwd (__m64, __m64)`	`punpckhwd`
`__m64 _m_punpckhdq (__m64, __m64)`	`punpckhdq`
`__m64 _m_pmullw (__m64, __m64)`	`pmullw`	multiply 16-bit
`__m64 _m_pmulhw (__m64, __m64)`	`pmulhw`	multiply 16-bit by 16-bit with 32-bit result, then shift right arithmetic by 16
`__m64 _m_pmaddwd (__m64, __m64)`	`pmaddwd`	multiply 16-bit to 32-bit result and add lower 2 and higher 2 components into 2 32-bit results
`__m64 _m_psllw (__m64, __m64)`	`psllw`	shift instructions
`__m64 _m_pslld (__m64, __m64)`	`pslld`
`__m64 _m_psllq (__m64, __m64)`	`psllq`
`__m64 _m_psraw (__m64, __m64)`	`psraw`
`__m64 _m_psrad (__m64, __m64)`	`psrad`
`__m64 _m_psrlw (__m64, __m64)`	`psrlw`
`__m64 _m_psrld (__m64, __m64)`	`psrld`
`__m64 _m_psrlq (__m64, __m64)`	`psrlq`
`__m64 _m_pcmpeqb (__m64, __m64)`	`pcmpeqb`	vector comparison instructions. Can be used with "pand", "pandn" and "por" to create a vector conditional move
`__m64 _m_pcmpeqw (__m64, __m64)`	`pcmpeqw`
`__m64 _m_pcmpeqd (__m64, __m64)`	`pcmpeqd`
`__m64 _m_pcmpgtb (__m64, __m64)`	`pcmpgtb`
`__m64 _m_pcmpgtw (__m64, __m64)`	`pcmpgtw`
`__m64 _m_pcmpgtd (__m64, __m64)`	`pcmpgtd`
`__m64 _m_psllwi (__m64, int)`	`psllw`	Alternative form of shift instructions using an int shift value
`__m64 _m_pslldi (__m64, int)`	`pslld`
`__m64 _m_psllqi (__m64, int)`	`psllq`
`__m64 _m_psrawi (__m64, int)`	`psraw`
`__m64 _m_psradi (__m64, int)`	`psrad`
`__m64 _m_psrlwi (__m64, int)`	`psrlw`
`__m64 _m_psrldi (__m64, int)`	`psrld`
`__m64 _m_psrlqi (__m64, int)`	`psrlq`
`__m64 _m_from_int (int)`	`movd`	Create a 64-bit MMX value from an integer - the high 32 bits are zero
`int _m_to_int (__m64)`	`movd`	Extract the low 32 bits from an MMX value
`void _m_empty (void)`	`emms`	Ignored by VectorC - the placement of emms instructions is done automatically
`__m64 _mm_set_pi32 (int, int)`		Create a 64-bit MMX vector from components. Last value is in the lowest position in the vector
`__m64 _mm_set_pi16 (short, short, short, short)`
`__m64 _mm_set_pi8 (char, char, char, char, char, char, char, char)`
`__m64 _mm_set1_pi32 (int)`		Create a 64-bit MMX vector from a single component. The component is duplicated
`__m64 _mm_set1_pi16 (short)`
`__m64 _mm_set1_pi8 (char)`
`__m64 _mm_setr_pi32 (int, int)`		Create a 64-bit MMX vector from components. First value is in the lowest position in the vector
`__m64 _mm_setr_pi16 (short, short, short, short)`
`__m64 _mm_setr_pi8 (char, char, char, char, char, char, char, char)`

SSE Intrinsic Functions

These functions are specified by Intel and are now supported by many compilers. Use these functions as an alternative to VectorC's vectorizer, but remember that your code will require an SSE processor and to be compiled for an SSE target processor.

Function	SSE Instruction	Comments
`__m64 _m_pmulhuw (__m64, __m64)`	pmulhuw	multiply 16-bit unsigned by 16-bit unsigned to 32-bit temporary, shift right 16
`__m64 _m_pmaxsw (__m64, __m64)`	pmaxsw	maximum signed 16-bit
`__m64 _m_pminsw (__m64, __m64)`	pminsw	minimum signed 16-bit
`__m64 _m_pmaxub (__m64, __m64)`	pmaxub	maximum unsigned 8-bit
`__m64 _m_pminub (__m64, __m64)`	pminub	minimum unsigned 8-bit
`__m64 _m_pavgb (__m64, __m64)`	pavgb	average unsigned 8-bit
`__m64 _m_pavgw (__m64, __m64)`	pavgw	average unsigned 16-bit
`__m64 _m_psadbw (__m64, __m64)`	psadbw	sum of absolute differences
`__m64 _m_pshufw (__m64, const int)`	pshufw	shuffle 16-bit words
`int _m_pmovmskb (__m64)`	pmovmskb	returns an 8-bit mask built from the most significant bit of each byte in a vector of 8 8-bit integers
`void _mm_stream_pi (__m64 *,__m64)`	movntq	store without going through cache
`int _m_pextrw (__m64, const int)`	pextrw	extract 16-bit value from vector
`__m64 _m_pinsrw (__m64, int, const int)`	pinsrw	put a 16-bit value into a vector
`_m128 _mm_set_ps (float, float, float, float)`		creates a float vector from 4 floats. Last value is in the lowest position in the vector
`_m128 _mm_setr_ps (float, float, float, float)`		creates a float vector from 4 floats. First value is in the lowest position in the vector
`_m128 _mm_set_ps1 (float)`		creates a float vector from a float - all components have the same value
`_m128 _mm_set_ss (float)`		creates a float vector from a float - the 3 highest components are zero
`_m128 _mm_cvt_si2ss (int)`	cvtsi2ss	convert integer to float
`_m128 _mm_sqrt_ss (_m128)`	sqrtss	square root
`_m128 _mm_rsqrt_ss (_m128)`	rsqrtss	reciprocal square root approximate
`_m128 _mm_rcp_ss (_m128)`	rcpss	reciprocal approximate
`_m128 _mm_sqrt_ps (_m128)`	sqrtps	square root
`_m128 _mm_rsqrt_ps (_m128)`	rsqrtps	reciprocal square root approximate
`_m128 _mm_rcp_ps (_m128)`	rcpps	reciprocal approximate
`_m128 _mm_loadu_ps (_m128 *)`	movups	un-aligned load
`void _mm_storeu_ps (_m128 *, _m128)`	movups	un-aligned store
`_m128 _mm_load_ps (_m128 *)`	movaps	aligned load
`_m128 _mm_loadr_ps (_m128 *)`	movaps, shufps	aligned load and reverse order of vector
`void _mm_prefetch (void *, const int)`	prefetcht0 prefetcht1 prefetcht2 prefetchnta	prefetch data into cache
`void _mm_stream_ps (_m128 *, _m128)`	movntps	store without going through cache
`_m128 _mm_load_ss (float *)`	movss	load float
`_m128 _mm_load_ps1 (float *)`	movss shufps	load float and duplicate into vector
`void _mm_store_ps (_m128 *, _m128)`	movaps	store float vector aligned
`void _mm_store_ps1 (_m128 *, _m128)`	shufps movaps
`void _mm_storer_ps (_m128 *, _m128)`	shufps movaps	store float vector aligned after reversing components
`void _mm_store_ss (float *, _m128)`	movss	store float
`_m128 _mm_add_ps (_m128, _m128)`	addps	vector arithmetic
`_m128 _mm_sub_ps (_m128, _m128)`	subps
`_m128 _mm_mul_ps (_m128, _m128)`	mulps
`_m128 _mm_div_ps (_m128, _m128)`	divps
`_m128 _mm_add_ss (_m128, _m128)`	addss	float arithmetic
`_m128 _mm_mul_ss (_m128, _m128)`	mulss
`_m128 _mm_div_ss (_m128, _m128)`	divss
`_m128 _mm_and_ps (_m128, _m128)`	andps	bitwise logical operations
`_m128 _mm_andnot_ps (_m128, _m128)`	andnps
`_m128 _mm_xor_ps (_m128, _m128)`	xorps
`_m128 _mm_or_ps (_m128, _m128)`	orps
`_m128 _mm_min_ss (_m128, _m128)`	minss	minimum float
`_m128 _mm_max_ss (_m128, _m128)`	maxss	maximum float
`_m128 _mm_cmpeq_ss (_m128, _m128)`	cmpss	comparisons
`_m128 _mm_cmpneq_ss (_m128, _m128)`
`_m128 _mm_cmplt_ss (_m128, _m128)`
`_m128 _mm_cmpgt_ss (_m128, _m128)`
`_m128 _mm_cmple_ss (_m128, _m128)`
`_m128 _mm_cmpge_ss (_m128, _m128)`
`_m128 _mm_cmpnlt_ss (_m128, _m128)`
`_m128 _mm_cmpnle_ss (_m128, _m128)`
`_m128 _mm_cmpngt_ss (_m128, _m128)`
`_m128 _mm_cmpnge_ss (_m128, _m128)`
`_m128 _mm_min_ps (_m128, _m128)`	minps	minimum float vector
`_m128 _mm_max_ps (_m128, _m128)`	maxps	maximum float vector
`_m128 _mm_cmpeq_ps (_m128, _m128)`	cmpps	comparison of float vectors
`_m128 _mm_cmpneq_ps (_m128, _m128)`
`_m128 _mm_cmplt_ps (_m128, _m128)`
`_m128 _mm_cmpnlt_ps (_m128, _m128)`
`_m128 _mm_cmpnle_ps (_m128, _m128)`
`_m128 _mm_cmpnle_ps (_m128, _m128)`
`_m128 _mm_cmpngt_ps (_m128, _m128)`
`_m128 _mm_cmpnge_ps (_m128, _m128)`
`_m128 _mm_cmpgt_ps (_m128, _m128)`
`_m128 _mm_cmple_ps (_m128, _m128)`
`_m128 _mm_cmpge_ps (_m128, _m128)`
`_m128 _mm_cmpord_ps (_m128, _m128)`
`_m128 _mm_cmpunord_ps (_m128, _m128)`
`_m128 _mm_unpackhi_ps (_m128, _m128)`	unpckhps
`_m128 _mm_unpacklo_ps (_m128, _m128)`	unpcklps
`_m128 _mm_movehl_ps (_m128, _m128)`	movhlps	move high to low
`_m128 _mm_movelh_ps (_m128, _m128)`	movlhps	move low to high
`_m128 _mm_move_ss (_m128, _m128)`	movss	move single floating-point value from the second argument into the lowest position of the vector - the 3 highest components are taken from the first argument
`int _mm_comieq_ss (_m128, _m128)`	comiss	floating-point comparisons
`int _mm_comilt_ss (_m128, _m128)`
`int _mm_comieq_ss (_m128, _m128)`
`int _mm_comineq_ss (_m128, _m128)`
`int _mm_comile_ss (_m128, _m128)`
`int _mm_comigt_ss (_m128, _m128)`
`int _mm_comige_ss (_m128, _m128)`
`int _mm_ucomieq_ss (_m128, _m128)`	ucomiss
`int _mm_ucomilt_ss (_m128, _m128)`
`int _mm_ucomineq_ss (_m128, _m128)`
`int _mm_ucomile_ss (_m128, _m128)`
`int _mm_ucomigt_ss (_m128, _m128)`
`int _mm_ucomige_ss (_m128, _m128)`
`_m128 _mm_loadh_pi (_m128, _m64 *)`	movhps	load high 64-bits
`_m128 _mm_loadl_pi (_m128, _m64 *)`	movlps	load low 64-bits
`void _mm_storeh_pi (_m64 *, _m128)`	movhps	store high 64-bits
`void _mm_storel_pi (_m64 *, _m128)`	movlps	store low 64-bits
`int _mm_cvt_ss2si (_m128)`	cvtss2si	convert float to int
`int _mm_cvtt_ss2si (_m128)`	cvttss2si	convert float to int with truncation
`_m128 _mm_cvt_si2ss (_m128, int)`	cvtsi2ss	convert int to float
`_m128 _mm_cvt_pi2ps (_m128, _m64)`	cvtpi2ps	convert int vector to float vector
`_m128 _mm_shuffle_ps (_m128, _m128, const int)`	shufps	shuffle
`void _mm_sfence (void)`	sfence	store fence
`int _mm_getcsr (void)`		get SSE control register
`void _mm_setcsr (int)`		set SSE control register
`_m128 _mm_setzero_ps (void)`	xorps	set an SSE register to 0