b4c522fabd
ChangeLog: * Makefile.def (target_modules): Add libphobos. (flags_to_pass): Add GDC, GDCFLAGS, GDC_FOR_TARGET and GDCFLAGS_FOR_TARGET. (dependencies): Make libphobos depend on libatomic, libbacktrace configure, and zlib configure. (language): Add language d. * Makefile.in: Rebuild. * Makefile.tpl (BUILD_EXPORTS): Add GDC and GDCFLAGS. (HOST_EXPORTS): Add GDC. (POSTSTAGE1_HOST_EXPORTS): Add GDC and GDC_FOR_BUILD. (BASE_TARGET_EXPORTS): Add GDC. (GDC_FOR_BUILD, GDC, GDCFLAGS): New variables. (GDC_FOR_TARGET, GDC_FLAGS_FOR_TARGET): New variables. (EXTRA_HOST_FLAGS): Add GDC. (STAGE1_FLAGS_TO_PASS): Add GDC. (EXTRA_TARGET_FLAGS): Add GDC and GDCFLAGS. * config-ml.in: Treat GDC and GDCFLAGS like other compiler/flag environment variables. * configure: Rebuild. * configure.ac: Add target-libphobos to target_libraries. Set and substitute GDC_FOR_BUILD and GDC_FOR_TARGET. config/ChangeLog: * multi.m4: Set GDC. gcc/ChangeLog: * Makefile.in (tm_d_file_list, tm_d_include_list): New variables. (TM_D_H, D_TARGET_DEF, D_TARGET_H, D_TARGET_OBJS): New variables. (tm_d.h, cs-tm_d.h, default-d.o): New rules. (d/d-target-hooks-def.h, s-d-target-hooks-def-h): New rules. (s-tm-texi): Also check timestamp on d-target.def. (generated_files): Add TM_D_H and d-target-hooks-def.h. (build/genhooks.o): Also depend on D_TARGET_DEF. * config.gcc (tm_d_file, d_target_objs, target_has_targetdm): New variables. * config/aarch64/aarch64-d.c: New file. * config/aarch64/aarch64-linux.h (GNU_USER_TARGET_D_CRITSEC_SIZE): Define. * config/aarch64/aarch64-protos.h (aarch64_d_target_versions): New prototype. * config/aarch64/aarch64.h (TARGET_D_CPU_VERSIONS): Define. * config/aarch64/t-aarch64 (aarch64-d.o): New rule. * config/arm/arm-d.c: New file. * config/arm/arm-protos.h (arm_d_target_versions): New prototype. * config/arm/arm.h (TARGET_D_CPU_VERSIONS): Define. * config/arm/linux-eabi.h (EXTRA_TARGET_D_OS_VERSIONS): Define. * config/arm/t-arm (arm-d.o): New rule. * config/default-d.c: New file. 
* config/glibc-d.c: New file. * config/gnu.h (GNU_USER_TARGET_D_OS_VERSIONS): Define. * config/i386/i386-d.c: New file. * config/i386/i386-protos.h (ix86_d_target_versions): New prototype. * config/i386/i386.h (TARGET_D_CPU_VERSIONS): Define. * config/i386/linux-common.h (EXTRA_TARGET_D_OS_VERSIONS): Define. (GNU_USER_TARGET_D_CRITSEC_SIZE): Define. * config/i386/t-i386 (i386-d.o): New rule. * config/kfreebsd-gnu.h (GNU_USER_TARGET_D_OS_VERSIONS): Define. * config/kopensolaris-gnu.h (GNU_USER_TARGET_D_OS_VERSIONS): Define. * config/linux-android.h (ANDROID_TARGET_D_OS_VERSIONS): Define. * config/linux.h (GNU_USER_TARGET_D_OS_VERSIONS): Define. * config/mips/linux-common.h (EXTRA_TARGET_D_OS_VERSIONS): Define. * config/mips/mips-d.c: New file. * config/mips/mips-protos.h (mips_d_target_versions): New prototype. * config/mips/mips.h (TARGET_D_CPU_VERSIONS): Define. * config/mips/t-mips (mips-d.o): New rule. * config/powerpcspe/linux.h (GNU_USER_TARGET_D_OS_VERSIONS): Define. * config/powerpcspe/linux64.h (GNU_USER_TARGET_D_OS_VERSIONS): Define. * config/powerpcspe/powerpcspe-d.c: New file. * config/powerpcspe/powerpcspe-protos.h (rs6000_d_target_versions): New prototype. * config/powerpcspe/powerpcspe.c (rs6000_output_function_epilogue): Support GNU D by using 0 as the language type. * config/powerpcspe/powerpcspe.h (TARGET_D_CPU_VERSIONS): Define. * config/powerpcspe/t-powerpcspe (powerpcspe-d.o): New rule. * config/riscv/riscv-d.c: New file. * config/riscv/riscv-protos.h (riscv_d_target_versions): New prototype. * config/riscv/riscv.h (TARGET_D_CPU_VERSIONS): Define. * config/riscv/t-riscv (riscv-d.o): New rule. * config/rs6000/linux.h (GNU_USER_TARGET_D_OS_VERSIONS): Define. * config/rs6000/linux64.h (GNU_USER_TARGET_D_OS_VERSIONS): Define. * config/rs6000/rs6000-d.c: New file. * config/rs6000/rs6000-protos.h (rs6000_d_target_versions): New prototype. * config/rs6000/rs6000.c (rs6000_output_function_epilogue): Support GNU D by using 0 as the language type. 
* config/rs6000/rs6000.h (TARGET_D_CPU_VERSIONS): Define. * config/rs6000/t-rs6000 (rs6000-d.o): New rule. * config/s390/s390-d.c: New file. * config/s390/s390-protos.h (s390_d_target_versions): New prototype. * config/s390/s390.h (TARGET_D_CPU_VERSIONS): Define. * config/s390/t-s390 (s390-d.o): New rule. * config/sparc/sparc-d.c: New file. * config/sparc/sparc-protos.h (sparc_d_target_versions): New prototype. * config/sparc/sparc.h (TARGET_D_CPU_VERSIONS): Define. * config/sparc/t-sparc (sparc-d.o): New rule. * config/t-glibc (glibc-d.o): New rule. * configure: Regenerated. * configure.ac (tm_d_file): New variable. (tm_d_file_list, tm_d_include_list, d_target_objs): Add substitutes. * doc/contrib.texi (Contributors): Add self for the D frontend. * doc/frontends.texi (G++ and GCC): Mention D as a supported language. * doc/install.texi (Configuration): Mention libphobos as an option for --enable-shared. Mention d as an option for --enable-languages. (Testing): Mention check-d as a target. * doc/invoke.texi (Overall Options): Mention .d, .dd, and .di as file name suffixes. Mention d as a -x option. * doc/sourcebuild.texi (Top Level): Mention libphobos. * doc/standards.texi (Standards): Add section on D language. * doc/tm.texi: Regenerated. * doc/tm.texi.in: Add @node for D language and ABI, and @hook for TARGET_CPU_VERSIONS, TARGET_D_OS_VERSIONS, and TARGET_D_CRITSEC_SIZE. * dwarf2out.c (is_dlang): New function. (gen_compile_unit_die): Use DW_LANG_D for D. (declare_in_namespace): Return module die for D, instead of adding extra declarations into the namespace. (gen_namespace_die): Generate DW_TAG_module for D. (gen_decl_die): Handle CONST_DECLSs for D. (dwarf2out_decl): Likewise. (prune_unused_types_walk_local_classes): Handle DW_tag_interface_type. (prune_unused_types_walk): Handle DW_tag_interface_type same as other kinds of aggregates. * gcc.c (default_compilers): Add entries for .d, .dd and .di. * genhooks.c: Include d/d-target.def. 
gcc/po/ChangeLog: * EXCLUDES: Add sources from d/dmd. gcc/testsuite/ChangeLog: * gcc.misc-tests/help.exp: Add D to option descriptions check. * gdc.dg/asan/asan.exp: New file. * gdc.dg/asan/gdc272.d: New test. * gdc.dg/compilable.d: New test. * gdc.dg/dg.exp: New file. * gdc.dg/gdc254.d: New test. * gdc.dg/gdc260.d: New test. * gdc.dg/gdc270a.d: New test. * gdc.dg/gdc270b.d: New test. * gdc.dg/gdc282.d: New test. * gdc.dg/gdc283.d: New test. * gdc.dg/imports/gdc170.d: New test. * gdc.dg/imports/gdc231.d: New test. * gdc.dg/imports/gdc239.d: New test. * gdc.dg/imports/gdc241a.d: New test. * gdc.dg/imports/gdc241b.d: New test. * gdc.dg/imports/gdc251a.d: New test. * gdc.dg/imports/gdc251b.d: New test. * gdc.dg/imports/gdc253.d: New test. * gdc.dg/imports/gdc254a.d: New test. * gdc.dg/imports/gdc256.d: New test. * gdc.dg/imports/gdc27.d: New test. * gdc.dg/imports/gdcpkg256/package.d: New test. * gdc.dg/imports/runnable.d: New test. * gdc.dg/link.d: New test. * gdc.dg/lto/lto.exp: New file. * gdc.dg/lto/ltotests_0.d: New test. * gdc.dg/lto/ltotests_1.d: New test. * gdc.dg/runnable.d: New test. * gdc.dg/simd.d: New test. * gdc.test/gdc-test.exp: New file. * lib/gdc-dg.exp: New file. * lib/gdc.exp: New file. libphobos/ChangeLog: * Makefile.am: New file. * Makefile.in: New file. * acinclude.m4: New file. * aclocal.m4: New file. * config.h.in: New file. * configure: New file. * configure.ac: New file. * d_rules.am: New file. * libdruntime/Makefile.am: New file. * libdruntime/Makefile.in: New file. * libdruntime/__entrypoint.di: New file. * libdruntime/__main.di: New file. * libdruntime/gcc/attribute.d: New file. * libdruntime/gcc/backtrace.d: New file. * libdruntime/gcc/builtins.d: New file. * libdruntime/gcc/config.d.in: New file. * libdruntime/gcc/deh.d: New file. * libdruntime/gcc/libbacktrace.d.in: New file. * libdruntime/gcc/unwind/arm.d: New file. * libdruntime/gcc/unwind/arm_common.d: New file. * libdruntime/gcc/unwind/c6x.d: New file. 
* libdruntime/gcc/unwind/generic.d: New file. * libdruntime/gcc/unwind/package.d: New file. * libdruntime/gcc/unwind/pe.d: New file. * m4/autoconf.m4: New file. * m4/druntime.m4: New file. * m4/druntime/cpu.m4: New file. * m4/druntime/libraries.m4: New file. * m4/druntime/os.m4: New file. * m4/gcc_support.m4: New file. * m4/gdc.m4: New file. * m4/libtool.m4: New file. * src/Makefile.am: New file. * src/Makefile.in: New file. * src/libgphobos.spec.in: New file. * testsuite/Makefile.am: New file. * testsuite/Makefile.in: New file. * testsuite/config/default.exp: New file. * testsuite/lib/libphobos-dg.exp: New file. * testsuite/lib/libphobos.exp: New file. * testsuite/testsuite_flags.in: New file. From-SVN: r265573
554 lines
15 KiB
D
554 lines
15 KiB
D
// Written in the D programming language.
|
|
|
|
/**
|
|
* Builtin SIMD intrinsics
|
|
*
|
|
* Source: $(DRUNTIMESRC core/_simd.d)
|
|
*
|
|
* Copyright: Copyright Digital Mars 2012.
|
|
* License: $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
|
|
* Authors: $(WEB digitalmars.com, Walter Bright),
|
|
*/
|
|
|
|
/* NOTE: This file has been patched from the original DMD distribution to
|
|
* work with the GDC compiler.
|
|
*/
|
|
// Module declaration: this file is the module core.simd.
module core.simd;

// Attribute labels. Each `attr:` form applies to the declarations that
// follow it in this scope, so everything below is declared pure, nothrow,
// @safe and @nogc.
pure:
nothrow:
@safe:
@nogc:
|
|
|
|
/**
 * Creates a SIMD vector type from a static array type.
 *
 * Params:
 *      T = one of double[2], float[4], void[16], byte[16], ubyte[16],
 *          short[8], ushort[8], int[4], uint[4], long[2], ulong[2].
 *          For 256 bit vectors,
 *          one of double[4], float[8], void[32], byte[32], ubyte[32],
 *          short[16], ushort[16], int[8], uint[8], long[4], ulong[4]
 */
template Vector(T)
{
    // __vector is compiler magic; this eponymous template hides it.
    // The compiler will reject T's that don't work.
    alias Vector = __vector(T);
}
|
|
|
|
/* Handy aliases.
 * Each alias is only declared when the compiler accepts the underlying
 * vector type for the current target.
 */

// 64 bit vectors
static if (is(Vector!(void[8])))    alias void8    = Vector!(void[8]);    ///
static if (is(Vector!(float[2])))   alias float2   = Vector!(float[2]);   ///
static if (is(Vector!(byte[8])))    alias byte8    = Vector!(byte[8]);    ///
static if (is(Vector!(ubyte[8])))   alias ubyte8   = Vector!(ubyte[8]);   ///
static if (is(Vector!(short[4])))   alias short4   = Vector!(short[4]);   ///
static if (is(Vector!(ushort[4])))  alias ushort4  = Vector!(ushort[4]);  ///
static if (is(Vector!(int[2])))     alias int2     = Vector!(int[2]);     ///
static if (is(Vector!(uint[2])))    alias uint2    = Vector!(uint[2]);    ///

// 128 bit vectors
static if (is(Vector!(void[16])))   alias void16   = Vector!(void[16]);   ///
static if (is(Vector!(double[2])))  alias double2  = Vector!(double[2]);  ///
static if (is(Vector!(float[4])))   alias float4   = Vector!(float[4]);   ///
static if (is(Vector!(byte[16])))   alias byte16   = Vector!(byte[16]);   ///
static if (is(Vector!(ubyte[16])))  alias ubyte16  = Vector!(ubyte[16]);  ///
static if (is(Vector!(short[8])))   alias short8   = Vector!(short[8]);   ///
static if (is(Vector!(ushort[8])))  alias ushort8  = Vector!(ushort[8]);  ///
static if (is(Vector!(int[4])))     alias int4     = Vector!(int[4]);     ///
static if (is(Vector!(uint[4])))    alias uint4    = Vector!(uint[4]);    ///
static if (is(Vector!(long[2])))    alias long2    = Vector!(long[2]);    ///
static if (is(Vector!(ulong[2])))   alias ulong2   = Vector!(ulong[2]);   ///

// 256 bit vectors
static if (is(Vector!(void[32])))   alias void32   = Vector!(void[32]);   ///
static if (is(Vector!(double[4])))  alias double4  = Vector!(double[4]);  ///
static if (is(Vector!(float[8])))   alias float8   = Vector!(float[8]);   ///
static if (is(Vector!(byte[32])))   alias byte32   = Vector!(byte[32]);   ///
static if (is(Vector!(ubyte[32])))  alias ubyte32  = Vector!(ubyte[32]);  ///
static if (is(Vector!(short[16])))  alias short16  = Vector!(short[16]);  ///
static if (is(Vector!(ushort[16]))) alias ushort16 = Vector!(ushort[16]); ///
static if (is(Vector!(int[8])))     alias int8     = Vector!(int[8]);     ///
static if (is(Vector!(uint[8])))    alias uint8    = Vector!(uint[8]);    ///
static if (is(Vector!(long[4])))    alias long4    = Vector!(long[4]);    ///
static if (is(Vector!(ulong[4])))   alias ulong4   = Vector!(ulong[4]);   ///
|
|
|
|
version (D_SIMD)
|
|
{
|
|
/** XMM opcodes that conform to the following:
 *
 *  opcode xmm1,xmm2/mem
 *
 * and do not have side effects (i.e. do not write to memory).
 *
 * Each member's value is the instruction's prefix and opcode bytes written
 * as a single hexadecimal number; for example STOD is 0x660F7E for the
 * byte sequence 66 0F 7E. Several members deliberately share a value
 * (LODHPS/MOVLHPS, LODLPS/MOVHLPS, PEXTRD/PEXTRQ, PINSRD/PINSRQ).
 */
enum XMM
{
    // Addition
    ADDSS = 0xF30F58,
    ADDSD = 0xF20F58,
    ADDPS = 0x000F58,
    ADDPD = 0x660F58,
    PADDB = 0x660FFC,
    PADDW = 0x660FFD,
    PADDD = 0x660FFE,
    PADDQ = 0x660FD4,

    // Subtraction
    SUBSS = 0xF30F5C,
    SUBSD = 0xF20F5C,
    SUBPS = 0x000F5C,
    SUBPD = 0x660F5C,
    PSUBB = 0x660FF8,
    PSUBW = 0x660FF9,
    PSUBD = 0x660FFA,
    PSUBQ = 0x660FFB,

    // Multiplication
    MULSS = 0xF30F59,
    MULSD = 0xF20F59,
    MULPS = 0x000F59,
    MULPD = 0x660F59,
    PMULLW = 0x660FD5,

    // Division
    DIVSS = 0xF30F5E,
    DIVSD = 0xF20F5E,
    DIVPS = 0x000F5E,
    DIVPD = 0x660F5E,

    PAND = 0x660FDB,
    POR = 0x660FEB,

    UCOMISS = 0x000F2E,
    UCOMISD = 0x660F2E,

    XORPS = 0x000F57,
    XORPD = 0x660F57,

    // Use STO and LOD instead of MOV to distinguish the direction
    STOSS = 0xF30F11,
    STOSD = 0xF20F11,
    STOAPS = 0x000F29,
    STOAPD = 0x660F29,
    STODQA = 0x660F7F,
    STOD = 0x660F7E, // MOVD reg/mem64, xmm 66 0F 7E /r
    STOQ = 0x660FD6,

    LODSS = 0xF30F10,
    LODSD = 0xF20F10,
    LODAPS = 0x000F28,
    LODAPD = 0x660F28,
    LODDQA = 0x660F6F,
    LODD = 0x660F6E, // MOVD xmm, reg/mem64 66 0F 6E /r
    LODQ = 0xF30F7E,

    LODDQU = 0xF30F6F, // MOVDQU xmm1, xmm2/mem128 F3 0F 6F /r
    STODQU = 0xF30F7F, // MOVDQU xmm1/mem128, xmm2 F3 0F 7F /r
    MOVDQ2Q = 0xF20FD6, // MOVDQ2Q mmx, xmm F2 0F D6 /r
    MOVHLPS = 0x0F12, // MOVHLPS xmm1, xmm2 0F 12 /r
    LODHPD = 0x660F16,
    STOHPD = 0x660F17, // MOVHPD mem64, xmm 66 0F 17 /r
    LODHPS = 0x0F16,
    STOHPS = 0x0F17,
    MOVLHPS = 0x0F16,
    LODLPD = 0x660F12,
    STOLPD = 0x660F13,
    LODLPS = 0x0F12,
    STOLPS = 0x0F13,
    MOVMSKPD = 0x660F50,
    MOVMSKPS = 0x0F50,
    MOVNTDQ = 0x660FE7,
    MOVNTI = 0x0FC3,
    MOVNTPD = 0x660F2B,
    MOVNTPS = 0x0F2B,
    MOVNTQ = 0x0FE7,
    MOVQ2DQ = 0xF30FD6,
    LODUPD = 0x660F10,
    STOUPD = 0x660F11,
    LODUPS = 0x0F10,
    STOUPS = 0x0F11,

    PACKSSDW = 0x660F6B,
    PACKSSWB = 0x660F63,
    PACKUSWB = 0x660F67,
    PADDSB = 0x660FEC,
    PADDSW = 0x660FED,
    PADDUSB = 0x660FDC,
    PADDUSW = 0x660FDD,
    PANDN = 0x660FDF,
    PCMPEQB = 0x660F74,
    PCMPEQD = 0x660F76,
    PCMPEQW = 0x660F75,
    PCMPGTB = 0x660F64,
    PCMPGTD = 0x660F66,
    PCMPGTW = 0x660F65,
    PMADDWD = 0x660FF5,
    PSLLW = 0x660FF1,
    PSLLD = 0x660FF2,
    PSLLQ = 0x660FF3,
    PSRAW = 0x660FE1,
    PSRAD = 0x660FE2,
    PSRLW = 0x660FD1,
    PSRLD = 0x660FD2,
    PSRLQ = 0x660FD3,
    PSUBSB = 0x660FE8,
    PSUBSW = 0x660FE9,
    PSUBUSB = 0x660FD8,
    PSUBUSW = 0x660FD9,
    PUNPCKHBW = 0x660F68,
    PUNPCKHDQ = 0x660F6A,
    PUNPCKHWD = 0x660F69,
    PUNPCKLBW = 0x660F60,
    PUNPCKLDQ = 0x660F62,
    PUNPCKLWD = 0x660F61,
    PXOR = 0x660FEF,
    ANDPD = 0x660F54,
    ANDPS = 0x0F54,
    ANDNPD = 0x660F55,
    ANDNPS = 0x0F55,
    CMPPS = 0x0FC2,
    CMPPD = 0x660FC2,
    CMPSD = 0xF20FC2,
    CMPSS = 0xF30FC2,
    COMISD = 0x660F2F,
    COMISS = 0x0F2F,
    CVTDQ2PD = 0xF30FE6,
    CVTDQ2PS = 0x0F5B,
    CVTPD2DQ = 0xF20FE6,
    CVTPD2PI = 0x660F2D,
    CVTPD2PS = 0x660F5A,
    CVTPI2PD = 0x660F2A,
    CVTPI2PS = 0x0F2A,
    CVTPS2DQ = 0x660F5B,
    CVTPS2PD = 0x0F5A,
    CVTPS2PI = 0x0F2D,
    CVTSD2SI = 0xF20F2D,
    CVTSD2SS = 0xF20F5A,
    CVTSI2SD = 0xF20F2A,
    CVTSI2SS = 0xF30F2A,
    CVTSS2SD = 0xF30F5A,
    CVTSS2SI = 0xF30F2D,
    CVTTPD2PI = 0x660F2C,
    CVTTPD2DQ = 0x660FE6,
    CVTTPS2DQ = 0xF30F5B,
    CVTTPS2PI = 0x0F2C,
    CVTTSD2SI = 0xF20F2C,
    CVTTSS2SI = 0xF30F2C,
    MASKMOVDQU = 0x660FF7,
    MASKMOVQ = 0x0FF7,
    MAXPD = 0x660F5F,
    MAXPS = 0x0F5F,
    MAXSD = 0xF20F5F,
    MAXSS = 0xF30F5F,
    MINPD = 0x660F5D,
    MINPS = 0x0F5D,
    MINSD = 0xF20F5D,
    MINSS = 0xF30F5D,
    ORPD = 0x660F56,
    ORPS = 0x0F56,
    PAVGB = 0x660FE0,
    PAVGW = 0x660FE3,
    PMAXSW = 0x660FEE,
    //PINSRW = 0x660FC4,
    PMAXUB = 0x660FDE,
    PMINSW = 0x660FEA,
    PMINUB = 0x660FDA,
    //PMOVMSKB = 0x660FD7,
    PMULHUW = 0x660FE4,
    PMULHW = 0x660FE5,
    PMULUDQ = 0x660FF4,
    PSADBW = 0x660FF6,
    PUNPCKHQDQ = 0x660F6D,
    PUNPCKLQDQ = 0x660F6C,
    RCPPS = 0x0F53,
    RCPSS = 0xF30F53,
    RSQRTPS = 0x0F52,
    RSQRTSS = 0xF30F52,
    SQRTPD = 0x660F51,
    SHUFPD = 0x660FC6,
    SHUFPS = 0x0FC6,
    SQRTPS = 0x0F51,
    SQRTSD = 0xF20F51,
    SQRTSS = 0xF30F51,
    UNPCKHPD = 0x660F15,
    UNPCKHPS = 0x0F15,
    UNPCKLPD = 0x660F14,
    UNPCKLPS = 0x0F14,

    PSHUFD = 0x660F70,
    PSHUFHW = 0xF30F70,
    PSHUFLW = 0xF20F70,
    PSHUFW = 0x0F70,
    // NOTE(review): these two share the bytes 66 0F 73 and differ only in the
    // extra leading byte (07 / 03) - presumably the ModRM /digit; confirm
    // against the compiler's opcode decoding.
    PSLLDQ = 0x07660F73,
    PSRLDQ = 0x03660F73,

    //PREFETCH = 0x0F18,

    // SSE3 Pentium 4 (Prescott)

    ADDSUBPD = 0x660FD0,
    ADDSUBPS = 0xF20FD0,
    HADDPD = 0x660F7C,
    HADDPS = 0xF20F7C,
    HSUBPD = 0x660F7D,
    HSUBPS = 0xF20F7D,
    MOVDDUP = 0xF20F12,
    MOVSHDUP = 0xF30F16,
    MOVSLDUP = 0xF30F12,
    LDDQU = 0xF20FF0,
    MONITOR = 0x0F01C8,
    MWAIT = 0x0F01C9,

    // SSSE3
    PALIGNR = 0x660F3A0F,
    PHADDD = 0x660F3802,
    PHADDW = 0x660F3801,
    PHADDSW = 0x660F3803,
    PABSB = 0x660F381C,
    PABSD = 0x660F381E,
    PABSW = 0x660F381D,
    PSIGNB = 0x660F3808,
    PSIGND = 0x660F380A,
    PSIGNW = 0x660F3809,
    PSHUFB = 0x660F3800,
    PMADDUBSW = 0x660F3804,
    PMULHRSW = 0x660F380B,
    PHSUBD = 0x660F3806,
    PHSUBW = 0x660F3805,
    PHSUBSW = 0x660F3807,

    // SSE4.1

    BLENDPD = 0x660F3A0D,
    BLENDPS = 0x660F3A0C,
    BLENDVPD = 0x660F3815,
    BLENDVPS = 0x660F3814,
    DPPD = 0x660F3A41,
    DPPS = 0x660F3A40,
    EXTRACTPS = 0x660F3A17,
    INSERTPS = 0x660F3A21,
    MPSADBW = 0x660F3A42,
    PBLENDVB = 0x660F3810,
    PBLENDW = 0x660F3A0E,
    PEXTRD = 0x660F3A16,
    PEXTRQ = 0x660F3A16,
    PINSRB = 0x660F3A20,
    PINSRD = 0x660F3A22,
    PINSRQ = 0x660F3A22,

    MOVNTDQA = 0x660F382A,
    PACKUSDW = 0x660F382B,
    PCMPEQQ = 0x660F3829,
    PEXTRB = 0x660F3A14,
    PHMINPOSUW = 0x660F3841,
    PMAXSB = 0x660F383C,
    PMAXSD = 0x660F383D,
    PMAXUD = 0x660F383F,
    PMAXUW = 0x660F383E,
    PMINSB = 0x660F3838,
    PMINSD = 0x660F3839,
    PMINUD = 0x660F383B,
    PMINUW = 0x660F383A,
    PMOVSXBW = 0x660F3820,
    PMOVSXBD = 0x660F3821,
    PMOVSXBQ = 0x660F3822,
    PMOVSXWD = 0x660F3823,
    PMOVSXWQ = 0x660F3824,
    PMOVSXDQ = 0x660F3825,
    PMOVZXBW = 0x660F3830,
    PMOVZXBD = 0x660F3831,
    PMOVZXBQ = 0x660F3832,
    PMOVZXWD = 0x660F3833,
    PMOVZXWQ = 0x660F3834,
    PMOVZXDQ = 0x660F3835,
    PMULDQ = 0x660F3828,
    PMULLD = 0x660F3840,
    PTEST = 0x660F3817,

    ROUNDPD = 0x660F3A09,
    ROUNDPS = 0x660F3A08,
    ROUNDSD = 0x660F3A0B,
    ROUNDSS = 0x660F3A0A,

    // SSE4.2
    PCMPESTRI = 0x660F3A61,
    PCMPESTRM = 0x660F3A60,
    PCMPISTRI = 0x660F3A63,
    PCMPISTRM = 0x660F3A62,
    PCMPGTQ = 0x660F3837,
    //CRC32

    // SSE4a (AMD only)
    // EXTRQ,INSERTQ,MOVNTSD,MOVNTSS

    // POPCNT and LZCNT (have their own CPUID bits)
    POPCNT = 0xF30FB8,
    // LZCNT
}
|
|
|
|
/**
 * Emits a two operand instruction that works on 128 bit XMM operands.
 *
 * This is a compiler magic function - it doesn't behave like
 * regular D functions.
 *
 * Params:
 *      opcode = any of the XMM opcodes; it must be a compile time constant
 *      op1    = first operand
 *      op2    = second operand
 * Returns:
 *      result of opcode
 */
void16 __simd(XMM opcode, void16 op1, void16 op2) pure @safe;
|
|
|
|
/**
 * Single operand (unary) SIMD instructions.
 * Overloads take a 128 bit vector, a double, or a float operand.
 */
void16 __simd(XMM opcode, void16 op1) pure @safe;
void16 __simd(XMM opcode, double d) pure @safe; ///
void16 __simd(XMM opcode, float f) pure @safe; ///
|
|
|
|
/****
 * Two operand form with an additional immediate byte.
 *
 * For instructions:
 * CMPPD, CMPSS, CMPSD, CMPPS,
 * PSHUFD, PSHUFHW, PSHUFLW,
 * BLENDPD, BLENDPS, DPPD, DPPS,
 * MPSADBW, PBLENDW,
 * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
 *
 * Params:
 *      opcode = any of the above XMM opcodes; it must be a compile time constant
 *      op1    = first operand
 *      op2    = second operand
 *      imm8   = third operand; must be a compile time constant
 * Returns:
 *      result of opcode
 */
void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8) pure @safe;
|
|
|
|
/***
 * One vector operand combined with an immediate byte.
 *
 * For instructions with the imm8 version:
 * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW,
 * PSRLDQ, PSLLDQ
 *
 * Params:
 *      opcode = any of the XMM opcodes; it must be a compile time constant
 *      op1    = first operand
 *      imm8   = second operand; must be a compile time constant
 * Returns:
 *      result of opcode
 */
void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8) pure @safe;
|
|
|
|
/*****
 * For "store" operations of the form:
 *    op1 op= op2
 *
 * Overloads accept a 128 bit vector, a double, or a float as op1.
 *
 * Returns:
 *    op2
 *
 * These cannot be marked as pure, as semantic() doesn't check them.
 * NOTE(review): the module-level `pure:` label looks like it applies to
 * these declarations as well - confirm which of the two is intended.
 */
void16 __simd_sto(XMM opcode, void16 op1, void16 op2) @safe;
void16 __simd_sto(XMM opcode, double op1, void16 op2) @safe; ///
void16 __simd_sto(XMM opcode, float op1, void16 op2) @safe; ///
|
|
|
|
/* Typed wrappers: overloading on the vector type ensures correct typing
 * of operands and result. Compile with inlining on for best performance.
 */

// Element-wise PCMPEQW on two short8 vectors.
short8 pcmpeq()(short8 v1, short8 v2) pure @safe
{
    return cast(short8) __simd(XMM.PCMPEQW, v1, v2);
}

// Element-wise PCMPEQW on two ushort8 vectors.
ushort8 pcmpeq()(ushort8 v1, ushort8 v2) pure @safe
{
    return cast(ushort8) __simd(XMM.PCMPEQW, v1, v2);
}
|
|
|
|
/*********************
 * Emit prefetch instruction.
 * Params:
 *    address = address to be prefetched
 *    writeFetch = true for write fetch, false for read fetch
 *    locality = 0..3 (0 meaning least local, 3 meaning most local)
 * Note:
 *    The Intel mappings are:
 *    $(TABLE
 *    $(THEAD writeFetch, locality, Instruction)
 *    $(TROW false, 0, prefetchnta)
 *    $(TROW false, 1, prefetcht2)
 *    $(TROW false, 2, prefetcht1)
 *    $(TROW false, 3, prefetcht0)
 *    $(TROW true, 0, prefetchw)
 *    $(TROW true, 1, prefetchw)
 *    $(TROW true, 2, prefetchw)
 *    $(TROW true, 3, prefetchw)
 *    )
 */
void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
{
    // Write fetches always use encoding 4; locality is not consulted
    // (and therefore not range checked) on this path.
    static if (writeFetch)
        __prefetch(address, 4);
    // Read fetches invert locality: 0 -> encoding 3, ..., 3 -> encoding 0.
    else static if (locality < 4)
        __prefetch(address, 3 - locality);
    else
        static assert(0, "0..3 expected for locality");
}

// Bodiless declaration used by prefetch(); presumably supplied by the
// compiler as an intrinsic - confirm. `encoding` is 0..3 for read fetches
// and 4 for write fetches, as passed by prefetch() above.
private void __prefetch(const(void*) address, ubyte encoding);
|
|
|
|
/*************************************
 * Load unaligned vector from address.
 * This is a compiler intrinsic.
 *
 * The instruction is selected from the vector type: LODUPD for double2,
 * LODUPS for float4, and LODDQU for void16 and the integer vectors.
 *
 * Params:
 *    p = pointer to vector
 * Returns:
 *    vector
 */
V loadUnaligned(V)(const V* p)
    if (is(V == void16) ||
        is(V == byte16) ||
        is(V == ubyte16) ||
        is(V == short8) ||
        is(V == ushort8) ||
        is(V == int4) ||
        is(V == uint4) ||
        is(V == long2) ||
        is(V == ulong2) ||
        // Floating point vectors were missing from the constraint, which
        // made the LODUPD / LODUPS branches below unreachable.
        is(V == double2) ||
        is(V == float4))
{
    pragma(inline, true);
    static if (is(V == double2))
        return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p);
    else static if (is(V == float4))
        return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p);
    else
        return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p);
}
|
|
|
|
/*************************************
 * Store vector to unaligned address.
 * This is a compiler intrinsic.
 *
 * The instruction is selected from the vector type: STOUPD for double2,
 * STOUPS for float4, and STODQU for void16 and the integer vectors.
 *
 * Params:
 *    p = pointer to vector
 *    value = value to store
 * Returns:
 *    value
 */
V storeUnaligned(V)(V* p, V value)
    if (is(V == void16) ||
        is(V == byte16) ||
        is(V == ubyte16) ||
        is(V == short8) ||
        is(V == ushort8) ||
        is(V == int4) ||
        is(V == uint4) ||
        is(V == long2) ||
        is(V == ulong2) ||
        // Floating point vectors were missing from the constraint, which
        // made the STOUPD / STOUPS branches below unreachable.
        is(V == double2) ||
        is(V == float4))
{
    pragma(inline, true);
    static if (is(V == double2))
        return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value);
    else static if (is(V == float4))
        return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value);
    else
        return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value);
}
|
|
}
|