diff --git a/libphobos/ChangeLog b/libphobos/ChangeLog index fec421789a1..58754bc35af 100644 --- a/libphobos/ChangeLog +++ b/libphobos/ChangeLog @@ -1,3 +1,11 @@ +2018-11-19 Iain Buclaw + + * src/Makefile.am: Remove std.internal.digest.sha_SSSE3 and + std.internal.math.biguintx86 modules. + * src/Makefile.in: Rebuild. + * src/std/internal/digest/sha_SSSE3.d: Remove. + * src/std/internal/math/biguintx86.d: Remove. + 2018-11-02 Iain Buclaw PR d/87827 diff --git a/libphobos/src/Makefile.am b/libphobos/src/Makefile.am index 51ebf4e902e..c5b36aea0cb 100644 --- a/libphobos/src/Makefile.am +++ b/libphobos/src/Makefile.am @@ -156,9 +156,8 @@ PHOBOS_DSOURCES = etc/c/curl.d etc/c/sqlite3.d etc/c/zlib.d \ std/experimental/logger/multilogger.d \ std/experimental/logger/nulllogger.d std/experimental/logger/package.d \ std/experimental/typecons.d std/file.d std/format.d std/functional.d \ - std/getopt.d std/internal/cstring.d std/internal/digest/sha_SSSE3.d \ - std/internal/math/biguintcore.d std/internal/math/biguintnoasm.d \ - std/internal/math/biguintx86.d std/internal/math/errorfunction.d \ + std/getopt.d std/internal/cstring.d std/internal/math/biguintcore.d \ + std/internal/math/biguintnoasm.d std/internal/math/errorfunction.d \ std/internal/math/gammafunction.d std/internal/scopebuffer.d \ std/internal/test/dummyrange.d std/internal/test/range.d \ std/internal/test/uda.d std/internal/unicode_comp.d \ diff --git a/libphobos/src/Makefile.in b/libphobos/src/Makefile.in index 1e2003436eb..08470abba3e 100644 --- a/libphobos/src/Makefile.in +++ b/libphobos/src/Makefile.in @@ -193,10 +193,8 @@ am__objects_1 = etc/c/curl.lo etc/c/sqlite3.lo etc/c/zlib.lo \ std/experimental/logger/package.lo \ std/experimental/typecons.lo std/file.lo std/format.lo \ std/functional.lo std/getopt.lo std/internal/cstring.lo \ - std/internal/digest/sha_SSSE3.lo \ std/internal/math/biguintcore.lo \ std/internal/math/biguintnoasm.lo \ - std/internal/math/biguintx86.lo \ std/internal/math/errorfunction.lo \ std/internal/math/gammafunction.lo std/internal/scopebuffer.lo \ std/internal/test/dummyrange.lo std/internal/test/range.lo \ @@ -282,10 +280,8 @@ am__DEPENDENCIES_1 = etc/c/curl.t.lo etc/c/sqlite3.t.lo \ std/experimental/logger/package.t.lo \ std/experimental/typecons.t.lo std/file.t.lo std/format.t.lo \ std/functional.t.lo std/getopt.t.lo std/internal/cstring.t.lo \ - std/internal/digest/sha_SSSE3.t.lo \ std/internal/math/biguintcore.t.lo \ std/internal/math/biguintnoasm.t.lo \ - std/internal/math/biguintx86.t.lo \ std/internal/math/errorfunction.t.lo \ std/internal/math/gammafunction.t.lo \ std/internal/scopebuffer.t.lo \ @@ -392,10 +388,8 @@ am__DEPENDENCIES_4 = etc/c/curl.t.o etc/c/sqlite3.t.o etc/c/zlib.t.o \ std/experimental/logger/package.t.o \ std/experimental/typecons.t.o std/file.t.o std/format.t.o \ std/functional.t.o std/getopt.t.o std/internal/cstring.t.o \ - std/internal/digest/sha_SSSE3.t.o \ std/internal/math/biguintcore.t.o \ std/internal/math/biguintnoasm.t.o \ - std/internal/math/biguintx86.t.o \ std/internal/math/errorfunction.t.o \ std/internal/math/gammafunction.t.o \ std/internal/scopebuffer.t.o std/internal/test/dummyrange.t.o \ @@ -788,9 +782,8 @@ PHOBOS_DSOURCES = etc/c/curl.d etc/c/sqlite3.d etc/c/zlib.d \ std/experimental/logger/multilogger.d \ std/experimental/logger/nulllogger.d std/experimental/logger/package.d \ std/experimental/typecons.d std/file.d std/format.d std/functional.d \ - std/getopt.d std/internal/cstring.d std/internal/digest/sha_SSSE3.d \ - std/internal/math/biguintcore.d std/internal/math/biguintnoasm.d \ - std/internal/math/biguintx86.d std/internal/math/errorfunction.d \ + std/getopt.d std/internal/cstring.d std/internal/math/biguintcore.d \ + std/internal/math/biguintnoasm.d std/internal/math/errorfunction.d \ std/internal/math/gammafunction.d std/internal/scopebuffer.d \ std/internal/test/dummyrange.d std/internal/test/range.d \ std/internal/test/uda.d std/internal/unicode_comp.d \ @@ -1032,16 +1025,11 @@ std/internal/$(am__dirstamp): @$(MKDIR_P) std/internal @: > std/internal/$(am__dirstamp) std/internal/cstring.lo: std/internal/$(am__dirstamp) -std/internal/digest/$(am__dirstamp): - @$(MKDIR_P) std/internal/digest - @: > std/internal/digest/$(am__dirstamp) -std/internal/digest/sha_SSSE3.lo: std/internal/digest/$(am__dirstamp) std/internal/math/$(am__dirstamp): @$(MKDIR_P) std/internal/math @: > std/internal/math/$(am__dirstamp) std/internal/math/biguintcore.lo: std/internal/math/$(am__dirstamp) std/internal/math/biguintnoasm.lo: std/internal/math/$(am__dirstamp) -std/internal/math/biguintx86.lo: std/internal/math/$(am__dirstamp) std/internal/math/errorfunction.lo: std/internal/math/$(am__dirstamp) std/internal/math/gammafunction.lo: std/internal/math/$(am__dirstamp) std/internal/scopebuffer.lo: std/internal/$(am__dirstamp) @@ -1174,8 +1162,6 @@ mostlyclean-compile: -rm -f std/experimental/logger/*.lo -rm -f std/internal/*.$(OBJEXT) -rm -f std/internal/*.lo - -rm -f std/internal/digest/*.$(OBJEXT) - -rm -f std/internal/digest/*.lo -rm -f std/internal/math/*.$(OBJEXT) -rm -f std/internal/math/*.lo -rm -f std/internal/test/*.$(OBJEXT) @@ -1401,7 +1387,6 @@ clean-libtool: -rm -rf std/experimental/allocator/building_blocks/.libs std/experimental/allocator/building_blocks/_libs -rm -rf std/experimental/logger/.libs std/experimental/logger/_libs -rm -rf std/internal/.libs std/internal/_libs - -rm -rf std/internal/digest/.libs std/internal/digest/_libs -rm -rf std/internal/math/.libs std/internal/math/_libs -rm -rf std/internal/test/.libs std/internal/test/_libs -rm -rf std/internal/windows/.libs std/internal/windows/_libs @@ -1529,7 +1514,6 @@ distclean-generic: -rm -f std/experimental/allocator/building_blocks/$(am__dirstamp) -rm -f std/experimental/logger/$(am__dirstamp) -rm -f std/internal/$(am__dirstamp) - -rm -f std/internal/digest/$(am__dirstamp) -rm -f std/internal/math/$(am__dirstamp) -rm -f std/internal/test/$(am__dirstamp) -rm -f std/internal/windows/$(am__dirstamp) diff --git a/libphobos/src/std/internal/digest/sha_SSSE3.d b/libphobos/src/std/internal/digest/sha_SSSE3.d deleted file mode 100644 index 4060f34a063..00000000000 --- a/libphobos/src/std/internal/digest/sha_SSSE3.d +++ /dev/null @@ -1,729 +0,0 @@ -// Written in the D programming language. - -/** - * Computes SHA1 digests of arbitrary data, using an optimized algorithm with SSSE3 instructions. - * - * Authors: - * The general idea is described by Dean Gaudet. - * Another important observation is published by Max Locktyukhin. - * (Both implementations are public domain.) - * Translation to X86 and D by Kai Nacke - * - * References: - * $(LINK2 http://arctic.org/~dean/crypto/sha1.html) - * $(LINK2 http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/, Fast implementation of SHA1) - */ -module std.internal.digest.sha_SSSE3; - -version (D_InlineAsm_X86) -{ - version (D_PIC) {} // Bugzilla 9378 - else - { - private version = USE_SSSE3; - private version = _32Bit; - } -} -else version (D_InlineAsm_X86_64) -{ - private version = USE_SSSE3; - private version = _64Bit; -} - -/* - * The idea is quite simple. The SHA-1 specification defines the following message schedule: - * W[i] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1 - * - * To employ SSE, simply write down the formula four times: - * W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1 - * W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1 - * W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1 - * W[i+3] = (W[i ] ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1 - * The last formula requires value W[i] computed with the first formula. - * Because the xor operation and the rotate operation are commutative, we can replace the - * last formula with - * W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1 - * and then calculate - * W[i+3] ^= W[i] rol 1 - * which unfortunately requires many additional operations. This approach was described by - * Dean Gaudet. - * - * Max Locktyukhin observed that - * W[i] = W[i-A] ^ W[i-B] - * is equivalent to - * W[i] = W[i-2*A] ^ W[i-2*B] - * (if the indices are still in valid ranges). Using this observation, the formula is - * translated to - * W[i] = (W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32]) rol 2 - * Again, to employ SSE the formula is used four times. - * - * Later on, the expression W[i] + K(i) is used. (K(i) is the constant used in round i.) - * Once the 4 W[i] are calculated, we can also add the four K(i) values with one SSE instruction. - * - * The 32bit and 64bit implementations are almost identical. The main difference is that there - * are only 8 XMM registers in 32bit mode. Therefore, space on the stack is needed to save - * computed values. - */ - -version (USE_SSSE3) -{ - /* - * The general idea is to use the XMM registers as a sliding window over - * message schedule. XMM0 to XMM7 are used to store the last 64 byte of - * the message schedule. In 64 bit mode this is fine because of the number of - * registers. The main difference of the 32 bit code is that a part of the - * calculated message schedule is saved on the stack because 2 temporary - * registers are needed. - */ - - /* Number of message words we are precalculating. */ - private immutable int PRECALC_AHEAD = 16; - - /* T1 and T2 are used for intermediate results of computations. */ - private immutable string T1 = "EAX"; - private immutable string T2 = "EBX"; - - /* The registers used for the SHA-1 variables. */ - private immutable string A = "ECX"; - private immutable string B = "ESI"; - private immutable string C = "EDI"; - private immutable string D = "EBP"; - private immutable string E = "EDX"; - - /* */ - version (_32Bit) - { - private immutable string SP = "ESP"; - private immutable string BUFFER_PTR = "EAX"; - private immutable string STATE_PTR = "EBX"; - - // Control byte for shuffle instruction (only used in round 0-15) - private immutable string X_SHUFFLECTL = "XMM6"; - - // Round constant (only used in round 0-15) - private immutable string X_CONSTANT = "XMM7"; - } - version (_64Bit) - { - private immutable string SP = "RSP"; - private immutable string BUFFER_PTR = "R9"; - private immutable string STATE_PTR = "R8"; - private immutable string CONSTANTS_PTR = "R10"; - - // Registers for temporary results (XMM10 and XMM11 are also used temporary) - private immutable string W_TMP = "XMM8"; - private immutable string W_TMP2 = "XMM9"; - - // Control byte for shuffle instruction (only used in round 0-15) - private immutable string X_SHUFFLECTL = "XMM12"; - - // Round constant - private immutable string X_CONSTANT = "XMM13"; - } - - /* The control words for the byte shuffle instruction and the round constants. */ - align(16) public immutable uint[20] constants = - [ - // The control words for the byte shuffle instruction. - 0x0001_0203, 0x0405_0607, 0x0809_0a0b, 0x0c0d_0e0f, - // Constants for round 0-19 - 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999, - // Constants for round 20-39 - 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, - // Constants for round 40-59 - 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, - // Constants for round 60-79 - 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 - ]; - - /** Simple version to produce numbers < 100 as string. */ - private nothrow pure string to_string(uint i) - { - if (i < 10) - return "0123456789"[i .. i + 1]; - - assert(i < 100); - char[2] s; - s[0] = cast(char)(i / 10 + '0'); - s[1] = cast(char)(i % 10 + '0'); - return s.idup; - } - - /** Returns the reference to the byte shuffle control word. */ - private nothrow pure string bswap_shufb_ctl() - { - version (_64Bit) - return "["~CONSTANTS_PTR~"]"; - else - return "[constants]"; - } - - /** Returns the reference to constant used in round i. */ - private nothrow pure string constant(uint i) - { - version (_64Bit) - return "16 + 16*"~to_string(i/20)~"["~CONSTANTS_PTR~"]"; - else - return "[constants + 16 + 16*"~to_string(i/20)~"]"; - } - - /** Returns the XMM register number used in round i */ - private nothrow pure uint regno(uint i) - { - return (i/4)&7; - } - - /** Returns reference to storage of vector W[i .. i+4]. */ - private nothrow pure string WiV(uint i) - { - return "["~SP~" + WI_PTR + "~to_string((i/4)&7)~"*16]"; - } - - /** Returns reference to storage of vector (W + K)[i .. i+4]. */ - private nothrow pure string WiKiV(uint i) - { - return "["~SP~" + WI_PLUS_KI_PTR + "~to_string((i/4)&3)~"*16]"; - } - - /** Returns reference to storage of value W[i] + K[i]. */ - private nothrow pure string WiKi(uint i) - { - return "["~SP~" + WI_PLUS_KI_PTR + 4*"~to_string(i&15)~"]"; - } - - /** - * Chooses the instruction sequence based on the 32bit or 64bit model. - */ - private nothrow pure string[] swt3264(string[] insn32, string[] insn64) - { - version (_32Bit) - { - return insn32; - } - version (_64Bit) - { - return insn64; - } - } - - /** - * Flattens the instruction sequence and wraps it in an asm block. - */ - private nothrow pure string wrap(string[] insn) - { - string s = "asm pure nothrow @nogc {"; - foreach (t; insn) s ~= (t ~ "; \n"); - s ~= "}"; - return s; - // Is not CTFE: - // return "asm pure nothrow @nogc { " ~ join(insn, "; \n") ~ "}"; - } - - /** - * Weaves the 2 instruction sequences together. - */ - private nothrow pure string[] weave(string[] seq1, string[] seq2, uint dist = 1) - { - string[] res = []; - auto i1 = 0, i2 = 0; - while (i1 < seq1.length || i2 < seq2.length) - { - if (i2 < seq2.length) - { - res ~= seq2[i2 .. i2+1]; - i2 += 1; - } - if (i1 < seq1.length) - { - import std.algorithm.comparison : min; - - res ~= seq1[i1 .. min(i1+dist, $)]; - i1 += dist; - } - } - return res; - } - - /** - * Generates instructions to load state from memory into registers. - */ - private nothrow pure string[] loadstate(string base, string a, string b, string c, string d, string e) - { - return ["mov "~a~",["~base~" + 0*4]", - "mov "~b~",["~base~" + 1*4]", - "mov "~c~",["~base~" + 2*4]", - "mov "~d~",["~base~" + 3*4]", - "mov "~e~",["~base~" + 4*4]" ]; - } - - /** - * Generates instructions to update state from registers, saving result in memory. - */ - private nothrow pure string[] savestate(string base, string a, string b, string c, string d, string e) - { - return ["add ["~base~" + 0*4],"~a, - "add ["~base~" + 1*4],"~b, - "add ["~base~" + 2*4],"~c, - "add ["~base~" + 3*4],"~d, - "add ["~base~" + 4*4],"~e ]; - } - - /** Calculates Ch(x, y, z) = z ^ (x & (y ^ z)) */ - private nothrow pure string[] Ch(string x, string y, string z) - { - return ["mov "~T1~","~y, - "xor "~T1~","~z, - "and "~T1~","~x, - "xor "~T1~","~z ]; - } - - /** Calculates Parity(x, y, z) = x ^ y ^ z */ - private nothrow pure string[] Parity(string x, string y, string z) - { - return ["mov "~T1~","~z, - "xor "~T1~","~y, - "xor "~T1~","~x ]; - } - - /** Calculates Maj(x, y, z) = (x & y) | (z & (x ^ y)) */ - private nothrow pure string[] Maj(string x, string y, string z) - { - return ["mov "~T1~","~y, - "mov "~T2~","~x, - "or "~T1~","~x, - "and "~T2~","~y, - "and "~T1~","~z, - "or "~T1~","~T2 ]; - } - - /** Returns function for round i. Function returns result in T1 and may destroy T2. */ - private nothrow pure string[] F(int i, string b, string c, string d) - { - string[] insn; - if (i >= 0 && i <= 19) insn = Ch(b, c, d); - else if (i >= 20 && i <= 39) insn = Parity(b, c, d); - else if (i >= 40 && i <= 59) insn = Maj(b, c, d); - else if (i >= 60 && i <= 79) insn = Parity(b, c, d); - else assert(false, "Coding error"); - return insn; - } - - /** Returns instruction used to setup a round. */ - private nothrow pure string[] xsetup(int i) - { - if (i == 0) - { - return swt3264(["movdqa "~X_SHUFFLECTL~","~bswap_shufb_ctl(), - "movdqa "~X_CONSTANT~","~constant(i)], - ["movdqa "~X_SHUFFLECTL~","~bswap_shufb_ctl(), - "movdqa "~X_CONSTANT~","~constant(i)]); - } - version (_64Bit) - { - if (i%20 == 0) - { - return ["movdqa "~X_CONSTANT~","~constant(i)]; - } - } - return []; - } - - /** - * Loads the message words and performs the little to big endian conversion. - * Requires that the shuffle control word and the round constant is loaded - * into required XMM register. The BUFFER_PTR register must point to the - * buffer. - */ - private nothrow pure string[] precalc_00_15(int i) - { - int regno = regno(i); - - string W = "XMM" ~ to_string(regno); - version (_32Bit) - { - string W_TMP = "XMM" ~ to_string(regno+2); - } - version (_64Bit) - { - string W_TMP = "XMM" ~ to_string(regno+8); - } - - if ((i & 3) == 0) - { - return ["movdqu "~W~",["~BUFFER_PTR~" + "~to_string(regno)~"*16]"]; - } - else if ((i & 3) == 1) - { - return ["pshufb "~W~","~X_SHUFFLECTL] ~ - swt3264(["movdqa "~WiV(i)~","~W], []); - } - else if ((i & 3) == 2) - { - return ["movdqa "~W_TMP~","~W, - "paddd "~W_TMP~","~X_CONSTANT, - ]; - } - else - { - return ["movdqa "~WiKiV(i)~","~W_TMP, - ]; - } - } - - /** - * Done on 4 consequtive W[i] values in a single XMM register - * W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1 - * W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1 - * W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1 - * W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1 - * - * This additional calculation unfortunately requires many additional operations - * W[i+3] ^= W[i] rol 1 - * - * Once we have 4 W[i] values in XMM we can also add four K values with one instruction - * W[i:i+3] += {K,K,K,K} - */ - private nothrow pure string[] precalc_16_31(int i) - { - int regno = regno(i); - - string W = "XMM" ~ to_string(regno); - string W_minus_4 = "XMM" ~ to_string((regno-1)&7); - string W_minus_8 = "XMM" ~ to_string((regno-2)&7); - string W_minus_12 = "XMM" ~ to_string((regno-3)&7); - string W_minus_16 = "XMM" ~ to_string((regno-4)&7); - version (_32Bit) - { - string W_TMP = "XMM" ~ to_string((regno+1)&7); - string W_TMP2 = "XMM" ~ to_string((regno+2)&7); - } - - if ((i & 3) == 0) - { - return ["movdqa "~W~","~W_minus_12, - "palignr "~W~","~W_minus_16~",8", // W[i] = W[i-14] - "pxor "~W~","~W_minus_16, // W[i] ^= W[i-16] - "pxor "~W~","~W_minus_8, // W[i] ^= W[i-8] - "movdqa "~W_TMP~","~W_minus_4, - ]; - } - else if ((i & 3) == 1) - { - return ["psrldq "~W_TMP~",4", // W[i-3] - "pxor "~W~","~W_TMP, // W[i] ^= W[i-3] - "movdqa "~W_TMP~","~W, - "psrld "~W~",31", - "pslld "~W_TMP~",1", - ]; - } - else if ((i & 3) == 2) - { - return ["por "~W~","~W_TMP, - "movdqa "~W_TMP~","~W, - "pslldq "~W_TMP~",12", - "movdqa "~W_TMP2~","~W_TMP, - "pslld "~W_TMP~",1", - ]; - } - else - { - return ["psrld "~W_TMP2~",31", - "por "~W_TMP~","~W_TMP2, - "pxor "~W~","~W_TMP, - "movdqa "~W_TMP~","~W ] ~ - swt3264(["movdqa "~WiV(i)~","~W, - "paddd "~W_TMP~","~constant(i) ], - ["paddd "~W_TMP~","~X_CONSTANT ]) ~ - ["movdqa "~WiKiV(i)~","~W_TMP]; - } - } - - /** Performs the main calculation as decribed above. */ - private nothrow pure string[] precalc_32_79(int i) - { - int regno = regno(i); - - string W = "XMM" ~ to_string(regno); - string W_minus_4 = "XMM" ~ to_string((regno-1)&7); - string W_minus_8 = "XMM" ~ to_string((regno-2)&7); - string W_minus_16 = "XMM" ~ to_string((regno-4)&7); - version (_32Bit) - { - string W_minus_28 = "[ESP + WI_PTR + "~ to_string((regno-7)&7)~"*16]"; - string W_minus_32 = "[ESP + WI_PTR + "~ to_string((regno-8)&7)~"*16]"; - string W_TMP = "XMM" ~ to_string((regno+1)&7); - string W_TMP2 = "XMM" ~ to_string((regno+2)&7); - } - version (_64Bit) - { - string W_minus_28 = "XMM" ~ to_string((regno-7)&7); - string W_minus_32 = "XMM" ~ to_string((regno-8)&7); - } - - if ((i & 3) == 0) - { - return swt3264(["movdqa "~W~","~W_minus_32], []) ~ - ["movdqa "~W_TMP~","~W_minus_4, - "pxor "~W~","~W_minus_28, // W is W_minus_32 before xor - "palignr "~W_TMP~","~W_minus_8~",8", - ]; - } - else if ((i & 3) == 1) - { - return ["pxor "~W~","~W_minus_16, - "pxor "~W~","~W_TMP, - "movdqa "~W_TMP~","~W, - ]; - } - else if ((i & 3) == 2) - { - return ["psrld "~W~",30", - "pslld "~W_TMP~",2", - "por "~W_TMP~","~W, - ]; - } - else - { - if (i < 76) - return ["movdqa "~W~","~W_TMP] ~ - swt3264(["movdqa "~WiV(i)~","~W, - "paddd "~W_TMP~","~constant(i)], - ["paddd "~W_TMP~","~X_CONSTANT]) ~ - ["movdqa "~WiKiV(i)~","~W_TMP]; - else - return swt3264(["paddd "~W_TMP~","~constant(i)], - ["paddd "~W_TMP~","~X_CONSTANT]) ~ - ["movdqa "~WiKiV(i)~","~W_TMP]; - } - } - - /** Choose right precalc method. */ - private nothrow pure string[] precalc(int i) - { - if (i >= 0 && i < 16) return precalc_00_15(i); - if (i >= 16 && i < 32) return precalc_16_31(i); - if (i >= 32 && i < 80) return precalc_32_79(i); - return []; - } - - /** - * Return code for round i and i+1. - * Performs the following rotation: - * in=>out: A=>D, B=>E, C=>A, D=>B, E=>C - */ - private nothrow pure string[] round(int i, string a, string b, string c, string d, string e) - { - return xsetup(PRECALC_AHEAD + i) ~ - weave(F(i, b, c, d) ~ // Returns result in T1; may destroy T2 - ["add "~e~","~WiKi(i), - "ror "~b~",2", - "mov "~T2~","~a, - "add "~d~","~WiKi(i+1), - "rol "~T2~",5", - "add "~e~","~T1 ], - precalc(PRECALC_AHEAD + i), 2) ~ - weave( - ["add "~T2~","~e, // T2 = (A <<< 5) + F(B, C, D) + Wi + Ki + E - "mov "~e~","~T2, - "rol "~T2~",5", - "add "~d~","~T2 ] ~ - F(i+1, a, b, c) ~ // Returns result in T1; may destroy T2 - ["add "~d~","~T1, - "ror "~a~",2"], - precalc(PRECALC_AHEAD + i+1), 2); - } - - // Offset into stack (see below) - version (_32Bit) - { - private enum { STATE_OFS = 4, WI_PLUS_KI_PTR = 8, WI_PTR = 72 }; - } - version (_64Bit) - { - private enum { WI_PLUS_KI_PTR = 0 }; - } - - /** The prologue sequence. */ - private nothrow pure string[] prologue() - { - version (_32Bit) - { - /* - * Parameters: - * EAX contains pointer to input buffer - * - * Stack layout as follows: - * +----------------+ - * | ptr to state | - * +----------------+ - * | return address | - * +----------------+ - * | EBP | - * +----------------+ - * | ESI | - * +----------------+ - * | EDI | - * +----------------+ - * | EBX | - * +----------------+ - * | Space for | - * | Wi | <- ESP+72 - * +----------------+ - * | Space for | - * | Wi+Ki | <- ESP+8 - * +----------------+ <- 16byte aligned - * | ptr to state | <- ESP+4 - * +----------------+ - * | old ESP | <- ESP - * +----------------+ - */ - static assert(BUFFER_PTR == "EAX"); - static assert(STATE_PTR == "EBX"); - return [// Save registers according to calling convention - "push EBP", - "push ESI", - "push EDI", - "push EBX", - // Load parameters - "mov EBX, [ESP + 5*4]", //pointer to state - // Align stack - "mov EBP, ESP", - "sub ESP, 4*16 + 8*16", - "and ESP, 0xffff_fff0", - "push EBX", - "push EBP", - ]; - } - version (_64Bit) - { - /* - * Parameters: - * RDX contains pointer to state - * RSI contains pointer to input buffer - * RDI contains pointer to constants - * - * Stack layout as follows: - * +----------------+ - * | return address | - * +----------------+ - * | RBP | - * +----------------+ - * | RBX | - * +----------------+ - * | Unused | - * +----------------+ - * | Space for | - * | Wi+Ki | <- RSP - * +----------------+ <- 16byte aligned - */ - return [// Save registers according to calling convention - "push RBP", - "push RBX", - // Save parameters - "mov "~STATE_PTR~", RDX", //pointer to state - "mov "~BUFFER_PTR~", RSI", //pointer to buffer - "mov "~CONSTANTS_PTR~", RDI", //pointer to constants to avoid absolute addressing - // Align stack - "sub RSP, 4*16+8", - ]; - } - } - - /** - * The epilogue sequence. Just pop the saved registers from stack and return to caller. - */ - private nothrow pure string[] epilogue() - { - version (_32Bit) - { - return ["pop ESP", - "pop EBX", - "pop EDI", - "pop ESI", - "pop EBP", - "ret 4", - ]; - } - version (_64Bit) - { - return ["add RSP,4*16+8", - "pop RBX", - "pop RBP", - "ret 0", - ]; - } - } - - // constants as extra argument for PIC, see Bugzilla 9378 - import std.meta : AliasSeq; - version (_64Bit) - alias ExtraArgs = AliasSeq!(typeof(&constants)); - else - alias ExtraArgs = AliasSeq!(); - - /** - * - */ - public void transformSSSE3(uint[5]* state, const(ubyte[64])* buffer, ExtraArgs) pure nothrow @nogc - { - mixin(wrap(["naked;"] ~ prologue())); - // Precalc first 4*16=64 bytes - mixin(wrap(xsetup(0))); - mixin(wrap(weave(precalc(0)~precalc(1)~precalc(2)~precalc(3), - precalc(4)~precalc(5)~precalc(6)~precalc(7)))); - mixin(wrap(weave(loadstate(STATE_PTR, A, B, C, D, E), - weave(precalc(8)~precalc(9)~precalc(10)~precalc(11), - precalc(12)~precalc(13)~precalc(14)~precalc(15))))); - // Round 1 - mixin(wrap(round( 0, A, B, C, D, E))); - mixin(wrap(round( 2, D, E, A, B, C))); - mixin(wrap(round( 4, B, C, D, E, A))); - mixin(wrap(round( 6, E, A, B, C, D))); - mixin(wrap(round( 8, C, D, E, A, B))); - mixin(wrap(round(10, A, B, C, D, E))); - mixin(wrap(round(12, D, E, A, B, C))); - mixin(wrap(round(14, B, C, D, E, A))); - mixin(wrap(round(16, E, A, B, C, D))); - mixin(wrap(round(18, C, D, E, A, B))); - // Round 2 - mixin(wrap(round(20, A, B, C, D, E))); - mixin(wrap(round(22, D, E, A, B, C))); - mixin(wrap(round(24, B, C, D, E, A))); - mixin(wrap(round(26, E, A, B, C, D))); - mixin(wrap(round(28, C, D, E, A, B))); - mixin(wrap(round(30, A, B, C, D, E))); - mixin(wrap(round(32, D, E, A, B, C))); - mixin(wrap(round(34, B, C, D, E, A))); - mixin(wrap(round(36, E, A, B, C, D))); - mixin(wrap(round(38, C, D, E, A, B))); - // Round 3 - mixin(wrap(round(40, A, B, C, D, E))); - mixin(wrap(round(42, D, E, A, B, C))); - mixin(wrap(round(44, B, C, D, E, A))); - mixin(wrap(round(46, E, A, B, C, D))); - mixin(wrap(round(48, C, D, E, A, B))); - mixin(wrap(round(50, A, B, C, D, E))); - mixin(wrap(round(52, D, E, A, B, C))); - mixin(wrap(round(54, B, C, D, E, A))); - mixin(wrap(round(56, E, A, B, C, D))); - mixin(wrap(round(58, C, D, E, A, B))); - // Round 4 - mixin(wrap(round(60, A, B, C, D, E))); - mixin(wrap(round(62, D, E, A, B, C))); - mixin(wrap(round(64, B, C, D, E, A))); - mixin(wrap(round(66, E, A, B, C, D))); - mixin(wrap(round(68, C, D, E, A, B))); - mixin(wrap(round(70, A, B, C, D, E))); - mixin(wrap(round(72, D, E, A, B, C))); - mixin(wrap(round(74, B, C, D, E, A))); - mixin(wrap(round(76, E, A, B, C, D))); - mixin(wrap(round(78, C, D, E, A, B))); - version (_32Bit) - { - // Load pointer to state - mixin(wrap(["mov "~STATE_PTR~",[ESP + STATE_OFS]"])); - } - mixin(wrap(savestate(STATE_PTR, A, B, C, D, E))); - mixin(wrap(epilogue())); - } -} diff --git a/libphobos/src/std/internal/math/biguintx86.d b/libphobos/src/std/internal/math/biguintx86.d deleted file mode 100644 index bd03d2e6fe9..00000000000 --- a/libphobos/src/std/internal/math/biguintx86.d +++ /dev/null @@ -1,1353 +0,0 @@ -/** Optimised asm arbitrary precision arithmetic ('bignum') - * routines for X86 processors. - * - * All functions operate on arrays of uints, stored LSB first. - * If there is a destination array, it will be the first parameter. - * Currently, all of these functions are subject to change, and are - * intended for internal use only. - * The symbol [#] indicates an array of machine words which is to be - * interpreted as a multi-byte number. - */ - -/* Copyright Don Clugston 2008 - 2010. - * Distributed under the Boost Software License, Version 1.0. - * (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ -/** - * In simple terms, there are 3 modern x86 microarchitectures: - * (a) the P6 family (Pentium Pro, PII, PIII, PM, Core), produced by Intel; - * (b) the K6, Athlon, and AMD64 families, produced by AMD; and - * (c) the Pentium 4, produced by Marketing. - * - * This code has been optimised for the Intel P6 family. - * Generally the code remains near-optimal for Intel Core2/Corei7, after - * translating EAX-> RAX, etc, since all these CPUs use essentially the same - * pipeline, and are typically limited by memory access. - * The code uses techniques described in Agner Fog's superb Pentium manuals - * available at www.agner.org. - * Not optimised for AMD, which can do two memory loads per cycle (Intel - * CPUs can only do one). Despite this, performance is superior on AMD. - * Performance is dreadful on P4. - * - * Timing results (cycles per int) - * --Intel Pentium-- --AMD-- - * PM P4 Core2 K7 - * +,- 2.25 15.6 2.25 1.5 - * <<,>> 2.0 6.6 2.0 5.0 - * (<< MMX) 1.7 5.3 1.5 1.2 - * * 5.0 15.0 4.0 4.3 - * mulAdd 5.7 19.0 4.9 4.0 - * div 30.0 32.0 32.0 22.4 - * mulAcc(32) 6.5 20.0 5.4 4.9 - * - * mulAcc(32) is multiplyAccumulate() for a 32*32 multiply. Thus it includes - * function call overhead. - * The timing for Div is quite unpredictable, but it's probably too slow - * to be useful. On 64-bit processors, these times should - * halve if run in 64-bit mode, except for the MMX functions. - */ - -module std.internal.math.biguintx86; - -@system: -pure: -nothrow: - -/* - Naked asm is used throughout, because: - (a) it frees up the EBP register - (b) compiler bugs prevent the use of .ptr when a frame pointer is used. -*/ - -version (D_InlineAsm_X86) -{ - -private: - -/* Duplicate string s, with n times, substituting index for '@'. - * - * Each instance of '@' in s is replaced by 0,1,...n-1. This is a helper - * function for some of the asm routines. - */ -string indexedLoopUnroll(int n, string s) pure @safe -{ - string u; - for (int i = 0; i9 ? ""~ cast(char)('0'+i/10) : "") ~ cast(char)('0' + i%10); - - int last = 0; - for (int j = 0; j> numbits - * numbits must be in the range 1 .. 31 - * This version uses MMX. - */ -uint multibyteShl(uint [] dest, const uint [] src, uint numbits) pure -{ - // Timing: - // K7 1.2/int. PM 1.7/int P4 5.3/int - enum { LASTPARAM = 4*4 } // 3* pushes + return address. - asm pure nothrow { - naked; - push ESI; - push EDI; - push EBX; - mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr; - mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length; - mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr; - - movd MM3, EAX; // numbits = bits to shift left - xor EAX, 63; - align 16; - inc EAX; - movd MM4, EAX ; // 64-numbits = bits to shift right - - // Get the return value into EAX - and EAX, 31; // EAX = 32-numbits - movd MM2, EAX; // 32-numbits - movd MM1, [ESI+4*EBX-4]; - psrlq MM1, MM2; - movd EAX, MM1; // EAX = return value - test EBX, 1; - jz L_even; -L_odd: - cmp EBX, 1; - jz L_length1; - - // deal with odd lengths - movq MM1, [ESI+4*EBX-8]; - psrlq MM1, MM2; - movd [EDI +4*EBX-4], MM1; - sub EBX, 1; -L_even: // It's either singly or doubly even - movq MM2, [ESI + 4*EBX - 8]; - psllq MM2, MM3; - sub EBX, 2; - jle L_last; - movq MM1, MM2; - add EBX, 2; - test EBX, 2; - jz L_onceeven; - sub EBX, 2; - - // MAIN LOOP -- 128 bytes per iteration - L_twiceeven: // here MM2 is the carry - movq MM0, [ESI + 4*EBX-8]; - psrlq MM0, MM4; - movq MM1, [ESI + 4*EBX-8]; - psllq MM1, MM3; - por MM2, MM0; - movq [EDI +4*EBX], MM2; -L_onceeven: // here MM1 is the carry - movq MM0, [ESI + 4*EBX-16]; - psrlq MM0, MM4; - movq MM2, [ESI + 4*EBX-16]; - por MM1, MM0; - movq [EDI +4*EBX-8], MM1; - psllq MM2, MM3; - sub EBX, 4; - jg L_twiceeven; -L_last: - movq [EDI +4*EBX], MM2; -L_alldone: - emms; // NOTE: costs 6 cycles on Intel CPUs - pop EBX; - pop EDI; - pop ESI; - ret 4*4; - -L_length1: - // length 1 is a special case - movd MM1, [ESI]; - psllq MM1, MM3; - movd [EDI], MM1; - jmp L_alldone; - } -} - -void multibyteShr(uint [] dest, const uint [] src, uint numbits) pure -{ - enum { LASTPARAM = 4*4 } // 3* pushes + return address. - asm pure nothrow { - naked; - push ESI; - push EDI; - push EBX; - mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr; - mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length; -align 16; - mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr; - lea EDI, [EDI + 4*EBX]; // EDI = end of dest - lea ESI, [ESI + 4*EBX]; // ESI = end of src - neg EBX; // count UP to zero. - - movd MM3, EAX; // numbits = bits to shift right - xor EAX, 63; - inc EAX; - movd MM4, EAX ; // 64-numbits = bits to shift left - - test EBX, 1; - jz L_even; -L_odd: - // deal with odd lengths - and EAX, 31; // EAX = 32-numbits - movd MM2, EAX; // 32-numbits - cmp EBX, -1; - jz L_length1; - - movq MM0, [ESI+4*EBX]; - psrlq MM0, MM3; - movd [EDI +4*EBX], MM0; - add EBX, 1; -L_even: - movq MM2, [ESI + 4*EBX]; - psrlq MM2, MM3; - - movq MM1, MM2; - add EBX, 4; - cmp EBX, -2+4; - jz L_last; - // It's either singly or doubly even - sub EBX, 2; - test EBX, 2; - jnz L_onceeven; - add EBX, 2; - - // MAIN LOOP -- 128 bytes per iteration - L_twiceeven: // here MM2 is the carry - movq MM0, [ESI + 4*EBX-8]; - psllq MM0, MM4; - movq MM1, [ESI + 4*EBX-8]; - psrlq MM1, MM3; - por MM2, MM0; - movq [EDI +4*EBX-16], MM2; -L_onceeven: // here MM1 is the carry - movq MM0, [ESI + 4*EBX]; - psllq MM0, MM4; - movq MM2, [ESI + 4*EBX]; - por MM1, MM0; - movq [EDI +4*EBX-8], MM1; - psrlq MM2, MM3; - add EBX, 4; - jl L_twiceeven; -L_last: - movq [EDI +4*EBX-16], MM2; -L_alldone: - emms; // NOTE: costs 6 cycles on Intel CPUs - pop EBX; - pop EDI; - pop ESI; - ret 4*4; - -L_length1: - // length 1 is a special case - movd MM1, [ESI+4*EBX]; - psrlq MM1, MM3; - movd [EDI +4*EBX], MM1; - jmp L_alldone; - - } -} - -/** dest[#] = src[#] >> numbits - * numbits must be in the range 1 .. 31 - */ -void multibyteShrNoMMX(uint [] dest, const uint [] src, uint numbits) pure -{ - // Timing: Optimal for P6 family. - // 2.0 cycles/int on PPro .. PM (limited by execution port p0) - // Terrible performance on AMD64, which has 7 cycles for SHRD!! - enum { LASTPARAM = 4*4 } // 3* pushes + return address. - asm pure nothrow { - naked; - push ESI; - push EDI; - push EBX; - mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr; - mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length; - mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr; - mov ECX, EAX; // numbits; - - lea EDI, [EDI + 4*EBX]; // EDI = end of dest - lea ESI, [ESI + 4*EBX]; // ESI = end of src - neg EBX; // count UP to zero. - mov EAX, [ESI + 4*EBX]; - cmp EBX, -1; - jz L_last; - mov EDX, [ESI + 4*EBX]; - test EBX, 1; - jz L_odd; - add EBX, 1; -L_even: - mov EDX, [ ESI + 4*EBX]; - shrd EAX, EDX, CL; - mov [-4 + EDI+4*EBX], EAX; -L_odd: - mov EAX, [4 + ESI + 4*EBX]; - shrd EDX, EAX, CL; - mov [EDI + 4*EBX], EDX; - add EBX, 2; - jl L_even; -L_last: - shr EAX, CL; - mov [-4 + EDI], EAX; - - pop EBX; - pop EDI; - pop ESI; - ret 4*4; - } -} - -@system unittest -{ - - uint [] aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; - multibyteShr(aa[0..$-1], aa, 4); - assert(aa[0] == 0x6122_2222 && aa[1]==0xA455_5555 - && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC); - - aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; - multibyteShr(aa[2..$-1], aa[2..$-1], 4); - assert(aa[0] == 0x1222_2223 && aa[1]==0x4555_5556 - && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC); - - aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; - multibyteShr(aa[0..$-2], aa, 4); - assert(aa[1]==0xA455_5555 && aa[2]==0x0899_9999); - assert(aa[0]==0x6122_2222); - assert(aa[3]==0xBCCC_CCCD); - - - aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; - uint r = multibyteShl(aa[2 .. 4], aa[2 .. 4], 4); - assert(aa[0] == 0xF0FF_FFFF && aa[1]==0x1222_2223 - && aa[2]==0x5555_5560 && aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD); - assert(r == 8); - - aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; - r = multibyteShl(aa[1 .. 4], aa[1 .. 4], 4); - assert(aa[0] == 0xF0FF_FFFF - && aa[2]==0x5555_5561); - assert(aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD); - assert(r == 8); - assert(aa[1]==0x2222_2230); - - aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; - r = multibyteShl(aa[0 .. 4], aa[1 .. 5], 31); -} - -/** dest[#] = src[#] * multiplier + carry. - * Returns carry. - */ -uint multibyteMul(uint[] dest, const uint[] src, uint multiplier, uint carry) - pure -{ - // Timing: definitely not optimal. - // Pentium M: 5.0 cycles/operation, has 3 resource stalls/iteration - // Fastest implementation found was 4.6 cycles/op, but not worth the complexity. - - enum { LASTPARAM = 4*4 } // 4* pushes + return address. - // We'll use p2 (load unit) instead of the overworked p0 or p1 (ALU units) - // when initializing variables to zero. - version (D_PIC) - { - enum { zero = 0 } - } - else - { - __gshared int zero = 0; - } - asm pure nothrow { - naked; - push ESI; - push EDI; - push EBX; - - mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr - mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length - mov ESI, [ESP + LASTPARAM + 4*2]; // src.ptr - align 16; - lea EDI, [EDI + 4*EBX]; // EDI = end of dest - lea ESI, [ESI + 4*EBX]; // ESI = end of src - mov ECX, EAX; // [carry]; -- last param is in EAX. - neg EBX; // count UP to zero. - test EBX, 1; - jnz L_odd; - add EBX, 1; - L1: - mov EAX, [-4 + ESI + 4*EBX]; - mul int ptr [ESP+LASTPARAM]; //[multiplier]; - add EAX, ECX; - mov ECX, zero; - mov [-4+EDI + 4*EBX], EAX; - adc ECX, EDX; -L_odd: - mov EAX, [ESI + 4*EBX]; // p2 - mul int ptr [ESP+LASTPARAM]; //[multiplier]; // p0*3, - add EAX, ECX; - mov ECX, zero; - adc ECX, EDX; - mov [EDI + 4*EBX], EAX; - add EBX, 2; - jl L1; - - mov EAX, ECX; // get final carry - - pop EBX; - pop EDI; - pop ESI; - ret 5*4; - } -} - -@system unittest -{ - uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; - multibyteMul(aa[1 .. 4], aa[1 .. 4], 16, 0); - assert(aa[0] == 0xF0FF_FFFF && aa[1] == 0x2222_2230 && - aa[2]==0x5555_5561 && aa[3]==0x9999_99A4 && aa[4]==0x0BCCC_CCCD); -} - -// The inner multiply-and-add loop, together with the Even entry point. -// Multiples by M_ADDRESS which should be "ESP+LASTPARAM" or "ESP". OP must be "add" or "sub" -// This is the most time-critical code in the BigInt library. -// It is used by both MulAdd, multiplyAccumulate, and triangleAccumulate -string asmMulAdd_innerloop(string OP, string M_ADDRESS) pure { - // The bottlenecks in this code are extremely complicated. The MUL, ADD, and ADC - // need 4 cycles on each of the ALUs units p0 and p1. So we use memory load - // (unit p2) for initializing registers to zero. - // There are also dependencies between the instructions, and we run up against the - // ROB-read limit (can only read 2 registers per cycle). - // We also need the number of uops in the loop to be a multiple of 3. - // The only available execution unit for this is p3 (memory write). Unfortunately we can't do that - // if Position-Independent Code is required. - - // Register usage - // ESI = end of src - // EDI = end of dest - // EBX = index. Counts up to zero (in steps of 2). - // EDX:EAX = scratch, used in multiply. - // ECX = carry1. - // EBP = carry2. - // ESP = points to the multiplier. - - // The first member of 'dest' which will be modified is [EDI+4*EBX]. - // EAX must already contain the first member of 'src', [ESI+4*EBX]. - - version (D_PIC) { bool using_PIC = true; } else { bool using_PIC = false; } - return " - // Entry point for even length - add EBX, 1; - mov EBP, ECX; // carry - - mul int ptr [" ~ M_ADDRESS ~ "]; // M - mov ECX, 0; - - add EBP, EAX; - mov EAX, [ESI+4*EBX]; - adc ECX, EDX; - - mul int ptr [" ~ M_ADDRESS ~ "]; // M - " ~ OP ~ " [-4+EDI+4*EBX], EBP; - mov EBP, zero; - - adc ECX, EAX; - mov EAX, [4+ESI+4*EBX]; - - adc EBP, EDX; - add EBX, 2; - jnl L_done; -L1: - mul int ptr [" ~ M_ADDRESS ~ "]; - " ~ OP ~ " [-8+EDI+4*EBX], ECX; - adc EBP, EAX; - mov ECX, zero; - mov EAX, [ESI+4*EBX]; - adc ECX, EDX; -" ~ - (using_PIC ? "" : " mov storagenop, EDX; ") // make #uops in loop a multiple of 3, can't do this in PIC mode. -~ " - mul int ptr [" ~ M_ADDRESS ~ "]; - " ~ OP ~ " [-4+EDI+4*EBX], EBP; - mov EBP, zero; - - adc ECX, EAX; - mov EAX, [4+ESI+4*EBX]; - - adc EBP, EDX; - add EBX, 2; - jl L1; -L_done: " ~ OP ~ " [-8+EDI+4*EBX], ECX; - adc EBP, 0; -"; - // final carry is now in EBP -} - -string asmMulAdd_enter_odd(string OP, string M_ADDRESS) pure -{ - return " - mul int ptr [" ~M_ADDRESS ~"]; - mov EBP, zero; - add ECX, EAX; - mov EAX, [4+ESI+4*EBX]; - - adc EBP, EDX; - add EBX, 2; - jl L1; - jmp L_done; -"; -} - - - -/** - * dest[#] += src[#] * multiplier OP carry(0 .. FFFF_FFFF). - * where op == '+' or '-' - * Returns carry out of MSB (0 .. FFFF_FFFF). - */ -uint multibyteMulAdd(char op)(uint [] dest, const uint [] src, uint - multiplier, uint carry) pure { - // Timing: This is the most time-critical bignum function. - // Pentium M: 5.4 cycles/operation, still has 2 resource stalls + 1load block/iteration - - // The main loop is pipelined and unrolled by 2, - // so entry to the loop is also complicated. - - // Register usage - // EDX:EAX = multiply - // EBX = counter - // ECX = carry1 - // EBP = carry2 - // EDI = dest - // ESI = src - - enum string OP = (op=='+')? "add" : "sub"; - version (D_PIC) - { - enum { zero = 0 } - } - else - { - // use p2 (load unit) instead of the overworked p0 or p1 (ALU units) - // when initializing registers to zero. - __gshared int zero = 0; - // use p3/p4 units - __gshared int storagenop; // write-only - } - - enum { LASTPARAM = 5*4 } // 4* pushes + return address. - asm pure nothrow { - naked; - - push ESI; - push EDI; - push EBX; - push EBP; - mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr - mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length - align 16; - nop; - mov ESI, [ESP + LASTPARAM + 4*2]; // src.ptr - lea EDI, [EDI + 4*EBX]; // EDI = end of dest - lea ESI, [ESI + 4*EBX]; // ESI = end of src - mov EBP, 0; - mov ECX, EAX; // ECX = input carry. - neg EBX; // count UP to zero. - mov EAX, [ESI+4*EBX]; - test EBX, 1; - jnz L_enter_odd; - } - // Main loop, with entry point for even length - mixin("asm pure nothrow {" ~ asmMulAdd_innerloop(OP, "ESP+LASTPARAM") ~ "}"); - asm pure nothrow { - mov EAX, EBP; // get final carry - pop EBP; - pop EBX; - pop EDI; - pop ESI; - ret 5*4; - } -L_enter_odd: - mixin("asm pure nothrow {" ~ asmMulAdd_enter_odd(OP, "ESP+LASTPARAM") ~ "}"); -} - -@system unittest -{ - - uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; - uint [] bb = [0x1234_1234, 0xF0F0_F0F0, 0x00C0_C0C0, 0xF0F0_F0F0, 0xC0C0_C0C0]; - multibyteMulAdd!('+')(bb[1..$-1], aa[1..$-2], 16, 5); - assert(bb[0] == 0x1234_1234 && bb[4] == 0xC0C0_C0C0); - assert(bb[1] == 0x2222_2230 + 0xF0F0_F0F0+5 && bb[2] == 0x5555_5561+0x00C0_C0C0+1 - && bb[3] == 0x9999_99A4+0xF0F0_F0F0 ); -} - -/** - Sets result[#] = result[0 .. left.length] + left[#] * right[#] - - It is defined in this way to allow cache-efficient multiplication. - This function is equivalent to: - ---- - for (int i = 0; i< right.length; ++i) - { - dest[left.length + i] = multibyteMulAdd(dest[i .. left.length+i], - left, right[i], 0); - } - ---- - */ -void multibyteMultiplyAccumulate(uint [] dest, const uint[] left, - const uint [] right) pure { - // Register usage - // EDX:EAX = used in multiply - // EBX = index - // ECX = carry1 - // EBP = carry2 - // EDI = end of dest for this pass through the loop. Index for outer loop. - // ESI = end of left. never changes - // [ESP] = M = right[i] = multiplier for this pass through the loop. - // right.length is changed into dest.ptr+dest.length - version (D_PIC) - { - enum { zero = 0 } - } - else - { - // use p2 (load unit) instead of the overworked p0 or p1 (ALU units) - // when initializing registers to zero. - __gshared int zero = 0; - // use p3/p4 units - __gshared int storagenop; // write-only - } - - enum { LASTPARAM = 6*4 } // 4* pushes + local + return address. - asm pure nothrow { - naked; - - push ESI; - push EDI; - align 16; - push EBX; - push EBP; - push EAX; // local variable M - mov EDI, [ESP + LASTPARAM + 4*5]; // dest.ptr - mov EBX, [ESP + LASTPARAM + 4*2]; // left.length - mov ESI, [ESP + LASTPARAM + 4*3]; // left.ptr - lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass - - mov EAX, [ESP + LASTPARAM + 4*0]; // right.length - lea EAX, [EDI + 4*EAX]; - mov [ESP + LASTPARAM + 4*0], EAX; // last value for EDI - - lea ESI, [ESI + 4*EBX]; // ESI = end of left - mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr - mov EAX, [EAX]; - mov [ESP], EAX; // M -outer_loop: - mov EBP, 0; - mov ECX, 0; // ECX = input carry. - neg EBX; // count UP to zero. - mov EAX, [ESI+4*EBX]; - test EBX, 1; - jnz L_enter_odd; - } - // -- Inner loop, with even entry point - mixin("asm pure nothrow { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}"); - asm pure nothrow { - mov [-4+EDI+4*EBX], EBP; - add EDI, 4; - cmp EDI, [ESP + LASTPARAM + 4*0]; // is EDI = &dest[$]? - jz outer_done; - mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr - mov EAX, [EAX+4]; // get new M - mov [ESP], EAX; // save new M - add int ptr [ESP + LASTPARAM + 4*1], 4; // right.ptr - mov EBX, [ESP + LASTPARAM + 4*2]; // left.length - jmp outer_loop; -outer_done: - pop EAX; - pop EBP; - pop EBX; - pop EDI; - pop ESI; - ret 6*4; - } -L_enter_odd: - mixin("asm pure nothrow {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}"); -} - -/** dest[#] /= divisor. - * overflow is the initial remainder, and must be in the range 0 .. divisor-1. - * divisor must not be a power of 2 (use right shift for that case; - * A division by zero will occur if divisor is a power of 2). - * Returns the final remainder - * - * Based on public domain code by Eric Bainville. - * (http://www.bealto.com/) Used with permission. - */ -uint multibyteDivAssign(uint [] dest, uint divisor, uint overflow) pure -{ - // Timing: limited by a horrible dependency chain. - // Pentium M: 18 cycles/op, 8 resource stalls/op. - // EAX, EDX = scratch, used by MUL - // EDI = dest - // CL = shift - // ESI = quotient - // EBX = remainderhi - // EBP = remainderlo - // [ESP-4] = mask - // [ESP] = kinv (2^64 /divisor) - enum { LASTPARAM = 5*4 } // 4* pushes + return address. - enum { LOCALS = 2*4} // MASK, KINV - asm pure nothrow { - naked; - - push ESI; - push EDI; - push EBX; - push EBP; - - mov EDI, [ESP + LASTPARAM + 4*2]; // dest.ptr - mov EBX, [ESP + LASTPARAM + 4*1]; // dest.length - - // Loop from msb to lsb - lea EDI, [EDI + 4*EBX]; - mov EBP, EAX; // rem is the input remainder, in 0 .. divisor-1 - // Build the pseudo-inverse of divisor k: 2^64/k - // First determine the shift in ecx to get the max number of bits in kinv - xor ECX, ECX; - mov EAX, [ESP + LASTPARAM]; //divisor; - mov EDX, 1; -kinv1: - inc ECX; - ror EDX, 1; - shl EAX, 1; - jnc kinv1; - dec ECX; - // Here, ecx is a left shift moving the msb of k to bit 32 - - mov EAX, 1; - shl EAX, CL; - dec EAX; - ror EAX, CL ; //ecx bits at msb - push EAX; // MASK - - // Then divide 2^(32+cx) by divisor (edx already ok) - xor EAX, EAX; - div int ptr [ESP + LASTPARAM + LOCALS-4*1]; //divisor; - push EAX; // kinv - align 16; -L2: - // Get 32 bits of quotient approx, multiplying - // most significant word of (rem*2^32+input) - mov EAX, [ESP+4]; //MASK; - and EAX, [EDI - 4]; - or EAX, EBP; - rol EAX, CL; - mov EBX, EBP; - mov EBP, [EDI - 4]; - mul int ptr [ESP]; //KINV; - - shl EAX, 1; - rcl EDX, 1; - - // Multiply by k and subtract to get remainder - // Subtraction must be done on two words - mov EAX, EDX; - mov ESI, EDX; // quot = high word - mul int ptr [ESP + LASTPARAM+LOCALS]; //divisor; - sub EBP, EAX; - sbb EBX, EDX; - jz Lb; // high word is 0, goto adjust on single word - - // Adjust quotient and remainder on two words -Ld: inc ESI; - sub EBP, [ESP + LASTPARAM+LOCALS]; //divisor; - sbb EBX, 0; - jnz Ld; - - // Adjust quotient and remainder on single word -Lb: cmp EBP, [ESP + LASTPARAM+LOCALS]; //divisor; - jc Lc; // rem in 0 .. divisor-1, OK - sub EBP, [ESP + LASTPARAM+LOCALS]; //divisor; - inc ESI; - jmp Lb; - - // Store result -Lc: - mov [EDI - 4], ESI; - lea EDI, [EDI - 4]; - dec int ptr [ESP + LASTPARAM + 4*1+LOCALS]; // len - jnz L2; - - pop EAX; // discard kinv - pop EAX; // discard mask - - mov EAX, EBP; // return final remainder - pop EBP; - pop EBX; - pop EDI; - pop ESI; - ret 3*4; - } -} - -@system unittest -{ - uint [] aa = new uint[101]; - for (int i=0; i>= 32; - c += cast(ulong)(x[$-3]) * x[$-1] + dest[$-4]; - dest[$-4] = cast(uint) c; - c >>= 32; -length2: - c += cast(ulong)(x[$-2]) * x[$-1]; - dest[$-3] = cast(uint) c; - c >>= 32; - dest[$-2] = cast(uint) c; -} - -//dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1] -// assert(dest.length = src.length*2); -// assert(src.length >= 3); -void multibyteTriangleAccumulateAsm(uint[] dest, const uint[] src) pure -{ - // Register usage - // EDX:EAX = used in multiply - // EBX = index - // ECX = carry1 - // EBP = carry2 - // EDI = end of dest for this pass through the loop. Index for outer loop. - // ESI = end of src. never changes - // [ESP] = M = src[i] = multiplier for this pass through the loop. - // dest.length is changed into dest.ptr+dest.length - version (D_PIC) - { - enum { zero = 0 } - } - else - { - // use p2 (load unit) instead of the overworked p0 or p1 (ALU units) - // when initializing registers to zero. - __gshared int zero = 0; - // use p3/p4 units - __gshared int storagenop; // write-only - } - - enum { LASTPARAM = 6*4 } // 4* pushes + local + return address. - asm pure nothrow { - naked; - - push ESI; - push EDI; - align 16; - push EBX; - push EBP; - push EAX; // local variable M= src[i] - mov EDI, [ESP + LASTPARAM + 4*3]; // dest.ptr - mov EBX, [ESP + LASTPARAM + 4*0]; // src.length - mov ESI, [ESP + LASTPARAM + 4*1]; // src.ptr - - lea ESI, [ESI + 4*EBX]; // ESI = end of left - add int ptr [ESP + LASTPARAM + 4*1], 4; // src.ptr, used for getting M - - // local variable [ESP + LASTPARAM + 4*2] = last value for EDI - lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass - - lea EAX, [EDI + 4*EBX-3*4]; // up to src.length - 3 - mov [ESP + LASTPARAM + 4*2], EAX; // last value for EDI = &dest[src.length*2 -3] - - cmp EBX, 3; - jz length_is_3; - - // We start at src[1], not src[0]. - dec EBX; - mov [ESP + LASTPARAM + 4*0], EBX; - -outer_loop: - mov EBX, [ESP + LASTPARAM + 4*0]; // src.length - mov EBP, 0; - mov ECX, 0; // ECX = input carry. - dec [ESP + LASTPARAM + 4*0]; // Next time, the length will be shorter by 1. - neg EBX; // count UP to zero. - - mov EAX, [ESI + 4*EBX - 4*1]; // get new M - mov [ESP], EAX; // save new M - - mov EAX, [ESI+4*EBX]; - test EBX, 1; - jnz L_enter_odd; - } - // -- Inner loop, with even entry point - mixin("asm pure nothrow { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}"); - asm pure nothrow { - mov [-4+EDI+4*EBX], EBP; - add EDI, 4; - cmp EDI, [ESP + LASTPARAM + 4*2]; // is EDI = &dest[$-3]? - jnz outer_loop; -length_is_3: - mov EAX, [ESI - 4*3]; - mul EAX, [ESI - 4*2]; - mov ECX, 0; - add [EDI-2*4], EAX; // ECX:dest[$-5] += x[$-3] * x[$-2] - adc ECX, EDX; - - mov EAX, [ESI - 4*3]; - mul EAX, [ESI - 4*1]; // x[$-3] * x[$-1] - add EAX, ECX; - mov ECX, 0; - adc EDX, 0; - // now EDX: EAX = c + x[$-3] * x[$-1] - add [EDI-1*4], EAX; // ECX:dest[$-4] += (EDX:EAX) - adc ECX, EDX; // ECX holds dest[$-3], it acts as carry for the last row -// do length == 2 - mov EAX, [ESI - 4*2]; - mul EAX, [ESI - 4*1]; - add ECX, EAX; - adc EDX, 0; - mov [EDI - 0*4], ECX; // dest[$-2:$-3] = c + x[$-2] * x[$-1]; - mov [EDI + 1*4], EDX; - - pop EAX; - pop EBP; - pop EBX; - pop EDI; - pop ESI; - ret 4*4; - } -L_enter_odd: - mixin("asm pure nothrow {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}"); -} - -@system unittest -{ - uint [] aa = new uint[200]; - uint [] a = aa[0 .. 100]; - uint [] b = new uint [100]; - aa[] = 761; - a[] = 0; - b[] = 0; - a[3] = 6; - b[0]=1; - b[1] = 17; - b[50 .. 100]=78; - multibyteTriangleAccumulateAsm(a, b[0 .. 50]); - uint [] c = new uint[100]; - c[] = 0; - c[1] = 17; - c[3] = 6; - assert(a[]==c[]); - assert(a[0]==0); - aa[] = 0xFFFF_FFFF; - a[] = 0; - b[] = 0; - b[0]= 0xbf6a1f01; - b[1]= 0x6e38ed64; - b[2]= 0xdaa797ed; - b[3] = 0; - - multibyteTriangleAccumulateAsm(a[0 .. 8], b[0 .. 4]); - assert(a[1]==0x3a600964); - assert(a[2]==0x339974f6); - assert(a[3]==0x46736fce); - assert(a[4]==0x5e24a2b4); - - b[3] = 0xe93ff9f4; - b[4] = 0x184f03; - a[]=0; - multibyteTriangleAccumulateAsm(a[0 .. 14], b[0 .. 7]); - assert(a[3]==0x79fff5c2); - assert(a[4]==0xcf384241); - assert(a[5]== 0x4a17fc8); - assert(a[6]==0x4d549025); -} - - -void multibyteSquare(BigDigit[] result, const BigDigit [] x) pure -{ - if (x.length < 4) - { - // Special cases, not worth doing triangular. - result[x.length] = multibyteMul(result[0 .. x.length], x, x[0], 0); - multibyteMultiplyAccumulate(result[1..$], x, x[1..$]); - return; - } - // Do half a square multiply. - // dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1] - result[x.length] = multibyteMul(result[1 .. x.length], x[1..$], x[0], 0); - multibyteTriangleAccumulateAsm(result[2..$], x[1..$]); - // Multiply by 2 - result[$-1] = multibyteShlNoMMX(result[1..$-1], result[1..$-1], 1); - // And add the diagonal elements - result[0] = 0; - multibyteAddDiagonalSquares(result, x); -} - -version (BignumPerformanceTest) -{ -import core.stdc.stdio; -int clock() { asm { push EBX; xor EAX, EAX; cpuid; pop EBX; rdtsc; } } - -__gshared uint [2200] X1; -__gshared uint [2200] Y1; -__gshared uint [4000] Z1; - -void testPerformance() pure -{ - // The performance results at the top of this file were obtained using - // a Windows device driver to access the CPU performance counters. - // The code below is less accurate but more widely usable. - // The value for division is quite inconsistent. - for (int i=0; i