From c78039e8ce1e427880f400175cb7cddd6cce7248 Mon Sep 17 00:00:00 2001 From: Nigel Tao Date: Wed, 26 Oct 2016 15:57:48 +1100 Subject: [PATCH] vector: tweak the asm register assignment. This makes the fixed point and floating point code more similar. Benchmarks don't show any significant change. Change-Id: I723fa1605eaa248b40e784201b680c16cc3d26a2 Reviewed-on: https://go-review.googlesource.com/32134 Reviewed-by: David Crawshaw --- vector/acc_amd64.s | 60 +++++++++++++++++++++++----------------------- vector/gen.go | 16 ++++++------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/vector/acc_amd64.s b/vector/acc_amd64.s index 299e052..6a424bc 100644 --- a/vector/acc_amd64.s +++ b/vector/acc_amd64.s @@ -577,9 +577,9 @@ fxAccMaskEnd: // xmm0 scratch // xmm1 x // xmm2 y, z -// xmm3 flAlmost65536 +// xmm3 flSignMask // xmm4 flOne -// xmm5 flSignMask +// xmm5 flAlmost65536 // xmm6 gather // xmm7 offset // xmm8 scatterAndMulBy0x101 @@ -608,12 +608,12 @@ TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48 ORL $0x6000, AX MOVL AX, mxcsrNew-4(SP) - // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. - // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. - MOVOU flAlmost65536<>(SB), X3 + // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. + // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. + MOVOU flSignMask<>(SB), X3 MOVOU flOne<>(SB), X4 - MOVOU flSignMask<>(SB), X5 + MOVOU flAlmost65536<>(SB), X5 // gather := XMM(see above) // PSHUFB shuffle mask. // scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask. @@ -659,10 +659,10 @@ flAccOpOverLoop4: // y = x & flSignMask // y = min(y, flOne) // y = mul(y, flAlmost65536) - MOVOU X5, X2 + MOVOU X3, X2 ANDPS X1, X2 MINPS X4, X2 - MULPS X3, X2 + MULPS X5, X2 // z = convertToInt32(y) LDMXCSR mxcsrNew-4(SP) @@ -738,10 +738,10 @@ flAccOpOverLoop1: // y = x & flSignMask // y = min(y, flOne) // y = mul(y, flAlmost65536) - MOVOU X5, X2 + MOVOU X3, X2 ANDPS X1, X2 MINPS X4, X2 - MULPS X3, X2 + MULPS X5, X2 // z = convertToInt32(y) LDMXCSR mxcsrNew-4(SP) @@ -791,9 +791,9 @@ flAccOpOverEnd: // xmm0 scratch // xmm1 x // xmm2 y, z -// xmm3 flAlmost65536 +// xmm3 flSignMask // xmm4 flOne -// xmm5 flSignMask +// xmm5 flAlmost65536 // xmm6 gather // xmm7 offset // xmm8 - @@ -822,12 +822,12 @@ TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48 ORL $0x6000, AX MOVL AX, mxcsrNew-4(SP) - // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. - // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. - MOVOU flAlmost65536<>(SB), X3 + // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. + // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. + MOVOU flSignMask<>(SB), X3 MOVOU flOne<>(SB), X4 - MOVOU flSignMask<>(SB), X5 + MOVOU flAlmost65536<>(SB), X5 // gather := XMM(see above) // PSHUFB shuffle mask. MOVOU gather<>(SB), X6 @@ -867,10 +867,10 @@ flAccOpSrcLoop4: // y = x & flSignMask // y = min(y, flOne) // y = mul(y, flAlmost65536) - MOVOU X5, X2 + MOVOU X3, X2 ANDPS X1, X2 MINPS X4, X2 - MULPS X3, X2 + MULPS X5, X2 // z = convertToInt32(y) LDMXCSR mxcsrNew-4(SP) @@ -906,10 +906,10 @@ flAccOpSrcLoop1: // y = x & flSignMask // y = min(y, flOne) // y = mul(y, flAlmost65536) - MOVOU X5, X2 + MOVOU X3, X2 ANDPS X1, X2 MINPS X4, X2 - MULPS X3, X2 + MULPS X5, X2 // z = convertToInt32(y) LDMXCSR mxcsrNew-4(SP) @@ -945,9 +945,9 @@ flAccOpSrcEnd: // xmm0 scratch // xmm1 x // xmm2 y, z -// xmm3 flAlmost65536 +// xmm3 flSignMask // xmm4 flOne -// xmm5 flSignMask +// xmm5 flAlmost65536 // xmm6 - // xmm7 offset // xmm8 - @@ -976,12 +976,12 @@ TEXT ·floatingAccumulateMaskSIMD(SB), NOSPLIT, $8-48 ORL $0x6000, AX MOVL AX, mxcsrNew-4(SP) - // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. - // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. - MOVOU flAlmost65536<>(SB), X3 + // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. + // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. + MOVOU flSignMask<>(SB), X3 MOVOU flOne<>(SB), X4 - MOVOU flSignMask<>(SB), X5 + MOVOU flAlmost65536<>(SB), X5 // offset := XMM(0x00000000 repeated four times) // Cumulative sum. XORPS X7, X7 @@ -1018,10 +1018,10 @@ flAccMaskLoop4: // y = x & flSignMask // y = min(y, flOne) // y = mul(y, flAlmost65536) - MOVOU X5, X2 + MOVOU X3, X2 ANDPS X1, X2 MINPS X4, X2 - MULPS X3, X2 + MULPS X5, X2 // z = convertToInt32(y) LDMXCSR mxcsrNew-4(SP) @@ -1055,10 +1055,10 @@ flAccMaskLoop1: // y = x & flSignMask // y = min(y, flOne) // y = mul(y, flAlmost65536) - MOVOU X5, X2 + MOVOU X3, X2 ANDPS X1, X2 MINPS X4, X2 - MULPS X3, X2 + MULPS X5, X2 // z = convertToInt32(y) LDMXCSR mxcsrNew-4(SP) diff --git a/vector/gen.go b/vector/gen.go index cafd860..28b298b 100644 --- a/vector/gen.go +++ b/vector/gen.go @@ -242,13 +242,13 @@ const ( sizeOfUint32 = 4 fxXMM3 = `-` - flXMM3 = `flAlmost65536` + flXMM3 = `flSignMask` fxXMM4 = `-` flXMM4 = `flOne` fxXMM5 = `fxAlmost65536` - flXMM5 = `flSignMask` + flXMM5 = `flAlmost65536` oneArgLoadArgs = ` MOVQ buf_base+0(FP), DI @@ -281,12 +281,12 @@ const ( MOVOU fxAlmost65536<>(SB), X5 ` flLoadXMMRegs = ` - // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. - // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. - MOVOU flAlmost65536<>(SB), X3 + // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. + // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. + MOVOU flSignMask<>(SB), X3 MOVOU flOne<>(SB), X4 - MOVOU flSignMask<>(SB), X5 + MOVOU flAlmost65536<>(SB), X5 ` fxAdd = `PADDD` @@ -312,10 +312,10 @@ const ( // y = x & flSignMask // y = min(y, flOne) // y = mul(y, flAlmost65536) - MOVOU X5, X2 + MOVOU X3, X2 ANDPS X1, X2 MINPS X4, X2 - MULPS X3, X2 + MULPS X5, X2 ` fxConvertToInt32 = `