vector: tweak the asm register assignment.

This makes the fixed-point and floating-point code more similar: flAlmost65536 now lives in X5, matching fxAlmost65536, and flSignMask moves to X3.

Benchmarks don't show any significant change.

Change-Id: I723fa1605eaa248b40e784201b680c16cc3d26a2
Reviewed-on: https://go-review.googlesource.com/32134
Reviewed-by: David Crawshaw <crawshaw@golang.org>
This commit is contained in:
Nigel Tao 2016-10-26 15:57:48 +11:00
parent 0ed2caa453
commit c78039e8ce
2 changed files with 38 additions and 38 deletions

View File

@ -577,9 +577,9 @@ fxAccMaskEnd:
// xmm0 scratch
// xmm1 x
// xmm2 y, z
// xmm3 flAlmost65536
// xmm3 flSignMask
// xmm4 flOne
// xmm5 flSignMask
// xmm5 flAlmost65536
// xmm6 gather
// xmm7 offset
// xmm8 scatterAndMulBy0x101
@ -608,12 +608,12 @@ TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
ORL $0x6000, AX
MOVL AX, mxcsrNew-4(SP)
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
MOVOU flAlmost65536<>(SB), X3
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
MOVOU flSignMask<>(SB), X3
MOVOU flOne<>(SB), X4
MOVOU flSignMask<>(SB), X5
MOVOU flAlmost65536<>(SB), X5
// gather := XMM(see above) // PSHUFB shuffle mask.
// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
@ -659,10 +659,10 @@ flAccOpOverLoop4:
// y = x & flSignMask
// y = min(y, flOne)
// y = mul(y, flAlmost65536)
MOVOU X5, X2
MOVOU X3, X2
ANDPS X1, X2
MINPS X4, X2
MULPS X3, X2
MULPS X5, X2
// z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
@ -738,10 +738,10 @@ flAccOpOverLoop1:
// y = x & flSignMask
// y = min(y, flOne)
// y = mul(y, flAlmost65536)
MOVOU X5, X2
MOVOU X3, X2
ANDPS X1, X2
MINPS X4, X2
MULPS X3, X2
MULPS X5, X2
// z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
@ -791,9 +791,9 @@ flAccOpOverEnd:
// xmm0 scratch
// xmm1 x
// xmm2 y, z
// xmm3 flAlmost65536
// xmm3 flSignMask
// xmm4 flOne
// xmm5 flSignMask
// xmm5 flAlmost65536
// xmm6 gather
// xmm7 offset
// xmm8 -
@ -822,12 +822,12 @@ TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
ORL $0x6000, AX
MOVL AX, mxcsrNew-4(SP)
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
MOVOU flAlmost65536<>(SB), X3
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
MOVOU flSignMask<>(SB), X3
MOVOU flOne<>(SB), X4
MOVOU flSignMask<>(SB), X5
MOVOU flAlmost65536<>(SB), X5
// gather := XMM(see above) // PSHUFB shuffle mask.
MOVOU gather<>(SB), X6
@ -867,10 +867,10 @@ flAccOpSrcLoop4:
// y = x & flSignMask
// y = min(y, flOne)
// y = mul(y, flAlmost65536)
MOVOU X5, X2
MOVOU X3, X2
ANDPS X1, X2
MINPS X4, X2
MULPS X3, X2
MULPS X5, X2
// z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
@ -906,10 +906,10 @@ flAccOpSrcLoop1:
// y = x & flSignMask
// y = min(y, flOne)
// y = mul(y, flAlmost65536)
MOVOU X5, X2
MOVOU X3, X2
ANDPS X1, X2
MINPS X4, X2
MULPS X3, X2
MULPS X5, X2
// z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
@ -945,9 +945,9 @@ flAccOpSrcEnd:
// xmm0 scratch
// xmm1 x
// xmm2 y, z
// xmm3 flAlmost65536
// xmm3 flSignMask
// xmm4 flOne
// xmm5 flSignMask
// xmm5 flAlmost65536
// xmm6 -
// xmm7 offset
// xmm8 -
@ -976,12 +976,12 @@ TEXT ·floatingAccumulateMaskSIMD(SB), NOSPLIT, $8-48
ORL $0x6000, AX
MOVL AX, mxcsrNew-4(SP)
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
MOVOU flAlmost65536<>(SB), X3
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
MOVOU flSignMask<>(SB), X3
MOVOU flOne<>(SB), X4
MOVOU flSignMask<>(SB), X5
MOVOU flAlmost65536<>(SB), X5
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
XORPS X7, X7
@ -1018,10 +1018,10 @@ flAccMaskLoop4:
// y = x & flSignMask
// y = min(y, flOne)
// y = mul(y, flAlmost65536)
MOVOU X5, X2
MOVOU X3, X2
ANDPS X1, X2
MINPS X4, X2
MULPS X3, X2
MULPS X5, X2
// z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
@ -1055,10 +1055,10 @@ flAccMaskLoop1:
// y = x & flSignMask
// y = min(y, flOne)
// y = mul(y, flAlmost65536)
MOVOU X5, X2
MOVOU X3, X2
ANDPS X1, X2
MINPS X4, X2
MULPS X3, X2
MULPS X5, X2
// z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)

View File

@ -242,13 +242,13 @@ const (
sizeOfUint32 = 4
fxXMM3 = `-`
flXMM3 = `flAlmost65536`
flXMM3 = `flSignMask`
fxXMM4 = `-`
flXMM4 = `flOne`
fxXMM5 = `fxAlmost65536`
flXMM5 = `flSignMask`
flXMM5 = `flAlmost65536`
oneArgLoadArgs = `
MOVQ buf_base+0(FP), DI
@ -281,12 +281,12 @@ const (
MOVOU fxAlmost65536<>(SB), X5
`
flLoadXMMRegs = `
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
MOVOU flAlmost65536<>(SB), X3
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
MOVOU flSignMask<>(SB), X3
MOVOU flOne<>(SB), X4
MOVOU flSignMask<>(SB), X5
MOVOU flAlmost65536<>(SB), X5
`
fxAdd = `PADDD`
@ -312,10 +312,10 @@ const (
// y = x & flSignMask
// y = min(y, flOne)
// y = mul(y, flAlmost65536)
MOVOU X5, X2
MOVOU X3, X2
ANDPS X1, X2
MINPS X4, X2
MULPS X3, X2
MULPS X5, X2
`
fxConvertToInt32 = `