vector: tweak the asm register assignment.
This makes the fixed point and floating point code more similar.

Benchmarks don't show any significant change.

Change-Id: I723fa1605eaa248b40e784201b680c16cc3d26a2
Reviewed-on: https://go-review.googlesource.com/32134
Reviewed-by: David Crawshaw <crawshaw@golang.org>
This commit is contained in:
parent
0ed2caa453
commit
c78039e8ce
|
@ -577,9 +577,9 @@ fxAccMaskEnd:
|
||||||
// xmm0 scratch
|
// xmm0 scratch
|
||||||
// xmm1 x
|
// xmm1 x
|
||||||
// xmm2 y, z
|
// xmm2 y, z
|
||||||
// xmm3 flAlmost65536
|
// xmm3 flSignMask
|
||||||
// xmm4 flOne
|
// xmm4 flOne
|
||||||
// xmm5 flSignMask
|
// xmm5 flAlmost65536
|
||||||
// xmm6 gather
|
// xmm6 gather
|
||||||
// xmm7 offset
|
// xmm7 offset
|
||||||
// xmm8 scatterAndMulBy0x101
|
// xmm8 scatterAndMulBy0x101
|
||||||
|
@ -608,12 +608,12 @@ TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
|
||||||
ORL $0x6000, AX
|
ORL $0x6000, AX
|
||||||
MOVL AX, mxcsrNew-4(SP)
|
MOVL AX, mxcsrNew-4(SP)
|
||||||
|
|
||||||
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
|
|
||||||
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
|
|
||||||
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
|
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
|
||||||
MOVOU flAlmost65536<>(SB), X3
|
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
|
||||||
|
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
|
||||||
|
MOVOU flSignMask<>(SB), X3
|
||||||
MOVOU flOne<>(SB), X4
|
MOVOU flOne<>(SB), X4
|
||||||
MOVOU flSignMask<>(SB), X5
|
MOVOU flAlmost65536<>(SB), X5
|
||||||
|
|
||||||
// gather := XMM(see above) // PSHUFB shuffle mask.
|
// gather := XMM(see above) // PSHUFB shuffle mask.
|
||||||
// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
|
// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
|
||||||
|
@ -659,10 +659,10 @@ flAccOpOverLoop4:
|
||||||
// y = x & flSignMask
|
// y = x & flSignMask
|
||||||
// y = min(y, flOne)
|
// y = min(y, flOne)
|
||||||
// y = mul(y, flAlmost65536)
|
// y = mul(y, flAlmost65536)
|
||||||
MOVOU X5, X2
|
MOVOU X3, X2
|
||||||
ANDPS X1, X2
|
ANDPS X1, X2
|
||||||
MINPS X4, X2
|
MINPS X4, X2
|
||||||
MULPS X3, X2
|
MULPS X5, X2
|
||||||
|
|
||||||
// z = convertToInt32(y)
|
// z = convertToInt32(y)
|
||||||
LDMXCSR mxcsrNew-4(SP)
|
LDMXCSR mxcsrNew-4(SP)
|
||||||
|
@ -738,10 +738,10 @@ flAccOpOverLoop1:
|
||||||
// y = x & flSignMask
|
// y = x & flSignMask
|
||||||
// y = min(y, flOne)
|
// y = min(y, flOne)
|
||||||
// y = mul(y, flAlmost65536)
|
// y = mul(y, flAlmost65536)
|
||||||
MOVOU X5, X2
|
MOVOU X3, X2
|
||||||
ANDPS X1, X2
|
ANDPS X1, X2
|
||||||
MINPS X4, X2
|
MINPS X4, X2
|
||||||
MULPS X3, X2
|
MULPS X5, X2
|
||||||
|
|
||||||
// z = convertToInt32(y)
|
// z = convertToInt32(y)
|
||||||
LDMXCSR mxcsrNew-4(SP)
|
LDMXCSR mxcsrNew-4(SP)
|
||||||
|
@ -791,9 +791,9 @@ flAccOpOverEnd:
|
||||||
// xmm0 scratch
|
// xmm0 scratch
|
||||||
// xmm1 x
|
// xmm1 x
|
||||||
// xmm2 y, z
|
// xmm2 y, z
|
||||||
// xmm3 flAlmost65536
|
// xmm3 flSignMask
|
||||||
// xmm4 flOne
|
// xmm4 flOne
|
||||||
// xmm5 flSignMask
|
// xmm5 flAlmost65536
|
||||||
// xmm6 gather
|
// xmm6 gather
|
||||||
// xmm7 offset
|
// xmm7 offset
|
||||||
// xmm8 -
|
// xmm8 -
|
||||||
|
@ -822,12 +822,12 @@ TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
|
||||||
ORL $0x6000, AX
|
ORL $0x6000, AX
|
||||||
MOVL AX, mxcsrNew-4(SP)
|
MOVL AX, mxcsrNew-4(SP)
|
||||||
|
|
||||||
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
|
|
||||||
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
|
|
||||||
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
|
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
|
||||||
MOVOU flAlmost65536<>(SB), X3
|
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
|
||||||
|
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
|
||||||
|
MOVOU flSignMask<>(SB), X3
|
||||||
MOVOU flOne<>(SB), X4
|
MOVOU flOne<>(SB), X4
|
||||||
MOVOU flSignMask<>(SB), X5
|
MOVOU flAlmost65536<>(SB), X5
|
||||||
|
|
||||||
// gather := XMM(see above) // PSHUFB shuffle mask.
|
// gather := XMM(see above) // PSHUFB shuffle mask.
|
||||||
MOVOU gather<>(SB), X6
|
MOVOU gather<>(SB), X6
|
||||||
|
@ -867,10 +867,10 @@ flAccOpSrcLoop4:
|
||||||
// y = x & flSignMask
|
// y = x & flSignMask
|
||||||
// y = min(y, flOne)
|
// y = min(y, flOne)
|
||||||
// y = mul(y, flAlmost65536)
|
// y = mul(y, flAlmost65536)
|
||||||
MOVOU X5, X2
|
MOVOU X3, X2
|
||||||
ANDPS X1, X2
|
ANDPS X1, X2
|
||||||
MINPS X4, X2
|
MINPS X4, X2
|
||||||
MULPS X3, X2
|
MULPS X5, X2
|
||||||
|
|
||||||
// z = convertToInt32(y)
|
// z = convertToInt32(y)
|
||||||
LDMXCSR mxcsrNew-4(SP)
|
LDMXCSR mxcsrNew-4(SP)
|
||||||
|
@ -906,10 +906,10 @@ flAccOpSrcLoop1:
|
||||||
// y = x & flSignMask
|
// y = x & flSignMask
|
||||||
// y = min(y, flOne)
|
// y = min(y, flOne)
|
||||||
// y = mul(y, flAlmost65536)
|
// y = mul(y, flAlmost65536)
|
||||||
MOVOU X5, X2
|
MOVOU X3, X2
|
||||||
ANDPS X1, X2
|
ANDPS X1, X2
|
||||||
MINPS X4, X2
|
MINPS X4, X2
|
||||||
MULPS X3, X2
|
MULPS X5, X2
|
||||||
|
|
||||||
// z = convertToInt32(y)
|
// z = convertToInt32(y)
|
||||||
LDMXCSR mxcsrNew-4(SP)
|
LDMXCSR mxcsrNew-4(SP)
|
||||||
|
@ -945,9 +945,9 @@ flAccOpSrcEnd:
|
||||||
// xmm0 scratch
|
// xmm0 scratch
|
||||||
// xmm1 x
|
// xmm1 x
|
||||||
// xmm2 y, z
|
// xmm2 y, z
|
||||||
// xmm3 flAlmost65536
|
// xmm3 flSignMask
|
||||||
// xmm4 flOne
|
// xmm4 flOne
|
||||||
// xmm5 flSignMask
|
// xmm5 flAlmost65536
|
||||||
// xmm6 -
|
// xmm6 -
|
||||||
// xmm7 offset
|
// xmm7 offset
|
||||||
// xmm8 -
|
// xmm8 -
|
||||||
|
@ -976,12 +976,12 @@ TEXT ·floatingAccumulateMaskSIMD(SB), NOSPLIT, $8-48
|
||||||
ORL $0x6000, AX
|
ORL $0x6000, AX
|
||||||
MOVL AX, mxcsrNew-4(SP)
|
MOVL AX, mxcsrNew-4(SP)
|
||||||
|
|
||||||
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
|
|
||||||
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
|
|
||||||
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
|
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
|
||||||
MOVOU flAlmost65536<>(SB), X3
|
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
|
||||||
|
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
|
||||||
|
MOVOU flSignMask<>(SB), X3
|
||||||
MOVOU flOne<>(SB), X4
|
MOVOU flOne<>(SB), X4
|
||||||
MOVOU flSignMask<>(SB), X5
|
MOVOU flAlmost65536<>(SB), X5
|
||||||
|
|
||||||
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
|
// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
|
||||||
XORPS X7, X7
|
XORPS X7, X7
|
||||||
|
@ -1018,10 +1018,10 @@ flAccMaskLoop4:
|
||||||
// y = x & flSignMask
|
// y = x & flSignMask
|
||||||
// y = min(y, flOne)
|
// y = min(y, flOne)
|
||||||
// y = mul(y, flAlmost65536)
|
// y = mul(y, flAlmost65536)
|
||||||
MOVOU X5, X2
|
MOVOU X3, X2
|
||||||
ANDPS X1, X2
|
ANDPS X1, X2
|
||||||
MINPS X4, X2
|
MINPS X4, X2
|
||||||
MULPS X3, X2
|
MULPS X5, X2
|
||||||
|
|
||||||
// z = convertToInt32(y)
|
// z = convertToInt32(y)
|
||||||
LDMXCSR mxcsrNew-4(SP)
|
LDMXCSR mxcsrNew-4(SP)
|
||||||
|
@ -1055,10 +1055,10 @@ flAccMaskLoop1:
|
||||||
// y = x & flSignMask
|
// y = x & flSignMask
|
||||||
// y = min(y, flOne)
|
// y = min(y, flOne)
|
||||||
// y = mul(y, flAlmost65536)
|
// y = mul(y, flAlmost65536)
|
||||||
MOVOU X5, X2
|
MOVOU X3, X2
|
||||||
ANDPS X1, X2
|
ANDPS X1, X2
|
||||||
MINPS X4, X2
|
MINPS X4, X2
|
||||||
MULPS X3, X2
|
MULPS X5, X2
|
||||||
|
|
||||||
// z = convertToInt32(y)
|
// z = convertToInt32(y)
|
||||||
LDMXCSR mxcsrNew-4(SP)
|
LDMXCSR mxcsrNew-4(SP)
|
||||||
|
|
|
@ -242,13 +242,13 @@ const (
|
||||||
sizeOfUint32 = 4
|
sizeOfUint32 = 4
|
||||||
|
|
||||||
fxXMM3 = `-`
|
fxXMM3 = `-`
|
||||||
flXMM3 = `flAlmost65536`
|
flXMM3 = `flSignMask`
|
||||||
|
|
||||||
fxXMM4 = `-`
|
fxXMM4 = `-`
|
||||||
flXMM4 = `flOne`
|
flXMM4 = `flOne`
|
||||||
|
|
||||||
fxXMM5 = `fxAlmost65536`
|
fxXMM5 = `fxAlmost65536`
|
||||||
flXMM5 = `flSignMask`
|
flXMM5 = `flAlmost65536`
|
||||||
|
|
||||||
oneArgLoadArgs = `
|
oneArgLoadArgs = `
|
||||||
MOVQ buf_base+0(FP), DI
|
MOVQ buf_base+0(FP), DI
|
||||||
|
@ -281,12 +281,12 @@ const (
|
||||||
MOVOU fxAlmost65536<>(SB), X5
|
MOVOU fxAlmost65536<>(SB), X5
|
||||||
`
|
`
|
||||||
flLoadXMMRegs = `
|
flLoadXMMRegs = `
|
||||||
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
|
|
||||||
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
|
|
||||||
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
|
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
|
||||||
MOVOU flAlmost65536<>(SB), X3
|
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
|
||||||
|
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
|
||||||
|
MOVOU flSignMask<>(SB), X3
|
||||||
MOVOU flOne<>(SB), X4
|
MOVOU flOne<>(SB), X4
|
||||||
MOVOU flSignMask<>(SB), X5
|
MOVOU flAlmost65536<>(SB), X5
|
||||||
`
|
`
|
||||||
|
|
||||||
fxAdd = `PADDD`
|
fxAdd = `PADDD`
|
||||||
|
@ -312,10 +312,10 @@ const (
|
||||||
// y = x & flSignMask
|
// y = x & flSignMask
|
||||||
// y = min(y, flOne)
|
// y = min(y, flOne)
|
||||||
// y = mul(y, flAlmost65536)
|
// y = mul(y, flAlmost65536)
|
||||||
MOVOU X5, X2
|
MOVOU X3, X2
|
||||||
ANDPS X1, X2
|
ANDPS X1, X2
|
||||||
MINPS X4, X2
|
MINPS X4, X2
|
||||||
MULPS X3, X2
|
MULPS X5, X2
|
||||||
`
|
`
|
||||||
|
|
||||||
fxConvertToInt32 = `
|
fxConvertToInt32 = `
|
||||||
|
|
Loading…
Reference in New Issue
Block a user