vector: tweak the asm register assignment.

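This makes the fixed-point and floating-point code more similar: the
floating-point code now keeps flAlmost65536 in X5, matching fxAlmost65536
in X5 in the fixed-point code, and flSignMask moves to X3.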

Benchmarks don't show any significant change.

Change-Id: I723fa1605eaa248b40e784201b680c16cc3d26a2
Reviewed-on: https://go-review.googlesource.com/32134
Reviewed-by: David Crawshaw <crawshaw@golang.org>
This commit is contained in:
Nigel Tao 2016-10-26 15:57:48 +11:00
parent 0ed2caa453
commit c78039e8ce
2 changed files with 38 additions and 38 deletions

View File

@ -577,9 +577,9 @@ fxAccMaskEnd:
// xmm0 scratch // xmm0 scratch
// xmm1 x // xmm1 x
// xmm2 y, z // xmm2 y, z
// xmm3 flAlmost65536 // xmm3 flSignMask
// xmm4 flOne // xmm4 flOne
// xmm5 flSignMask // xmm5 flAlmost65536
// xmm6 gather // xmm6 gather
// xmm7 offset // xmm7 offset
// xmm8 scatterAndMulBy0x101 // xmm8 scatterAndMulBy0x101
@ -608,12 +608,12 @@ TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48
ORL $0x6000, AX ORL $0x6000, AX
MOVL AX, mxcsrNew-4(SP) MOVL AX, mxcsrNew-4(SP)
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
MOVOU flAlmost65536<>(SB), X3 // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
MOVOU flSignMask<>(SB), X3
MOVOU flOne<>(SB), X4 MOVOU flOne<>(SB), X4
MOVOU flSignMask<>(SB), X5 MOVOU flAlmost65536<>(SB), X5
// gather := XMM(see above) // PSHUFB shuffle mask. // gather := XMM(see above) // PSHUFB shuffle mask.
// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask. // scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
@ -659,10 +659,10 @@ flAccOpOverLoop4:
// y = x & flSignMask // y = x & flSignMask
// y = min(y, flOne) // y = min(y, flOne)
// y = mul(y, flAlmost65536) // y = mul(y, flAlmost65536)
MOVOU X5, X2 MOVOU X3, X2
ANDPS X1, X2 ANDPS X1, X2
MINPS X4, X2 MINPS X4, X2
MULPS X3, X2 MULPS X5, X2
// z = convertToInt32(y) // z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP) LDMXCSR mxcsrNew-4(SP)
@ -738,10 +738,10 @@ flAccOpOverLoop1:
// y = x & flSignMask // y = x & flSignMask
// y = min(y, flOne) // y = min(y, flOne)
// y = mul(y, flAlmost65536) // y = mul(y, flAlmost65536)
MOVOU X5, X2 MOVOU X3, X2
ANDPS X1, X2 ANDPS X1, X2
MINPS X4, X2 MINPS X4, X2
MULPS X3, X2 MULPS X5, X2
// z = convertToInt32(y) // z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP) LDMXCSR mxcsrNew-4(SP)
@ -791,9 +791,9 @@ flAccOpOverEnd:
// xmm0 scratch // xmm0 scratch
// xmm1 x // xmm1 x
// xmm2 y, z // xmm2 y, z
// xmm3 flAlmost65536 // xmm3 flSignMask
// xmm4 flOne // xmm4 flOne
// xmm5 flSignMask // xmm5 flAlmost65536
// xmm6 gather // xmm6 gather
// xmm7 offset // xmm7 offset
// xmm8 - // xmm8 -
@ -822,12 +822,12 @@ TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
ORL $0x6000, AX ORL $0x6000, AX
MOVL AX, mxcsrNew-4(SP) MOVL AX, mxcsrNew-4(SP)
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
MOVOU flAlmost65536<>(SB), X3 // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
MOVOU flSignMask<>(SB), X3
MOVOU flOne<>(SB), X4 MOVOU flOne<>(SB), X4
MOVOU flSignMask<>(SB), X5 MOVOU flAlmost65536<>(SB), X5
// gather := XMM(see above) // PSHUFB shuffle mask. // gather := XMM(see above) // PSHUFB shuffle mask.
MOVOU gather<>(SB), X6 MOVOU gather<>(SB), X6
@ -867,10 +867,10 @@ flAccOpSrcLoop4:
// y = x & flSignMask // y = x & flSignMask
// y = min(y, flOne) // y = min(y, flOne)
// y = mul(y, flAlmost65536) // y = mul(y, flAlmost65536)
MOVOU X5, X2 MOVOU X3, X2
ANDPS X1, X2 ANDPS X1, X2
MINPS X4, X2 MINPS X4, X2
MULPS X3, X2 MULPS X5, X2
// z = convertToInt32(y) // z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP) LDMXCSR mxcsrNew-4(SP)
@ -906,10 +906,10 @@ flAccOpSrcLoop1:
// y = x & flSignMask // y = x & flSignMask
// y = min(y, flOne) // y = min(y, flOne)
// y = mul(y, flAlmost65536) // y = mul(y, flAlmost65536)
MOVOU X5, X2 MOVOU X3, X2
ANDPS X1, X2 ANDPS X1, X2
MINPS X4, X2 MINPS X4, X2
MULPS X3, X2 MULPS X5, X2
// z = convertToInt32(y) // z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP) LDMXCSR mxcsrNew-4(SP)
@ -945,9 +945,9 @@ flAccOpSrcEnd:
// xmm0 scratch // xmm0 scratch
// xmm1 x // xmm1 x
// xmm2 y, z // xmm2 y, z
// xmm3 flAlmost65536 // xmm3 flSignMask
// xmm4 flOne // xmm4 flOne
// xmm5 flSignMask // xmm5 flAlmost65536
// xmm6 - // xmm6 -
// xmm7 offset // xmm7 offset
// xmm8 - // xmm8 -
@ -976,12 +976,12 @@ TEXT ·floatingAccumulateMaskSIMD(SB), NOSPLIT, $8-48
ORL $0x6000, AX ORL $0x6000, AX
MOVL AX, mxcsrNew-4(SP) MOVL AX, mxcsrNew-4(SP)
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
MOVOU flAlmost65536<>(SB), X3 // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
MOVOU flSignMask<>(SB), X3
MOVOU flOne<>(SB), X4 MOVOU flOne<>(SB), X4
MOVOU flSignMask<>(SB), X5 MOVOU flAlmost65536<>(SB), X5
// offset := XMM(0x00000000 repeated four times) // Cumulative sum. // offset := XMM(0x00000000 repeated four times) // Cumulative sum.
XORPS X7, X7 XORPS X7, X7
@ -1018,10 +1018,10 @@ flAccMaskLoop4:
// y = x & flSignMask // y = x & flSignMask
// y = min(y, flOne) // y = min(y, flOne)
// y = mul(y, flAlmost65536) // y = mul(y, flAlmost65536)
MOVOU X5, X2 MOVOU X3, X2
ANDPS X1, X2 ANDPS X1, X2
MINPS X4, X2 MINPS X4, X2
MULPS X3, X2 MULPS X5, X2
// z = convertToInt32(y) // z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP) LDMXCSR mxcsrNew-4(SP)
@ -1055,10 +1055,10 @@ flAccMaskLoop1:
// y = x & flSignMask // y = x & flSignMask
// y = min(y, flOne) // y = min(y, flOne)
// y = mul(y, flAlmost65536) // y = mul(y, flAlmost65536)
MOVOU X5, X2 MOVOU X3, X2
ANDPS X1, X2 ANDPS X1, X2
MINPS X4, X2 MINPS X4, X2
MULPS X3, X2 MULPS X5, X2
// z = convertToInt32(y) // z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP) LDMXCSR mxcsrNew-4(SP)

View File

@ -242,13 +242,13 @@ const (
sizeOfUint32 = 4 sizeOfUint32 = 4
fxXMM3 = `-` fxXMM3 = `-`
flXMM3 = `flAlmost65536` flXMM3 = `flSignMask`
fxXMM4 = `-` fxXMM4 = `-`
flXMM4 = `flOne` flXMM4 = `flOne`
fxXMM5 = `fxAlmost65536` fxXMM5 = `fxAlmost65536`
flXMM5 = `flSignMask` flXMM5 = `flAlmost65536`
oneArgLoadArgs = ` oneArgLoadArgs = `
MOVQ buf_base+0(FP), DI MOVQ buf_base+0(FP), DI
@ -281,12 +281,12 @@ const (
MOVOU fxAlmost65536<>(SB), X5 MOVOU fxAlmost65536<>(SB), X5
` `
flLoadXMMRegs = ` flLoadXMMRegs = `
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
MOVOU flAlmost65536<>(SB), X3 // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
MOVOU flSignMask<>(SB), X3
MOVOU flOne<>(SB), X4 MOVOU flOne<>(SB), X4
MOVOU flSignMask<>(SB), X5 MOVOU flAlmost65536<>(SB), X5
` `
fxAdd = `PADDD` fxAdd = `PADDD`
@ -312,10 +312,10 @@ const (
// y = x & flSignMask // y = x & flSignMask
// y = min(y, flOne) // y = min(y, flOne)
// y = mul(y, flAlmost65536) // y = mul(y, flAlmost65536)
MOVOU X5, X2 MOVOU X3, X2
ANDPS X1, X2 ANDPS X1, X2
MINPS X4, X2 MINPS X4, X2
MULPS X3, X2 MULPS X5, X2
` `
fxConvertToInt32 = ` fxConvertToInt32 = `