vector: remove a shift from the inner loop.

PSHUFB already lets us pick certain bytes out of an XMM register. There's
no need to shift by 8 bits (1 byte) beforehand.

The code generator is simpler. There's also a small win on the benchmarks,
especially for FixedAccumulateOpOverSIMD.

name                              old time/op  new time/op  delta
FixedAccumulateOpOverSIMD16-8     183ns ± 0%   174ns ± 0%   -4.92%  (p=0.000 n=8+8)
FixedAccumulateOpSrcSIMD16-8      87.0ns ± 1%  86.3ns ± 0%  -0.77%  (p=0.000 n=10+9)
FixedAccumulateMaskSIMD16-8       80.4ns ± 1%  81.2ns ± 1%  +1.01%  (p=0.000 n=10+10)
FloatingAccumulateOpOverSIMD16-8  250ns ± 1%   244ns ± 0%   -2.39%  (p=0.000 n=10+8)
FloatingAccumulateOpSrcSIMD16-8   176ns ± 1%   176ns ± 0%   ~       (p=0.142 n=10+8)
FloatingAccumulateMaskSIMD16-8    167ns ± 0%   167ns ± 0%   ~       (p=0.137 n=8+10)
FixedAccumulateOpOverSIMD64-8     2.73µs ± 1%  2.58µs ± 0%  -5.36%  (p=0.000 n=10+7)
FixedAccumulateOpSrcSIMD64-8      1.18µs ± 1%  1.17µs ± 0%  -0.33%  (p=0.003 n=10+9)
FixedAccumulateMaskSIMD64-8       1.09µs ± 0%  1.09µs ± 0%  -0.17%  (p=0.047 n=9+9)
FloatingAccumulateOpOverSIMD64-8  3.67µs ± 0%  3.61µs ± 1%  -1.47%  (p=0.000 n=7+10)
FloatingAccumulateOpSrcSIMD64-8   2.60µs ± 0%  2.61µs ± 0%  +0.19%  (p=0.003 n=8+8)
FloatingAccumulateMaskSIMD64-8    2.47µs ± 0%  2.46µs ± 0%  ~       (p=0.162 n=10+9)
GlyphAlpha16Over-8                2.99µs ± 0%  2.98µs ± 1%  -0.50%  (p=0.021 n=9+10)
GlyphAlpha16Src-8                 2.89µs ± 1%  2.89µs ± 0%  ~       (p=0.381 n=10+10)
GlyphAlpha32Over-8                4.53µs ± 0%  4.50µs ± 0%  -0.83%  (p=0.000 n=10+10)
GlyphAlpha32Src-8                 4.14µs ± 0%  4.13µs ± 0%  -0.21%  (p=0.026 n=9+10)
GlyphAlpha64Over-8                8.97µs ± 1%  8.80µs ± 0%  -1.85%  (p=0.000 n=10+9)
GlyphAlpha64Src-8                 7.42µs ± 1%  7.39µs ± 0%  -0.45%  (p=0.011 n=10+10)
GlyphAlpha128Over-8               21.8µs ± 0%  21.2µs ± 0%  -2.91%  (p=0.000 n=9+9)
GlyphAlpha128Src-8                15.6µs ± 0%  15.6µs ± 0%  ~       (p=0.982 n=10+7)
GlyphAlpha256Over-8               66.3µs ± 1%  63.7µs ± 0%  -3.84%  (p=0.000 n=10+9)
GlyphAlpha256Src-8                41.2µs ± 1%  41.2µs ± 1%  ~       (p=1.000 n=10+10)
GlyphRGBA16Over-8                 4.75µs ± 0%  4.75µs ± 1%  ~       (p=0.735 n=9+10)
GlyphRGBA16Src-8                  4.20µs ± 0%  4.20µs ± 0%  ~       (p=0.503 n=8+8)
GlyphRGBA32Over-8                 11.4µs ± 0%  11.4µs ± 0%  ~       (p=0.119 n=9+9)
GlyphRGBA32Src-8                  9.34µs ± 1%  9.32µs ± 0%  ~       (p=0.062 n=9+8)
GlyphRGBA64Over-8                 36.0µs ± 0%  36.1µs ± 0%  ~       (p=0.209 n=8+9)
GlyphRGBA64Src-8                  27.9µs ± 1%  27.8µs ± 0%  ~       (p=0.796 n=10+10)
GlyphRGBA128Over-8                131µs ± 0%   131µs ± 0%   ~       (p=0.931 n=9+9)
GlyphRGBA128Src-8                 97.9µs ± 0%  97.7µs ± 1%  ~       (p=0.053 n=9+10)
GlyphRGBA256Over-8                503µs ± 0%   503µs ± 1%   ~       (p=0.274 n=8+10)
GlyphRGBA256Src-8                 370µs ± 0%   369µs ± 0%   ~       (p=0.497 n=9+10)

Change-Id: I56651e70b258792b83ea2a74904756243c88bef4
Reviewed-on: https://go-review.googlesource.com/31537
Reviewed-by: David Crawshaw <crawshaw@golang.org>
2016-10-20 15:15:53 +11:00 · 2016-10-20 15:15:53 +11:00 · 93fad3647f
commit 93fad3647f
parent 8874bef159
3 changed files with 68 additions and 124 deletions
--- a/vector/acc_amd64.s
+++ b/vector/acc_amd64.s
@ -8,8 +8,6 @@
 // fl is short for floating point math. fx is short for fixed point math.
 DATA flAlmost256<>+0x00(SB)/8, $0x437fffff437fffff
 DATA flAlmost256<>+0x08(SB)/8, $0x437fffff437fffff
 DATA flAlmost65536<>+0x00(SB)/8, $0x477fffff477fffff
 DATA flAlmost65536<>+0x08(SB)/8, $0x477fffff477fffff
 DATA flOne<>+0x00(SB)/8, $0x3f8000003f800000
@ -28,30 +26,26 @@ DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff
 DATA scatterAndMulBy0x101<>+0x00(SB)/8, $0x8080010180800000
 DATA scatterAndMulBy0x101<>+0x08(SB)/8, $0x8080030380800202
-// gather is a PSHUFB mask that brings the low byte of the XMM register's four
+// gather is a PSHUFB mask that brings the second-lowest byte of the XMM
-// uint32 values to the low four bytes of that register.
+// register's four uint32 values to the low four bytes of that register.
 //
 // It transforms a little-endian 16-byte XMM value from
-//	i???j???k???l???
+//	?i???j???k???l??
 // to
 //	ijkl000000000000
-DATA gather<>+0x00(SB)/8, $0x808080800c080400
+DATA gather<>+0x00(SB)/8, $0x808080800d090501
 DATA gather<>+0x08(SB)/8, $0x8080808080808080
 DATA fxAlmost256<>+0x00(SB)/8, $0x000000ff000000ff
 DATA fxAlmost256<>+0x08(SB)/8, $0x000000ff000000ff
 DATA fxAlmost65536<>+0x00(SB)/8, $0x0000ffff0000ffff
 DATA fxAlmost65536<>+0x08(SB)/8, $0x0000ffff0000ffff
 DATA inverseFFFF<>+0x00(SB)/8, $0x8000800180008001
 DATA inverseFFFF<>+0x08(SB)/8, $0x8000800180008001
 GLOBL flAlmost256<>(SB), (NOPTR+RODATA), $16
 GLOBL flAlmost65536<>(SB), (NOPTR+RODATA), $16
 GLOBL flOne<>(SB), (NOPTR+RODATA), $16
 GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16
 GLOBL scatterAndMulBy0x101<>(SB), (NOPTR+RODATA), $16
 GLOBL gather<>(SB), (NOPTR+RODATA), $16
 GLOBL fxAlmost256<>(SB), (NOPTR+RODATA), $16
 GLOBL fxAlmost65536<>(SB), (NOPTR+RODATA), $16
 GLOBL inverseFFFF<>(SB), (NOPTR+RODATA), $16
@ -196,15 +190,12 @@ fxAccOpOverLoop4:
 	PSRLQ $47, X0
 	PSRLQ $47, X11
-	// Merge the two registers back to one, X11.
+	// Merge the two registers back to one, X11, and add maskA.
 	PSLLQ $32, X11
 	XORPS X0, X11
 	// Add maskA, shift from 16 bit color to 8 bit color.
 	PADDD X11, X2
 	PSRLQ $8, X2
-	// As per opSrcStore4, shuffle and copy the low 4 bytes.
+	// As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
 	PSHUFB X6, X2
 	MOVL   X2, (DI)
@ -292,7 +283,7 @@ fxAccOpOverEnd:
 //	xmm2	y, z
 //	xmm3	-
 //	xmm4	-
-//	xmm5	fxAlmost256
+//	xmm5	fxAlmost65536
 //	xmm6	gather
 //	xmm7	offset
 //	xmm8	-
@ -314,8 +305,8 @@ TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48
 	MOVQ R10, R11
 	ANDQ $-4, R10
-	// fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8.
+	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
-	MOVOU fxAlmost256<>(SB), X5
+	MOVOU fxAlmost65536<>(SB), X5
 	// gather := XMM(see above) // PSHUFB shuffle mask.
 	MOVOU gather<>(SB), X6
@ -353,24 +344,24 @@ fxAccOpSrcLoop4:
 	PADDD X7, X1
 	// y = abs(x)
-	// y >>= 10 // Shift by 2*ϕ - 8.
+	// y >>= 2 // Shift by 2*ϕ - 16.
-	// y = min(y, fxAlmost256)
+	// y = min(y, fxAlmost65536)
 	//
 	// pabsd  %xmm1,%xmm2
-	// psrld  $0xa,%xmm2
+	// psrld  $0x2,%xmm2
 	// pminud %xmm5,%xmm2
 	//
 	// Hopefully we'll get these opcode mnemonics into the assembler for Go
 	// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
 	// it's similar.
 	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
-	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0a
+	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
 	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
 	// z = convertToInt32(y)
 	// No-op.
-	// z = shuffleTheLowBytesOfEach4ByteElement(z)
+	// z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
 	// copy(dst[:4], low4BytesOf(z))
 	PSHUFB X6, X2
 	MOVL   X2, (DI)
@ -397,25 +388,26 @@ fxAccOpSrcLoop1:
 	PADDD X7, X1
 	// y = abs(x)
-	// y >>= 10 // Shift by 2*ϕ - 8.
+	// y >>= 2 // Shift by 2*ϕ - 16.
-	// y = min(y, fxAlmost256)
+	// y = min(y, fxAlmost65536)
 	//
 	// pabsd  %xmm1,%xmm2
-	// psrld  $0xa,%xmm2
+	// psrld  $0x2,%xmm2
 	// pminud %xmm5,%xmm2
 	//
 	// Hopefully we'll get these opcode mnemonics into the assembler for Go
 	// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
 	// it's similar.
 	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
-	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0a
+	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
 	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
 	// z = convertToInt32(y)
 	// No-op.
-	// dst[0] = uint8(z)
+	// dst[0] = uint8(z>>8)
 	MOVL X2, BX
 	SHRL $8, BX
 	MOVB BX, (DI)
 	// offset = x
@ -712,15 +704,12 @@ flAccOpOverLoop4:
 	PSRLQ $47, X0
 	PSRLQ $47, X11
-	// Merge the two registers back to one, X11.
+	// Merge the two registers back to one, X11, and add maskA.
 	PSLLQ $32, X11
 	XORPS X0, X11
 	// Add maskA, shift from 16 bit color to 8 bit color.
 	PADDD X11, X2
 	PSRLQ $8, X2
-	// As per opSrcStore4, shuffle and copy the low 4 bytes.
+	// As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
 	PSHUFB X6, X2
 	MOVL   X2, (DI)
@ -801,7 +790,7 @@ flAccOpOverEnd:
 //	xmm0	scratch
 //	xmm1	x
 //	xmm2	y, z
-//	xmm3	flAlmost256
+//	xmm3	flAlmost65536
 //	xmm4	flOne
 //	xmm5	flSignMask
 //	xmm6	gather
@ -832,10 +821,10 @@ TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
 	ORL     $0x6000, AX
 	MOVL    AX, mxcsrNew-4(SP)
-	// flAlmost256 := XMM(0x437fffff repeated four times) // 255.99998 as a float32.
+	// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
-	// flOne       := XMM(0x3f800000 repeated four times) // 1 as a float32.
+	// flOne         := XMM(0x3f800000 repeated four times) // 1 as a float32.
-	// flSignMask  := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
+	// flSignMask    := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
-	MOVOU flAlmost256<>(SB), X3
+	MOVOU flAlmost65536<>(SB), X3
 	MOVOU flOne<>(SB), X4
 	MOVOU flSignMask<>(SB), X5
@ -876,7 +865,7 @@ flAccOpSrcLoop4:
 	// y = x & flSignMask
 	// y = min(y, flOne)
-	// y = mul(y, flAlmost256)
+	// y = mul(y, flAlmost65536)
 	MOVOU X5, X2
 	ANDPS X1, X2
 	MINPS X4, X2
@ -887,7 +876,7 @@ flAccOpSrcLoop4:
 	CVTPS2PL X2, X2
 	LDMXCSR  mxcsrOrig-8(SP)
-	// z = shuffleTheLowBytesOfEach4ByteElement(z)
+	// z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
 	// copy(dst[:4], low4BytesOf(z))
 	PSHUFB X6, X2
 	MOVL   X2, (DI)
@ -915,7 +904,7 @@ flAccOpSrcLoop1:
 	// y = x & flSignMask
 	// y = min(y, flOne)
-	// y = mul(y, flAlmost256)
+	// y = mul(y, flAlmost65536)
 	MOVOU X5, X2
 	ANDPS X1, X2
 	MINPS X4, X2
@ -926,8 +915,9 @@ flAccOpSrcLoop1:
 	CVTPS2PL X2, X2
 	LDMXCSR  mxcsrOrig-8(SP)
-	// dst[0] = uint8(z)
+	// dst[0] = uint8(z>>8)
 	MOVL X2, BX
 	SHRL $8, BX
 	MOVB BX, (DI)
 	// offset = x
--- a/vector/gen.go
+++ b/vector/gen.go
@ -101,16 +101,16 @@ var instances = []struct {
 	DstElemSize4:   4 * sizeOfUint8,
 	XMM3:           fxXMM3,
 	XMM4:           fxXMM4,
-	XMM5:           fxXMM5_65536,
+	XMM5:           fxXMM5,
 	XMM6:           opOverXMM6,
 	XMM8:           opOverXMM8,
 	XMM9:           opOverXMM9,
 	XMM10:          opOverXMM10,
 	LoadArgs:       twoArgLoadArgs,
 	Setup:          fxSetup,
-	LoadXMMRegs:    fxLoadXMMRegs65536 + "\n" + opOverLoadXMMRegs,
+	LoadXMMRegs:    fxLoadXMMRegs + "\n" + opOverLoadXMMRegs,
 	Add:            fxAdd,
-	ClampAndScale:  fxClampAndScale65536,
+	ClampAndScale:  fxClampAndScale,
 	ConvertToInt32: fxConvertToInt32,
 	Store4:         opOverStore4,
 	Store1:         opOverStore1,
@ -124,16 +124,16 @@ var instances = []struct {
 	DstElemSize4:   4 * sizeOfUint8,
 	XMM3:           fxXMM3,
 	XMM4:           fxXMM4,
-	XMM5:           fxXMM5_256,
+	XMM5:           fxXMM5,
 	XMM6:           opSrcXMM6,
 	XMM8:           opSrcXMM8,
 	XMM9:           opSrcXMM9,
 	XMM10:          opSrcXMM10,
 	LoadArgs:       twoArgLoadArgs,
 	Setup:          fxSetup,
-	LoadXMMRegs:    fxLoadXMMRegs256 + "\n" + opSrcLoadXMMRegs,
+	LoadXMMRegs:    fxLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
 	Add:            fxAdd,
-	ClampAndScale:  fxClampAndScale256,
+	ClampAndScale:  fxClampAndScale,
 	ConvertToInt32: fxConvertToInt32,
 	Store4:         opSrcStore4,
 	Store1:         opSrcStore1,
@ -147,16 +147,16 @@ var instances = []struct {
 	DstElemSize4:   4 * sizeOfUint32,
 	XMM3:           fxXMM3,
 	XMM4:           fxXMM4,
-	XMM5:           fxXMM5_65536,
+	XMM5:           fxXMM5,
 	XMM6:           maskXMM6,
 	XMM8:           maskXMM8,
 	XMM9:           maskXMM9,
 	XMM10:          maskXMM10,
 	LoadArgs:       oneArgLoadArgs,
 	Setup:          fxSetup,
-	LoadXMMRegs:    fxLoadXMMRegs65536 + "\n" + maskLoadXMMRegs,
+	LoadXMMRegs:    fxLoadXMMRegs + "\n" + maskLoadXMMRegs,
 	Add:            fxAdd,
-	ClampAndScale:  fxClampAndScale65536,
+	ClampAndScale:  fxClampAndScale,
 	ConvertToInt32: fxConvertToInt32,
 	Store4:         maskStore4,
 	Store1:         maskStore1,
@ -168,7 +168,7 @@ var instances = []struct {
 	Args:           "dst []uint8, src []float32",
 	DstElemSize1:   1 * sizeOfUint8,
 	DstElemSize4:   4 * sizeOfUint8,
-	XMM3:           flXMM3_65536,
+	XMM3:           flXMM3,
 	XMM4:           flXMM4,
 	XMM5:           flXMM5,
 	XMM6:           opOverXMM6,
@ -177,9 +177,9 @@ var instances = []struct {
 	XMM10:          opOverXMM10,
 	LoadArgs:       twoArgLoadArgs,
 	Setup:          flSetup,
-	LoadXMMRegs:    flLoadXMMRegs65536 + "\n" + opOverLoadXMMRegs,
+	LoadXMMRegs:    flLoadXMMRegs + "\n" + opOverLoadXMMRegs,
 	Add:            flAdd,
-	ClampAndScale:  flClampAndScale65536,
+	ClampAndScale:  flClampAndScale,
 	ConvertToInt32: flConvertToInt32,
 	Store4:         opOverStore4,
 	Store1:         opOverStore1,
@ -191,7 +191,7 @@ var instances = []struct {
 	Args:           "dst []uint8, src []float32",
 	DstElemSize1:   1 * sizeOfUint8,
 	DstElemSize4:   4 * sizeOfUint8,
-	XMM3:           flXMM3_256,
+	XMM3:           flXMM3,
 	XMM4:           flXMM4,
 	XMM5:           flXMM5,
 	XMM6:           opSrcXMM6,
@ -200,9 +200,9 @@ var instances = []struct {
 	XMM10:          opSrcXMM10,
 	LoadArgs:       twoArgLoadArgs,
 	Setup:          flSetup,
-	LoadXMMRegs:    flLoadXMMRegs256 + "\n" + opSrcLoadXMMRegs,
+	LoadXMMRegs:    flLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
 	Add:            flAdd,
-	ClampAndScale:  flClampAndScale256,
+	ClampAndScale:  flClampAndScale,
 	ConvertToInt32: flConvertToInt32,
 	Store4:         opSrcStore4,
 	Store1:         opSrcStore1,
@ -214,7 +214,7 @@ var instances = []struct {
 	Args:           "dst []uint32, src []float32",
 	DstElemSize1:   1 * sizeOfUint32,
 	DstElemSize4:   4 * sizeOfUint32,
-	XMM3:           flXMM3_65536,
+	XMM3:           flXMM3,
 	XMM4:           flXMM4,
 	XMM5:           flXMM5,
 	XMM6:           maskXMM6,
@ -223,9 +223,9 @@ var instances = []struct {
 	XMM10:          maskXMM10,
 	LoadArgs:       twoArgLoadArgs,
 	Setup:          flSetup,
-	LoadXMMRegs:    flLoadXMMRegs65536 + "\n" + maskLoadXMMRegs,
+	LoadXMMRegs:    flLoadXMMRegs + "\n" + maskLoadXMMRegs,
 	Add:            flAdd,
-	ClampAndScale:  flClampAndScale65536,
+	ClampAndScale:  flClampAndScale,
 	ConvertToInt32: flConvertToInt32,
 	Store4:         maskStore4,
 	Store1:         maskStore1,
@ -241,16 +241,14 @@ const (
 	sizeOfUint8  = 1
 	sizeOfUint32 = 4
-	fxXMM3       = `-`
+	fxXMM3 = `-`
-	flXMM3_256   = `flAlmost256`
+	flXMM3 = `flAlmost65536`
 	flXMM3_65536 = `flAlmost65536`
 	fxXMM4 = `-`
 	flXMM4 = `flOne`
-	fxXMM5_256   = `fxAlmost256`
+	fxXMM5 = `fxAlmost65536`
-	fxXMM5_65536 = `fxAlmost65536`
+	flXMM5 = `flSignMask`
 	flXMM5       = `flSignMask`
 	oneArgLoadArgs = `
 		MOVQ buf_base+0(FP), DI
@ -278,23 +276,11 @@ const (
 		MOVL    AX, mxcsrNew-4(SP)
 		`
-	fxLoadXMMRegs256 = `
+	fxLoadXMMRegs = `
 		// fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8.
 		MOVOU fxAlmost256<>(SB), X5
 		`
 	fxLoadXMMRegs65536 = `
 		// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
 		MOVOU fxAlmost65536<>(SB), X5
 		`
-	flLoadXMMRegs256 = `
+	flLoadXMMRegs = `
 		// flAlmost256 := XMM(0x437fffff repeated four times) // 255.99998 as a float32.
 		// flOne       := XMM(0x3f800000 repeated four times) // 1 as a float32.
 		// flSignMask  := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
 		MOVOU flAlmost256<>(SB), X3
 		MOVOU flOne<>(SB), X4
 		MOVOU flSignMask<>(SB), X5
 		`
 	flLoadXMMRegs65536 = `
 		// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
 		// flOne         := XMM(0x3f800000 repeated four times) // 1 as a float32.
 		// flSignMask    := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
@ -306,23 +292,7 @@ const (
 	fxAdd = `PADDD`
 	flAdd = `ADDPS`
-	fxClampAndScale256 = `
+	fxClampAndScale = `
 		// y = abs(x)
 		// y >>= 10 // Shift by 2*ϕ - 8.
 		// y = min(y, fxAlmost256)
 		//
 		// pabsd  %xmm1,%xmm2
 		// psrld  $0xa,%xmm2
 		// pminud %xmm5,%xmm2
 		//
 		// Hopefully we'll get these opcode mnemonics into the assembler for Go
 		// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
 		// it's similar.
 		BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
 		BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0a
 		BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
 		`
 	fxClampAndScale65536 = `
 		// y = abs(x)
 		// y >>= 2 // Shift by 2*ϕ - 16.
 		// y = min(y, fxAlmost65536)
@ -338,16 +308,7 @@ const (
 		BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
 		BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
 		`
-	flClampAndScale256 = `
+	flClampAndScale = `
 		// y = x & flSignMask
 		// y = min(y, flOne)
 		// y = mul(y, flAlmost256)
 		MOVOU X5, X2
 		ANDPS X1, X2
 		MINPS X4, X2
 		MULPS X3, X2
 		`
 	flClampAndScale65536 = `
 		// y = x & flSignMask
 		// y = min(y, flOne)
 		// y = mul(y, flAlmost65536)
@ -402,18 +363,16 @@ const (
 		BYTE  $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda
 		PSRLQ $47, X0
 		PSRLQ $47, X11
-		// Merge the two registers back to one, X11.
+		// Merge the two registers back to one, X11, and add maskA.
 		PSLLQ $32, X11
 		XORPS X0, X11
-		// Add maskA, shift from 16 bit color to 8 bit color.
+		PADDD X11, X2
-		PADDD  X11, X2
+		// As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
 		PSRLQ  $8, X2
 		// As per opSrcStore4, shuffle and copy the low 4 bytes.
 		PSHUFB X6, X2
 		MOVL   X2, (DI)
 		`
 	opSrcStore4 = `
-		// z = shuffleTheLowBytesOfEach4ByteElement(z)
+		// z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
 		// copy(dst[:4], low4BytesOf(z))
 		PSHUFB X6, X2
 		MOVL   X2, (DI)
@ -444,8 +403,9 @@ const (
 		MOVB    R13, (DI)
 		`
 	opSrcStore1 = `
-		// dst[0] = uint8(z)
+		// dst[0] = uint8(z>>8)
 		MOVL X2, BX
 		SHRL $8, BX
 		MOVB BX, (DI)
 		`
 	maskStore1 = `
--- a/vector/gen_acc_amd64.s.tmpl
+++ b/vector/gen_acc_amd64.s.tmpl
@ -10,8 +10,6 @@
 // fl is short for floating point math. fx is short for fixed point math.
 DATA flAlmost256<>+0x00(SB)/8, $0x437fffff437fffff
 DATA flAlmost256<>+0x08(SB)/8, $0x437fffff437fffff
 DATA flAlmost65536<>+0x00(SB)/8, $0x477fffff477fffff
 DATA flAlmost65536<>+0x08(SB)/8, $0x477fffff477fffff
 DATA flOne<>+0x00(SB)/8, $0x3f8000003f800000
@ -30,30 +28,26 @@ DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff
 DATA scatterAndMulBy0x101<>+0x00(SB)/8, $0x8080010180800000
 DATA scatterAndMulBy0x101<>+0x08(SB)/8, $0x8080030380800202
-// gather is a PSHUFB mask that brings the low byte of the XMM register's four
+// gather is a PSHUFB mask that brings the second-lowest byte of the XMM
-// uint32 values to the low four bytes of that register.
+// register's four uint32 values to the low four bytes of that register.
 //
 // It transforms a little-endian 16-byte XMM value from
-//	i???j???k???l???
+//	?i???j???k???l??
 // to
 //	ijkl000000000000
-DATA gather<>+0x00(SB)/8, $0x808080800c080400
+DATA gather<>+0x00(SB)/8, $0x808080800d090501
 DATA gather<>+0x08(SB)/8, $0x8080808080808080
 DATA fxAlmost256<>+0x00(SB)/8, $0x000000ff000000ff
 DATA fxAlmost256<>+0x08(SB)/8, $0x000000ff000000ff
 DATA fxAlmost65536<>+0x00(SB)/8, $0x0000ffff0000ffff
 DATA fxAlmost65536<>+0x08(SB)/8, $0x0000ffff0000ffff
 DATA inverseFFFF<>+0x00(SB)/8, $0x8000800180008001
 DATA inverseFFFF<>+0x08(SB)/8, $0x8000800180008001
 GLOBL flAlmost256<>(SB), (NOPTR+RODATA), $16
 GLOBL flAlmost65536<>(SB), (NOPTR+RODATA), $16
 GLOBL flOne<>(SB), (NOPTR+RODATA), $16
 GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16
 GLOBL scatterAndMulBy0x101<>(SB), (NOPTR+RODATA), $16
 GLOBL gather<>(SB), (NOPTR+RODATA), $16
 GLOBL fxAlmost256<>(SB), (NOPTR+RODATA), $16
 GLOBL fxAlmost65536<>(SB), (NOPTR+RODATA), $16
 GLOBL inverseFFFF<>(SB), (NOPTR+RODATA), $16