From 93fad3647fcda43e12f413a6dd7568896dc06ba9 Mon Sep 17 00:00:00 2001
From: Nigel Tao <nigeltao@golang.org>
Date: Thu, 20 Oct 2016 15:15:53 +1100
Subject: [PATCH] vector: remove a shift from the inner loop.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PSHUFB already lets us pick arbitrary bytes out of an XMM register, so
the gather mask can select the second-lowest byte of each uint32
directly. There's no need to shift right by 8 bits (1 byte) beforehand.

The code generator is simpler. There's also a small win on the
benchmarks, especially for FixedAccumulateOpOverSIMD.

name                              old time/op  new time/op  delta
FixedAccumulateOpOverSIMD16-8      183ns ± 0%   174ns ± 0%  -4.92%    (p=0.000 n=8+8)
FixedAccumulateOpSrcSIMD16-8      87.0ns ± 1%  86.3ns ± 0%  -0.77%   (p=0.000 n=10+9)
FixedAccumulateMaskSIMD16-8       80.4ns ± 1%  81.2ns ± 1%  +1.01%  (p=0.000 n=10+10)
FloatingAccumulateOpOverSIMD16-8   250ns ± 1%   244ns ± 0%  -2.39%   (p=0.000 n=10+8)
FloatingAccumulateOpSrcSIMD16-8    176ns ± 1%   176ns ± 0%    ~      (p=0.142 n=10+8)
FloatingAccumulateMaskSIMD16-8     167ns ± 0%   167ns ± 0%    ~      (p=0.137 n=8+10)
FixedAccumulateOpOverSIMD64-8     2.73µs ± 1%  2.58µs ± 0%  -5.36%   (p=0.000 n=10+7)
FixedAccumulateOpSrcSIMD64-8      1.18µs ± 1%  1.17µs ± 0%  -0.33%   (p=0.003 n=10+9)
FixedAccumulateMaskSIMD64-8       1.09µs ± 0%  1.09µs ± 0%  -0.17%    (p=0.047 n=9+9)
FloatingAccumulateOpOverSIMD64-8  3.67µs ± 0%  3.61µs ± 1%  -1.47%   (p=0.000 n=7+10)
FloatingAccumulateOpSrcSIMD64-8   2.60µs ± 0%  2.61µs ± 0%  +0.19%    (p=0.003 n=8+8)
FloatingAccumulateMaskSIMD64-8    2.47µs ± 0%  2.46µs ± 0%    ~      (p=0.162 n=10+9)
GlyphAlpha16Over-8                2.99µs ± 0%  2.98µs ± 1%  -0.50%   (p=0.021 n=9+10)
GlyphAlpha16Src-8                 2.89µs ± 1%  2.89µs ± 0%    ~     (p=0.381 n=10+10)
GlyphAlpha32Over-8                4.53µs ± 0%  4.50µs ± 0%  -0.83%  (p=0.000 n=10+10)
GlyphAlpha32Src-8                 4.14µs ± 0%  4.13µs ± 0%  -0.21%   (p=0.026 n=9+10)
GlyphAlpha64Over-8                8.97µs ± 1%  8.80µs ± 0%  -1.85%   (p=0.000 n=10+9)
GlyphAlpha64Src-8                 7.42µs ± 1%  7.39µs ± 0%  -0.45%  (p=0.011 n=10+10)
GlyphAlpha128Over-8               21.8µs ± 0%  21.2µs ± 0%  -2.91%    (p=0.000 n=9+9)
GlyphAlpha128Src-8                15.6µs ± 0%  15.6µs ± 0%    ~      (p=0.982 n=10+7)
GlyphAlpha256Over-8               66.3µs ± 1%  63.7µs ± 0%  -3.84%   (p=0.000 n=10+9)
GlyphAlpha256Src-8                41.2µs ± 1%  41.2µs ± 1%    ~     (p=1.000 n=10+10)
GlyphRGBA16Over-8                 4.75µs ± 0%  4.75µs ± 1%    ~      (p=0.735 n=9+10)
GlyphRGBA16Src-8                  4.20µs ± 0%  4.20µs ± 0%    ~       (p=0.503 n=8+8)
GlyphRGBA32Over-8                 11.4µs ± 0%  11.4µs ± 0%    ~       (p=0.119 n=9+9)
GlyphRGBA32Src-8                  9.34µs ± 1%  9.32µs ± 0%    ~       (p=0.062 n=9+8)
GlyphRGBA64Over-8                 36.0µs ± 0%  36.1µs ± 0%    ~       (p=0.209 n=8+9)
GlyphRGBA64Src-8                  27.9µs ± 1%  27.8µs ± 0%    ~     (p=0.796 n=10+10)
GlyphRGBA128Over-8                 131µs ± 0%   131µs ± 0%    ~       (p=0.931 n=9+9)
GlyphRGBA128Src-8                 97.9µs ± 0%  97.7µs ± 1%    ~      (p=0.053 n=9+10)
GlyphRGBA256Over-8                 503µs ± 0%   503µs ± 1%    ~      (p=0.274 n=8+10)
GlyphRGBA256Src-8                  370µs ± 0%   369µs ± 0%    ~      (p=0.497 n=9+10)

Change-Id: I56651e70b258792b83ea2a74904756243c88bef4
Reviewed-on: https://go-review.googlesource.com/31537
Reviewed-by: David Crawshaw <crawshaw@golang.org>
---
 vector/acc_amd64.s          |  74 +++++++++++--------------
 vector/gen.go               | 104 +++++++++++-------------------------
 vector/gen_acc_amd64.s.tmpl |  14 ++---
 3 files changed, 68 insertions(+), 124 deletions(-)

diff --git a/vector/acc_amd64.s b/vector/acc_amd64.s
index 8d9719c..dee7ad1 100644
--- a/vector/acc_amd64.s
+++ b/vector/acc_amd64.s
@@ -8,8 +8,6 @@
 
 // fl is short for floating point math. fx is short for fixed point math.
 
-DATA flAlmost256<>+0x00(SB)/8, $0x437fffff437fffff
-DATA flAlmost256<>+0x08(SB)/8, $0x437fffff437fffff
 DATA flAlmost65536<>+0x00(SB)/8, $0x477fffff477fffff
 DATA flAlmost65536<>+0x08(SB)/8, $0x477fffff477fffff
 DATA flOne<>+0x00(SB)/8, $0x3f8000003f800000
@@ -28,30 +26,26 @@ DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff
 DATA scatterAndMulBy0x101<>+0x00(SB)/8, $0x8080010180800000
 DATA scatterAndMulBy0x101<>+0x08(SB)/8, $0x8080030380800202
 
-// gather is a PSHUFB mask that brings the low byte of the XMM register's four
-// uint32 values to the low four bytes of that register.
+// gather is a PSHUFB mask that brings the second-lowest byte of the XMM
+// register's four uint32 values to the low four bytes of that register.
 //
 // It transforms a little-endian 16-byte XMM value from
-//	i???j???k???l???
+//	?i???j???k???l??
 // to
 //	ijkl000000000000
-DATA gather<>+0x00(SB)/8, $0x808080800c080400
+DATA gather<>+0x00(SB)/8, $0x808080800d090501
 DATA gather<>+0x08(SB)/8, $0x8080808080808080
 
-DATA fxAlmost256<>+0x00(SB)/8, $0x000000ff000000ff
-DATA fxAlmost256<>+0x08(SB)/8, $0x000000ff000000ff
 DATA fxAlmost65536<>+0x00(SB)/8, $0x0000ffff0000ffff
 DATA fxAlmost65536<>+0x08(SB)/8, $0x0000ffff0000ffff
 DATA inverseFFFF<>+0x00(SB)/8, $0x8000800180008001
 DATA inverseFFFF<>+0x08(SB)/8, $0x8000800180008001
 
-GLOBL flAlmost256<>(SB), (NOPTR+RODATA), $16
 GLOBL flAlmost65536<>(SB), (NOPTR+RODATA), $16
 GLOBL flOne<>(SB), (NOPTR+RODATA), $16
 GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16
 GLOBL scatterAndMulBy0x101<>(SB), (NOPTR+RODATA), $16
 GLOBL gather<>(SB), (NOPTR+RODATA), $16
-GLOBL fxAlmost256<>(SB), (NOPTR+RODATA), $16
 GLOBL fxAlmost65536<>(SB), (NOPTR+RODATA), $16
 GLOBL inverseFFFF<>(SB), (NOPTR+RODATA), $16
 
@@ -196,15 +190,12 @@ fxAccOpOverLoop4:
 	PSRLQ $47, X0
 	PSRLQ $47, X11
 
-	// Merge the two registers back to one, X11.
+	// Merge the two registers back to one, X11, and add maskA.
 	PSLLQ $32, X11
 	XORPS X0, X11
-
-	// Add maskA, shift from 16 bit color to 8 bit color.
 	PADDD X11, X2
-	PSRLQ $8, X2
 
-	// As per opSrcStore4, shuffle and copy the low 4 bytes.
+	// As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
 	PSHUFB X6, X2
 	MOVL   X2, (DI)
 
@@ -292,7 +283,7 @@ fxAccOpOverEnd:
 //	xmm2	y, z
 //	xmm3	-
 //	xmm4	-
-//	xmm5	fxAlmost256
+//	xmm5	fxAlmost65536
 //	xmm6	gather
 //	xmm7	offset
 //	xmm8	-
@@ -314,8 +305,8 @@ TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48
 	MOVQ R10, R11
 	ANDQ $-4, R10
 
-	// fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8.
-	MOVOU fxAlmost256<>(SB), X5
+	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
+	MOVOU fxAlmost65536<>(SB), X5
 
 	// gather := XMM(see above) // PSHUFB shuffle mask.
 	MOVOU gather<>(SB), X6
@@ -353,24 +344,24 @@ fxAccOpSrcLoop4:
 	PADDD X7, X1
 
 	// y = abs(x)
-	// y >>= 10 // Shift by 2*ϕ - 8.
-	// y = min(y, fxAlmost256)
+	// y >>= 2 // Shift by 2*ϕ - 16.
+	// y = min(y, fxAlmost65536)
 	//
 	// pabsd  %xmm1,%xmm2
-	// psrld  $0xa,%xmm2
+	// psrld  $0x2,%xmm2
 	// pminud %xmm5,%xmm2
 	//
 	// Hopefully we'll get these opcode mnemonics into the assembler for Go
 	// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
 	// it's similar.
 	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
-	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0a
+	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
 	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
 
 	// z = convertToInt32(y)
 	// No-op.
 
-	// z = shuffleTheLowBytesOfEach4ByteElement(z)
+	// z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
 	// copy(dst[:4], low4BytesOf(z))
 	PSHUFB X6, X2
 	MOVL   X2, (DI)
@@ -397,25 +388,26 @@ fxAccOpSrcLoop1:
 	PADDD X7, X1
 
 	// y = abs(x)
-	// y >>= 10 // Shift by 2*ϕ - 8.
-	// y = min(y, fxAlmost256)
+	// y >>= 2 // Shift by 2*ϕ - 16.
+	// y = min(y, fxAlmost65536)
 	//
 	// pabsd  %xmm1,%xmm2
-	// psrld  $0xa,%xmm2
+	// psrld  $0x2,%xmm2
 	// pminud %xmm5,%xmm2
 	//
 	// Hopefully we'll get these opcode mnemonics into the assembler for Go
 	// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
 	// it's similar.
 	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
-	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0a
+	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
 	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
 
 	// z = convertToInt32(y)
 	// No-op.
 
-	// dst[0] = uint8(z)
+	// dst[0] = uint8(z>>8)
 	MOVL X2, BX
+	SHRL $8, BX
 	MOVB BX, (DI)
 
 	// offset = x
@@ -712,15 +704,12 @@ flAccOpOverLoop4:
 	PSRLQ $47, X0
 	PSRLQ $47, X11
 
-	// Merge the two registers back to one, X11.
+	// Merge the two registers back to one, X11, and add maskA.
 	PSLLQ $32, X11
 	XORPS X0, X11
-
-	// Add maskA, shift from 16 bit color to 8 bit color.
 	PADDD X11, X2
-	PSRLQ $8, X2
 
-	// As per opSrcStore4, shuffle and copy the low 4 bytes.
+	// As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
 	PSHUFB X6, X2
 	MOVL   X2, (DI)
 
@@ -801,7 +790,7 @@ flAccOpOverEnd:
 //	xmm0	scratch
 //	xmm1	x
 //	xmm2	y, z
-//	xmm3	flAlmost256
+//	xmm3	flAlmost65536
 //	xmm4	flOne
 //	xmm5	flSignMask
 //	xmm6	gather
@@ -832,10 +821,10 @@ TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
 	ORL     $0x6000, AX
 	MOVL    AX, mxcsrNew-4(SP)
 
-	// flAlmost256 := XMM(0x437fffff repeated four times) // 255.99998 as a float32.
-	// flOne       := XMM(0x3f800000 repeated four times) // 1 as a float32.
-	// flSignMask  := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
-	MOVOU flAlmost256<>(SB), X3
+	// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
+	// flOne         := XMM(0x3f800000 repeated four times) // 1 as a float32.
+	// flSignMask    := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
+	MOVOU flAlmost65536<>(SB), X3
 	MOVOU flOne<>(SB), X4
 	MOVOU flSignMask<>(SB), X5
 
@@ -876,7 +865,7 @@ flAccOpSrcLoop4:
 
 	// y = x & flSignMask
 	// y = min(y, flOne)
-	// y = mul(y, flAlmost256)
+	// y = mul(y, flAlmost65536)
 	MOVOU X5, X2
 	ANDPS X1, X2
 	MINPS X4, X2
@@ -887,7 +876,7 @@ flAccOpSrcLoop4:
 	CVTPS2PL X2, X2
 	LDMXCSR  mxcsrOrig-8(SP)
 
-	// z = shuffleTheLowBytesOfEach4ByteElement(z)
+	// z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
 	// copy(dst[:4], low4BytesOf(z))
 	PSHUFB X6, X2
 	MOVL   X2, (DI)
@@ -915,7 +904,7 @@ flAccOpSrcLoop1:
 
 	// y = x & flSignMask
 	// y = min(y, flOne)
-	// y = mul(y, flAlmost256)
+	// y = mul(y, flAlmost65536)
 	MOVOU X5, X2
 	ANDPS X1, X2
 	MINPS X4, X2
@@ -926,8 +915,9 @@ flAccOpSrcLoop1:
 	CVTPS2PL X2, X2
 	LDMXCSR  mxcsrOrig-8(SP)
 
-	// dst[0] = uint8(z)
+	// dst[0] = uint8(z>>8)
 	MOVL X2, BX
+	SHRL $8, BX
 	MOVB BX, (DI)
 
 	// offset = x
diff --git a/vector/gen.go b/vector/gen.go
index 92be417..cafd860 100644
--- a/vector/gen.go
+++ b/vector/gen.go
@@ -101,16 +101,16 @@ var instances = []struct {
 	DstElemSize4:   4 * sizeOfUint8,
 	XMM3:           fxXMM3,
 	XMM4:           fxXMM4,
-	XMM5:           fxXMM5_65536,
+	XMM5:           fxXMM5,
 	XMM6:           opOverXMM6,
 	XMM8:           opOverXMM8,
 	XMM9:           opOverXMM9,
 	XMM10:          opOverXMM10,
 	LoadArgs:       twoArgLoadArgs,
 	Setup:          fxSetup,
-	LoadXMMRegs:    fxLoadXMMRegs65536 + "\n" + opOverLoadXMMRegs,
+	LoadXMMRegs:    fxLoadXMMRegs + "\n" + opOverLoadXMMRegs,
 	Add:            fxAdd,
-	ClampAndScale:  fxClampAndScale65536,
+	ClampAndScale:  fxClampAndScale,
 	ConvertToInt32: fxConvertToInt32,
 	Store4:         opOverStore4,
 	Store1:         opOverStore1,
@@ -124,16 +124,16 @@ var instances = []struct {
 	DstElemSize4:   4 * sizeOfUint8,
 	XMM3:           fxXMM3,
 	XMM4:           fxXMM4,
-	XMM5:           fxXMM5_256,
+	XMM5:           fxXMM5,
 	XMM6:           opSrcXMM6,
 	XMM8:           opSrcXMM8,
 	XMM9:           opSrcXMM9,
 	XMM10:          opSrcXMM10,
 	LoadArgs:       twoArgLoadArgs,
 	Setup:          fxSetup,
-	LoadXMMRegs:    fxLoadXMMRegs256 + "\n" + opSrcLoadXMMRegs,
+	LoadXMMRegs:    fxLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
 	Add:            fxAdd,
-	ClampAndScale:  fxClampAndScale256,
+	ClampAndScale:  fxClampAndScale,
 	ConvertToInt32: fxConvertToInt32,
 	Store4:         opSrcStore4,
 	Store1:         opSrcStore1,
@@ -147,16 +147,16 @@ var instances = []struct {
 	DstElemSize4:   4 * sizeOfUint32,
 	XMM3:           fxXMM3,
 	XMM4:           fxXMM4,
-	XMM5:           fxXMM5_65536,
+	XMM5:           fxXMM5,
 	XMM6:           maskXMM6,
 	XMM8:           maskXMM8,
 	XMM9:           maskXMM9,
 	XMM10:          maskXMM10,
 	LoadArgs:       oneArgLoadArgs,
 	Setup:          fxSetup,
-	LoadXMMRegs:    fxLoadXMMRegs65536 + "\n" + maskLoadXMMRegs,
+	LoadXMMRegs:    fxLoadXMMRegs + "\n" + maskLoadXMMRegs,
 	Add:            fxAdd,
-	ClampAndScale:  fxClampAndScale65536,
+	ClampAndScale:  fxClampAndScale,
 	ConvertToInt32: fxConvertToInt32,
 	Store4:         maskStore4,
 	Store1:         maskStore1,
@@ -168,7 +168,7 @@ var instances = []struct {
 	Args:           "dst []uint8, src []float32",
 	DstElemSize1:   1 * sizeOfUint8,
 	DstElemSize4:   4 * sizeOfUint8,
-	XMM3:           flXMM3_65536,
+	XMM3:           flXMM3,
 	XMM4:           flXMM4,
 	XMM5:           flXMM5,
 	XMM6:           opOverXMM6,
@@ -177,9 +177,9 @@ var instances = []struct {
 	XMM10:          opOverXMM10,
 	LoadArgs:       twoArgLoadArgs,
 	Setup:          flSetup,
-	LoadXMMRegs:    flLoadXMMRegs65536 + "\n" + opOverLoadXMMRegs,
+	LoadXMMRegs:    flLoadXMMRegs + "\n" + opOverLoadXMMRegs,
 	Add:            flAdd,
-	ClampAndScale:  flClampAndScale65536,
+	ClampAndScale:  flClampAndScale,
 	ConvertToInt32: flConvertToInt32,
 	Store4:         opOverStore4,
 	Store1:         opOverStore1,
@@ -191,7 +191,7 @@ var instances = []struct {
 	Args:           "dst []uint8, src []float32",
 	DstElemSize1:   1 * sizeOfUint8,
 	DstElemSize4:   4 * sizeOfUint8,
-	XMM3:           flXMM3_256,
+	XMM3:           flXMM3,
 	XMM4:           flXMM4,
 	XMM5:           flXMM5,
 	XMM6:           opSrcXMM6,
@@ -200,9 +200,9 @@ var instances = []struct {
 	XMM10:          opSrcXMM10,
 	LoadArgs:       twoArgLoadArgs,
 	Setup:          flSetup,
-	LoadXMMRegs:    flLoadXMMRegs256 + "\n" + opSrcLoadXMMRegs,
+	LoadXMMRegs:    flLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
 	Add:            flAdd,
-	ClampAndScale:  flClampAndScale256,
+	ClampAndScale:  flClampAndScale,
 	ConvertToInt32: flConvertToInt32,
 	Store4:         opSrcStore4,
 	Store1:         opSrcStore1,
@@ -214,7 +214,7 @@ var instances = []struct {
 	Args:           "dst []uint32, src []float32",
 	DstElemSize1:   1 * sizeOfUint32,
 	DstElemSize4:   4 * sizeOfUint32,
-	XMM3:           flXMM3_65536,
+	XMM3:           flXMM3,
 	XMM4:           flXMM4,
 	XMM5:           flXMM5,
 	XMM6:           maskXMM6,
@@ -223,9 +223,9 @@ var instances = []struct {
 	XMM10:          maskXMM10,
 	LoadArgs:       twoArgLoadArgs,
 	Setup:          flSetup,
-	LoadXMMRegs:    flLoadXMMRegs65536 + "\n" + maskLoadXMMRegs,
+	LoadXMMRegs:    flLoadXMMRegs + "\n" + maskLoadXMMRegs,
 	Add:            flAdd,
-	ClampAndScale:  flClampAndScale65536,
+	ClampAndScale:  flClampAndScale,
 	ConvertToInt32: flConvertToInt32,
 	Store4:         maskStore4,
 	Store1:         maskStore1,
@@ -241,16 +241,14 @@ const (
 	sizeOfUint8  = 1
 	sizeOfUint32 = 4
 
-	fxXMM3       = `-`
-	flXMM3_256   = `flAlmost256`
-	flXMM3_65536 = `flAlmost65536`
+	fxXMM3 = `-`
+	flXMM3 = `flAlmost65536`
 
 	fxXMM4 = `-`
 	flXMM4 = `flOne`
 
-	fxXMM5_256   = `fxAlmost256`
-	fxXMM5_65536 = `fxAlmost65536`
-	flXMM5       = `flSignMask`
+	fxXMM5 = `fxAlmost65536`
+	flXMM5 = `flSignMask`
 
 	oneArgLoadArgs = `
 		MOVQ buf_base+0(FP), DI
@@ -278,23 +276,11 @@ const (
 		MOVL    AX, mxcsrNew-4(SP)
 		`
 
-	fxLoadXMMRegs256 = `
-		// fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8.
-		MOVOU fxAlmost256<>(SB), X5
-		`
-	fxLoadXMMRegs65536 = `
+	fxLoadXMMRegs = `
 		// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
 		MOVOU fxAlmost65536<>(SB), X5
 		`
-	flLoadXMMRegs256 = `
-		// flAlmost256 := XMM(0x437fffff repeated four times) // 255.99998 as a float32.
-		// flOne       := XMM(0x3f800000 repeated four times) // 1 as a float32.
-		// flSignMask  := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
-		MOVOU flAlmost256<>(SB), X3
-		MOVOU flOne<>(SB), X4
-		MOVOU flSignMask<>(SB), X5
-		`
-	flLoadXMMRegs65536 = `
+	flLoadXMMRegs = `
 		// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
 		// flOne         := XMM(0x3f800000 repeated four times) // 1 as a float32.
 		// flSignMask    := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
@@ -306,23 +292,7 @@ const (
 	fxAdd = `PADDD`
 	flAdd = `ADDPS`
 
-	fxClampAndScale256 = `
-		// y = abs(x)
-		// y >>= 10 // Shift by 2*ϕ - 8.
-		// y = min(y, fxAlmost256)
-		//
-		// pabsd  %xmm1,%xmm2
-		// psrld  $0xa,%xmm2
-		// pminud %xmm5,%xmm2
-		//
-		// Hopefully we'll get these opcode mnemonics into the assembler for Go
-		// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
-		// it's similar.
-		BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
-		BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0a
-		BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
-		`
-	fxClampAndScale65536 = `
+	fxClampAndScale = `
 		// y = abs(x)
 		// y >>= 2 // Shift by 2*ϕ - 16.
 		// y = min(y, fxAlmost65536)
@@ -338,16 +308,7 @@ const (
 		BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
 		BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
 		`
-	flClampAndScale256 = `
-		// y = x & flSignMask
-		// y = min(y, flOne)
-		// y = mul(y, flAlmost256)
-		MOVOU X5, X2
-		ANDPS X1, X2
-		MINPS X4, X2
-		MULPS X3, X2
-		`
-	flClampAndScale65536 = `
+	flClampAndScale = `
 		// y = x & flSignMask
 		// y = min(y, flOne)
 		// y = mul(y, flAlmost65536)
@@ -402,18 +363,16 @@ const (
 		BYTE  $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda
 		PSRLQ $47, X0
 		PSRLQ $47, X11
-		// Merge the two registers back to one, X11.
+		// Merge the two registers back to one, X11, and add maskA.
 		PSLLQ $32, X11
 		XORPS X0, X11
-		// Add maskA, shift from 16 bit color to 8 bit color.
-		PADDD  X11, X2
-		PSRLQ  $8, X2
-		// As per opSrcStore4, shuffle and copy the low 4 bytes.
+		PADDD X11, X2
+		// As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
 		PSHUFB X6, X2
 		MOVL   X2, (DI)
 		`
 	opSrcStore4 = `
-		// z = shuffleTheLowBytesOfEach4ByteElement(z)
+		// z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
 		// copy(dst[:4], low4BytesOf(z))
 		PSHUFB X6, X2
 		MOVL   X2, (DI)
@@ -444,8 +403,9 @@ const (
 		MOVB    R13, (DI)
 		`
 	opSrcStore1 = `
-		// dst[0] = uint8(z)
+		// dst[0] = uint8(z>>8)
 		MOVL X2, BX
+		SHRL $8, BX
 		MOVB BX, (DI)
 		`
 	maskStore1 = `
diff --git a/vector/gen_acc_amd64.s.tmpl b/vector/gen_acc_amd64.s.tmpl
index 615d7a0..05ce25b 100644
--- a/vector/gen_acc_amd64.s.tmpl
+++ b/vector/gen_acc_amd64.s.tmpl
@@ -10,8 +10,6 @@
 
 // fl is short for floating point math. fx is short for fixed point math.
 
-DATA flAlmost256<>+0x00(SB)/8, $0x437fffff437fffff
-DATA flAlmost256<>+0x08(SB)/8, $0x437fffff437fffff
 DATA flAlmost65536<>+0x00(SB)/8, $0x477fffff477fffff
 DATA flAlmost65536<>+0x08(SB)/8, $0x477fffff477fffff
 DATA flOne<>+0x00(SB)/8, $0x3f8000003f800000
@@ -30,30 +28,26 @@ DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff
 DATA scatterAndMulBy0x101<>+0x00(SB)/8, $0x8080010180800000
 DATA scatterAndMulBy0x101<>+0x08(SB)/8, $0x8080030380800202
 
-// gather is a PSHUFB mask that brings the low byte of the XMM register's four
-// uint32 values to the low four bytes of that register.
+// gather is a PSHUFB mask that brings the second-lowest byte of the XMM
+// register's four uint32 values to the low four bytes of that register.
 //
 // It transforms a little-endian 16-byte XMM value from
-//	i???j???k???l???
+//	?i???j???k???l??
 // to
 //	ijkl000000000000
-DATA gather<>+0x00(SB)/8, $0x808080800c080400
+DATA gather<>+0x00(SB)/8, $0x808080800d090501
 DATA gather<>+0x08(SB)/8, $0x8080808080808080
 
-DATA fxAlmost256<>+0x00(SB)/8, $0x000000ff000000ff
-DATA fxAlmost256<>+0x08(SB)/8, $0x000000ff000000ff
 DATA fxAlmost65536<>+0x00(SB)/8, $0x0000ffff0000ffff
 DATA fxAlmost65536<>+0x08(SB)/8, $0x0000ffff0000ffff
 DATA inverseFFFF<>+0x00(SB)/8, $0x8000800180008001
 DATA inverseFFFF<>+0x08(SB)/8, $0x8000800180008001
 
-GLOBL flAlmost256<>(SB), (NOPTR+RODATA), $16
 GLOBL flAlmost65536<>(SB), (NOPTR+RODATA), $16
 GLOBL flOne<>(SB), (NOPTR+RODATA), $16
 GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16
 GLOBL scatterAndMulBy0x101<>(SB), (NOPTR+RODATA), $16
 GLOBL gather<>(SB), (NOPTR+RODATA), $16
-GLOBL fxAlmost256<>(SB), (NOPTR+RODATA), $16
 GLOBL fxAlmost65536<>(SB), (NOPTR+RODATA), $16
 GLOBL inverseFFFF<>(SB), (NOPTR+RODATA), $16