vector: add SIMD versions of xxxAccumulateOpSrc.

name                old time/op  new time/op  delta
GlyphAlpha16Src-8   3.37µs ± 0%  3.07µs ± 1%   -8.86%  (p=0.000 n=9+9)
GlyphAlpha32Src-8   6.01µs ± 1%  4.55µs ± 0%  -24.28%  (p=0.000 n=10+9)
GlyphAlpha64Src-8   13.2µs ± 0%   8.1µs ± 0%  -38.69%  (p=0.000 n=10+9)
GlyphAlpha128Src-8  32.9µs ± 0%  16.9µs ± 0%  -48.85%  (p=0.000 n=10+9)
GlyphAlpha256Src-8  98.0µs ± 0%  43.6µs ± 1%  -55.50%  (p=0.000 n=10+10)

A comparison of the non-SIMD and SIMD versions:

name                             time/op
FixedAccumulateOpSrc16-8          368ns ± 0%
FixedAccumulateOpSrcSIMD16-8     86.8ns ± 1%
FloatingAccumulateOpSrc16-8       434ns ± 0%
FloatingAccumulateOpSrcSIMD16-8   119ns ± 0%
FixedAccumulateOpSrc64-8         6.12µs ± 0%
FixedAccumulateOpSrcSIMD64-8     1.17µs ± 0%
FloatingAccumulateOpSrc64-8      7.15µs ± 0%
FloatingAccumulateOpSrcSIMD64-8  1.68µs ± 1%

Change-Id: I58e5c7a3ecd12e536aab8e765e94275453d0eac8
Reviewed-on: https://go-review.googlesource.com/30431
Reviewed-by: David Crawshaw <crawshaw@golang.org>

parent dc590effac
commit 746988e7a2

vector/acc_amd64.go (new file, 21 lines)
@@ -0,0 +1,21 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+package vector
+
+func haveSSE4_1() bool
+
+var haveFixedAccumulateSIMD = haveSSE4_1()
+
+const haveFloatingAccumulateSIMD = true
+
+//go:noescape
+func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32)
+
+//go:noescape
+func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32)
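
For orientation before the assembly below: the two declared routines compute the same result as the package's existing non-SIMD accumulators. A rough pure-Go sketch of the fixed-point semantics, following the comments in acc_amd64.s (cumulative sum, abs, shift by 2*ϕ - 8, clamp); the function and constant names here are illustrative, not part of the package:

    package main

    import "fmt"

    // phi mirrors the package's ϕ constant: fractional bits of the fixed-point format.
    const phi = 10

    // fixedAccumulateOpSrcRef is an illustrative scalar version of what
    // fixedAccumulateOpSrcSIMD computes: a running sum of signed area deltas,
    // folded to 8-bit alpha via abs, >> (2*phi - 8) and a clamp to 255.
    func fixedAccumulateOpSrcRef(dst []uint8, src []uint32) {
        acc := int32(0)
        for i, v := range src {
            acc += int32(v) // cumulative sum; the "offset" register in the assembly
            y := acc
            if y < 0 {
                y = -y // y = abs(x)
            }
            y >>= 2*phi - 8 // shift by 2*ϕ - 8, i.e. >> 12 for ϕ = 10
            if y > 0xff {
                y = 0xff // min(y, fxAlmost256)
            }
            dst[i] = uint8(y)
        }
    }

    func main() {
        quarter := uint32(1) << (2*phi - 2) // a quarter-covered pixel, illustrative value
        src := []uint32{quarter, quarter, quarter, quarter}
        dst := make([]uint8, len(src))
        fixedAccumulateOpSrcRef(dst, src)
        fmt.Println(dst) // [64 128 192 255]: running coverage, clamped at 255
    }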
vector/acc_amd64.s (new file, 321 lines)
@@ -0,0 +1,321 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// fl is short for floating point math. fx is short for fixed point math.
+
+DATA flAlmost256<>+0x00(SB)/8, $0x437fffff437fffff
+DATA flAlmost256<>+0x08(SB)/8, $0x437fffff437fffff
+DATA flOnes<>+0x00(SB)/8, $0x3f8000003f800000
+DATA flOnes<>+0x08(SB)/8, $0x3f8000003f800000
+DATA flSignMask<>+0x00(SB)/8, $0x7fffffff7fffffff
+DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff
+DATA shuffleMask<>+0x00(SB)/8, $0x0c0804000c080400
+DATA shuffleMask<>+0x08(SB)/8, $0x0c0804000c080400
+DATA fxAlmost256<>+0x00(SB)/8, $0x000000ff000000ff
+DATA fxAlmost256<>+0x08(SB)/8, $0x000000ff000000ff
+
+GLOBL flAlmost256<>(SB), (NOPTR+RODATA), $16
+GLOBL flOnes<>(SB), (NOPTR+RODATA), $16
+GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16
+GLOBL shuffleMask<>(SB), (NOPTR+RODATA), $16
+GLOBL fxAlmost256<>(SB), (NOPTR+RODATA), $16
+
+// func haveSSE4_1() bool
+TEXT ·haveSSE4_1(SB), NOSPLIT, $0
+    MOVQ $1, AX
+    CPUID
+    SHRQ $19, CX
+    ANDQ $1, CX
+    MOVB CX, ret+0(FP)
+    RET
+
+// ----------------------------------------------------------------------------
+
+// func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32)
+//
+// XMM registers. Variable names are per
+// https://github.com/google/font-rs/blob/master/src/accumulate.c
+//
+// xmm0 scratch
+// xmm1 x
+// xmm2 y, z
+// xmm3 -
+// xmm4 -
+// xmm5 fxAlmost256
+// xmm6 shuffleMask
+// xmm7 offset
+TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48
+    MOVQ dst_base+0(FP), DI
+    MOVQ dst_len+8(FP), BX
+    MOVQ src_base+24(FP), SI
+    MOVQ src_len+32(FP), CX
+
+    // Sanity check that len(dst) >= len(src).
+    CMPQ BX, CX
+    JLT  fxAccOpSrcEnd
+
+    // CX = len(src) &^ 3
+    // DX = len(src)
+    MOVQ CX, DX
+    ANDQ $-4, CX
+
+    // fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8.
+    // shuffleMask := XMM(0x0c080400 repeated four times) // PSHUFB shuffle mask.
+    // offset := XMM(0x00000000 repeated four times) // Cumulative sum.
+    MOVOU fxAlmost256<>(SB), X5
+    MOVOU shuffleMask<>(SB), X6
+    XORPS X7, X7
+
+    // i := 0
+    MOVQ $0, AX
+
+fxAccOpSrcLoop4:
+    // for i < (len(src) &^ 3)
+    CMPQ AX, CX
+    JAE  fxAccOpSrcLoop1
+
+    // x = XMM(s0, s1, s2, s3)
+    //
+    // Where s0 is src[i+0], s1 is src[i+1], etc.
+    MOVOU (SI), X1
+
+    // scratch = XMM(0, s0, s1, s2)
+    // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
+    MOVOU X1, X0
+    PSLLO $4, X0
+    PADDD X0, X1
+
+    // scratch = XMM(0, 0, 0, 0)
+    // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
+    // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
+    XORPS X0, X0
+    SHUFPS $0x40, X1, X0
+    PADDD X0, X1
+
+    // x += offset
+    PADDD X7, X1
+
+    // y = abs(x)
+    // y >>= 12 // Shift by 2*ϕ - 8.
+    // y = min(y, fxAlmost256)
+    //
+    // pabsd  %xmm1,%xmm2
+    // psrld  $0xc,%xmm2
+    // pminud %xmm5,%xmm2
+    //
+    // Hopefully we'll get these opcode mnemonics into the assembler for Go
+    // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
+    // it's similar.
+    BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
+    BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c
+    BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+
+    // z = shuffleTheLowBytesOfEach4ByteElement(y)
+    // copy(dst[:4], low4BytesOf(z))
+    PSHUFB X6, X2
+    MOVL X2, (DI)
+
+    // offset = XMM(x@3, x@3, x@3, x@3)
+    MOVOU X1, X7
+    SHUFPS $0xff, X1, X7
+
+    // i += 4
+    // dst = dst[4:]
+    // src = src[4:]
+    ADDQ $4, AX
+    ADDQ $4, DI
+    ADDQ $16, SI
+    JMP fxAccOpSrcLoop4
+
+fxAccOpSrcLoop1:
+    // for i < len(src)
+    CMPQ AX, DX
+    JAE  fxAccOpSrcEnd
+
+    // x = src[i] + offset
+    MOVL (SI), X1
+    PADDD X7, X1
+
+    // y = abs(x)
+    // y >>= 12 // Shift by 2*ϕ - 8.
+    // y = min(y, fxAlmost256)
+    //
+    // pabsd  %xmm1,%xmm2
+    // psrld  $0xc,%xmm2
+    // pminud %xmm5,%xmm2
+    //
+    // Hopefully we'll get these opcode mnemonics into the assembler for Go
+    // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
+    // it's similar.
+    BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
+    BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c
+    BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+
+    // dst[0] = uint8(y)
+    MOVL X2, BX
+    MOVB BX, (DI)
+
+    // offset = x
+    MOVOU X1, X7
+
+    // i += 1
+    // dst = dst[1:]
+    // src = src[1:]
+    ADDQ $1, AX
+    ADDQ $1, DI
+    ADDQ $4, SI
+    JMP fxAccOpSrcLoop1
+
+fxAccOpSrcEnd:
+    RET
+
+// ----------------------------------------------------------------------------
+
+// func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32)
+//
+// XMM registers. Variable names are per
+// https://github.com/google/font-rs/blob/master/src/accumulate.c
+//
+// xmm0 scratch
+// xmm1 x
+// xmm2 y, z
+// xmm3 flAlmost256
+// xmm4 flOnes
+// xmm5 flSignMask
+// xmm6 shuffleMask
+// xmm7 offset
+TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48
+    MOVQ dst_base+0(FP), DI
+    MOVQ dst_len+8(FP), BX
+    MOVQ src_base+24(FP), SI
+    MOVQ src_len+32(FP), CX
+
+    // Sanity check that len(dst) >= len(src).
+    CMPQ BX, CX
+    JLT  flAccOpSrcEnd
+
+    // CX = len(src) &^ 3
+    // DX = len(src)
+    MOVQ CX, DX
+    ANDQ $-4, CX
+
+    // Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero".
+    STMXCSR mxcsrOrig-8(SP)
+    MOVL mxcsrOrig-8(SP), AX
+    ORL $0x6000, AX
+    MOVL AX, mxcsrNew-4(SP)
+    LDMXCSR mxcsrNew-4(SP)
+
+    // flAlmost256 := XMM(0x437fffff repeated four times) // 255.99998 as a float32.
+    // flOnes := XMM(0x3f800000 repeated four times) // 1 as a float32.
+    // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
+    // shuffleMask := XMM(0x0c080400 repeated four times) // PSHUFB shuffle mask.
+    // offset := XMM(0x00000000 repeated four times) // Cumulative sum.
+    MOVOU flAlmost256<>(SB), X3
+    MOVOU flOnes<>(SB), X4
+    MOVOU flSignMask<>(SB), X5
+    MOVOU shuffleMask<>(SB), X6
+    XORPS X7, X7
+
+    // i := 0
+    MOVQ $0, AX
+
+flAccOpSrcLoop4:
+    // for i < (len(src) &^ 3)
+    CMPQ AX, CX
+    JAE  flAccOpSrcLoop1
+
+    // x = XMM(s0, s1, s2, s3)
+    //
+    // Where s0 is src[i+0], s1 is src[i+1], etc.
+    MOVOU (SI), X1
+
+    // scratch = XMM(0, s0, s1, s2)
+    // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
+    MOVOU X1, X0
+    PSLLO $4, X0
+    ADDPS X0, X1
+
+    // scratch = XMM(0, 0, 0, 0)
+    // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
+    // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
+    XORPS X0, X0
+    SHUFPS $0x40, X1, X0
+    ADDPS X0, X1
+
+    // x += offset
+    ADDPS X7, X1
+
+    // y = x & flSignMask
+    // y = min(y, flOnes)
+    // y = mul(y, flAlmost256)
+    MOVOU X5, X2
+    ANDPS X1, X2
+    MINPS X4, X2
+    MULPS X3, X2
+
+    // z = float32ToInt32(y)
+    // z = shuffleTheLowBytesOfEach4ByteElement(z)
+    // copy(dst[:4], low4BytesOf(z))
+    CVTPS2PL X2, X2
+    PSHUFB X6, X2
+    MOVL X2, (DI)
+
+    // offset = XMM(x@3, x@3, x@3, x@3)
+    MOVOU X1, X7
+    SHUFPS $0xff, X1, X7
+
+    // i += 4
+    // dst = dst[4:]
+    // src = src[4:]
+    ADDQ $4, AX
+    ADDQ $4, DI
+    ADDQ $16, SI
+    JMP flAccOpSrcLoop4
+
+flAccOpSrcLoop1:
+    // for i < len(src)
+    CMPQ AX, DX
+    JAE  flAccOpSrcRestoreMXCSR
+
+    // x = src[i] + offset
+    MOVL (SI), X1
+    ADDPS X7, X1
+
+    // y = x & flSignMask
+    // y = min(y, flOnes)
+    // y = mul(y, flAlmost256)
+    MOVOU X5, X2
+    ANDPS X1, X2
+    MINPS X4, X2
+    MULPS X3, X2
+
+    // z = float32ToInt32(y)
+    // dst[0] = uint8(z)
+    CVTPS2PL X2, X2
+    MOVL X2, BX
+    MOVB BX, (DI)
+
+    // offset = x
+    MOVOU X1, X7
+
+    // i += 1
+    // dst = dst[1:]
+    // src = src[1:]
+    ADDQ $1, AX
+    ADDQ $1, DI
+    ADDQ $4, SI
+    JMP flAccOpSrcLoop1
+
+flAccOpSrcRestoreMXCSR:
+    LDMXCSR mxcsrOrig-8(SP)
+
+flAccOpSrcEnd:
+    RET
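
The core of both loops above is a 4-wide inclusive prefix sum built from two shift-and-add steps (PSLLO then SHUFPS, each followed by PADDD or ADDPS), with the previous chunk's total carried in xmm7. A scalar Go sketch of just that step, with illustrative names and int32 lanes as in the fixed-point case:

    package main

    import "fmt"

    // prefixSum4 models the two shift-and-add steps in the SIMD loops: it turns
    // one chunk (s0, s1, s2, s3) into running sums and carries lane 3 forward
    // as the next chunk's offset.
    func prefixSum4(s [4]int32, offset int32) (sums [4]int32, nextOffset int32) {
        x := s
        // x += (0, s0, s1, s2): yields (s0, s0+s1, s1+s2, s2+s3).
        x = [4]int32{x[0], x[0] + x[1], x[1] + x[2], x[2] + x[3]}
        // x += (0, 0, x@0, x@1): yields (s0, s0+s1, s0+s1+s2, s0+s1+s2+s3).
        x = [4]int32{x[0], x[1], x[0] + x[2], x[1] + x[3]}
        // x += offset, lane-wise; lane 3 becomes the carried offset.
        for i := range x {
            x[i] += offset
        }
        return x, x[3]
    }

    func main() {
        sums, off := prefixSum4([4]int32{1, 2, 3, 4}, 10)
        fmt.Println(sums, off) // [11 13 16 20] 20
    }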
vector/acc_other.go (new file, 13 lines)
@@ -0,0 +1,13 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64 appengine !gc noasm
+
+package vector
+
+const haveFixedAccumulateSIMD = false
+const haveFloatingAccumulateSIMD = false
+
+func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) {}
+func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) {}
@@ -10,6 +10,87 @@ import (
 	"testing"
 )
 
+// TestXxxSIMDUnaligned tests that unaligned SIMD loads/stores don't crash.
+
+func TestFixedAccumulateSIMDUnaligned(t *testing.T) {
+    if !haveFixedAccumulateSIMD {
+        t.Skip("No SIMD implemention")
+    }
+
+    dst := make([]uint8, 64)
+    src := make([]uint32, 64)
+    for d := 0; d < 16; d++ {
+        for s := 0; s < 16; s++ {
+            fixedAccumulateOpSrcSIMD(dst[d:d+32], src[s:s+32])
+        }
+    }
+}
+
+func TestFloatingAccumulateSIMDUnaligned(t *testing.T) {
+    if !haveFloatingAccumulateSIMD {
+        t.Skip("No SIMD implemention")
+    }
+
+    dst := make([]uint8, 64)
+    src := make([]float32, 64)
+    for d := 0; d < 16; d++ {
+        for s := 0; s < 16; s++ {
+            floatingAccumulateOpSrcSIMD(dst[d:d+32], src[s:s+32])
+        }
+    }
+}
+
+// TestXxxSIMDShortDst tests that the SIMD implementations don't write past the
+// end of the dst buffer.
+
+func TestFixedAccumulateSIMDShortDst(t *testing.T) {
+    if !haveFixedAccumulateSIMD {
+        t.Skip("No SIMD implemention")
+    }
+
+    const oneQuarter = uint32(int2ϕ(fxOne*fxOne)) / 4
+    src := []uint32{oneQuarter, oneQuarter, oneQuarter, oneQuarter}
+    for i := 0; i < 4; i++ {
+        dst := make([]uint8, 4)
+        fixedAccumulateOpSrcSIMD(dst[:i], src[:i])
+        for j := range dst {
+            if j < i {
+                if got := dst[j]; got == 0 {
+                    t.Errorf("i=%d, j=%d: got %#02x, want non-zero", i, j, got)
+                }
+            } else {
+                if got := dst[j]; got != 0 {
+                    t.Errorf("i=%d, j=%d: got %#02x, want zero", i, j, got)
+                }
+            }
+        }
+    }
+}
+
+func TestFloatingAccumulateSIMDShortDst(t *testing.T) {
+    if !haveFloatingAccumulateSIMD {
+        t.Skip("No SIMD implemention")
+    }
+
+    const oneQuarter = 0.25
+    src := []float32{oneQuarter, oneQuarter, oneQuarter, oneQuarter}
+    for i := 0; i < 4; i++ {
+        dst := make([]uint8, 4)
+        floatingAccumulateOpSrcSIMD(dst[:i], src[:i])
+        for j := range dst {
+            if j < i {
+                if got := dst[j]; got == 0 {
+                    t.Errorf("i=%d, j=%d: got %#02x, want non-zero", i, j, got)
+                }
+            } else {
+                if got := dst[j]; got != 0 {
+                    t.Errorf("i=%d, j=%d: got %#02x, want zero", i, j, got)
+                }
+            }
+        }
+    }
+}
+
 func TestFixedAccumulateOpOverShort(t *testing.T) { testAcc(t, fxInShort, fxMaskShort, "over") }
 func TestFixedAccumulateOpSrcShort(t *testing.T) { testAcc(t, fxInShort, fxMaskShort, "src") }
 func TestFixedAccumulateMaskShort(t *testing.T) { testAcc(t, fxInShort, fxMaskShort, "mask") }
@@ -25,81 +106,97 @@ func TestFloatingAccumulateOpSrc16(t *testing.T) { testAcc(t, flIn16, flMask16,
 func TestFloatingAccumulateMask16(t *testing.T) { testAcc(t, flIn16, flMask16, "mask") }
 
 func testAcc(t *testing.T, in interface{}, mask []uint32, op string) {
-    maxN := 0
-    switch in := in.(type) {
-    case []uint32:
-        maxN = len(in)
-    case []float32:
-        maxN = len(in)
-    }
-
-    for _, n := range []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
-        33, 55, 79, 96, 120, 165, 256, maxN} {
-
-        if n > maxN {
-            continue
-        }
-
-        var (
-            got8, want8 []uint8
-            got32, want32 []uint32
-        )
-        switch op {
-        case "over":
-            const background = 0x40
-            got8 = make([]uint8, n)
-            for i := range got8 {
-                got8[i] = background
-            }
-            want8 = make([]uint8, n)
-            for i := range want8 {
-                dstA := uint32(background * 0x101)
-                maskA := mask[i]
-                outA := dstA*(0xffff-maskA)/0xffff + maskA
-                want8[i] = uint8(outA >> 8)
-            }
-
-        case "src":
-            got8 = make([]uint8, n)
-            want8 = make([]uint8, n)
-            for i := range want8 {
-                want8[i] = uint8(mask[i] >> 8)
-            }
-
-        case "mask":
-            got32 = make([]uint32, n)
-            want32 = mask[:n]
-        }
-
-        switch in := in.(type) {
-        case []uint32:
-            switch op {
-            case "over":
-                fixedAccumulateOpOver(got8, in[:n])
-            case "src":
-                fixedAccumulateOpSrc(got8, in[:n])
-            case "mask":
-                copy(got32, in[:n])
-                fixedAccumulateMask(got32)
-            }
-        case []float32:
-            switch op {
-            case "over":
-                floatingAccumulateOpOver(got8, in[:n])
-            case "src":
-                floatingAccumulateOpSrc(got8, in[:n])
-            case "mask":
-                floatingAccumulateMask(got32, in[:n])
-            }
-        }
-
-        if op != "mask" {
-            if !bytes.Equal(got8, want8) {
-                t.Errorf("n=%d:\ngot: % x\nwant: % x", n, got8, want8)
-            }
-        } else {
-            if !uint32sEqual(got32, want32) {
-                t.Errorf("n=%d:\ngot: % x\nwant: % x", n, got32, want32)
+    for _, simd := range []bool{false, true} {
+        maxN := 0
+        switch in := in.(type) {
+        case []uint32:
+            if simd && !haveFixedAccumulateSIMD {
+                continue
+            }
+            maxN = len(in)
+        case []float32:
+            if simd && !haveFloatingAccumulateSIMD {
+                continue
+            }
+            maxN = len(in)
+        }
+
+        for _, n := range []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+            33, 55, 79, 96, 120, 165, 256, maxN} {
+
+            if n > maxN {
+                continue
+            }
+
+            var (
+                got8, want8 []uint8
+                got32, want32 []uint32
+            )
+            switch op {
+            case "over":
+                const background = 0x40
+                got8 = make([]uint8, n)
+                for i := range got8 {
+                    got8[i] = background
+                }
+                want8 = make([]uint8, n)
+                for i := range want8 {
+                    dstA := uint32(background * 0x101)
+                    maskA := mask[i]
+                    outA := dstA*(0xffff-maskA)/0xffff + maskA
+                    want8[i] = uint8(outA >> 8)
+                }
+
+            case "src":
+                got8 = make([]uint8, n)
+                want8 = make([]uint8, n)
+                for i := range want8 {
+                    want8[i] = uint8(mask[i] >> 8)
+                }
+
+            case "mask":
+                got32 = make([]uint32, n)
+                want32 = mask[:n]
+            }
+
+            switch in := in.(type) {
+            case []uint32:
+                switch op {
+                case "over":
+                    fixedAccumulateOpOver(got8, in[:n])
+                case "src":
+                    if simd {
+                        fixedAccumulateOpSrcSIMD(got8, in[:n])
+                    } else {
+                        fixedAccumulateOpSrc(got8, in[:n])
+                    }
+                case "mask":
+                    copy(got32, in[:n])
+                    fixedAccumulateMask(got32)
+                }
+            case []float32:
+                switch op {
+                case "over":
+                    floatingAccumulateOpOver(got8, in[:n])
+                case "src":
+                    if simd {
+                        floatingAccumulateOpSrcSIMD(got8, in[:n])
+                    } else {
+                        floatingAccumulateOpSrc(got8, in[:n])
+                    }
+                case "mask":
+                    floatingAccumulateMask(got32, in[:n])
+                }
+            }
+
+            if op != "mask" {
+                if !bytes.Equal(got8, want8) {
+                    t.Errorf("simd=%t, n=%d:\ngot: % x\nwant: % x", simd, n, got8, want8)
+                }
+            } else {
+                if !uint32sEqual(got32, want32) {
+                    t.Errorf("simd=%t, n=%d:\ngot: % x\nwant: % x", simd, n, got32, want32)
+                }
             }
         }
     }
@@ -129,45 +226,66 @@ func float32sEqual(xs, ys []float32) bool {
 	return true
 }
 
-func BenchmarkFixedAccumulateOpOver16(b *testing.B) { benchAcc(b, fxIn16, "over") }
-func BenchmarkFixedAccumulateOpSrc16(b *testing.B) { benchAcc(b, fxIn16, "src") }
-func BenchmarkFixedAccumulateMask16(b *testing.B) { benchAcc(b, fxIn16, "mask") }
-func BenchmarkFloatingAccumulateOpOver16(b *testing.B) { benchAcc(b, flIn16, "over") }
-func BenchmarkFloatingAccumulateOpSrc16(b *testing.B) { benchAcc(b, flIn16, "src") }
-func BenchmarkFloatingAccumulateMask16(b *testing.B) { benchAcc(b, flIn16, "mask") }
+func BenchmarkFixedAccumulateOpOver16(b *testing.B) { benchAcc(b, fxIn16, "over", false) }
+func BenchmarkFixedAccumulateOpSrc16(b *testing.B) { benchAcc(b, fxIn16, "src", false) }
+func BenchmarkFixedAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, fxIn16, "src", true) }
+func BenchmarkFixedAccumulateMask16(b *testing.B) { benchAcc(b, fxIn16, "mask", false) }
+func BenchmarkFloatingAccumulateOpOver16(b *testing.B) { benchAcc(b, flIn16, "over", false) }
+func BenchmarkFloatingAccumulateOpSrc16(b *testing.B) { benchAcc(b, flIn16, "src", false) }
+func BenchmarkFloatingAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, flIn16, "src", true) }
+func BenchmarkFloatingAccumulateMask16(b *testing.B) { benchAcc(b, flIn16, "mask", false) }
 
-func BenchmarkFixedAccumulateOpOver64(b *testing.B) { benchAcc(b, fxIn64, "over") }
-func BenchmarkFixedAccumulateOpSrc64(b *testing.B) { benchAcc(b, fxIn64, "src") }
-func BenchmarkFixedAccumulateMask64(b *testing.B) { benchAcc(b, fxIn64, "mask") }
-func BenchmarkFloatingAccumulateOpOver64(b *testing.B) { benchAcc(b, flIn64, "over") }
-func BenchmarkFloatingAccumulateOpSrc64(b *testing.B) { benchAcc(b, flIn64, "src") }
-func BenchmarkFloatingAccumulateMask64(b *testing.B) { benchAcc(b, flIn64, "mask") }
+func BenchmarkFixedAccumulateOpOver64(b *testing.B) { benchAcc(b, fxIn64, "over", false) }
+func BenchmarkFixedAccumulateOpSrc64(b *testing.B) { benchAcc(b, fxIn64, "src", false) }
+func BenchmarkFixedAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, fxIn64, "src", true) }
+func BenchmarkFixedAccumulateMask64(b *testing.B) { benchAcc(b, fxIn64, "mask", false) }
+func BenchmarkFloatingAccumulateOpOver64(b *testing.B) { benchAcc(b, flIn64, "over", false) }
+func BenchmarkFloatingAccumulateOpSrc64(b *testing.B) { benchAcc(b, flIn64, "src", false) }
+func BenchmarkFloatingAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, flIn64, "src", true) }
+func BenchmarkFloatingAccumulateMask64(b *testing.B) { benchAcc(b, flIn64, "mask", false) }
 
-func benchAcc(b *testing.B, in interface{}, op string) {
+func benchAcc(b *testing.B, in interface{}, op string, simd bool) {
 	var f func()
 
 	switch in := in.(type) {
 	case []uint32:
+        if simd && !haveFixedAccumulateSIMD {
+            b.Skip("No SIMD implemention")
+        }
+
 		switch op {
 		case "over":
 			dst := make([]uint8, len(in))
 			f = func() { fixedAccumulateOpOver(dst, in) }
 		case "src":
 			dst := make([]uint8, len(in))
-			f = func() { fixedAccumulateOpSrc(dst, in) }
+            if simd {
+                f = func() { fixedAccumulateOpSrcSIMD(dst, in) }
+            } else {
+                f = func() { fixedAccumulateOpSrc(dst, in) }
+            }
		case "mask":
			buf := make([]uint32, len(in))
			copy(buf, in)
			f = func() { fixedAccumulateMask(buf) }
		}
 
	case []float32:
+        if simd && !haveFloatingAccumulateSIMD {
+            b.Skip("No SIMD implemention")
+        }
+
		switch op {
		case "over":
			dst := make([]uint8, len(in))
			f = func() { floatingAccumulateOpOver(dst, in) }
		case "src":
			dst := make([]uint8, len(in))
-			f = func() { floatingAccumulateOpSrc(dst, in) }
+            if simd {
+                f = func() { floatingAccumulateOpSrcSIMD(dst, in) }
+            } else {
+                f = func() { floatingAccumulateOpSrc(dst, in) }
+            }
		case "mask":
			dst := make([]uint32, len(in))
			f = func() { floatingAccumulateMask(dst, in) }
@@ -21,9 +21,9 @@ const (
 	// in the .s files).
 	ϕ = 10
 
-	one int1ϕ = 1 << ϕ
-	oneAndAHalf int1ϕ = 1<<ϕ + 1<<(ϕ-1)
-	oneMinusIota int1ϕ = 1<<ϕ - 1 // Used for rounding up.
+	fxOne int1ϕ = 1 << ϕ
+	fxOneAndAHalf int1ϕ = 1<<ϕ + 1<<(ϕ-1)
+	fxOneMinusIota int1ϕ = 1<<ϕ - 1 // Used for rounding up.
 )
 
 // int1ϕ is a signed fixed-point number with 1*ϕ binary digits after the fixed
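
As a quick check on these constants: with ϕ = 10, the value the tests treat as full coverage is fxOne*fxOne = 1<<(2*ϕ), and the accumulators shift by 2*ϕ - 8 = 12 (the ">> 12" in acc_amd64.s) before clamping to 255. An illustrative, self-contained snippet (not part of the package):

    package main

    import "fmt"

    func main() {
        const phi = 10
        full := uint32(1) << (2 * phi) // 0x100000, full coverage in 2ϕ-bit fixed point
        y := full >> (2*phi - 8)       // 256, one past the uint8 maximum
        if y > 0xff {
            y = 0xff // hence the min(y, fxAlmost256) step in the assembly
        }
        fmt.Println(y) // 255
    }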
@@ -56,7 +56,7 @@ func fixedMin(x, y int1ϕ) int1ϕ {
 }
 
 func fixedFloor(x int1ϕ) int32 { return int32(x >> ϕ) }
-func fixedCeil(x int1ϕ) int32 { return int32((x + oneMinusIota) >> ϕ) }
+func fixedCeil(x int1ϕ) int32 { return int32((x + fxOneMinusIota) >> ϕ) }
 
 func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
 	a := z.pen
@@ -74,10 +74,10 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
 	}
 	dxdy := (b[0] - a[0]) / (b[1] - a[1])
 
-	ay := int1ϕ(a[1] * float32(one))
-	by := int1ϕ(b[1] * float32(one))
+	ay := int1ϕ(a[1] * float32(fxOne))
+	by := int1ϕ(b[1] * float32(fxOne))
 
-	x := int1ϕ(a[0] * float32(one))
+	x := int1ϕ(a[0] * float32(fxOne))
 	y := fixedFloor(ay)
 	yMax := fixedCeil(by)
 	if yMax > int32(z.size.Y) {
@@ -106,7 +106,7 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
 		if x1i <= x0i+1 {
 			xmf := (x+xNext)>>1 - x0Floor
 			if i := clamp(x0i+0, width); i < uint(len(buf)) {
-				buf[i] += uint32(d * (one - xmf))
+				buf[i] += uint32(d * (fxOne - xmf))
 			}
 			if i := clamp(x0i+1, width); i < uint(len(buf)) {
 				buf[i] += uint32(d * xmf)
@@ -115,9 +115,9 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
 			oneOverS := x1 - x0
 			twoOverS := 2 * oneOverS
 			x0f := x0 - x0Floor
-			oneMinusX0f := one - x0f
+			oneMinusX0f := fxOne - x0f
 			oneMinusX0fSquared := oneMinusX0f * oneMinusX0f
-			x1f := x1 - x1Ceil + one
+			x1f := x1 - x1Ceil + fxOne
 			x1fSquared := x1f * x1f
 
 			// These next two variables are unused, as rounding errors are
@@ -139,7 +139,7 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
 
 			if x1i == x0i+2 {
 				if i := clamp(x0i+1, width); i < uint(len(buf)) {
-					// In ideal math: buf[i] += uint32(d * (one - a0 - am))
+					// In ideal math: buf[i] += uint32(d * (fxOne - a0 - am))
 					D := twoOverS<<ϕ - oneMinusX0fSquared - x1fSquared
 					D *= d
 					D /= twoOverS
@@ -148,14 +148,14 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
 			} else {
 				// This is commented out for the same reason as a0 and am.
 				//
-				// a1 := ((oneAndAHalf - x0f) << ϕ) / oneOverS
+				// a1 := ((fxOneAndAHalf - x0f) << ϕ) / oneOverS
 
 				if i := clamp(x0i+1, width); i < uint(len(buf)) {
 					// In ideal math: buf[i] += uint32(d * (a1 - a0))
 					//
 					// Convert to int64 to avoid overflow. Without that,
 					// TestRasterizePolygon fails.
-					D := int64((oneAndAHalf-x0f)<<(ϕ+1) - oneMinusX0fSquared)
+					D := int64((fxOneAndAHalf-x0f)<<(ϕ+1) - oneMinusX0fSquared)
 					D *= int64(d)
 					D /= int64(twoOverS)
 					buf[i] += uint32(D)
@@ -172,12 +172,12 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) {
 				// a2 := a1 + (int1ϕ(x1i-x0i-3)<<(2*ϕ))/oneOverS
 
 				if i := clamp(x1i-1, width); i < uint(len(buf)) {
-					// In ideal math: buf[i] += uint32(d * (one - a2 - am))
+					// In ideal math: buf[i] += uint32(d * (fxOne - a2 - am))
 					//
 					// Convert to int64 to avoid overflow. Without that,
 					// TestRasterizePolygon fails.
 					D := int64(twoOverS << ϕ)
-					D -= int64((oneAndAHalf - x0f) << (ϕ + 1))
+					D -= int64((fxOneAndAHalf - x0f) << (ϕ + 1))
 					D -= int64((x1i - x0i - 3) << (2*ϕ + 1))
 					D -= int64(x1fSquared)
 					D *= int64(d)
@@ -220,6 +220,11 @@ func fixedAccumulateOpOver(dst []uint8, src []uint32) {
 }
 
 func fixedAccumulateOpSrc(dst []uint8, src []uint32) {
+	// Sanity check that len(dst) >= len(src).
+	if len(dst) < len(src) {
+		return
+	}
+
 	acc := int2ϕ(0)
 	for i, v := range src {
 		acc += int2ϕ(v)
@@ -165,6 +165,11 @@ func floatingAccumulateOpOver(dst []uint8, src []float32) {
 }
 
 func floatingAccumulateOpSrc(dst []uint8, src []float32) {
+	// Sanity check that len(dst) >= len(src).
+	if len(dst) < len(src) {
+		return
+	}
+
 	acc := float32(0)
 	for i, v := range src {
 		acc += v
@@ -312,7 +312,6 @@ func (z *Rasterizer) accumulateMask() {
 }
 
 func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpOver(dst *image.Alpha, r image.Rectangle) {
-	// TODO: add SIMD implementations.
 	// TODO: non-zero vs even-odd winding?
 	if r == dst.Bounds() && r == z.Bounds() {
 		// We bypass the z.accumulateMask step and convert straight from
@@ -341,15 +340,22 @@ func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpOver(dst *image.Alpha, r image.
 }
 
 func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpSrc(dst *image.Alpha, r image.Rectangle) {
-	// TODO: add SIMD implementations.
 	// TODO: non-zero vs even-odd winding?
 	if r == dst.Bounds() && r == z.Bounds() {
 		// We bypass the z.accumulateMask step and convert straight from
 		// z.bufF32 or z.bufU32 to dst.Pix.
 		if z.useFloatingPointMath {
-			floatingAccumulateOpSrc(dst.Pix, z.bufF32)
+			if haveFloatingAccumulateSIMD {
+				floatingAccumulateOpSrcSIMD(dst.Pix, z.bufF32)
+			} else {
+				floatingAccumulateOpSrc(dst.Pix, z.bufF32)
+			}
 		} else {
-			fixedAccumulateOpSrc(dst.Pix, z.bufU32)
+			if haveFixedAccumulateSIMD {
+				fixedAccumulateOpSrcSIMD(dst.Pix, z.bufU32)
+			} else {
+				fixedAccumulateOpSrc(dst.Pix, z.bufU32)
+			}
 		}
 		return
 	}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user