diff --git a/vector/acc_amd64.go b/vector/acc_amd64.go new file mode 100644 index 0000000..e2149de --- /dev/null +++ b/vector/acc_amd64.go @@ -0,0 +1,21 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !appengine +// +build gc +// +build !noasm + +package vector + +func haveSSE4_1() bool + +var haveFixedAccumulateSIMD = haveSSE4_1() + +const haveFloatingAccumulateSIMD = true + +//go:noescape +func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) + +//go:noescape +func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) diff --git a/vector/acc_amd64.s b/vector/acc_amd64.s new file mode 100644 index 0000000..7ef7aac --- /dev/null +++ b/vector/acc_amd64.s @@ -0,0 +1,321 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !appengine +// +build gc +// +build !noasm + +#include "textflag.h" + +// fl is short for floating point math. fx is short for fixed point math. + +DATA flAlmost256<>+0x00(SB)/8, $0x437fffff437fffff +DATA flAlmost256<>+0x08(SB)/8, $0x437fffff437fffff +DATA flOnes<>+0x00(SB)/8, $0x3f8000003f800000 +DATA flOnes<>+0x08(SB)/8, $0x3f8000003f800000 +DATA flSignMask<>+0x00(SB)/8, $0x7fffffff7fffffff +DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff +DATA shuffleMask<>+0x00(SB)/8, $0x0c0804000c080400 +DATA shuffleMask<>+0x08(SB)/8, $0x0c0804000c080400 +DATA fxAlmost256<>+0x00(SB)/8, $0x000000ff000000ff +DATA fxAlmost256<>+0x08(SB)/8, $0x000000ff000000ff + +GLOBL flAlmost256<>(SB), (NOPTR+RODATA), $16 +GLOBL flOnes<>(SB), (NOPTR+RODATA), $16 +GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16 +GLOBL shuffleMask<>(SB), (NOPTR+RODATA), $16 +GLOBL fxAlmost256<>(SB), (NOPTR+RODATA), $16 + +// func haveSSE4_1() bool +TEXT ·haveSSE4_1(SB), NOSPLIT, $0 + MOVQ $1, AX + CPUID + SHRQ $19, CX + ANDQ $1, CX + MOVB CX, ret+0(FP) + RET + +// ---------------------------------------------------------------------------- + +// func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) +// +// XMM registers. Variable names are per +// https://github.com/google/font-rs/blob/master/src/accumulate.c +// +// xmm0 scratch +// xmm1 x +// xmm2 y, z +// xmm3 - +// xmm4 - +// xmm5 fxAlmost256 +// xmm6 shuffleMask +// xmm7 offset +TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48 + MOVQ dst_base+0(FP), DI + MOVQ dst_len+8(FP), BX + MOVQ src_base+24(FP), SI + MOVQ src_len+32(FP), CX + + // Sanity check that len(dst) >= len(src). + CMPQ BX, CX + JLT fxAccOpSrcEnd + + // CX = len(src) &^ 3 + // DX = len(src) + MOVQ CX, DX + ANDQ $-4, CX + + // fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8. + // shuffleMask := XMM(0x0c080400 repeated four times) // PSHUFB shuffle mask. + // offset := XMM(0x00000000 repeated four times) // Cumulative sum. + MOVOU fxAlmost256<>(SB), X5 + MOVOU shuffleMask<>(SB), X6 + XORPS X7, X7 + + // i := 0 + MOVQ $0, AX + +fxAccOpSrcLoop4: + // for i < (len(src) &^ 3) + CMPQ AX, CX + JAE fxAccOpSrcLoop1 + + // x = XMM(s0, s1, s2, s3) + // + // Where s0 is src[i+0], s1 is src[i+1], etc. 
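+	//
+	// As a rough scalar sketch, the work vectorized by this loop is what
+	// fixedAccumulateOpSrc in raster_fixed.go does one element at a time:
+	//
+	//	acc += int2ϕ(src[i])
+	//	a := acc
+	//	if a < 0 {
+	//		a = -a
+	//	}
+	//	a >>= 2*ϕ - 8
+	//	if a > 0xff {
+	//		a = 0xff
+	//	}
+	//	dst[i] = uint8(a)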
+ MOVOU (SI), X1 + + // scratch = XMM(0, s0, s1, s2) + // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) + MOVOU X1, X0 + PSLLO $4, X0 + PADDD X0, X1 + + // scratch = XMM(0, 0, 0, 0) + // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) + // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) + XORPS X0, X0 + SHUFPS $0x40, X1, X0 + PADDD X0, X1 + + // x += offset + PADDD X7, X1 + + // y = abs(x) + // y >>= 12 // Shift by 2*ϕ - 8. + // y = min(y, fxAlmost256) + // + // pabsd %xmm1,%xmm2 + // psrld $0xc,%xmm2 + // pminud %xmm5,%xmm2 + // + // Hopefully we'll get these opcode mnemonics into the assembler for Go + // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but + // it's similar. + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + + // z = shuffleTheLowBytesOfEach4ByteElement(y) + // copy(dst[:4], low4BytesOf(z)) + PSHUFB X6, X2 + MOVL X2, (DI) + + // offset = XMM(x@3, x@3, x@3, x@3) + MOVOU X1, X7 + SHUFPS $0xff, X1, X7 + + // i += 4 + // dst = dst[4:] + // src = src[4:] + ADDQ $4, AX + ADDQ $4, DI + ADDQ $16, SI + JMP fxAccOpSrcLoop4 + +fxAccOpSrcLoop1: + // for i < len(src) + CMPQ AX, DX + JAE fxAccOpSrcEnd + + // x = src[i] + offset + MOVL (SI), X1 + PADDD X7, X1 + + // y = abs(x) + // y >>= 12 // Shift by 2*ϕ - 8. + // y = min(y, fxAlmost256) + // + // pabsd %xmm1,%xmm2 + // psrld $0xc,%xmm2 + // pminud %xmm5,%xmm2 + // + // Hopefully we'll get these opcode mnemonics into the assembler for Go + // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but + // it's similar. + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + + // dst[0] = uint8(y) + MOVL X2, BX + MOVB BX, (DI) + + // offset = x + MOVOU X1, X7 + + // i += 1 + // dst = dst[1:] + // src = src[1:] + ADDQ $1, AX + ADDQ $1, DI + ADDQ $4, SI + JMP fxAccOpSrcLoop1 + +fxAccOpSrcEnd: + RET + +// ---------------------------------------------------------------------------- + +// func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) +// +// XMM registers. Variable names are per +// https://github.com/google/font-rs/blob/master/src/accumulate.c +// +// xmm0 scratch +// xmm1 x +// xmm2 y, z +// xmm3 flAlmost256 +// xmm4 flOnes +// xmm5 flSignMask +// xmm6 shuffleMask +// xmm7 offset +TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48 + MOVQ dst_base+0(FP), DI + MOVQ dst_len+8(FP), BX + MOVQ src_base+24(FP), SI + MOVQ src_len+32(FP), CX + + // Sanity check that len(dst) >= len(src). + CMPQ BX, CX + JLT flAccOpSrcEnd + + // CX = len(src) &^ 3 + // DX = len(src) + MOVQ CX, DX + ANDQ $-4, CX + + // Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero". + STMXCSR mxcsrOrig-8(SP) + MOVL mxcsrOrig-8(SP), AX + ORL $0x6000, AX + MOVL AX, mxcsrNew-4(SP) + LDMXCSR mxcsrNew-4(SP) + + // flAlmost256 := XMM(0x437fffff repeated four times) // 255.99998 as a float32. + // flOnes := XMM(0x3f800000 repeated four times) // 1 as a float32. + // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. + // shuffleMask := XMM(0x0c080400 repeated four times) // PSHUFB shuffle mask. + // offset := XMM(0x00000000 repeated four times) // Cumulative sum. 
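+	//
+	// As a rough scalar sketch, the work vectorized by the loops below is
+	// what floatingAccumulateOpSrc in raster_floating.go does one element at
+	// a time:
+	//
+	//	acc += src[i]
+	//	a := acc
+	//	if a < 0 {
+	//		a = -a
+	//	}
+	//	if a > 1 {
+	//		a = 1
+	//	}
+	//	dst[i] = uint8(255.99998 * a)
+	//
+	// The Round To Zero mode set above makes CVTPS2PL truncate, matching
+	// Go's float-to-integer conversion in that last line.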
+ MOVOU flAlmost256<>(SB), X3 + MOVOU flOnes<>(SB), X4 + MOVOU flSignMask<>(SB), X5 + MOVOU shuffleMask<>(SB), X6 + XORPS X7, X7 + + // i := 0 + MOVQ $0, AX + +flAccOpSrcLoop4: + // for i < (len(src) &^ 3) + CMPQ AX, CX + JAE flAccOpSrcLoop1 + + // x = XMM(s0, s1, s2, s3) + // + // Where s0 is src[i+0], s1 is src[i+1], etc. + MOVOU (SI), X1 + + // scratch = XMM(0, s0, s1, s2) + // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) + MOVOU X1, X0 + PSLLO $4, X0 + ADDPS X0, X1 + + // scratch = XMM(0, 0, 0, 0) + // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) + // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) + XORPS X0, X0 + SHUFPS $0x40, X1, X0 + ADDPS X0, X1 + + // x += offset + ADDPS X7, X1 + + // y = x & flSignMask + // y = min(y, flOnes) + // y = mul(y, flAlmost256) + MOVOU X5, X2 + ANDPS X1, X2 + MINPS X4, X2 + MULPS X3, X2 + + // z = float32ToInt32(y) + // z = shuffleTheLowBytesOfEach4ByteElement(z) + // copy(dst[:4], low4BytesOf(z)) + CVTPS2PL X2, X2 + PSHUFB X6, X2 + MOVL X2, (DI) + + // offset = XMM(x@3, x@3, x@3, x@3) + MOVOU X1, X7 + SHUFPS $0xff, X1, X7 + + // i += 4 + // dst = dst[4:] + // src = src[4:] + ADDQ $4, AX + ADDQ $4, DI + ADDQ $16, SI + JMP flAccOpSrcLoop4 + +flAccOpSrcLoop1: + // for i < len(src) + CMPQ AX, DX + JAE flAccOpSrcRestoreMXCSR + + // x = src[i] + offset + MOVL (SI), X1 + ADDPS X7, X1 + + // y = x & flSignMask + // y = min(y, flOnes) + // y = mul(y, flAlmost256) + MOVOU X5, X2 + ANDPS X1, X2 + MINPS X4, X2 + MULPS X3, X2 + + // z = float32ToInt32(y) + // dst[0] = uint8(z) + CVTPS2PL X2, X2 + MOVL X2, BX + MOVB BX, (DI) + + // offset = x + MOVOU X1, X7 + + // i += 1 + // dst = dst[1:] + // src = src[1:] + ADDQ $1, AX + ADDQ $1, DI + ADDQ $4, SI + JMP flAccOpSrcLoop1 + +flAccOpSrcRestoreMXCSR: + LDMXCSR mxcsrOrig-8(SP) + +flAccOpSrcEnd: + RET diff --git a/vector/acc_other.go b/vector/acc_other.go new file mode 100644 index 0000000..c0b78f8 --- /dev/null +++ b/vector/acc_other.go @@ -0,0 +1,13 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64 appengine !gc noasm + +package vector + +const haveFixedAccumulateSIMD = false +const haveFloatingAccumulateSIMD = false + +func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) {} +func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) {} diff --git a/vector/acc_test.go b/vector/acc_test.go index 8ba932e..1c59c2b 100644 --- a/vector/acc_test.go +++ b/vector/acc_test.go @@ -10,6 +10,87 @@ import ( "testing" ) +// TestXxxSIMDUnaligned tests that unaligned SIMD loads/stores don't crash. + +func TestFixedAccumulateSIMDUnaligned(t *testing.T) { + if !haveFixedAccumulateSIMD { + t.Skip("No SIMD implemention") + } + + dst := make([]uint8, 64) + src := make([]uint32, 64) + for d := 0; d < 16; d++ { + for s := 0; s < 16; s++ { + fixedAccumulateOpSrcSIMD(dst[d:d+32], src[s:s+32]) + } + } +} + +func TestFloatingAccumulateSIMDUnaligned(t *testing.T) { + if !haveFloatingAccumulateSIMD { + t.Skip("No SIMD implemention") + } + + dst := make([]uint8, 64) + src := make([]float32, 64) + for d := 0; d < 16; d++ { + for s := 0; s < 16; s++ { + floatingAccumulateOpSrcSIMD(dst[d:d+32], src[s:s+32]) + } + } +} + +// TestXxxSIMDShortDst tests that the SIMD implementations don't write past the +// end of the dst buffer. 
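+//
+// Passing dst[:i] and src[:i] exercises the one-at-a-time tail loops in
+// acc_amd64.s: the four-at-a-time loops only run while at least four src
+// elements remain, so for i < 4 exactly i bytes of dst should be written.
+// For the fixed-point oneQuarter input below, the cumulative coverage is
+// 0.25, 0.50, 0.75, 1.00, which quantizes to 0x40, 0x80, 0xc0, 0xff.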
+ +func TestFixedAccumulateSIMDShortDst(t *testing.T) { + if !haveFixedAccumulateSIMD { + t.Skip("No SIMD implemention") + } + + const oneQuarter = uint32(int2ϕ(fxOne*fxOne)) / 4 + src := []uint32{oneQuarter, oneQuarter, oneQuarter, oneQuarter} + for i := 0; i < 4; i++ { + dst := make([]uint8, 4) + fixedAccumulateOpSrcSIMD(dst[:i], src[:i]) + for j := range dst { + if j < i { + if got := dst[j]; got == 0 { + t.Errorf("i=%d, j=%d: got %#02x, want non-zero", i, j, got) + } + } else { + if got := dst[j]; got != 0 { + t.Errorf("i=%d, j=%d: got %#02x, want zero", i, j, got) + } + } + } + } +} + +func TestFloatingAccumulateSIMDShortDst(t *testing.T) { + if !haveFloatingAccumulateSIMD { + t.Skip("No SIMD implemention") + } + + const oneQuarter = 0.25 + src := []float32{oneQuarter, oneQuarter, oneQuarter, oneQuarter} + for i := 0; i < 4; i++ { + dst := make([]uint8, 4) + floatingAccumulateOpSrcSIMD(dst[:i], src[:i]) + for j := range dst { + if j < i { + if got := dst[j]; got == 0 { + t.Errorf("i=%d, j=%d: got %#02x, want non-zero", i, j, got) + } + } else { + if got := dst[j]; got != 0 { + t.Errorf("i=%d, j=%d: got %#02x, want zero", i, j, got) + } + } + } + } +} + func TestFixedAccumulateOpOverShort(t *testing.T) { testAcc(t, fxInShort, fxMaskShort, "over") } func TestFixedAccumulateOpSrcShort(t *testing.T) { testAcc(t, fxInShort, fxMaskShort, "src") } func TestFixedAccumulateMaskShort(t *testing.T) { testAcc(t, fxInShort, fxMaskShort, "mask") } @@ -25,81 +106,97 @@ func TestFloatingAccumulateOpSrc16(t *testing.T) { testAcc(t, flIn16, flMask16, func TestFloatingAccumulateMask16(t *testing.T) { testAcc(t, flIn16, flMask16, "mask") } func testAcc(t *testing.T, in interface{}, mask []uint32, op string) { - maxN := 0 - switch in := in.(type) { - case []uint32: - maxN = len(in) - case []float32: - maxN = len(in) - } - - for _, n := range []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, - 33, 55, 79, 96, 120, 165, 256, maxN} { - - if n > maxN { - continue - } - - var ( - got8, want8 []uint8 - got32, want32 []uint32 - ) - switch op { - case "over": - const background = 0x40 - got8 = make([]uint8, n) - for i := range got8 { - got8[i] = background - } - want8 = make([]uint8, n) - for i := range want8 { - dstA := uint32(background * 0x101) - maskA := mask[i] - outA := dstA*(0xffff-maskA)/0xffff + maskA - want8[i] = uint8(outA >> 8) - } - - case "src": - got8 = make([]uint8, n) - want8 = make([]uint8, n) - for i := range want8 { - want8[i] = uint8(mask[i] >> 8) - } - - case "mask": - got32 = make([]uint32, n) - want32 = mask[:n] - } - + for _, simd := range []bool{false, true} { + maxN := 0 switch in := in.(type) { case []uint32: - switch op { - case "over": - fixedAccumulateOpOver(got8, in[:n]) - case "src": - fixedAccumulateOpSrc(got8, in[:n]) - case "mask": - copy(got32, in[:n]) - fixedAccumulateMask(got32) + if simd && !haveFixedAccumulateSIMD { + continue } + maxN = len(in) case []float32: - switch op { - case "over": - floatingAccumulateOpOver(got8, in[:n]) - case "src": - floatingAccumulateOpSrc(got8, in[:n]) - case "mask": - floatingAccumulateMask(got32, in[:n]) + if simd && !haveFloatingAccumulateSIMD { + continue } + maxN = len(in) } - if op != "mask" { - if !bytes.Equal(got8, want8) { - t.Errorf("n=%d:\ngot: % x\nwant: % x", n, got8, want8) + for _, n := range []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 33, 55, 79, 96, 120, 165, 256, maxN} { + + if n > maxN { + continue } - } else { - if !uint32sEqual(got32, want32) { - t.Errorf("n=%d:\ngot: % x\nwant: 
% x", n, got32, want32) + + var ( + got8, want8 []uint8 + got32, want32 []uint32 + ) + switch op { + case "over": + const background = 0x40 + got8 = make([]uint8, n) + for i := range got8 { + got8[i] = background + } + want8 = make([]uint8, n) + for i := range want8 { + dstA := uint32(background * 0x101) + maskA := mask[i] + outA := dstA*(0xffff-maskA)/0xffff + maskA + want8[i] = uint8(outA >> 8) + } + + case "src": + got8 = make([]uint8, n) + want8 = make([]uint8, n) + for i := range want8 { + want8[i] = uint8(mask[i] >> 8) + } + + case "mask": + got32 = make([]uint32, n) + want32 = mask[:n] + } + + switch in := in.(type) { + case []uint32: + switch op { + case "over": + fixedAccumulateOpOver(got8, in[:n]) + case "src": + if simd { + fixedAccumulateOpSrcSIMD(got8, in[:n]) + } else { + fixedAccumulateOpSrc(got8, in[:n]) + } + case "mask": + copy(got32, in[:n]) + fixedAccumulateMask(got32) + } + case []float32: + switch op { + case "over": + floatingAccumulateOpOver(got8, in[:n]) + case "src": + if simd { + floatingAccumulateOpSrcSIMD(got8, in[:n]) + } else { + floatingAccumulateOpSrc(got8, in[:n]) + } + case "mask": + floatingAccumulateMask(got32, in[:n]) + } + } + + if op != "mask" { + if !bytes.Equal(got8, want8) { + t.Errorf("simd=%t, n=%d:\ngot: % x\nwant: % x", simd, n, got8, want8) + } + } else { + if !uint32sEqual(got32, want32) { + t.Errorf("simd=%t, n=%d:\ngot: % x\nwant: % x", simd, n, got32, want32) + } } } } @@ -129,45 +226,66 @@ func float32sEqual(xs, ys []float32) bool { return true } -func BenchmarkFixedAccumulateOpOver16(b *testing.B) { benchAcc(b, fxIn16, "over") } -func BenchmarkFixedAccumulateOpSrc16(b *testing.B) { benchAcc(b, fxIn16, "src") } -func BenchmarkFixedAccumulateMask16(b *testing.B) { benchAcc(b, fxIn16, "mask") } -func BenchmarkFloatingAccumulateOpOver16(b *testing.B) { benchAcc(b, flIn16, "over") } -func BenchmarkFloatingAccumulateOpSrc16(b *testing.B) { benchAcc(b, flIn16, "src") } -func BenchmarkFloatingAccumulateMask16(b *testing.B) { benchAcc(b, flIn16, "mask") } +func BenchmarkFixedAccumulateOpOver16(b *testing.B) { benchAcc(b, fxIn16, "over", false) } +func BenchmarkFixedAccumulateOpSrc16(b *testing.B) { benchAcc(b, fxIn16, "src", false) } +func BenchmarkFixedAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, fxIn16, "src", true) } +func BenchmarkFixedAccumulateMask16(b *testing.B) { benchAcc(b, fxIn16, "mask", false) } +func BenchmarkFloatingAccumulateOpOver16(b *testing.B) { benchAcc(b, flIn16, "over", false) } +func BenchmarkFloatingAccumulateOpSrc16(b *testing.B) { benchAcc(b, flIn16, "src", false) } +func BenchmarkFloatingAccumulateOpSrcSIMD16(b *testing.B) { benchAcc(b, flIn16, "src", true) } +func BenchmarkFloatingAccumulateMask16(b *testing.B) { benchAcc(b, flIn16, "mask", false) } -func BenchmarkFixedAccumulateOpOver64(b *testing.B) { benchAcc(b, fxIn64, "over") } -func BenchmarkFixedAccumulateOpSrc64(b *testing.B) { benchAcc(b, fxIn64, "src") } -func BenchmarkFixedAccumulateMask64(b *testing.B) { benchAcc(b, fxIn64, "mask") } -func BenchmarkFloatingAccumulateOpOver64(b *testing.B) { benchAcc(b, flIn64, "over") } -func BenchmarkFloatingAccumulateOpSrc64(b *testing.B) { benchAcc(b, flIn64, "src") } -func BenchmarkFloatingAccumulateMask64(b *testing.B) { benchAcc(b, flIn64, "mask") } +func BenchmarkFixedAccumulateOpOver64(b *testing.B) { benchAcc(b, fxIn64, "over", false) } +func BenchmarkFixedAccumulateOpSrc64(b *testing.B) { benchAcc(b, fxIn64, "src", false) } +func BenchmarkFixedAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, fxIn64, "src", 
true) } +func BenchmarkFixedAccumulateMask64(b *testing.B) { benchAcc(b, fxIn64, "mask", false) } +func BenchmarkFloatingAccumulateOpOver64(b *testing.B) { benchAcc(b, flIn64, "over", false) } +func BenchmarkFloatingAccumulateOpSrc64(b *testing.B) { benchAcc(b, flIn64, "src", false) } +func BenchmarkFloatingAccumulateOpSrcSIMD64(b *testing.B) { benchAcc(b, flIn64, "src", true) } +func BenchmarkFloatingAccumulateMask64(b *testing.B) { benchAcc(b, flIn64, "mask", false) } -func benchAcc(b *testing.B, in interface{}, op string) { +func benchAcc(b *testing.B, in interface{}, op string, simd bool) { var f func() switch in := in.(type) { case []uint32: + if simd && !haveFixedAccumulateSIMD { + b.Skip("No SIMD implemention") + } + switch op { case "over": dst := make([]uint8, len(in)) f = func() { fixedAccumulateOpOver(dst, in) } case "src": dst := make([]uint8, len(in)) - f = func() { fixedAccumulateOpSrc(dst, in) } + if simd { + f = func() { fixedAccumulateOpSrcSIMD(dst, in) } + } else { + f = func() { fixedAccumulateOpSrc(dst, in) } + } case "mask": buf := make([]uint32, len(in)) copy(buf, in) f = func() { fixedAccumulateMask(buf) } } + case []float32: + if simd && !haveFloatingAccumulateSIMD { + b.Skip("No SIMD implemention") + } + switch op { case "over": dst := make([]uint8, len(in)) f = func() { floatingAccumulateOpOver(dst, in) } case "src": dst := make([]uint8, len(in)) - f = func() { floatingAccumulateOpSrc(dst, in) } + if simd { + f = func() { floatingAccumulateOpSrcSIMD(dst, in) } + } else { + f = func() { floatingAccumulateOpSrc(dst, in) } + } case "mask": dst := make([]uint32, len(in)) f = func() { floatingAccumulateMask(dst, in) } diff --git a/vector/raster_fixed.go b/vector/raster_fixed.go index 5678bab..40c17aa 100644 --- a/vector/raster_fixed.go +++ b/vector/raster_fixed.go @@ -21,9 +21,9 @@ const ( // in the .s files). ϕ = 10 - one int1ϕ = 1 << ϕ - oneAndAHalf int1ϕ = 1<<ϕ + 1<<(ϕ-1) - oneMinusIota int1ϕ = 1<<ϕ - 1 // Used for rounding up. + fxOne int1ϕ = 1 << ϕ + fxOneAndAHalf int1ϕ = 1<<ϕ + 1<<(ϕ-1) + fxOneMinusIota int1ϕ = 1<<ϕ - 1 // Used for rounding up. 
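+	// With ϕ = 10, these work out to 1024, 1536 and 1023 respectively. Full
+	// coverage of a pixel accumulates to fxOne*fxOne = 1<<(2*ϕ) in int2ϕ
+	// units, which is why converting coverage to 8-bit alpha shifts right by
+	// 2*ϕ - 8 (the $0xc shifts in acc_amd64.s).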
) // int1ϕ is a signed fixed-point number with 1*ϕ binary digits after the fixed @@ -56,7 +56,7 @@ func fixedMin(x, y int1ϕ) int1ϕ { } func fixedFloor(x int1ϕ) int32 { return int32(x >> ϕ) } -func fixedCeil(x int1ϕ) int32 { return int32((x + oneMinusIota) >> ϕ) } +func fixedCeil(x int1ϕ) int32 { return int32((x + fxOneMinusIota) >> ϕ) } func (z *Rasterizer) fixedLineTo(b f32.Vec2) { a := z.pen @@ -74,10 +74,10 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) { } dxdy := (b[0] - a[0]) / (b[1] - a[1]) - ay := int1ϕ(a[1] * float32(one)) - by := int1ϕ(b[1] * float32(one)) + ay := int1ϕ(a[1] * float32(fxOne)) + by := int1ϕ(b[1] * float32(fxOne)) - x := int1ϕ(a[0] * float32(one)) + x := int1ϕ(a[0] * float32(fxOne)) y := fixedFloor(ay) yMax := fixedCeil(by) if yMax > int32(z.size.Y) { @@ -106,7 +106,7 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) { if x1i <= x0i+1 { xmf := (x+xNext)>>1 - x0Floor if i := clamp(x0i+0, width); i < uint(len(buf)) { - buf[i] += uint32(d * (one - xmf)) + buf[i] += uint32(d * (fxOne - xmf)) } if i := clamp(x0i+1, width); i < uint(len(buf)) { buf[i] += uint32(d * xmf) @@ -115,9 +115,9 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) { oneOverS := x1 - x0 twoOverS := 2 * oneOverS x0f := x0 - x0Floor - oneMinusX0f := one - x0f + oneMinusX0f := fxOne - x0f oneMinusX0fSquared := oneMinusX0f * oneMinusX0f - x1f := x1 - x1Ceil + one + x1f := x1 - x1Ceil + fxOne x1fSquared := x1f * x1f // These next two variables are unused, as rounding errors are @@ -139,7 +139,7 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) { if x1i == x0i+2 { if i := clamp(x0i+1, width); i < uint(len(buf)) { - // In ideal math: buf[i] += uint32(d * (one - a0 - am)) + // In ideal math: buf[i] += uint32(d * (fxOne - a0 - am)) D := twoOverS<<ϕ - oneMinusX0fSquared - x1fSquared D *= d D /= twoOverS @@ -148,14 +148,14 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) { } else { // This is commented out for the same reason as a0 and am. // - // a1 := ((oneAndAHalf - x0f) << ϕ) / oneOverS + // a1 := ((fxOneAndAHalf - x0f) << ϕ) / oneOverS if i := clamp(x0i+1, width); i < uint(len(buf)) { // In ideal math: buf[i] += uint32(d * (a1 - a0)) // // Convert to int64 to avoid overflow. Without that, // TestRasterizePolygon fails. - D := int64((oneAndAHalf-x0f)<<(ϕ+1) - oneMinusX0fSquared) + D := int64((fxOneAndAHalf-x0f)<<(ϕ+1) - oneMinusX0fSquared) D *= int64(d) D /= int64(twoOverS) buf[i] += uint32(D) @@ -172,12 +172,12 @@ func (z *Rasterizer) fixedLineTo(b f32.Vec2) { // a2 := a1 + (int1ϕ(x1i-x0i-3)<<(2*ϕ))/oneOverS if i := clamp(x1i-1, width); i < uint(len(buf)) { - // In ideal math: buf[i] += uint32(d * (one - a2 - am)) + // In ideal math: buf[i] += uint32(d * (fxOne - a2 - am)) // // Convert to int64 to avoid overflow. Without that, // TestRasterizePolygon fails. D := int64(twoOverS << ϕ) - D -= int64((oneAndAHalf - x0f) << (ϕ + 1)) + D -= int64((fxOneAndAHalf - x0f) << (ϕ + 1)) D -= int64((x1i - x0i - 3) << (2*ϕ + 1)) D -= int64(x1fSquared) D *= int64(d) @@ -220,6 +220,11 @@ func fixedAccumulateOpOver(dst []uint8, src []uint32) { } func fixedAccumulateOpSrc(dst []uint8, src []uint32) { + // Sanity check that len(dst) >= len(src). 
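+	// This mirrors the length check at the top of fixedAccumulateOpSrcSIMD
+	// in acc_amd64.s, so the SIMD and non-SIMD paths behave the same way
+	// when dst is too short.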
+ if len(dst) < len(src) { + return + } + acc := int2ϕ(0) for i, v := range src { acc += int2ϕ(v) diff --git a/vector/raster_floating.go b/vector/raster_floating.go index fa3e7b9..50621e1 100644 --- a/vector/raster_floating.go +++ b/vector/raster_floating.go @@ -165,6 +165,11 @@ func floatingAccumulateOpOver(dst []uint8, src []float32) { } func floatingAccumulateOpSrc(dst []uint8, src []float32) { + // Sanity check that len(dst) >= len(src). + if len(dst) < len(src) { + return + } + acc := float32(0) for i, v := range src { acc += v diff --git a/vector/vector.go b/vector/vector.go index 4813832..a9f832a 100644 --- a/vector/vector.go +++ b/vector/vector.go @@ -312,7 +312,6 @@ func (z *Rasterizer) accumulateMask() { } func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpOver(dst *image.Alpha, r image.Rectangle) { - // TODO: add SIMD implementations. // TODO: non-zero vs even-odd winding? if r == dst.Bounds() && r == z.Bounds() { // We bypass the z.accumulateMask step and convert straight from @@ -341,15 +340,22 @@ func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpOver(dst *image.Alpha, r image. } func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpSrc(dst *image.Alpha, r image.Rectangle) { - // TODO: add SIMD implementations. // TODO: non-zero vs even-odd winding? if r == dst.Bounds() && r == z.Bounds() { // We bypass the z.accumulateMask step and convert straight from // z.bufF32 or z.bufU32 to dst.Pix. if z.useFloatingPointMath { - floatingAccumulateOpSrc(dst.Pix, z.bufF32) + if haveFloatingAccumulateSIMD { + floatingAccumulateOpSrcSIMD(dst.Pix, z.bufF32) + } else { + floatingAccumulateOpSrc(dst.Pix, z.bufF32) + } } else { - fixedAccumulateOpSrc(dst.Pix, z.bufU32) + if haveFixedAccumulateSIMD { + fixedAccumulateOpSrcSIMD(dst.Pix, z.bufU32) + } else { + fixedAccumulateOpSrc(dst.Pix, z.bufU32) + } } return }
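The SIMD and portable accumulators can be compared with the benchmarks added in acc_test.go, for example:

	go test -run=NONE -bench=AccumulateOpSrc golang.org/x/image/vector

which reports the Fixed/Floating OpSrc and OpSrcSIMD variants side by side.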