diff --git a/vector/acc_amd64.s b/vector/acc_amd64.s index 7ef7aac..c74b2a9 100644 --- a/vector/acc_amd64.s +++ b/vector/acc_amd64.s @@ -1,6 +1,4 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// generated by go run gen.go; DO NOT EDIT // +build !appengine // +build gc @@ -12,8 +10,8 @@ DATA flAlmost256<>+0x00(SB)/8, $0x437fffff437fffff DATA flAlmost256<>+0x08(SB)/8, $0x437fffff437fffff -DATA flOnes<>+0x00(SB)/8, $0x3f8000003f800000 -DATA flOnes<>+0x08(SB)/8, $0x3f8000003f800000 +DATA flOne<>+0x00(SB)/8, $0x3f8000003f800000 +DATA flOne<>+0x08(SB)/8, $0x3f8000003f800000 DATA flSignMask<>+0x00(SB)/8, $0x7fffffff7fffffff DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff DATA shuffleMask<>+0x00(SB)/8, $0x0c0804000c080400 @@ -22,7 +20,7 @@ DATA fxAlmost256<>+0x00(SB)/8, $0x000000ff000000ff DATA fxAlmost256<>+0x08(SB)/8, $0x000000ff000000ff GLOBL flAlmost256<>(SB), (NOPTR+RODATA), $16 -GLOBL flOnes<>(SB), (NOPTR+RODATA), $16 +GLOBL flOne<>(SB), (NOPTR+RODATA), $16 GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16 GLOBL shuffleMask<>(SB), (NOPTR+RODATA), $16 GLOBL fxAlmost256<>(SB), (NOPTR+RODATA), $16 @@ -67,9 +65,10 @@ TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48 ANDQ $-4, CX // fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8. + MOVOU fxAlmost256<>(SB), X5 + // shuffleMask := XMM(0x0c080400 repeated four times) // PSHUFB shuffle mask. // offset := XMM(0x00000000 repeated four times) // Cumulative sum. - MOVOU fxAlmost256<>(SB), X5 MOVOU shuffleMask<>(SB), X6 XORPS X7, X7 @@ -117,7 +116,10 @@ fxAccOpSrcLoop4: BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 - // z = shuffleTheLowBytesOfEach4ByteElement(y) + // z = convertToInt32(y) + // No-op. + + // z = shuffleTheLowBytesOfEach4ByteElement(z) // copy(dst[:4], low4BytesOf(z)) PSHUFB X6, X2 MOVL X2, (DI) @@ -137,7 +139,7 @@ fxAccOpSrcLoop4: fxAccOpSrcLoop1: // for i < len(src) CMPQ AX, DX - JAE fxAccOpSrcEnd + JAE fxAccOpSrcCleanup // x = src[i] + offset MOVL (SI), X1 @@ -158,7 +160,10 @@ fxAccOpSrcLoop1: BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 - // dst[0] = uint8(y) + // z = convertToInt32(y) + // No-op. + + // dst[0] = uint8(z) MOVL X2, BX MOVB BX, (DI) @@ -173,6 +178,9 @@ fxAccOpSrcLoop1: ADDQ $4, SI JMP fxAccOpSrcLoop1 +fxAccOpSrcCleanup: + // No-op. + fxAccOpSrcEnd: RET @@ -187,7 +195,7 @@ fxAccOpSrcEnd: // xmm1 x // xmm2 y, z // xmm3 flAlmost256 -// xmm4 flOnes +// xmm4 flOne // xmm5 flSignMask // xmm6 shuffleMask // xmm7 offset @@ -214,13 +222,14 @@ TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48 LDMXCSR mxcsrNew-4(SP) // flAlmost256 := XMM(0x437fffff repeated four times) // 255.99998 as a float32. - // flOnes := XMM(0x3f800000 repeated four times) // 1 as a float32. + // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. + MOVOU flAlmost256<>(SB), X3 + MOVOU flOne<>(SB), X4 + MOVOU flSignMask<>(SB), X5 + // shuffleMask := XMM(0x0c080400 repeated four times) // PSHUFB shuffle mask. // offset := XMM(0x00000000 repeated four times) // Cumulative sum. - MOVOU flAlmost256<>(SB), X3 - MOVOU flOnes<>(SB), X4 - MOVOU flSignMask<>(SB), X5 MOVOU shuffleMask<>(SB), X6 XORPS X7, X7 @@ -254,19 +263,20 @@ flAccOpSrcLoop4: ADDPS X7, X1 // y = x & flSignMask - // y = min(y, flOnes) + // y = min(y, flOne) // y = mul(y, flAlmost256) MOVOU X5, X2 ANDPS X1, X2 MINPS X4, X2 MULPS X3, X2 - // z = float32ToInt32(y) + // z = convertToInt32(y) + CVTPS2PL X2, X2 + // z = shuffleTheLowBytesOfEach4ByteElement(z) // copy(dst[:4], low4BytesOf(z)) - CVTPS2PL X2, X2 - PSHUFB X6, X2 - MOVL X2, (DI) + PSHUFB X6, X2 + MOVL X2, (DI) // offset = XMM(x@3, x@3, x@3, x@3) MOVOU X1, X7 @@ -283,25 +293,26 @@ flAccOpSrcLoop4: flAccOpSrcLoop1: // for i < len(src) CMPQ AX, DX - JAE flAccOpSrcRestoreMXCSR + JAE flAccOpSrcCleanup // x = src[i] + offset MOVL (SI), X1 ADDPS X7, X1 // y = x & flSignMask - // y = min(y, flOnes) + // y = min(y, flOne) // y = mul(y, flAlmost256) MOVOU X5, X2 ANDPS X1, X2 MINPS X4, X2 MULPS X3, X2 - // z = float32ToInt32(y) - // dst[0] = uint8(z) + // z = convertToInt32(y) CVTPS2PL X2, X2 - MOVL X2, BX - MOVB BX, (DI) + + // dst[0] = uint8(z) + MOVL X2, BX + MOVB BX, (DI) // offset = x MOVOU X1, X7 @@ -314,7 +325,7 @@ flAccOpSrcLoop1: ADDQ $4, SI JMP flAccOpSrcLoop1 -flAccOpSrcRestoreMXCSR: +flAccOpSrcCleanup: LDMXCSR mxcsrOrig-8(SP) flAccOpSrcEnd: diff --git a/vector/gen.go b/vector/gen.go new file mode 100644 index 0000000..482e8f6 --- /dev/null +++ b/vector/gen.go @@ -0,0 +1,178 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +package main + +import ( + "bytes" + "io/ioutil" + "log" + "text/template" +) + +const ( + copyright = "" + + "// Copyright 2016 The Go Authors. All rights reserved.\n" + + "// Use of this source code is governed by a BSD-style\n" + + "// license that can be found in the LICENSE file.\n" + + doNotEdit = "// generated by go run gen.go; DO NOT EDIT\n" + + dashDashDash = "// --------" +) + +func main() { + tmpl, err := ioutil.ReadFile("gen_acc_amd64.s.tmpl") + if err != nil { + log.Fatalf("ReadFile: %v", err) + } + if !bytes.HasPrefix(tmpl, []byte(copyright)) { + log.Fatal("source template did not start with the copyright header") + } + tmpl = tmpl[len(copyright):] + + preamble := []byte(nil) + if i := bytes.Index(tmpl, []byte(dashDashDash)); i < 0 { + log.Fatalf("source template did not contain %q", dashDashDash) + } else { + preamble, tmpl = tmpl[:i], tmpl[i:] + } + + t, err := template.New("").Parse(string(tmpl)) + if err != nil { + log.Fatalf("Parse: %v", err) + } + + out := bytes.NewBuffer(nil) + out.WriteString(doNotEdit) + out.Write(preamble) + + for i, v := range instances { + if i != 0 { + out.WriteString("\n") + } + if err := t.Execute(out, v); err != nil { + log.Fatalf("Execute(%q): %v", v.ShortName, err) + } + } + + if err := ioutil.WriteFile("acc_amd64.s", out.Bytes(), 0666); err != nil { + log.Fatalf("WriteFile: %v", err) + } +} + +var instances = []struct { + LongName string + ShortName string + FrameSize string + SrcType string + XMM3 string + XMM4 string + XMM5 string + Setup string + Cleanup string + Add string + ClampAndScale string + ConvertToInt32 string +}{{ + LongName: "fixedAccumulateOpSrc", + ShortName: "fxAccOpSrc", + FrameSize: fxFrameSize, + SrcType: fxSrcType, + XMM3: fxXMM3, + XMM4: fxXMM4, + XMM5: fxXMM5, + Setup: fxSetup, + Cleanup: fxCleanup, + Add: fxAdd, + ClampAndScale: fxClampAndScale, + ConvertToInt32: fxConvertToInt32, +}, { + LongName: "floatingAccumulateOpSrc", + ShortName: "flAccOpSrc", + FrameSize: flFrameSize, + SrcType: flSrcType, + XMM3: flXMM3, + XMM4: flXMM4, + XMM5: flXMM5, + Setup: flSetup, + Cleanup: flCleanup, + Add: flAdd, + ClampAndScale: flClampAndScale, + ConvertToInt32: flConvertToInt32, +}} + +const ( + fxFrameSize = `0` + flFrameSize = `8` + + fxSrcType = `[]uint32` + flSrcType = `[]float32` + + fxXMM3 = `-` + flXMM3 = `flAlmost256` + + fxXMM4 = `-` + flXMM4 = `flOne` + + fxXMM5 = `fxAlmost256` + flXMM5 = `flSignMask` + + fxSetup = ` + // fxAlmost256 := XMM(0x000000ff repeated four times) // Maximum of an uint8. + MOVOU fxAlmost256<>(SB), X5 + ` + flSetup = ` + // Set MXCSR bits 13 and 14, so that the CVTPS2PL below is "Round To Zero". + STMXCSR mxcsrOrig-8(SP) + MOVL mxcsrOrig-8(SP), AX + ORL $0x6000, AX + MOVL AX, mxcsrNew-4(SP) + LDMXCSR mxcsrNew-4(SP) + + // flAlmost256 := XMM(0x437fffff repeated four times) // 255.99998 as a float32. + // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. + // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. + MOVOU flAlmost256<>(SB), X3 + MOVOU flOne<>(SB), X4 + MOVOU flSignMask<>(SB), X5 + ` + + fxCleanup = `// No-op.` + flCleanup = `LDMXCSR mxcsrOrig-8(SP)` + + fxAdd = `PADDD` + flAdd = `ADDPS` + + fxClampAndScale = ` + // y = abs(x) + // y >>= 12 // Shift by 2*ϕ - 8. + // y = min(y, fxAlmost256) + // + // pabsd %xmm1,%xmm2 + // psrld $0xc,%xmm2 + // pminud %xmm5,%xmm2 + // + // Hopefully we'll get these opcode mnemonics into the assembler for Go + // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but + // it's similar. + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 + BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x0c + BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + ` + flClampAndScale = ` + // y = x & flSignMask + // y = min(y, flOne) + // y = mul(y, flAlmost256) + MOVOU X5, X2 + ANDPS X1, X2 + MINPS X4, X2 + MULPS X3, X2 + ` + + fxConvertToInt32 = `// No-op.` + flConvertToInt32 = `CVTPS2PL X2, X2` +) diff --git a/vector/gen_acc_amd64.s.tmpl b/vector/gen_acc_amd64.s.tmpl new file mode 100644 index 0000000..854abe5 --- /dev/null +++ b/vector/gen_acc_amd64.s.tmpl @@ -0,0 +1,160 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !appengine +// +build gc +// +build !noasm + +#include "textflag.h" + +// fl is short for floating point math. fx is short for fixed point math. + +DATA flAlmost256<>+0x00(SB)/8, $0x437fffff437fffff +DATA flAlmost256<>+0x08(SB)/8, $0x437fffff437fffff +DATA flOne<>+0x00(SB)/8, $0x3f8000003f800000 +DATA flOne<>+0x08(SB)/8, $0x3f8000003f800000 +DATA flSignMask<>+0x00(SB)/8, $0x7fffffff7fffffff +DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff +DATA shuffleMask<>+0x00(SB)/8, $0x0c0804000c080400 +DATA shuffleMask<>+0x08(SB)/8, $0x0c0804000c080400 +DATA fxAlmost256<>+0x00(SB)/8, $0x000000ff000000ff +DATA fxAlmost256<>+0x08(SB)/8, $0x000000ff000000ff + +GLOBL flAlmost256<>(SB), (NOPTR+RODATA), $16 +GLOBL flOne<>(SB), (NOPTR+RODATA), $16 +GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16 +GLOBL shuffleMask<>(SB), (NOPTR+RODATA), $16 +GLOBL fxAlmost256<>(SB), (NOPTR+RODATA), $16 + +// func haveSSE4_1() bool +TEXT ·haveSSE4_1(SB), NOSPLIT, $0 + MOVQ $1, AX + CPUID + SHRQ $19, CX + ANDQ $1, CX + MOVB CX, ret+0(FP) + RET + +// ---------------------------------------------------------------------------- + +// func {{.LongName}}SIMD(dst []uint8, src {{.SrcType}}) +// +// XMM registers. Variable names are per +// https://github.com/google/font-rs/blob/master/src/accumulate.c +// +// xmm0 scratch +// xmm1 x +// xmm2 y, z +// xmm3 {{.XMM3}} +// xmm4 {{.XMM4}} +// xmm5 {{.XMM5}} +// xmm6 shuffleMask +// xmm7 offset +TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-48 + MOVQ dst_base+0(FP), DI + MOVQ dst_len+8(FP), BX + MOVQ src_base+24(FP), SI + MOVQ src_len+32(FP), CX + + // Sanity check that len(dst) >= len(src). + CMPQ BX, CX + JLT {{.ShortName}}End + + // CX = len(src) &^ 3 + // DX = len(src) + MOVQ CX, DX + ANDQ $-4, CX + + {{.Setup}} + + // shuffleMask := XMM(0x0c080400 repeated four times) // PSHUFB shuffle mask. + // offset := XMM(0x00000000 repeated four times) // Cumulative sum. + MOVOU shuffleMask<>(SB), X6 + XORPS X7, X7 + + // i := 0 + MOVQ $0, AX + +{{.ShortName}}Loop4: + // for i < (len(src) &^ 3) + CMPQ AX, CX + JAE {{.ShortName}}Loop1 + + // x = XMM(s0, s1, s2, s3) + // + // Where s0 is src[i+0], s1 is src[i+1], etc. + MOVOU (SI), X1 + + // scratch = XMM(0, s0, s1, s2) + // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) + MOVOU X1, X0 + PSLLO $4, X0 + {{.Add}} X0, X1 + + // scratch = XMM(0, 0, 0, 0) + // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) + // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) + XORPS X0, X0 + SHUFPS $0x40, X1, X0 + {{.Add}} X0, X1 + + // x += offset + {{.Add}} X7, X1 + + {{.ClampAndScale}} + + // z = convertToInt32(y) + {{.ConvertToInt32}} + + // z = shuffleTheLowBytesOfEach4ByteElement(z) + // copy(dst[:4], low4BytesOf(z)) + PSHUFB X6, X2 + MOVL X2, (DI) + + // offset = XMM(x@3, x@3, x@3, x@3) + MOVOU X1, X7 + SHUFPS $0xff, X1, X7 + + // i += 4 + // dst = dst[4:] + // src = src[4:] + ADDQ $4, AX + ADDQ $4, DI + ADDQ $16, SI + JMP {{.ShortName}}Loop4 + +{{.ShortName}}Loop1: + // for i < len(src) + CMPQ AX, DX + JAE {{.ShortName}}Cleanup + + // x = src[i] + offset + MOVL (SI), X1 + {{.Add}} X7, X1 + + {{.ClampAndScale}} + + // z = convertToInt32(y) + {{.ConvertToInt32}} + + // dst[0] = uint8(z) + MOVL X2, BX + MOVB BX, (DI) + + // offset = x + MOVOU X1, X7 + + // i += 1 + // dst = dst[1:] + // src = src[1:] + ADDQ $1, AX + ADDQ $1, DI + ADDQ $4, SI + JMP {{.ShortName}}Loop1 + +{{.ShortName}}Cleanup: + {{.Cleanup}} + +{{.ShortName}}End: + RET diff --git a/vector/vector.go b/vector/vector.go index a9f832a..aed62d2 100644 --- a/vector/vector.go +++ b/vector/vector.go @@ -2,6 +2,11 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:generate go run gen.go +//go:generate asmfmt -w acc_amd64.s + +// asmfmt is https://github.com/klauspost/asmfmt + // Package vector provides a rasterizer for 2-D vector graphics. package vector // import "golang.org/x/image/vector"