// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// fl is short for floating point math. fx is short for fixed point math.
//
// Each symbol below is 16 bytes (one XMM register) of read-only data. Except
// for the two PSHUFB masks, each one repeats a single 32-bit pattern in all
// four lanes.

// flAlmost256: every lane is 0x437fffff, the largest float32 that is
// strictly less than 256.
DATA flAlmost256<>+0x00(SB)/8, $0x437fffff437fffff
DATA flAlmost256<>+0x08(SB)/8, $0x437fffff437fffff

// flAlmost65536: every lane is 0x477fffff, the largest float32 that is
// strictly less than 65536.
DATA flAlmost65536<>+0x00(SB)/8, $0x477fffff477fffff
DATA flAlmost65536<>+0x08(SB)/8, $0x477fffff477fffff

// flOne: every lane is 0x3f800000, the float32 value 1.0.
DATA flOne<>+0x00(SB)/8, $0x3f8000003f800000
DATA flOne<>+0x08(SB)/8, $0x3f8000003f800000

// flSignMask: every lane is 0x7fffffff, i.e. all bits set except the float32
// sign bit.
// NOTE(review): presumably ANDed with a value to take its absolute value; the
// code that uses it is not visible in this file section - confirm.
DATA flSignMask<>+0x00(SB)/8, $0x7fffffff7fffffff
DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff

// scatterAndMulBy0x101 is a PSHUFB mask that brings the low four bytes of an
// XMM register to the low byte of that register's four uint32 values. It
// duplicates those bytes, effectively multiplying each uint32 by 0x101.
//
// It transforms a little-endian 16-byte XMM value from
//	ijkl????????????
// to
//	ii00jj00kk00ll00
//
// A mask byte of 0x80 (high bit set) makes PSHUFB write a zero byte; any
// other mask byte here is the index of the source byte to copy.
DATA scatterAndMulBy0x101<>+0x00(SB)/8, $0x8080010180800000
DATA scatterAndMulBy0x101<>+0x08(SB)/8, $0x8080030380800202

// gather is a PSHUFB mask that brings the low byte of the XMM register's four
// uint32 values to the low four bytes of that register.
//
// It transforms a little-endian 16-byte XMM value from
//	i???j???k???l???
// to
//	ijkl000000000000
//
// The mask selects source bytes 0, 4, 8 and 12; the remaining twelve mask
// bytes are 0x80, which zero the corresponding output bytes.
DATA gather<>+0x00(SB)/8, $0x808080800c080400
DATA gather<>+0x08(SB)/8, $0x8080808080808080

// fxAlmost256: every lane is the uint32 value 0xff (255).
DATA fxAlmost256<>+0x00(SB)/8, $0x000000ff000000ff
DATA fxAlmost256<>+0x08(SB)/8, $0x000000ff000000ff

// fxAlmost65536: every lane is the uint32 value 0xffff (65535).
DATA fxAlmost65536<>+0x00(SB)/8, $0x0000ffff0000ffff
DATA fxAlmost65536<>+0x08(SB)/8, $0x0000ffff0000ffff

// inverseFFFF: every lane is 0x80008001, which equals ceil(2^47 / 0xffff).
// NOTE(review): presumably a fixed-point reciprocal, so that a division by
// 0xffff can be done as a multiply followed by a right shift by 47; the code
// that uses it is not visible in this file section - confirm.
DATA inverseFFFF<>+0x00(SB)/8, $0x8000800180008001
DATA inverseFFFF<>+0x08(SB)/8, $0x8000800180008001

// Each symbol is file-local (the <> suffix), contains no pointers (NOPTR),
// is read-only (RODATA) and is 16 bytes long.
GLOBL flAlmost256<>(SB), (NOPTR+RODATA), $16
GLOBL flAlmost65536<>(SB), (NOPTR+RODATA), $16
GLOBL flOne<>(SB), (NOPTR+RODATA), $16
GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16
GLOBL scatterAndMulBy0x101<>(SB), (NOPTR+RODATA), $16
GLOBL gather<>(SB), (NOPTR+RODATA), $16
GLOBL fxAlmost256<>(SB), (NOPTR+RODATA), $16
GLOBL fxAlmost65536<>(SB), (NOPTR+RODATA), $16
GLOBL inverseFFFF<>(SB), (NOPTR+RODATA), $16

// func haveSSE4_1() bool
//
// haveSSE4_1 executes CPUID with AX = 1 and returns bit 19 of the resulting
// CX, which is the CPU's SSE4.1 feature flag. The result is written as a
// single byte (0 or 1) to the return slot.
TEXT ·haveSSE4_1(SB), NOSPLIT, $0
	MOVQ $1, AX        // select CPUID leaf 1 (feature information)
	CPUID              // overwrites AX, BX, CX and DX
	SHRQ $19, CX       // move feature bit 19 down to bit 0
	ANDQ $1, CX        // drop every other feature bit
	MOVB CX, ret+0(FP) // store the boolean result
	RET

// ----------------------------------------------------------------------------

// func {{.LongName}}SIMD({{.Args}})
//
// Walks src while keeping a running (cumulative) sum of its 4-byte elements,
// and stores one converted result to dst per element. The main loop handles
// four elements per iteration; a tail loop handles the remaining 0 to 3
// elements one at a time. The add instruction, the clamp/scale step, the
// int32 conversion and the stores are supplied by the template, so their
// exact instructions are not visible in this source.
//
// XMM registers. Variable names are per
// https://github.com/google/font-rs/blob/master/src/accumulate.c
//
//	xmm0	scratch
//	xmm1	x
//	xmm2	y, z
//	xmm3	{{.XMM3}}
//	xmm4	{{.XMM4}}
//	xmm5	{{.XMM5}}
//	xmm6	{{.XMM6}}
//	xmm7	offset
//	xmm8	{{.XMM8}}
//	xmm9	{{.XMM9}}
//	xmm10	{{.XMM10}}
//
// General purpose registers, as used by the visible code:
//
//	R9	i, the number of elements processed so far
//	R10	len(src) &^ 3, the bound of the four-at-a-time loop
//	R11	len(src), the bound of the one-at-a-time loop
//	SI	pointer to the next src element (advanced 4 bytes per element)
//	DI	pointer to the next dst element
//
// NOTE(review): this assumes the argument-loading fragment below leaves the
// dst pointer in DI, the src pointer in SI and len(src) in R10 - confirm
// against the generator.
TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-{{.ArgsSize}}
	{{.LoadArgs}}

	// R10 = len(src) &^ 3
	// R11 = len(src)
	//
	// The copy to R11 must come first: the AND rounds R10 down to a multiple
	// of 4 in place.
	MOVQ R10, R11
	ANDQ $-4, R10

	{{.Setup}}

	{{.LoadXMMRegs}}

	// offset := XMM(0x00000000 repeated four times) // Cumulative sum.
	XORPS X7, X7

	// i := 0
	MOVQ $0, R9

{{.ShortName}}Loop4:
	// for i < (len(src) &^ 3)
	//
	// JAE is an unsigned comparison: leave this loop once i >= R10.
	CMPQ R9, R10
	JAE  {{.ShortName}}Loop1

	// x = XMM(s0, s1, s2, s3)
	//
	// Where s0 is src[i+0], s1 is src[i+1], etc.
	//
	// MOVOU is an unaligned 16-byte load.
	MOVOU (SI), X1

	// scratch = XMM(0, s0, s1, s2)
	// x += scratch                                  // yields x == XMM(s0, s0+s1, s1+s2, s2+s3)
	//
	// PSLLO shifts the whole 128-bit register left by 4 bytes, moving every
	// lane up by one position and filling lane 0 with zero.
	MOVOU X1, X0
	PSLLO $4, X0
	{{.Add}} X0, X1

	// scratch = XMM(0, 0, 0, 0)
	// scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1)
	// x += scratch                                  // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3)
	//
	// The SHUFPS selector 0x40 is 01 00 00 00 in two-bit fields, highest lane
	// first: lanes 0 and 1 take scratch lane 0, lane 2 takes x lane 0 and
	// lane 3 takes x lane 1.
	XORPS  X0, X0
	SHUFPS $0x40, X1, X0
	{{.Add}} X0, X1

	// x += offset
	//
	// Adds the running total carried over from the previous elements to all
	// four prefix sums.
	{{.Add}} X7, X1

	{{.ClampAndScale}}

	{{.ConvertToInt32}}

	{{.Store4}}

	// offset = XMM(x@3, x@3, x@3, x@3)
	//
	// The SHUFPS selector 0xff picks lane 3 for every output lane, so the
	// last prefix sum becomes the carry for the next iteration.
	// NOTE(review): this relies on the template-supplied steps above leaving
	// x (X1) unmodified - confirm against the generator.
	MOVOU  X1, X7
	SHUFPS $0xff, X1, X7

	// i += 4
	// dst = dst[4:]
	// src = src[4:]
	//
	// Four src elements occupy 16 bytes; the size of four dst elements is
	// supplied by the template.
	ADDQ $4, R9
	ADDQ ${{.DstElemSize4}}, DI
	ADDQ $16, SI
	JMP  {{.ShortName}}Loop4

{{.ShortName}}Loop1:
	// for i < len(src)
	//
	// Tail loop for the elements left over by the four-at-a-time loop.
	CMPQ R9, R11
	JAE  {{.ShortName}}End

	// x = src[i] + offset
	//
	// MOVL loads a single 4-byte element; only the low lane of x carries
	// src[i] here.
	MOVL (SI), X1
	{{.Add}} X7, X1

	{{.ClampAndScale}}

	{{.ConvertToInt32}}

	{{.Store1}}

	// offset = x
	MOVOU X1, X7

	// i += 1
	// dst = dst[1:]
	// src = src[1:]
	//
	// One src element occupies 4 bytes; the size of one dst element is
	// supplied by the template.
	ADDQ $1, R9
	ADDQ ${{.DstElemSize1}}, DI
	ADDQ $4, SI
	JMP  {{.ShortName}}Loop1

{{.ShortName}}End:
	RET