From 46e4eb730abb48e79f504fbfd0424c0bb54e26c0 Mon Sep 17 00:00:00 2001 From: Nigel Tao Date: Sat, 10 Nov 2018 11:03:43 +1100 Subject: [PATCH] vector: use asm opcode mnemonics There's no change in the binary output, just less mystery in the asm. These mnemonics were introduced in Go 1.10: https://golang.org/doc/go1.10#asm and https://golang.org/cl/75490 Current stable release (as of 2018-11-10) is Go 1.11, and https://golang.org/doc/devel/release.html#policy says that Go 1.9 and below are therefore no longer supported. Change-Id: I1f9a63521bc8d5e8f8d395605f62bf7fb6a63bc5 Reviewed-on: https://go-review.googlesource.com/c/148997 Reviewed-by: Dmitri Shuralyov --- vector/acc_amd64.s | 106 +++++++++++---------------------------------- vector/gen.go | 25 +++-------- 2 files changed, 33 insertions(+), 98 deletions(-) diff --git a/vector/acc_amd64.s b/vector/acc_amd64.s index 69e0fc2..fc6e7f8 100644 --- a/vector/acc_amd64.s +++ b/vector/acc_amd64.s @@ -139,17 +139,9 @@ fxAccOpOverLoop4: // y = abs(x) // y >>= 2 // Shift by 2*ϕ - 16. // y = min(y, fxAlmost65536) - // - // pabsd %xmm1,%xmm2 - // psrld $0x2,%xmm2 - // pminud %xmm5,%xmm2 - // - // Hopefully we'll get these opcode mnemonics into the assembler for Go - // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but - // it's similar. - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + PABSD X1, X2 + PSRLL $2, X2 + PMINUD X5, X2 // z = convertToInt32(y) // No-op. @@ -182,13 +174,10 @@ fxAccOpOverLoop4: PSRLQ $32, X11 // Multiply by magic, shift by magic. - // - // pmuludq %xmm10,%xmm0 - // pmuludq %xmm10,%xmm11 - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda - PSRLQ $47, X0 - PSRLQ $47, X11 + PMULULQ X10, X0 + PMULULQ X10, X11 + PSRLQ $47, X0 + PSRLQ $47, X11 // Merge the two registers back to one, X11, and add maskA. PSLLQ $32, X11 @@ -223,17 +212,9 @@ fxAccOpOverLoop1: // y = abs(x) // y >>= 2 // Shift by 2*ϕ - 16. // y = min(y, fxAlmost65536) - // - // pabsd %xmm1,%xmm2 - // psrld $0x2,%xmm2 - // pminud %xmm5,%xmm2 - // - // Hopefully we'll get these opcode mnemonics into the assembler for Go - // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but - // it's similar. - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + PABSD X1, X2 + PSRLL $2, X2 + PMINUD X5, X2 // z = convertToInt32(y) // No-op. @@ -346,17 +327,9 @@ fxAccOpSrcLoop4: // y = abs(x) // y >>= 2 // Shift by 2*ϕ - 16. // y = min(y, fxAlmost65536) - // - // pabsd %xmm1,%xmm2 - // psrld $0x2,%xmm2 - // pminud %xmm5,%xmm2 - // - // Hopefully we'll get these opcode mnemonics into the assembler for Go - // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but - // it's similar. - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + PABSD X1, X2 + PSRLL $2, X2 + PMINUD X5, X2 // z = convertToInt32(y) // No-op. @@ -390,17 +363,9 @@ fxAccOpSrcLoop1: // y = abs(x) // y >>= 2 // Shift by 2*ϕ - 16. // y = min(y, fxAlmost65536) - // - // pabsd %xmm1,%xmm2 - // psrld $0x2,%xmm2 - // pminud %xmm5,%xmm2 - // - // Hopefully we'll get these opcode mnemonics into the assembler for Go - // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but - // it's similar. - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + PABSD X1, X2 + PSRLL $2, X2 + PMINUD X5, X2 // z = convertToInt32(y) // No-op. @@ -492,17 +457,9 @@ fxAccMaskLoop4: // y = abs(x) // y >>= 2 // Shift by 2*ϕ - 16. // y = min(y, fxAlmost65536) - // - // pabsd %xmm1,%xmm2 - // psrld $0x2,%xmm2 - // pminud %xmm5,%xmm2 - // - // Hopefully we'll get these opcode mnemonics into the assembler for Go - // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but - // it's similar. - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + PABSD X1, X2 + PSRLL $2, X2 + PMINUD X5, X2 // z = convertToInt32(y) // No-op. @@ -534,17 +491,9 @@ fxAccMaskLoop1: // y = abs(x) // y >>= 2 // Shift by 2*ϕ - 16. // y = min(y, fxAlmost65536) - // - // pabsd %xmm1,%xmm2 - // psrld $0x2,%xmm2 - // pminud %xmm5,%xmm2 - // - // Hopefully we'll get these opcode mnemonics into the assembler for Go - // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but - // it's similar. - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + PABSD X1, X2 + PSRLL $2, X2 + PMINUD X5, X2 // z = convertToInt32(y) // No-op. @@ -696,13 +645,10 @@ flAccOpOverLoop4: PSRLQ $32, X11 // Multiply by magic, shift by magic. - // - // pmuludq %xmm10,%xmm0 - // pmuludq %xmm10,%xmm11 - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda - PSRLQ $47, X0 - PSRLQ $47, X11 + PMULULQ X10, X0 + PMULULQ X10, X11 + PSRLQ $47, X0 + PSRLQ $47, X11 // Merge the two registers back to one, X11, and add maskA. PSLLQ $32, X11 diff --git a/vector/gen.go b/vector/gen.go index 28b298b..2e71a51 100644 --- a/vector/gen.go +++ b/vector/gen.go @@ -296,17 +296,9 @@ const ( // y = abs(x) // y >>= 2 // Shift by 2*ϕ - 16. // y = min(y, fxAlmost65536) - // - // pabsd %xmm1,%xmm2 - // psrld $0x2,%xmm2 - // pminud %xmm5,%xmm2 - // - // Hopefully we'll get these opcode mnemonics into the assembler for Go - // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but - // it's similar. - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 + PABSD X1, X2 + PSRLL $2, X2 + PMINUD X5, X2 ` flClampAndScale = ` // y = x & flSignMask @@ -356,13 +348,10 @@ const ( MOVOU X0, X11 PSRLQ $32, X11 // Multiply by magic, shift by magic. - // - // pmuludq %xmm10,%xmm0 - // pmuludq %xmm10,%xmm11 - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda - PSRLQ $47, X0 - PSRLQ $47, X11 + PMULULQ X10, X0 + PMULULQ X10, X11 + PSRLQ $47, X0 + PSRLQ $47, X11 // Merge the two registers back to one, X11, and add maskA. PSLLQ $32, X11 XORPS X0, X11