From 46e4eb730abb48e79f504fbfd0424c0bb54e26c0 Mon Sep 17 00:00:00 2001
From: Nigel Tao <nigeltao@golang.org>
Date: Sat, 10 Nov 2018 11:03:43 +1100
Subject: [PATCH] vector: use asm opcode mnemonics

There's no change in the binary output, just less mystery in the asm.

These mnemonics were introduced in Go 1.10:
https://golang.org/doc/go1.10#asm and https://golang.org/cl/75490

Current stable release (as of 2018-11-10) is Go 1.11, so per
https://golang.org/doc/devel/release.html#policy Go 1.9 and below are
no longer supported. Every supported release (Go 1.10 and later)
therefore has an assembler that accepts these mnemonics.

Change-Id: I1f9a63521bc8d5e8f8d395605f62bf7fb6a63bc5
Reviewed-on: https://go-review.googlesource.com/c/148997
Reviewed-by: Dmitri Shuralyov <dmitshur@golang.org>
---
 vector/acc_amd64.s | 106 +++++++++++----------------------------------
 vector/gen.go      |  25 +++--------
 2 files changed, 33 insertions(+), 98 deletions(-)

diff --git a/vector/acc_amd64.s b/vector/acc_amd64.s
index 69e0fc2..fc6e7f8 100644
--- a/vector/acc_amd64.s
+++ b/vector/acc_amd64.s
@@ -139,17 +139,9 @@ fxAccOpOverLoop4:
 	// y = abs(x)
 	// y >>= 2 // Shift by 2*ϕ - 16.
 	// y = min(y, fxAlmost65536)
-	//
-	// pabsd  %xmm1,%xmm2
-	// psrld  $0x2,%xmm2
-	// pminud %xmm5,%xmm2
-	//
-	// Hopefully we'll get these opcode mnemonics into the assembler for Go
-	// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
-	// it's similar.
-	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
-	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
-	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+	PABSD  X1, X2
+	PSRLL  $2, X2
+	PMINUD X5, X2
 
 	// z = convertToInt32(y)
 	// No-op.
@@ -182,13 +174,10 @@ fxAccOpOverLoop4:
 	PSRLQ $32, X11
 
 	// Multiply by magic, shift by magic.
-	//
-	// pmuludq %xmm10,%xmm0
-	// pmuludq %xmm10,%xmm11
-	BYTE  $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2
-	BYTE  $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda
-	PSRLQ $47, X0
-	PSRLQ $47, X11
+	PMULULQ X10, X0
+	PMULULQ X10, X11
+	PSRLQ   $47, X0
+	PSRLQ   $47, X11
 
 	// Merge the two registers back to one, X11, and add maskA.
 	PSLLQ $32, X11
@@ -223,17 +212,9 @@ fxAccOpOverLoop1:
 	// y = abs(x)
 	// y >>= 2 // Shift by 2*ϕ - 16.
 	// y = min(y, fxAlmost65536)
-	//
-	// pabsd  %xmm1,%xmm2
-	// psrld  $0x2,%xmm2
-	// pminud %xmm5,%xmm2
-	//
-	// Hopefully we'll get these opcode mnemonics into the assembler for Go
-	// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
-	// it's similar.
-	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
-	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
-	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+	PABSD  X1, X2
+	PSRLL  $2, X2
+	PMINUD X5, X2
 
 	// z = convertToInt32(y)
 	// No-op.
@@ -346,17 +327,9 @@ fxAccOpSrcLoop4:
 	// y = abs(x)
 	// y >>= 2 // Shift by 2*ϕ - 16.
 	// y = min(y, fxAlmost65536)
-	//
-	// pabsd  %xmm1,%xmm2
-	// psrld  $0x2,%xmm2
-	// pminud %xmm5,%xmm2
-	//
-	// Hopefully we'll get these opcode mnemonics into the assembler for Go
-	// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
-	// it's similar.
-	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
-	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
-	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+	PABSD  X1, X2
+	PSRLL  $2, X2
+	PMINUD X5, X2
 
 	// z = convertToInt32(y)
 	// No-op.
@@ -390,17 +363,9 @@ fxAccOpSrcLoop1:
 	// y = abs(x)
 	// y >>= 2 // Shift by 2*ϕ - 16.
 	// y = min(y, fxAlmost65536)
-	//
-	// pabsd  %xmm1,%xmm2
-	// psrld  $0x2,%xmm2
-	// pminud %xmm5,%xmm2
-	//
-	// Hopefully we'll get these opcode mnemonics into the assembler for Go
-	// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
-	// it's similar.
-	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
-	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
-	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+	PABSD  X1, X2
+	PSRLL  $2, X2
+	PMINUD X5, X2
 
 	// z = convertToInt32(y)
 	// No-op.
@@ -492,17 +457,9 @@ fxAccMaskLoop4:
 	// y = abs(x)
 	// y >>= 2 // Shift by 2*ϕ - 16.
 	// y = min(y, fxAlmost65536)
-	//
-	// pabsd  %xmm1,%xmm2
-	// psrld  $0x2,%xmm2
-	// pminud %xmm5,%xmm2
-	//
-	// Hopefully we'll get these opcode mnemonics into the assembler for Go
-	// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
-	// it's similar.
-	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
-	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
-	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+	PABSD  X1, X2
+	PSRLL  $2, X2
+	PMINUD X5, X2
 
 	// z = convertToInt32(y)
 	// No-op.
@@ -534,17 +491,9 @@ fxAccMaskLoop1:
 	// y = abs(x)
 	// y >>= 2 // Shift by 2*ϕ - 16.
 	// y = min(y, fxAlmost65536)
-	//
-	// pabsd  %xmm1,%xmm2
-	// psrld  $0x2,%xmm2
-	// pminud %xmm5,%xmm2
-	//
-	// Hopefully we'll get these opcode mnemonics into the assembler for Go
-	// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
-	// it's similar.
-	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
-	BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
-	BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+	PABSD  X1, X2
+	PSRLL  $2, X2
+	PMINUD X5, X2
 
 	// z = convertToInt32(y)
 	// No-op.
@@ -696,13 +645,10 @@ flAccOpOverLoop4:
 	PSRLQ $32, X11
 
 	// Multiply by magic, shift by magic.
-	//
-	// pmuludq %xmm10,%xmm0
-	// pmuludq %xmm10,%xmm11
-	BYTE  $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2
-	BYTE  $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda
-	PSRLQ $47, X0
-	PSRLQ $47, X11
+	PMULULQ X10, X0
+	PMULULQ X10, X11
+	PSRLQ   $47, X0
+	PSRLQ   $47, X11
 
 	// Merge the two registers back to one, X11, and add maskA.
 	PSLLQ $32, X11
diff --git a/vector/gen.go b/vector/gen.go
index 28b298b..2e71a51 100644
--- a/vector/gen.go
+++ b/vector/gen.go
@@ -296,17 +296,9 @@ const (
 		// y = abs(x)
 		// y >>= 2 // Shift by 2*ϕ - 16.
 		// y = min(y, fxAlmost65536)
-		//
-		// pabsd  %xmm1,%xmm2
-		// psrld  $0x2,%xmm2
-		// pminud %xmm5,%xmm2
-		//
-		// Hopefully we'll get these opcode mnemonics into the assembler for Go
-		// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
-		// it's similar.
-		BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
-		BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
-		BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+		PABSD  X1, X2
+		PSRLL  $2, X2
+		PMINUD X5, X2
 		`
 	flClampAndScale = `
 		// y = x & flSignMask
@@ -356,13 +348,10 @@ const (
 		MOVOU X0, X11
 		PSRLQ $32, X11
 		// Multiply by magic, shift by magic.
-		//
-		// pmuludq %xmm10,%xmm0
-		// pmuludq %xmm10,%xmm11
-		BYTE  $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2
-		BYTE  $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda
-		PSRLQ $47, X0
-		PSRLQ $47, X11
+		PMULULQ X10, X0
+		PMULULQ X10, X11
+		PSRLQ   $47, X0
+		PSRLQ   $47, X11
 		// Merge the two registers back to one, X11, and add maskA.
 		PSLLQ $32, X11
 		XORPS X0, X11