go/test/codegen/simd_arm64.go

// asmcheck

// Copyright 2026 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// These tests check ARM64 SIMD code generation and peephole optimizations.

//go:build goexperiment.simd && arm64

package codegen

import (
	"simd/archsimd"
)

//go:noinline
func forceSpill() {}

func spillAroundCall(a archsimd.Int8x16) archsimd.Int8x16 {
	forceSpill()
	// arm64:`FMOVQ` `FMOVQ`
	return a
}

var (
	sinkU8  archsimd.Uint8x16
	sinkI8  archsimd.Int8x16
	sinkU16 archsimd.Uint16x8
	sinkU32 archsimd.Uint32x4
	sinkU64 archsimd.Uint64x2
	sinkF32 archsimd.Float32x4
	sinkF64 archsimd.Float64x2
)

func broadcastConstImmFold(k int) {
	switch k {
	case 0:
		// arm64:`VMOVI [$]0,` -`VDUP`
		sinkU8 = archsimd.BroadcastUint8x16(0)
	case 1:
		// arm64:`VMOVI [$]1,` -`VDUP`
		sinkU8 = archsimd.BroadcastUint8x16(1)
	case 127:
		// arm64:`VMOVI [$]127,` -`VDUP`
		sinkI8 = archsimd.BroadcastInt8x16(127)
	case 128:
		// arm64:`VMOVI [$]128,` -`VDUP`
		sinkU8 = archsimd.BroadcastUint8x16(128)
	case -128:
		// arm64:`VMOVI [$]128,` -`VDUP`
		sinkI8 = archsimd.BroadcastInt8x16(-128)
	case 255:
		// arm64:`VMOVI [$]255,` -`VDUP`
		sinkU8 = archsimd.BroadcastUint8x16(255)
	case -1:
		// arm64:`VMOVI [$]255,` -`VDUP`
		sinkI8 = archsimd.BroadcastInt8x16(-1)
	case -2:
		// arm64:`VMOVI [$]254,` -`VDUP`
		sinkI8 = archsimd.BroadcastInt8x16(-2)
	default:
		// arm64:`VMOV R0, V\d+.B\[0\]` `VDUP`
		sinkI8 = archsimd.BroadcastInt8x16(int8(k))
	}
}

func shiftAllImmFold(k int) {
	switch k {
	case 100:
		// arm64:`VMOVI [$]100,` `VSSHL` -`VDUP`
		sinkI8 = sinkI8.ShiftAllLeft(100)
		// arm64:`VMOVI [$]156,` `VUSHL` -`VDUP`
		sinkU8 = sinkU8.ShiftAllRight(100)
	}
}

func setHiUint32(x, lo archsimd.Uint32x4) {
	// arm64:`VMOV V1.D\[0\], V0.D\[1\]`
	sinkU32 = loToHiUint32Vec(x, lo)
}

func setHiFloat64(x, lo archsimd.Float64x2) {
	// arm64:`VMOV V1.D\[0\], V0.D\[1\]`
	sinkF64 = x.SetElem(1, lo.GetElem(0))
}

func getHiFloat32(x archsimd.Float32x4) {
	// arm64:`VDUP V0.D\[1\],`
	sinkF32 = x.HiToLo()
}

func getHiFloat64(x archsimd.Float64x2) {
	// arm64:`VDUP V0.D\[1\],`
	sinkF64 = x.HiToLo()
}

func foldGetHiSetHiMuls(a, b archsimd.Uint16x8) archsimd.Uint16x8 {
	wLo := a.MulWidenLo(b)                     // arm64: `VUMULL V0.H4, V1.H4, V[0-9].S4`
	wHi := a.HiToLo().MulWidenLo(b.HiToLo())   // arm64: `VUMULL2 V1.H8, V0.H8, V[0-9].S4` -`VDUP`
	narrowLo := wLo.TruncToUint16()            // arm64: `VXTN V[0-9]+.S4, V0.H4`
	narrowHi := wHi.TruncToUint16()            // folded into next line
	return loToHiUint16Vec(narrowLo, narrowHi) // arm64: `VXTN2 V[0-9]+.S4, V0.H8`
}

func carrylessMultiplies(x, y archsimd.Uint64x2) archsimd.Uint64x2 {
	lo := x.CarrylessMultiplyEven(y)                   // arm64:`VPMULL V` -`VPMULL2`
	hi := x.HiToLo().CarrylessMultiplyEven(y.HiToLo()) // arm64:`VPMULL2 V` -`VPMULL `
	return lo.Xor(hi)
}

func mergeWithNotMask(x, y archsimd.Int8x16, mask archsimd.Mask8x16, f1, f2 archsimd.Float32x4) {
	// arm64:`VBIT` -`VBIF` -`VNOT`
	sinkI8 = x.IfElse(mask.Not(), y)
	// arm64: `VFCMEQ`
	eq := f1.Equal(f2)
	// The next line `ne` should be CSEd with `eq` above
	ne := f1.NotEqual(f2)    // arm64: -`.*`
	feq := f1.IfElse(eq, f2) // arm64:`VBIF`
	fne := f1.IfElse(ne, f2) // arm64:`VBIT`
	sinkF32 = fne.Add(feq)
}

// loToHiUint32Vec returns a vector with the lower 64 bits of x preserved and
// the upper 64 bits replaced with the lower 64 bits of lo.
// It routes through Float64x2 to stay in the FP/SIMD register file,
// avoiding a round-trip through a GP register.
func loToHiUint32Vec(x, lo archsimd.Uint32x4) archsimd.Uint32x4 {
	return x.ReshapeToUint64s().BitsToFloat64().SetElem(1, lo.ReshapeToUint64s().BitsToFloat64().GetElem(0)).ToBits().ReshapeToUint32s()
}

// loToHiUint16Vec returns a vector with the lower 64 bits of x preserved and
// the upper 64 bits replaced with the lower 64 bits of lo.
// It routes through Float64x2 to stay in the FP/SIMD register file,
// avoiding a round-trip through a GP register.
func loToHiUint16Vec(x, lo archsimd.Uint16x8) archsimd.Uint16x8 {
	return x.ReshapeToUint64s().BitsToFloat64().SetElem(1, lo.ReshapeToUint64s().BitsToFloat64().GetElem(0)).ToBits().ReshapeToUint16s()
}