cmd/compile: optimize multiplication rules on loong64

Improve multiplication strength reduction, refer to CL 626998, add additional 3 linear combination instructions for loong64. goos: linux goarch: loong64 pkg: cmd/compile/internal/test cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | MulconstI32/3 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstI32/5 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstI32/12 1.601n ± 0% 1.201n ± 0% -24.98% (p=0.000 n=10) MulconstI32/120 1.6010n ± 0% 0.8130n ± 0% -49.22% (p=0.000 n=10) MulconstI32/-120 1.6010n ± 0% 0.8109n ± 0% -49.35% (p=0.000 n=10) MulconstI32/65537 1.6275n ± 0% 0.8005n ± 0% -50.81% (p=0.000 n=10) MulconstI32/65538 1.6290n ± 0% 0.8004n ± 0% -50.87% (p=0.000 n=10) MulconstI64/3 1.6010n ± 0% 0.8004n ± 0% -50.01% (p=0.000 n=10) MulconstI64/5 1.6010n ± 0% 0.8004n ± 0% -50.01% (p=0.000 n=10) MulconstI64/12 1.601n ± 0% 1.201n ± 0% -24.98% (p=0.000 n=10) MulconstI64/120 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstI64/-120 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstI64/65537 1.6270n ± 0% 0.8005n ± 0% -50.80% (p=0.000 n=10) MulconstI64/65538 1.6290n ± 0% 0.8071n ± 1% -50.45% (p=0.000 n=10) MulconstU32/3 1.6010n ± 0% 0.8004n ± 0% -50.01% (p=0.000 n=10) MulconstU32/5 1.6010n ± 0% 0.8004n ± 0% -50.01% (p=0.000 n=10) MulconstU32/12 1.601n ± 0% 1.201n ± 0% -24.98% (p=0.000 n=10) MulconstU32/120 1.6010n ± 0% 0.8066n ± 0% -49.62% (p=0.000 n=10) MulconstU32/65537 1.6290n ± 0% 0.8005n ± 0% -50.86% (p=0.000 n=10) MulconstU32/65538 1.6280n ± 0% 0.8005n ± 0% -50.83% (p=0.000 n=10) MulconstU64/3 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstU64/5 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstU64/12 1.601n ± 0% 1.201n ± 0% -24.98% (p=0.000 n=10) MulconstU64/120 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstU64/65537 1.6290n ± 0% 0.8005n ± 0% -50.86% (p=0.000 n=10) MulconstU64/65538 1.6300n ± 0% 0.8067n ± 0% -50.51% (p=0.000 n=10) geomean 1.609n 0.8537n -46.95% goos: linux 
goarch: loong64 pkg: cmd/compile/internal/test cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | MulconstI32/3 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstI32/5 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstI32/12 1.601n ± 0% 1.202n ± 0% -24.92% (p=0.000 n=10) MulconstI32/120 1.6020n ± 0% 0.8012n ± 0% -49.99% (p=0.000 n=10) MulconstI32/-120 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstI32/65537 1.6020n ± 0% 0.8007n ± 0% -50.02% (p=0.000 n=10) MulconstI32/65538 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstI64/3 1.6015n ± 0% 0.8007n ± 0% -50.00% (p=0.000 n=10) MulconstI64/5 1.6020n ± 0% 0.8007n ± 0% -50.02% (p=0.000 n=10) MulconstI64/12 1.602n ± 0% 1.202n ± 0% -25.00% (p=0.000 n=10) MulconstI64/120 1.6030n ± 0% 0.8011n ± 0% -50.02% (p=0.000 n=10) MulconstI64/-120 1.6020n ± 0% 0.8007n ± 0% -50.02% (p=0.000 n=10) MulconstI64/65537 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstI64/65538 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstU32/3 1.6010n ± 0% 0.8006n ± 0% -49.99% (p=0.000 n=10) MulconstU32/5 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstU32/12 1.601n ± 0% 1.202n ± 0% -24.92% (p=0.000 n=10) MulconstU32/120 1.6010n ± 0% 0.8006n ± 0% -49.99% (p=0.000 n=10) MulconstU32/65537 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstU32/65538 1.6020n ± 0% 0.8009n ± 0% -50.01% (p=0.000 n=10) MulconstU64/3 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstU64/5 1.6010n ± 0% 0.8007n ± 0% -49.98% (p=0.000 n=10) MulconstU64/12 1.601n ± 0% 1.201n ± 0% -24.98% (p=0.000 n=10) MulconstU64/120 1.6020n ± 0% 0.8007n ± 0% -50.02% (p=0.000 n=10) MulconstU64/65537 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstU64/65538 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) geomean 1.601n 0.8523n -46.77% Change-Id: I9fb0e47ca57875da171a347bf4828adfab41b875 Reviewed-on: https://go-review.googlesource.com/c/go/+/675455 Reviewed-by: Mark Freeman 
<mark@golang.org> Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Keith Randall <khr@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Keith Randall <khr@golang.org>
2025-12-08 06:10:04 +00:00 · 2025-05-22 16:21:10 +08:00 · 2025-05-22 16:21:10 +08:00 · e071617222
commit e071617222
parent eb7f515c4d
7 changed files with 190 additions and 25 deletions
--- a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules
@ -750,10 +750,10 @@
 (SRLVconst [rc] (MOVBUreg x)) && rc >= 8 => (MOVVconst [0])

 // mul by constant
-(MULV x (MOVVconst [-1])) => (NEGV x)
 (MULV _ (MOVVconst [0])) => (MOVVconst [0])
 (MULV x (MOVVconst [1])) => x
-(MULV x (MOVVconst [c])) && isPowerOfTwo(c) => (SLLVconst [log64(c)] x)
+
+(MULV  x (MOVVconst [c])) && canMulStrengthReduce(config, c) => {mulStrengthReduce(v, x, c)}

 // div by constant
 (DIVVU x (MOVVconst [1])) => x
--- a/src/cmd/compile/internal/ssa/_gen/LOONG64latelower.rules
+++ b/src/cmd/compile/internal/ssa/_gen/LOONG64latelower.rules
@ -0,0 +1,6 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Prefer addition when shifting left by one.
+(SLLVconst [1] x) => (ADDV x x)
--- a/src/cmd/compile/internal/ssa/config.go
+++ b/src/cmd/compile/internal/ssa/config.go
@ -283,6 +283,8 @@ func NewConfig(arch string, types Types, ctxt *obj.Link, optimize, softfloat boo
 		c.RegSize = 8
 		c.lowerBlock = rewriteBlockLOONG64
 		c.lowerValue = rewriteValueLOONG64
+		c.lateLowerBlock = rewriteBlockLOONG64latelower
+		c.lateLowerValue = rewriteValueLOONG64latelower
 		c.registers = registersLOONG64[:]
 		c.gpRegMask = gpRegMaskLOONG64
 		c.fpRegMask = fpRegMaskLOONG64
@ -562,6 +564,43 @@ func (c *Config) buildRecipes(arch string) {
 					return m.Block.NewValue2I(m.Pos, OpARM64SUBshiftLL, m.Type, int64(i), x, y)
 				})
 		}
+	case "loong64":
+		// - multiply is 4 cycles.
+		// - add/sub/shift are 1 cycle.
+		// On loong64, using a multiply also needs to load the constant into a register.
+		// TODO: figure out a happy medium.
+		mulCost = 45
+
+		// add
+		r(1, 1, 10,
+			func(m, x, y *Value) *Value {
+				return m.Block.NewValue2(m.Pos, OpLOONG64ADDV, m.Type, x, y)
+			})
+		// neg
+		r(-1, 0, 10,
+			func(m, x, y *Value) *Value {
+				return m.Block.NewValue1(m.Pos, OpLOONG64NEGV, m.Type, x)
+			})
+		// sub
+		r(1, -1, 10,
+			func(m, x, y *Value) *Value {
+				return m.Block.NewValue2(m.Pos, OpLOONG64SUBV, m.Type, x, y)
+			})
+
+		// regular shifts
+		for i := 1; i < 64; i++ {
+			c := 10
+			if i == 1 {
+				// Prefer x<<1 over x+x.
+				// Note that we eventually reverse this decision in LOONG64latelower.rules,
+				// but this makes shift combining rules in LOONG64.rules simpler.
+				c--
+			}
+			r(1<<i, 0, c,
+				func(m, x, y *Value) *Value {
+					return m.Block.NewValue1I(m.Pos, OpLOONG64SLLVconst, m.Type, int64(i), x)
+				})
+		}
 	}

 	c.mulRecipes = map[int64]mulRecipe{}
@ -628,17 +667,58 @@ func (c *Config) buildRecipes(arch string) {
 		}
 	}

+	// Currently, three-instruction linear combination recipes are only generated for loong64.
+	if arch == "loong64" {
+		// Three-instruction recipes.
+		// D: The first and the second are both single-instruction recipes applied to the original value, and their results are the third's inputs.
+		// E: The first single-instruction recipe's result is the second's input, and the second's result is the third's input.
+
+		// D
+		for _, first := range linearCombos {
+			for _, second := range linearCombos {
+				for _, third := range linearCombos {
+					x := third.a*(first.a+first.b) + third.b*(second.a+second.b)
+					cost := first.cost + second.cost + third.cost
+					old := c.mulRecipes[x]
+					if (old.build == nil || cost < old.cost) && cost < mulCost {
+						c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
+							v1 := first.build(m, v, v)
+							v2 := second.build(m, v, v)
+							return third.build(m, v1, v2)
+						}}
+					}
+				}
+			}
+		}
+
+		// E
+		for _, first := range linearCombos {
+			for _, second := range linearCombos {
+				for _, third := range linearCombos {
+					x := third.a*(second.a*(first.a+first.b)+second.b) + third.b
+					cost := first.cost + second.cost + third.cost
+					old := c.mulRecipes[x]
+					if (old.build == nil || cost < old.cost) && cost < mulCost {
+						c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
+							v1 := first.build(m, v, v)
+							v2 := second.build(m, v1, v)
+							return third.build(m, v2, v)
+						}}
+					}
+				}
+			}
+		}
+	}
+
 	// These cases should be handled specially by rewrite rules.
 	// (Otherwise v * 1 == (neg (neg v)))
 	delete(c.mulRecipes, 0)
 	delete(c.mulRecipes, 1)

-	// Currently we assume that it doesn't help to do 3 linear
-	// combination instructions.
-
 	// Currently:
 	// len(c.mulRecipes) == 5984 on arm64
 	//                       680 on amd64
+	//                      5984 on loong64
 	// This function takes ~2.5ms on arm64.
 	//println(len(c.mulRecipes))
 }
--- a/src/cmd/compile/internal/ssa/rewriteLOONG64.go
+++ b/src/cmd/compile/internal/ssa/rewriteLOONG64.go
@ -5537,20 +5537,8 @@ func rewriteValueLOONG64_OpLOONG64MOVWstorezeroidx(v *Value) bool {
 func rewriteValueLOONG64_OpLOONG64MULV(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
-	// match: (MULV x (MOVVconst [-1]))
-	// result: (NEGV x)
-	for {
-		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
-			x := v_0
-			if v_1.Op != OpLOONG64MOVVconst || auxIntToInt64(v_1.AuxInt) != -1 {
-				continue
-			}
-			v.reset(OpLOONG64NEGV)
-			v.AddArg(x)
-			return true
-		}
-		break
-	}
+	b := v.Block
+	config := b.Func.Config
 	// match: (MULV _ (MOVVconst [0]))
 	// result: (MOVVconst [0])
 	for {
@ -5578,8 +5566,8 @@ func rewriteValueLOONG64_OpLOONG64MULV(v *Value) bool {
 		break
 	}
 	// match: (MULV x (MOVVconst [c]))
-	// cond: isPowerOfTwo(c)
-	// result: (SLLVconst [log64(c)] x)
+	// cond: canMulStrengthReduce(config, c)
+	// result: {mulStrengthReduce(v, x, c)}
 	for {
 		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
 			x := v_0
@ -5587,12 +5575,10 @@ func rewriteValueLOONG64_OpLOONG64MULV(v *Value) bool {
 				continue
 			}
 			c := auxIntToInt64(v_1.AuxInt)
-			if !(isPowerOfTwo(c)) {
+			if !(canMulStrengthReduce(config, c)) {
 				continue
 			}
-			v.reset(OpLOONG64SLLVconst)
-			v.AuxInt = int64ToAuxInt(log64(c))
-			v.AddArg(x)
+			v.copyOf(mulStrengthReduce(v, x, c))
 			return true
 		}
 		break
--- a/src/cmd/compile/internal/ssa/rewriteLOONG64latelower.go
+++ b/src/cmd/compile/internal/ssa/rewriteLOONG64latelower.go
@ -0,0 +1,29 @@
+// Code generated from _gen/LOONG64latelower.rules using 'go generate'; DO NOT EDIT.
+
+package ssa
+
+func rewriteValueLOONG64latelower(v *Value) bool {
+	switch v.Op {
+	case OpLOONG64SLLVconst:
+		return rewriteValueLOONG64latelower_OpLOONG64SLLVconst(v)
+	}
+	return false
+}
+func rewriteValueLOONG64latelower_OpLOONG64SLLVconst(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (SLLVconst [1] x)
+	// result: (ADDV x x)
+	for {
+		if auxIntToInt64(v.AuxInt) != 1 {
+			break
+		}
+		x := v_0
+		v.reset(OpLOONG64ADDV)
+		v.AddArg2(x, x)
+		return true
+	}
+	return false
+}
+func rewriteBlockLOONG64latelower(b *Block) bool {
+	return false
+}
--- a/test/codegen/arithmetic.go
+++ b/test/codegen/arithmetic.go
@ -228,6 +228,7 @@ func Pow2Muls(n1, n2 int) (int, int) {
 	// 386:"SHLL\t[$]5",-"IMULL"
 	// arm:"SLL\t[$]5",-"MUL"
 	// arm64:"LSL\t[$]5",-"MUL"
+	// loong64:"SLLV\t[$]5",-"MULV"
 	// ppc64x:"SLD\t[$]5",-"MUL"
 	a := n1 * 32

@ -235,6 +236,7 @@ func Pow2Muls(n1, n2 int) (int, int) {
 	// 386:"SHLL\t[$]6",-"IMULL"
 	// arm:"SLL\t[$]6",-"MUL"
 	// arm64:`NEG\sR[0-9]+<<6,\sR[0-9]+`,-`LSL`,-`MUL`
+	// loong64:"SLLV\t[$]6",-"MULV"
 	// ppc64x:"SLD\t[$]6","NEG\\sR[0-9]+,\\sR[0-9]+",-"MUL"
 	b := -64 * n2

@ -255,11 +257,13 @@ func Mul_96(n int) int {
 	// 386:`SHLL\t[$]5`,`LEAL\t\(.*\)\(.*\*2\),`,-`IMULL`
 	// arm64:`LSL\t[$]5`,`ADD\sR[0-9]+<<1,\sR[0-9]+`,-`MUL`
 	// arm:`SLL\t[$]5`,`ADD\sR[0-9]+<<1,\sR[0-9]+`,-`MUL`
+	// loong64:"ADDVU","SLLV\t[$]5",-"MULV"
 	// s390x:`SLD\t[$]5`,`SLD\t[$]6`,-`MULLD`
 	return n * 96
 }

 func Mul_n120(n int) int {
+	// loong64:"SLLV\t[$]3","SLLV\t[$]7","SUBVU",-"MULV"
 	// s390x:`SLD\t[$]3`,`SLD\t[$]7`,-`MULLD`
 	return n * -120
 }
--- a/test/codegen/multiply.go
+++ b/test/codegen/multiply.go
@ -12,301 +12,361 @@ package codegen
 func m0(x int64) int64 {
 	// amd64: "XORL"
 	// arm64: "MOVD\tZR"
+	// loong64: "MOVV\t[$]0"
 	return x * 0
 }
 func m2(x int64) int64 {
 	// amd64: "ADDQ"
 	// arm64: "ADD"
+	// loong64: "ADDVU"
 	return x * 2
 }
 func m3(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]2"
 	// arm64: "ADD\tR[0-9]+<<1,"
+	// loong64: "ADDVU","ADDVU"
 	return x * 3
 }
 func m4(x int64) int64 {
 	// amd64: "SHLQ\t[$]2,"
 	// arm64: "LSL\t[$]2,"
+	// loong64: "SLLV\t[$]2,"
 	return x * 4
 }
 func m5(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]4"
 	// arm64: "ADD\tR[0-9]+<<2,"
+	// loong64: "SLLV\t[$]2,","ADDVU"
 	return x * 5
 }
 func m6(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]1", "LEAQ\t.*[*]2"
 	// arm64: "ADD\tR[0-9]+,", "ADD\tR[0-9]+<<1,"
+	// loong64: "ADDVU","ADDVU","ADDVU"
 	return x * 6
 }
 func m7(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]2"
 	// arm64: "LSL\t[$]3,", "SUB\tR[0-9]+,"
+	// loong64: "SLLV\t[$]3,","SUBVU"
 	return x * 7
 }
 func m8(x int64) int64 {
 	// amd64: "SHLQ\t[$]3,"
 	// arm64: "LSL\t[$]3,"
+	// loong64: "SLLV\t[$]3,"
 	return x * 8
 }
 func m9(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]8"
 	// arm64: "ADD\tR[0-9]+<<3,"
+	// loong64: "SLLV\t[$]3,","ADDVU"
 	return x * 9
 }
 func m10(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]1", "LEAQ\t.*[*]4"
 	// arm64: "ADD\tR[0-9]+,", "ADD\tR[0-9]+<<2,"
+	// loong64: "ADDVU","SLLV\t[$]3,","ADDVU"
 	return x * 10
 }
 func m11(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]4", "LEAQ\t.*[*]2"
 	// arm64: "MOVD\t[$]11,", "MUL"
+	// loong64: "MOVV\t[$]11,", "MULV"
 	return x * 11
 }
 func m12(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]2", "SHLQ\t[$]2,"
 	// arm64: "LSL\t[$]2,", "ADD\tR[0-9]+<<1,"
+	// loong64: "ADDVU","ADDVU","SLLV\t[$]2,"
 	return x * 12
 }
 func m13(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]2", "LEAQ\t.*[*]4"
 	// arm64: "MOVD\t[$]13,", "MUL"
+	// loong64: "MOVV\t[$]13,","MULV"
 	return x * 13
 }
 func m14(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]14,"
 	// arm64: "LSL\t[$]4,", "SUB\tR[0-9]+<<1,"
+	// loong64: "ADDVU","SLLV\t[$]4,","SUBVU"
 	return x * 14
 }
 func m15(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]2", "LEAQ\t.*[*]4"
 	// arm64: "LSL\t[$]4,", "SUB\tR[0-9]+,"
+	// loong64: "SLLV\t[$]4,","SUBVU"
 	return x * 15
 }
 func m16(x int64) int64 {
 	// amd64: "SHLQ\t[$]4,"
 	// arm64: "LSL\t[$]4,"
+	// loong64: "SLLV\t[$]4,"
 	return x * 16
 }
 func m17(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]1", "LEAQ\t.*[*]8"
 	// arm64: "ADD\tR[0-9]+<<4,"
+	// loong64: "SLLV\t[$]4,","ADDVU"
 	return x * 17
 }
 func m18(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]1", "LEAQ\t.*[*]8"
 	// arm64: "ADD\tR[0-9]+,", "ADD\tR[0-9]+<<3,"
+	// loong64: "ADDVU","SLLV\t[$]4,","ADDVU"
 	return x * 18
 }
 func m19(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]8", "LEAQ\t.*[*]2"
 	// arm64: "MOVD\t[$]19,", "MUL"
+	// loong64: "MOVV\t[$]19,","MULV"
 	return x * 19
 }
 func m20(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]4", "SHLQ\t[$]2,"
 	// arm64: "LSL\t[$]2,", "ADD\tR[0-9]+<<2,"
+	// loong64: "SLLV\t[$]2,","SLLV\t[$]4,","ADDVU"
 	return x * 20
 }
 func m21(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]4", "LEAQ\t.*[*]4"
 	// arm64: "MOVD\t[$]21,", "MUL"
+	// loong64: "MOVV\t[$]21,","MULV"
 	return x * 21
 }
 func m22(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]22,"
 	// arm64: "MOVD\t[$]22,", "MUL"
+	// loong64: "MOVV\t[$]22,","MULV"
 	return x * 22
 }
 func m23(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]23,"
 	// arm64: "MOVD\t[$]23,", "MUL"
+	// loong64: "MOVV\t[$]23,","MULV"
 	return x * 23
 }
 func m24(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]2", "SHLQ\t[$]3,"
 	// arm64: "LSL\t[$]3,", "ADD\tR[0-9]+<<1,"
+	// loong64: "ADDVU","ADDVU","SLLV\t[$]3,"
 	return x * 24
 }
 func m25(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]4", "LEAQ\t.*[*]4"
 	// arm64: "MOVD\t[$]25,", "MUL"
+	// loong64: "MOVV\t[$]25,","MULV"
 	return x * 25
 }
 func m26(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]26,"
 	// arm64: "MOVD\t[$]26,", "MUL"
+	// loong64: "MOVV\t[$]26,","MULV"
 	return x * 26
 }
 func m27(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]2", "LEAQ\t.*[*]8"
 	// arm64: "MOVD\t[$]27,", "MUL"
+	// loong64: "MOVV\t[$]27,","MULV"
 	return x * 27
 }
 func m28(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]28,"
 	// arm64: "LSL\t[$]5,", "SUB\tR[0-9]+<<2,"
+	// loong64: "SLLV\t[$]5,","SLLV\t[$]2,","SUBVU"
 	return x * 28
 }
 func m29(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]29,"
 	// arm64: "MOVD\t[$]29,", "MUL"
+	// loong64: "MOVV\t[$]29,","MULV"
 	return x * 29
 }
 func m30(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]30,"
 	// arm64: "LSL\t[$]5,", "SUB\tR[0-9]+<<1,"
+	// loong64: "ADDVU","SLLV\t[$]5,","SUBVU"
 	return x * 30
 }
 func m31(x int64) int64 {
 	// amd64: "SHLQ\t[$]5,", "SUBQ"
 	// arm64: "LSL\t[$]5,", "SUB\tR[0-9]+,"
+	// loong64: "SLLV\t[$]5,","SUBVU"
 	return x * 31
 }
 func m32(x int64) int64 {
 	// amd64: "SHLQ\t[$]5,"
 	// arm64: "LSL\t[$]5,"
+	// loong64: "SLLV\t[$]5,"
 	return x * 32
 }
 func m33(x int64) int64 {
 	// amd64: "SHLQ\t[$]2,", "LEAQ\t.*[*]8"
 	// arm64: "ADD\tR[0-9]+<<5,"
+	// loong64: "SLLV\t[$]5,","ADDVU"
 	return x * 33
 }
 func m34(x int64) int64 {
 	// amd64: "SHLQ\t[$]5,", "LEAQ\t.*[*]2"
 	// arm64: "ADD\tR[0-9]+,", "ADD\tR[0-9]+<<4,"
+	// loong64: "ADDVU","SLLV\t[$]5,","ADDVU"
 	return x * 34
 }
 func m35(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]35,"
 	// arm64: "MOVD\t[$]35,", "MUL"
+	// loong64: "MOVV\t[$]35,","MULV"
 	return x * 35
 }
 func m36(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]8", "SHLQ\t[$]2,"
 	// arm64: "LSL\t[$]2,", "ADD\tR[0-9]+<<3,"
+	// loong64: "SLLV\t[$]2,","SLLV\t[$]5,","ADDVU"
 	return x * 36
 }
 func m37(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]8", "LEAQ\t.*[*]4"
 	// arm64: "MOVD\t[$]37,", "MUL"
+	// loong64: "MOVV\t[$]37,","MULV"
 	return x * 37
 }
 func m38(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]38,"
 	// arm64: "MOVD\t[$]38,", "MUL"
+	// loong64: "MOVV\t[$]38,","MULV"
 	return x * 38
 }
 func m39(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]39,"
 	// arm64: "MOVD\t[$]39,", "MUL"
+	// loong64: "MOVV\t[$]39,", "MULV"
 	return x * 39
 }
 func m40(x int64) int64 {
 	// amd64: "LEAQ\t.*[*]4", "SHLQ\t[$]3,"
 	// arm64: "LSL\t[$]3,", "ADD\tR[0-9]+<<2,"
+	// loong64: "SLLV\t[$]3,","SLLV\t[$]5,","ADDVU"
 	return x * 40
 }

 func mn1(x int64) int64 {
 	// amd64: "NEGQ\t"
 	// arm64: "NEG\tR[0-9]+,"
+	// loong64: "SUBVU\tR[0-9], R0,"
 	return x * -1
 }
 func mn2(x int64) int64 {
 	// amd64: "NEGQ", "ADDQ"
 	// arm64: "NEG\tR[0-9]+<<1,"
+	// loong64: "ADDVU","SUBVU\tR[0-9], R0,"
 	return x * -2
 }
 func mn3(x int64) int64 {
 	// amd64: "NEGQ", "LEAQ\t.*[*]2"
 	// arm64: "SUB\tR[0-9]+<<2,"
+	// loong64: "SLLV\t[$]2,","SUBVU"
 	return x * -3
 }
 func mn4(x int64) int64 {
 	// amd64: "NEGQ", "SHLQ\t[$]2,"
 	// arm64: "NEG\tR[0-9]+<<2,"
+	// loong64: "SLLV\t[$]2,","SUBVU\tR[0-9], R0,"
 	return x * -4
 }
 func mn5(x int64) int64 {
 	// amd64: "NEGQ", "LEAQ\t.*[*]4"
 	// arm64: "NEG\tR[0-9]+,", "ADD\tR[0-9]+<<2,"
+	// loong64: "SUBVU\tR[0-9], R0,","SLLV\t[$]2,","SUBVU"
 	return x * -5
 }
 func mn6(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]-6,"
 	// arm64: "ADD\tR[0-9]+,", "SUB\tR[0-9]+<<2,"
+	// loong64: "ADDVU","SLLV\t[$]3,","SUBVU"
 	return x * -6
 }
 func mn7(x int64) int64 {
 	// amd64: "NEGQ", "LEAQ\t.*[*]8"
 	// arm64: "SUB\tR[0-9]+<<3,"
+	// loong64: "SLLV\t[$]3","SUBVU"
 	return x * -7
 }
 func mn8(x int64) int64 {
 	// amd64: "NEGQ", "SHLQ\t[$]3,"
 	// arm64: "NEG\tR[0-9]+<<3,"
+	// loong64: "SLLV\t[$]3","SUBVU\tR[0-9], R0,"
 	return x * -8
 }
 func mn9(x int64) int64 {
 	// amd64: "NEGQ", "LEAQ\t.*[*]8"
 	// arm64: "NEG\tR[0-9]+,", "ADD\tR[0-9]+<<3,"
+	// loong64: "SUBVU\tR[0-9], R0,","SLLV\t[$]3","SUBVU"
 	return x * -9
 }
 func mn10(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]-10,"
 	// arm64: "MOVD\t[$]-10,", "MUL"
+	// loong64: "MOVV\t[$]-10","MULV"
 	return x * -10
 }
 func mn11(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]-11,"
 	// arm64: "MOVD\t[$]-11,", "MUL"
+	// loong64: "MOVV\t[$]-11","MULV"
 	return x * -11
 }
 func mn12(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]-12,"
 	// arm64: "LSL\t[$]2,", "SUB\tR[0-9]+<<2,"
+	// loong64: "SLLV\t[$]2,","SLLV\t[$]4,","SUBVU"
 	return x * -12
 }
 func mn13(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]-13,"
 	// arm64: "MOVD\t[$]-13,", "MUL"
+	// loong64: "MOVV\t[$]-13","MULV"
 	return x * -13
 }
 func mn14(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]-14,"
 	// arm64: "ADD\tR[0-9]+,", "SUB\tR[0-9]+<<3,"
+	// loong64: "ADDVU","SLLV\t[$]4,","SUBVU"
 	return x * -14
 }
 func mn15(x int64) int64 {
 	// amd64: "SHLQ\t[$]4,", "SUBQ"
 	// arm64: "SUB\tR[0-9]+<<4,"
+	// loong64: "SLLV\t[$]4,","SUBVU"
 	return x * -15
 }
 func mn16(x int64) int64 {
 	// amd64: "NEGQ", "SHLQ\t[$]4,"
 	// arm64: "NEG\tR[0-9]+<<4,"
+	// loong64: "SLLV\t[$]4,","SUBVU\tR[0-9], R0,"
 	return x * -16
 }
 func mn17(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]-17,"
 	// arm64: "NEG\tR[0-9]+,", "ADD\tR[0-9]+<<4,"
+	// loong64: "SUBVU\tR[0-9], R0,","SLLV\t[$]4,","SUBVU"
 	return x * -17
 }
 func mn18(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]-18,"
 	// arm64: "MOVD\t[$]-18,", "MUL"
+	// loong64: "MOVV\t[$]-18","MULV"
 	return x * -18
 }
 func mn19(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]-19,"
 	// arm64: "MOVD\t[$]-19,", "MUL"
+	// loong64: "MOVV\t[$]-19","MULV"
 	return x * -19
 }
 func mn20(x int64) int64 {
 	// amd64: "IMUL3Q\t[$]-20,"
 	// arm64: "MOVD\t[$]-20,", "MUL"
+	// loong64: "MOVV\t[$]-20","MULV"
 	return x * -20
 }