cmd/compile: add unsigned divisibility rules

"Division by invariant integers using multiplication" paper
by Granlund and Montgomery contains a method for directly computing
divisibility (x%c == 0 for c constant) by means of the modular inverse.
The method is further elaborated in "Hacker's Delight" by Warren Section 10-17

This general rule can compute divisibility with one multiplication and a
compare for odd divisors, plus an additional rotate for even divisors.
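
As an illustration (a standalone sketch, not code from this change;
divisibleBy6 is a hypothetical helper name), the check generated for
x%6 == 0 on a uint32 is equivalent to:

package main

import (
	"fmt"
	"math/bits"
)

// divisibleBy6 mirrors the rewritten form of x%6 == 0: multiply by the
// inverse of the odd part (3) modulo 2^32, rotate left by 32-k where
// k=1 is the number of trailing zeros in 6, and compare against the
// largest possible quotient ⎣(2^32-1)/6⎦.
func divisibleBy6(x uint32) bool {
	const m = 0xAAAAAAAB        // 3*m mod 2^32 == 1
	const max = (1<<32 - 1) / 6 // 715827882
	return bits.RotateLeft32(x*m, 31) <= max
}

func main() {
	for x := uint32(0); x < 25; x++ {
		fmt.Println(x, divisibleBy6(x), x%6 == 0)
	}
}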

To apply the divisibility rule, we must take into account the rules
that rewrite x%c as x-((x/c)*c) and expand (x/c) for c constant during
the first optimization pass, "opt".  This complicates the matching, as
we want to match only in the cases where the result of (x/c) is not
also available.  So, we must match on the expanded form of (x/c) in the
expression x == c*(x/c) in the "late opt" pass after common
subexpression elimination.

Note that if an intermediate opt pass is introduced in the future, we
could simplify these rules by delaying the magic division rewrite to
"late opt" and matching directly on (x/c) in the intermediate pass.

Additional rules to lower the generic RotateLeft* ops were also added.
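
For targets without native 8- and 16-bit rotate instructions these ops
lower to a shift pair; a minimal sketch of the computation (rotl16 is a
hypothetical name mirroring the Or16/Lsh16x64/Rsh16Ux64 rules in the
diffs below):

// rotl16 computes what the generic lowering
// (Or16 (Lsh16x64 x (c&15)) (Rsh16Ux64 x (-c&15))) produces
// for a constant rotate amount c.
func rotl16(x uint16, c uint) uint16 {
	return x<<(c&15) | x>>(-c&15)
}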

On amd64, the divisibility check is 25-50% faster.

name                     old time/op  new time/op  delta
DivconstI64-4            2.08ns ± 0%  2.08ns ± 1%     ~     (p=0.881 n=5+5)
DivisibleconstI64-4      2.67ns ± 0%  2.67ns ± 1%     ~     (p=1.000 n=5+5)
DivisibleWDivconstI64-4  2.67ns ± 0%  2.67ns ± 0%     ~     (p=0.683 n=5+5)
DivconstU64-4            2.08ns ± 1%  2.08ns ± 1%     ~     (p=1.000 n=5+5)
DivisibleconstU64-4      2.77ns ± 1%  1.55ns ± 2%  -43.90%  (p=0.008 n=5+5)
DivisibleWDivconstU64-4  2.99ns ± 1%  2.99ns ± 1%     ~     (p=1.000 n=5+5)
DivconstI32-4            1.53ns ± 2%  1.53ns ± 0%     ~     (p=1.000 n=5+5)
DivisibleconstI32-4      2.23ns ± 0%  2.25ns ± 3%     ~     (p=0.167 n=5+5)
DivisibleWDivconstI32-4  2.27ns ± 1%  2.27ns ± 1%     ~     (p=0.429 n=5+5)
DivconstU32-4            1.78ns ± 0%  1.78ns ± 1%     ~     (p=1.000 n=4+5)
DivisibleconstU32-4      2.52ns ± 2%  1.26ns ± 0%  -49.96%  (p=0.000 n=5+4)
DivisibleWDivconstU32-4  2.63ns ± 0%  2.85ns ±10%   +8.29%  (p=0.016 n=4+5)
DivconstI16-4            1.54ns ± 0%  1.54ns ± 0%     ~     (p=0.333 n=4+5)
DivisibleconstI16-4      2.10ns ± 0%  2.10ns ± 1%     ~     (p=0.571 n=4+5)
DivisibleWDivconstI16-4  2.22ns ± 0%  2.23ns ± 1%     ~     (p=0.556 n=4+5)
DivconstU16-4            1.09ns ± 0%  1.01ns ± 1%   -7.74%  (p=0.000 n=4+5)
DivisibleconstU16-4      1.83ns ± 0%  1.26ns ± 0%  -31.52%  (p=0.008 n=5+5)
DivisibleWDivconstU16-4  1.88ns ± 0%  1.89ns ± 1%     ~     (p=0.365 n=5+5)
DivconstI8-4             1.54ns ± 1%  1.54ns ± 1%     ~     (p=1.000 n=5+5)
DivisibleconstI8-4       2.10ns ± 0%  2.11ns ± 0%     ~     (p=0.238 n=5+4)
DivisibleWDivconstI8-4   2.22ns ± 0%  2.23ns ± 2%     ~     (p=0.762 n=5+5)
DivconstU8-4             0.92ns ± 1%  0.94ns ± 1%   +2.65%  (p=0.008 n=5+5)
DivisibleconstU8-4       1.66ns ± 0%  1.26ns ± 1%  -24.28%  (p=0.008 n=5+5)
DivisibleWDivconstU8-4   1.79ns ± 0%  1.80ns ± 1%     ~     (p=0.079 n=4+5)

A follow-up change will address the signed division case.

Updates #30282

Change-Id: I7e995f167179aa5c76bb10fbcbeb49c520943403
Reviewed-on: https://go-review.googlesource.com/c/go/+/168037
Run-TryBot: Brian Kessler <brian.m.kessler@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
commit a28a942768 (parent f67f5511ee)
Brian Kessler 2019-03-09 21:58:16 -07:00, committed by Keith Randall
24 changed files with 9927 additions and 24 deletions

@@ -926,6 +926,7 @@ func TestArithmetic(t *testing.T) {
testShiftedOps(t)
testDivFixUp(t)
testDivisibleSignedPow2(t)
testDivisibility(t)
}
// testDivFixUp ensures that signed division fix-ups are being generated.
@@ -1240,6 +1241,102 @@ func testDivisibleSignedPow2(t *testing.T) {
if want, got := x%two57 == 0, divisible_int64_2to57(x); got != want {
t.Errorf("divisible_int64_2to57(%d) = %v want %v", x, got, want)
}
}
}
//go:noinline
func div6_uint8(n uint8) bool {
return n%6 == 0
}
//go:noinline
func div6_uint16(n uint16) bool {
return n%6 == 0
}
//go:noinline
func div6_uint32(n uint32) bool {
return n%6 == 0
}
//go:noinline
func div6_uint64(n uint64) bool {
return n%6 == 0
}
//go:noinline
func div19_uint8(n uint8) bool {
return n%19 == 0
}
//go:noinline
func div19_uint16(n uint16) bool {
return n%19 == 0
}
//go:noinline
func div19_uint32(n uint32) bool {
return n%19 == 0
}
//go:noinline
func div19_uint64(n uint64) bool {
return n%19 == 0
}
// testDivisibility confirms that the rewrite rules for x%c == 0 with c constant are correct.
func testDivisibility(t *testing.T) {
// test an even and an odd divisor
var six, nineteen uint64 = 6, 19
// test all inputs for uint8, uint16
for i := uint64(0); i <= math.MaxUint16; i++ {
if i <= math.MaxUint8 {
if want, got := uint8(i)%uint8(six) == 0, div6_uint8(uint8(i)); got != want {
t.Errorf("div6_uint8(%d) = %v want %v", i, got, want)
}
if want, got := uint8(i)%uint8(nineteen) == 0, div19_uint8(uint8(i)); got != want {
t.Errorf("div6_uint19(%d) = %v want %v", i, got, want)
}
}
if want, got := uint16(i)%uint16(six) == 0, div6_uint16(uint16(i)); got != want {
t.Errorf("div6_uint16(%d) = %v want %v", i, got, want)
}
if want, got := uint16(i)%uint16(nineteen) == 0, div19_uint16(uint16(i)); got != want {
t.Errorf("div19_uint16(%d) = %v want %v", i, got, want)
}
}
var maxU32, maxU64 uint64 = math.MaxUint32, math.MaxUint64
// spot check inputs for uint32 and uint64
xu := []uint64{
0, 1, 2, 3, 4, 5,
six, 2 * six, 3 * six, 5 * six, 12345 * six,
six + 1, 2*six - 5, 3*six + 3, 5*six + 4, 12345*six - 2,
nineteen, 2 * nineteen, 3 * nineteen, 5 * nineteen, 12345 * nineteen,
nineteen + 1, 2*nineteen - 5, 3*nineteen + 3, 5*nineteen + 4, 12345*nineteen - 2,
maxU32, maxU32 - 1, maxU32 - 2, maxU32 - 3, maxU32 - 4,
maxU32, maxU32 - 5, maxU32 - 6, maxU32 - 7, maxU32 - 8,
maxU32, maxU32 - 9, maxU32 - 10, maxU32 - 11, maxU32 - 12,
maxU32, maxU32 - 13, maxU32 - 14, maxU32 - 15, maxU32 - 16,
maxU32, maxU32 - 17, maxU32 - 18, maxU32 - 19, maxU32 - 20,
maxU64, maxU64 - 1, maxU64 - 2, maxU64 - 3, maxU64 - 4,
maxU64, maxU64 - 5, maxU64 - 6, maxU64 - 7, maxU64 - 8,
maxU64, maxU64 - 9, maxU64 - 10, maxU64 - 11, maxU64 - 12,
maxU64, maxU64 - 13, maxU64 - 14, maxU64 - 15, maxU64 - 16,
maxU64, maxU64 - 17, maxU64 - 18, maxU64 - 19, maxU64 - 20,
}
for _, x := range xu {
if x <= maxU32 {
if want, got := uint32(x)%uint32(six) == 0, div6_uint32(uint32(x)); got != want {
t.Errorf("div6_uint32(%d) = %v want %v", x, got, want)
}
if want, got := uint32(x)%uint32(nineteen) == 0, div19_uint32(uint32(x)); got != want {
t.Errorf("div19_uint32(%d) = %v want %v", x, got, want)
}
}
if want, got := x%six == 0, div6_uint64(x); got != want {
t.Errorf("div6_uint64(%d) = %v want %v", x, got, want)
}
if want, got := x%nineteen == 0, div19_uint64(x); got != want {
t.Errorf("div19_uint64(%d) = %v want %v", x, got, want)
}
}
}

@@ -165,6 +165,11 @@
(Rsh16x64 x (Const64 [c])) && uint64(c) >= 16 -> (SARWconst x [15])
(Rsh8x64 x (Const64 [c])) && uint64(c) >= 8 -> (SARBconst x [7])
// constant rotates
(RotateLeft32 x (MOVLconst [c])) -> (ROLLconst [c&31] x)
(RotateLeft16 x (MOVLconst [c])) -> (ROLWconst [c&15] x)
(RotateLeft8 x (MOVLconst [c])) -> (ROLBconst [c&7] x)
// Lowering comparisons
(Less32 x y) -> (SETL (CMPL x y))
(Less16 x y) -> (SETL (CMPW x y))

@@ -1233,6 +1233,10 @@
( ORshiftRL [c] (SLLconst x [32-c]) x) -> (SRRconst [ c] x)
(XORshiftRL [c] (SLLconst x [32-c]) x) -> (SRRconst [ c] x)
(RotateLeft32 x (MOVWconst [c])) -> (SRRconst [-c&31] x)
(RotateLeft16 <t> x (MOVWconst [c])) -> (Or16 (Lsh16x32 <t> x (MOVWconst [c&15])) (Rsh16Ux32 <t> x (MOVWconst [-c&15])))
(RotateLeft8 <t> x (MOVWconst [c])) -> (Or8 (Lsh8x32 <t> x (MOVWconst [c&7])) (Rsh8Ux32 <t> x (MOVWconst [-c&7])))
// ((x>>8) | (x<<8)) -> (REV16 x), the type of x is uint16, "|" can also be "^" or "+".
// UBFX instruction is supported by ARMv6T2, ARMv7 and above versions, REV16 is supported by
// ARMv6 and above versions. So for ARMv6, we need to match SLLconst, SRLconst and ORshiftLL.

@@ -92,6 +92,8 @@
(Trunc x) -> (FRINTZD x)
// lowering rotates
(RotateLeft8 <t> x (MOVDconst [c])) -> (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7])))
(RotateLeft16 <t> x (MOVDconst [c])) -> (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15])))
(RotateLeft32 x y) -> (RORW x (NEG <y.Type> y))
(RotateLeft64 x y) -> (ROR x (NEG <y.Type> y))

@@ -108,6 +108,12 @@
(Rsh8x16 x y) -> (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> (ZeroExt16to32 y) (MOVWconst [-1]) (SGTUconst [32] (ZeroExt16to32 y))))
(Rsh8x8 x y) -> (SRA (SignExt16to32 x) ( CMOVZ <typ.UInt32> (ZeroExt8to32 y) (MOVWconst [-1]) (SGTUconst [32] (ZeroExt8to32 y))))
// rotates
(RotateLeft8 <t> x (MOVWconst [c])) -> (Or8 (Lsh8x32 <t> x (MOVWconst [c&7])) (Rsh8Ux32 <t> x (MOVWconst [-c&7])))
(RotateLeft16 <t> x (MOVWconst [c])) -> (Or16 (Lsh16x32 <t> x (MOVWconst [c&15])) (Rsh16Ux32 <t> x (MOVWconst [-c&15])))
(RotateLeft32 <t> x (MOVWconst [c])) -> (Or32 (Lsh32x32 <t> x (MOVWconst [c&31])) (Rsh32Ux32 <t> x (MOVWconst [-c&31])))
(RotateLeft64 <t> x (MOVWconst [c])) -> (Or64 (Lsh64x32 <t> x (MOVWconst [c&63])) (Rsh64Ux32 <t> x (MOVWconst [-c&63])))
// unary ops
(Neg(32|16|8) x) -> (NEG x)
(Neg(32|64)F x) -> (NEG(F|D) x)

@@ -105,6 +105,12 @@
(Rsh8x16 <t> x y) -> (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
(Rsh8x8 <t> x y) -> (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
// rotates
(RotateLeft8 <t> x (MOVVconst [c])) -> (Or8 (Lsh8x64 <t> x (MOVVconst [c&7])) (Rsh8Ux64 <t> x (MOVVconst [-c&7])))
(RotateLeft16 <t> x (MOVVconst [c])) -> (Or16 (Lsh16x64 <t> x (MOVVconst [c&15])) (Rsh16Ux64 <t> x (MOVVconst [-c&15])))
(RotateLeft32 <t> x (MOVVconst [c])) -> (Or32 (Lsh32x64 <t> x (MOVVconst [c&31])) (Rsh32Ux64 <t> x (MOVVconst [-c&31])))
(RotateLeft64 <t> x (MOVVconst [c])) -> (Or64 (Lsh64x64 <t> x (MOVVconst [c&63])) (Rsh64Ux64 <t> x (MOVVconst [-c&63])))
// unary ops
(Neg(64|32|16|8) x) -> (NEGV x)
(Neg(32|64)F x) -> (NEG(F|D) x)

@@ -81,6 +81,12 @@
(FCEIL (FMOVDconst [x])) -> (FMOVDconst [auxFrom64F(math.Ceil(auxTo64F(x)))])
(FTRUNC (FMOVDconst [x])) -> (FMOVDconst [auxFrom64F(math.Trunc(auxTo64F(x)))])
// Rotates
(RotateLeft8 <t> x (MOVDconst [c])) -> (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7])))
(RotateLeft16 <t> x (MOVDconst [c])) -> (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15])))
(RotateLeft32 x (MOVDconst [c])) -> (ROTLWconst [c&31] x)
(RotateLeft64 x (MOVDconst [c])) -> (ROTLconst [c&63] x)
// Rotate generation with const shift
(ADD (SLDconst x [c]) (SRDconst x [d])) && d == 64-c -> (ROTLconst [c] x)
( OR (SLDconst x [c]) (SRDconst x [d])) && d == 64-c -> (ROTLconst [c] x)

@@ -234,6 +234,8 @@
(Rsh(16|8)x8 x y) -> (SRAW (MOV(H|B)reg x) (MOVDGE <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVBZreg y) [64])))
// Lowering rotates
(RotateLeft8 <t> x (MOVDconst [c])) -> (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7])))
(RotateLeft16 <t> x (MOVDconst [c])) -> (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15])))
(RotateLeft32 x y) -> (RLL x y)
(RotateLeft64 x y) -> (RLLG x y)

@@ -165,6 +165,11 @@
(Rsh8x16 x y) -> (Rsh64x64 (SignExt8to64 x) (ZeroExt16to64 y))
(Rsh8x8 x y) -> (Rsh64x64 (SignExt8to64 x) (ZeroExt8to64 y))
// Lowering rotates
(RotateLeft8 <t> x (I64Const [c])) -> (Or8 (Lsh8x64 <t> x (I64Const [c&7])) (Rsh8Ux64 <t> x (I64Const [-c&7])))
(RotateLeft16 <t> x (I64Const [c])) -> (Or16 (Lsh16x64 <t> x (I64Const [c&15])) (Rsh16Ux64 <t> x (I64Const [-c&15])))
(RotateLeft32 <t> x (I64Const [c])) -> (Or32 (Lsh32x64 <t> x (I64Const [c&31])) (Rsh32Ux64 <t> x (I64Const [-c&31])))
// Lowering comparisons
(Less64 x y) -> (I64LtS x y)
(Less32 x y) -> (I64LtS (SignExt32to64 x) (SignExt32to64 y))

@@ -334,6 +334,12 @@
(Rsh(64|32|16|8)x64 x (Const64 [0])) -> x
(Rsh(64|32|16|8)Ux64 x (Const64 [0])) -> x
// rotates by multiples of register width
(RotateLeft64 x (Const64 [c])) && c%64 == 0 -> x
(RotateLeft32 x (Const32 [c])) && c%32 == 0 -> x
(RotateLeft16 x (Const16 [c])) && c%16 == 0 -> x
(RotateLeft8 x (Const8 [c])) && c%8 == 0 -> x
// zero shifted
(Lsh64x(64|32|16|8) (Const64 [0]) _) -> (Const64 [0])
(Rsh64x(64|32|16|8) (Const64 [0]) _) -> (Const64 [0])
@@ -1154,7 +1160,7 @@
-> (Sub32 x (Mul32 <t> (Div32 <t> x (Const32 <t> [c])) (Const32 <t> [c])))
(Mod64 <t> x (Const64 [c])) && x.Op != OpConst64 && (c > 0 || c == -1<<63)
-> (Sub64 x (Mul64 <t> (Div64 <t> x (Const64 <t> [c])) (Const64 <t> [c])))
(Mod8u <t> x (Const8 [c])) && x.Op != OpConst8 && c > 0 && umagicOK(8, c)
-> (Sub8 x (Mul8 <t> (Div8u <t> x (Const8 <t> [c])) (Const8 <t> [c])))
(Mod16u <t> x (Const16 [c])) && x.Op != OpConst16 && c > 0 && umagicOK(16,c)
-> (Sub16 x (Mul16 <t> (Div16u <t> x (Const16 <t> [c])) (Const16 <t> [c])))
@@ -1163,6 +1169,326 @@
(Mod64u <t> x (Const64 [c])) && x.Op != OpConst64 && c > 0 && umagicOK(64,c)
-> (Sub64 x (Mul64 <t> (Div64u <t> x (Const64 <t> [c])) (Const64 <t> [c])))
// For architectures without rotates on less than 32-bits, promote these checks to 32-bit.
(Eq8 (Mod8u x (Const8 [c])) (Const8 [0])) && x.Op != OpConst8 && udivisibleOK(8,c) && !hasSmallRotate(config) ->
(Eq32 (Mod32u <typ.UInt32> (ZeroExt8to32 <typ.UInt32> x) (Const32 <typ.UInt32> [c&0xff])) (Const32 <typ.UInt32> [0]))
(Eq16 (Mod16u x (Const16 [c])) (Const16 [0])) && x.Op != OpConst16 && udivisibleOK(16,c) && !hasSmallRotate(config) ->
(Eq32 (Mod32u <typ.UInt32> (ZeroExt16to32 <typ.UInt32> x) (Const32 <typ.UInt32> [c&0xffff])) (Const32 <typ.UInt32> [0]))
// Divisibility checks x%c == 0 convert to multiply and rotate.
// Note, x%c == 0 is rewritten as x == c*(x/c) during the opt pass
// where (x/c) is performed using multiplication with magic constants.
// To rewrite x%c == 0 requires pattern matching the rewritten expression
// and checking that the division by the same constant wasn't already calculated.
// This check is made by counting uses of the magic constant multiplication.
// Note that if there were an intermediate opt pass, this rule could be applied
// directly on the Div op and magic division rewrites could be delayed to late opt.
(Eq8 x (Mul8 (Const8 [c])
(Trunc32to8
(Rsh32Ux64
mul:(Mul32
(Const32 [m])
(ZeroExt8to32 x))
(Const64 [s])))
)
)
&& v.Block.Func.pass.name != "opt" && mul.Uses == 1
&& m == int64(1<<8+umagic(8,c).m) && s == 8+umagic(8,c).s
&& x.Op != OpConst8 && udivisibleOK(8,c)
-> (Leq8U
(RotateLeft8 <typ.UInt8>
(Mul8 <typ.UInt8>
(Const8 <typ.UInt8> [int64(int8(udivisible(8,c).m))])
x)
(Const8 <typ.UInt8> [int64(8-udivisible(8,c).k)])
)
(Const8 <typ.UInt8> [int64(int8(udivisible(8,c).max))])
)
(Eq16 x (Mul16 (Const16 [c])
(Trunc64to16
(Rsh64Ux64
mul:(Mul64
(Const64 [m])
(ZeroExt16to64 x))
(Const64 [s])))
)
)
&& v.Block.Func.pass.name != "opt" && mul.Uses == 1
&& m == int64(1<<16+umagic(16,c).m) && s == 16+umagic(16,c).s
&& x.Op != OpConst16 && udivisibleOK(16,c)
-> (Leq16U
(RotateLeft16 <typ.UInt16>
(Mul16 <typ.UInt16>
(Const16 <typ.UInt16> [int64(int16(udivisible(16,c).m))])
x)
(Const16 <typ.UInt16> [int64(16-udivisible(16,c).k)])
)
(Const16 <typ.UInt16> [int64(int16(udivisible(16,c).max))])
)
(Eq16 x (Mul16 (Const16 [c])
(Trunc32to16
(Rsh32Ux64
mul:(Mul32
(Const32 [m])
(ZeroExt16to32 x))
(Const64 [s])))
)
)
&& v.Block.Func.pass.name != "opt" && mul.Uses == 1
&& m == int64(1<<15+umagic(16,c).m/2) && s == 16+umagic(16,c).s-1
&& x.Op != OpConst16 && udivisibleOK(16,c)
-> (Leq16U
(RotateLeft16 <typ.UInt16>
(Mul16 <typ.UInt16>
(Const16 <typ.UInt16> [int64(int16(udivisible(16,c).m))])
x)
(Const16 <typ.UInt16> [int64(16-udivisible(16,c).k)])
)
(Const16 <typ.UInt16> [int64(int16(udivisible(16,c).max))])
)
(Eq16 x (Mul16 (Const16 [c])
(Trunc32to16
(Rsh32Ux64
mul:(Mul32
(Const32 [m])
(Rsh32Ux64 (ZeroExt16to32 x) (Const64 [1])))
(Const64 [s])))
)
)
&& v.Block.Func.pass.name != "opt" && mul.Uses == 1
&& m == int64(1<<15+(umagic(16,c).m+1)/2) && s == 16+umagic(16,c).s-2
&& x.Op != OpConst16 && udivisibleOK(16,c)
-> (Leq16U
(RotateLeft16 <typ.UInt16>
(Mul16 <typ.UInt16>
(Const16 <typ.UInt16> [int64(int16(udivisible(16,c).m))])
x)
(Const16 <typ.UInt16> [int64(16-udivisible(16,c).k)])
)
(Const16 <typ.UInt16> [int64(int16(udivisible(16,c).max))])
)
(Eq16 x (Mul16 (Const16 [c])
(Trunc32to16
(Rsh32Ux64
(Avg32u
(Lsh32x64 (ZeroExt16to32 x) (Const64 [16]))
mul:(Mul32
(Const32 [m])
(ZeroExt16to32 x)))
(Const64 [s])))
)
)
&& v.Block.Func.pass.name != "opt" && mul.Uses == 1
&& m == int64(umagic(16,c).m) && s == 16+umagic(16,c).s-1
&& x.Op != OpConst16 && udivisibleOK(16,c)
-> (Leq16U
(RotateLeft16 <typ.UInt16>
(Mul16 <typ.UInt16>
(Const16 <typ.UInt16> [int64(int16(udivisible(16,c).m))])
x)
(Const16 <typ.UInt16> [int64(16-udivisible(16,c).k)])
)
(Const16 <typ.UInt16> [int64(int16(udivisible(16,c).max))])
)
(Eq32 x (Mul32 (Const32 [c])
(Rsh32Ux64
mul:(Hmul32u
(Const32 [m])
x)
(Const64 [s]))
)
)
&& v.Block.Func.pass.name != "opt" && mul.Uses == 1
&& m == int64(int32(1<<31+umagic(32,c).m/2)) && s == umagic(32,c).s-1
&& x.Op != OpConst32 && udivisibleOK(32,c)
-> (Leq32U
(RotateLeft32 <typ.UInt32>
(Mul32 <typ.UInt32>
(Const32 <typ.UInt32> [int64(int32(udivisible(32,c).m))])
x)
(Const32 <typ.UInt32> [int64(32-udivisible(32,c).k)])
)
(Const32 <typ.UInt32> [int64(int32(udivisible(32,c).max))])
)
(Eq32 x (Mul32 (Const32 [c])
(Rsh32Ux64
mul:(Hmul32u
(Const32 <typ.UInt32> [m])
(Rsh32Ux64 x (Const64 [1])))
(Const64 [s]))
)
)
&& v.Block.Func.pass.name != "opt" && mul.Uses == 1
&& m == int64(int32(1<<31+(umagic(32,c).m+1)/2)) && s == umagic(32,c).s-2
&& x.Op != OpConst32 && udivisibleOK(32,c)
-> (Leq32U
(RotateLeft32 <typ.UInt32>
(Mul32 <typ.UInt32>
(Const32 <typ.UInt32> [int64(int32(udivisible(32,c).m))])
x)
(Const32 <typ.UInt32> [int64(32-udivisible(32,c).k)])
)
(Const32 <typ.UInt32> [int64(int32(udivisible(32,c).max))])
)
(Eq32 x (Mul32 (Const32 [c])
(Rsh32Ux64
(Avg32u
x
mul:(Hmul32u
(Const32 [m])
x))
(Const64 [s]))
)
)
&& v.Block.Func.pass.name != "opt" && mul.Uses == 1
&& m == int64(int32(umagic(32,c).m)) && s == umagic(32,c).s-1
&& x.Op != OpConst32 && udivisibleOK(32,c)
-> (Leq32U
(RotateLeft32 <typ.UInt32>
(Mul32 <typ.UInt32>
(Const32 <typ.UInt32> [int64(int32(udivisible(32,c).m))])
x)
(Const32 <typ.UInt32> [int64(32-udivisible(32,c).k)])
)
(Const32 <typ.UInt32> [int64(int32(udivisible(32,c).max))])
)
(Eq32 x (Mul32 (Const32 [c])
(Trunc64to32
(Rsh64Ux64
mul:(Mul64
(Const64 [m])
(ZeroExt32to64 x))
(Const64 [s])))
)
)
&& v.Block.Func.pass.name != "opt" && mul.Uses == 1
&& m == int64(1<<31+umagic(32,c).m/2) && s == 32+umagic(32,c).s-1
&& x.Op != OpConst32 && udivisibleOK(32,c)
-> (Leq32U
(RotateLeft32 <typ.UInt32>
(Mul32 <typ.UInt32>
(Const32 <typ.UInt32> [int64(int32(udivisible(32,c).m))])
x)
(Const32 <typ.UInt32> [int64(32-udivisible(32,c).k)])
)
(Const32 <typ.UInt32> [int64(int32(udivisible(32,c).max))])
)
(Eq32 x (Mul32 (Const32 [c])
(Trunc64to32
(Rsh64Ux64
mul:(Mul64
(Const64 [m])
(Rsh64Ux64 (ZeroExt32to64 x) (Const64 [1])))
(Const64 [s])))
)
)
&& v.Block.Func.pass.name != "opt" && mul.Uses == 1
&& m == int64(1<<31+(umagic(32,c).m+1)/2) && s == 32+umagic(32,c).s-2
&& x.Op != OpConst32 && udivisibleOK(32,c)
-> (Leq32U
(RotateLeft32 <typ.UInt32>
(Mul32 <typ.UInt32>
(Const32 <typ.UInt32> [int64(int32(udivisible(32,c).m))])
x)
(Const32 <typ.UInt32> [int64(32-udivisible(32,c).k)])
)
(Const32 <typ.UInt32> [int64(int32(udivisible(32,c).max))])
)
(Eq32 x (Mul32 (Const32 [c])
(Trunc64to32
(Rsh64Ux64
(Avg64u
(Lsh64x64 (ZeroExt32to64 x) (Const64 [32]))
mul:(Mul64
(Const64 [m])
(ZeroExt32to64 x)))
(Const64 [s])))
)
)
&& v.Block.Func.pass.name != "opt" && mul.Uses == 1
&& m == int64(umagic(32,c).m) && s == 32+umagic(32,c).s-1
&& x.Op != OpConst32 && udivisibleOK(32,c)
-> (Leq32U
(RotateLeft32 <typ.UInt32>
(Mul32 <typ.UInt32>
(Const32 <typ.UInt32> [int64(int32(udivisible(32,c).m))])
x)
(Const32 <typ.UInt32> [int64(32-udivisible(32,c).k)])
)
(Const32 <typ.UInt32> [int64(int32(udivisible(32,c).max))])
)
(Eq64 x (Mul64 (Const64 [c])
(Rsh64Ux64
mul:(Hmul64u
(Const64 [m])
x)
(Const64 [s]))
)
) && v.Block.Func.pass.name != "opt" && mul.Uses == 1
&& m == int64(1<<63+umagic(64,c).m/2) && s == umagic(64,c).s-1
&& x.Op != OpConst64 && udivisibleOK(64,c)
-> (Leq64U
(RotateLeft64 <typ.UInt64>
(Mul64 <typ.UInt64>
(Const64 <typ.UInt64> [int64(udivisible(64,c).m)])
x)
(Const64 <typ.UInt64> [int64(64-udivisible(64,c).k)])
)
(Const64 <typ.UInt64> [int64(udivisible(64,c).max)])
)
(Eq64 x (Mul64 (Const64 [c])
(Rsh64Ux64
mul:(Hmul64u
(Const64 [m])
(Rsh64Ux64 x (Const64 [1])))
(Const64 [s]))
)
) && v.Block.Func.pass.name != "opt" && mul.Uses == 1
&& m == int64(1<<63+(umagic(64,c).m+1)/2) && s == umagic(64,c).s-2
&& x.Op != OpConst64 && udivisibleOK(64,c)
-> (Leq64U
(RotateLeft64 <typ.UInt64>
(Mul64 <typ.UInt64>
(Const64 <typ.UInt64> [int64(udivisible(64,c).m)])
x)
(Const64 <typ.UInt64> [int64(64-udivisible(64,c).k)])
)
(Const64 <typ.UInt64> [int64(udivisible(64,c).max)])
)
(Eq64 x (Mul64 (Const64 [c])
(Rsh64Ux64
(Avg64u
x
mul:(Hmul64u
(Const64 [m])
x))
(Const64 [s]))
)
) && v.Block.Func.pass.name != "opt" && mul.Uses == 1
&& m == int64(umagic(64,c).m) && s == umagic(64,c).s-1
&& x.Op != OpConst64 && udivisibleOK(64,c)
-> (Leq64U
(RotateLeft64 <typ.UInt64>
(Mul64 <typ.UInt64>
(Const64 <typ.UInt64> [int64(udivisible(64,c).m)])
x)
(Const64 <typ.UInt64> [int64(64-udivisible(64,c).k)])
)
(Const64 <typ.UInt64> [int64(udivisible(64,c).max)])
)
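// As an illustrative instance of the 32-bit rules above (not itself a rule):
// for c == 6, udivisible(32,6) gives m == 0xAAAAAAAB (auxint -1431655765),
// k == 1 and max == 715827882, so x%6 == 0 ends up as
// (Leq32U (RotateLeft32 (Mul32 (Const32 [-1431655765]) x) (Const32 [31])) (Const32 [715827882]))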
// Divisibility checks for signed integers by power-of-two constants are a simple mask.
// However, we must match against the rewritten n%c == 0 -> n - c*(n/c) == 0 -> n == c*(n/c)
// where n/c contains fixup code to handle signed n.

@@ -4,7 +4,10 @@
package ssa
import "math/big"
import (
"math/big"
"math/bits"
)
// So you want to compute x / c for some constant c?
// Machine division instructions are slow, so we try to
@@ -180,3 +183,105 @@ func smagic(n uint, c int64) smagicData {
m := M.Uint64()
return smagicData{s: int64(s), m: m}
}
// Divisibility x%c == 0 can be checked more efficiently than directly computing
// the modulus x%c and comparing against 0.
//
// The same "Division by invariant integers using multiplication" paper
// by Granlund and Montgomery referenced above briefly mentions this method
// and it is further elaborated in "Hacker's Delight" by Warren Section 10-17
//
// The first thing to note is that for odd integers, exact division can be computed
// by using the modular inverse with respect to the word size 2^n.
//
// Given c, compute m such that (c * m) mod 2^n == 1
// Then if c divides x (x%c == 0), the quotient is given by q = x/c == x*m mod 2^n
//
// x can range over 0, c, 2c, 3c, ... ⎣(2^n - 1)/c⎦ * c, the maximum multiple.
// Thus, x*m mod 2^n is 0, 1, 2, 3, ... ⎣(2^n - 1)/c⎦
// i.e. the quotient takes all values from zero up to max = ⎣(2^n - 1)/c⎦
//
// If x is not divisible by c, then x*m mod 2^n must take some larger value than max.
//
// This gives x*m mod 2^n <= ⎣(2^n - 1)/c⎦ as a test for divisibility
// involving one multiplication and compare.
//
// To extend this to even integers, consider c = d0 * 2^k where d0 is odd.
// We can test whether x is divisible by both d0 and 2^k.
// For d0, the test is the same as above. Let m be such that m*d0 mod 2^n == 1
// Then x*m mod 2^n <= ⎣(2^n - 1)/d0⎦ is the first test.
// The test for divisibility by 2^k is a check for k trailing zeroes.
// Note that since d0 is odd, m is odd and thus x*m will have the same number of
// trailing zeroes as x. So the two tests are,
//
// x*m mod 2^n <= ⎣(2^n - 1)/d0⎦
// and x*m ends in k zero bits
//
// These can be combined into a single comparison by the following
// (theorem ZRU in Hacker's Delight) for unsigned integers.
//
// x <= a and x ends in k zero bits if and only if RotRight(x, k) <= ⎣a/(2^k)⎦
// where RotRight(x, k) is right rotation of x by k bits.
//
// To prove the first direction, x <= a -> ⎣x/(2^k)⎦ <= ⎣a/(2^k)⎦
// But since x ends in k zeroes, all the rotated bits would be zero too.
// So RotRight(x, k) == ⎣x/(2^k)⎦ <= ⎣a/(2^k)⎦
//
// If x does not end in k zero bits, then RotRight(x, k)
// has some non-zero bits in the k highest bits.
// ⎣x/(2^k)⎦ has all zeroes in the k highest bits,
// so RotRight(x, k) > ⎣x/(2^k)⎦
//
// Finally, if x > a and has k trailing zero bits, then RotRight(x, k) == ⎣x/(2^k)⎦
// and ⎣x/(2^k)⎦ must be greater than ⎣a/(2^k)⎦; that is, the top n-k bits of x must
// be greater than the top n-k bits of a because the rest of x's bits are zero.
//
// So the two conditions above can be replaced with the single test
//
// RotRight(x*m mod 2^n, k) <= ⎣(2^n - 1)/c⎦
//
// where d0*2^k was replaced by c on the right-hand side.
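//
// A worked example for illustration: take n == 32 and c == 6 == 3<<1,
// so d0 == 3, k == 1, m == 0xAAAAAAAB (3*0xAAAAAAAB mod 2^32 == 1), and
// max == ⎣(2^32 - 1)/6⎦ == 715827882.  For x == 12, x*m mod 2^32 == 4 and
// RotRight(4, 1) == 2 <= max, so 12 is divisible by 6.  For x == 3,
// x*m mod 2^32 == 1 and RotRight(1, 1) == 2^31 > max, so 3 is not.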
// udivisibleOK reports whether we should strength reduce an n-bit divisibility check by c.
func udivisibleOK(n uint, c int64) bool {
// Convert from ConstX auxint values to the real uint64 constant they represent.
d := uint64(c) << (64 - n) >> (64 - n)
// Doesn't work for 0.
// Don't use for powers of 2.
return d&(d-1) != 0
}
type udivisibleData struct {
k int64 // trailingZeros(c)
m uint64 // m * (c>>k) mod 2^n == 1 multiplicative inverse of odd portion modulo 2^n
max uint64 // ⎣(2^n - 1)/c⎦ max value for divisibility
}
func udivisible(n uint, c int64) udivisibleData {
// Convert from ConstX auxint values to the real uint64 constant they represent.
d := uint64(c) << (64 - n) >> (64 - n)
k := bits.TrailingZeros64(d)
d0 := d >> uint(k) // the odd portion of the divisor
mask := ^uint64(0) >> (64 - n)
// Calculate the multiplicative inverse via Newton's method.
// Quadratic convergence doubles the number of correct bits per iteration.
m := d0 // initial guess correct to 3 bits, since d0*d0 mod 8 == 1 for odd d0
m = m * (2 - m*d0) // 6-bits
m = m * (2 - m*d0) // 12-bits
m = m * (2 - m*d0) // 24-bits
m = m * (2 - m*d0) // 48-bits
m = m * (2 - m*d0) // 96-bits >= 64-bits
m = m & mask
max := mask / d
return udivisibleData{
k: int64(k),
m: m,
max: max,
}
}
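
A quick standalone check of the Newton iteration above (a sketch, not
part of the change; inv64 is a hypothetical name):

package main

import "fmt"

// inv64 returns m with m*d0 mod 2^64 == 1 for odd d0, using the same
// iteration as udivisible.
func inv64(d0 uint64) uint64 {
	m := d0 // correct to 3 bits, since d0*d0 mod 8 == 1 for odd d0
	for i := 0; i < 5; i++ {
		m *= 2 - m*d0 // 6, 12, 24, 48, then 96 >= 64 correct bits
	}
	return m
}

func main() {
	for _, d0 := range []uint64{3, 5, 7, 19, 12345677} {
		fmt.Println(d0, inv64(d0), d0*inv64(d0)) // last column prints 1
	}
}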

@@ -203,3 +203,103 @@ func TestMagicSigned(t *testing.T) {
}
}
}
func testDivisibleExhaustiveU(t *testing.T, n uint) {
maxU := uint64(1) << n
for c := uint64(1); c < maxU; c++ {
if !udivisibleOK(n, int64(c)) {
continue
}
k := udivisible(n, int64(c)).k
m := udivisible(n, int64(c)).m
max := udivisible(n, int64(c)).max
mask := ^uint64(0) >> (64 - n)
for i := uint64(0); i < maxU; i++ {
want := i%c == 0
mul := (i * m) & mask
rot := (mul>>uint(k) | mul<<(n-uint(k))) & mask
got := rot <= max
if want != got {
t.Errorf("unsigned divisible wrong for %d %% %d == 0: got %v, want %v (k=%d,m=%d,max=%d)\n", i, c, got, want, k, m, max)
}
}
}
}
func TestDivisibleExhaustive8U(t *testing.T) {
testDivisibleExhaustiveU(t, 8)
}
func TestDivisibleExhaustive16U(t *testing.T) {
if testing.Short() {
t.Skip("slow test; skipping")
}
testDivisibleExhaustiveU(t, 16)
}
func TestDivisibleUnsigned(t *testing.T) {
One := new(big.Int).SetUint64(1)
for _, n := range [...]uint{8, 16, 32, 64} {
TwoN := new(big.Int).Lsh(One, n)
Max := new(big.Int).Sub(TwoN, One)
for _, c := range [...]uint64{
3,
5,
6,
7,
9,
10,
11,
12,
13,
14,
15,
17,
1<<8 - 1,
1<<8 + 1,
1<<16 - 1,
1<<16 + 1,
1<<32 - 1,
1<<32 + 1,
1<<64 - 1,
} {
if c>>n != 0 {
continue // c too large for the given n.
}
if !udivisibleOK(n, int64(c)) {
t.Errorf("expected n=%d c=%d to pass\n", n, c)
}
k := udivisible(n, int64(c)).k
m := udivisible(n, int64(c)).m
max := udivisible(n, int64(c)).max
mask := ^uint64(0) >> (64 - n)
C := new(big.Int).SetUint64(c)
// Find largest multiple of c.
Mul := new(big.Int).Div(Max, C)
Mul.Mul(Mul, C)
mul := Mul.Uint64()
// Try some input values, mostly around multiples of c.
for _, x := range [...]uint64{0, 1,
c - 1, c, c + 1,
2*c - 1, 2 * c, 2*c + 1,
mul - 1, mul, mul + 1,
uint64(1)<<n - 1,
} {
X := new(big.Int).SetUint64(x)
if X.Cmp(Max) > 0 {
continue
}
want := x%c == 0
mul := (x * m) & mask
rot := (mul>>uint(k) | mul<<(n-uint(k))) & mask
got := rot <= max
if want != got {
t.Errorf("unsigned divisible wrong for %d %% %d == 0: got %v, want %v (k=%d,m=%d,max=%d)\n", x, c, got, want, k, m, max)
}
}
}
}
}

@@ -1053,6 +1053,17 @@ func isInlinableMemmove(dst, src *Value, sz int64, c *Config) bool {
return false
}
// hasSmallRotate reports whether the architecture has rotate instructions
// for sizes < 32-bit. This is used to decide whether to promote some rotations.
func hasSmallRotate(c *Config) bool {
switch c.arch {
case "amd64", "amd64p32", "386":
return true
default:
return false
}
}
// encodes the lsb and width for arm(64) bitfield ops into the expected auxInt format.
func armBFAuxInt(lsb, width int64) int64 {
if lsb < 0 || lsb > 63 {

@@ -591,6 +591,12 @@ func rewriteValue386(v *Value) bool {
return rewriteValue386_OpPanicBounds_0(v)
case OpPanicExtend:
return rewriteValue386_OpPanicExtend_0(v)
case OpRotateLeft16:
return rewriteValue386_OpRotateLeft16_0(v)
case OpRotateLeft32:
return rewriteValue386_OpRotateLeft32_0(v)
case OpRotateLeft8:
return rewriteValue386_OpRotateLeft8_0(v)
case OpRound32F:
return rewriteValue386_OpRound32F_0(v)
case OpRound64F:
@@ -23021,6 +23027,63 @@ func rewriteValue386_OpPanicExtend_0(v *Value) bool {
}
return false
}
func rewriteValue386_OpRotateLeft16_0(v *Value) bool {
// match: (RotateLeft16 x (MOVLconst [c]))
// cond:
// result: (ROLWconst [c&15] x)
for {
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != Op386MOVLconst {
break
}
c := v_1.AuxInt
v.reset(Op386ROLWconst)
v.AuxInt = c & 15
v.AddArg(x)
return true
}
return false
}
func rewriteValue386_OpRotateLeft32_0(v *Value) bool {
// match: (RotateLeft32 x (MOVLconst [c]))
// cond:
// result: (ROLLconst [c&31] x)
for {
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != Op386MOVLconst {
break
}
c := v_1.AuxInt
v.reset(Op386ROLLconst)
v.AuxInt = c & 31
v.AddArg(x)
return true
}
return false
}
func rewriteValue386_OpRotateLeft8_0(v *Value) bool {
// match: (RotateLeft8 x (MOVLconst [c]))
// cond:
// result: (ROLBconst [c&7] x)
for {
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != Op386MOVLconst {
break
}
c := v_1.AuxInt
v.reset(Op386ROLBconst)
v.AuxInt = c & 7
v.AddArg(x)
return true
}
return false
}
func rewriteValue386_OpRound32F_0(v *Value) bool {
// match: (Round32F x)
// cond:

@@ -723,6 +723,12 @@ func rewriteValueARM(v *Value) bool {
return rewriteValueARM_OpPanicBounds_0(v)
case OpPanicExtend:
return rewriteValueARM_OpPanicExtend_0(v)
case OpRotateLeft16:
return rewriteValueARM_OpRotateLeft16_0(v)
case OpRotateLeft32:
return rewriteValueARM_OpRotateLeft32_0(v)
case OpRotateLeft8:
return rewriteValueARM_OpRotateLeft8_0(v)
case OpRound32F:
return rewriteValueARM_OpRound32F_0(v)
case OpRound64F:
@@ -20199,6 +20205,89 @@ func rewriteValueARM_OpPanicExtend_0(v *Value) bool {
}
return false
}
func rewriteValueARM_OpRotateLeft16_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft16 <t> x (MOVWconst [c]))
// cond:
// result: (Or16 (Lsh16x32 <t> x (MOVWconst [c&15])) (Rsh16Ux32 <t> x (MOVWconst [-c&15])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpARMMOVWconst {
break
}
c := v_1.AuxInt
v.reset(OpOr16)
v0 := b.NewValue0(v.Pos, OpLsh16x32, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpARMMOVWconst, typ.UInt32)
v1.AuxInt = c & 15
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh16Ux32, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpARMMOVWconst, typ.UInt32)
v3.AuxInt = -c & 15
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueARM_OpRotateLeft32_0(v *Value) bool {
// match: (RotateLeft32 x (MOVWconst [c]))
// cond:
// result: (SRRconst [-c&31] x)
for {
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpARMMOVWconst {
break
}
c := v_1.AuxInt
v.reset(OpARMSRRconst)
v.AuxInt = -c & 31
v.AddArg(x)
return true
}
return false
}
func rewriteValueARM_OpRotateLeft8_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft8 <t> x (MOVWconst [c]))
// cond:
// result: (Or8 (Lsh8x32 <t> x (MOVWconst [c&7])) (Rsh8Ux32 <t> x (MOVWconst [-c&7])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpARMMOVWconst {
break
}
c := v_1.AuxInt
v.reset(OpOr8)
v0 := b.NewValue0(v.Pos, OpLsh8x32, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpARMMOVWconst, typ.UInt32)
v1.AuxInt = c & 7
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh8Ux32, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpARMMOVWconst, typ.UInt32)
v3.AuxInt = -c & 7
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueARM_OpRound32F_0(v *Value) bool {
// match: (Round32F x)
// cond:

@@ -801,10 +801,14 @@ func rewriteValueARM64(v *Value) bool {
return rewriteValueARM64_OpPopCount32_0(v)
case OpPopCount64:
return rewriteValueARM64_OpPopCount64_0(v)
case OpRotateLeft16:
return rewriteValueARM64_OpRotateLeft16_0(v)
case OpRotateLeft32:
return rewriteValueARM64_OpRotateLeft32_0(v)
case OpRotateLeft64:
return rewriteValueARM64_OpRotateLeft64_0(v)
case OpRotateLeft8:
return rewriteValueARM64_OpRotateLeft8_0(v)
case OpRound:
return rewriteValueARM64_OpRound_0(v)
case OpRound32F:
@@ -35912,6 +35916,38 @@ func rewriteValueARM64_OpPopCount64_0(v *Value) bool {
return true
}
}
func rewriteValueARM64_OpRotateLeft16_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft16 <t> x (MOVDconst [c]))
// cond:
// result: (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpARM64MOVDconst {
break
}
c := v_1.AuxInt
v.reset(OpOr16)
v0 := b.NewValue0(v.Pos, OpLsh16x64, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
v1.AuxInt = c & 15
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh16Ux64, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
v3.AuxInt = -c & 15
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueARM64_OpRotateLeft32_0(v *Value) bool {
b := v.Block
// match: (RotateLeft32 x y)
@@ -35944,6 +35980,38 @@ func rewriteValueARM64_OpRotateLeft64_0(v *Value) bool {
return true
}
}
func rewriteValueARM64_OpRotateLeft8_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft8 <t> x (MOVDconst [c]))
// cond:
// result: (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpARM64MOVDconst {
break
}
c := v_1.AuxInt
v.reset(OpOr8)
v0 := b.NewValue0(v.Pos, OpLsh8x64, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
v1.AuxInt = c & 7
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh8Ux64, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
v3.AuxInt = -c & 7
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueARM64_OpRound_0(v *Value) bool {
// match: (Round x)
// cond:

@@ -409,6 +409,14 @@ func rewriteValueMIPS(v *Value) bool {
return rewriteValueMIPS_OpPanicBounds_0(v)
case OpPanicExtend:
return rewriteValueMIPS_OpPanicExtend_0(v)
case OpRotateLeft16:
return rewriteValueMIPS_OpRotateLeft16_0(v)
case OpRotateLeft32:
return rewriteValueMIPS_OpRotateLeft32_0(v)
case OpRotateLeft64:
return rewriteValueMIPS_OpRotateLeft64_0(v)
case OpRotateLeft8:
return rewriteValueMIPS_OpRotateLeft8_0(v)
case OpRound32F:
return rewriteValueMIPS_OpRound32F_0(v)
case OpRound64F:
@@ -7099,6 +7107,134 @@ func rewriteValueMIPS_OpPanicExtend_0(v *Value) bool {
}
return false
}
func rewriteValueMIPS_OpRotateLeft16_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft16 <t> x (MOVWconst [c]))
// cond:
// result: (Or16 (Lsh16x32 <t> x (MOVWconst [c&15])) (Rsh16Ux32 <t> x (MOVWconst [-c&15])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpMIPSMOVWconst {
break
}
c := v_1.AuxInt
v.reset(OpOr16)
v0 := b.NewValue0(v.Pos, OpLsh16x32, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpMIPSMOVWconst, typ.UInt32)
v1.AuxInt = c & 15
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh16Ux32, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpMIPSMOVWconst, typ.UInt32)
v3.AuxInt = -c & 15
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueMIPS_OpRotateLeft32_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft32 <t> x (MOVWconst [c]))
// cond:
// result: (Or32 (Lsh32x32 <t> x (MOVWconst [c&31])) (Rsh32Ux32 <t> x (MOVWconst [-c&31])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpMIPSMOVWconst {
break
}
c := v_1.AuxInt
v.reset(OpOr32)
v0 := b.NewValue0(v.Pos, OpLsh32x32, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpMIPSMOVWconst, typ.UInt32)
v1.AuxInt = c & 31
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh32Ux32, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpMIPSMOVWconst, typ.UInt32)
v3.AuxInt = -c & 31
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueMIPS_OpRotateLeft64_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft64 <t> x (MOVWconst [c]))
// cond:
// result: (Or64 (Lsh64x32 <t> x (MOVWconst [c&63])) (Rsh64Ux32 <t> x (MOVWconst [-c&63])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpMIPSMOVWconst {
break
}
c := v_1.AuxInt
v.reset(OpOr64)
v0 := b.NewValue0(v.Pos, OpLsh64x32, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpMIPSMOVWconst, typ.UInt32)
v1.AuxInt = c & 63
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh64Ux32, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpMIPSMOVWconst, typ.UInt32)
v3.AuxInt = -c & 63
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueMIPS_OpRotateLeft8_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft8 <t> x (MOVWconst [c]))
// cond:
// result: (Or8 (Lsh8x32 <t> x (MOVWconst [c&7])) (Rsh8Ux32 <t> x (MOVWconst [-c&7])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpMIPSMOVWconst {
break
}
c := v_1.AuxInt
v.reset(OpOr8)
v0 := b.NewValue0(v.Pos, OpLsh8x32, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpMIPSMOVWconst, typ.UInt32)
v1.AuxInt = c & 7
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh8Ux32, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpMIPSMOVWconst, typ.UInt32)
v3.AuxInt = -c & 7
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueMIPS_OpRound32F_0(v *Value) bool {
// match: (Round32F x)
// cond:

@@ -471,6 +471,14 @@ func rewriteValueMIPS64(v *Value) bool {
return rewriteValueMIPS64_OpOrB_0(v)
case OpPanicBounds:
return rewriteValueMIPS64_OpPanicBounds_0(v)
case OpRotateLeft16:
return rewriteValueMIPS64_OpRotateLeft16_0(v)
case OpRotateLeft32:
return rewriteValueMIPS64_OpRotateLeft32_0(v)
case OpRotateLeft64:
return rewriteValueMIPS64_OpRotateLeft64_0(v)
case OpRotateLeft8:
return rewriteValueMIPS64_OpRotateLeft8_0(v)
case OpRound32F:
return rewriteValueMIPS64_OpRound32F_0(v)
case OpRound64F:
@@ -7471,6 +7479,134 @@ func rewriteValueMIPS64_OpPanicBounds_0(v *Value) bool {
}
return false
}
func rewriteValueMIPS64_OpRotateLeft16_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft16 <t> x (MOVVconst [c]))
// cond:
// result: (Or16 (Lsh16x64 <t> x (MOVVconst [c&15])) (Rsh16Ux64 <t> x (MOVVconst [-c&15])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpMIPS64MOVVconst {
break
}
c := v_1.AuxInt
v.reset(OpOr16)
v0 := b.NewValue0(v.Pos, OpLsh16x64, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpMIPS64MOVVconst, typ.UInt64)
v1.AuxInt = c & 15
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh16Ux64, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpMIPS64MOVVconst, typ.UInt64)
v3.AuxInt = -c & 15
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueMIPS64_OpRotateLeft32_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft32 <t> x (MOVVconst [c]))
// cond:
// result: (Or32 (Lsh32x64 <t> x (MOVVconst [c&31])) (Rsh32Ux64 <t> x (MOVVconst [-c&31])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpMIPS64MOVVconst {
break
}
c := v_1.AuxInt
v.reset(OpOr32)
v0 := b.NewValue0(v.Pos, OpLsh32x64, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpMIPS64MOVVconst, typ.UInt64)
v1.AuxInt = c & 31
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh32Ux64, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpMIPS64MOVVconst, typ.UInt64)
v3.AuxInt = -c & 31
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueMIPS64_OpRotateLeft64_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft64 <t> x (MOVVconst [c]))
// cond:
// result: (Or64 (Lsh64x64 <t> x (MOVVconst [c&63])) (Rsh64Ux64 <t> x (MOVVconst [-c&63])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpMIPS64MOVVconst {
break
}
c := v_1.AuxInt
v.reset(OpOr64)
v0 := b.NewValue0(v.Pos, OpLsh64x64, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpMIPS64MOVVconst, typ.UInt64)
v1.AuxInt = c & 63
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh64Ux64, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpMIPS64MOVVconst, typ.UInt64)
v3.AuxInt = -c & 63
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueMIPS64_OpRotateLeft8_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft8 <t> x (MOVVconst [c]))
// cond:
// result: (Or8 (Lsh8x64 <t> x (MOVVconst [c&7])) (Rsh8Ux64 <t> x (MOVVconst [-c&7])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpMIPS64MOVVconst {
break
}
c := v_1.AuxInt
v.reset(OpOr8)
v0 := b.NewValue0(v.Pos, OpLsh8x64, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpMIPS64MOVVconst, typ.UInt64)
v1.AuxInt = c & 7
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh8Ux64, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpMIPS64MOVVconst, typ.UInt64)
v3.AuxInt = -c & 7
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueMIPS64_OpRound32F_0(v *Value) bool {
// match: (Round32F x)
// cond:

@@ -557,10 +557,14 @@ func rewriteValuePPC64(v *Value) bool {
return rewriteValuePPC64_OpPopCount64_0(v)
case OpPopCount8:
return rewriteValuePPC64_OpPopCount8_0(v)
case OpRotateLeft16:
return rewriteValuePPC64_OpRotateLeft16_0(v)
case OpRotateLeft32:
return rewriteValuePPC64_OpRotateLeft32_0(v)
case OpRotateLeft64:
return rewriteValuePPC64_OpRotateLeft64_0(v)
case OpRotateLeft8:
return rewriteValuePPC64_OpRotateLeft8_0(v)
case OpRound:
return rewriteValuePPC64_OpRound_0(v)
case OpRound32F:
@@ -26217,7 +26221,55 @@ func rewriteValuePPC64_OpPopCount8_0(v *Value) bool {
return true
}
}
func rewriteValuePPC64_OpRotateLeft16_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft16 <t> x (MOVDconst [c]))
// cond:
// result: (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpPPC64MOVDconst {
break
}
c := v_1.AuxInt
v.reset(OpOr16)
v0 := b.NewValue0(v.Pos, OpLsh16x64, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpPPC64MOVDconst, typ.Int64)
v1.AuxInt = c & 15
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh16Ux64, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpPPC64MOVDconst, typ.Int64)
v3.AuxInt = -c & 15
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValuePPC64_OpRotateLeft32_0(v *Value) bool {
// match: (RotateLeft32 x (MOVDconst [c]))
// cond:
// result: (ROTLWconst [c&31] x)
for {
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpPPC64MOVDconst {
break
}
c := v_1.AuxInt
v.reset(OpPPC64ROTLWconst)
v.AuxInt = c & 31
v.AddArg(x)
return true
}
// match: (RotateLeft32 x y)
// cond:
// result: (ROTLW x y)
@@ -26231,6 +26283,22 @@ func rewriteValuePPC64_OpRotateLeft32_0(v *Value) bool {
}
}
func rewriteValuePPC64_OpRotateLeft64_0(v *Value) bool {
// match: (RotateLeft64 x (MOVDconst [c]))
// cond:
// result: (ROTLconst [c&63] x)
for {
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpPPC64MOVDconst {
break
}
c := v_1.AuxInt
v.reset(OpPPC64ROTLconst)
v.AuxInt = c & 63
v.AddArg(x)
return true
}
// match: (RotateLeft64 x y)
// cond:
// result: (ROTL x y)
@@ -26243,6 +26311,38 @@ func rewriteValuePPC64_OpRotateLeft64_0(v *Value) bool {
return true
}
}
func rewriteValuePPC64_OpRotateLeft8_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft8 <t> x (MOVDconst [c]))
// cond:
// result: (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpPPC64MOVDconst {
break
}
c := v_1.AuxInt
v.reset(OpOr8)
v0 := b.NewValue0(v.Pos, OpLsh8x64, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpPPC64MOVDconst, typ.Int64)
v1.AuxInt = c & 7
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh8Ux64, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpPPC64MOVDconst, typ.Int64)
v3.AuxInt = -c & 7
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValuePPC64_OpRound_0(v *Value) bool {
// match: (Round x)
// cond:

@@ -395,10 +395,14 @@ func rewriteValueS390X(v *Value) bool {
return rewriteValueS390X_OpPopCount64_0(v)
case OpPopCount8:
return rewriteValueS390X_OpPopCount8_0(v)
case OpRotateLeft16:
return rewriteValueS390X_OpRotateLeft16_0(v)
case OpRotateLeft32:
return rewriteValueS390X_OpRotateLeft32_0(v)
case OpRotateLeft64:
return rewriteValueS390X_OpRotateLeft64_0(v)
case OpRotateLeft8:
return rewriteValueS390X_OpRotateLeft8_0(v)
case OpRound:
return rewriteValueS390X_OpRound_0(v)
case OpRound32F:
@@ -5090,6 +5094,38 @@ func rewriteValueS390X_OpPopCount8_0(v *Value) bool {
return true
}
}
func rewriteValueS390X_OpRotateLeft16_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft16 <t> x (MOVDconst [c]))
// cond:
// result: (Or16 (Lsh16x64 <t> x (MOVDconst [c&15])) (Rsh16Ux64 <t> x (MOVDconst [-c&15])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpS390XMOVDconst {
break
}
c := v_1.AuxInt
v.reset(OpOr16)
v0 := b.NewValue0(v.Pos, OpLsh16x64, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpS390XMOVDconst, typ.UInt64)
v1.AuxInt = c & 15
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh16Ux64, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpS390XMOVDconst, typ.UInt64)
v3.AuxInt = -c & 15
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueS390X_OpRotateLeft32_0(v *Value) bool {
// match: (RotateLeft32 x y)
// cond:
@@ -5116,6 +5152,38 @@ func rewriteValueS390X_OpRotateLeft64_0(v *Value) bool {
return true
}
}
func rewriteValueS390X_OpRotateLeft8_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft8 <t> x (MOVDconst [c]))
// cond:
// result: (Or8 (Lsh8x64 <t> x (MOVDconst [c&7])) (Rsh8Ux64 <t> x (MOVDconst [-c&7])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpS390XMOVDconst {
break
}
c := v_1.AuxInt
v.reset(OpOr8)
v0 := b.NewValue0(v.Pos, OpLsh8x64, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpS390XMOVDconst, typ.UInt64)
v1.AuxInt = c & 7
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh8Ux64, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpS390XMOVDconst, typ.UInt64)
v3.AuxInt = -c & 7
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueS390X_OpRound_0(v *Value) bool {
// match: (Round x)
// cond:

@@ -381,8 +381,14 @@ func rewriteValueWasm(v *Value) bool {
return rewriteValueWasm_OpPopCount64_0(v)
case OpPopCount8:
return rewriteValueWasm_OpPopCount8_0(v)
case OpRotateLeft16:
return rewriteValueWasm_OpRotateLeft16_0(v)
case OpRotateLeft32:
return rewriteValueWasm_OpRotateLeft32_0(v)
case OpRotateLeft64:
return rewriteValueWasm_OpRotateLeft64_0(v)
case OpRotateLeft8:
return rewriteValueWasm_OpRotateLeft8_0(v)
case OpRound32F:
return rewriteValueWasm_OpRound32F_0(v)
case OpRound64F:
@@ -3763,6 +3769,70 @@ func rewriteValueWasm_OpPopCount8_0(v *Value) bool {
return true
}
}
func rewriteValueWasm_OpRotateLeft16_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft16 <t> x (I64Const [c]))
// cond:
// result: (Or16 (Lsh16x64 <t> x (I64Const [c&15])) (Rsh16Ux64 <t> x (I64Const [-c&15])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpWasmI64Const {
break
}
c := v_1.AuxInt
v.reset(OpOr16)
v0 := b.NewValue0(v.Pos, OpLsh16x64, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpWasmI64Const, typ.Int64)
v1.AuxInt = c & 15
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh16Ux64, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpWasmI64Const, typ.Int64)
v3.AuxInt = -c & 15
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueWasm_OpRotateLeft32_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft32 <t> x (I64Const [c]))
// cond:
// result: (Or32 (Lsh32x64 <t> x (I64Const [c&31])) (Rsh32Ux64 <t> x (I64Const [-c&31])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpWasmI64Const {
break
}
c := v_1.AuxInt
v.reset(OpOr32)
v0 := b.NewValue0(v.Pos, OpLsh32x64, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpWasmI64Const, typ.Int64)
v1.AuxInt = c & 31
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh32Ux64, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpWasmI64Const, typ.Int64)
v3.AuxInt = -c & 31
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueWasm_OpRotateLeft64_0(v *Value) bool {
// match: (RotateLeft64 x y)
// cond:
@@ -3776,6 +3846,38 @@ func rewriteValueWasm_OpRotateLeft64_0(v *Value) bool {
return true
}
}
func rewriteValueWasm_OpRotateLeft8_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (RotateLeft8 <t> x (I64Const [c]))
// cond:
// result: (Or8 (Lsh8x64 <t> x (I64Const [c&7])) (Rsh8Ux64 <t> x (I64Const [-c&7])))
for {
t := v.Type
_ = v.Args[1]
x := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpWasmI64Const {
break
}
c := v_1.AuxInt
v.reset(OpOr8)
v0 := b.NewValue0(v.Pos, OpLsh8x64, t)
v0.AddArg(x)
v1 := b.NewValue0(v.Pos, OpWasmI64Const, typ.Int64)
v1.AuxInt = c & 7
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpRsh8Ux64, t)
v2.AddArg(x)
v3 := b.NewValue0(v.Pos, OpWasmI64Const, typ.Int64)
v3.AuxInt = -c & 7
v2.AddArg(v3)
v.AddArg(v2)
return true
}
return false
}
func rewriteValueWasm_OpRound32F_0(v *Value) bool {
// match: (Round32F x)
// cond:

File diff suppressed because it is too large.

@@ -18,11 +18,29 @@ func BenchmarkDivconstI64(b *testing.B) {
}
}
func BenchmarkModconstI64(b *testing.B) {
for i := 0; i < b.N; i++ {
i64res = int64(i) % 7
}
}
func BenchmarkDivisiblePow2constI64(b *testing.B) {
for i := 0; i < b.N; i++ {
boolres = int64(i)%16 == 0
}
}
func BenchmarkDivisibleconstI64(b *testing.B) {
for i := 0; i < b.N; i++ {
boolres = int64(i)%7 == 0
}
}
func BenchmarkDivisibleWDivconstI64(b *testing.B) {
for i := 0; i < b.N; i++ {
i64res = int64(i) / 7
boolres = int64(i)%7 == 0
}
}
var u64res uint64
@@ -32,6 +50,25 @@ func BenchmarkDivconstU64(b *testing.B) {
}
}
func BenchmarkModconstU64(b *testing.B) {
for i := 0; i < b.N; i++ {
u64res = uint64(i) % 7
}
}
func BenchmarkDivisibleconstU64(b *testing.B) {
for i := 0; i < b.N; i++ {
boolres = uint64(i)%7 == 0
}
}
func BenchmarkDivisibleWDivconstU64(b *testing.B) {
for i := 0; i < b.N; i++ {
u64res = uint64(i) / 7
boolres = uint64(i)%7 == 0
}
}
var i32res int32
func BenchmarkDivconstI32(b *testing.B) {
@@ -40,12 +77,31 @@ func BenchmarkDivconstI32(b *testing.B) {
}
}
func BenchmarkModconstI32(b *testing.B) {
for i := 0; i < b.N; i++ {
i32res = int32(i) % 7
}
}
func BenchmarkDivisiblePow2constI32(b *testing.B) {
for i := 0; i < b.N; i++ {
boolres = int32(i)%16 == 0
}
}
func BenchmarkDivisibleconstI32(b *testing.B) {
for i := 0; i < b.N; i++ {
boolres = int32(i)%7 == 0
}
}
func BenchmarkDivisibleWDivconstI32(b *testing.B) {
for i := 0; i < b.N; i++ {
i32res = int32(i) / 7
boolres = int32(i)%7 == 0
}
}
var u32res uint32
func BenchmarkDivconstU32(b *testing.B) {
@@ -54,6 +110,25 @@ func BenchmarkDivconstU32(b *testing.B) {
}
}
func BenchmarkModconstU32(b *testing.B) {
for i := 0; i < b.N; i++ {
u32res = uint32(i) % 7
}
}
func BenchmarkDivisibleconstU32(b *testing.B) {
for i := 0; i < b.N; i++ {
boolres = uint32(i)%7 == 0
}
}
func BenchmarkDivisibleWDivconstU32(b *testing.B) {
for i := 0; i < b.N; i++ {
u32res = uint32(i) / 7
boolres = uint32(i)%7 == 0
}
}
var i16res int16
func BenchmarkDivconstI16(b *testing.B) {
@@ -62,12 +137,31 @@ func BenchmarkDivconstI16(b *testing.B) {
}
}
func BenchmarkModconstI16(b *testing.B) {
for i := 0; i < b.N; i++ {
i16res = int16(i) % 7
}
}
func BenchmarkDivisiblePow2constI16(b *testing.B) {
for i := 0; i < b.N; i++ {
boolres = int16(i)%16 == 0
}
}
func BenchmarkDivisibleconstI16(b *testing.B) {
for i := 0; i < b.N; i++ {
boolres = int16(i)%7 == 0
}
}
func BenchmarkDivisibleWDivconstI16(b *testing.B) {
for i := 0; i < b.N; i++ {
i16res = int16(i) / 7
boolres = int16(i)%7 == 0
}
}
var u16res uint16
func BenchmarkDivconstU16(b *testing.B) {
@@ -76,6 +170,25 @@ func BenchmarkDivconstU16(b *testing.B) {
}
}
func BenchmarkModconstU16(b *testing.B) {
for i := 0; i < b.N; i++ {
u16res = uint16(i) % 7
}
}
func BenchmarkDivisibleconstU16(b *testing.B) {
for i := 0; i < b.N; i++ {
boolres = uint16(i)%7 == 0
}
}
func BenchmarkDivisibleWDivconstU16(b *testing.B) {
for i := 0; i < b.N; i++ {
u16res = uint16(i) / 7
boolres = uint16(i)%7 == 0
}
}
var i8res int8
func BenchmarkDivconstI8(b *testing.B) {
@@ -84,12 +197,31 @@ func BenchmarkDivconstI8(b *testing.B) {
}
}
func BenchmarkModconstI8(b *testing.B) {
for i := 0; i < b.N; i++ {
i8res = int8(i) % 7
}
}
func BenchmarkDivisiblePow2constI8(b *testing.B) {
for i := 0; i < b.N; i++ {
boolres = int8(i)%16 == 0
}
}
func BenchmarkDivisibleconstI8(b *testing.B) {
for i := 0; i < b.N; i++ {
boolres = int8(i)%7 == 0
}
}
func BenchmarkDivisibleWDivconstI8(b *testing.B) {
for i := 0; i < b.N; i++ {
i8res = int8(i) / 7
boolres = int8(i)%7 == 0
}
}
var u8res uint8
func BenchmarkDivconstU8(b *testing.B) {
@@ -97,3 +229,22 @@ func BenchmarkDivconstU8(b *testing.B) {
u8res = uint8(i) / 7
}
}
func BenchmarkModconstU8(b *testing.B) {
for i := 0; i < b.N; i++ {
u8res = uint8(i) % 7
}
}
func BenchmarkDivisibleconstU8(b *testing.B) {
for i := 0; i < b.N; i++ {
boolres = uint8(i)%7 == 0
}
}
func BenchmarkDivisibleWDivconstU8(b *testing.B) {
for i := 0; i < b.N; i++ {
u8res = uint8(i) / 7
boolres = uint8(i)%7 == 0
}
}

@@ -215,6 +215,26 @@ func ConstMods(n1 uint, n2 int) (uint, int) {
return a, b
}
// Check that divisibility checks x%c==0 are converted to MULs and rotates
func Divisible(n uint) (even, odd bool) {
// amd64:"MOVQ\t[$]-6148914691236517205","IMULQ","ROLQ\t[$]63",-"DIVQ"
// 386:"IMUL3L\t[$]-1431655765","ROLL\t[$]31",-"DIVQ"
// arm64:"MOVD\t[$]-6148914691236517205","MUL","ROR",-"DIV"
// arm:"MUL","CMP\t[$]715827882",-".*udiv"
// ppc64:"MULLD","ROTL\t[$]63"
// ppc64le:"MULLD","ROTL\t[$]63"
even = n%6 == 0
// amd64:"MOVQ\t[$]-8737931403336103397","IMULQ",-"ROLQ",-"DIVQ"
// 386:"IMUL3L\t[$]678152731",-"ROLL",-"DIVQ"
// arm64:"MOVD\t[$]-8737931403336103397","MUL",-"ROR",-"DIV"
// arm:"MUL","CMP\t[$]226050910",-".*udiv"
// ppc64:"MULLD",-"ROTL"
// ppc64le:"MULLD",-"ROTL"
odd = n%19 == 0
return
}
// Check that fix-up code is not generated for divisions where it has been proven
// that the divisor is not -1 or that the dividend is > MinIntNN.
func NoFix64A(divr int64) (int64, int64) {