diff --git a/misc/cgo/test/test.go b/misc/cgo/test/test.go
index 8c69ad91ac7..35bc3a14475 100644
--- a/misc/cgo/test/test.go
+++ b/misc/cgo/test/test.go
@@ -901,6 +901,12 @@ typedef struct S32579 { unsigned char data[1]; } S32579;
 // issue 38649
 // Test that #define'd type aliases work.
 #define netbsd_gid unsigned int
+
+// issue 40494
+// Inconsistent handling of tagged enum and union types.
+enum Enum40494 { X_40494 };
+union Union40494 { int x; };
+void issue40494(enum Enum40494 e, union Union40494* up) {}
 */
 import "C"
 
@@ -2204,3 +2210,10 @@ var issue38649 C.netbsd_gid = 42
 // issue 39877
 
 var issue39877 *C.void = nil
+
+// issue 40494
+// No runtime test; just make sure it compiles.
+
+func Issue40494() {
+	C.issue40494(C.enum_Enum40494(C.X_40494), (*C.union_Union40494)(nil)) // pass a tagged enum value and a nil tagged union pointer
+}
diff --git a/misc/cgo/testshared/shared_test.go b/misc/cgo/testshared/shared_test.go
index f8dabbe7a01..5e0893784b6 100644
--- a/misc/cgo/testshared/shared_test.go
+++ b/misc/cgo/testshared/shared_test.go
@@ -462,6 +462,7 @@ func TestTrivialExecutable(t *testing.T) {
 	run(t, "trivial executable", "../../bin/trivial")
 	AssertIsLinkedTo(t, "../../bin/trivial", soname)
 	AssertHasRPath(t, "../../bin/trivial", gorootInstallDir)
+	checkSize(t, "../../bin/trivial", 100000) // it is 19K on linux/amd64, 100K should be enough
 }
 
 // Build a trivial program in PIE mode that links against the shared runtime and check it runs.
@@ -470,6 +471,18 @@ func TestTrivialExecutablePIE(t *testing.T) {
 	run(t, "trivial executable", "./trivial.pie")
 	AssertIsLinkedTo(t, "./trivial.pie", soname)
 	AssertHasRPath(t, "./trivial.pie", gorootInstallDir)
+	checkSize(t, "./trivial.pie", 100000) // it is 19K on linux/amd64, 100K should be enough
+}
+
+// checkSize fails the test if f cannot be stat'ed and reports an error if its size exceeds limit bytes.
+func checkSize(t *testing.T, f string, limit int64) {
+	fi, err := os.Stat(f)
+	if err != nil {
+		t.Fatalf("stat failed: %v", err)
+	}
+	if sz := fi.Size(); sz > limit {
+		t.Errorf("file too large: got %d, want <= %d", sz, limit)
+	}
 }
 
 // Build a division test program and check it runs.
diff --git a/src/cmd/cgo/out.go b/src/cmd/cgo/out.go
index 6c221473e08..4064f0ae418 100644
--- a/src/cmd/cgo/out.go
+++ b/src/cmd/cgo/out.go
@@ -123,7 +123,9 @@ func (p *Package) writeDefs() {
 		// Moreover, empty file name makes compile emit no source debug info at all.
 		var buf bytes.Buffer
 		noSourceConf.Fprint(&buf, fset, def.Go)
-		if bytes.HasPrefix(buf.Bytes(), []byte("_Ctype_")) {
+		if bytes.HasPrefix(buf.Bytes(), []byte("_Ctype_")) ||
+			strings.HasPrefix(name, "_Ctype_enum_") ||
+			strings.HasPrefix(name, "_Ctype_union_") {
 			// This typedef is of the form `typedef a b` and should be an alias.
 			fmt.Fprintf(fgo2, "= ")
 		}
diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 47cb422ab11..9d8a0920b39 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -874,7 +874,11 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		ssa.OpAMD64SUBLloadidx1, ssa.OpAMD64SUBLloadidx4, ssa.OpAMD64SUBLloadidx8, ssa.OpAMD64SUBQloadidx1, ssa.OpAMD64SUBQloadidx8,
 		ssa.OpAMD64ANDLloadidx1, ssa.OpAMD64ANDLloadidx4, ssa.OpAMD64ANDLloadidx8, ssa.OpAMD64ANDQloadidx1, ssa.OpAMD64ANDQloadidx8,
 		ssa.OpAMD64ORLloadidx1, ssa.OpAMD64ORLloadidx4, ssa.OpAMD64ORLloadidx8, ssa.OpAMD64ORQloadidx1, ssa.OpAMD64ORQloadidx8,
-		ssa.OpAMD64XORLloadidx1, ssa.OpAMD64XORLloadidx4, ssa.OpAMD64XORLloadidx8, ssa.OpAMD64XORQloadidx1, ssa.OpAMD64XORQloadidx8:
+		ssa.OpAMD64XORLloadidx1, ssa.OpAMD64XORLloadidx4, ssa.OpAMD64XORLloadidx8, ssa.OpAMD64XORQloadidx1, ssa.OpAMD64XORQloadidx8,
+		ssa.OpAMD64ADDSSloadidx1, ssa.OpAMD64ADDSSloadidx4, ssa.OpAMD64ADDSDloadidx1, ssa.OpAMD64ADDSDloadidx8,
+		ssa.OpAMD64SUBSSloadidx1, ssa.OpAMD64SUBSSloadidx4, ssa.OpAMD64SUBSDloadidx1, ssa.OpAMD64SUBSDloadidx8,
+		ssa.OpAMD64MULSSloadidx1, ssa.OpAMD64MULSSloadidx4, ssa.OpAMD64MULSDloadidx1, ssa.OpAMD64MULSDloadidx8,
+		ssa.OpAMD64DIVSSloadidx1, ssa.OpAMD64DIVSSloadidx4, ssa.OpAMD64DIVSDloadidx1, ssa.OpAMD64DIVSDloadidx8:
 		p := s.Prog(v.Op.Asm())
 
 		r, i := v.Args[1].Reg(), v.Args[2].Reg()
diff --git a/src/cmd/compile/internal/ssa/addressingmodes.go b/src/cmd/compile/internal/ssa/addressingmodes.go
index 78c979b7cb0..97a5ab4f031 100644
--- a/src/cmd/compile/internal/ssa/addressingmodes.go
+++ b/src/cmd/compile/internal/ssa/addressingmodes.go
@@ -321,6 +321,23 @@ var combine = map[[2]Op]Op{
 	[2]Op{OpAMD64XORQconstmodify, OpAMD64LEAQ1}: OpAMD64XORQconstmodifyidx1,
 	[2]Op{OpAMD64XORQconstmodify, OpAMD64LEAQ8}: OpAMD64XORQconstmodifyidx8,
 
+	[2]Op{OpAMD64ADDSSload, OpAMD64LEAQ1}: OpAMD64ADDSSloadidx1,
+	[2]Op{OpAMD64ADDSSload, OpAMD64LEAQ4}: OpAMD64ADDSSloadidx4,
+	[2]Op{OpAMD64ADDSDload, OpAMD64LEAQ1}: OpAMD64ADDSDloadidx1,
+	[2]Op{OpAMD64ADDSDload, OpAMD64LEAQ8}: OpAMD64ADDSDloadidx8,
+	[2]Op{OpAMD64SUBSSload, OpAMD64LEAQ1}: OpAMD64SUBSSloadidx1,
+	[2]Op{OpAMD64SUBSSload, OpAMD64LEAQ4}: OpAMD64SUBSSloadidx4,
+	[2]Op{OpAMD64SUBSDload, OpAMD64LEAQ1}: OpAMD64SUBSDloadidx1,
+	[2]Op{OpAMD64SUBSDload, OpAMD64LEAQ8}: OpAMD64SUBSDloadidx8,
+	[2]Op{OpAMD64MULSSload, OpAMD64LEAQ1}: OpAMD64MULSSloadidx1,
+	[2]Op{OpAMD64MULSSload, OpAMD64LEAQ4}: OpAMD64MULSSloadidx4,
+	[2]Op{OpAMD64MULSDload, OpAMD64LEAQ1}: OpAMD64MULSDloadidx1,
+	[2]Op{OpAMD64MULSDload, OpAMD64LEAQ8}: OpAMD64MULSDloadidx8,
+	[2]Op{OpAMD64DIVSSload, OpAMD64LEAQ1}: OpAMD64DIVSSloadidx1,
+	[2]Op{OpAMD64DIVSSload, OpAMD64LEAQ4}: OpAMD64DIVSSloadidx4,
+	[2]Op{OpAMD64DIVSDload, OpAMD64LEAQ1}: OpAMD64DIVSDloadidx1,
+	[2]Op{OpAMD64DIVSDload, OpAMD64LEAQ8}: OpAMD64DIVSDloadidx8,
+
 	// 386
 	[2]Op{Op386MOVBload, Op386ADDL}:  Op386MOVBloadidx1,
 	[2]Op{Op386MOVWload, Op386ADDL}:  Op386MOVWloadidx1,
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
index cd9cb515c02..a3b29049df6 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@@ -149,14 +149,15 @@ func init() {
 		gpstorexchg     = regInfo{inputs: []regMask{gp, gpspsb, 0}, outputs: []regMask{gp}}
 		cmpxchg         = regInfo{inputs: []regMask{gp, ax, gp, 0}, outputs: []regMask{gp, 0}, clobbers: ax}
 
-		fp01     = regInfo{inputs: nil, outputs: fponly}
-		fp21     = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
-		fp31     = regInfo{inputs: []regMask{fp, fp, fp}, outputs: fponly}
-		fp21load = regInfo{inputs: []regMask{fp, gpspsb, 0}, outputs: fponly}
-		fpgp     = regInfo{inputs: fponly, outputs: gponly}
-		gpfp     = regInfo{inputs: gponly, outputs: fponly}
-		fp11     = regInfo{inputs: fponly, outputs: fponly}
-		fp2flags = regInfo{inputs: []regMask{fp, fp}}
+		fp01        = regInfo{inputs: nil, outputs: fponly}
+		fp21        = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
+		fp31        = regInfo{inputs: []regMask{fp, fp, fp}, outputs: fponly}
+		fp21load    = regInfo{inputs: []regMask{fp, gpspsb, 0}, outputs: fponly}
+		fp21loadidx = regInfo{inputs: []regMask{fp, gpspsb, gpspsb, 0}, outputs: fponly}
+		fpgp        = regInfo{inputs: fponly, outputs: gponly}
+		gpfp        = regInfo{inputs: gponly, outputs: fponly}
+		fp11        = regInfo{inputs: fponly, outputs: fponly}
+		fp2flags    = regInfo{inputs: []regMask{fp, fp}}
 
 		fpload    = regInfo{inputs: []regMask{gpspsb, 0}, outputs: fponly}
 		fploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: fponly}
@@ -201,6 +202,23 @@ func init() {
 		{name: "DIVSSload", argLength: 3, reg: fp21load, asm: "DIVSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
 		{name: "DIVSDload", argLength: 3, reg: fp21load, asm: "DIVSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
 
+		{name: "ADDSSloadidx1", argLength: 4, reg: fp21loadidx, asm: "ADDSS", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp32 arg0 + tmp, tmp loaded from arg1+arg2+auxint+aux, arg3 = mem
+		{name: "ADDSSloadidx4", argLength: 4, reg: fp21loadidx, asm: "ADDSS", scale: 4, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp32 arg0 + tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
+		{name: "ADDSDloadidx1", argLength: 4, reg: fp21loadidx, asm: "ADDSD", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp64 arg0 + tmp, tmp loaded from arg1+arg2+auxint+aux, arg3 = mem
+		{name: "ADDSDloadidx8", argLength: 4, reg: fp21loadidx, asm: "ADDSD", scale: 8, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp64 arg0 + tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+		{name: "SUBSSloadidx1", argLength: 4, reg: fp21loadidx, asm: "SUBSS", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp32 arg0 - tmp, tmp loaded from arg1+arg2+auxint+aux, arg3 = mem
+		{name: "SUBSSloadidx4", argLength: 4, reg: fp21loadidx, asm: "SUBSS", scale: 4, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp32 arg0 - tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
+		{name: "SUBSDloadidx1", argLength: 4, reg: fp21loadidx, asm: "SUBSD", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp64 arg0 - tmp, tmp loaded from arg1+arg2+auxint+aux, arg3 = mem
+		{name: "SUBSDloadidx8", argLength: 4, reg: fp21loadidx, asm: "SUBSD", scale: 8, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp64 arg0 - tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+		{name: "MULSSloadidx1", argLength: 4, reg: fp21loadidx, asm: "MULSS", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp32 arg0 * tmp, tmp loaded from arg1+arg2+auxint+aux, arg3 = mem
+		{name: "MULSSloadidx4", argLength: 4, reg: fp21loadidx, asm: "MULSS", scale: 4, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp32 arg0 * tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
+		{name: "MULSDloadidx1", argLength: 4, reg: fp21loadidx, asm: "MULSD", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp64 arg0 * tmp, tmp loaded from arg1+arg2+auxint+aux, arg3 = mem
+		{name: "MULSDloadidx8", argLength: 4, reg: fp21loadidx, asm: "MULSD", scale: 8, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp64 arg0 * tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+		{name: "DIVSSloadidx1", argLength: 4, reg: fp21loadidx, asm: "DIVSS", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp32 arg0 / tmp, tmp loaded from arg1+arg2+auxint+aux, arg3 = mem
+		{name: "DIVSSloadidx4", argLength: 4, reg: fp21loadidx, asm: "DIVSS", scale: 4, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp32 arg0 / tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
+		{name: "DIVSDloadidx1", argLength: 4, reg: fp21loadidx, asm: "DIVSD", scale: 1, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp64 arg0 / tmp, tmp loaded from arg1+arg2+auxint+aux, arg3 = mem
+		{name: "DIVSDloadidx8", argLength: 4, reg: fp21loadidx, asm: "DIVSD", scale: 8, aux: "SymOff", resultInArg0: true, symEffect: "Read"}, // fp64 arg0 / tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
+
 		// binary ops
 		{name: "ADDQ", argLength: 2, reg: gp21sp, asm: "ADDQ", commutative: true, clobberFlags: true},                                                                   // arg0 + arg1
 		{name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true, clobberFlags: true},                                                                   // arg0 + arg1
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index d27682e3b38..9efa1bfcc4b 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -572,6 +572,22 @@ const (
 	OpAMD64MULSDload
 	OpAMD64DIVSSload
 	OpAMD64DIVSDload
+	OpAMD64ADDSSloadidx1
+	OpAMD64ADDSSloadidx4
+	OpAMD64ADDSDloadidx1
+	OpAMD64ADDSDloadidx8
+	OpAMD64SUBSSloadidx1
+	OpAMD64SUBSSloadidx4
+	OpAMD64SUBSDloadidx1
+	OpAMD64SUBSDloadidx8
+	OpAMD64MULSSloadidx1
+	OpAMD64MULSSloadidx4
+	OpAMD64MULSDloadidx1
+	OpAMD64MULSDloadidx8
+	OpAMD64DIVSSloadidx1
+	OpAMD64DIVSSloadidx4
+	OpAMD64DIVSDloadidx1
+	OpAMD64DIVSDloadidx8
 	OpAMD64ADDQ
 	OpAMD64ADDL
 	OpAMD64ADDQconst
@@ -6576,6 +6592,310 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:         "ADDSSloadidx1",
+		auxType:      auxSymOff,
+		argLen:       4,
+		resultInArg0: true,
+		symEffect:    SymRead,
+		asm:          x86.AADDSS,
+		scale:        1,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+				{2, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
+	{
+		name:         "ADDSSloadidx4",
+		auxType:      auxSymOff,
+		argLen:       4,
+		resultInArg0: true,
+		symEffect:    SymRead,
+		asm:          x86.AADDSS,
+		scale:        4,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+				{2, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
+	{
+		name:         "ADDSDloadidx1",
+		auxType:      auxSymOff,
+		argLen:       4,
+		resultInArg0: true,
+		symEffect:    SymRead,
+		asm:          x86.AADDSD,
+		scale:        1,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+				{2, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
+	{
+		name:         "ADDSDloadidx8",
+		auxType:      auxSymOff,
+		argLen:       4,
+		resultInArg0: true,
+		symEffect:    SymRead,
+		asm:          x86.AADDSD,
+		scale:        8,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+				{2, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
+	{
+		name:         "SUBSSloadidx1",
+		auxType:      auxSymOff,
+		argLen:       4,
+		resultInArg0: true,
+		symEffect:    SymRead,
+		asm:          x86.ASUBSS,
+		scale:        1,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+				{2, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
+	{
+		name:         "SUBSSloadidx4",
+		auxType:      auxSymOff,
+		argLen:       4,
+		resultInArg0: true,
+		symEffect:    SymRead,
+		asm:          x86.ASUBSS,
+		scale:        4,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+				{2, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
+	{
+		name:         "SUBSDloadidx1",
+		auxType:      auxSymOff,
+		argLen:       4,
+		resultInArg0: true,
+		symEffect:    SymRead,
+		asm:          x86.ASUBSD,
+		scale:        1,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+				{2, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
+	{
+		name:         "SUBSDloadidx8",
+		auxType:      auxSymOff,
+		argLen:       4,
+		resultInArg0: true,
+		symEffect:    SymRead,
+		asm:          x86.ASUBSD,
+		scale:        8,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+				{2, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
+	{
+		name:         "MULSSloadidx1",
+		auxType:      auxSymOff,
+		argLen:       4,
+		resultInArg0: true,
+		symEffect:    SymRead,
+		asm:          x86.AMULSS,
+		scale:        1,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+				{2, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
+	{
+		name:         "MULSSloadidx4",
+		auxType:      auxSymOff,
+		argLen:       4,
+		resultInArg0: true,
+		symEffect:    SymRead,
+		asm:          x86.AMULSS,
+		scale:        4,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+				{2, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
+	{
+		name:         "MULSDloadidx1",
+		auxType:      auxSymOff,
+		argLen:       4,
+		resultInArg0: true,
+		symEffect:    SymRead,
+		asm:          x86.AMULSD,
+		scale:        1,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+				{2, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
+	{
+		name:         "MULSDloadidx8",
+		auxType:      auxSymOff,
+		argLen:       4,
+		resultInArg0: true,
+		symEffect:    SymRead,
+		asm:          x86.AMULSD,
+		scale:        8,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+				{2, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
+	{
+		name:         "DIVSSloadidx1",
+		auxType:      auxSymOff,
+		argLen:       4,
+		resultInArg0: true,
+		symEffect:    SymRead,
+		asm:          x86.ADIVSS,
+		scale:        1,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+				{2, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
+	{
+		name:         "DIVSSloadidx4",
+		auxType:      auxSymOff,
+		argLen:       4,
+		resultInArg0: true,
+		symEffect:    SymRead,
+		asm:          x86.ADIVSS,
+		scale:        4,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+				{2, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
+	{
+		name:         "DIVSDloadidx1",
+		auxType:      auxSymOff,
+		argLen:       4,
+		resultInArg0: true,
+		symEffect:    SymRead,
+		asm:          x86.ADIVSD,
+		scale:        1,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+				{2, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
+	{
+		name:         "DIVSDloadidx8",
+		auxType:      auxSymOff,
+		argLen:       4,
+		resultInArg0: true,
+		symEffect:    SymRead,
+		asm:          x86.ADIVSD,
+		scale:        8,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+				{2, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
 	{
 		name:         "ADDQ",
 		argLen:       2,
diff --git a/src/cmd/compile/internal/ssa/prove.go b/src/cmd/compile/internal/ssa/prove.go
index a8e43d01147..6c6be39d34b 100644
--- a/src/cmd/compile/internal/ssa/prove.go
+++ b/src/cmd/compile/internal/ssa/prove.go
@@ -1051,6 +1051,11 @@ func addLocalInductiveFacts(ft *factsTable, b *Block) {
 	//
 	// If all of these conditions are true, then i1 < max and i1 >= min.
 
+	// Skip blocks that do not have exactly two predecessors, to ensure this is a loop header node.
+	if len(b.Preds) != 2 {
+		return
+	}
+
 	for _, i1 := range b.Values {
 		if i1.Op != OpPhi {
 			continue
@@ -1093,6 +1098,9 @@ func addLocalInductiveFacts(ft *factsTable, b *Block) {
 				}
 				br = negative
 			}
+			if br == unknown {
+				continue
+			}
 
 			tr, has := domainRelationTable[control.Op]
 			if !has {
diff --git a/src/cmd/link/internal/ld/lib.go b/src/cmd/link/internal/ld/lib.go
index ef4c86719e0..5271b8f3484 100644
--- a/src/cmd/link/internal/ld/lib.go
+++ b/src/cmd/link/internal/ld/lib.go
@@ -2070,17 +2070,6 @@ func ldshlibsyms(ctxt *Link, shlib string) {
 		l.SetSymElfType(s, elf.ST_TYPE(elfsym.Info))
 		su.SetSize(int64(elfsym.Size))
 		if elfsym.Section != elf.SHN_UNDEF {
-			// If it's not undefined, mark the symbol as reachable
-			// so as to protect it from dead code elimination,
-			// even if there aren't any explicit references to it.
-			// Under the previous sym.Symbol based regime this
-			// wasn't necessary, but for the loader-based deadcode
-			// it is definitely needed.
-			//
-			// FIXME: have a more general/flexible mechanism for this?
-			//
-			l.SetAttrReachable(s, true)
-
 			// Set .File for the library that actually defines the symbol.
 			l.SetSymPkg(s, libpath)
 
diff --git a/src/crypto/ed25519/ed25519.go b/src/crypto/ed25519/ed25519.go
index 5766970f827..6f59bb5cffb 100644
--- a/src/crypto/ed25519/ed25519.go
+++ b/src/crypto/ed25519/ed25519.go
@@ -154,7 +154,7 @@ func Sign(privateKey PrivateKey, message []byte) []byte {
 	return signature
 }
 
-func signGeneric(signature, privateKey, message []byte) {
+func sign(signature, privateKey, message []byte) {
 	if l := len(privateKey); l != PrivateKeySize {
 		panic("ed25519: bad private key length: " + strconv.Itoa(l))
 	}
@@ -201,10 +201,6 @@ func signGeneric(signature, privateKey, message []byte) {
 // Verify reports whether sig is a valid signature of message by publicKey. It
 // will panic if len(publicKey) is not PublicKeySize.
 func Verify(publicKey PublicKey, message, sig []byte) bool {
-	return verify(publicKey, message, sig)
-}
-
-func verifyGeneric(publicKey PublicKey, message, sig []byte) bool {
 	if l := len(publicKey); l != PublicKeySize {
 		panic("ed25519: bad public key length: " + strconv.Itoa(l))
 	}
diff --git a/src/crypto/ed25519/ed25519_noasm.go b/src/crypto/ed25519/ed25519_noasm.go
deleted file mode 100644
index caa84f74fbc..00000000000
--- a/src/crypto/ed25519/ed25519_noasm.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2020 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !s390x
-
-package ed25519
-
-func sign(signature, privateKey, message []byte) {
-	signGeneric(signature, privateKey, message)
-}
-
-func verify(publicKey PublicKey, message, sig []byte) bool {
-	return verifyGeneric(publicKey, message, sig)
-}
diff --git a/src/crypto/ed25519/ed25519_s390x.go b/src/crypto/ed25519/ed25519_s390x.go
deleted file mode 100644
index c8627a06523..00000000000
--- a/src/crypto/ed25519/ed25519_s390x.go
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright 2020 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package ed25519
-
-import (
-	"internal/cpu"
-	"strconv"
-)
-
-//go:noescape
-func kdsaSign(message, signature, privateKey []byte) bool
-
-//go:noescape
-func kdsaVerify(message, signature, publicKey []byte) bool
-
-// sign does a check to see if hardware has Edwards Curve instruction available.
-// If it does, use the hardware implementation. Otherwise, use the generic version.
-func sign(signature, privateKey, message []byte) {
-	if cpu.S390X.HasEDDSA {
-		if l := len(privateKey); l != PrivateKeySize {
-			panic("ed25519: bad private key length: " + strconv.Itoa(l))
-		}
-
-		ret := kdsaSign(message, signature, privateKey[:32])
-		if !ret {
-			panic("ed25519: kdsa sign has a failure")
-		}
-		return
-	}
-	signGeneric(signature, privateKey, message)
-}
-
-// verify does a check to see if hardware has Edwards Curve instruction available.
-// If it does, use the hardware implementation for eddsa verfication. Otherwise, the generic
-// version is used
-func verify(publicKey PublicKey, message, sig []byte) bool {
-	if cpu.S390X.HasEDDSA {
-		if l := len(publicKey); l != PublicKeySize {
-			panic("ed25519: bad public key length: " + strconv.Itoa(l))
-		}
-
-		if len(sig) != SignatureSize || sig[63]&224 != 0 {
-			return false
-		}
-
-		return kdsaVerify(message, sig, publicKey)
-	}
-	return verifyGeneric(publicKey, message, sig)
-}
diff --git a/src/crypto/ed25519/ed25519_s390x.s b/src/crypto/ed25519/ed25519_s390x.s
deleted file mode 100644
index 1c77b51a780..00000000000
--- a/src/crypto/ed25519/ed25519_s390x.s
+++ /dev/null
@@ -1,161 +0,0 @@
-// Copyright 2020 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// func kdsaSign(message, signature, privateKey []byte) bool
-TEXT ·kdsaSign(SB), $4096-73
-	// The kdsa instruction takes function code,
-	// buffer's location, message's location and message len
-	// as parameters. Out of those, the function code and buffer's location
-	// should be placed in R0 and R1 respectively. The message's location
-	// and message length should be placed in an even-odd register pair. (e.g: R2 and R3)
-
-	// The content of parameter block(buffer) looks like the following:
-	// Signature R, Signature S and Private Key all take 32 bytes.
-	// In the signing case, the signatures(R and S) will be generated by
-	// the signing instruction and get placed in the locations shown in the parameter block.
-	//    0 +---------------+
-	//      |  Signature(R) |
-	//   32 +---------------+
-	//      |  Signature(S) |
-	//   64 +---------------+
-	//      |  Private Key  |
-	//   96 +---------------+
-	//      |   Reserved    |
-	//   112+---------------+
-	//      |               |
-	//      |     ...       |
-	//      |               |
-	// 4088 +---------------+
-
-	// The following code section setups the buffer from stack:
-	// Get the address of the buffer stack variable.
-	MOVD $buffer-4096(SP), R1
-
-	// Zero the buffer.
-	MOVD R1, R2
-	MOVD $(4096/256), R0 // number of 256 byte chunks to clear
-
-clear:
-	XC    $256, (R2), (R2)
-	MOVD  $256(R2), R2
-	BRCTG R0, clear
-
-	MOVD $40, R0                   // EDDSA-25519 sign has a function code of 40
-	LMG  message+0(FP), R2, R3     // R2=base R3=len
-	LMG  signature+24(FP), R4, R5  // R4=base R5=len
-	LMG  privateKey+48(FP), R6, R7 // R6=base R7=len
-
-	// Checks the length of signature and private key
-	CMPBNE R5, $64, panic
-	CMPBNE R7, $32, panic
-
-	// The instruction uses RFC 8032's private key, which is the first 32 bytes
-	// of the private key in this package. So we copy that into the buffer.
-	MVC $32, (R6), 64(R1)
-
-loop:
-	WORD $0xB93A0002 // The KDSA instruction
-	BVS  loop        // The instruction is exectued by hardware and can be interrupted. This does a retry when that happens.
-	BNE  error
-
-success:
-	// The signatures generated are in big-endian form, so we
-	// need to reverse the bytes of Signature(R) and Signature(S) in the buffers to transform
-	// them from big-endian to little-endian.
-
-	// Transform Signature(R) from big endian to little endian and copy into the signature
-	MVCIN $32, 31(R1), (R4)
-
-	// Transform Signature(S) from big endian to little endian and copy into the signature
-	MVCIN $32, 63(R1), 32(R4)
-
-	MOVB $1, ret+72(FP)
-	RET
-
-error:
-	// return false
-	MOVB $0, ret+72(FP)
-	RET
-
-panic:
-	UNDEF
-
-// func kdsaVerify(message, signature, publicKey []byte) bool
-TEXT ·kdsaVerify(SB), $4096-73
-	// The kdsa instruction takes function code,
-	// buffer's location, message's location and message len
-	// as parameters. Out of those, the function code and buffer's location
-	// should be placed in R0 and R1 respectively. The message's location
-	// and message length should be placed in an even-odd register pair. (e.g: R2 and R3)
-
-	// The parameter block(buffer) is similar to that of signing, except that
-	// we use public key for verification, and Signatures(R and S) are provided
-	// as input parameters to the parameter block.
-	//    0 +---------------+
-	//      |  Signature(R) |
-	//   32 +---------------+
-	//      |  Signature(S) |
-	//   64 +---------------+
-	//      |  Public Key   |
-	//   96 +---------------+
-	//      |   Reserved    |
-	//   112+---------------+
-	//      |               |
-	//      |     ...       |
-	//      |               |
-	// 4088 +---------------+
-
-	// The following code section setups the buffer from stack:
-	// Get the address of the buffer stack variable.
-	MOVD $buffer-4096(SP), R1
-
-	// Zero the buffer.
-	MOVD R1, R2
-	MOVD $(4096/256), R0 // number of 256 byte chunks to clear
-
-clear:
-	XC    $256, (R2), (R2)
-	MOVD  $256(R2), R2
-	BRCTG R0, clear
-
-	MOVD $32, R0                  // EDDSA-25519 verify has a function code of 32
-	LMG  message+0(FP), R2, R3    // R2=base R3=len
-	LMG  signature+24(FP), R4, R5 // R4=base R5=len
-	LMG  publicKey+48(FP), R6, R7 // R6=base R7=len
-
-	// Checks the length of public key and signature
-	CMPBNE R5, $64, panic
-	CMPBNE R7, $32, panic
-
-verify:
-	// The instruction needs Signature(R), Signature(S) and public key
-	// to be in big-endian form during computation. Therefore,
-	// we do the transformation (from little endian to big endian) and copy those into the buffer.
-
-	// Transform Signature(R) from little endian to big endian and copy into the buffer
-	MVCIN $32, 31(R4), (R1)
-
-	// Transform Signature(S) from little endian to big endian and copy into the buffer
-	MVCIN $32, 63(R4), 32(R1)
-
-	// Transform Public Key from little endian to big endian and copy into the buffer
-	MVCIN $32, 31(R6), 64(R1)
-
-verifyLoop:
-	WORD $0xB93A0002 // KDSA instruction
-	BVS  verifyLoop  // Retry upon hardware interrupt
-	BNE  error
-
-success:
-	MOVB $1, ret+72(FP)
-	RET
-
-error:
-	MOVB $0, ret+72(FP)
-	RET
-
-panic:
-	UNDEF
diff --git a/src/crypto/ed25519/ed25519_test.go b/src/crypto/ed25519/ed25519_test.go
index f77d463721c..adb09e409a5 100644
--- a/src/crypto/ed25519/ed25519_test.go
+++ b/src/crypto/ed25519/ed25519_test.go
@@ -26,14 +26,6 @@ func (zeroReader) Read(buf []byte) (int, error) {
 	return len(buf), nil
 }
 
-// signGenericWrapper is identical to Sign except that it unconditionally calls signGeneric directly
-// rather than going through the sign function that might call assembly code.
-func signGenericWrapper(privateKey PrivateKey, msg []byte) []byte {
-	sig := make([]byte, SignatureSize)
-	signGeneric(sig, privateKey, msg)
-	return sig
-}
-
 func TestUnmarshalMarshal(t *testing.T) {
 	pub, _, _ := GenerateKey(rand.Reader)
 
@@ -53,33 +45,22 @@ func TestUnmarshalMarshal(t *testing.T) {
 }
 
 func TestSignVerify(t *testing.T) {
-	t.Run("Generic", func(t *testing.T) { testSignVerify(t, signGenericWrapper, verifyGeneric) })
-	t.Run("Native", func(t *testing.T) { testSignVerify(t, Sign, Verify) })
-}
-
-func testSignVerify(t *testing.T, signImpl func(privateKey PrivateKey, message []byte) []byte,
-	verifyImpl func(publicKey PublicKey, message, sig []byte) bool) {
 	var zero zeroReader
 	public, private, _ := GenerateKey(zero)
 
 	message := []byte("test message")
-	sig := signImpl(private, message)
-	if !verifyImpl(public, message, sig) {
+	sig := Sign(private, message)
+	if !Verify(public, message, sig) {
 		t.Errorf("valid signature rejected")
 	}
 
 	wrongMessage := []byte("wrong message")
-	if verifyImpl(public, wrongMessage, sig) {
+	if Verify(public, wrongMessage, sig) {
 		t.Errorf("signature of different message accepted")
 	}
 }
 
 func TestCryptoSigner(t *testing.T) {
-	t.Run("Generic", func(t *testing.T) { testCryptoSigner(t, verifyGeneric) })
-	t.Run("Native", func(t *testing.T) { testCryptoSigner(t, Verify) })
-}
-
-func testCryptoSigner(t *testing.T, verifyImpl func(publicKey PublicKey, message, sig []byte) bool) {
 	var zero zeroReader
 	public, private, _ := GenerateKey(zero)
 
@@ -102,7 +83,7 @@ func testCryptoSigner(t *testing.T, verifyImpl func(publicKey PublicKey, message
 		t.Fatalf("error from Sign(): %s", err)
 	}
 
-	if !verifyImpl(public, message, signature) {
+	if !Verify(public, message, signature) {
 		t.Errorf("Verify failed on signature from Sign()")
 	}
 }
@@ -130,12 +111,6 @@ func TestEqual(t *testing.T) {
 }
 
 func TestGolden(t *testing.T) {
-	t.Run("Generic", func(t *testing.T) { testGolden(t, signGenericWrapper, verifyGeneric) })
-	t.Run("Native", func(t *testing.T) { testGolden(t, Sign, Verify) })
-}
-
-func testGolden(t *testing.T, signImpl func(privateKey PrivateKey, message []byte) []byte,
-	verifyImpl func(publicKey PublicKey, message, sig []byte) bool) {
 	// sign.input.gz is a selection of test cases from
 	// https://ed25519.cr.yp.to/python/sign.input
 	testDataZ, err := os.Open("testdata/sign.input.gz")
@@ -177,12 +152,12 @@ func testGolden(t *testing.T, signImpl func(privateKey PrivateKey, message []byt
 		copy(priv[:], privBytes)
 		copy(priv[32:], pubKey)
 
-		sig2 := signImpl(priv[:], msg)
+		sig2 := Sign(priv[:], msg)
 		if !bytes.Equal(sig, sig2[:]) {
 			t.Errorf("different signature result on line %d: %x vs %x", lineNo, sig, sig2)
 		}
 
-		if !verifyImpl(pubKey, msg, sig2) {
+		if !Verify(pubKey, msg, sig2) {
 			t.Errorf("signature failed to verify on line %d", lineNo)
 		}
 
@@ -206,11 +181,6 @@ func testGolden(t *testing.T, signImpl func(privateKey PrivateKey, message []byt
 }
 
 func TestMalleability(t *testing.T) {
-	t.Run("Generic", func(t *testing.T) { testMalleability(t, verifyGeneric) })
-	t.Run("Native", func(t *testing.T) { testMalleability(t, Verify) })
-}
-
-func testMalleability(t *testing.T, verifyImpl func(publicKey PublicKey, message, sig []byte) bool) {
 	// https://tools.ietf.org/html/rfc8032#section-5.1.7 adds an additional test
 	// that s be in [0, order). This prevents someone from adding a multiple of
 	// order to s and obtaining a second valid signature for the same message.
@@ -229,7 +199,7 @@ func testMalleability(t *testing.T, verifyImpl func(publicKey PublicKey, message
 		0xb1, 0x08, 0xc3, 0xbd, 0xae, 0x36, 0x9e, 0xf5, 0x49, 0xfa,
 	}
 
-	if verifyImpl(publicKey, msg, sig) {
+	if Verify(publicKey, msg, sig) {
 		t.Fatal("non-canonical signature accepted")
 	}
 }
diff --git a/src/runtime/lockrank_off.go b/src/runtime/lockrank_off.go
index 891589c0f27..425ca8dd93f 100644
--- a/src/runtime/lockrank_off.go
+++ b/src/runtime/lockrank_off.go
@@ -1,3 +1,7 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
 // +build !goexperiment.staticlockranking
 
 package runtime
diff --git a/src/runtime/lockrank_on.go b/src/runtime/lockrank_on.go
index cf4151ff462..fbc5ff58b72 100644
--- a/src/runtime/lockrank_on.go
+++ b/src/runtime/lockrank_on.go
@@ -1,3 +1,7 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
 // +build goexperiment.staticlockranking
 
 package runtime
diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go
index 60f7f9ff58e..8b3c62c375e 100644
--- a/src/runtime/mpagealloc.go
+++ b/src/runtime/mpagealloc.go
@@ -233,16 +233,12 @@ type pageAlloc struct {
 
 	// The address to start an allocation search with. It must never
 	// point to any memory that is not contained in inUse, i.e.
-	// inUse.contains(searchAddr) must always be true.
+	// inUse.contains(searchAddr.addr()) must always be true. The one
+	// exception to this rule is that it may take on the value of
+	// maxOffAddr to indicate that the heap is exhausted.
 	//
-	// When added with arenaBaseOffset, we guarantee that
-	// all valid heap addresses (when also added with
-	// arenaBaseOffset) below this value are allocated and
-	// not worth searching.
-	//
-	// Note that adding in arenaBaseOffset transforms addresses
-	// to a new address space with a linear view of the full address
-	// space on architectures with segmented address spaces.
+	// We guarantee that all valid heap addresses below this value
+	// are allocated and not worth searching.
 	searchAddr offAddr
 
 	// start and end represent the chunk indices
@@ -518,6 +514,30 @@ func (s *pageAlloc) allocRange(base, npages uintptr) uintptr {
 	return uintptr(scav) * pageSize
 }
 
+// findMappedAddr returns the smallest mapped offAddr that is
+// >= addr. That is, if addr refers to mapped memory, then it is
+// returned. If addr is higher than any mapped region, then
+// it returns maxOffAddr.
+//
+// s.mheapLock must be held.
+func (s *pageAlloc) findMappedAddr(addr offAddr) offAddr {
+	// If we're not in a test, validate first by checking mheap_.arenas.
+	// This is a fast path which is only safe to use outside of testing.
+	ai := arenaIndex(addr.addr())
+	if s.test || mheap_.arenas[ai.l1()] == nil || mheap_.arenas[ai.l1()][ai.l2()] == nil {
+		vAddr, ok := s.inUse.findAddrGreaterEqual(addr.addr())
+		if ok {
+			return offAddr{vAddr}
+		} else {
+			// The candidate search address is greater than any
+			// known address, which means we definitely have no
+			// free memory left.
+			return maxOffAddr
+		}
+	}
+	return addr
+}
+
 // find searches for the first (address-ordered) contiguous free region of
 // npages in size and returns a base address for that region.
 //
@@ -526,6 +546,7 @@ func (s *pageAlloc) allocRange(base, npages uintptr) uintptr {
 //
 // find also computes and returns a candidate s.searchAddr, which may or
 // may not prune more of the address space than s.searchAddr already does.
+// This candidate is always a valid s.searchAddr.
 //
 // find represents the slow path and the full radix tree search.
 //
@@ -695,7 +716,7 @@ nextLevel:
 			// We found a sufficiently large run of free pages straddling
 			// some boundary, so compute the address and return it.
 			addr := levelIndexToOffAddr(l, i).add(uintptr(base) * pageSize).addr()
-			return addr, firstFree.base
+			return addr, s.findMappedAddr(firstFree.base)
 		}
 		if l == 0 {
 			// We're at level zero, so that means we've exhausted our search.
@@ -741,7 +762,7 @@ nextLevel:
 	// found an even narrower free window.
 	searchAddr := chunkBase(ci) + uintptr(searchIdx)*pageSize
 	foundFree(offAddr{searchAddr}, chunkBase(ci+1)-searchAddr)
-	return addr, firstFree.base
+	return addr, s.findMappedAddr(firstFree.base)
 }
 
 // alloc allocates npages worth of memory from the page heap, returning the base
diff --git a/src/runtime/mpagealloc_test.go b/src/runtime/mpagealloc_test.go
index 89a4a2502ce..65ba71d459c 100644
--- a/src/runtime/mpagealloc_test.go
+++ b/src/runtime/mpagealloc_test.go
@@ -612,6 +612,63 @@ func TestPageAllocAlloc(t *testing.T) {
 				baseChunkIdx + chunkIdxBigJump:     {{0, PallocChunkPages}},
 			},
 		}
+
+		// Test to check for issue #40191. Essentially, the candidate searchAddr
+		// discovered by find may not point to mapped memory, so we need to handle
+		// that explicitly.
+		//
+		// chunkIdxSmallOffset is an offset intended to be used within chunkIdxBigJump.
+		// It is far enough within chunkIdxBigJump that the summaries at the beginning
+		// of an address range the size of chunkIdxBigJump will not be mapped in.
+		const chunkIdxSmallOffset = 0x503
+		tests["DiscontiguousBadSearchAddr"] = test{
+			before: map[ChunkIdx][]BitRange{
+				// The mechanism for the bug involves three chunks, A, B, and C, which are
+				// far apart in the address space. In particular, B is chunkIdxBigJump +
+				// chunkIdxSmallOffset chunks away from A, and C is 2*chunkIdxBigJump chunks
+				// away from A. A has 1 page free, B has several (NOT at the end of B), and
+				// C is totally free.
+				// Note that B's free memory must not be at the end of B because the fast
+				// path in the page allocator will check if the searchAddr even gives us
+				// enough space to place the allocation in a chunk before accessing the
+				// summary.
+				BaseChunkIdx + chunkIdxBigJump*0: {{0, PallocChunkPages - 1}},
+				BaseChunkIdx + chunkIdxBigJump*1 + chunkIdxSmallOffset: {
+					{0, PallocChunkPages - 10},
+					{PallocChunkPages - 1, 1},
+				},
+				BaseChunkIdx + chunkIdxBigJump*2: {},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx + chunkIdxBigJump*0:                       {},
+				BaseChunkIdx + chunkIdxBigJump*1 + chunkIdxSmallOffset: {},
+				BaseChunkIdx + chunkIdxBigJump*2:                       {},
+			},
+			hits: []hit{
+				// We first allocate into A to set the page allocator's searchAddr to the
+				// end of that chunk. That is the only purpose A serves.
+				{1, PageBase(BaseChunkIdx, PallocChunkPages-1), 0},
+				// Then, we make a big allocation that doesn't fit into B, and so must be
+				// fulfilled by C.
+				//
+				// On the way to fulfilling the allocation into C, we estimate searchAddr
+				// using the summary structure, but that will give us a searchAddr of
+				// B's base address minus chunkIdxSmallOffset chunks. These chunks will
+				// not be mapped.
+				{100, PageBase(baseChunkIdx+chunkIdxBigJump*2, 0), 0},
+				// Now we try to make a smaller allocation that can be fulfilled by B.
+				// In an older implementation of the page allocator, this will segfault,
+				// because this last allocation will first try to access the summary
+				// for B's base address minus chunkIdxSmallOffset chunks in the fast path,
+				// and this will not be mapped.
+				{9, PageBase(baseChunkIdx+chunkIdxBigJump*1+chunkIdxSmallOffset, PallocChunkPages-10), 0},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx + chunkIdxBigJump*0:                       {{0, PallocChunkPages}},
+				BaseChunkIdx + chunkIdxBigJump*1 + chunkIdxSmallOffset: {{0, PallocChunkPages}},
+				BaseChunkIdx + chunkIdxBigJump*2:                       {{0, 100}},
+			},
+		}
 	}
 	for name, v := range tests {
 		v := v
diff --git a/src/runtime/mranges.go b/src/runtime/mranges.go
index e23d0778eb9..2c0eb2c2ddf 100644
--- a/src/runtime/mranges.go
+++ b/src/runtime/mranges.go
@@ -188,6 +188,25 @@ func (a *addrRanges) findSucc(addr uintptr) int {
 	return len(a.ranges)
 }
 
+// findAddrGreaterEqual returns the smallest address represented by a
+// that is >= addr. Thus, if the address is represented by a,
+// then it returns addr. The second return value indicates whether
+// such an address exists for addr in a. That is, if addr is larger than
+// any address known to a, the second return value will be false.
+func (a *addrRanges) findAddrGreaterEqual(addr uintptr) (uintptr, bool) {
+	i := a.findSucc(addr)
+	if i == 0 {
+		return a.ranges[0].base.addr(), true
+	}
+	if a.ranges[i-1].contains(addr) {
+		return addr, true
+	}
+	if i < len(a.ranges) {
+		return a.ranges[i].base.addr(), true
+	}
+	return 0, false
+}
+
 // contains returns true if a covers the address addr.
 func (a *addrRanges) contains(addr uintptr) bool {
 	i := a.findSucc(addr)
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 2399f0a1d3d..035822216d2 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -132,7 +132,7 @@ func main() {
 
 	if GOARCH != "wasm" { // no threads on wasm yet, so no sysmon
 		systemstack(func() {
-			newm(sysmon, nil)
+			newm(sysmon, nil, -1)
 		})
 	}
 
@@ -562,7 +562,7 @@ func schedinit() {
 	stackinit()
 	mallocinit()
 	fastrandinit() // must run before mcommoninit
-	mcommoninit(_g_.m)
+	mcommoninit(_g_.m, -1)
 	cpuinit()       // must run before alginit
 	alginit()       // maps must not be used before this call
 	modulesinit()   // provides activeModules
@@ -623,7 +623,22 @@ func checkmcount() {
 	}
 }
 
-func mcommoninit(mp *m) {
+// mReserveID returns the next ID to use for a new m. This new m is immediately
+// considered 'running' by checkdead.
+//
+// sched.lock must be held.
+func mReserveID() int64 {
+	if sched.mnext+1 < sched.mnext {
+		throw("runtime: thread ID overflow")
+	}
+	id := sched.mnext
+	sched.mnext++
+	checkmcount()
+	return id
+}
+
+// Pre-allocated ID may be passed as 'id', or omitted by passing -1.
+func mcommoninit(mp *m, id int64) {
 	_g_ := getg()
 
 	// g0 stack won't make sense for user (and is not necessary unwindable).
@@ -632,12 +647,12 @@ func mcommoninit(mp *m) {
 	}
 
 	lock(&sched.lock)
-	if sched.mnext+1 < sched.mnext {
-		throw("runtime: thread ID overflow")
+
+	if id >= 0 {
+		mp.id = id
+	} else {
+		mp.id = mReserveID()
 	}
-	mp.id = sched.mnext
-	sched.mnext++
-	checkmcount()
 
 	mp.fastrand[0] = uint32(int64Hash(uint64(mp.id), fastrandseed))
 	mp.fastrand[1] = uint32(int64Hash(uint64(cputicks()), ^fastrandseed))
@@ -1068,7 +1083,7 @@ func startTheWorldWithSema(emitTraceEvent bool) int64 {
 			notewakeup(&mp.park)
 		} else {
 			// Start M to run P.  Do not start another M below.
-			newm(nil, p)
+			newm(nil, p, -1)
 		}
 	}
 
@@ -1413,12 +1428,13 @@ type cgothreadstart struct {
 // Allocate a new m unassociated with any thread.
 // Can use p for allocation context if needed.
 // fn is recorded as the new m's m.mstartfn.
+// id is an optional pre-allocated m ID. Omit by passing -1.
 //
 // This function is allowed to have write barriers even if the caller
 // isn't because it borrows _p_.
 //
 //go:yeswritebarrierrec
-func allocm(_p_ *p, fn func()) *m {
+func allocm(_p_ *p, fn func(), id int64) *m {
 	_g_ := getg()
 	acquirem() // disable GC because it can be called from sysmon
 	if _g_.m.p == 0 {
@@ -1447,7 +1463,7 @@ func allocm(_p_ *p, fn func()) *m {
 
 	mp := new(m)
 	mp.mstartfn = fn
-	mcommoninit(mp)
+	mcommoninit(mp, id)
 
 	// In case of cgo or Solaris or illumos or Darwin, pthread_create will make us a stack.
 	// Windows and Plan 9 will layout sched stack on OS stack.
@@ -1586,7 +1602,7 @@ func oneNewExtraM() {
 	// The sched.pc will never be returned to, but setting it to
 	// goexit makes clear to the traceback routines where
 	// the goroutine stack ends.
-	mp := allocm(nil, nil)
+	mp := allocm(nil, nil, -1)
 	gp := malg(4096)
 	gp.sched.pc = funcPC(goexit) + sys.PCQuantum
 	gp.sched.sp = gp.stack.hi
@@ -1757,9 +1773,11 @@ var newmHandoff struct {
 // Create a new m. It will start off with a call to fn, or else the scheduler.
 // fn needs to be static and not a heap allocated closure.
 // May run with m.p==nil, so write barriers are not allowed.
+//
+// id is an optional pre-allocated m ID. Omit by passing -1.
 //go:nowritebarrierrec
-func newm(fn func(), _p_ *p) {
-	mp := allocm(_p_, fn)
+func newm(fn func(), _p_ *p, id int64) {
+	mp := allocm(_p_, fn, id)
 	mp.nextp.set(_p_)
 	mp.sigmask = initSigmask
 	if gp := getg(); gp != nil && gp.m != nil && (gp.m.lockedExt != 0 || gp.m.incgo) && GOOS != "plan9" {
@@ -1828,7 +1846,7 @@ func startTemplateThread() {
 		releasem(mp)
 		return
 	}
-	newm(templateThread, nil)
+	newm(templateThread, nil, -1)
 	releasem(mp)
 }
 
@@ -1923,16 +1941,31 @@ func startm(_p_ *p, spinning bool) {
 		}
 	}
 	mp := mget()
-	unlock(&sched.lock)
 	if mp == nil {
+		// No M is available, we must drop sched.lock and call newm.
+		// However, we already own a P to assign to the M.
+		//
+		// Once sched.lock is released, another G (e.g., in a syscall),
+		// could find no idle P while checkdead finds a runnable G but
+		// no running M's because this new M hasn't started yet, thus
+		// throwing in an apparent deadlock.
+		//
+		// Avoid this situation by pre-allocating the ID for the new M,
+		// thus marking it as 'running' before we drop sched.lock. This
+		// new M will eventually run the scheduler to execute any
+		// queued G's.
+		id := mReserveID()
+		unlock(&sched.lock)
+
 		var fn func()
 		if spinning {
 			// The caller incremented nmspinning, so set m.spinning in the new M.
 			fn = mspinning
 		}
-		newm(fn, _p_)
+		newm(fn, _p_, id)
 		return
 	}
+	unlock(&sched.lock)
 	if mp.spinning {
 		throw("startm: m is spinning")
 	}
@@ -5192,7 +5225,9 @@ func runqputbatch(pp *p, q *gQueue, qsize int) {
 
 	atomic.StoreRel(&pp.runqtail, t)
 	if !q.empty() {
+		lock(&sched.lock)
 		globrunqputbatch(q, int32(qsize))
+		unlock(&sched.lock)
 	}
 }
 
diff --git a/src/sync/runtime2.go b/src/sync/runtime2.go
index 931edad9f1c..f10c4e8e0ef 100644
--- a/src/sync/runtime2.go
+++ b/src/sync/runtime2.go
@@ -1,3 +1,7 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
 // +build !goexperiment.staticlockranking
 
 package sync
diff --git a/src/sync/runtime2_lockrank.go b/src/sync/runtime2_lockrank.go
index 5a68e901fa8..aaa1c276261 100644
--- a/src/sync/runtime2_lockrank.go
+++ b/src/sync/runtime2_lockrank.go
@@ -1,3 +1,7 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
 // +build goexperiment.staticlockranking
 
 package sync
diff --git a/src/syscall/exec_linux.go b/src/syscall/exec_linux.go
index 23d7343d3ad..b7351cda823 100644
--- a/src/syscall/exec_linux.go
+++ b/src/syscall/exec_linux.go
@@ -465,7 +465,7 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
 			}
 			_, _, err1 = RawSyscall(SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), O_CLOEXEC)
 			if _SYS_dup != SYS_DUP3 && err1 == ENOSYS {
-				_, _, err1 = RawSyscall(_SYS_dup, uintptr(pipe), uintptr(nextfd), 0)
+				_, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(nextfd), 0)
 				if err1 != 0 {
 					goto childerror
 				}
diff --git a/src/testing/testing.go b/src/testing/testing.go
index 85da6bb02a1..061142b9abd 100644
--- a/src/testing/testing.go
+++ b/src/testing/testing.go
@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 // Package testing provides support for automated testing of Go packages.
-// It is intended to be used in concert with the ``go test'' command, which automates
+// It is intended to be used in concert with the "go test" command, which automates
 // execution of any function of the form
 //     func TestXxx(*testing.T)
 // where Xxx does not start with a lowercase letter. The function name
@@ -14,8 +14,8 @@
 // To write a new test suite, create a file whose name ends _test.go that
 // contains the TestXxx functions as described here. Put the file in the same
 // package as the one being tested. The file will be excluded from regular
-// package builds but will be included when the ``go test'' command is run.
-// For more detail, run ``go help test'' and ``go help testflag''.
+// package builds but will be included when the "go test" command is run.
+// For more detail, run "go help test" and "go help testflag".
 //
 // A simple test function looks like this:
 //
diff --git a/test/codegen/memops.go b/test/codegen/memops.go
index cd35910c128..a2342831460 100644
--- a/test/codegen/memops.go
+++ b/test/codegen/memops.go
@@ -354,3 +354,26 @@ func idxCompare(i int) int {
 	}
 	return 1
 }
+
+func idxFloatOps(a []float64, b []float32, i int) (float64, float32) {
+	c := float64(7)
+	// amd64: `ADDSD\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\), X[0-9]+`
+	c += a[i+1]
+	// amd64: `SUBSD\t16\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\), X[0-9]+`
+	c -= a[i+2]
+	// amd64: `MULSD\t24\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\), X[0-9]+`
+	c *= a[i+3]
+	// amd64: `DIVSD\t32\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*8\), X[0-9]+`
+	c /= a[i+4]
+
+	d := float32(8)
+	// amd64: `ADDSS\t4\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\), X[0-9]+`
+	d += b[i+1]
+	// amd64: `SUBSS\t8\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\), X[0-9]+`
+	d -= b[i+2]
+	// amd64: `MULSS\t12\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\), X[0-9]+`
+	d *= b[i+3]
+	// amd64: `DIVSS\t16\([A-Z]+[0-9]*\)\([A-Z]+[0-9]*\*4\), X[0-9]+`
+	d /= b[i+4]
+	return c, d
+}
diff --git a/test/fixedbugs/issue40367.go b/test/fixedbugs/issue40367.go
new file mode 100644
index 00000000000..0dc5ad71206
--- /dev/null
+++ b/test/fixedbugs/issue40367.go
@@ -0,0 +1,41 @@
+// run
+
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+func case1() {
+	rates := []int32{1,2,3,4,5,6}
+	var sink [6]int
+	j := len(sink)
+	for star, _ := range rates {
+		if star+1 < 1 {
+			panic("")
+		}
+		j--
+		sink[j] = j
+	}
+}
+
+func case2() {
+	i := 0
+	var sink [3]int
+	j := len(sink)
+top:
+	j--
+	sink[j] = j
+	if i < 2 {
+		i++
+		if i < 1 {
+			return
+		}
+		goto top
+	}
+}
+
+func main() {
+	case1()
+	case2()
+}
\ No newline at end of file
diff --git a/test/prove.go b/test/prove.go
index d37021d2830..3c19c513b65 100644
--- a/test/prove.go
+++ b/test/prove.go
@@ -670,7 +670,8 @@ func oforuntil(b []int) {
 	i := 0
 	if len(b) > i {
 	top:
-		println(b[i]) // ERROR "Induction variable: limits \[0,\?\), increment 1$" "Proved IsInBounds$"
+		// TODO: remove the "todo:" marker on the next line once the follow-up optimization to CL 244579 is complete.
+		// println(b[i]) // todo: ERROR "Induction variable: limits \[0,\?\), increment 1$" "Proved IsInBounds$"
 		i++
 		if i < len(b) {
 			goto top
@@ -720,7 +721,8 @@ func range1(b []int) {
 // range2 elements are larger, so they use the general form of a range loop.
 func range2(b [][32]int) {
 	for i, v := range b {
-		b[i][0] = v[0] + 1 // ERROR "Induction variable: limits \[0,\?\), increment 1$" "Proved IsInBounds$"
+		// TODO: remove the "todo:" marker on the next line once the follow-up optimization to CL 244579 is complete.
+		b[i][0] = v[0] + 1 // todo: ERROR "Induction variable: limits \[0,\?\), increment 1$" "Proved IsInBounds$"
 		if i < len(b) {    // ERROR "Proved Less64$"
 			println("x")
 		}