[dev.boringcrypto] all: merge master into dev.boringcrypto

Change-Id: Idd59c37d2fd759b0f73d2ee01b30f72ef4e9aee8
Dmitri Shuralyov 2020-05-06 00:20:47 -04:00
commit a9d2e3abf7
995 changed files with 118107 additions and 139260 deletions

View file

@ -96,6 +96,7 @@ Alexei Sholik <alcosholik@gmail.com>
Alexey Borzenkov <snaury@gmail.com>
Alexey Neganov <neganovalexey@gmail.com>
Alexey Palazhchenko <alexey.palazhchenko@gmail.com>
Alexey Semenyuk <alexsemenyuk88@gmail.com>
Alexis Hildebrandt <surryhill@gmail.com>
Ali Rizvi-Santiago <arizvisa@gmail.com>
Aliaksandr Valialkin <valyala@gmail.com>
@ -144,6 +145,7 @@ Andy Davis <andy@bigandian.com>
Andy Finkenstadt <afinkenstadt@zynga.com>
Andy Lindeman <andy@lindeman.io>
Andy Maloney <asmaloney@gmail.com>
Andy Pan <panjf2000@gmail.com>
Andy Walker <walkeraj@gmail.com>
Anfernee Yongkun Gui <anfernee.gui@gmail.com>
Angelo Bulfone <mbulfone@gmail.com>
@ -1143,6 +1145,7 @@ Pontus Leitzler <leitzler@gmail.com>
Prashant Varanasi <prashant@prashantv.com>
Pravendra Singh <hackpravj@gmail.com>
Preetam Jinka <pj@preet.am>
Qais Patankar <qaisjp@gmail.com>
Qiuxuan Zhu <ilsh1022@gmail.com>
Qualcomm Data Center, Inc.
Quan Tran <qeed.quan@gmail.com>
@ -1308,6 +1311,7 @@ Szabolcs Nagy <nsz@port70.net>
Taavi Kivisik <taavi.kivisik@gmail.com>
Tad Fisher <tadfisher@gmail.com>
Tad Glines <tad.glines@gmail.com>
Tailscale Inc.
Taj Khattra <taj.khattra@gmail.com>
Takayoshi Nishida <takayoshi.nishida@gmail.com>
Takeshi YAMANASHI <9.nashi@gmail.com>

View file

@ -145,6 +145,7 @@ Alexey Borzenkov <snaury@gmail.com>
Alexey Naidonov <alexey.naidyonov@gmail.com>
Alexey Neganov <neganovalexey@gmail.com>
Alexey Palazhchenko <alexey.palazhchenko@gmail.com>
Alexey Semenyuk <alexsemenyuk88@gmail.com>
Alexis Hildebrandt <surryhill@gmail.com>
Alexis Hunt <lexer@google.com>
Alexis Imperial-Legrand <ail@google.com>
@ -216,6 +217,7 @@ Andy Davis <andy@bigandian.com>
Andy Finkenstadt <afinkenstadt@zynga.com>
Andy Lindeman <andy@lindeman.io>
Andy Maloney <asmaloney@gmail.com>
Andy Pan <panjf2000@gmail.com>
Andy Walker <walkeraj@gmail.com>
Andzej Maciusovic <andzej.maciusovic@gmail.com>
Anfernee Yongkun Gui <anfernee.gui@gmail.com>

View file

@ -8,10 +8,6 @@ pkg os (linux-arm), const O_SYNC = 4096
pkg os (linux-arm-cgo), const O_SYNC = 4096
pkg os (linux-arm), const O_SYNC = 1052672
pkg os (linux-arm-cgo), const O_SYNC = 1052672
pkg syscall (darwin-386), const ImplementsGetwd = false
pkg syscall (darwin-386), func Fchflags(string, int) error
pkg syscall (darwin-386-cgo), const ImplementsGetwd = false
pkg syscall (darwin-386-cgo), func Fchflags(string, int) error
pkg syscall (darwin-amd64), const ImplementsGetwd = false
pkg syscall (darwin-amd64), func Fchflags(string, int) error
pkg syscall (darwin-amd64-cgo), const ImplementsGetwd = false

View file

@ -437,6 +437,31 @@ This is a wrapper function and should not count as disabling <code>recover</code
(For <code>TEXT</code> items.)
This function is a closure so it uses its incoming context register.
</li>
<li>
<code>LOCAL</code> = 128
<br>
This symbol is local to the dynamic shared object.
</li>
<li>
<code>TLSBSS</code> = 256
<br>
(For <code>DATA</code> and <code>GLOBL</code> items.)
Put this data in thread local storage.
</li>
<li>
<code>NOFRAME</code> = 512
<br>
(For <code>TEXT</code> items.)
Do not insert instructions to allocate a stack frame and save/restore the return
address, even if this is not a leaf function.
Only valid on functions that declare a frame size of 0.
</li>
<li>
<code>TOPFRAME</code> = 2048
<br>
(For <code>TEXT</code> items.)
Function is the top of the call stack. Traceback should stop at this function.
</li>
</ul>
<h3 id="runtime">Runtime Coordination</h3>

View file

@ -1,133 +0,0 @@
<!--{
"Title": "The Go Project",
"Path": "/project/"
}-->
<img class="gopher" src="/doc/gopher/project.png" />
<div id="manual-nav"></div>
<p>
Go is an open source project developed by a team at
<a href="//google.com/">Google</a> and many
<a href="/CONTRIBUTORS">contributors</a> from the open source community.
</p>
<p>
Go is distributed under a <a href="/LICENSE">BSD-style license</a>.
</p>
<h3 id="announce"><a href="//groups.google.com/group/golang-announce">Announcements Mailing List</a></h3>
<p>
A low traffic mailing list for important announcements, such as new releases.
</p>
<p>
We encourage all Go users to subscribe to
<a href="//groups.google.com/group/golang-announce">golang-announce</a>.
</p>
<h2 id="go1">Version history</h2>
<h3 id="release"><a href="/doc/devel/release.html">Release History</a></h3>
<p>A <a href="/doc/devel/release.html">summary</a> of the changes between Go releases. Notes for the major releases:</p>
<ul>
<li><a href="/doc/go1.14">Go 1.14</a> <small>(February 2020)</small></li>
<li><a href="/doc/go1.13">Go 1.13</a> <small>(September 2019)</small></li>
<li><a href="/doc/go1.12">Go 1.12</a> <small>(February 2019)</small></li>
<li><a href="/doc/go1.11">Go 1.11</a> <small>(August 2018)</small></li>
<li><a href="/doc/go1.10">Go 1.10</a> <small>(February 2018)</small></li>
<li><a href="/doc/go1.9">Go 1.9</a> <small>(August 2017)</small></li>
<li><a href="/doc/go1.8">Go 1.8</a> <small>(February 2017)</small></li>
<li><a href="/doc/go1.7">Go 1.7</a> <small>(August 2016)</small></li>
<li><a href="/doc/go1.6">Go 1.6</a> <small>(February 2016)</small></li>
<li><a href="/doc/go1.5">Go 1.5</a> <small>(August 2015)</small></li>
<li><a href="/doc/go1.4">Go 1.4</a> <small>(December 2014)</small></li>
<li><a href="/doc/go1.3">Go 1.3</a> <small>(June 2014)</small></li>
<li><a href="/doc/go1.2">Go 1.2</a> <small>(December 2013)</small></li>
<li><a href="/doc/go1.1">Go 1.1</a> <small>(May 2013)</small></li>
<li><a href="/doc/go1">Go 1</a> <small>(March 2012)</small></li>
</ul>
<h3 id="go1compat"><a href="/doc/go1compat">Go 1 and the Future of Go Programs</a></h3>
<p>
What Go 1 defines and the backwards-compatibility guarantees one can expect as
Go 1 matures.
</p>
<h2 id="resources">Developer Resources</h2>
<h3 id="source"><a href="https://golang.org/change">Source Code</a></h3>
<p>Check out the Go source code.</p>
<h3 id="discuss"><a href="//groups.google.com/group/golang-nuts">Discussion Mailing List</a></h3>
<p>
A mailing list for general discussion of Go programming.
</p>
<p>
Questions about using Go or announcements relevant to other Go users should be sent to
<a href="//groups.google.com/group/golang-nuts">golang-nuts</a>.
</p>
<h3 id="golang-dev"><a href="https://groups.google.com/group/golang-dev">Developer</a> and
<a href="https://groups.google.com/group/golang-codereviews">Code Review Mailing List</a></h3>
<p>The <a href="https://groups.google.com/group/golang-dev">golang-dev</a>
mailing list is for discussing code changes to the Go project.
The <a href="https://groups.google.com/group/golang-codereviews">golang-codereviews</a>
mailing list is for actual reviewing of the code changes (CLs).</p>
<h3 id="golang-checkins"><a href="https://groups.google.com/group/golang-checkins">Checkins Mailing List</a></h3>
<p>A mailing list that receives a message summarizing each checkin to the Go repository.</p>
<h3 id="build_status"><a href="//build.golang.org/">Build Status</a></h3>
<p>View the status of Go builds across the supported operating
systems and architectures.</p>
<h2 id="howto">How you can help</h2>
<h3><a href="//golang.org/issue">Reporting issues</a></h3>
<p>
If you spot bugs, mistakes, or inconsistencies in the Go project's code or
documentation, please let us know by
<a href="//golang.org/issue/new">filing a ticket</a>
on our <a href="//golang.org/issue">issue tracker</a>.
(Of course, you should check it's not an existing issue before creating
a new one.)
</p>
<p>
We pride ourselves on being meticulous; no issue is too small.
</p>
<p>
Security-related issues should be reported to
<a href="mailto:security@golang.org">security@golang.org</a>.<br>
See the <a href="/security">security policy</a> for more details.
</p>
<p>
Community-related issues should be reported to
<a href="mailto:conduct@golang.org">conduct@golang.org</a>.<br>
See the <a href="/conduct">Code of Conduct</a> for more details.
</p>
<h3><a href="/doc/contribute.html">Contributing code &amp; documentation</a></h3>
<p>
Go is an open source project and we welcome contributions from the community.
</p>
<p>
To get started, read these <a href="/doc/contribute.html">contribution
guidelines</a> for information on design, testing, and our code review process.
</p>
<p>
Check <a href="//golang.org/issue">the tracker</a> for
open issues that interest you. Those labeled
<a href="https://github.com/golang/go/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22">help wanted</a>
are particularly in need of outside help.
</p>

View file

@ -18,13 +18,13 @@ Gccgo has native gdb support.
</p>
<p>
Note that
<a href="https://github.com/derekparker/delve">Delve</a> is a better
<a href="https://github.com/go-delve/delve">Delve</a> is a better
alternative to GDB when debugging Go programs built with the standard
toolchain. It understands the Go runtime, data structures, and
expressions better than GDB. Delve currently supports Linux, OSX,
and Windows on <code>amd64</code>.
For the most up-to-date list of supported platforms, please see
<a href="https://github.com/derekparker/delve/tree/master/Documentation/installation">
<a href="https://github.com/go-delve/delve/tree/master/Documentation/installation">
the Delve documentation</a>.
</p>
</i>

View file

@ -31,6 +31,24 @@ TODO
<h2 id="ports">Ports</h2>
<h3 id="darwin">Darwin</h3>
<p> <!-- golang.org/issue/37610, golang.org/issue/37611 -->
As <a href="/doc/go1.14#darwin">announced</a> in the Go 1.14 release
notes, Go 1.15 drops support for 32-bit binaries on macOS, iOS,
iPadOS, watchOS, and tvOS (the <code>darwin/386</code>
and <code>darwin/arm</code> ports). Go continues to support the
64-bit <code>darwin/amd64</code> and <code>darwin/arm64</code> ports.
</p>
<h3 id="windows">Windows</h3>
<p> <!-- CL 214397 and CL 230217 -->
Go 1.15 now generates Windows ASLR executables when the -buildmode=pie
cmd/link flag is provided. The go command uses -buildmode=pie by default
on Windows.
</p>
<p>
TODO
</p>
@ -78,15 +96,61 @@ TODO
directory of each individual test).
</p>
<h4 id="module-cache">Module cache</h4>
<p><!-- https://golang.org/cl/219538 -->
The location of the module cache may now be set with
the <code>GOMODCACHE</code> environment variable. The default value of
<code>GOMODCACHE</code> is <code>GOPATH[0]/pkg/mod</code>, the location of the
module cache before this change.
</p>
<p><!-- https://golang.org/cl/221157 -->
A workaround is now available for Windows "Access is denied" errors in
<code>go</code> commands that access the module cache, caused by external
programs concurrently scanning the file system (see
<a href="https://golang.org/issue/36568">issue #36568</a>). The workaround is
not enabled by default because it is not safe to use when Go versions lower
than 1.14.2 and 1.13.10 are running concurrently with the same module cache.
It can be enabled by explicitly setting the environment variable
<code>GODEBUG=modcacheunzipinplace=1</code>.
</p>
<h2 id="runtime">Runtime</h2>
<p>
TODO
</p>
<h2 id="compiler">Compiler</h2>
<p><!-- https://golang.org/cl/229578 -->
Package <code>unsafe</code>'s <a href="/pkg/unsafe/#Pointer">safety
rules</a> allow converting an <code>unsafe.Pointer</code>
into <code>uintptr</code> when calling certain
functions. Previously, in some cases, the compiler allowed multiple
chained conversions (for example, <code>syscall.Syscall(…,
uintptr(uintptr(ptr)), …)</code>). The compiler now requires exactly
one conversion. Code that used multiple conversions should be
updated to satisfy the safety rules.
</p>
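For illustration, a minimal sketch of the rule, assuming a Unix system where syscall.SYS_WRITE is defined (the write call itself is just an example, not from the notes):

package main

import (
	"syscall"
	"unsafe"
)

func main() {
	msg := []byte("hello\n")
	// Allowed: exactly one unsafe.Pointer-to-uintptr conversion,
	// appearing directly in the argument list of Syscall.
	syscall.Syscall(syscall.SYS_WRITE, 1,
		uintptr(unsafe.Pointer(&msg[0])), uintptr(len(msg)))

	// No longer accepted by the compiler (chained conversion):
	//   syscall.Syscall(syscall.SYS_WRITE, 1,
	//           uintptr(uintptr(unsafe.Pointer(&msg[0]))), uintptr(len(msg)))
}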
<h2 id="library">Core library</h2>
<h3 id="time/tzdata">New embedded tzdata package</h3>
<p> <!-- CL 224588 -->
Go 1.15 includes a new package,
<a href="/pkg/time/tzdata/"><code>time/tzdata</code></a>,
that permits embedding the timezone database into a program.
Importing this package (as <code>import _ "time/tzdata"</code>)
permits the program to find timezone information even if the
timezone database is not available on the local system.
You can also embed the timezone database by building
with <code>-tags timetzdata</code>.
Either approach increases the size of the program by about 800 KB.
</p>
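For illustration, a minimal program using the blank import described above (the zone name is just an example):

package main

import (
	"fmt"
	"time"

	_ "time/tzdata" // fall back to the embedded database if the system has none
)

func main() {
	loc, err := time.LoadLocation("Asia/Tokyo")
	if err != nil {
		fmt.Println("lookup failed:", err)
		return
	}
	fmt.Println(time.Now().In(loc))
}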
<p>
TODO
</p>
@ -119,6 +183,127 @@ TODO
TODO
</p>
<dl id="crypto/tls"><dt><a href="/crypto/tls/">crypto/tls</a></dt>
<dd>
<p><!-- CL 214977 -->
The new
<a href="/pkg/crypto/tls/#Dialer"><code>Dialer</code></a>
type and its
<a href="/pkg/crypto/tls/#Dialer.DialContext"><code>DialContext</code></a>
method permit using a context to both connect and handshake with a TLS server.
</p>
</dd>
</dl>
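A sketch of the new API; the host, port, and timeout here are assumptions for the example:

package main

import (
	"context"
	"crypto/tls"
	"fmt"
	"time"
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	d := &tls.Dialer{Config: &tls.Config{MinVersion: tls.VersionTLS12}}
	// DialContext covers both the TCP connect and the TLS handshake,
	// so cancelling ctx aborts whichever phase is in progress.
	conn, err := d.DialContext(ctx, "tcp", "example.com:443")
	if err != nil {
		fmt.Println("dial failed:", err)
		return
	}
	defer conn.Close()
	fmt.Println("connected to", conn.RemoteAddr())
}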
<dl id="flag"><dt><a href="/pkg/flag/">flag</a></dt>
<dd>
<p><!-- CL 221427 -->
When the flag package sees <code>-h</code> or <code>-help</code>, and
those flags are not defined, the flag package prints a usage message.
If the <a href="/pkg/flag/#FlagSet"><code>FlagSet</code></a> was created with
<a href="/pkg/flag/#ExitOnError"><code>ExitOnError</code></a>,
<a href="/pkg/flag/#FlagSet.Parse"><code>FlagSet.Parse</code></a> would then
exit with a status of 2. In this release, the exit status for <code>-h</code>
or <code>-help</code> has been changed to 0. In particular, this applies to
the default handling of command line flags.
</p>
</dd>
</dl>
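A small example of the behavior change, using the default command-line flags (the flag itself is hypothetical):

package main

import (
	"flag"
	"fmt"
)

func main() {
	n := flag.Int("n", 1, "number of repetitions")
	// No -h flag is defined, so running the program with -h or -help
	// prints this usage message; starting with Go 1.15 the process then
	// exits with status 0 rather than 2.
	flag.Parse()
	fmt.Println("n =", *n)
}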
<dl id="net"><dt><a href="/pkg/net/">net</a></dt>
<dd>
<p><!-- CL 228645 -->
If an I/O operation exceeds a deadline set by
the <a href="/pkg/net/#Conn"><code>Conn.SetDeadline</code></a>,
<code>Conn.SetReadDeadline</code>,
or <code>Conn.SetWriteDeadline</code> methods, it will now
return an error that is or wraps
<a href="/pkg/os#ErrDeadlineExceeded"><code>os.ErrDeadlineExceeded</code></a>.
This may be used to reliably detect whether an error is due to
an exceeded deadline.
Earlier releases recommended calling the <code>Timeout</code>
method on the error, but I/O operations can return errors for
which <code>Timeout</code> returns <code>true</code> although a
deadline has not been exceeded.
</p>
<p><!-- CL 228641 -->
The new <a href="/pkg/net/#Resolver.LookupIP"><code>Resolver.LookupIP</code></a>
method supports IP lookups that are both network-specific and accept a context.
</p>
</dd>
</dl>
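A sketch of the recommended check; the connection and buffer are assumed to be set up elsewhere, and the deadline value is arbitrary:

package example

import (
	"errors"
	"net"
	"os"
	"time"
)

// timedOut reports whether a read on conn failed specifically because
// its deadline passed, using the wrapped error introduced in Go 1.15.
func timedOut(conn net.Conn, buf []byte) bool {
	_ = conn.SetReadDeadline(time.Now().Add(100 * time.Millisecond))
	_, err := conn.Read(buf)
	return errors.Is(err, os.ErrDeadlineExceeded)
}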
<dl id="net/http/httputil"><dt><a href="/pkg/net/http/httputil/">net/http/httputil</a></dt>
<dd>
<p><!-- CL 230937 -->
<a href="/pkg/net/http/httputil/#ReverseProxy"><code>ReverseProxy</code></a>
now supports not modifying the <code>X-Forwarded-For</code>
header when the incoming <code>Request.Header</code> map entry
for that field is <code>nil</code>.
</p>
</dd>
</dl>
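A sketch of opting out of the header rewrite; the backend URL is hypothetical:

package example

import (
	"net/http"
	"net/http/httputil"
	"net/url"
)

// newProxy builds a reverse proxy that leaves X-Forwarded-For untouched.
func newProxy() *httputil.ReverseProxy {
	target, err := url.Parse("http://backend.internal:8080")
	if err != nil {
		panic(err)
	}
	p := httputil.NewSingleHostReverseProxy(target)
	director := p.Director
	p.Director = func(req *http.Request) {
		director(req)
		// A nil map entry (rather than a deleted one) tells ReverseProxy
		// not to add the client address to X-Forwarded-For.
		req.Header["X-Forwarded-For"] = nil
	}
	return p
}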
<dl id="net/http/pprof"><dt><a href="/pkg/net/http/pprof/">net/http/pprof</a></dt>
<dd>
<p><!-- CL 147598, CL 229537 -->
All profile endpoints now support a "<code>seconds</code>" parameter. When present,
the endpoint profiles for the specified number of seconds and reports the difference.
The meaning of the "<code>seconds</code>" parameter in the <code>cpu</code> profile and
the trace endpoints is unchanged.
</p>
</dd>
</dl>
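For context, a minimal server exposing those endpoints; the listen address is an example:

package example

import (
	"net/http"
	_ "net/http/pprof" // registers the /debug/pprof/ handlers on the default mux
)

// serve exposes the profiling endpoints. With Go 1.15 a request such as
// /debug/pprof/heap?seconds=5 profiles for 5 seconds and reports the
// difference between the start and end of that window.
func serve() error {
	return http.ListenAndServe("localhost:6060", nil)
}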
<dl id="net/url"><dt><a href="/pkg/net/url/">net/url</a></dt>
<dd>
<p><!-- CL 227645 -->
The new <a href="/pkg/net/url/#URL"><code>URL</code></a> field
<code>RawFragment</code> and method <a href="/pkg/net/url/#URL.EscapedFragment"><code>EscapedFragment</code></a>
provide detail about and control over the exact encoding of a particular fragment.
These are analogous to
<code>RawPath</code> and <a href="/pkg/net/url/#URL.EscapedPath"><code>EscapedPath</code></a>.
</p>
<p><!-- CL 207082 -->
The new <a href="/pkg/net/url/#URL"><code>URL</code></a>
method <a href="/pkg/net/url/#URL.Redacted"><code>Redacted</code></a>
returns the URL in string form with any password replaced with <code>xxxxx</code>.
</p>
</dd>
</dl>
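A short sketch of the new field and methods; the URL is made up for the example:

package example

import (
	"fmt"
	"net/url"
)

func redactedExample() {
	u, err := url.Parse("https://user:hunter2@example.com/search#sec%41")
	if err != nil {
		return
	}
	// Redacted hides the password; RawFragment and EscapedFragment keep
	// the fragment's original escaping ("sec%41" rather than "secA").
	fmt.Println(u.Redacted())        // https://user:xxxxx@example.com/search#sec%41
	fmt.Println(u.Fragment)          // secA
	fmt.Println(u.EscapedFragment()) // sec%41
}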
<dl id="os"><dt><a href="/pkg/os/">os</a></dt>
<dd>
<p><!-- CL -->
If an I/O operation exceeds a deadline set by
the <a href="/pkg/os/#File.SetDeadline"><code>File.SetDeadline</code></a>,
<a href="/pkg/os/#File.SetReadDeadline"><code>File.SetReadDeadline</code></a>,
or <a href="/pkg/os/#File.SetWriteDeadline"><code>File.SetWriteDeadline</code></a>
methods, it will now return an error that is or wraps
<a href="/pkg/os#ErrDeadlineExceeded"><code>os.ErrDeadlineExceeded</code></a>.
This may be used to reliably detect whether an error is due to
an exceeded deadline.
Earlier releases recommended calling the <code>Timeout</code>
method on the error, but I/O operations can return errors for
which <code>Timeout</code> returns <code>true</code> although a
deadline has not been exceeded.
</p>
</dd>
</dl>
<dl id="reflect"><dt><a href="/pkg/reflect/">reflect</a></dt>
<dd>
<p><!-- CL 228902 -->
Package reflect now disallows accessing methods of all
non-exported fields, whereas previously it allowed accessing
those of non-exported, embedded fields. Code that relies on the
previous behavior should be updated to instead access the
corresponding promoted method of the enclosing variable.
</p>
</dd>
</dl>
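A sketch of the adjustment; the types and method are invented for illustration:

package example

import (
	"fmt"
	"reflect"
)

type logger struct{}

func (logger) Log(msg string) { fmt.Println(msg) }

// Server embeds the non-exported logger, promoting its Log method.
type Server struct{ logger }

func promotedMethod() {
	v := reflect.ValueOf(Server{})
	// With Go 1.15, obtain the promoted method from the enclosing value
	// rather than from the non-exported embedded field.
	v.MethodByName("Log").Call([]reflect.Value{reflect.ValueOf("hello")})
}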
<dl id="pkg-runtime"><dt><a href="/pkg/runtime/">runtime</a></dt>
<dd>
<p><!-- CL 221779 -->
@ -128,28 +313,83 @@ TODO
<code>uint</code>, <code>uint8</code>, <code>uint16</code>, <code>uint32</code>, <code>uint64</code>, <code>uintptr</code>,
then the value will be printed, instead of just its address.
</p>
<p><!-- CL -->
On a Unix system, if the <code>kill</code> command
or <code>kill</code> system call is used to send
a <code>SIGSEGV</code>, <code>SIGBUS</code>,
or <code>SIGFPE</code> signal to a Go program, and if the signal
is not being handled via
<a href="/pkg/os/signal/#Notify"><code>os/signal.Notify</code></a>,
the Go program will now reliably crash with a stack trace.
In earlier releases the behavior was unpredictable.
</p>
</dd>
</dl>
<dl id="pkg-runtime-pprof"><dt><a href="/pkg/runtime/pprof">runtime/pprof</a></dt>
<dd>
<p><!-- CL 189318 -->
The goroutine profile includes the profile labels associated with each goroutine
at the time of profiling. This feature is not yet implemented for the profile
reported with <code>debug=2</code>.
</p>
</dd>
</dl>
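A sketch of attaching labels that will now show up in the goroutine profile; the label key and value are arbitrary:

package example

import (
	"context"
	"runtime/pprof"
)

// withLabels runs work under profiler labels; with Go 1.15 those labels
// also appear in the goroutine profile (except in debug=2 output).
func withLabels(ctx context.Context, work func(context.Context)) {
	pprof.Do(ctx, pprof.Labels("worker", "ingest"), work)
}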
<dl id="sync"><dt><a href="/pkg/sync/">sync</a></dt>
<dd>
<p><!-- golang.org/issue/33762 -->
<p><!-- CL 205899, golang.org/issue/33762 -->
The new method
<a href="/pkg/sync#Map.LoadAndDelete"><code>Map.LoadAndDelete</code></a>
<a href="/pkg/sync/#Map.LoadAndDelete"><code>Map.LoadAndDelete</code></a>
atomically deletes a key and returns the previous value if present.
</p>
<p><!-- CL 205899 -->
The method
<a href="/pkg/sync#Map.Delete"><code>Map.Delete</code></a>
<a href="/pkg/sync/#Map.Delete"><code>Map.Delete</code></a>
is more efficient.
</p>
</dl><!-- sync -->
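A minimal sketch of the new method; the key type and usage are illustrative:

package example

import "sync"

// takePending atomically removes and returns a queued value, so two
// goroutines can never both claim the same key.
func takePending(m *sync.Map, key string) (interface{}, bool) {
	return m.LoadAndDelete(key)
}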
<dl id="syscall"><dt><a href="/pkg/syscall/">syscall</a></dt>
<dd>
<p><!-- CL 231638 -->
On Unix systems, functions that use
<a href="/pkg/syscall/#SysProcAttr"><code>SysProcAttr</code></a>
will now reject attempts to set both the <code>Setctty</code>
and <code>Foreground</code> fields, as they both use
the <code>Ctty</code> field but do so in incompatible ways.
We expect that few existing programs set both fields.
</p>
<p>
Setting the <code>Setctty</code> field now requires that the
<code>Ctty</code> field be set to a file descriptor number in the
child process, as determined by the <code>ProcAttr.Files</code> field.
Using a child descriptor always worked, but there were certain
cases where using a parent file descriptor also happened to work.
Some programs that set <code>Setctty</code> will need to change
the value of <code>Ctty</code> to use a child descriptor number.
</p>
</dd>
</dl>
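A hedged sketch of the new requirement, assuming a Unix system and a tty already opened by the caller; the command is arbitrary:

// +build linux darwin

package example

import (
	"os"
	"os/exec"
	"syscall"
)

// runOnTTY starts a shell with tty as its controlling terminal. Because
// tty is passed as the child's fd 0 (via Stdin), Ctty must now name that
// child descriptor number, not a descriptor in the parent.
func runOnTTY(tty *os.File) error {
	cmd := exec.Command("/bin/sh")
	cmd.Stdin, cmd.Stdout, cmd.Stderr = tty, tty, tty
	cmd.SysProcAttr = &syscall.SysProcAttr{
		Setctty: true,
		Ctty:    0, // fd 0 in the child, i.e. the tty above
	}
	return cmd.Run()
}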
<dl id="testing"><dt><a href="/pkg/testing/">testing</a></dt>
<dd>
<p><!-- CL 226877, golang.org/issue/35998 -->
The new methods
<a href="/pkg/testing/#T.TempDir"><code>T.TempDir</code></a> and
<a href="/pkg/testing/#B.TempDir"><code>B.TempDir</code></a> and
return temporary directories that are automatically cleaned up
at the end of the test.
</p>
</dd>
</dl><!-- testing -->
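A minimal test using the new helper (the file name and contents are arbitrary):

package example

import (
	"io/ioutil"
	"path/filepath"
	"testing"
)

func TestWritesScratchFile(t *testing.T) {
	dir := t.TempDir() // removed automatically when the test finishes
	path := filepath.Join(dir, "out.txt")
	if err := ioutil.WriteFile(path, []byte("data"), 0o644); err != nil {
		t.Fatal(err)
	}
}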
<dl id="time"><dt><a href="/pkg/time/">time</a></dt>
<dd>
<p><!-- golang.org/issue/33184 -->
<p><!-- CL 220424, CL 217362, golang.org/issue/33184 -->
The new method
<a href="/pkg/time#Ticker.Reset"><code>Ticker.Reset</code></a>
<a href="/pkg/time/#Ticker.Reset"><code>Ticker.Reset</code></a>
supports changing the duration of a ticker.
</p>
</dd>

View file

@ -540,15 +540,9 @@ The valid combinations of <code>$GOOS</code> and <code>$GOARCH</code> are:
<td></td><td><code>android</code></td> <td><code>arm64</code></td>
</tr>
<tr>
<td></td><td><code>darwin</code></td> <td><code>386</code></td>
</tr>
<tr>
<td></td><td><code>darwin</code></td> <td><code>amd64</code></td>
</tr>
<tr>
<td></td><td><code>darwin</code></td> <td><code>arm</code></td>
</tr>
<tr>
<td></td><td><code>darwin</code></td> <td><code>arm64</code></td>
</tr>
<tr>

View file

@ -8,8 +8,8 @@
# Consult https://www.iana.org/time-zones for the latest versions.
# Versions to use.
CODE=2019c
DATA=2019c
CODE=2020a
DATA=2020a
set -e
rm -rf work
@ -28,6 +28,8 @@ rm -f ../../zoneinfo.zip
zip -0 -r ../../zoneinfo.zip *
cd ../..
go generate time/tzdata
echo
if [ "$1" = "-work" ]; then
echo Left workspace behind in work/.

Binary file not shown.

View file

@ -32,7 +32,7 @@ func TestCrossPackageTests(t *testing.T) {
t.Skip("Can't exec cmd/go subprocess on Android.")
case "darwin":
switch runtime.GOARCH {
case "arm", "arm64":
case "arm64":
t.Skip("Can't exec cmd/go subprocess on iOS.")
}
}

View file

@ -62,10 +62,8 @@ import (
func testSigaltstack(t *testing.T) {
switch {
case runtime.GOOS == "solaris", runtime.GOOS == "illumos", runtime.GOOS == "darwin" && (runtime.GOARCH == "arm" || runtime.GOARCH == "arm64"):
case runtime.GOOS == "solaris", runtime.GOOS == "illumos", runtime.GOOS == "darwin" && runtime.GOARCH == "arm64":
t.Skipf("switching signal stack not implemented on %s/%s", runtime.GOOS, runtime.GOARCH)
case runtime.GOOS == "darwin" && runtime.GOARCH == "386":
t.Skipf("sigaltstack fails on darwin/386")
}
C.changeSignalStack()

View file

@ -897,6 +897,10 @@ static uint16_t issue31093F(uint16_t v) { return v; }
// issue 32579
typedef struct S32579 { unsigned char data[1]; } S32579;
// issue 38649
// Test that #define'd type aliases work.
#define netbsd_gid unsigned int
*/
import "C"
@ -2192,3 +2196,7 @@ func test32579(t *testing.T) {
t.Errorf("&s[0].data[0] failed: got %d, want %d", s[0].data[0], 1)
}
}
// issue 38649
var issue38649 C.netbsd_gid = 42

View file

@ -124,6 +124,11 @@ typedef struct {
} Issue31891B;
void callIssue31891(void);
typedef struct {
int i;
} Issue38408, *PIssue38408;
*/
import "C"
@ -159,7 +164,7 @@ func Add(x int) {
}
func testCthread(t *testing.T) {
if runtime.GOOS == "darwin" && (runtime.GOARCH == "arm" || runtime.GOARCH == "arm64") {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
t.Skip("the iOS exec wrapper is unable to properly handle the panic from Add")
}
sum.i = 0
@ -552,3 +557,8 @@ func useIssue31891B(c *C.Issue31891B) {}
func test31891(t *testing.T) {
C.callIssue31891()
}
// issue 38408
// A typedef pointer can be used as the element type.
// No runtime test; just make sure it compiles.
var _ C.PIssue38408 = &C.Issue38408{i: 1}

View file

@ -134,7 +134,7 @@ func testMain(m *testing.M) int {
} else {
switch GOOS {
case "darwin":
if GOARCH == "arm" || GOARCH == "arm64" {
if GOARCH == "arm64" {
libbase += "_shared"
}
case "dragonfly", "freebsd", "linux", "netbsd", "openbsd", "solaris", "illumos":
@ -305,7 +305,7 @@ func TestEarlySignalHandler(t *testing.T) {
switch GOOS {
case "darwin":
switch GOARCH {
case "arm", "arm64":
case "arm64":
t.Skipf("skipping on %s/%s; see https://golang.org/issue/13701", GOOS, GOARCH)
}
case "windows":
@ -487,7 +487,7 @@ func checkSignalForwardingTest(t *testing.T) {
switch GOOS {
case "darwin":
switch GOARCH {
case "arm", "arm64":
case "arm64":
t.Skipf("skipping on %s/%s; see https://golang.org/issue/13701", GOOS, GOARCH)
}
case "windows":
@ -603,7 +603,7 @@ func TestExtar(t *testing.T) {
if runtime.Compiler == "gccgo" {
t.Skip("skipping -extar test when using gccgo")
}
if runtime.GOOS == "darwin" && (runtime.GOARCH == "arm" || runtime.GOARCH == "arm64") {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
t.Skip("shell scripts are not executable on iOS hosts")
}

View file

@ -108,7 +108,7 @@ func testMain(m *testing.M) int {
libgodir := GOOS + "_" + GOARCH
switch GOOS {
case "darwin":
if GOARCH == "arm" || GOARCH == "arm64" {
if GOARCH == "arm64" {
libgodir += "_shared"
}
case "dragonfly", "freebsd", "linux", "netbsd", "openbsd", "solaris", "illumos":

View file

@ -0,0 +1,15 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
// +build ignore
package main
/*
struct Issue38649 { int x; };
#define issue38649 struct Issue38649
*/
import "C"
type issue38649 C.issue38649

View file

@ -19,5 +19,8 @@ var v6 = B{}
// Test that S is fully defined
var v7 = S{}
// Test that #define'd type is fully defined
var _ = issue38649{X: 0}
func main() {
}

View file

@ -23,6 +23,7 @@ var filePrefixes = []string{
"fieldtypedef",
"issue37479",
"issue37621",
"issue38649",
}
func TestGoDefs(t *testing.T) {

View file

@ -20,7 +20,7 @@ import (
func requireTestSOSupported(t *testing.T) {
t.Helper()
switch runtime.GOARCH {
case "arm", "arm64":
case "arm64":
if runtime.GOOS == "darwin" {
t.Skip("No exec facility on iOS.")
}

View file

@ -20,7 +20,7 @@ import (
func requireTestSOSupported(t *testing.T) {
t.Helper()
switch runtime.GOARCH {
case "arm", "arm64":
case "arm64":
if runtime.GOOS == "darwin" {
t.Skip("No exec facility on iOS.")
}

View file

@ -8,9 +8,7 @@ export IPHONEOS_DEPLOYMENT_TARGET=5.1
# cmd/cgo doesn't support llvm-gcc-4.2, so we have to use clang.
CLANG=`xcrun --sdk $SDK --find clang`
if [ "$GOARCH" == "arm" ]; then
CLANGARCH="armv7"
elif [ "$GOARCH" == "arm64" ]; then
if [ "$GOARCH" == "arm64" ]; then
CLANGARCH="arm64"
else
echo "unknown GOARCH=$GOARCH" >&2

View file

@ -10,7 +10,7 @@ if(! test -f make.rc){
exit wrongdir
}
. ./make.rc --no-banner
. ./make.rc --no-banner $*
bind -b $GOROOT/bin /bin
./run.rc --no-rebuild
$GOTOOLDIR/dist banner # print build info

View file

@ -11,6 +11,7 @@ import (
"bytes"
"errors"
"io"
"strings"
"unicode/utf8"
)
@ -419,20 +420,16 @@ func (b *Reader) ReadLine() (line []byte, isPrefix bool, err error) {
return
}
// ReadBytes reads until the first occurrence of delim in the input,
// returning a slice containing the data up to and including the delimiter.
// If ReadBytes encounters an error before finding a delimiter,
// it returns the data read before the error and the error itself (often io.EOF).
// ReadBytes returns err != nil if and only if the returned data does not end in
// delim.
// For simple uses, a Scanner may be more convenient.
func (b *Reader) ReadBytes(delim byte) ([]byte, error) {
// Use ReadSlice to look for array,
// accumulating full buffers.
// collectFragments reads until the first occurrence of delim in the input. It
// returns (slice of full buffers, remaining bytes before delim, total number
// of bytes in the combined first two elements, error).
// The complete result is equal to
// `bytes.Join(append(fullBuffers, finalFragment), nil)`, which has a
// length of `totalLen`. The result is structured in this way to allow callers
// to minimize allocations and copies.
func (b *Reader) collectFragments(delim byte) (fullBuffers [][]byte, finalFragment []byte, totalLen int, err error) {
var frag []byte
var full [][]byte
var err error
n := 0
// Use ReadSlice to look for delim, accumulating full buffers.
for {
var e error
frag, e = b.ReadSlice(delim)
@ -447,12 +444,23 @@ func (b *Reader) ReadBytes(delim byte) ([]byte, error) {
// Make a copy of the buffer.
buf := make([]byte, len(frag))
copy(buf, frag)
full = append(full, buf)
n += len(buf)
fullBuffers = append(fullBuffers, buf)
totalLen += len(buf)
}
n += len(frag)
totalLen += len(frag)
return fullBuffers, frag, totalLen, err
}
// ReadBytes reads until the first occurrence of delim in the input,
// returning a slice containing the data up to and including the delimiter.
// If ReadBytes encounters an error before finding a delimiter,
// it returns the data read before the error and the error itself (often io.EOF).
// ReadBytes returns err != nil if and only if the returned data does not end in
// delim.
// For simple uses, a Scanner may be more convenient.
func (b *Reader) ReadBytes(delim byte) ([]byte, error) {
full, frag, n, err := b.collectFragments(delim)
// Allocate new buffer to hold the full pieces and the fragment.
buf := make([]byte, n)
n = 0
@ -472,8 +480,16 @@ func (b *Reader) ReadBytes(delim byte) ([]byte, error) {
// delim.
// For simple uses, a Scanner may be more convenient.
func (b *Reader) ReadString(delim byte) (string, error) {
bytes, err := b.ReadBytes(delim)
return string(bytes), err
full, frag, n, err := b.collectFragments(delim)
// Allocate new buffer to hold the full pieces and the fragment.
var buf strings.Builder
buf.Grow(n)
// Copy full pieces and fragment in.
for _, fb := range full {
buf.Write(fb)
}
buf.Write(frag)
return buf.String(), err
}
// WriteTo implements io.WriterTo.

View file

@ -535,6 +535,23 @@ func TestReadWriteRune(t *testing.T) {
}
}
func TestReadStringAllocs(t *testing.T) {
r := strings.NewReader(" foo foo 42 42 42 42 42 42 42 42 4.2 4.2 4.2 4.2\n")
buf := NewReader(r)
allocs := testing.AllocsPerRun(100, func() {
r.Seek(0, io.SeekStart)
buf.Reset(r)
_, err := buf.ReadString('\n')
if err != nil {
t.Fatal(err)
}
})
if allocs != 1 {
t.Errorf("Unexpected number of allocations, got %f, want 1", allocs)
}
}
func TestWriter(t *testing.T) {
var data [8192]byte
@ -1644,6 +1661,21 @@ func BenchmarkReaderWriteToOptimal(b *testing.B) {
}
}
func BenchmarkReaderReadString(b *testing.B) {
r := strings.NewReader(" foo foo 42 42 42 42 42 42 42 42 4.2 4.2 4.2 4.2\n")
buf := NewReader(r)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
r.Seek(0, io.SeekStart)
buf.Reset(r)
_, err := buf.ReadString('\n')
if err != nil {
b.Fatal(err)
}
}
}
func BenchmarkWriterCopyOptimal(b *testing.B) {
// Optimal case is where the underlying writer implements io.ReaderFrom
srcBuf := bytes.NewBuffer(make([]byte, 8192))

View file

@ -42,7 +42,7 @@ gettargets() {
}
selectedtargets() {
gettargets | egrep -v 'android-arm|darwin-arm' | egrep "$pattern"
gettargets | egrep -v 'android-arm|darwin-arm64' | egrep "$pattern"
}
# put linux first in the target list to get all the architectures up front.

View file

@ -445,8 +445,9 @@ func Fields(s []byte) [][]byte {
// It splits the slice s at each run of code points c satisfying f(c) and
// returns a slice of subslices of s. If all code points in s satisfy f(c), or
// len(s) == 0, an empty slice is returned.
// FieldsFunc makes no guarantees about the order in which it calls f(c).
// If f does not return consistent results for a given c, FieldsFunc may crash.
//
// FieldsFunc makes no guarantees about the order in which it calls f(c)
// and assumes that f always returns the same value for a given c.
func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
// A span is used to record a slice of s of the form s[start:end].
// The start index is inclusive and the end index is exclusive.
@ -457,8 +458,10 @@ func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
spans := make([]span, 0, 32)
// Find the field start and end indices.
wasField := false
fromIndex := 0
// Doing this in a separate pass (rather than slicing the string s
// and collecting the result substrings right away) is significantly
// more efficient, possibly due to cache effects.
start := -1 // valid span start if >= 0
for i := 0; i < len(s); {
size := 1
r := rune(s[i])
@ -466,22 +469,21 @@ func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
r, size = utf8.DecodeRune(s[i:])
}
if f(r) {
if wasField {
spans = append(spans, span{start: fromIndex, end: i})
wasField = false
if start >= 0 {
spans = append(spans, span{start, i})
start = -1
}
} else {
if !wasField {
fromIndex = i
wasField = true
if start < 0 {
start = i
}
}
i += size
}
// Last field might end at EOF.
if wasField {
spans = append(spans, span{fromIndex, len(s)})
if start >= 0 {
spans = append(spans, span{start, len(s)})
}
// Create subslices from recorded field indices.

View file

@ -60,8 +60,6 @@ var contexts = []*build.Context{
{GOOS: "linux", GOARCH: "amd64"},
{GOOS: "linux", GOARCH: "arm", CgoEnabled: true},
{GOOS: "linux", GOARCH: "arm"},
{GOOS: "darwin", GOARCH: "386", CgoEnabled: true},
{GOOS: "darwin", GOARCH: "386"},
{GOOS: "darwin", GOARCH: "amd64", CgoEnabled: true},
{GOOS: "darwin", GOARCH: "amd64"},
{GOOS: "windows", GOARCH: "amd64"},
@ -252,6 +250,13 @@ func featureWithoutContext(f string) string {
return spaceParensRx.ReplaceAllString(f, "")
}
// portRemoved reports whether the given port-specific API feature is
// okay to no longer exist because its port was removed.
func portRemoved(feature string) bool {
return strings.Contains(feature, "(darwin-386)") ||
strings.Contains(feature, "(darwin-386-cgo)")
}
func compareAPI(w io.Writer, features, required, optional, exception []string, allowAdd bool) (ok bool) {
ok = true
@ -279,6 +284,8 @@ func compareAPI(w io.Writer, features, required, optional, exception []string, a
// acknowledged by being in the file
// "api/except.txt". No need to print them out
// here.
} else if portRemoved(feature) {
// okay.
} else if featureSet[featureWithoutContext(feature)] {
// okay.
} else {
@ -437,6 +444,11 @@ type listImports struct {
var listCache sync.Map // map[string]listImports, keyed by contextName
// listSem is a semaphore restricting concurrent invocations of 'go list'.
var listSem = make(chan semToken, runtime.GOMAXPROCS(0))
type semToken struct{}
// loadImports populates w with information about the packages in the standard
// library and the packages they themselves import in w's build context.
//
@ -461,6 +473,9 @@ func (w *Walker) loadImports() {
imports, ok := listCache.Load(name)
if !ok {
listSem <- semToken{}
defer func() { <-listSem }()
cmd := exec.Command(goCmd(), "list", "-e", "-deps", "-json", "std")
cmd.Env = listEnv(w.context)
out, err := cmd.CombinedOutput()

View file

@ -140,7 +140,6 @@ func TestCompareAPI(t *testing.T) {
name: "contexts reconverging",
required: []string{
"A",
"pkg syscall (darwin-386), type RawSockaddrInet6 struct",
"pkg syscall (darwin-amd64), type RawSockaddrInet6 struct",
},
features: []string{

View file

@ -390,7 +390,12 @@ func TestARM64Errors(t *testing.T) {
}
func TestAMD64EndToEnd(t *testing.T) {
testEndToEnd(t, "amd64", "amd64")
defer func(old string) { objabi.GOAMD64 = old }(objabi.GOAMD64)
for _, goamd64 := range []string{"normaljumps", "alignedjumps"} {
t.Logf("GOAMD64=%s", goamd64)
objabi.GOAMD64 = goamd64
testEndToEnd(t, "amd64", "amd64")
}
}
func Test386Encoder(t *testing.T) {

View file

@ -748,6 +748,14 @@ label1:
COPY R2,R1
PASTECC R2,R1
// Modulo signed/unsigned double/word X-form
// <MNEMONIC> RA,RB,RT produces
// <mnemonic> RT,RA,RB
MODUD R3,R4,R5
MODUW R3,R4,R5
MODSD R3,R4,R5
MODSW R3,R4,R5
// VMX instructions
// Described as:
@ -950,12 +958,19 @@ label1:
VCMPGTSDCC V3, V2, V1
VCMPNEZB V3, V2, V1
VCMPNEZBCC V3, V2, V1
VCMPNEB V3, V2, V1
VCMPNEBCC V3, V2, V1
VCMPNEH V3, V2, V1
VCMPNEHCC V3, V2, V1
VCMPNEW V3, V2, V1
VCMPNEWCC V3, V2, V1
// Vector permute, VA-form
// <MNEMONIC> VRA,VRB,VRC,VRT produces
// <mnemonic> VRT,VRA,VRB,VRC
VPERM V3, V2, V1, V0
VPERMXOR V3, V2, V1, V0
VPERMR V3, V2, V1, V0
// Vector bit permute, VX-form
// <MNEMONIC> VRA,VRB,VRT produces
@ -1019,6 +1034,9 @@ label1:
LXSIWAX (R1)(R2*1), VS0
LXSIWZX (R1)(R2*1), VS0
// VSX load with length X-form (also left-justified)
LXVL R3,R4, VS0
LXVLL R3,R4, VS0
// VSX load, DQ-form
// <MNEMONIC> DQ(RA), XS produces
// <mnemonic> XS, DQ(RA)
@ -1039,6 +1057,10 @@ label1:
// <mnemonic> XS, DQ(RA)
STXV VS63, -32752(R1)
// VSX store with length, X-form (also left-justified)
STXVL VS0, R3,R4
STXVLL VS0, R3,R4
// VSX move from VSR, XX1-form
// <MNEMONIC> XS,RA produces
// <mnemonic> RA,XS
@ -1076,6 +1098,7 @@ label1:
XXLNOR VS0,VS1,VS32
XXLORQ VS0,VS1,VS32
XXLXOR VS0,VS1,VS32
XXLOR VS0,VS1,VS32
// VSX select, XX4-form
// <MNEMONIC> XA,XB,XC,XT produces
@ -1092,6 +1115,7 @@ label1:
// <MNEMONIC> XB,UIM,XT produces
// <mnemonic> XT,XB,UIM
XXSPLTW VS0,$3,VS32
XXSPLTIB $26,VS0
// VSX permute, XX3-form
// <MNEMONIC> XA,XB,XT produces
@ -1108,6 +1132,14 @@ label1:
// <mnemonic> XT,XA,XB,SHW
XXSLDWI VS0,VS1,$3,VS32
// VSX byte-reverse XX2-form
// <MNEMONIC> XB,XT produces
// <mnemonic> XT,XB
XXBRQ VS0,VS1
XXBRD VS0,VS1
XXBRW VS0,VS1
XXBRH VS0,VS1
// VSX scalar FP-FP conversion, XX2-form
// <MNEMONIC> XB,XT produces
// <mnemonic> XT,XB

View file

@ -19,8 +19,66 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
MOVD $-32767, R5 // 38a08001
MOVD $-32768, R6 // 38c08000
MOVD $1234567, R5 // 6405001260a5d687
MOVW $1, R3 // 38600001
MOVW $-1, R4 // 3880ffff
MOVW $65535, R5 // 6005ffff
MOVW $65536, R6 // 64060001
MOVW $-32767, R5 // 38a08001
MOVW $-32768, R6 // 38c08000
MOVW $1234567, R5 // 6405001260a5d687
MOVD 8(R3), R4 // e8830008
MOVD (R3)(R4), R5 // 7ca4182a
MOVW 4(R3), R4 // e8830006
MOVW (R3)(R4), R5 // 7ca41aaa
MOVWZ 4(R3), R4 // 80830004
MOVWZ (R3)(R4), R5 // 7ca4182e
MOVH 4(R3), R4 // a8830004
MOVH (R3)(R4), R5 // 7ca41aae
MOVHZ 2(R3), R4 // a0830002
MOVHZ (R3)(R4), R5 // 7ca41a2e
MOVB 1(R3), R4 // 888300017c840774
MOVB (R3)(R4), R5 // 7ca418ae7ca50774
MOVBZ 1(R3), R4 // 88830001
MOVBZ (R3)(R4), R5 // 7ca418ae
MOVDBR (R3)(R4), R5 // 7ca41c28
MOVWBR (R3)(R4), R5 // 7ca41c2c
MOVHBR (R3)(R4), R5 // 7ca41e2c
MOVDU 8(R3), R4 // e8830009
MOVDU (R3)(R4), R5 // 7ca4186a
MOVWU (R3)(R4), R5 // 7ca41aea
MOVWZU 4(R3), R4 // 84830004
MOVWZU (R3)(R4), R5 // 7ca4186e
MOVHU 2(R3), R4 // ac830002
MOVHU (R3)(R4), R5 // 7ca41aee
MOVHZU 2(R3), R4 // a4830002
MOVHZU (R3)(R4), R5 // 7ca41a6e
MOVBU 1(R3), R4 // 8c8300017c840774
MOVBU (R3)(R4), R5 // 7ca418ee7ca50774
MOVBZU 1(R3), R4 // 8c830001
MOVBZU (R3)(R4), R5 // 7ca418ee
MOVD R4, 8(R3) // f8830008
MOVD R5, (R3)(R4) // 7ca4192a
MOVW R4, 4(R3) // 90830004
MOVW R5, (R3)(R4) // 7ca4192e
MOVH R4, 2(R3) // b0830002
MOVH R5, (R3)(R4) // 7ca41b2e
MOVB R4, 1(R3) // 98830001
MOVB R5, (R3)(R4) // 7ca419ae
MOVDBR R5, (R3)(R4) // 7ca41d28
MOVWBR R5, (R3)(R4) // 7ca41d2c
MOVHBR R5, (R3)(R4) // 7ca41f2c
MOVDU R4, 8(R3) // f8830009
MOVDU R5, (R3)(R4) // 7ca4196a
MOVWU R4, 4(R3) // 94830004
MOVWU R5, (R3)(R4) // 7ca4196e
MOVHU R4, 2(R3) // b4830002
MOVHU R5, (R3)(R4) // 7ca41b6e
MOVBU R4, 1(R3) // 9c830001
MOVBU R5, (R3)(R4) // 7ca419ee
// add constants
ADD $1, R3 // 38630001
ADD $1, R3, R4 // 38830001
ADD $-1, R4 // 3884ffff
@ -35,8 +93,9 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
ADD $-32768, R6, R5 // 38a68000
ADD $1234567, R5 // 641f001263ffd6877cbf2a14
ADD $1234567, R5, R6 // 641f001263ffd6877cdf2a14
ADDIS $8, R3 // 3c630008
ADDIS $1000, R3, R4 // 3c8303e8
// and constants
ANDCC $1, R3 // 70630001
ANDCC $1, R3, R4 // 70640001
ANDCC $-1, R4 // 3be0ffff7fe42039
@ -51,8 +110,9 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
ANDCC $-32768, R5, R6 // 3be080007fe62839
ANDCC $1234567, R5 // 641f001263ffd6877fe52839
ANDCC $1234567, R5, R6 // 641f001263ffd6877fe62839
ANDISCC $1, R3 // 74630001
ANDISCC $1000, R3, R4 // 746403e8
// or constants
OR $1, R3 // 60630001
OR $1, R3, R4 // 60640001
OR $-1, R4 // 3be0ffff7fe42378
@ -68,7 +128,6 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
OR $1234567, R5 // 641f001263ffd6877fe52b78
OR $1234567, R5, R3 // 641f001263ffd6877fe32b78
// or constants
XOR $1, R3 // 68630001
XOR $1, R3, R4 // 68640001
XOR $-1, R4 // 3be0ffff7fe42278
@ -84,6 +143,177 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
XOR $1234567, R5 // 641f001263ffd6877fe52a78
XOR $1234567, R5, R3 // 641f001263ffd6877fe32a78
// TODO: the order of CR operands doesn't match
CMP R3, R4 // 7c232000
CMPU R3, R4 // 7c232040
CMPW R3, R4 // 7c032000
CMPWU R3, R4 // 7c032040
// TODO: constants for ADDC?
ADD R3, R4 // 7c841a14
ADD R3, R4, R5 // 7ca41a14
ADDC R3, R4 // 7c841814
ADDC R3, R4, R5 // 7ca41814
ADDE R3, R4 // 7c841914
ADDECC R3, R4 // 7c841915
ADDEV R3, R4 // 7c841d14
ADDEVCC R3, R4 // 7c841d15
ADDV R3, R4 // 7c841e14
ADDVCC R3, R4 // 7c841e15
ADDCCC R3, R4, R5 // 7ca41815
ADDME R3, R4 // 7c8301d4
ADDMECC R3, R4 // 7c8301d5
ADDMEV R3, R4 // 7c8305d4
ADDMEVCC R3, R4 // 7c8305d5
ADDCV R3, R4 // 7c841c14
ADDCVCC R3, R4 // 7c841c15
ADDZE R3, R4 // 7c830194
ADDZECC R3, R4 // 7c830195
ADDZEV R3, R4 // 7c830594
ADDZEVCC R3, R4 // 7c830595
SUBME R3, R4 // 7c8301d0
SUBMECC R3, R4 // 7c8301d1
SUBMEV R3, R4 // 7c8305d0
SUBZE R3, R4 // 7c830190
SUBZECC R3, R4 // 7c830191
SUBZEV R3, R4 // 7c830590
SUBZEVCC R3, R4 // 7c830591
AND R3, R4 // 7c841838
AND R3, R4, R5 // 7c851838
ANDN R3, R4, R5 // 7c851878
ANDCC R3, R4, R5 // 7c851839
OR R3, R4 // 7c841b78
OR R3, R4, R5 // 7c851b78
ORN R3, R4, R5 // 7c851b38
ORCC R3, R4, R5 // 7c851b79
XOR R3, R4 // 7c841a78
XOR R3, R4, R5 // 7c851a78
XORCC R3, R4, R5 // 7c851a79
NAND R3, R4, R5 // 7c851bb8
NANDCC R3, R4, R5 // 7c851bb9
EQV R3, R4, R5 // 7c851a38
EQVCC R3, R4, R5 // 7c851a39
NOR R3, R4, R5 // 7c8518f8
NORCC R3, R4, R5 // 7c8518f9
SUB R3, R4 // 7c832050
SUB R3, R4, R5 // 7ca32050
SUBC R3, R4 // 7c832010
SUBC R3, R4, R5 // 7ca32010
MULLW R3, R4 // 7c8419d6
MULLW R3, R4, R5 // 7ca419d6
MULLWCC R3, R4, R5 // 7ca419d7
MULHW R3, R4, R5 // 7ca41896
MULHWU R3, R4, R5 // 7ca41816
MULLD R3, R4 // 7c8419d2
MULLD R4, R4, R5 // 7ca421d2
MULLDCC R3, R4, R5 // 7ca419d3
MULHD R3, R4, R5 // 7ca41892
MULHDCC R3, R4, R5 // 7ca41893
MULLWV R3, R4 // 7c841dd6
MULLWV R3, R4, R5 // 7ca41dd6
MULLWVCC R3, R4, R5 // 7ca41dd7
MULHWUCC R3, R4, R5 // 7ca41817
MULLDV R3, R4, R5 // 7ca41dd2
MULLDVCC R3, R4, R5 // 7ca41dd3
DIVD R3,R4 // 7c841bd2
DIVD R3, R4, R5 // 7ca41bd2
DIVDCC R3,R4, R5 // 7ca41bd3
DIVDU R3, R4, R5 // 7ca41b92
DIVDV R3, R4, R5 // 7ca41fd2
DIVDUCC R3, R4, R5 // 7ca41b93
DIVDVCC R3, R4, R5 // 7ca41fd3
DIVDUV R3, R4, R5 // 7ca41f92
DIVDUVCC R3, R4, R5 // 7ca41f93
DIVDE R3, R4, R5 // 7ca41b52
DIVDECC R3, R4, R5 // 7ca41b53
DIVDEU R3, R4, R5 // 7ca41b12
DIVDEUCC R3, R4, R5 // 7ca41b13
REM R3, R4, R5 // 7fe41bd67fff19d67cbf2050
REMU R3, R4, R5 // 7fe41b967fff19d67bff00287cbf2050
REMD R3, R4, R5 // 7fe41bd27fff19d27cbf2050
REMDU R3, R4, R5 // 7fe41b927fff19d27cbf2050
MODUD R3, R4, R5 // 7ca41a12
MODUW R3, R4, R5 // 7ca41a16
MODSD R3, R4, R5 // 7ca41e12
MODSW R3, R4, R5 // 7ca41e16
SLW $8, R3, R4 // 5464402e
SLW R3, R4, R5 // 7c851830
SLWCC R3, R4 // 7c841831
SLD $16, R3, R4 // 786483e4
SLD R3, R4, R5 // 7c851836
SLDCC R3, R4 // 7c841837
SRW $8, R3, R4 // 5464c23e
SRW R3, R4, R5 // 7c851c30
SRWCC R3, R4 // 7c841c31
SRAW $8, R3, R4 // 7c644670
SRAW R3, R4, R5 // 7c851e30
SRAWCC R3, R4 // 7c841e31
SRD $16, R3, R4 // 78648402
SRD R3, R4, R5 // 7c851c36
SRDCC R3, R4 // 7c841c37
SRAD $16, R3, R4 // 7c648674
SRAD R3, R4, R5 // 7c851e34
SRDCC R3, R4 // 7c841c37
ROTLW $16, R3, R4 // 5464803e
ROTLW R3, R4, R5 // 5c85183e
RLWMI $7, R3, $65535, R6 // 50663c3e
RLWMICC $7, R3, $65535, R6 // 50663c3f
RLWNM $3, R4, $7, R6 // 54861f7e
RLWNMCC $3, R4, $7, R6 // 54861f7f
RLDMI $0, R4, $7, R6 // 7886076c
RLDMICC $0, R4, $7, R6 // 7886076d
RLDIMI $0, R4, $7, R6 // 788601cc
RLDIMICC $0, R4, $7, R6 // 788601cd
RLDC $0, R4, $15, R6 // 78860728
RLDCCC $0, R4, $15, R6 // 78860729
RLDCL $0, R4, $7, R6 // 78860770
RLDCLCC $0, R4, $15, R6 // 78860721
RLDCR $0, R4, $-16, R6 // 788606f2
RLDCRCC $0, R4, $-16, R6 // 788606f3
RLDICL $0, R4, $15, R6 // 788603c0
RLDICLCC $0, R4, $15, R6 // 788603c1
RLDICR $0, R4, $15, R6 // 788603c4
RLDICRCC $0, R4, $15, R6 // 788603c5
BEQ 0(PC) // 41820000
BGE 0(PC) // 40800000
BGT 4(PC) // 41810030
BLE 0(PC) // 40810000
BLT 0(PC) // 41800000
BNE 0(PC) // 40820000
JMP 8(PC) // 48000020
CRAND CR1, CR2, CR3 // 4c620a02
CRANDN CR1, CR2, CR3 // 4c620902
CREQV CR1, CR2, CR3 // 4c620a42
CRNAND CR1, CR2, CR3 // 4c6209c2
CRNOR CR1, CR2, CR3 // 4c620842
CROR CR1, CR2, CR3 // 4c620b82
CRORN CR1, CR2, CR3 // 4c620b42
CRXOR CR1, CR2, CR3 // 4c620982
ISEL $1, R3, R4, R5 // 7ca3205e
ISEL $0, R3, R4, R5 // 7ca3201e
ISEL $2, R3, R4, R5 // 7ca3209e
ISEL $3, R3, R4, R5 // 7ca320de
ISEL $4, R3, R4, R5 // 7ca3211e
POPCNTB R3, R4 // 7c6400f4
POPCNTW R3, R4 // 7c6402f4
POPCNTD R3, R4 // 7c6403f4
PASTECC R3, R4 // 7c23270d
COPY R3, R4 // 7c23260c
// load-and-reserve
LBAR (R4)(R3*1),$1,R5 // 7ca32069
LBAR (R4),$0,R5 // 7ca02068
@ -98,7 +328,304 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
LDAR (R4),$0,R5 // 7ca020a8
LDAR (R3),R5 // 7ca018a8
STBCCC R3, (R4)(R5) // 7c65256d
STWCCC R3, (R4)(R5) // 7c65212d
STDCCC R3, (R4)(R5) // 7c6521ad
STHCCC R3, (R4)(R5)
SYNC // 7c0004ac
ISYNC // 4c00012c
LWSYNC // 7c2004ac
DCBF (R3)(R4) // 7c0418ac
DCBI (R3)(R4) // 7c041bac
DCBST (R3)(R4) // 7c04186c
DCBZ (R3)(R4) // 7c041fec
DCBT (R3)(R4) // 7c041a2c
ICBI (R3)(R4) // 7c041fac
// float constants
FMOVD $(0.0), F1 // f0210cd0
FMOVD $(-0.0), F1 // f0210cd0fc200850
FMOVD 8(R3), F1 // c8230008
FMOVD (R3)(R4), F1 // 7c241cae
FMOVDU 8(R3), F1 // cc230008
FMOVDU (R3)(R4), F1 // 7c241cee
FMOVS 4(R3), F1 // c0230004
FMOVS (R3)(R4), F1 // 7c241c2e
FMOVSU 4(R3), F1 // c4230004
FMOVSU (R3)(R4), F1 // 7c241c6e
FMOVD F1, 8(R3) // d8230008
FMOVD F1, (R3)(R4) // 7c241dae
FMOVDU F1, 8(R3) // dc230008
FMOVDU F1, (R3)(R4) // 7c241dee
FMOVS F1, 4(R3) // d0230004
FMOVS F1, (R3)(R4) // 7c241d2e
FMOVSU F1, 4(R3) // d4230004
FMOVSU F1, (R3)(R4) // 7c241d6e
FADD F1, F2 // fc42082a
FADD F1, F2, F3 // fc62082a
FADDCC F1, F2, F3 // fc62082b
FADDS F1, F2 // ec42082a
FADDS F1, F2, F3 // ec62082a
FADDSCC F1, F2, F3 // ec62082b
FSUB F1, F2 // fc420828
FSUB F1, F2, F3 // fc620828
FSUBCC F1, F2, F3 // fc620829
FSUBS F1, F2 // ec420828
FSUBS F1, F2, F3 // ec620828
FSUBCC F1, F2, F3 // fc620829
FMUL F1, F2 // fc420072
FMUL F1, F2, F3 // fc620072
FMULCC F1, F2, F3 // fc620073
FMULS F1, F2 // ec420072
FMULS F1, F2, F3 // ec620072
FMULSCC F1, F2, F3 // ec620073
FDIV F1, F2 // fc420824
FDIV F1, F2, F3 // fc620824
FDIVCC F1, F2, F3 // fc620825
FDIVS F1, F2 // ec420824
FDIVS F1, F2, F3 // ec620824
FDIVSCC F1, F2, F3 // ec620825
FMADD F1, F2, F3, F4 // fc8110fa
FMADDCC F1, F2, F3, F4 // fc8110fb
FMADDS F1, F2, F3, F4 // ec8110fa
FMADDSCC F1, F2, F3, F4 // ec8110fb
FMSUB F1, F2, F3, F4 // fc8110f8
FMSUBCC F1, F2, F3, F4 // fc8110f9
FMSUBS F1, F2, F3, F4 // ec8110f8
FMSUBSCC F1, F2, F3, F4 // ec8110f9
FNMADD F1, F2, F3, F4 // fc8110fe
FNMADDCC F1, F2, F3, F4 // fc8110ff
FNMADDS F1, F2, F3, F4 // ec8110fe
FNMADDSCC F1, F2, F3, F4 // ec8110ff
FNMSUB F1, F2, F3, F4 // fc8110fc
FNMSUBCC F1, F2, F3, F4 // fc8110fd
FNMSUBS F1, F2, F3, F4 // ec8110fc
FNMSUBSCC F1, F2, F3, F4 // ec8110fd
FSEL F1, F2, F3, F4 // fc8110ee
FSELCC F1, F2, F3, F4 // fc8110ef
FABS F1, F2 // fc400a10
FABSCC F1, F2 // fc400a11
FNEG F1, F2 // fc400850
FABSCC F1, F2 // fc400a11
FRSP F1, F2 // fc400818
FRSPCC F1, F2 // fc400819
FCTIW F1, F2 // fc40081c
FCTIWCC F1, F2 // fc40081d
FCTIWZ F1, F2 // fc40081e
FCTIWZCC F1, F2 // fc40081f
FCTID F1, F2 // fc400e5c
FCTIDCC F1, F2 // fc400e5d
FCTIDZ F1, F2 // fc400e5e
FCTIDZCC F1, F2 // fc400e5f
FCFID F1, F2 // fc400e9c
FCFIDCC F1, F2 // fc400e9d
FCFIDU F1, F2 // fc400f9c
FCFIDUCC F1, F2 // fc400f9d
FCFIDS F1, F2 // ec400e9c
FCFIDSCC F1, F2 // ec400e9d
FRES F1, F2 // ec400830
FRESCC F1, F2 // ec400831
FRIM F1, F2 // fc400bd0
FRIMCC F1, F2 // fc400bd1
FRIP F1, F2 // fc400b90
FRIPCC F1, F2 // fc400b91
FRIZ F1, F2 // fc400b50
FRIZCC F1, F2 // fc400b51
FRIN F1, F2 // fc400b10
FRINCC F1, F2 // fc400b11
FRSQRTE F1, F2 // fc400834
FRSQRTECC F1, F2 // fc400835
FSQRT F1, F2 // fc40082c
FSQRTCC F1, F2 // fc40082d
FSQRTS F1, F2 // ec40082c
FSQRTSCC F1, F2 // ec40082d
FCPSGN F1, F2 // fc420810
FCPSGNCC F1, F2 // fc420811
FCMPO F1, F2 // fc011040
FCMPU F1, F2 // fc011000
LVX (R3)(R4), V1 // 7c2418ce
LVXL (R3)(R4), V1 // 7c241ace
LVSL (R3)(R4), V1 // 7c24180c
LVSR (R3)(R4), V1 // 7c24184c
LVEBX (R3)(R4), V1 // 7c24180e
LVEHX (R3)(R4), V1 // 7c24184e
LVEWX (R3)(R4), V1 // 7c24188e
STVX V1, (R3)(R4) // 7c2419ce
STVXL V1, (R3)(R4) // 7c241bce
STVEBX V1, (R3)(R4) // 7c24190e
STVEHX V1, (R3)(R4) // 7c24194e
STVEWX V1, (R3)(R4) // 7c24198e
VAND V1, V2, V3 // 10611404
VANDC V1, V2, V3 // 10611444
VNAND V1, V2, V3 // 10611584
VOR V1, V2, V3 // 10611484
VORC V1, V2, V3 // 10611544
VXOR V1, V2, V3 // 106114c4
VNOR V1, V2, V3 // 10611504
VEQV V1, V2, V3 // 10611684
VADDUBM V1, V2, V3 // 10611000
VADDUHM V1, V2, V3 // 10611040
VADDUWM V1, V2, V3 // 10611080
VADDUDM V1, V2, V3 // 106110c0
VADDUQM V1, V2, V3 // 10611100
VADDCUQ V1, V2, V3 // 10611140
VADDCUW V1, V2, V3 // 10611180
VADDUBS V1, V2, V3 // 10611200
VADDUHS V1, V2, V3 // 10611240
VADDUWS V1, V2, V3 // 10611280
VSUBUBM V1, V2, V3 // 10611400
VSUBUHM V1, V2, V3 // 10611440
VSUBUWM V1, V2, V3 // 10611480
VSUBUDM V1, V2, V3 // 106114c0
VSUBUQM V1, V2, V3 // 10611500
VSUBCUQ V1, V2, V3 // 10611540
VSUBCUW V1, V2, V3 // 10611580
VSUBUBS V1, V2, V3 // 10611600
VSUBUHS V1, V2, V3 // 10611640
VSUBUWS V1, V2, V3 // 10611680
VSUBSBS V1, V2, V3 // 10611700
VSUBSHS V1, V2, V3 // 10611740
VSUBSWS V1, V2, V3 // 10611780
VSUBEUQM V1, V2, V3, V4 // 108110fe
VSUBECUQ V1, V2, V3, V4 // 108110ff
VMULESB V1, V2, V3 // 10611308
VMULOSB V1, V2, V3 // 10611108
VMULEUB V1, V2, V3 // 10611208
VMULOUB V1, V2, V3 // 10611008
VMULESH V1, V2, V3 // 10611348
VMULOSH V1, V2, V3 // 10611148
VMULEUH V1, V2, V3 // 10611248
VMULOUH V1, V2, V3 // 10611048
VMULESH V1, V2, V3 // 10611348
VMULOSW V1, V2, V3 // 10611188
VMULEUW V1, V2, V3 // 10611288
VMULOUW V1, V2, V3 // 10611088
VMULUWM V1, V2, V3 // 10611089
VPMSUMB V1, V2, V3 // 10611408
VPMSUMH V1, V2, V3 // 10611448
VPMSUMW V1, V2, V3 // 10611488
VPMSUMD V1, V2, V3 // 106114c8
VMSUMUDM V1, V2, V3, V4 // 108110e3
VRLB V1, V2, V3 // 10611004
VRLH V1, V2, V3 // 10611044
VRLW V1, V2, V3 // 10611084
VRLD V1, V2, V3 // 106110c4
VSLB V1, V2, V3 // 10611104
VSLH V1, V2, V3 // 10611144
VSLW V1, V2, V3 // 10611184
VSL V1, V2, V3 // 106111c4
VSLO V1, V2, V3 // 1061140c
VSRB V1, V2, V3 // 10611204
VSRH V1, V2, V3 // 10611244
VSRW V1, V2, V3 // 10611284
VSR V1, V2, V3 // 106112c4
VSRO V1, V2, V3 // 1061144c
VSLD V1, V2, V3 // 106115c4
VSRAB V1, V2, V3 // 10611304
VSRAH V1, V2, V3 // 10611344
VSRAW V1, V2, V3 // 10611384
VSRAD V1, V2, V3 // 106113c4
VSLDOI $3, V1, V2, V3 // 106110ec
VCLZB V1, V2 // 10400f02
VCLZH V1, V2 // 10400f42
VCLZW V1, V2 // 10400f82
VCLZD V1, V2 // 10400fc2
VPOPCNTB V1, V2 // 10400f03
VPOPCNTH V1, V2 // 10400f43
VPOPCNTW V1, V2 // 10400f83
VPOPCNTD V1, V2 // 10400fc3
VCMPEQUB V1, V2, V3 // 10611006
VCMPEQUBCC V1, V2, V3 // 10611406
VCMPEQUH V1, V2, V3 // 10611046
VCMPEQUHCC V1, V2, V3 // 10611446
VCMPEQUW V1, V2, V3 // 10611086
VCMPEQUWCC V1, V2, V3 // 10611486
VCMPEQUD V1, V2, V3 // 106110c7
VCMPEQUDCC V1, V2, V3 // 106114c7
VCMPGTUB V1, V2, V3 // 10611206
VCMPGTUBCC V1, V2, V3 // 10611606
VCMPGTUH V1, V2, V3 // 10611246
VCMPGTUHCC V1, V2, V3 // 10611646
VCMPGTUW V1, V2, V3 // 10611286
VCMPGTUWCC V1, V2, V3 // 10611686
VCMPGTUD V1, V2, V3 // 106112c7
VCMPGTUDCC V1, V2, V3 // 106116c7
VCMPGTSB V1, V2, V3 // 10611306
VCMPGTSBCC V1, V2, V3 // 10611706
VCMPGTSH V1, V2, V3 // 10611346
VCMPGTSHCC V1, V2, V3 // 10611746
VCMPGTSW V1, V2, V3 // 10611386
VCMPGTSWCC V1, V2, V3 // 10611786
VCMPGTSD V1, V2, V3 // 106113c7
VCMPGTSDCC V1, V2, V3 // 106117c7
VCMPNEZB V1, V2, V3 // 10611107
VCMPNEZBCC V1, V2, V3 // 10611507
VCMPNEB V1, V2, V3 // 10611007
VCMPNEBCC V1, V2, V3 // 10611407
VCMPNEH V1, V2, V3 // 10611047
VCMPNEHCC V1, V2, V3 // 10611447
VCMPNEW V1, V2, V3 // 10611087
VCMPNEWCC V1, V2, V3 // 10611487
VPERM V1, V2, V3, V4 // 108110eb
VPERMR V1, V2, V3, V4 // 108110fb
VPERMXOR V1, V2, V3, V4 // 108110ed
VBPERMQ V1, V2, V3 // 1061154c
VBPERMD V1, V2, V3 // 106115cc
VSEL V1, V2, V3, V4 // 108110ea
VSPLTB $1, V1, V2 // 10410a0c
VSPLTH $1, V1, V2 // 10410a4c
VSPLTW $1, V1, V2 // 10410a8c
VSPLTISB $1, V1 // 1021030c
VSPLTISW $1, V1 // 1021038c
VSPLTISH $1, V1 // 1021034c
VCIPHER V1, V2, V3 // 10611508
VCIPHERLAST V1, V2, V3 // 10611509
VNCIPHER V1, V2, V3 // 10611548
VNCIPHERLAST V1, V2, V3 // 10611549
VSBOX V1, V2 // 104105c8
VSHASIGMAW $1, V1, $15, V2 // 10418e82
VSHASIGMAD $2, V1, $15, V2 // 104196c2
LXVD2X (R3)(R4), VS1 // 7c241e98
LXV 16(R3), VS1 // f4230011
LXVL R3, R4, VS1 // 7c23221a
LXVLL R3, R4, VS1 // 7c23225a
LXSDX (R3)(R4), VS1 // 7c241c98
STXVD2X VS1, (R3)(R4) // 7c241f98
STXV VS1,16(R3) // f4230015
STXVL VS1, R3, R4 // 7c23231a
STXVLL VS1, R3, R4 // 7c23235a
STXSDX VS1, (R3)(R4) // 7c241d98
LXSIWAX (R3)(R4), VS1 // 7c241898
STXSIWX VS1, (R3)(R4) // 7c241918
MFVSRD VS1, R3 // 7c230066
MTVSRD R3, VS1 // 7c230166
XXLAND VS1, VS2, VS3 // f0611410
XXLOR VS1, VS2, VS3 // f0611490
XXLORC VS1, VS2, VS3 // f0611550
XXLXOR VS1, VS2, VS3 // f06114d0
XXSEL VS1, VS2, VS3, VS4 // f08110f0
XXMRGHW VS1, VS2, VS3 // f0611090
XXSPLTW VS1, $1, VS2 // f0410a90
XXPERM VS1, VS2, VS3 // f06110d0
XXSLDWI VS1, VS2, $1, VS3 // f0611110
XSCVDPSP VS1, VS2 // f0400c24
XVCVDPSP VS1, VS2 // f0400e24
XSCVSXDDP VS1, VS2 // f0400de0
XVCVDPSXDS VS1, VS2 // f0400f60
XVCVSXDDP VS1, VS2 // f0400fe0
MOVD R3, LR // 7c6803a6
MOVD R3, CTR // 7c6903a6
MOVD R3, XER // 7c6103a6
MOVD LR, R3 // 7c6802a6
MOVD CTR, R3 // 7c6902a6
MOVD XER, R3 // 7c6102a6
MOVFL CR3, CR1 // 4c8c0000
RET

View file

@ -89,6 +89,7 @@ TEXT main·foo(SB),DUPOK|NOSPLIT,$16-0 // TEXT main.foo(SB), DUPOK|NOSPLIT, $16-
ADDW R1, R2, R3 // b9f81032
ADDW $8192, R1 // a71a2000
ADDW $8192, R1, R2 // ec21200000d8
ADDE R1, R2 // b9880021
SUB R3, R4 // b9090043
SUB R3, R4, R5 // b9e93054
SUB $8192, R3 // a73be000
@ -183,6 +184,9 @@ TEXT main·foo(SB),DUPOK|NOSPLIT,$16-0 // TEXT main.foo(SB), DUPOK|NOSPLIT, $16-
ADDW 4095(R7), R8 // 5a807fff
ADDW -1(R1), R2 // e3201fffff5a
ADDW 4096(R3), R4 // e3403000015a
ADDE 4096(R3), R4 // e34030000188
ADDE 4096(R3)(R2*1), R4 // e34230000188
ADDE 524288(R3)(R4*1), R5 // c0a10008000041aa4000e35a30000088
MULLD (R1)(R2*1), R3 // e3321000000c
MULLW (R3)(R4*1), R5 // 71543000
MULLW 4096(R3), R4 // e34030000151
@ -259,6 +263,7 @@ TEXT main·foo(SB),DUPOK|NOSPLIT,$16-0 // TEXT main.foo(SB), DUPOK|NOSPLIT, $16-
NC $8, (R15), n-8(SP) // d407f010f000
OC $8, (R15), n-8(SP) // d607f010f000
MVC $8, (R15), n-8(SP) // d207f010f000
MVCIN $8, (R15), n-8(SP) // e807f010f000
CLC $8, (R15), n-8(SP) // d507f000f010
XC $256, -8(R15), -8(R15) // b90400afc2a8fffffff8d7ffa000a000
MVC $256, 8192(R1), 8192(R2) // b90400a2c2a800002000b90400b1c2b800002000d2ffa000b000

View file

@ -23,9 +23,10 @@ var (
Dynlink = flag.Bool("dynlink", false, "support references to Go symbols defined in other shared libraries")
AllErrors = flag.Bool("e", false, "no limit on number of errors reported")
SymABIs = flag.Bool("gensymabis", false, "write symbol ABI information to output file, don't assemble")
Newobj = flag.Bool("newobj", false, "use new object file format")
Importpath = flag.String("p", "", "set expected package import to path")
Spectre = flag.String("spectre", "", "enable spectre mitigations in `list` (all, ret)")
Spectre = flag.String("spectre", "", "enable spectre mitigations in `list` (all, ret)")
Go115Newobj = flag.Bool("go115newobj", true, "use new object file format")
)
var (

View file

@ -40,7 +40,8 @@ func main() {
}
ctxt.Flag_dynlink = *flags.Dynlink
ctxt.Flag_shared = *flags.Shared || *flags.Dynlink
ctxt.Flag_newobj = *flags.Newobj
ctxt.Flag_go115newobj = *flags.Go115Newobj
ctxt.IsAsm = true
switch *flags.Spectre {
default:
log.Printf("unknown setting -spectre=%s", *flags.Spectre)
@ -87,7 +88,7 @@ func main() {
pList.Firstpc, ok = parser.Parse()
// reports errors to parser.Errorf
if ok {
obj.Flushplist(ctxt, pList, nil, "")
obj.Flushplist(ctxt, pList, nil, *flags.Importpath)
}
}
if !ok {

View file

@ -182,6 +182,9 @@ func (p *Package) Translate(f *File) {
numTypedefs = len(p.typedefs)
// Also ask about any typedefs we've seen so far.
for _, info := range p.typedefList {
if f.Name[info.typedef] != nil {
continue
}
n := &Name{
Go: info.typedef,
C: info.typedef,
@ -333,7 +336,7 @@ func (p *Package) guessKinds(f *File) []*Name {
// void __cgo_f_xxx_5(void) { static const char __cgo_undefined__5[] = (name); }
//
// If we see an error at not-declared:xxx, the corresponding name is not declared.
// If we see an error at not-type:xxx, the corresponding name is a type.
// If we see an error at not-type:xxx, the corresponding name is not a type.
// If we see an error at not-int-const:xxx, the corresponding name is not an integer constant.
// If we see an error at not-num-const:xxx, the corresponding name is not a number constant.
// If we see an error at not-str-lit:xxx, the corresponding name is not a string literal.
@ -710,6 +713,9 @@ func (p *Package) prepareNames(f *File) {
}
}
p.mangleName(n)
if n.Kind == "type" && typedef[n.Mangle] == nil {
typedef[n.Mangle] = n.Type
}
}
}
@ -1348,6 +1354,9 @@ func (p *Package) rewriteRef(f *File) {
if *godefs {
// Substitute definition for mangled type name.
if r.Name.Type != nil {
expr = r.Name.Type.Go
}
if id, ok := expr.(*ast.Ident); ok {
if t := typedef[id.Name]; t != nil {
expr = t.Go
@ -1413,9 +1422,7 @@ func (p *Package) rewriteName(f *File, r *Ref) ast.Expr {
r.Context = ctxType
if r.Name.Type == nil {
error_(r.Pos(), "invalid conversion to C.%s: undefined C type '%s'", fixGo(r.Name.Go), r.Name.C)
break
}
expr = r.Name.Type.Go
break
}
error_(r.Pos(), "call of non-function C.%s", fixGo(r.Name.Go))
@ -1472,9 +1479,7 @@ func (p *Package) rewriteName(f *File, r *Ref) ast.Expr {
// Okay - might be new(T)
if r.Name.Type == nil {
error_(r.Pos(), "expression C.%s: undefined C type '%s'", fixGo(r.Name.Go), r.Name.C)
break
}
expr = r.Name.Type.Go
case "var":
expr = &ast.StarExpr{Star: (*r.Expr).Pos(), X: expr}
case "macro":
@ -1493,8 +1498,6 @@ func (p *Package) rewriteName(f *File, r *Ref) ast.Expr {
// Use of C.enum_x, C.struct_x or C.union_x without C definition.
// GCC won't raise an error when using pointers to such unknown types.
error_(r.Pos(), "type C.%s: undefined C type '%s'", fixGo(r.Name.Go), r.Name.C)
} else {
expr = r.Name.Type.Go
}
default:
if r.Name.Kind == "func" {
@ -2060,6 +2063,10 @@ var goIdent = make(map[string]*ast.Ident)
// that may contain a pointer. This is used for cgo pointer checking.
var unionWithPointer = make(map[ast.Expr]bool)
// anonymousStructTag provides a consistent tag for an anonymous struct.
// The same dwarf.StructType pointer will always get the same tag.
var anonymousStructTag = make(map[*dwarf.StructType]string)
func (c *typeConv) Init(ptrSize, intSize int64) {
c.ptrSize = ptrSize
c.intSize = intSize
@ -2408,8 +2415,12 @@ func (c *typeConv) loadType(dtype dwarf.Type, pos token.Pos, parent string) *Typ
break
}
if tag == "" {
tag = "__" + strconv.Itoa(tagGen)
tagGen++
tag = anonymousStructTag[dt]
if tag == "" {
tag = "__" + strconv.Itoa(tagGen)
tagGen++
anonymousStructTag[dt] = tag
}
} else if t.C.Empty() {
t.C.Set(dt.Kind + " " + tag)
}
@ -3006,8 +3017,9 @@ func (c *typeConv) anonymousStructTypedef(dt *dwarf.TypedefType) bool {
return ok && st.StructName == ""
}
// badPointerTypedef reports whether t is a C typedef that should not be considered a pointer in Go.
// A typedef is bad if C code sometimes stores non-pointers in this type.
// badPointerTypedef reports whether dt is a C typedef that should not be
// considered a pointer in Go. A typedef is bad if C code sometimes stores
// non-pointers in this type.
// TODO: Currently our best solution is to find these manually and list them as
// they come up. A better solution is desired.
func (c *typeConv) badPointerTypedef(dt *dwarf.TypedefType) bool {

View file

@ -195,30 +195,58 @@ directive can skip over a directive like any other comment.
// Line directives typically appear in machine-generated code, so that compilers and debuggers
// will report positions in the original input to the generator.
/*
The line directive is an historical special case; all other directives are of the form
//go:name and must start at the beginning of a line, indicating that the directive is defined
by the Go toolchain.
The line directive is a historical special case; all other directives are of the form
//go:name, indicating that they are defined by the Go toolchain.
Each directive must be placed on its own line, with only leading spaces and tabs
allowed before the comment.
Each directive applies to the Go code that immediately follows it,
which typically must be a declaration.
//go:noescape
The //go:noescape directive specifies that the next declaration in the file, which
must be a func without a body (meaning that it has an implementation not written
in Go) does not allow any of the pointers passed as arguments to escape into the
heap or into the values returned from the function. This information can be used
during the compiler's escape analysis of Go code calling the function.
The //go:noescape directive must be followed by a function declaration without
a body (meaning that the function has an implementation not written in Go).
It specifies that the function does not allow any of the pointers passed as
arguments to escape into the heap or into the values returned from the function.
This information can be used during the compiler's escape analysis of Go code
calling the function.
//go:uintptrescapes
The //go:uintptrescapes directive must be followed by a function declaration.
It specifies that the function's uintptr arguments may be pointer values
that have been converted to uintptr and must be treated as such by the
garbage collector. The conversion from pointer to uintptr must appear in
the argument list of any call to this function. This directive is necessary
for some low-level system call implementations and should be avoided otherwise.
//go:noinline
The //go:noinline directive must be followed by a function declaration.
It specifies that calls to the function should not be inlined, overriding
the compiler's usual optimization rules. This is typically only needed
for special runtime functions or when debugging the compiler.
//go:norace
The //go:norace directive must be followed by a function declaration.
It specifies that the function's memory accesses must be ignored by the
race detector. This is most commonly used in low-level code invoked
at times when it is unsafe to call into the race detector runtime.
//go:nosplit
The //go:nosplit directive specifies that the next function declared in the file must
not include a stack overflow check. This is most commonly used by low-level
runtime sources invoked at times when it is unsafe for the calling goroutine to be
preempted.
The //go:nosplit directive must be followed by a function declaration.
It specifies that the function must omit its usual stack overflow check.
This is most commonly used by low-level runtime code invoked
at times when it is unsafe for the calling goroutine to be preempted.
//go:linkname localname [importpath.name]
The //go:linkname directive instructs the compiler to use ``importpath.name'' as the
object file symbol name for the variable or function declared as ``localname'' in the
source code.
This special directive does not apply to the Go code that follows it.
Instead, the //go:linkname directive instructs the compiler to use ``importpath.name''
as the object file symbol name for the variable or function declared as ``localname''
in the source code.
If the ``importpath.name'' argument is omitted, the directive uses the
symbol's default object file symbol name and only has the effect of making
the symbol accessible to other packages.
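As a minimal sketch (the package name and the runtime.fastrand linkname target
are assumptions for illustration, not taken from this file), two of these
directives in use:

	package demo

	import _ "unsafe" // required for //go:linkname

	//go:noinline
	func add(a, b int) int { return a + b }

	//go:linkname fastrand runtime.fastrand
	func fastrand() uint32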

View file

@ -112,6 +112,7 @@ var knownFormats = map[string]string{
"cmd/compile/internal/ssa.Location %s": "",
"cmd/compile/internal/ssa.Op %s": "",
"cmd/compile/internal/ssa.Op %v": "",
"cmd/compile/internal/ssa.Sym %v": "",
"cmd/compile/internal/ssa.ValAndOff %s": "",
"cmd/compile/internal/ssa.domain %v": "",
"cmd/compile/internal/ssa.posetNode %v": "",
@ -156,7 +157,6 @@ var knownFormats = map[string]string{
"int64 %+d": "",
"int64 %-10d": "",
"int64 %.5d": "",
"int64 %X": "",
"int64 %d": "",
"int64 %v": "",
"int64 %x": "",

View file

@ -752,7 +752,12 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Reg = v.Args[0].Reg()
gc.AddAux(&p.To, v)
case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1,
ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8, ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4, ssa.OpAMD64MOVWstoreidx2:
ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8, ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4, ssa.OpAMD64MOVWstoreidx2,
ssa.OpAMD64ADDLmodifyidx1, ssa.OpAMD64ADDLmodifyidx4, ssa.OpAMD64ADDLmodifyidx8, ssa.OpAMD64ADDQmodifyidx1, ssa.OpAMD64ADDQmodifyidx8,
ssa.OpAMD64SUBLmodifyidx1, ssa.OpAMD64SUBLmodifyidx4, ssa.OpAMD64SUBLmodifyidx8, ssa.OpAMD64SUBQmodifyidx1, ssa.OpAMD64SUBQmodifyidx8,
ssa.OpAMD64ANDLmodifyidx1, ssa.OpAMD64ANDLmodifyidx4, ssa.OpAMD64ANDLmodifyidx8, ssa.OpAMD64ANDQmodifyidx1, ssa.OpAMD64ANDQmodifyidx8,
ssa.OpAMD64ORLmodifyidx1, ssa.OpAMD64ORLmodifyidx4, ssa.OpAMD64ORLmodifyidx8, ssa.OpAMD64ORQmodifyidx1, ssa.OpAMD64ORQmodifyidx8,
ssa.OpAMD64XORLmodifyidx1, ssa.OpAMD64XORLmodifyidx4, ssa.OpAMD64XORLmodifyidx8, ssa.OpAMD64XORQmodifyidx1, ssa.OpAMD64XORQmodifyidx8:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[2].Reg()
@ -796,6 +801,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux2(&p.To, v, off)
case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
@ -804,11 +810,29 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux2(&p.To, v, sc.Off())
case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1:
case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1,
ssa.OpAMD64ADDLconstmodifyidx1, ssa.OpAMD64ADDLconstmodifyidx4, ssa.OpAMD64ADDLconstmodifyidx8, ssa.OpAMD64ADDQconstmodifyidx1, ssa.OpAMD64ADDQconstmodifyidx8,
ssa.OpAMD64ANDLconstmodifyidx1, ssa.OpAMD64ANDLconstmodifyidx4, ssa.OpAMD64ANDLconstmodifyidx8, ssa.OpAMD64ANDQconstmodifyidx1, ssa.OpAMD64ANDQconstmodifyidx8,
ssa.OpAMD64ORLconstmodifyidx1, ssa.OpAMD64ORLconstmodifyidx4, ssa.OpAMD64ORLconstmodifyidx8, ssa.OpAMD64ORQconstmodifyidx1, ssa.OpAMD64ORQconstmodifyidx8,
ssa.OpAMD64XORLconstmodifyidx1, ssa.OpAMD64XORLconstmodifyidx4, ssa.OpAMD64XORLconstmodifyidx8, ssa.OpAMD64XORQconstmodifyidx1, ssa.OpAMD64XORQconstmodifyidx8:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
sc := v.AuxValAndOff()
p.From.Offset = sc.Val()
switch {
case p.As == x86.AADDQ && p.From.Offset == 1:
p.As = x86.AINCQ
p.From.Type = obj.TYPE_NONE
case p.As == x86.AADDQ && p.From.Offset == -1:
p.As = x86.ADECQ
p.From.Type = obj.TYPE_NONE
case p.As == x86.AADDL && p.From.Offset == 1:
p.As = x86.AINCL
p.From.Type = obj.TYPE_NONE
case p.As == x86.AADDL && p.From.Offset == -1:
p.As = x86.ADECL
p.From.Type = obj.TYPE_NONE
}
memIdx(&p.To, v)
gc.AddAux2(&p.To, v, sc.Off())
case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
@ -840,6 +864,28 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[1].Reg()
gc.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
if v.Reg() != v.Args[0].Reg() {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
}
case ssa.OpAMD64ADDLloadidx1, ssa.OpAMD64ADDLloadidx4, ssa.OpAMD64ADDLloadidx8, ssa.OpAMD64ADDQloadidx1, ssa.OpAMD64ADDQloadidx8,
ssa.OpAMD64SUBLloadidx1, ssa.OpAMD64SUBLloadidx4, ssa.OpAMD64SUBLloadidx8, ssa.OpAMD64SUBQloadidx1, ssa.OpAMD64SUBQloadidx8,
ssa.OpAMD64ANDLloadidx1, ssa.OpAMD64ANDLloadidx4, ssa.OpAMD64ANDLloadidx8, ssa.OpAMD64ANDQloadidx1, ssa.OpAMD64ANDQloadidx8,
ssa.OpAMD64ORLloadidx1, ssa.OpAMD64ORLloadidx4, ssa.OpAMD64ORLloadidx8, ssa.OpAMD64ORQloadidx1, ssa.OpAMD64ORQloadidx8,
ssa.OpAMD64XORLloadidx1, ssa.OpAMD64XORLloadidx4, ssa.OpAMD64XORLloadidx8, ssa.OpAMD64XORQloadidx1, ssa.OpAMD64XORQloadidx8:
p := s.Prog(v.Op.Asm())
r, i := v.Args[1].Reg(), v.Args[2].Reg()
p.From.Type = obj.TYPE_MEM
p.From.Scale = v.Op.Scale()
if p.From.Scale == 1 && i == x86.REG_SP {
r, i = i, r
}
p.From.Reg = r
p.From.Index = i
gc.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
@ -872,7 +918,16 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p := s.Prog(obj.ADUFFCOPY)
p.To.Type = obj.TYPE_ADDR
p.To.Sym = gc.Duffcopy
p.To.Offset = v.AuxInt
if v.AuxInt%16 != 0 {
v.Fatalf("bad DUFFCOPY AuxInt %v", v.AuxInt)
}
p.To.Offset = 14 * (64 - v.AuxInt/16)
// 14 and 64 are magic constants. 14 is the number of bytes to encode:
// MOVUPS (SI), X0
// ADDQ $16, SI
// MOVUPS X0, (DI)
// ADDQ $16, DI
// and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.
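// As a worked example (with an assumed copy size, not one from this change):
// for a 32-byte copy, v.AuxInt = 32, so the computed offset is
// 14*(64-32/16) = 14*62 = 868, which enters duffcopy just before its final
// two 16-byte copy blocks.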
case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
if v.Type.IsMemory() {
@ -902,6 +957,12 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
gc.AddrAuto(&p.To, v)
case ssa.OpAMD64LoweredHasCPUFeature:
p := s.Prog(x86.AMOVBQZX)
p.From.Type = obj.TYPE_MEM
gc.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64LoweredGetClosurePtr:
// Closure pointer is DX.
gc.CheckLoweredGetClosurePtr(v)
@ -1095,7 +1156,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Reg = x86.REG_AX
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux(&p.To, v)
if logopt.Enabled() {
logopt.LogOpt(v.Pos, "nilcheck", "genssa", v.Block.Func.Name)
}

View file

@ -1083,7 +1083,7 @@ func ssaGenBlock(s *gc.SSAGenState, b, next *ssa.Block) {
s.Br(obj.AJMP, b.Succs[0].Block())
}
}
p.From.Offset = b.Aux.(int64)
p.From.Offset = b.AuxInt
p.From.Type = obj.TYPE_CONST
p.Reg = b.Controls[0].Reg()

View file

@ -8,12 +8,15 @@ import (
"cmd/compile/internal/types"
"cmd/internal/obj"
"fmt"
"sort"
)
// AlgKind describes the kind of algorithms used for comparing and
// hashing a Type.
type AlgKind int
//go:generate stringer -type AlgKind -trimprefix A
const (
// These values are known by runtime.
ANOEQ AlgKind = iota
@ -502,7 +505,7 @@ func geneq(t *types.Type) *obj.LSym {
namedfield("p", types.NewPtr(t)),
namedfield("q", types.NewPtr(t)),
)
tfn.Rlist.Set1(anonfield(types.Types[TBOOL]))
tfn.Rlist.Set1(namedfield("r", types.Types[TBOOL]))
fn := dclfunc(sym, tfn)
np := asNode(tfn.Type.Params().Field(0).Nname)
@ -516,48 +519,137 @@ func geneq(t *types.Type) *obj.LSym {
Fatalf("geneq %v", t)
case TARRAY:
// An array of pure memory would be handled by the
// standard memequal, so the element type must not be
// pure memory. Even if we unrolled the range loop,
// each iteration would be a function call, so don't bother
// unrolling.
nrange := nod(ORANGE, nil, nod(ODEREF, np, nil))
nelem := t.NumElem()
ni := newname(lookup("i"))
ni.Type = types.Types[TINT]
nrange.List.Set1(ni)
nrange.SetColas(true)
colasdefn(nrange.List.Slice(), nrange)
ni = nrange.List.First()
// checkAll generates code to check the equality of all array elements.
// If unroll is greater than nelem, checkAll generates:
//
// if eq(p[0], q[0]) && eq(p[1], q[1]) && ... {
// } else {
// return
// }
//
// And so on.
//
// Otherwise it generates:
//
// for i := 0; i < nelem; i++ {
// if eq(p[i], q[i]) {
// } else {
// return
// }
// }
//
// TODO(josharian): consider doing some loop unrolling
// for larger nelem as well, processing a few elements at a time in a loop.
checkAll := func(unroll int64, eq func(pi, qi *Node) *Node) {
// checkIdx generates a node to check for equality at index i.
checkIdx := func(i *Node) *Node {
// pi := p[i]
pi := nod(OINDEX, np, i)
pi.SetBounded(true)
pi.Type = t.Elem()
// qi := q[i]
qi := nod(OINDEX, nq, i)
qi.SetBounded(true)
qi.Type = t.Elem()
return eq(pi, qi)
}
// if p[i] != q[i] { return false }
nx := nod(OINDEX, np, ni)
if nelem <= unroll {
// Generate a series of checks.
var cond *Node
for i := int64(0); i < nelem; i++ {
c := nodintconst(i)
check := checkIdx(c)
if cond == nil {
cond = check
continue
}
cond = nod(OANDAND, cond, check)
}
nif := nod(OIF, cond, nil)
nif.Rlist.Append(nod(ORETURN, nil, nil))
fn.Nbody.Append(nif)
return
}
nx.SetBounded(true)
ny := nod(OINDEX, nq, ni)
ny.SetBounded(true)
nif := nod(OIF, nil, nil)
nif.Left = nod(ONE, nx, ny)
r := nod(ORETURN, nil, nil)
r.List.Append(nodbool(false))
nif.Nbody.Append(r)
nrange.Nbody.Append(nif)
fn.Nbody.Append(nrange)
// Generate a for loop.
// for i := 0; i < nelem; i++
i := temp(types.Types[TINT])
init := nod(OAS, i, nodintconst(0))
cond := nod(OLT, i, nodintconst(nelem))
post := nod(OAS, i, nod(OADD, i, nodintconst(1)))
loop := nod(OFOR, cond, post)
loop.Ninit.Append(init)
// if eq(pi, qi) {} else { return }
check := checkIdx(i)
nif := nod(OIF, check, nil)
nif.Rlist.Append(nod(ORETURN, nil, nil))
loop.Nbody.Append(nif)
fn.Nbody.Append(loop)
}
switch t.Elem().Etype {
case TINTER:
// Do two loops. First, check that all the types match (cheap).
// Second, check that all the data match (expensive).
// TODO: when the array size is small, unroll the tab match checks.
checkAll(3, func(pi, qi *Node) *Node {
// Compare types.
pi = typecheck(pi, ctxExpr)
qi = typecheck(qi, ctxExpr)
eqtab, _ := eqinterface(pi, qi)
return eqtab
})
checkAll(1, func(pi, qi *Node) *Node {
// Compare data.
pi = typecheck(pi, ctxExpr)
qi = typecheck(qi, ctxExpr)
_, eqdata := eqinterface(pi, qi)
return eqdata
})
case TSTRING:
// Do two loops. First, check that all the lengths match (cheap).
// Second, check that all the contents match (expensive).
// TODO: when the array size is small, unroll the length match checks.
checkAll(3, func(pi, qi *Node) *Node {
// Compare lengths.
eqlen, _ := eqstring(pi, qi)
return eqlen
})
checkAll(1, func(pi, qi *Node) *Node {
// Compare contents.
_, eqmem := eqstring(pi, qi)
return eqmem
})
case TFLOAT32, TFLOAT64:
checkAll(2, func(pi, qi *Node) *Node {
// p[i] == q[i]
return nod(OEQ, pi, qi)
})
// TODO: pick apart structs, do them piecemeal too
default:
checkAll(1, func(pi, qi *Node) *Node {
// p[i] == q[i]
return nod(OEQ, pi, qi)
})
}
// return true
ret := nod(ORETURN, nil, nil)
ret.List.Append(nodbool(true))
fn.Nbody.Append(ret)
case TSTRUCT:
var cond *Node
// Build a list of conditions to satisfy.
// Track their order so that we can preserve aspects of that order.
type nodeIdx struct {
n *Node
idx int
}
var conds []nodeIdx
and := func(n *Node) {
if cond == nil {
cond = n
return
}
cond = nod(OANDAND, cond, n)
conds = append(conds, nodeIdx{n: n, idx: len(conds)})
}
// Walk the struct using memequal for runs of AMEM
@ -573,7 +665,24 @@ func geneq(t *types.Type) *obj.LSym {
// Compare non-memory fields with field equality.
if !IsRegularMemory(f.Type) {
and(eqfield(np, nq, f.Sym))
p := nodSym(OXDOT, np, f.Sym)
q := nodSym(OXDOT, nq, f.Sym)
switch {
case f.Type.IsString():
eqlen, eqmem := eqstring(p, q)
and(eqlen)
and(eqmem)
case f.Type.IsInterface():
p.Type = f.Type
p = typecheck(p, ctxExpr)
q.Type = f.Type
q = typecheck(q, ctxExpr)
eqtab, eqdata := eqinterface(p, q)
and(eqtab)
and(eqdata)
default:
and(nod(OEQ, p, q))
}
i++
continue
}
@ -595,8 +704,24 @@ func geneq(t *types.Type) *obj.LSym {
i = next
}
if cond == nil {
// Sort conditions to put runtime calls last.
// Preserve the rest of the ordering.
sort.SliceStable(conds, func(i, j int) bool {
x, y := conds[i], conds[j]
if (x.n.Op != OCALL) == (y.n.Op != OCALL) {
return x.idx < y.idx
}
return x.n.Op != OCALL
})
var cond *Node
if len(conds) == 0 {
cond = nodbool(true)
} else {
cond = conds[0].n
for _, c := range conds[1:] {
cond = nod(OANDAND, cond, c.n)
}
}
ret := nod(ORETURN, nil, nil)
@ -643,6 +768,70 @@ func eqfield(p *Node, q *Node, field *types.Sym) *Node {
return ne
}
// eqstring returns the nodes
// len(s) == len(t)
// and
// memequal(s.ptr, t.ptr, len(s))
// which can be used to construct a string equality comparison.
// eqlen must be evaluated before eqmem, and short-circuiting is required.
func eqstring(s, t *Node) (eqlen, eqmem *Node) {
s = conv(s, types.Types[TSTRING])
t = conv(t, types.Types[TSTRING])
sptr := nod(OSPTR, s, nil)
tptr := nod(OSPTR, t, nil)
slen := conv(nod(OLEN, s, nil), types.Types[TUINTPTR])
tlen := conv(nod(OLEN, t, nil), types.Types[TUINTPTR])
fn := syslook("memequal")
fn = substArgTypes(fn, types.Types[TUINT8], types.Types[TUINT8])
call := nod(OCALL, fn, nil)
call.List.Append(sptr, tptr, slen.copy())
call = typecheck(call, ctxExpr|ctxMultiOK)
cmp := nod(OEQ, slen, tlen)
cmp = typecheck(cmp, ctxExpr)
cmp.Type = types.Types[TBOOL]
return cmp, call
}
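// As an illustrative sketch of the intended use: callers combine the two
// results with short-circuit evaluation, roughly
//
//	if eqlen && eqmem {
//		// strings are equal
//	}
//
// so that memequal only runs once the lengths are already known to match.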
// eqinterface returns the nodes
// s.tab == t.tab (or s.typ == t.typ, as appropriate)
// and
// ifaceeq(s.tab, s.data, t.data) (or efaceeq(s.typ, s.data, t.data), as appropriate)
// which can be used to construct an interface equality comparison.
// eqtab must be evaluated before eqdata, and short-circuiting is required.
func eqinterface(s, t *Node) (eqtab, eqdata *Node) {
if !types.Identical(s.Type, t.Type) {
Fatalf("eqinterface %v %v", s.Type, t.Type)
}
// func ifaceeq(tab *uintptr, x, y unsafe.Pointer) (ret bool)
// func efaceeq(typ *uintptr, x, y unsafe.Pointer) (ret bool)
var fn *Node
if s.Type.IsEmptyInterface() {
fn = syslook("efaceeq")
} else {
fn = syslook("ifaceeq")
}
stab := nod(OITAB, s, nil)
ttab := nod(OITAB, t, nil)
sdata := nod(OIDATA, s, nil)
tdata := nod(OIDATA, t, nil)
sdata.Type = types.Types[TUNSAFEPTR]
tdata.Type = types.Types[TUNSAFEPTR]
sdata.SetTypecheck(1)
tdata.SetTypecheck(1)
call := nod(OCALL, fn, nil)
call.List.Append(stab, sdata, tdata)
call = typecheck(call, ctxExpr|ctxMultiOK)
cmp := nod(OEQ, stab, ttab)
cmp = typecheck(cmp, ctxExpr)
cmp.Type = types.Types[TBOOL]
return cmp, call
}
// eqmem returns the node
// memequal(&p.field, &q.field [, size])
func eqmem(p *Node, q *Node, field *types.Sym, size int64) *Node {

View file

@ -0,0 +1,48 @@
// Code generated by "stringer -type AlgKind -trimprefix A"; DO NOT EDIT.
package gc
import "strconv"
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
_ = x[ANOEQ-0]
_ = x[AMEM0-1]
_ = x[AMEM8-2]
_ = x[AMEM16-3]
_ = x[AMEM32-4]
_ = x[AMEM64-5]
_ = x[AMEM128-6]
_ = x[ASTRING-7]
_ = x[AINTER-8]
_ = x[ANILINTER-9]
_ = x[AFLOAT32-10]
_ = x[AFLOAT64-11]
_ = x[ACPLX64-12]
_ = x[ACPLX128-13]
_ = x[AMEM-100]
_ = x[ASPECIAL - -1]
}
const (
_AlgKind_name_0 = "SPECIALNOEQMEM0MEM8MEM16MEM32MEM64MEM128STRINGINTERNILINTERFLOAT32FLOAT64CPLX64CPLX128"
_AlgKind_name_1 = "MEM"
)
var (
_AlgKind_index_0 = [...]uint8{0, 7, 11, 15, 19, 24, 29, 34, 40, 46, 51, 59, 66, 73, 79, 86}
)
func (i AlgKind) String() string {
switch {
case -1 <= i && i <= 13:
i -= -1
return _AlgKind_name_0[_AlgKind_index_0[i]:_AlgKind_index_0[i+1]]
case i == 100:
return _AlgKind_name_1
default:
return "AlgKind(" + strconv.FormatInt(int64(i), 10) + ")"
}
}

View file

@ -319,10 +319,10 @@ func dowidth(t *types.Type) {
Fatalf("dowidth any")
case TSTRING:
if sizeof_String == 0 {
if sizeofString == 0 {
Fatalf("early dowidth string")
}
w = int64(sizeof_String)
w = sizeofString
t.Align = uint8(Widthptr)
case TARRAY:
@ -344,7 +344,7 @@ func dowidth(t *types.Type) {
if t.Elem() == nil {
break
}
w = int64(sizeof_Slice)
w = sizeofSlice
checkwidth(t.Elem())
t.Align = uint8(Widthptr)

View file

@ -0,0 +1,40 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gc
import "testing"
var globl int64
func BenchmarkLoadAdd(b *testing.B) {
x := make([]int64, 1024)
y := make([]int64, 1024)
for i := 0; i < b.N; i++ {
var s int64
for i := range x {
s ^= x[i] + y[i]
}
globl = s
}
}
func BenchmarkModify(b *testing.B) {
a := make([]int64, 1024)
v := globl
for i := 0; i < b.N; i++ {
for j := range a {
a[j] += v
}
}
}
func BenchmarkConstModify(b *testing.B) {
a := make([]int64, 1024)
for i := 0; i < b.N; i++ {
for j := range a {
a[j] += 3
}
}
}
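// Illustrative usage (assuming the standard go tool): these benchmarks can be
// run with a command such as
//
//	go test -run=NONE -bench=. cmd/compile/internal/gc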

View file

@ -57,9 +57,9 @@ var runtimeDecls = [...]struct {
{"concatstrings", funcTag, 35},
{"cmpstring", funcTag, 36},
{"intstring", funcTag, 39},
{"slicebytetostring", funcTag, 41},
{"slicebytetostringtmp", funcTag, 42},
{"slicerunetostring", funcTag, 45},
{"slicebytetostring", funcTag, 40},
{"slicebytetostringtmp", funcTag, 41},
{"slicerunetostring", funcTag, 44},
{"stringtoslicebyte", funcTag, 46},
{"stringtoslicerune", funcTag, 49},
{"slicecopy", funcTag, 51},
@ -241,20 +241,20 @@ func runtimeTypes() []*types.Type {
typs[37] = types.NewArray(typs[0], 4)
typs[38] = types.NewPtr(typs[37])
typs[39] = functype(nil, []*Node{anonfield(typs[38]), anonfield(typs[19])}, []*Node{anonfield(typs[25])})
typs[40] = types.NewSlice(typs[0])
typs[41] = functype(nil, []*Node{anonfield(typs[29]), anonfield(typs[40])}, []*Node{anonfield(typs[25])})
typs[42] = functype(nil, []*Node{anonfield(typs[40])}, []*Node{anonfield(typs[25])})
typs[43] = types.Runetype
typs[44] = types.NewSlice(typs[43])
typs[45] = functype(nil, []*Node{anonfield(typs[29]), anonfield(typs[44])}, []*Node{anonfield(typs[25])})
typs[46] = functype(nil, []*Node{anonfield(typs[29]), anonfield(typs[25])}, []*Node{anonfield(typs[40])})
typs[47] = types.NewArray(typs[43], 32)
typs[40] = functype(nil, []*Node{anonfield(typs[29]), anonfield(typs[1]), anonfield(typs[11])}, []*Node{anonfield(typs[25])})
typs[41] = functype(nil, []*Node{anonfield(typs[1]), anonfield(typs[11])}, []*Node{anonfield(typs[25])})
typs[42] = types.Runetype
typs[43] = types.NewSlice(typs[42])
typs[44] = functype(nil, []*Node{anonfield(typs[29]), anonfield(typs[43])}, []*Node{anonfield(typs[25])})
typs[45] = types.NewSlice(typs[0])
typs[46] = functype(nil, []*Node{anonfield(typs[29]), anonfield(typs[25])}, []*Node{anonfield(typs[45])})
typs[47] = types.NewArray(typs[42], 32)
typs[48] = types.NewPtr(typs[47])
typs[49] = functype(nil, []*Node{anonfield(typs[48]), anonfield(typs[25])}, []*Node{anonfield(typs[44])})
typs[49] = functype(nil, []*Node{anonfield(typs[48]), anonfield(typs[25])}, []*Node{anonfield(typs[43])})
typs[50] = types.Types[TUINTPTR]
typs[51] = functype(nil, []*Node{anonfield(typs[2]), anonfield(typs[2]), anonfield(typs[50])}, []*Node{anonfield(typs[11])})
typs[52] = functype(nil, []*Node{anonfield(typs[2]), anonfield(typs[2])}, []*Node{anonfield(typs[11])})
typs[53] = functype(nil, []*Node{anonfield(typs[25]), anonfield(typs[11])}, []*Node{anonfield(typs[43]), anonfield(typs[11])})
typs[51] = functype(nil, []*Node{anonfield(typs[3]), anonfield(typs[11]), anonfield(typs[3]), anonfield(typs[11]), anonfield(typs[50])}, []*Node{anonfield(typs[11])})
typs[52] = functype(nil, []*Node{anonfield(typs[1]), anonfield(typs[11]), anonfield(typs[25])}, []*Node{anonfield(typs[11])})
typs[53] = functype(nil, []*Node{anonfield(typs[25]), anonfield(typs[11])}, []*Node{anonfield(typs[42]), anonfield(typs[11])})
typs[54] = functype(nil, []*Node{anonfield(typs[25])}, []*Node{anonfield(typs[11])})
typs[55] = functype(nil, []*Node{anonfield(typs[1]), anonfield(typs[2])}, []*Node{anonfield(typs[2])})
typs[56] = types.Types[TUNSAFEPTR]
@ -293,7 +293,7 @@ func runtimeTypes() []*types.Type {
typs[89] = tostruct([]*Node{namedfield("enabled", typs[15]), namedfield("pad", typs[88]), namedfield("needed", typs[15]), namedfield("cgo", typs[15]), namedfield("alignme", typs[21])})
typs[90] = functype(nil, []*Node{anonfield(typs[1]), anonfield(typs[3]), anonfield(typs[3])}, nil)
typs[91] = functype(nil, []*Node{anonfield(typs[1]), anonfield(typs[3])}, nil)
typs[92] = functype(nil, []*Node{anonfield(typs[1]), anonfield(typs[2]), anonfield(typs[2])}, []*Node{anonfield(typs[11])})
typs[92] = functype(nil, []*Node{anonfield(typs[1]), anonfield(typs[3]), anonfield(typs[11]), anonfield(typs[3]), anonfield(typs[11])}, []*Node{anonfield(typs[11])})
typs[93] = functype(nil, []*Node{anonfield(typs[86]), anonfield(typs[3])}, []*Node{anonfield(typs[15])})
typs[94] = functype(nil, []*Node{anonfield(typs[3]), anonfield(typs[83])}, []*Node{anonfield(typs[15])})
typs[95] = types.NewPtr(typs[15])

View file

@ -69,13 +69,13 @@ func concatstrings(*[32]byte, []string) string
func cmpstring(string, string) int
func intstring(*[4]byte, int64) string
func slicebytetostring(*[32]byte, []byte) string
func slicebytetostringtmp([]byte) string
func slicebytetostring(buf *[32]byte, ptr *byte, n int) string
func slicebytetostringtmp(ptr *byte, n int) string
func slicerunetostring(*[32]byte, []rune) string
func stringtoslicebyte(*[32]byte, string) []byte
func stringtoslicerune(*[32]rune, string) []rune
func slicecopy(to any, fr any, wid uintptr) int
func slicestringcopy(to any, fr any) int
func slicecopy(toPtr *any, toLen int, frPtr *any, frLen int, wid uintptr) int
func slicestringcopy(toPtr *byte, toLen int, fr string) int
func decoderune(string, int) (retv rune, retk int)
func countrunes(string) int
@ -162,7 +162,7 @@ var writeBarrier struct {
// *byte is really *runtime.Type
func typedmemmove(typ *byte, dst *any, src *any)
func typedmemclr(typ *byte, dst *any)
func typedslicecopy(typ *byte, dst any, src any) int
func typedslicecopy(typ *byte, dstPtr *any, dstLen int, srcPtr *any, srcLen int) int
func selectnbsend(hchan chan<- any, elem *any) bool
func selectnbrecv(elem *any, hchan <-chan any) bool

View file

@ -4,6 +4,10 @@
package gc
import (
"math/bits"
)
const (
wordBits = 32
wordMask = wordBits - 1
@ -108,30 +112,11 @@ func (bv bvec) Next(i int32) int32 {
// Find 1 bit.
w := bv.b[i>>wordShift] >> uint(i&wordMask)
for w&1 == 0 {
w >>= 1
i++
}
i += int32(bits.TrailingZeros32(w))
return i
}
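// Illustrative example: if the shifted word is w = 0b1000, then
// bits.TrailingZeros32(w) is 3, so Next skips the three clear low bits in a
// single step instead of looping over them one at a time.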
// Len returns the minimum number of bits required to represent bv.
// The result is 0 if no bits are set in bv.
func (bv bvec) Len() int32 {
for wi := len(bv.b) - 1; wi >= 0; wi-- {
if w := bv.b[wi]; w != 0 {
for i := wordBits - 1; i >= 0; i-- {
if w>>uint(i) != 0 {
return int32(wi)*wordBits + int32(i) + 1
}
}
}
}
return 0
}
func (bv bvec) IsEmpty() bool {
for _, x := range bv.b {
if x != 0 {

View file

@ -566,3 +566,20 @@ func walkpartialcall(n *Node, init *Nodes) *Node {
return walkexpr(clos, init)
}
// callpartMethod returns the *types.Field representing the method
// referenced by method value n.
func callpartMethod(n *Node) *types.Field {
if n.Op != OCALLPART {
Fatalf("expected OCALLPART, got %v", n)
}
// TODO(mdempsky): Optimize this. If necessary,
// makepartialcall could save m for us somewhere.
var m *types.Field
if lookdot0(n.Right.Sym, n.Left.Type, &m, false) != 1 {
Fatalf("failed to find field for OCALLPART")
}
return m
}

View file

@ -590,14 +590,6 @@ func checkdupfields(what string, fss ...[]*types.Field) {
// a type for struct/interface/arglist
func tostruct(l []*Node) *types.Type {
t := types.New(TSTRUCT)
tostruct0(t, l)
return t
}
func tostruct0(t *types.Type, l []*Node) {
if t == nil || !t.IsStruct() {
Fatalf("struct expected")
}
fields := make([]*types.Field, len(l))
for i, n := range l {
@ -614,6 +606,8 @@ func tostruct0(t *types.Type, l []*Node) {
if !t.Broke() {
checkwidth(t)
}
return t
}
func tofunargs(l []*Node, funarg types.Funarg) *types.Type {
@ -684,15 +678,6 @@ func tointerface(l []*Node) *types.Type {
return types.Types[TINTER]
}
t := types.New(TINTER)
tointerface0(t, l)
return t
}
func tointerface0(t *types.Type, l []*Node) {
if t == nil || !t.IsInterface() {
Fatalf("interface expected")
}
var fields []*types.Field
for _, n := range l {
f := interfacefield(n)
@ -702,6 +687,7 @@ func tointerface0(t *types.Type, l []*Node) {
fields = append(fields, f)
}
t.SetInterface(fields)
return t
}
func fakeRecv() *Node {
@ -724,14 +710,6 @@ func isifacemethod(f *types.Type) bool {
// turn a parsed function declaration into a type
func functype(this *Node, in, out []*Node) *types.Type {
t := types.New(TFUNC)
functype0(t, this, in, out)
return t
}
func functype0(t *types.Type, this *Node, in, out []*Node) {
if t == nil || t.Etype != TFUNC {
Fatalf("function type expected")
}
var rcvr []*Node
if this != nil {
@ -748,15 +726,13 @@ func functype0(t *types.Type, this *Node, in, out []*Node) {
}
t.FuncType().Outnamed = t.NumResults() > 0 && origSym(t.Results().Field(0).Sym) != nil
return t
}
func functypefield(this *types.Field, in, out []*types.Field) *types.Type {
t := types.New(TFUNC)
functypefield0(t, this, in, out)
return t
}
func functypefield0(t *types.Type, this *types.Field, in, out []*types.Field) {
var rcvr []*types.Field
if this != nil {
rcvr = []*types.Field{this}
@ -766,6 +742,8 @@ func functypefield0(t *types.Type, this *types.Field, in, out []*types.Field) {
t.FuncType().Results = tofunargsfield(out, types.FunargResults)
t.FuncType().Outnamed = t.NumResults() > 0 && origSym(t.Results().Field(0).Sym) != nil
return t
}
// origSym returns the original symbol written by the user.

View file

@ -7,6 +7,7 @@ package gc
import (
"cmd/compile/internal/logopt"
"cmd/compile/internal/types"
"cmd/internal/src"
"fmt"
"math"
"strings"
@ -427,7 +428,12 @@ func (e *Escape) exprSkipInit(k EscHole, n *Node) {
lineno = lno
}()
if k.derefs >= 0 && !types.Haspointers(n.Type) {
uintptrEscapesHack := k.uintptrEscapesHack
k.uintptrEscapesHack = false
if uintptrEscapesHack && n.Op == OCONVNOP && n.Left.Type.IsUnsafePtr() {
// nop
} else if k.derefs >= 0 && !types.Haspointers(n.Type) {
k = e.discardHole()
}
@ -521,10 +527,26 @@ func (e *Escape) exprSkipInit(k EscHole, n *Node) {
// nop
case OCALLPART:
e.spill(k, n)
// Flow the receiver argument to both the closure and
// to the receiver parameter.
// TODO(mdempsky): We can do better here. See #27557.
e.assignHeap(n.Left, "call part", n)
closureK := e.spill(k, n)
m := callpartMethod(n)
// We don't know how the method value will be called
// later, so conservatively assume the result
// parameters all flow to the heap.
//
// TODO(mdempsky): Change ks into a callback, so that
// we don't have to create this dummy slice?
var ks []EscHole
for i := m.Type.NumResults(); i > 0; i-- {
ks = append(ks, e.heapHole())
}
paramK := e.tagHole(ks, asNode(m.Type.Nname()), m.Type.Recv())
e.expr(e.teeHole(paramK, closureK), n.Left)
case OPTRLIT:
e.expr(e.spill(k, n), n.Left)
@ -539,6 +561,7 @@ func (e *Escape) exprSkipInit(k EscHole, n *Node) {
case OSLICELIT:
k = e.spill(k, n)
k.uintptrEscapesHack = uintptrEscapesHack // for ...uintptr parameters
for _, elt := range n.List.Slice() {
if elt.Op == OKEY {
@ -717,197 +740,140 @@ func (e *Escape) assignHeap(src *Node, why string, where *Node) {
// should contain the holes representing where the function callee's
// results flows; where is the OGO/ODEFER context of the call, if any.
func (e *Escape) call(ks []EscHole, call, where *Node) {
// First, pick out the function callee, its type, and receiver
// (if any) and normal arguments list.
var fn, recv *Node
var fntype *types.Type
args := call.List.Slice()
switch call.Op {
case OCALLFUNC:
fn = call.Left
if fn.Op == OCLOSURE {
fn = fn.Func.Closure.Func.Nname
}
fntype = fn.Type
case OCALLMETH:
fn = asNode(call.Left.Type.FuncType().Nname)
fntype = fn.Type
recv = call.Left.Left
case OCALLINTER:
fntype = call.Left.Type
recv = call.Left.Left
case OAPPEND, ODELETE, OPRINT, OPRINTN, ORECOVER:
// ok
case OLEN, OCAP, OREAL, OIMAG, OCLOSE, OPANIC:
args = []*Node{call.Left}
case OCOMPLEX, OCOPY:
args = []*Node{call.Left, call.Right}
default:
Fatalf("unexpected call op: %v", call.Op)
topLevelDefer := where != nil && where.Op == ODEFER && e.loopDepth == 1
if topLevelDefer {
// force stack allocation of defer record, unless
// open-coded defers are used (see ssa.go)
where.Esc = EscNever
}
static := fn != nil && fn.Op == ONAME && fn.Class() == PFUNC
// Setup evaluation holes for each receiver/argument.
var recvK EscHole
var paramKs []EscHole
if static && fn.Name.Defn != nil && fn.Name.Defn.Esc < EscFuncTagged {
// Static call to function in same mutually recursive
// group; incorporate into data flow graph.
if fn.Name.Defn.Esc == EscFuncUnknown {
Fatalf("graph inconsistency")
argument := func(k EscHole, arg *Node) {
if topLevelDefer {
// Top-level defer arguments don't escape to the
// heap, but they do need to last until the end of
// the function.
k = e.later(k)
} else if where != nil {
k = e.heapHole()
}
if ks != nil {
for i, result := range fntype.Results().FieldSlice() {
e.expr(k.note(call, "call parameter"), arg)
}
switch call.Op {
default:
Fatalf("unexpected call op: %v", call.Op)
case OCALLFUNC, OCALLMETH, OCALLINTER:
fixVariadicCall(call)
// Pick out the function callee, if statically known.
var fn *Node
switch call.Op {
case OCALLFUNC:
if call.Left.Op == ONAME && call.Left.Class() == PFUNC {
fn = call.Left
} else if call.Left.Op == OCLOSURE {
fn = call.Left.Func.Closure.Func.Nname
}
case OCALLMETH:
fn = asNode(call.Left.Type.FuncType().Nname)
}
fntype := call.Left.Type
if fn != nil {
fntype = fn.Type
}
if ks != nil && fn != nil && e.inMutualBatch(fn) {
for i, result := range fn.Type.Results().FieldSlice() {
e.expr(ks[i], asNode(result.Nname))
}
}
if r := fntype.Recv(); r != nil {
recvK = e.addr(asNode(r.Nname))
}
for _, param := range fntype.Params().FieldSlice() {
paramKs = append(paramKs, e.addr(asNode(param.Nname)))
}
} else if call.Op == OCALLFUNC || call.Op == OCALLMETH || call.Op == OCALLINTER {
// Dynamic call, or call to previously tagged
// function. Setup flows to heap and/or ks according
// to parameter tags.
if r := fntype.Recv(); r != nil {
recvK = e.tagHole(ks, r, static)
}
for _, param := range fntype.Params().FieldSlice() {
paramKs = append(paramKs, e.tagHole(ks, param, static))
}
} else {
// Handle escape analysis for builtins.
// By default, we just discard everything.
for range args {
paramKs = append(paramKs, e.discardHole())
argument(e.tagHole(ks, fn, r), call.Left.Left)
} else {
// Evaluate callee function expression.
argument(e.discardHole(), call.Left)
}
switch call.Op {
case OAPPEND:
// Appendee slice may flow directly to the
// result, if it has enough capacity.
// Alternatively, a new heap slice might be
// allocated, and all slice elements might
// flow to heap.
paramKs[0] = e.teeHole(paramKs[0], ks[0])
if types.Haspointers(args[0].Type.Elem()) {
paramKs[0] = e.teeHole(paramKs[0], e.heapHole().deref(call, "appendee slice"))
args := call.List.Slice()
for i, param := range fntype.Params().FieldSlice() {
argument(e.tagHole(ks, fn, param), args[i])
}
case OAPPEND:
args := call.List.Slice()
// Appendee slice may flow directly to the result, if
// it has enough capacity. Alternatively, a new heap
// slice might be allocated, and all slice elements
// might flow to heap.
appendeeK := ks[0]
if types.Haspointers(args[0].Type.Elem()) {
appendeeK = e.teeHole(appendeeK, e.heapHole().deref(call, "appendee slice"))
}
argument(appendeeK, args[0])
if call.IsDDD() {
appendedK := e.discardHole()
if args[1].Type.IsSlice() && types.Haspointers(args[1].Type.Elem()) {
appendedK = e.heapHole().deref(call, "appended slice...")
}
if call.IsDDD() {
if args[1].Type.IsSlice() && types.Haspointers(args[1].Type.Elem()) {
paramKs[1] = e.teeHole(paramKs[1], e.heapHole().deref(call, "appended slice..."))
}
} else {
for i := 1; i < len(args); i++ {
paramKs[i] = e.heapHole()
}
}
case OCOPY:
if call.Right.Type.IsSlice() && types.Haspointers(call.Right.Type.Elem()) {
paramKs[1] = e.teeHole(paramKs[1], e.heapHole().deref(call, "copied slice"))
}
case OPANIC:
paramKs[0] = e.heapHole()
}
}
if call.Op == OCALLFUNC {
// Evaluate callee function expression.
e.expr(e.augmentParamHole(e.discardHole(), call, where), call.Left)
}
if recv != nil {
// TODO(mdempsky): Handle go:uintptrescapes here too?
e.expr(e.augmentParamHole(recvK, call, where), recv)
}
// Apply augmentParamHole before ODDDARG so that it affects
// the implicit slice allocation for variadic calls, if any.
for i, paramK := range paramKs {
paramKs[i] = e.augmentParamHole(paramK, call, where)
}
// TODO(mdempsky): Remove after early ddd-ification.
if fntype != nil && fntype.IsVariadic() && !call.IsDDD() {
vi := fntype.NumParams() - 1
elt := fntype.Params().Field(vi).Type.Elem()
nva := call.List.Len()
nva -= vi
// Introduce ODDDARG node to represent ... allocation.
ddd := nodl(call.Pos, ODDDARG, nil, nil)
ddd.Type = types.NewPtr(types.NewArray(elt, int64(nva)))
call.Right = ddd
dddK := e.spill(paramKs[vi], ddd)
paramKs = paramKs[:vi]
for i := 0; i < nva; i++ {
paramKs = append(paramKs, dddK)
}
}
for i, arg := range args {
// For arguments to go:uintptrescapes, peel
// away an unsafe.Pointer->uintptr conversion,
// if present.
if static && arg.Op == OCONVNOP && arg.Type.Etype == TUINTPTR && arg.Left.Type.Etype == TUNSAFEPTR {
x := i
if fntype.IsVariadic() && x >= fntype.NumParams() {
x = fntype.NumParams() - 1
}
if fntype.Params().Field(x).Note == uintptrEscapesTag {
arg = arg.Left
argument(appendedK, args[1])
} else {
for _, arg := range args[1:] {
argument(e.heapHole(), arg)
}
}
// no augmentParamHole here; handled in loop before ODDDARG
e.expr(paramKs[i], arg)
}
}
case OCOPY:
argument(e.discardHole(), call.Left)
// augmentParamHole augments parameter holes as necessary for use in
// go/defer statements.
func (e *Escape) augmentParamHole(k EscHole, call, where *Node) EscHole {
k = k.note(call, "call parameter")
if where == nil {
return k
}
copiedK := e.discardHole()
if call.Right.Type.IsSlice() && types.Haspointers(call.Right.Type.Elem()) {
copiedK = e.heapHole().deref(call, "copied slice")
}
argument(copiedK, call.Right)
// Top level defers arguments don't escape to heap, but they
// do need to last until end of function. Tee with a
// non-transient location to avoid arguments from being
// transiently allocated.
if where.Op == ODEFER && e.loopDepth == 1 {
// force stack allocation of defer record, unless open-coded
// defers are used (see ssa.go)
where.Esc = EscNever
return e.later(k)
}
case OPANIC:
argument(e.heapHole(), call.Left)
return e.heapHole().note(where, "call parameter")
case OCOMPLEX:
argument(e.discardHole(), call.Left)
argument(e.discardHole(), call.Right)
case ODELETE, OPRINT, OPRINTN, ORECOVER:
for _, arg := range call.List.Slice() {
argument(e.discardHole(), arg)
}
case OLEN, OCAP, OREAL, OIMAG, OCLOSE:
argument(e.discardHole(), call.Left)
}
}
// tagHole returns a hole for evaluating an argument passed to param.
// ks should contain the holes representing where the function
// callee's results flows; static indicates whether this is a static
// call.
func (e *Escape) tagHole(ks []EscHole, param *types.Field, static bool) EscHole {
// callee's results flows. fn is the statically-known callee function,
// if any.
func (e *Escape) tagHole(ks []EscHole, fn *Node, param *types.Field) EscHole {
// If this is a dynamic call, we can't rely on param.Note.
if !static {
if fn == nil {
return e.heapHole()
}
if e.inMutualBatch(fn) {
return e.addr(asNode(param.Nname))
}
// Call to previously tagged function.
if param.Note == uintptrEscapesTag {
k := e.heapHole()
k.uintptrEscapesHack = true
return k
}
var tagKs []EscHole
esc := ParseLeaks(param.Note)
@ -926,6 +892,21 @@ func (e *Escape) tagHole(ks []EscHole, param *types.Field, static bool) EscHole
return e.teeHole(tagKs...)
}
// inMutualBatch reports whether function fn is in the batch of
// mutually recursive functions being analyzed. When this is true,
// fn has not yet been analyzed, so its parameters and results
// should be incorporated directly into the flow graph instead of
// relying on its escape analysis tagging.
func (e *Escape) inMutualBatch(fn *Node) bool {
if fn.Name.Defn != nil && fn.Name.Defn.Esc < EscFuncTagged {
if fn.Name.Defn.Esc == EscFuncUnknown {
Fatalf("graph inconsistency")
}
return true
}
return false
}
// An EscHole represents a context for evaluation a Go
// expression. E.g., when evaluating p in "x = **p", we'd have a hole
// with dst==x and derefs==2.
@ -933,6 +914,10 @@ type EscHole struct {
dst *EscLocation
derefs int // >= -1
notes *EscNote
// uintptrEscapesHack indicates this context is evaluating an
// argument for a //go:uintptrescapes function.
uintptrEscapesHack bool
}
type EscNote struct {
@ -945,7 +930,7 @@ func (k EscHole) note(where *Node, why string) EscHole {
if where == nil || why == "" {
Fatalf("note: missing where/why")
}
if Debug['m'] >= 2 {
if Debug['m'] >= 2 || logopt.Enabled() {
k.notes = &EscNote{
next: k.notes,
where: where,
@ -1092,10 +1077,16 @@ func (e *Escape) flow(k EscHole, src *EscLocation) {
return
}
if dst.escapes && k.derefs < 0 { // dst = &src
if Debug['m'] >= 2 {
if Debug['m'] >= 2 || logopt.Enabled() {
pos := linestr(src.n.Pos)
fmt.Printf("%s: %v escapes to heap:\n", pos, src.n)
e.explainFlow(pos, dst, src, k.derefs, k.notes)
if Debug['m'] >= 2 {
fmt.Printf("%s: %v escapes to heap:\n", pos, src.n)
}
explanation := e.explainFlow(pos, dst, src, k.derefs, k.notes, []*logopt.LoggedOpt{})
if logopt.Enabled() {
logopt.LogOpt(src.n.Pos, "escapes", "escape", e.curfn.funcname(), fmt.Sprintf("%v escapes to heap", src.n), explanation)
}
}
src.escapes = true
return
@ -1119,7 +1110,8 @@ func (e *Escape) walkAll() {
// transient->!transient and !escapes->escapes, which can each
// happen at most once. So we take Θ(len(e.allLocs)) walks.
var todo []*EscLocation // LIFO queue
// LIFO queue, has enough room for e.allLocs and e.heapLoc.
todo := make([]*EscLocation, 0, len(e.allLocs)+1)
enqueue := func(loc *EscLocation) {
if !loc.queued {
todo = append(todo, loc)
@ -1187,9 +1179,15 @@ func (e *Escape) walkOne(root *EscLocation, walkgen uint32, enqueue func(*EscLoc
// that value flow for tagging the function
// later.
if l.isName(PPARAM) {
if Debug['m'] >= 2 && !l.escapes {
fmt.Printf("%s: parameter %v leaks to %s with derefs=%d:\n", linestr(l.n.Pos), l.n, e.explainLoc(root), base)
e.explainPath(root, l)
if (logopt.Enabled() || Debug['m'] >= 2) && !l.escapes {
if Debug['m'] >= 2 {
fmt.Printf("%s: parameter %v leaks to %s with derefs=%d:\n", linestr(l.n.Pos), l.n, e.explainLoc(root), base)
}
explanation := e.explainPath(root, l)
if logopt.Enabled() {
logopt.LogOpt(l.n.Pos, "leak", "escape", e.curfn.funcname(),
fmt.Sprintf("parameter %v leaks to %s with derefs=%d", l.n, e.explainLoc(root), base), explanation)
}
}
l.leakTo(root, base)
}
@ -1198,9 +1196,14 @@ func (e *Escape) walkOne(root *EscLocation, walkgen uint32, enqueue func(*EscLoc
// outlives it, then l needs to be heap
// allocated.
if addressOf && !l.escapes {
if Debug['m'] >= 2 {
fmt.Printf("%s: %v escapes to heap:\n", linestr(l.n.Pos), l.n)
e.explainPath(root, l)
if logopt.Enabled() || Debug['m'] >= 2 {
if Debug['m'] >= 2 {
fmt.Printf("%s: %v escapes to heap:\n", linestr(l.n.Pos), l.n)
}
explanation := e.explainPath(root, l)
if logopt.Enabled() {
logopt.LogOpt(l.n.Pos, "escape", "escape", e.curfn.funcname(), fmt.Sprintf("%v escapes to heap", l.n), explanation)
}
}
l.escapes = true
enqueue(l)
@ -1225,43 +1228,67 @@ func (e *Escape) walkOne(root *EscLocation, walkgen uint32, enqueue func(*EscLoc
}
// explainPath prints an explanation of how src flows to the walk root.
func (e *Escape) explainPath(root, src *EscLocation) {
func (e *Escape) explainPath(root, src *EscLocation) []*logopt.LoggedOpt {
visited := make(map[*EscLocation]bool)
pos := linestr(src.n.Pos)
var explanation []*logopt.LoggedOpt
for {
// Prevent infinite loop.
if visited[src] {
fmt.Printf("%s: warning: truncated explanation due to assignment cycle; see golang.org/issue/35518\n", pos)
if Debug['m'] >= 2 {
fmt.Printf("%s: warning: truncated explanation due to assignment cycle; see golang.org/issue/35518\n", pos)
}
break
}
visited[src] = true
dst := src.dst
edge := &dst.edges[src.dstEdgeIdx]
if edge.src != src {
Fatalf("path inconsistency: %v != %v", edge.src, src)
}
e.explainFlow(pos, dst, src, edge.derefs, edge.notes)
explanation = e.explainFlow(pos, dst, src, edge.derefs, edge.notes, explanation)
if dst == root {
break
}
src = dst
}
return explanation
}
func (e *Escape) explainFlow(pos string, dst, src *EscLocation, derefs int, notes *EscNote) {
func (e *Escape) explainFlow(pos string, dst, srcloc *EscLocation, derefs int, notes *EscNote, explanation []*logopt.LoggedOpt) []*logopt.LoggedOpt {
ops := "&"
if derefs >= 0 {
ops = strings.Repeat("*", derefs)
}
print := Debug['m'] >= 2
fmt.Printf("%s: flow: %s = %s%v:\n", pos, e.explainLoc(dst), ops, e.explainLoc(src))
for note := notes; note != nil; note = note.next {
fmt.Printf("%s: from %v (%v) at %s\n", pos, note.where, note.why, linestr(note.where.Pos))
flow := fmt.Sprintf(" flow: %s = %s%v:", e.explainLoc(dst), ops, e.explainLoc(srcloc))
if print {
fmt.Printf("%s:%s\n", pos, flow)
}
if logopt.Enabled() {
var epos src.XPos
if notes != nil {
epos = notes.where.Pos
} else if srcloc != nil && srcloc.n != nil {
epos = srcloc.n.Pos
}
explanation = append(explanation, logopt.NewLoggedOpt(epos, "escflow", "escape", e.curfn.funcname(), flow))
}
for note := notes; note != nil; note = note.next {
if print {
fmt.Printf("%s: from %v (%v) at %s\n", pos, note.where, note.why, linestr(note.where.Pos))
}
if logopt.Enabled() {
explanation = append(explanation, logopt.NewLoggedOpt(note.where.Pos, "escflow", "escape", e.curfn.funcname(),
fmt.Sprintf(" from %v (%v)", note.where, note.why)))
}
}
return explanation
}
func (e *Escape) explainLoc(l *EscLocation) string {

View file

@ -586,7 +586,7 @@ s%~ %%g
*/
func symfmt(b *bytes.Buffer, s *types.Sym, flag FmtFlag, mode fmtMode) {
if s.Pkg != nil && flag&FmtShort == 0 {
if flag&FmtShort == 0 {
switch mode {
case FErr: // This is for the user
if s.Pkg == builtinpkg || s.Pkg == localpkg {
@ -1306,9 +1306,6 @@ func (n *Node) exprfmt(s fmt.State, prec int, mode fmtMode) {
case OPAREN:
mode.Fprintf(s, "(%v)", n.Left)
case ODDDARG:
fmt.Fprint(s, "... argument")
case OLITERAL: // this is a bit of a mess
if mode == FErr {
if n.Orig != nil && n.Orig != n {
@ -1404,6 +1401,10 @@ func (n *Node) exprfmt(s fmt.State, prec int, mode fmtMode) {
case OCOMPLIT:
if mode == FErr {
if n.Implicit() {
mode.Fprintf(s, "... argument")
return
}
if n.Right != nil {
mode.Fprintf(s, "%v literal", n.Right)
return

View file

@ -45,6 +45,14 @@ func isRuntimePkg(p *types.Pkg) bool {
return p.Path == "runtime"
}
// isReflectPkg reports whether p is package reflect.
func isReflectPkg(p *types.Pkg) bool {
if p == localpkg {
return myimportpath == "reflect"
}
return p.Path == "reflect"
}
// The Class of a variable/function describes the "storage class"
// of a variable or function. During parsing, storage classes are
// called declaration contexts.
@ -64,32 +72,30 @@ const (
_ = uint((1 << 3) - iota) // static assert for iota <= (1 << 3)
)
// note this is the runtime representation
// of the compilers slices.
// Slices in the runtime are represented by three components:
//
// typedef struct
// { // must not move anything
// uchar array[8]; // pointer to data
// uchar nel[4]; // number of elements
// uchar cap[4]; // allocated number of elements
// } Slice;
var slice_array int // runtime offsetof(Slice,array) - same for String
var slice_nel int // runtime offsetof(Slice,nel) - same for String
var slice_cap int // runtime offsetof(Slice,cap)
var sizeof_Slice int // runtime sizeof(Slice)
// note this is the runtime representation
// of the compilers strings.
// type slice struct {
// ptr unsafe.Pointer
// len int
// cap int
// }
//
// typedef struct
// { // must not move anything
// uchar array[8]; // pointer to data
// uchar nel[4]; // number of elements
// } String;
var sizeof_String int // runtime sizeof(String)
// Strings in the runtime are represented by two components:
//
// type string struct {
// ptr unsafe.Pointer
// len int
// }
//
// These variables are the offsets of fields and sizes of these structs.
var (
slicePtrOffset int64
sliceLenOffset int64
sliceCapOffset int64
sizeofSlice int64
sizeofString int64
)
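// As an illustrative sketch (assuming a 64-bit target with 8-byte pointers,
// i.e. Widthptr = 8), these would typically be:
//
//	slicePtrOffset = 0
//	sliceLenOffset = 8
//	sliceCapOffset = 16
//	sizeofSlice    = 24
//	sizeofString   = 16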
var pragcgobuf [][]string

View file

@ -70,9 +70,13 @@ func newProgs(fn *Node, worker int) *Progs {
pp.pos = fn.Pos
pp.settext(fn)
pp.nextLive = LivenessInvalid
// PCDATA tables implicitly start with index -1.
pp.prevLive = LivenessIndex{-1, -1}
pp.prevLive = LivenessIndex{-1, -1, false}
if go115ReduceLiveness {
pp.nextLive = pp.prevLive
} else {
pp.nextLive = LivenessInvalid
}
return pp
}
@ -109,7 +113,7 @@ func (pp *Progs) Free() {
// Prog adds a Prog with instruction As to pp.
func (pp *Progs) Prog(as obj.As) *obj.Prog {
if pp.nextLive.stackMapIndex != pp.prevLive.stackMapIndex {
if pp.nextLive.StackMapValid() && pp.nextLive.stackMapIndex != pp.prevLive.stackMapIndex {
// Emit stack map index change.
idx := pp.nextLive.stackMapIndex
pp.prevLive.stackMapIndex = idx
@ -117,13 +121,32 @@ func (pp *Progs) Prog(as obj.As) *obj.Prog {
Addrconst(&p.From, objabi.PCDATA_StackMapIndex)
Addrconst(&p.To, int64(idx))
}
if pp.nextLive.regMapIndex != pp.prevLive.regMapIndex {
// Emit register map index change.
idx := pp.nextLive.regMapIndex
pp.prevLive.regMapIndex = idx
p := pp.Prog(obj.APCDATA)
Addrconst(&p.From, objabi.PCDATA_RegMapIndex)
Addrconst(&p.To, int64(idx))
if !go115ReduceLiveness {
if pp.nextLive.isUnsafePoint {
// Unsafe points are encoded as a special value in the
// register map.
pp.nextLive.regMapIndex = objabi.PCDATA_RegMapUnsafe
}
if pp.nextLive.regMapIndex != pp.prevLive.regMapIndex {
// Emit register map index change.
idx := pp.nextLive.regMapIndex
pp.prevLive.regMapIndex = idx
p := pp.Prog(obj.APCDATA)
Addrconst(&p.From, objabi.PCDATA_RegMapIndex)
Addrconst(&p.To, int64(idx))
}
} else {
if pp.nextLive.isUnsafePoint != pp.prevLive.isUnsafePoint {
// Emit unsafe-point marker.
pp.prevLive.isUnsafePoint = pp.nextLive.isUnsafePoint
p := pp.Prog(obj.APCDATA)
Addrconst(&p.From, objabi.PCDATA_UnsafePoint)
if pp.nextLive.isUnsafePoint {
Addrconst(&p.To, objabi.PCDATA_UnsafePointUnsafe)
} else {
Addrconst(&p.To, objabi.PCDATA_UnsafePointSafe)
}
}
}
p := pp.next

View file

@ -35,6 +35,8 @@
// }
// }
//
// Fingerprint [8]byte
//
// uvarint means a uint64 written out using uvarint encoding.
//
// []T means a uvarint followed by that many T objects. In other
@ -296,6 +298,10 @@ func iexport(out *bufio.Writer) {
io.Copy(out, &hdr)
io.Copy(out, &p.strings)
io.Copy(out, &p.data0)
// Add fingerprint (used by linker object file).
// Attach this to the end, so tools (e.g. gcimporter) don't care.
out.Write(Ctxt.Fingerprint[:])
}
// writeIndex writes out an object index. mainIndex indicates whether
@ -991,7 +997,7 @@ func (w *exportWriter) linkname(s *types.Sym) {
}
func (w *exportWriter) symIdx(s *types.Sym) {
if Ctxt.Flag_newobj {
if Ctxt.Flag_go115newobj {
lsym := s.Linksym()
if lsym.PkgIdx > goobj2.PkgIdxSelf || (lsym.PkgIdx == goobj2.PkgIdxInvalid && !lsym.Indexed()) || s.Linkname != "" {
// Don't export index for non-package symbols, linkname'd symbols,

View file

@ -10,6 +10,7 @@ package gc
import (
"cmd/compile/internal/types"
"cmd/internal/bio"
"cmd/internal/goobj2"
"cmd/internal/obj"
"cmd/internal/src"
"encoding/binary"
@ -95,7 +96,7 @@ func (r *intReader) uint64() uint64 {
return i
}
func iimport(pkg *types.Pkg, in *bio.Reader) {
func iimport(pkg *types.Pkg, in *bio.Reader) (fingerprint goobj2.FingerprintType) {
ir := &intReader{in, pkg}
version := ir.uint64()
@ -188,6 +189,14 @@ func iimport(pkg *types.Pkg, in *bio.Reader) {
inlineImporter[s] = iimporterAndOffset{p, off}
}
}
// Fingerprint
n, err := in.Read(fingerprint[:])
if err != nil || n != len(fingerprint) {
yyerror("import %s: error reading fingerprint", pkg.Path)
errorexit()
}
return fingerprint
}
type iimporter struct {
@ -687,7 +696,7 @@ func (r *importReader) linkname(s *types.Sym) {
}
func (r *importReader) symIdx(s *types.Sym) {
if Ctxt.Flag_newobj {
if Ctxt.Flag_go115newobj {
lsym := s.Linksym()
idx := int32(r.int64())
if idx != -1 {
@ -790,9 +799,6 @@ func (r *importReader) node() *Node {
// case OPAREN:
// unreachable - unpacked by exporter
// case ODDDARG:
// unimplemented
case OLITERAL:
pos := r.pos()
typ, val := r.value()

View file

@ -496,7 +496,14 @@ func inlcalls(fn *Node) {
if countNodes(fn) >= inlineBigFunctionNodes {
maxCost = inlineBigFunctionMaxCost
}
fn = inlnode(fn, maxCost)
// Map to keep track of functions that have been inlined at a particular
// call site, in order to stop inlining when we reach the beginning of a
// recursion cycle again. We don't inline immediately recursive functions,
// but allow inlining if there is a recursion cycle of many functions.
// Most likely, the inlining will stop before we even hit the beginning of
// the cycle again, but the map catches the unusual case.
inlMap := make(map[*Node]bool)
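// Illustrative example (hypothetical functions): if a calls b and b calls a,
// inlining b into a records b in inlMap before walking the inlined body; when
// that body's call back into b is reached, the map entry is found and inlining
// stops instead of recursing forever.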
fn = inlnode(fn, maxCost, inlMap)
if fn != Curfn {
Fatalf("inlnode replaced curfn")
}
@ -537,10 +544,10 @@ func inlconv2list(n *Node) []*Node {
return s
}
func inlnodelist(l Nodes, maxCost int32) {
func inlnodelist(l Nodes, maxCost int32, inlMap map[*Node]bool) {
s := l.Slice()
for i := range s {
s[i] = inlnode(s[i], maxCost)
s[i] = inlnode(s[i], maxCost, inlMap)
}
}
@ -557,7 +564,7 @@ func inlnodelist(l Nodes, maxCost int32) {
// shorter and less complicated.
// The result of inlnode MUST be assigned back to n, e.g.
// n.Left = inlnode(n.Left)
func inlnode(n *Node, maxCost int32) *Node {
func inlnode(n *Node, maxCost int32, inlMap map[*Node]bool) *Node {
if n == nil {
return n
}
@ -578,26 +585,26 @@ func inlnode(n *Node, maxCost int32) *Node {
case OCALLMETH:
// Prevent inlining some reflect.Value methods when using checkptr,
// even when package reflect was compiled without it (#35073).
if s := n.Left.Sym; Debug_checkptr != 0 && s.Pkg.Path == "reflect" && (s.Name == "Value.UnsafeAddr" || s.Name == "Value.Pointer") {
if s := n.Left.Sym; Debug_checkptr != 0 && isReflectPkg(s.Pkg) && (s.Name == "Value.UnsafeAddr" || s.Name == "Value.Pointer") {
return n
}
}
lno := setlineno(n)
inlnodelist(n.Ninit, maxCost)
inlnodelist(n.Ninit, maxCost, inlMap)
for _, n1 := range n.Ninit.Slice() {
if n1.Op == OINLCALL {
inlconv2stmt(n1)
}
}
n.Left = inlnode(n.Left, maxCost)
n.Left = inlnode(n.Left, maxCost, inlMap)
if n.Left != nil && n.Left.Op == OINLCALL {
n.Left = inlconv2expr(n.Left)
}
n.Right = inlnode(n.Right, maxCost)
n.Right = inlnode(n.Right, maxCost, inlMap)
if n.Right != nil && n.Right.Op == OINLCALL {
if n.Op == OFOR || n.Op == OFORUNTIL {
inlconv2stmt(n.Right)
@ -612,7 +619,7 @@ func inlnode(n *Node, maxCost int32) *Node {
}
}
inlnodelist(n.List, maxCost)
inlnodelist(n.List, maxCost, inlMap)
if n.Op == OBLOCK {
for _, n2 := range n.List.Slice() {
if n2.Op == OINLCALL {
@ -628,7 +635,7 @@ func inlnode(n *Node, maxCost int32) *Node {
}
}
inlnodelist(n.Rlist, maxCost)
inlnodelist(n.Rlist, maxCost, inlMap)
s := n.Rlist.Slice()
for i1, n1 := range s {
if n1.Op == OINLCALL {
@ -640,7 +647,7 @@ func inlnode(n *Node, maxCost int32) *Node {
}
}
inlnodelist(n.Nbody, maxCost)
inlnodelist(n.Nbody, maxCost, inlMap)
for _, n := range n.Nbody.Slice() {
if n.Op == OINLCALL {
inlconv2stmt(n)
@ -663,12 +670,12 @@ func inlnode(n *Node, maxCost int32) *Node {
fmt.Printf("%v:call to func %+v\n", n.Line(), n.Left)
}
if n.Left.Func != nil && n.Left.Func.Inl != nil && !isIntrinsicCall(n) { // normal case
n = mkinlcall(n, n.Left, maxCost)
n = mkinlcall(n, n.Left, maxCost, inlMap)
} else if n.Left.isMethodExpression() && asNode(n.Left.Sym.Def) != nil {
n = mkinlcall(n, asNode(n.Left.Sym.Def), maxCost)
n = mkinlcall(n, asNode(n.Left.Sym.Def), maxCost, inlMap)
} else if n.Left.Op == OCLOSURE {
if f := inlinableClosure(n.Left); f != nil {
n = mkinlcall(n, f, maxCost)
n = mkinlcall(n, f, maxCost, inlMap)
}
} else if n.Left.Op == ONAME && n.Left.Name != nil && n.Left.Name.Defn != nil {
if d := n.Left.Name.Defn; d.Op == OAS && d.Right.Op == OCLOSURE {
@ -680,6 +687,10 @@ func inlnode(n *Node, maxCost int32) *Node {
if Debug['m'] > 1 {
fmt.Printf("%v: cannot inline escaping closure variable %v\n", n.Line(), n.Left)
}
if logopt.Enabled() {
logopt.LogOpt(n.Pos, "cannotInlineCall", "inline", Curfn.funcname(),
fmt.Sprintf("%v cannot be inlined (escaping closure variable)", n.Left))
}
break
}
@ -688,13 +699,21 @@ func inlnode(n *Node, maxCost int32) *Node {
if Debug['m'] > 1 {
if a != nil {
fmt.Printf("%v: cannot inline re-assigned closure variable at %v: %v\n", n.Line(), a.Line(), a)
if logopt.Enabled() {
logopt.LogOpt(n.Pos, "cannotInlineCall", "inline", Curfn.funcname(),
fmt.Sprintf("%v cannot be inlined (re-assigned closure variable)", a))
}
} else {
fmt.Printf("%v: cannot inline global closure variable %v\n", n.Line(), n.Left)
if logopt.Enabled() {
logopt.LogOpt(n.Pos, "cannotInlineCall", "inline", Curfn.funcname(),
fmt.Sprintf("%v cannot be inlined (global closure variable)", n.Left))
}
}
}
break
}
n = mkinlcall(n, f, maxCost)
n = mkinlcall(n, f, maxCost, inlMap)
}
}
}
@ -713,7 +732,7 @@ func inlnode(n *Node, maxCost int32) *Node {
Fatalf("no function definition for [%p] %+v\n", n.Left.Type, n.Left.Type)
}
n = mkinlcall(n, asNode(n.Left.Type.FuncType().Nname), maxCost)
n = mkinlcall(n, asNode(n.Left.Type.FuncType().Nname), maxCost, inlMap)
}
lineno = lno
@ -833,9 +852,12 @@ var inlgen int
// parameters.
// The result of mkinlcall MUST be assigned back to n, e.g.
// n.Left = mkinlcall(n.Left, fn, isddd)
func mkinlcall(n, fn *Node, maxCost int32) *Node {
func mkinlcall(n, fn *Node, maxCost int32, inlMap map[*Node]bool) *Node {
if fn.Func.Inl == nil {
// No inlinable body.
if logopt.Enabled() {
logopt.LogOpt(n.Pos, "cannotInlineCall", "inline", Curfn.funcname(),
fmt.Sprintf("%s cannot be inlined", fn.pkgFuncName()))
}
return n
}
if fn.Func.Inl.Cost > maxCost {
@ -866,6 +888,16 @@ func mkinlcall(n, fn *Node, maxCost int32) *Node {
return n
}
if inlMap[fn] {
if Debug['m'] > 1 {
fmt.Printf("%v: cannot inline %v into %v: repeated recursive cycle\n", n.Line(), fn, Curfn.funcname())
}
return n
}
inlMap[fn] = true
defer func() {
inlMap[fn] = false
}()
if Debug_typecheckinl == 0 {
typecheckinl(fn)
}
@ -879,9 +911,6 @@ func mkinlcall(n, fn *Node, maxCost int32) *Node {
if Debug['m'] > 2 {
fmt.Printf("%v: Before inlining: %+v\n", n.Line(), n)
}
if logopt.Enabled() {
logopt.LogOpt(n.Pos, "inlineCall", "inline", Curfn.funcname(), fn.pkgFuncName())
}
if ssaDump != "" && ssaDump == Curfn.funcname() {
ssaDumpInlined = append(ssaDumpInlined, fn)
@ -1129,7 +1158,7 @@ func mkinlcall(n, fn *Node, maxCost int32) *Node {
// instead we emit the things that the body needs
// and each use must redo the inlining.
// luckily these are small.
inlnodelist(call.Nbody, maxCost)
inlnodelist(call.Nbody, maxCost, inlMap)
for _, n := range call.Nbody.Slice() {
if n.Op == OINLCALL {
inlconv2stmt(n)
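The inlMap threaded through inlnode and mkinlcall above is a standard cycle-breaking device: mark a function before inlining into its body, unmark it on the way back out, and refuse to inline anything currently marked. A self-contained sketch of the same idea with made-up types, not the compiler's own representation:

package main

import "fmt"

type fn struct {
	name  string
	calls []*fn
}

// inline walks the call graph but refuses to descend into a function
// already on the active path, which is what stops a recursion cycle
// that spans several functions.
func inline(f *fn, active map[*fn]bool) {
	if active[f] {
		fmt.Println("skip", f.name, "(recursion cycle)")
		return
	}
	active[f] = true
	defer delete(active, f)
	for _, callee := range f.calls {
		inline(callee, active)
	}
	fmt.Println("inlined", f.name)
}

func main() {
	a := &fn{name: "a"}
	b := &fn{name: "b", calls: []*fn{a}}
	a.calls = []*fn{b} // a -> b -> a: a cycle across two functions
	inline(a, map[*fn]bool{})
}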

View file

@ -155,11 +155,11 @@ func TestIntendedInlining(t *testing.T) {
},
}
if runtime.GOARCH != "386" && runtime.GOARCH != "mips64" && runtime.GOARCH != "mips64le" {
if runtime.GOARCH != "386" && runtime.GOARCH != "mips64" && runtime.GOARCH != "mips64le" && runtime.GOARCH != "riscv64" {
// nextFreeFast calls sys.Ctz64, which on 386 is implemented in asm and is not inlinable.
// We currently don't have midstack inlining so nextFreeFast is also not inlinable on 386.
// On MIPS64x, Ctz64 is not intrinsified and causes nextFreeFast too expensive to inline
// (Issue 22239).
// On mips64x and riscv64, Ctz64 is not intrinsified and causes nextFreeFast too expensive
// to inline (Issue 22239).
want["runtime"] = append(want["runtime"], "nextFreeFast")
}
if runtime.GOARCH != "386" {
@ -175,7 +175,7 @@ func TestIntendedInlining(t *testing.T) {
}
switch runtime.GOARCH {
case "386", "wasm", "arm", "riscv64":
case "386", "wasm", "arm":
default:
// TODO(mvdan): As explained in /test/inline_sync.go, some
// architectures don't have atomic intrinsics, so these go over

View file

@ -28,16 +28,18 @@ func isQuoted(s string) bool {
return len(s) >= 2 && s[0] == '"' && s[len(s)-1] == '"'
}
type PragmaFlag int16
const (
// Func pragmas.
Nointerface syntax.Pragma = 1 << iota
Noescape // func parameters don't escape
Norace // func must not have race detector annotations
Nosplit // func should not execute on separate stack
Noinline // func should not be inlined
NoCheckPtr // func should not be instrumented by checkptr
CgoUnsafeArgs // treat a pointer to one arg as a pointer to them all
UintptrEscapes // pointers converted to uintptr escape
Nointerface PragmaFlag = 1 << iota
Noescape // func parameters don't escape
Norace // func must not have race detector annotations
Nosplit // func should not execute on separate stack
Noinline // func should not be inlined
NoCheckPtr // func should not be instrumented by checkptr
CgoUnsafeArgs // treat a pointer to one arg as a pointer to them all
UintptrEscapes // pointers converted to uintptr escape
// Runtime-only func pragmas.
// See ../../../../runtime/README.md for detailed descriptions.
@ -50,7 +52,24 @@ const (
NotInHeap // values of this type must not be heap allocated
)
func pragmaValue(verb string) syntax.Pragma {
const (
FuncPragmas = Nointerface |
Noescape |
Norace |
Nosplit |
Noinline |
NoCheckPtr |
CgoUnsafeArgs |
UintptrEscapes |
Systemstack |
Nowritebarrier |
Nowritebarrierrec |
Yeswritebarrierrec
TypePragmas = NotInHeap
)
func pragmaFlag(verb string) PragmaFlag {
switch verb {
case "go:nointerface":
if objabi.Fieldtrack_enabled != 0 {
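With pragmas now collected as PragmaFlag bits and filtered through the FuncPragmas and TypePragmas masks, a directive attached to a declaration that cannot consume it is diagnosed instead of being silently dropped. Assuming the "misplaced compiler directive" reporting wired up by checkUnused elsewhere in this change, the user-visible effect looks roughly like:

//go:noinline
func f() {} // ok: Noinline is in FuncPragmas, so the function declaration consumes it

//go:noinline
var x int // reported: misplaced compiler directive (no declaration consumes the flag)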

View file

@ -14,6 +14,7 @@ import (
"cmd/compile/internal/types"
"cmd/internal/bio"
"cmd/internal/dwarf"
"cmd/internal/goobj2"
"cmd/internal/obj"
"cmd/internal/objabi"
"cmd/internal/src"
@ -280,7 +281,7 @@ func Main(archInit func(*Arch)) {
flag.StringVar(&benchfile, "bench", "", "append benchmark times to `file`")
flag.BoolVar(&smallFrames, "smallframes", false, "reduce the size limit for stack allocated objects")
flag.BoolVar(&Ctxt.UseBASEntries, "dwarfbasentries", Ctxt.UseBASEntries, "use base address selection entries in DWARF")
flag.BoolVar(&Ctxt.Flag_newobj, "newobj", false, "use new object file format")
flag.BoolVar(&Ctxt.Flag_go115newobj, "go115newobj", true, "use new object file format")
flag.StringVar(&jsonLogOpt, "json", "", "version,destination for JSON compiler/optimizer logging")
objabi.Flagparse(usage)
@ -314,7 +315,7 @@ func Main(archInit func(*Arch)) {
// Record flags that affect the build result. (And don't
// record flags that don't, since that would cause spurious
// changes in the binary.)
recordFlags("B", "N", "l", "msan", "race", "shared", "dynlink", "dwarflocationlists", "dwarfbasentries", "smallframes", "spectre", "newobj")
recordFlags("B", "N", "l", "msan", "race", "shared", "dynlink", "dwarflocationlists", "dwarfbasentries", "smallframes", "spectre", "go115newobj")
if smallFrames {
maxStackVarSize = 128 * 1024
@ -379,9 +380,8 @@ func Main(archInit func(*Arch)) {
if flag_race && flag_msan {
log.Fatal("cannot use both -race and -msan")
}
if (flag_race || flag_msan) && objabi.GOOS != "windows" {
// -race and -msan imply -d=checkptr for now (except on windows).
// TODO(mdempsky): Re-evaluate before Go 1.14. See #34964.
if flag_race || flag_msan {
// -race and -msan imply -d=checkptr for now.
Debug_checkptr = 1
}
if ispkgin(omit_pkgs) {
@ -679,8 +679,12 @@ func Main(archInit func(*Arch)) {
if Debug['l'] != 0 {
// Find functions that can be inlined and clone them before walk expands them.
visitBottomUp(xtop, func(list []*Node, recursive bool) {
numfns := numNonClosures(list)
for _, n := range list {
if !recursive {
if !recursive || numfns > 1 {
// We allow inlining if there is no
// recursion, or the recursion cycle is
// across more than one function.
caninl(n)
} else {
if Debug['m'] > 1 {
@ -824,6 +828,17 @@ func Main(archInit func(*Arch)) {
}
}
// numNonClosures returns the number of functions in list which are not closures.
func numNonClosures(list []*Node) int {
count := 0
for _, n := range list {
if n.Func.Closure == nil {
count++
}
}
return count
}
func writebench(filename string) error {
f, err := os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666)
if err != nil {
@ -1240,15 +1255,6 @@ func importfile(f *Val) *types.Pkg {
}
}
// assume files move (get installed) so don't record the full path
if packageFile != nil {
// If using a packageFile map, assume path_ can be recorded directly.
Ctxt.AddImport(path_)
} else {
// For file "/Users/foo/go/pkg/darwin_amd64/math.a" record "math.a".
Ctxt.AddImport(file[len(file)-len(path_)-len(".a"):])
}
// In the importfile, if we find:
// $$\n (textual format): not supported anymore
// $$B\n (binary format) : import directly, then feed the lexer a dummy statement
@ -1273,6 +1279,7 @@ func importfile(f *Val) *types.Pkg {
c, _ = imp.ReadByte()
}
var fingerprint goobj2.FingerprintType
switch c {
case '\n':
yyerror("cannot import %s: old export format no longer supported (recompile library)", path_)
@ -1296,13 +1303,22 @@ func importfile(f *Val) *types.Pkg {
yyerror("import %s: unexpected package format byte: %v", file, c)
errorexit()
}
iimport(importpkg, imp)
fingerprint = iimport(importpkg, imp)
default:
yyerror("no import in %q", path_)
errorexit()
}
// assume files move (get installed) so don't record the full path
if packageFile != nil {
// If using a packageFile map, assume path_ can be recorded directly.
Ctxt.AddImport(path_, fingerprint)
} else {
// For file "/Users/foo/go/pkg/darwin_amd64/math.a" record "math.a".
Ctxt.AddImport(file[len(file)-len(path_)-len(".a"):], fingerprint)
}
if importpkg.Height >= myheight {
myheight = importpkg.Height + 1
}

View file

@ -241,6 +241,10 @@ func (p *noder) node() {
p.setlineno(p.file.PkgName)
mkpackage(p.file.PkgName.Value)
if pragma, ok := p.file.Pragma.(*Pragma); ok {
p.checkUnused(pragma)
}
xtop = append(xtop, p.decls(p.file.DeclList)...)
for _, n := range p.linknames {
@ -313,6 +317,10 @@ func (p *noder) importDecl(imp *syntax.ImportDecl) {
return // avoid follow-on errors if there was a syntax error
}
if pragma, ok := imp.Pragma.(*Pragma); ok {
p.checkUnused(pragma)
}
val := p.basicLit(imp.Path)
ipkg := importfile(&val)
@ -363,6 +371,10 @@ func (p *noder) varDecl(decl *syntax.VarDecl) []*Node {
exprs = p.exprList(decl.Values)
}
if pragma, ok := decl.Pragma.(*Pragma); ok {
p.checkUnused(pragma)
}
p.setlineno(decl)
return variter(names, typ, exprs)
}
@ -384,6 +396,10 @@ func (p *noder) constDecl(decl *syntax.ConstDecl, cs *constState) []*Node {
}
}
if pragma, ok := decl.Pragma.(*Pragma); ok {
p.checkUnused(pragma)
}
names := p.declNames(decl.NameList)
typ := p.typeExprOrNil(decl.Type)
@ -438,11 +454,13 @@ func (p *noder) typeDecl(decl *syntax.TypeDecl) *Node {
param := n.Name.Param
param.Ntype = typ
param.Pragma = decl.Pragma
param.Alias = decl.Alias
if param.Alias && param.Pragma != 0 {
yyerror("cannot specify directive with type alias")
param.Pragma = 0
if pragma, ok := decl.Pragma.(*Pragma); ok {
if !decl.Alias {
param.Pragma = pragma.Flag & TypePragmas
pragma.Flag &^= TypePragmas
}
p.checkUnused(pragma)
}
nod := p.nod(decl, ODCLTYPE, n, nil)
@ -493,10 +511,13 @@ func (p *noder) funcDecl(fun *syntax.FuncDecl) *Node {
f.Func.Nname.Name.Defn = f
f.Func.Nname.Name.Param.Ntype = t
pragma := fun.Pragma
f.Func.Pragma = fun.Pragma
if pragma&Systemstack != 0 && pragma&Nosplit != 0 {
yyerrorl(f.Pos, "go:nosplit and go:systemstack cannot be combined")
if pragma, ok := fun.Pragma.(*Pragma); ok {
f.Func.Pragma = pragma.Flag & FuncPragmas
if pragma.Flag&Systemstack != 0 && pragma.Flag&Nosplit != 0 {
yyerrorl(f.Pos, "go:nosplit and go:systemstack cannot be combined")
}
pragma.Flag &^= FuncPragmas
p.checkUnused(pragma)
}
if fun.Recv == nil {
@ -1479,13 +1500,58 @@ var allowedStdPragmas = map[string]bool{
"go:generate": true,
}
// *Pragma is the value stored in a syntax.Pragma during parsing.
type Pragma struct {
Flag PragmaFlag // collected bits
Pos []PragmaPos // position of each individual flag
}
type PragmaPos struct {
Flag PragmaFlag
Pos syntax.Pos
}
func (p *noder) checkUnused(pragma *Pragma) {
for _, pos := range pragma.Pos {
if pos.Flag&pragma.Flag != 0 {
p.yyerrorpos(pos.Pos, "misplaced compiler directive")
}
}
}
func (p *noder) checkUnusedDuringParse(pragma *Pragma) {
for _, pos := range pragma.Pos {
if pos.Flag&pragma.Flag != 0 {
p.error(syntax.Error{Pos: pos.Pos, Msg: "misplaced compiler directive"})
}
}
}
// pragma is called concurrently if files are parsed concurrently.
func (p *noder) pragma(pos syntax.Pos, text string) syntax.Pragma {
switch {
case strings.HasPrefix(text, "line "):
func (p *noder) pragma(pos syntax.Pos, blankLine bool, text string, old syntax.Pragma) syntax.Pragma {
pragma, _ := old.(*Pragma)
if pragma == nil {
pragma = new(Pragma)
}
if text == "" {
// unused pragma; only called with old != nil.
p.checkUnusedDuringParse(pragma)
return nil
}
if strings.HasPrefix(text, "line ") {
// line directives are handled by syntax package
panic("unreachable")
}
if !blankLine {
// directive must be on line by itself
p.error(syntax.Error{Pos: pos, Msg: "misplaced compiler directive"})
return pragma
}
switch {
case strings.HasPrefix(text, "go:linkname "):
f := strings.Fields(text)
if !(2 <= len(f) && len(f) <= 3) {
@ -1513,7 +1579,8 @@ func (p *noder) pragma(pos syntax.Pos, text string) syntax.Pragma {
p.error(syntax.Error{Pos: pos, Msg: fmt.Sprintf("invalid library name %q in cgo_import_dynamic directive", lib)})
}
p.pragcgo(pos, text)
return pragmaValue("go:cgo_import_dynamic")
pragma.Flag |= pragmaFlag("go:cgo_import_dynamic")
break
}
fallthrough
case strings.HasPrefix(text, "go:cgo_"):
@ -1530,18 +1597,19 @@ func (p *noder) pragma(pos syntax.Pos, text string) syntax.Pragma {
if i := strings.Index(text, " "); i >= 0 {
verb = verb[:i]
}
prag := pragmaValue(verb)
flag := pragmaFlag(verb)
const runtimePragmas = Systemstack | Nowritebarrier | Nowritebarrierrec | Yeswritebarrierrec
if !compiling_runtime && prag&runtimePragmas != 0 {
if !compiling_runtime && flag&runtimePragmas != 0 {
p.error(syntax.Error{Pos: pos, Msg: fmt.Sprintf("//%s only allowed in runtime", verb)})
}
if prag == 0 && !allowedStdPragmas[verb] && compiling_std {
if flag == 0 && !allowedStdPragmas[verb] && compiling_std {
p.error(syntax.Error{Pos: pos, Msg: fmt.Sprintf("//%s is not allowed in the standard library", verb)})
}
return prag
pragma.Flag |= flag
pragma.Pos = append(pragma.Pos, PragmaPos{flag, pos})
}
return 0
return pragma
}
// isCgoGeneratedFile reports whether pos is in a file

View file

@ -365,11 +365,12 @@ func stringsym(pos src.XPos, s string) (data *obj.LSym) {
var slicebytes_gen int
func slicebytes(nam *Node, s string, len int) {
func slicebytes(nam *Node, s string) {
slicebytes_gen++
symname := fmt.Sprintf(".gobytes.%d", slicebytes_gen)
sym := localpkg.Lookup(symname)
sym.Def = asTypesNode(newname(sym))
symnode := newname(sym)
sym.Def = asTypesNode(symnode)
lsym := sym.Linksym()
off := dsname(lsym, 0, s, nam.Pos, "slice")
@ -378,11 +379,7 @@ func slicebytes(nam *Node, s string, len int) {
if nam.Op != ONAME {
Fatalf("slicebytes %v", nam)
}
nsym := nam.Sym.Linksym()
off = int(nam.Xoffset)
off = dsymptr(nsym, off, lsym, 0)
off = duintptr(nsym, off, uint64(len))
duintptr(nsym, off, uint64(len))
slicesym(nam, symnode, int64(len(s)))
}
func dsname(s *obj.LSym, off int, t string, pos src.XPos, what string) int {
@ -417,69 +414,99 @@ func dsymptrWeakOff(s *obj.LSym, off int, x *obj.LSym) int {
return off
}
func gdata(nam *Node, nr *Node, wid int) {
if nam.Op != ONAME {
Fatalf("gdata nam op %v", nam.Op)
// slicesym writes a static slice symbol {&arr, lencap, lencap} to n.
// arr must be an ONAME. slicesym does not modify n.
func slicesym(n, arr *Node, lencap int64) {
s := n.Sym.Linksym()
base := n.Xoffset
if arr.Op != ONAME {
Fatalf("slicesym non-name arr %v", arr)
}
if nam.Sym == nil {
Fatalf("gdata nil nam sym")
s.WriteAddr(Ctxt, base, Widthptr, arr.Sym.Linksym(), arr.Xoffset)
s.WriteInt(Ctxt, base+sliceLenOffset, Widthptr, lencap)
s.WriteInt(Ctxt, base+sliceCapOffset, Widthptr, lencap)
}
// addrsym writes the static address of a to n. a must be an ONAME.
// Neither n nor a is modified.
func addrsym(n, a *Node) {
if n.Op != ONAME {
Fatalf("addrsym n op %v", n.Op)
}
s := nam.Sym.Linksym()
if n.Sym == nil {
Fatalf("addrsym nil n sym")
}
if a.Op != ONAME {
Fatalf("addrsym a op %v", a.Op)
}
s := n.Sym.Linksym()
s.WriteAddr(Ctxt, n.Xoffset, Widthptr, a.Sym.Linksym(), a.Xoffset)
}
switch nr.Op {
case OLITERAL:
switch u := nr.Val().U.(type) {
case bool:
i := int64(obj.Bool2int(u))
s.WriteInt(Ctxt, nam.Xoffset, wid, i)
// pfuncsym writes the static address of f to n. f must be a global function.
// Neither n nor f is modified.
func pfuncsym(n, f *Node) {
if n.Op != ONAME {
Fatalf("pfuncsym n op %v", n.Op)
}
if n.Sym == nil {
Fatalf("pfuncsym nil n sym")
}
if f.Class() != PFUNC {
Fatalf("pfuncsym class not PFUNC %d", f.Class())
}
s := n.Sym.Linksym()
s.WriteAddr(Ctxt, n.Xoffset, Widthptr, funcsym(f.Sym).Linksym(), f.Xoffset)
}
case *Mpint:
s.WriteInt(Ctxt, nam.Xoffset, wid, u.Int64())
// litsym writes the static literal c to n.
// Neither n nor c is modified.
func litsym(n, c *Node, wid int) {
if n.Op != ONAME {
Fatalf("litsym n op %v", n.Op)
}
if c.Op != OLITERAL {
Fatalf("litsym c op %v", c.Op)
}
if n.Sym == nil {
Fatalf("litsym nil n sym")
}
s := n.Sym.Linksym()
switch u := c.Val().U.(type) {
case bool:
i := int64(obj.Bool2int(u))
s.WriteInt(Ctxt, n.Xoffset, wid, i)
case *Mpflt:
f := u.Float64()
switch nam.Type.Etype {
case TFLOAT32:
s.WriteFloat32(Ctxt, nam.Xoffset, float32(f))
case TFLOAT64:
s.WriteFloat64(Ctxt, nam.Xoffset, f)
}
case *Mpint:
s.WriteInt(Ctxt, n.Xoffset, wid, u.Int64())
case *Mpcplx:
r := u.Real.Float64()
i := u.Imag.Float64()
switch nam.Type.Etype {
case TCOMPLEX64:
s.WriteFloat32(Ctxt, nam.Xoffset, float32(r))
s.WriteFloat32(Ctxt, nam.Xoffset+4, float32(i))
case TCOMPLEX128:
s.WriteFloat64(Ctxt, nam.Xoffset, r)
s.WriteFloat64(Ctxt, nam.Xoffset+8, i)
}
case string:
symdata := stringsym(nam.Pos, u)
s.WriteAddr(Ctxt, nam.Xoffset, Widthptr, symdata, 0)
s.WriteInt(Ctxt, nam.Xoffset+int64(Widthptr), Widthptr, int64(len(u)))
default:
Fatalf("gdata unhandled OLITERAL %v", nr)
case *Mpflt:
f := u.Float64()
switch n.Type.Etype {
case TFLOAT32:
s.WriteFloat32(Ctxt, n.Xoffset, float32(f))
case TFLOAT64:
s.WriteFloat64(Ctxt, n.Xoffset, f)
}
case OADDR:
if nr.Left.Op != ONAME {
Fatalf("gdata ADDR left op %v", nr.Left.Op)
case *Mpcplx:
r := u.Real.Float64()
i := u.Imag.Float64()
switch n.Type.Etype {
case TCOMPLEX64:
s.WriteFloat32(Ctxt, n.Xoffset, float32(r))
s.WriteFloat32(Ctxt, n.Xoffset+4, float32(i))
case TCOMPLEX128:
s.WriteFloat64(Ctxt, n.Xoffset, r)
s.WriteFloat64(Ctxt, n.Xoffset+8, i)
}
to := nr.Left
s.WriteAddr(Ctxt, nam.Xoffset, wid, to.Sym.Linksym(), to.Xoffset)
case ONAME:
if nr.Class() != PFUNC {
Fatalf("gdata NAME not PFUNC %d", nr.Class())
}
s.WriteAddr(Ctxt, nam.Xoffset, wid, funcsym(nr.Sym).Linksym(), nr.Xoffset)
case string:
symdata := stringsym(n.Pos, u)
s.WriteAddr(Ctxt, n.Xoffset, Widthptr, symdata, 0)
s.WriteInt(Ctxt, n.Xoffset+int64(Widthptr), Widthptr, int64(len(u)))
default:
Fatalf("gdata unhandled op %v %v\n", nr, nr.Op)
Fatalf("litsym unhandled OLITERAL %v", c)
}
}
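slicesym above emits the same three-word header {array pointer, len, cap} that a slice value carries at run time, just written directly into the symbol's data. A small illustration of that layout using unsafe; it relies on the well-known slice header shape and is meant only as a sketch, not as portable code:

package main

import (
	"fmt"
	"unsafe"
)

func main() {
	var buf [4]byte
	s := buf[:]

	// A slice header is three machine words: pointer, length, capacity.
	words := (*[3]uintptr)(unsafe.Pointer(&s))
	fmt.Printf("ptr=%#x len=%d cap=%d\n", words[0], words[1], words[2])
	fmt.Println(unsafe.Sizeof(s)) // 24 on 64-bit targets
}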

View file

@ -1,4 +1,4 @@
// Code generated by "stringer -type Op -trimprefix O"; DO NOT EDIT.
// Code generated by "stringer -type=Op -trimprefix=O"; DO NOT EDIT.
package gc
@ -144,28 +144,27 @@ func _() {
_ = x[OTFUNC-133]
_ = x[OTARRAY-134]
_ = x[ODDD-135]
_ = x[ODDDARG-136]
_ = x[OINLCALL-137]
_ = x[OEFACE-138]
_ = x[OITAB-139]
_ = x[OIDATA-140]
_ = x[OSPTR-141]
_ = x[OCLOSUREVAR-142]
_ = x[OCFUNC-143]
_ = x[OCHECKNIL-144]
_ = x[OVARDEF-145]
_ = x[OVARKILL-146]
_ = x[OVARLIVE-147]
_ = x[ORESULT-148]
_ = x[OINLMARK-149]
_ = x[ORETJMP-150]
_ = x[OGETG-151]
_ = x[OEND-152]
_ = x[OINLCALL-136]
_ = x[OEFACE-137]
_ = x[OITAB-138]
_ = x[OIDATA-139]
_ = x[OSPTR-140]
_ = x[OCLOSUREVAR-141]
_ = x[OCFUNC-142]
_ = x[OCHECKNIL-143]
_ = x[OVARDEF-144]
_ = x[OVARKILL-145]
_ = x[OVARLIVE-146]
_ = x[ORESULT-147]
_ = x[OINLMARK-148]
_ = x[ORETJMP-149]
_ = x[OGETG-150]
_ = x[OEND-151]
}
const _Op_name = "XXXNAMENONAMETYPEPACKLITERALADDSUBORXORADDSTRADDRANDANDAPPENDBYTES2STRBYTES2STRTMPRUNES2STRSTR2BYTESSTR2BYTESTMPSTR2RUNESASAS2AS2DOTTYPEAS2FUNCAS2MAPRAS2RECVASOPCALLCALLFUNCCALLMETHCALLINTERCALLPARTCAPCLOSECLOSURECOMPLITMAPLITSTRUCTLITARRAYLITSLICELITPTRLITCONVCONVIFACECONVNOPCOPYDCLDCLFUNCDCLFIELDDCLCONSTDCLTYPEDELETEDOTDOTPTRDOTMETHDOTINTERXDOTDOTTYPEDOTTYPE2EQNELTLEGEGTDEREFINDEXINDEXMAPKEYSTRUCTKEYLENMAKEMAKECHANMAKEMAPMAKESLICEMULDIVMODLSHRSHANDANDNOTNEWNEWOBJNOTBITNOTPLUSNEGORORPANICPRINTPRINTNPARENSENDSLICESLICEARRSLICESTRSLICE3SLICE3ARRSLICEHEADERRECOVERRECVRUNESTRSELRECVSELRECV2IOTAREALIMAGCOMPLEXALIGNOFOFFSETOFSIZEOFBLOCKBREAKCASECONTINUEDEFEREMPTYFALLFORFORUNTILGOTOIFLABELGORANGERETURNSELECTSWITCHTYPESWTCHANTMAPTSTRUCTTINTERTFUNCTARRAYDDDDDDARGINLCALLEFACEITABIDATASPTRCLOSUREVARCFUNCCHECKNILVARDEFVARKILLVARLIVERESULTINLMARKRETJMPGETGEND"
const _Op_name = "XXXNAMENONAMETYPEPACKLITERALADDSUBORXORADDSTRADDRANDANDAPPENDBYTES2STRBYTES2STRTMPRUNES2STRSTR2BYTESSTR2BYTESTMPSTR2RUNESASAS2AS2DOTTYPEAS2FUNCAS2MAPRAS2RECVASOPCALLCALLFUNCCALLMETHCALLINTERCALLPARTCAPCLOSECLOSURECOMPLITMAPLITSTRUCTLITARRAYLITSLICELITPTRLITCONVCONVIFACECONVNOPCOPYDCLDCLFUNCDCLFIELDDCLCONSTDCLTYPEDELETEDOTDOTPTRDOTMETHDOTINTERXDOTDOTTYPEDOTTYPE2EQNELTLEGEGTDEREFINDEXINDEXMAPKEYSTRUCTKEYLENMAKEMAKECHANMAKEMAPMAKESLICEMULDIVMODLSHRSHANDANDNOTNEWNEWOBJNOTBITNOTPLUSNEGORORPANICPRINTPRINTNPARENSENDSLICESLICEARRSLICESTRSLICE3SLICE3ARRSLICEHEADERRECOVERRECVRUNESTRSELRECVSELRECV2IOTAREALIMAGCOMPLEXALIGNOFOFFSETOFSIZEOFBLOCKBREAKCASECONTINUEDEFEREMPTYFALLFORFORUNTILGOTOIFLABELGORANGERETURNSELECTSWITCHTYPESWTCHANTMAPTSTRUCTTINTERTFUNCTARRAYDDDINLCALLEFACEITABIDATASPTRCLOSUREVARCFUNCCHECKNILVARDEFVARKILLVARLIVERESULTINLMARKRETJMPGETGEND"
var _Op_index = [...]uint16{0, 3, 7, 13, 17, 21, 28, 31, 34, 36, 39, 45, 49, 55, 61, 70, 82, 91, 100, 112, 121, 123, 126, 136, 143, 150, 157, 161, 165, 173, 181, 190, 198, 201, 206, 213, 220, 226, 235, 243, 251, 257, 261, 270, 277, 281, 284, 291, 299, 307, 314, 320, 323, 329, 336, 344, 348, 355, 363, 365, 367, 369, 371, 373, 375, 380, 385, 393, 396, 405, 408, 412, 420, 427, 436, 439, 442, 445, 448, 451, 454, 460, 463, 469, 472, 478, 482, 485, 489, 494, 499, 505, 510, 514, 519, 527, 535, 541, 550, 561, 568, 572, 579, 586, 594, 598, 602, 606, 613, 620, 628, 634, 639, 644, 648, 656, 661, 666, 670, 673, 681, 685, 687, 692, 694, 699, 705, 711, 717, 723, 728, 732, 739, 745, 750, 756, 759, 765, 772, 777, 781, 786, 790, 800, 805, 813, 819, 826, 833, 839, 846, 852, 856, 859}
var _Op_index = [...]uint16{0, 3, 7, 13, 17, 21, 28, 31, 34, 36, 39, 45, 49, 55, 61, 70, 82, 91, 100, 112, 121, 123, 126, 136, 143, 150, 157, 161, 165, 173, 181, 190, 198, 201, 206, 213, 220, 226, 235, 243, 251, 257, 261, 270, 277, 281, 284, 291, 299, 307, 314, 320, 323, 329, 336, 344, 348, 355, 363, 365, 367, 369, 371, 373, 375, 380, 385, 393, 396, 405, 408, 412, 420, 427, 436, 439, 442, 445, 448, 451, 454, 460, 463, 469, 472, 478, 482, 485, 489, 494, 499, 505, 510, 514, 519, 527, 535, 541, 550, 561, 568, 572, 579, 586, 594, 598, 602, 606, 613, 620, 628, 634, 639, 644, 648, 656, 661, 666, 670, 673, 681, 685, 687, 692, 694, 699, 705, 711, 717, 723, 728, 732, 739, 745, 750, 756, 759, 766, 771, 775, 780, 784, 794, 799, 807, 813, 820, 827, 833, 840, 846, 850, 853}
func (i Op) String() string {
if i >= Op(len(_Op_index)-1) {

View file

@ -407,41 +407,43 @@ func (o *Order) call(n *Node) {
// Caller should have already called o.init(n).
Fatalf("%v with unexpected ninit", n.Op)
}
n.Left = o.expr(n.Left, nil)
n.Right = o.expr(n.Right, nil) // ODDDARG temp
o.exprList(n.List)
if n.Op != OCALLFUNC && n.Op != OCALLMETH {
// Builtin functions.
if n.Op != OCALLFUNC && n.Op != OCALLMETH && n.Op != OCALLINTER {
n.Left = o.expr(n.Left, nil)
n.Right = o.expr(n.Right, nil)
o.exprList(n.List)
return
}
keepAlive := func(i int) {
fixVariadicCall(n)
n.Left = o.expr(n.Left, nil)
o.exprList(n.List)
if n.Op == OCALLINTER {
return
}
keepAlive := func(arg *Node) {
// If the argument is really a pointer being converted to uintptr,
// arrange for the pointer to be kept alive until the call returns,
// by copying it into a temp and marking that temp
// still alive when we pop the temp stack.
xp := n.List.Addr(i)
for (*xp).Op == OCONVNOP && !(*xp).Type.IsUnsafePtr() {
xp = &(*xp).Left
}
x := *xp
if x.Type.IsUnsafePtr() {
x = o.copyExpr(x, x.Type, false)
if arg.Op == OCONVNOP && arg.Left.Type.IsUnsafePtr() {
x := o.copyExpr(arg.Left, arg.Left.Type, false)
x.Name.SetKeepalive(true)
*xp = x
arg.Left = x
}
}
for i, t := range n.Left.Type.Params().FieldSlice() {
// Check for "unsafe-uintptr" tag provided by escape analysis.
if t.IsDDD() && !n.IsDDD() {
if t.Note == uintptrEscapesTag {
for ; i < n.List.Len(); i++ {
keepAlive(i)
// Check for "unsafe-uintptr" tag provided by escape analysis.
for i, param := range n.Left.Type.Params().FieldSlice() {
if param.Note == unsafeUintptrTag || param.Note == uintptrEscapesTag {
if arg := n.List.Index(i); arg.Op == OSLICELIT {
for _, elt := range arg.List.Slice() {
keepAlive(elt)
}
}
} else {
if t.Note == unsafeUintptrTag || t.Note == uintptrEscapesTag {
keepAlive(i)
} else {
keepAlive(arg)
}
}
}
@ -1214,15 +1216,6 @@ func (o *Order) expr(n, lhs *Node) *Node {
prealloc[n] = o.newTemp(t, false)
}
case ODDDARG:
if n.Transient() {
// The ddd argument does not live beyond the call it is created for.
// Allocate a temporary that will be cleaned up when this statement
// completes. We could be more aggressive and try to arrange for it
// to be cleaned up when the call completes.
prealloc[n] = o.newTemp(n.Type.Elem(), false)
}
case ODOTTYPE, ODOTTYPE2:
n.Left = o.expr(n.Left, nil)
if !isdirectiface(n.Type) || instrumenting {
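The keepAlive handling above exists to support the classic syscall pattern: when a pointer is converted to uintptr directly in the argument list of a function whose parameter carries the unsafe-uintptr or uintptrEscapes tag, the pointed-to object must stay live until the call returns. The usual user-level shape of that pattern (Linux-only, shown purely to illustrate what the ordering pass protects):

package main

import (
	"syscall"
	"unsafe"
)

func main() {
	msg := []byte("hi\n")
	// write(2): the temporary holding &msg[0] is kept alive across the
	// call even though only a uintptr is passed.
	syscall.Syscall(syscall.SYS_WRITE, 1, uintptr(unsafe.Pointer(&msg[0])), uintptr(len(msg)))
}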

View file

@ -24,6 +24,16 @@ import (
"strings"
)
// go115ReduceLiveness disables register maps and only produces stack
// maps at call sites.
//
// In Go 1.15, we changed debug call injection to use conservative
// scanning instead of precise pointer maps, so these are no longer
// necessary.
//
// Keep in sync with runtime/preempt.go:go115ReduceLiveness.
const go115ReduceLiveness = true
// OpVarDef is an annotation for the liveness analysis, marking a place
// where a complete initialization (definition) of a variable begins.
// Since the liveness analysis can see initialization of single-word
@ -107,7 +117,11 @@ type Liveness struct {
be []BlockEffects
// unsafePoints bit i is set if Value ID i is not a safe point.
// allUnsafe indicates that all points in this function are
// unsafe-points.
allUnsafe bool
// unsafePoints bit i is set if Value ID i is an unsafe-point
// (preemption is not allowed). Only valid if !allUnsafe.
unsafePoints bvec
// An array with a bit vector for each safe point in the
@ -143,52 +157,72 @@ type openDeferVarInfo struct {
// LivenessMap maps from *ssa.Value to LivenessIndex.
type LivenessMap struct {
m []LivenessIndex
vals map[ssa.ID]LivenessIndex
}
func (m *LivenessMap) reset(ids int) {
m2 := m.m
if ids > cap(m2) {
m2 = make([]LivenessIndex, ids)
func (m *LivenessMap) reset() {
if m.vals == nil {
m.vals = make(map[ssa.ID]LivenessIndex)
} else {
m2 = m2[:ids]
for k := range m.vals {
delete(m.vals, k)
}
}
none := LivenessInvalid
for i := range m2 {
m2[i] = none
}
m.m = m2
}
func (m *LivenessMap) set(v *ssa.Value, i LivenessIndex) {
m.m[v.ID] = i
m.vals[v.ID] = i
}
func (m LivenessMap) Get(v *ssa.Value) LivenessIndex {
if int(v.ID) < len(m.m) {
return m.m[int(v.ID)]
if !go115ReduceLiveness {
// All safe-points are in the map, so if v isn't in
// the map, it's an unsafe-point.
if idx, ok := m.vals[v.ID]; ok {
return idx
}
return LivenessInvalid
}
// Not a safe point.
return LivenessInvalid
// If v isn't in the map, then it's a "don't care" and not an
// unsafe-point.
if idx, ok := m.vals[v.ID]; ok {
return idx
}
return LivenessIndex{StackMapDontCare, StackMapDontCare, false}
}
// LivenessIndex stores the liveness map index for a safe-point.
// LivenessIndex stores the liveness map information for a Value.
type LivenessIndex struct {
stackMapIndex int
regMapIndex int
regMapIndex int // only for !go115ReduceLiveness
// isUnsafePoint indicates that this is an unsafe-point.
//
// Note that it's possible for a call Value to have a stack
// map while also being an unsafe-point. This means it cannot
// be preempted at this instruction, but that a preemption or
// stack growth may happen in the called function.
isUnsafePoint bool
}
// LivenessInvalid indicates an unsafe point.
//
// We use index -2 because PCDATA tables conventionally start at -1,
// so -1 is used to mean the entry liveness map (which is actually at
// index 0; sigh). TODO(austin): Maybe we should use PCDATA+1 as the
// index into the liveness map so -1 uniquely refers to the entry
// liveness map.
var LivenessInvalid = LivenessIndex{-2, -2}
// LivenessInvalid indicates an unsafe point with no stack map.
var LivenessInvalid = LivenessIndex{StackMapDontCare, StackMapDontCare, true} // only for !go115ReduceLiveness
func (idx LivenessIndex) Valid() bool {
return idx.stackMapIndex >= 0
// StackMapDontCare indicates that the stack map index at a Value
// doesn't matter.
//
// This is a sentinel value that should never be emitted to the PCDATA
// stream. We use -1000 because that's obviously never a valid stack
// index (but -1 is).
const StackMapDontCare = -1000
func (idx LivenessIndex) StackMapValid() bool {
return idx.stackMapIndex != StackMapDontCare
}
func (idx LivenessIndex) RegMapValid() bool {
return idx.regMapIndex != StackMapDontCare
}
type progeffectscache struct {
@ -377,6 +411,9 @@ func affectedNode(v *ssa.Value) (*Node, ssa.SymEffect) {
// regEffects returns the registers affected by v.
func (lv *Liveness) regEffects(v *ssa.Value) (uevar, kill liveRegMask) {
if go115ReduceLiveness {
return 0, 0
}
if v.Op == ssa.OpPhi {
// All phi node arguments must come from the same
// register and the result must also go to that
@ -458,7 +495,7 @@ func (lv *Liveness) regEffects(v *ssa.Value) (uevar, kill liveRegMask) {
return uevar, kill
}
type liveRegMask uint32
type liveRegMask uint32 // only if !go115ReduceLiveness
func (m liveRegMask) niceString(config *ssa.Config) string {
if m == 0 {
@ -497,7 +534,7 @@ func newliveness(fn *Node, f *ssa.Func, vars []*Node, idx map[*Node]int32, stkpt
// Significant sources of allocation are kept in the ssa.Cache
// and reused. Surprisingly, the bit vectors themselves aren't
// a major source of allocation, but the slices are.
// a major source of allocation, but the liveness maps are.
if lc, _ := f.Cache.Liveness.(*livenessFuncCache); lc == nil {
// Prep the cache so liveness can fill it later.
f.Cache.Liveness = new(livenessFuncCache)
@ -505,7 +542,8 @@ func newliveness(fn *Node, f *ssa.Func, vars []*Node, idx map[*Node]int32, stkpt
if cap(lc.be) >= f.NumBlocks() {
lv.be = lc.be[:f.NumBlocks()]
}
lv.livenessMap = LivenessMap{lc.livenessMap.m[:0]}
lv.livenessMap = LivenessMap{lc.livenessMap.vals}
lc.livenessMap.vals = nil
}
if lv.be == nil {
lv.be = make([]BlockEffects, f.NumBlocks())
@ -522,7 +560,7 @@ func newliveness(fn *Node, f *ssa.Func, vars []*Node, idx map[*Node]int32, stkpt
be.livein = varRegVec{vars: bulk.next()}
be.liveout = varRegVec{vars: bulk.next()}
}
lv.livenessMap.reset(lv.f.NumValues())
lv.livenessMap.reset()
lv.markUnsafePoints()
return lv
@ -644,9 +682,18 @@ func (lv *Liveness) pointerMap(liveout bvec, vars []*Node, args, locals bvec) {
// markUnsafePoints finds unsafe points and computes lv.unsafePoints.
func (lv *Liveness) markUnsafePoints() {
// The runtime assumes the only safe-points are function
// prologues (because that's how it used to be). We could and
// should improve that, but for now keep considering all points
// in the runtime unsafe. obj will add prologues and their
// safe-points.
//
// go:nosplit functions are similar. Since safe points used to
// be coupled with stack checks, go:nosplit often actually
// means "no safe points in this function".
if compiling_runtime || lv.f.NoSplit {
// No complex analysis necessary. Do this on the fly
// in issafepoint.
// No complex analysis necessary.
lv.allUnsafe = true
return
}
@ -801,20 +848,28 @@ func (lv *Liveness) markUnsafePoints() {
}
}
// Returns true for instructions that are safe points that must be annotated
// with liveness information.
func (lv *Liveness) issafepoint(v *ssa.Value) bool {
// The runtime was written with the assumption that
// safe-points only appear at call sites (because that's how
// it used to be). We could and should improve that, but for
// now keep the old safe-point rules in the runtime.
//
// go:nosplit functions are similar. Since safe points used to
// be coupled with stack checks, go:nosplit often actually
// means "no safe points in this function".
if compiling_runtime || lv.f.NoSplit {
return v.Op.IsCall()
// Returns true for instructions that must have a stack map.
//
// This does not necessarily mean the instruction is a safe-point. In
// particular, call Values can have a stack map in case the callee
// grows the stack, but not themselves be a safe-point.
func (lv *Liveness) hasStackMap(v *ssa.Value) bool {
// The runtime only has safe-points in function prologues, so
// we only need stack maps at call sites. go:nosplit functions
// are similar.
if go115ReduceLiveness || compiling_runtime || lv.f.NoSplit {
if !v.Op.IsCall() {
return false
}
// typedmemclr and typedmemmove are write barriers and
// deeply non-preemptible. They are unsafe points and
// hence should not have liveness maps.
if sym, _ := v.Aux.(*obj.LSym); sym == typedmemclr || sym == typedmemmove {
return false
}
return true
}
switch v.Op {
case ssa.OpInitMem, ssa.OpArg, ssa.OpSP, ssa.OpSB,
ssa.OpSelect0, ssa.OpSelect1, ssa.OpGetG,
@ -1049,7 +1104,7 @@ func (lv *Liveness) epilogue() {
// Walk forward through the basic block instructions and
// allocate liveness maps for those instructions that need them.
for _, v := range b.Values {
if !lv.issafepoint(v) {
if !lv.hasStackMap(v) {
continue
}
@ -1064,7 +1119,7 @@ func (lv *Liveness) epilogue() {
for i := len(b.Values) - 1; i >= 0; i-- {
v := b.Values[i]
if lv.issafepoint(v) {
if lv.hasStackMap(v) {
// Found an interesting instruction, record the
// corresponding liveness information.
@ -1113,7 +1168,7 @@ func (lv *Liveness) epilogue() {
// of the context register, so it's dead after the call.
index = int32(firstBitmapIndex)
for _, v := range b.Values {
if lv.issafepoint(v) {
if lv.hasStackMap(v) {
live := lv.livevars[index]
if v.Op.IsCall() && live.regs != 0 {
lv.printDebug()
@ -1139,13 +1194,15 @@ func (lv *Liveness) epilogue() {
lv.f.Fatalf("%v %L recorded as live on entry", lv.fn.Func.Nname, n)
}
}
// Check that no registers are live at function entry.
// The context register, if any, comes from a
// LoweredGetClosurePtr operation first thing in the function,
// so it doesn't appear live at entry.
if regs := lv.regMaps[0]; regs != 0 {
lv.printDebug()
lv.f.Fatalf("%v register %s recorded as live on entry", lv.fn.Func.Nname, regs.niceString(lv.f.Config))
if !go115ReduceLiveness {
// Check that no registers are live at function entry.
// The context register, if any, comes from a
// LoweredGetClosurePtr operation first thing in the function,
// so it doesn't appear live at entry.
if regs := lv.regMaps[0]; regs != 0 {
lv.printDebug()
lv.f.Fatalf("%v register %s recorded as live on entry", lv.fn.Func.Nname, regs.niceString(lv.f.Config))
}
}
}
@ -1166,7 +1223,7 @@ func (lv *Liveness) epilogue() {
// PCDATA tables cost about 100k. So for now we keep using a single index for
// both bitmap lists.
func (lv *Liveness) compact(b *ssa.Block) {
add := func(live varRegVec) LivenessIndex {
add := func(live varRegVec, isUnsafePoint bool) LivenessIndex { // only if !go115ReduceLiveness
// Deduplicate the stack map.
stackIndex := lv.stackMapSet.add(live.vars)
// Deduplicate the register map.
@ -1176,17 +1233,33 @@ func (lv *Liveness) compact(b *ssa.Block) {
lv.regMapSet[live.regs] = regIndex
lv.regMaps = append(lv.regMaps, live.regs)
}
return LivenessIndex{stackIndex, regIndex}
return LivenessIndex{stackIndex, regIndex, isUnsafePoint}
}
pos := 0
if b == lv.f.Entry {
// Handle entry stack map.
add(lv.livevars[0])
if !go115ReduceLiveness {
add(lv.livevars[0], false)
} else {
lv.stackMapSet.add(lv.livevars[0].vars)
}
pos++
}
for _, v := range b.Values {
if lv.issafepoint(v) {
lv.livenessMap.set(v, add(lv.livevars[pos]))
if go115ReduceLiveness {
hasStackMap := lv.hasStackMap(v)
isUnsafePoint := lv.allUnsafe || lv.unsafePoints.Get(int32(v.ID))
idx := LivenessIndex{StackMapDontCare, 0, isUnsafePoint}
if hasStackMap {
idx.stackMapIndex = lv.stackMapSet.add(lv.livevars[pos].vars)
pos++
}
if hasStackMap || isUnsafePoint {
lv.livenessMap.set(v, idx)
}
} else if lv.hasStackMap(v) {
isUnsafePoint := lv.allUnsafe || lv.unsafePoints.Get(int32(v.ID))
lv.livenessMap.set(v, add(lv.livevars[pos], isUnsafePoint))
pos++
}
}
@ -1291,7 +1364,6 @@ func (lv *Liveness) printeffect(printed bool, name string, pos int32, x bool, re
func (lv *Liveness) printDebug() {
fmt.Printf("liveness: %s\n", lv.fn.funcname())
pcdata := 0
for i, b := range lv.f.Blocks {
if i > 0 {
fmt.Printf("\n")
@ -1327,7 +1399,7 @@ func (lv *Liveness) printDebug() {
// program listing, with individual effects listed
if b == lv.f.Entry {
live := lv.stackMaps[pcdata]
live := lv.stackMaps[0]
fmt.Printf("(%s) function entry\n", linestr(lv.fn.Func.Nname.Pos))
fmt.Printf("\tlive=")
printed = false
@ -1347,9 +1419,7 @@ func (lv *Liveness) printDebug() {
for _, v := range b.Values {
fmt.Printf("(%s) %v\n", linestr(v.Pos), v.LongString())
if pos := lv.livenessMap.Get(v); pos.Valid() {
pcdata = pos.stackMapIndex
}
pcdata := lv.livenessMap.Get(v)
pos, effect := lv.valueEffects(v)
regUevar, regKill := lv.regEffects(v)
@ -1360,31 +1430,38 @@ func (lv *Liveness) printDebug() {
fmt.Printf("\n")
}
if !lv.issafepoint(v) {
continue
if pcdata.StackMapValid() || pcdata.RegMapValid() {
fmt.Printf("\tlive=")
printed = false
if pcdata.StackMapValid() {
live := lv.stackMaps[pcdata.stackMapIndex]
for j, n := range lv.vars {
if !live.Get(int32(j)) {
continue
}
if printed {
fmt.Printf(",")
}
fmt.Printf("%v", n)
printed = true
}
}
if pcdata.RegMapValid() { // only if !go115ReduceLiveness
regLive := lv.regMaps[pcdata.regMapIndex]
if regLive != 0 {
if printed {
fmt.Printf(",")
}
fmt.Printf("%s", regLive.niceString(lv.f.Config))
printed = true
}
}
fmt.Printf("\n")
}
live := lv.stackMaps[pcdata]
fmt.Printf("\tlive=")
printed = false
for j, n := range lv.vars {
if !live.Get(int32(j)) {
continue
}
if printed {
fmt.Printf(",")
}
fmt.Printf("%v", n)
printed = true
if pcdata.isUnsafePoint {
fmt.Printf("\tunsafe-point\n")
}
regLive := lv.regMaps[lv.livenessMap.Get(v).regMapIndex]
if regLive != 0 {
if printed {
fmt.Printf(",")
}
fmt.Printf("%s", regLive.niceString(lv.f.Config))
}
fmt.Printf("\n")
}
// bb bitsets
@ -1453,19 +1530,21 @@ func (lv *Liveness) emit() (argsSym, liveSym, regsSym *obj.LSym) {
loff = dbvec(&liveSymTmp, loff, locals)
}
regs := bvalloc(lv.usedRegs())
roff := duint32(&regsSymTmp, 0, uint32(len(lv.regMaps))) // number of bitmaps
roff = duint32(&regsSymTmp, roff, uint32(regs.n)) // number of bits in each bitmap
if regs.n > 32 {
// Our uint32 conversion below won't work.
Fatalf("GP registers overflow uint32")
}
if !go115ReduceLiveness {
regs := bvalloc(lv.usedRegs())
roff := duint32(&regsSymTmp, 0, uint32(len(lv.regMaps))) // number of bitmaps
roff = duint32(&regsSymTmp, roff, uint32(regs.n)) // number of bits in each bitmap
if regs.n > 32 {
// Our uint32 conversion below won't work.
Fatalf("GP registers overflow uint32")
}
if regs.n > 0 {
for _, live := range lv.regMaps {
regs.Clear()
regs.b[0] = uint32(live)
roff = dbvec(&regsSymTmp, roff, regs)
if regs.n > 0 {
for _, live := range lv.regMaps {
regs.Clear()
regs.b[0] = uint32(live)
roff = dbvec(&regsSymTmp, roff, regs)
}
}
}
@ -1480,7 +1559,11 @@ func (lv *Liveness) emit() (argsSym, liveSym, regsSym *obj.LSym) {
lsym.P = tmpSym.P
})
}
return makeSym(&argsSymTmp), makeSym(&liveSymTmp), makeSym(&regsSymTmp)
if !go115ReduceLiveness {
return makeSym(&argsSymTmp), makeSym(&liveSymTmp), makeSym(&regsSymTmp)
}
// TODO(go115ReduceLiveness): Remove regsSym result
return makeSym(&argsSymTmp), makeSym(&liveSymTmp), nil
}
// Entry pointer for liveness analysis. Solves for the liveness of
@ -1500,7 +1583,7 @@ func liveness(e *ssafn, f *ssa.Func, pp *Progs) LivenessMap {
lv.showlive(nil, lv.stackMaps[0])
for _, b := range f.Blocks {
for _, val := range b.Values {
if idx := lv.livenessMap.Get(val); idx.Valid() {
if idx := lv.livenessMap.Get(val); idx.StackMapValid() {
lv.showlive(val, lv.stackMaps[idx.stackMapIndex])
}
}
@ -1519,7 +1602,7 @@ func liveness(e *ssafn, f *ssa.Func, pp *Progs) LivenessMap {
}
cache.be = lv.be
}
if cap(lv.livenessMap.m) < 2000 {
if len(lv.livenessMap.vals) < 2000 {
cache.livenessMap = lv.livenessMap
}
}
@ -1540,11 +1623,13 @@ func liveness(e *ssafn, f *ssa.Func, pp *Progs) LivenessMap {
p.To.Name = obj.NAME_EXTERN
p.To.Sym = ls.Func.GCLocals
p = pp.Prog(obj.AFUNCDATA)
Addrconst(&p.From, objabi.FUNCDATA_RegPointerMaps)
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
p.To.Sym = ls.Func.GCRegs
if !go115ReduceLiveness {
p = pp.Prog(obj.AFUNCDATA)
Addrconst(&p.From, objabi.FUNCDATA_RegPointerMaps)
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
p.To.Sym = ls.Func.GCRegs
}
return lv.livenessMap
}
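The LivenessMap above moves from a dense slice indexed by value ID to a sparse map: IDs missing from the map mean "don't care" rather than "unsafe", and StackMapDontCare marks entries that carry no stack map at all. A stripped-down sketch of that lookup convention with hypothetical types, not the compiler's own:

package main

import "fmt"

const stackMapDontCare = -1000 // sentinel: this value needs no stack map

type livenessIndex struct {
	stackMapIndex int
	isUnsafePoint bool
}

// livenessMap is keyed by an SSA value ID; absent IDs are "don't care".
type livenessMap map[int]livenessIndex

func (m livenessMap) get(id int) livenessIndex {
	if idx, ok := m[id]; ok {
		return idx
	}
	return livenessIndex{stackMapIndex: stackMapDontCare}
}

func main() {
	m := livenessMap{7: {stackMapIndex: 2}}
	fmt.Println(m.get(7)) // {2 false}: has a stack map
	fmt.Println(m.get(8)) // {-1000 false}: don't care, not an unsafe-point
}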

View file

@ -82,6 +82,13 @@ func (v *bottomUpVisitor) visit(n *Node) uint32 {
min = m
}
}
case OCALLPART:
fn := asNode(callpartMethod(n).Type.Nname())
if fn != nil && fn.Op == ONAME && fn.Class() == PFUNC && fn.Name.Defn != nil {
if m := v.visit(fn.Name.Defn); m < min {
min = m
}
}
case OCLOSURE:
if m := v.visit(n.Func.Closure); m < min {
min = m

View file

@ -71,7 +71,7 @@ func (s *InitSchedule) staticcopy(l *Node, r *Node) bool {
return false
}
if r.Class() == PFUNC {
gdata(l, r, Widthptr)
pfuncsym(l, r)
return true
}
if r.Class() != PEXTERN || r.Sym.Pkg != localpkg {
@ -107,13 +107,12 @@ func (s *InitSchedule) staticcopy(l *Node, r *Node) bool {
if isZero(r) {
return true
}
gdata(l, r, int(l.Type.Width))
litsym(l, r, int(l.Type.Width))
return true
case OADDR:
switch r.Left.Op {
case ONAME:
gdata(l, r, int(l.Type.Width))
if a := r.Left; a.Op == ONAME {
addrsym(l, a)
return true
}
@ -121,21 +120,14 @@ func (s *InitSchedule) staticcopy(l *Node, r *Node) bool {
switch r.Left.Op {
case OARRAYLIT, OSLICELIT, OSTRUCTLIT, OMAPLIT:
// copy pointer
gdata(l, nod(OADDR, s.inittemps[r], nil), int(l.Type.Width))
addrsym(l, s.inittemps[r])
return true
}
case OSLICELIT:
// copy slice
a := s.inittemps[r]
n := l.copy()
n.Xoffset = l.Xoffset + int64(slice_array)
gdata(n, nod(OADDR, a, nil), Widthptr)
n.Xoffset = l.Xoffset + int64(slice_nel)
gdata(n, r.Right, Widthptr)
n.Xoffset = l.Xoffset + int64(slice_cap)
gdata(n, r.Right, Widthptr)
slicesym(l, a, r.Right.Int64())
return true
case OARRAYLIT, OSTRUCTLIT:
@ -147,7 +139,7 @@ func (s *InitSchedule) staticcopy(l *Node, r *Node) bool {
n.Xoffset = l.Xoffset + e.Xoffset
n.Type = e.Expr.Type
if e.Expr.Op == OLITERAL {
gdata(n, e.Expr, int(n.Type.Width))
litsym(n, e.Expr, int(n.Type.Width))
continue
}
ll := n.sepcopy()
@ -182,15 +174,13 @@ func (s *InitSchedule) staticassign(l *Node, r *Node) bool {
if isZero(r) {
return true
}
gdata(l, r, int(l.Type.Width))
litsym(l, r, int(l.Type.Width))
return true
case OADDR:
var nam Node
if stataddr(&nam, r.Left) {
n := *r
n.Left = &nam
gdata(l, &n, int(l.Type.Width))
addrsym(l, &nam)
return true
}
fallthrough
@ -202,7 +192,7 @@ func (s *InitSchedule) staticassign(l *Node, r *Node) bool {
a := staticname(r.Left.Type)
s.inittemps[r] = a
gdata(l, nod(OADDR, a, nil), int(l.Type.Width))
addrsym(l, a)
// Init underlying literal.
if !s.staticassign(a, r.Left) {
@ -215,7 +205,7 @@ func (s *InitSchedule) staticassign(l *Node, r *Node) bool {
case OSTR2BYTES:
if l.Class() == PEXTERN && r.Left.Op == OLITERAL {
sval := strlit(r.Left)
slicebytes(l, sval, len(sval))
slicebytes(l, sval)
return true
}
@ -224,16 +214,10 @@ func (s *InitSchedule) staticassign(l *Node, r *Node) bool {
// Init slice.
bound := r.Right.Int64()
ta := types.NewArray(r.Type.Elem(), bound)
ta.SetNoalg(true)
a := staticname(ta)
s.inittemps[r] = a
n := l.copy()
n.Xoffset = l.Xoffset + int64(slice_array)
gdata(n, nod(OADDR, a, nil), Widthptr)
n.Xoffset = l.Xoffset + int64(slice_nel)
gdata(n, r.Right, Widthptr)
n.Xoffset = l.Xoffset + int64(slice_cap)
gdata(n, r.Right, Widthptr)
slicesym(l, a, bound)
// Fall through to init underlying array.
l = a
fallthrough
@ -248,7 +232,7 @@ func (s *InitSchedule) staticassign(l *Node, r *Node) bool {
n.Xoffset = l.Xoffset + e.Xoffset
n.Type = e.Expr.Type
if e.Expr.Op == OLITERAL {
gdata(n, e.Expr, int(n.Type.Width))
litsym(n, e.Expr, int(n.Type.Width))
continue
}
setlineno(e.Expr)
@ -270,7 +254,7 @@ func (s *InitSchedule) staticassign(l *Node, r *Node) bool {
}
// Closures with no captured variables are globals,
// so the assignment can be done at link time.
gdata(l, r.Func.Closure.Func.Nname, Widthptr)
pfuncsym(l, r.Func.Closure.Func.Nname)
return true
}
closuredebugruntimecheck(r)
@ -304,7 +288,7 @@ func (s *InitSchedule) staticassign(l *Node, r *Node) bool {
n := l.copy()
// Emit itab, advance offset.
gdata(n, itab, Widthptr)
addrsym(n, itab.Left) // itab is an OADDR node
n.Xoffset += int64(Widthptr)
// Emit data.
@ -327,9 +311,7 @@ func (s *InitSchedule) staticassign(l *Node, r *Node) bool {
if !s.staticassign(a, val) {
s.append(nod(OAS, a, val))
}
ptr := nod(OADDR, a, nil)
n.Type = types.NewPtr(val.Type)
gdata(n, ptr, Widthptr)
addrsym(n, a)
}
return true
@ -610,18 +592,7 @@ func slicelit(ctxt initContext, n *Node, var_ *Node, init *Nodes) {
if !stataddr(&nam, var_) || nam.Class() != PEXTERN {
Fatalf("slicelit: %v", var_)
}
var v Node
v.Type = types.Types[TINT]
setintconst(&v, t.NumElem())
nam.Xoffset += int64(slice_array)
gdata(&nam, nod(OADDR, vstat, nil), Widthptr)
nam.Xoffset += int64(slice_nel) - int64(slice_array)
gdata(&nam, &v, Widthptr)
nam.Xoffset += int64(slice_cap) - int64(slice_nel)
gdata(&nam, &v, Widthptr)
slicesym(&nam, vstat, t.NumElem())
return
}
@ -789,7 +760,9 @@ func maplit(n *Node, m *Node, init *Nodes) {
tk := types.NewArray(n.Type.Key(), int64(len(entries)))
te := types.NewArray(n.Type.Elem(), int64(len(entries)))
// TODO(josharian): suppress alg generation for these types?
tk.SetNoalg(true)
te.SetNoalg(true)
dowidth(tk)
dowidth(te)
@ -1179,10 +1152,10 @@ func genAsStatic(as *Node) {
switch {
case as.Right.Op == OLITERAL:
litsym(&nam, as.Right, int(as.Right.Type.Width))
case as.Right.Op == ONAME && as.Right.Class() == PFUNC:
pfuncsym(&nam, as.Right)
default:
Fatalf("genAsStatic: rhs %v", as.Right)
}
gdata(&nam, as.Right, int(as.Right.Type.Width))
}

View file

@ -339,7 +339,7 @@ func buildssa(fn *Node, worker int) *ssa.Func {
s.softFloat = s.config.SoftFloat
if printssa {
s.f.HTMLWriter = ssa.NewHTMLWriter(ssaDumpFile, s.f.Frontend(), name, ssaDumpCFG)
s.f.HTMLWriter = ssa.NewHTMLWriter(ssaDumpFile, s.f, ssaDumpCFG)
// TODO: generate and print a mapping from nodes to values and blocks
dumpSourcesColumn(s.f.HTMLWriter, fn)
s.f.HTMLWriter.WriteAST("AST", astBuf)
@ -394,7 +394,7 @@ func buildssa(fn *Node, worker int) *ssa.Func {
// For this value, AuxInt is initialized to zero by default
startDeferBits := s.entryNewValue0(ssa.OpConst8, types.Types[TUINT8])
s.vars[&deferBitsVar] = startDeferBits
s.deferBitsAddr = s.addr(deferBitsTemp, false)
s.deferBitsAddr = s.addr(deferBitsTemp)
s.store(types.Types[TUINT8], s.deferBitsAddr, startDeferBits)
// Make sure that the deferBits stack slot is kept alive (for use
// by panics) and stores to deferBits are not eliminated, even if
@ -471,7 +471,7 @@ func dumpSourcesColumn(writer *ssa.HTMLWriter, fn *Node) {
fname := Ctxt.PosTable.Pos(fn.Pos).Filename()
targetFn, err := readFuncLines(fname, fn.Pos.Line(), fn.Func.Endlineno.Line())
if err != nil {
writer.Logger.Logf("cannot read sources for function %v: %v", fn, err)
writer.Logf("cannot read sources for function %v: %v", fn, err)
}
// Read sources of inlined functions.
@ -487,7 +487,7 @@ func dumpSourcesColumn(writer *ssa.HTMLWriter, fn *Node) {
fname := Ctxt.PosTable.Pos(fi.Pos).Filename()
fnLines, err := readFuncLines(fname, fi.Pos.Line(), elno.Line())
if err != nil {
writer.Logger.Logf("cannot read sources for function %v: %v", fi, err)
writer.Logf("cannot read sources for inlined function %v: %v", fi, err)
continue
}
inlFns = append(inlFns, fnLines)
@ -1246,7 +1246,7 @@ func (s *state) stmt(n *Node) {
if rhs == nil {
r = nil // Signal assign to use OpZero.
} else {
r = s.addr(rhs, false)
r = s.addr(rhs)
}
} else {
if rhs == nil {
@ -1742,9 +1742,6 @@ var opToSSA = map[opAndType]ssa.Op{
opAndType{OLT, TFLOAT64}: ssa.OpLess64F,
opAndType{OLT, TFLOAT32}: ssa.OpLess32F,
opAndType{OGT, TFLOAT64}: ssa.OpGreater64F,
opAndType{OGT, TFLOAT32}: ssa.OpGreater32F,
opAndType{OLE, TINT8}: ssa.OpLeq8,
opAndType{OLE, TUINT8}: ssa.OpLeq8U,
opAndType{OLE, TINT16}: ssa.OpLeq16,
@ -1755,9 +1752,6 @@ var opToSSA = map[opAndType]ssa.Op{
opAndType{OLE, TUINT64}: ssa.OpLeq64U,
opAndType{OLE, TFLOAT64}: ssa.OpLeq64F,
opAndType{OLE, TFLOAT32}: ssa.OpLeq32F,
opAndType{OGE, TFLOAT64}: ssa.OpGeq64F,
opAndType{OGE, TFLOAT32}: ssa.OpGeq32F,
}
func (s *state) concreteEtype(t *types.Type) types.EType {
@ -2014,10 +2008,10 @@ func (s *state) expr(n *Node) *ssa.Value {
if s.canSSA(n) {
return s.variable(n, n.Type)
}
addr := s.addr(n, false)
addr := s.addr(n)
return s.load(n.Type, addr)
case OCLOSUREVAR:
addr := s.addr(n, false)
addr := s.addr(n)
return s.load(n.Type, addr)
case OLITERAL:
switch u := n.Val().U.(type) {
@ -2345,11 +2339,8 @@ func (s *state) expr(n *Node) *ssa.Value {
s.Fatalf("ordered complex compare %v", n.Op)
}
}
if n.Left.Type.IsFloat() {
return s.newValueOrSfCall2(s.ssaOp(n.Op, n.Left.Type), types.Types[TBOOL], a, b)
}
// Integer: convert OGE and OGT into OLE and OLT.
// Convert OGE and OGT into OLE and OLT.
op := n.Op
switch op {
case OGE:
@ -2357,6 +2348,11 @@ func (s *state) expr(n *Node) *ssa.Value {
case OGT:
op, a, b = OLT, b, a
}
if n.Left.Type.IsFloat() {
// float comparison
return s.newValueOrSfCall2(s.ssaOp(op, n.Left.Type), types.Types[TBOOL], a, b)
}
// integer comparison
return s.newValue2(s.ssaOp(op, n.Left.Type), types.Types[TBOOL], a, b)
case OMUL:
a := s.expr(n.Left)
@ -2546,14 +2542,14 @@ func (s *state) expr(n *Node) *ssa.Value {
return s.expr(n.Left)
case OADDR:
return s.addr(n.Left, n.Bounded())
return s.addr(n.Left)
case ORESULT:
addr := s.constOffPtrSP(types.NewPtr(n.Type), n.Xoffset)
return s.load(n.Type, addr)
case ODEREF:
p := s.exprPtr(n.Left, false, n.Pos)
p := s.exprPtr(n.Left, n.Bounded(), n.Pos)
return s.load(n.Type, p)
case ODOT:
@ -2571,14 +2567,14 @@ func (s *state) expr(n *Node) *ssa.Value {
// prevents false memory dependencies in race/msan
// instrumentation.
if islvalue(n) && !s.canSSA(n) {
p := s.addr(n, false)
p := s.addr(n)
return s.load(n.Type, p)
}
v := s.expr(n.Left)
return s.newValue1I(ssa.OpStructSelect, n.Type, int64(fieldIdx(n)), v)
case ODOTPTR:
p := s.exprPtr(n.Left, false, n.Pos)
p := s.exprPtr(n.Left, n.Bounded(), n.Pos)
p = s.newValue1I(ssa.OpOffPtr, types.NewPtr(n.Type), n.Xoffset, p)
return s.load(n.Type, p)
@ -2604,7 +2600,7 @@ func (s *state) expr(n *Node) *ssa.Value {
}
return s.load(types.Types[TUINT8], ptr)
case n.Left.Type.IsSlice():
p := s.addr(n, false)
p := s.addr(n)
return s.load(n.Left.Type.Elem(), p)
case n.Left.Type.IsArray():
if canSSAType(n.Left.Type) {
@ -2624,7 +2620,7 @@ func (s *state) expr(n *Node) *ssa.Value {
s.boundsCheck(i, len, ssa.BoundsIndex, n.Bounded()) // checks i == 0
return s.newValue1I(ssa.OpArraySelect, n.Type, 0, a)
}
p := s.addr(n, false)
p := s.addr(n)
return s.load(n.Left.Type.Elem(), p)
default:
s.Fatalf("bad type for index %v", n.Left.Type)
@ -2790,7 +2786,7 @@ func (s *state) append(n *Node, inplace bool) *ssa.Value {
var slice, addr *ssa.Value
if inplace {
addr = s.addr(sn, false)
addr = s.addr(sn)
slice = s.load(n.Type, addr)
} else {
slice = s.expr(sn)
@ -2834,7 +2830,7 @@ func (s *state) append(n *Node, inplace bool) *ssa.Value {
// Tell liveness we're about to build a new slice
s.vars[&memVar] = s.newValue1A(ssa.OpVarDef, types.TypeMem, sn, s.mem())
}
capaddr := s.newValue1I(ssa.OpOffPtr, s.f.Config.Types.IntPtr, int64(slice_cap), addr)
capaddr := s.newValue1I(ssa.OpOffPtr, s.f.Config.Types.IntPtr, sliceCapOffset, addr)
s.store(types.Types[TINT], capaddr, r[2])
s.store(pt, addr, r[0])
// load the value we just stored to avoid having to spill it
@ -2855,7 +2851,7 @@ func (s *state) append(n *Node, inplace bool) *ssa.Value {
if inplace {
l = s.variable(&lenVar, types.Types[TINT]) // generates phi for len
nl = s.newValue2(s.ssaOp(OADD, types.Types[TINT]), types.Types[TINT], l, s.constInt(types.Types[TINT], nargs))
lenaddr := s.newValue1I(ssa.OpOffPtr, s.f.Config.Types.IntPtr, int64(slice_nel), addr)
lenaddr := s.newValue1I(ssa.OpOffPtr, s.f.Config.Types.IntPtr, sliceLenOffset, addr)
s.store(types.Types[TINT], lenaddr, nl)
}
@ -2871,7 +2867,7 @@ func (s *state) append(n *Node, inplace bool) *ssa.Value {
if canSSAType(n.Type) {
args = append(args, argRec{v: s.expr(n), store: true})
} else {
v := s.addr(n, false)
v := s.addr(n)
args = append(args, argRec{v: v})
}
}
@ -3042,7 +3038,7 @@ func (s *state) assign(left *Node, right *ssa.Value, deref bool, skip skipMask)
}
// Left is not ssa-able. Compute its address.
addr := s.addr(left, false)
addr := s.addr(left)
if isReflectHeaderDataField(left) {
// Package unsafe's documentation says storing pointers into
// reflect.SliceHeader and reflect.StringHeader's Data fields
@ -3158,18 +3154,14 @@ func softfloatInit() {
ssa.OpDiv32F: sfRtCallDef{sysfunc("fdiv32"), TFLOAT32},
ssa.OpDiv64F: sfRtCallDef{sysfunc("fdiv64"), TFLOAT64},
ssa.OpEq64F: sfRtCallDef{sysfunc("feq64"), TBOOL},
ssa.OpEq32F: sfRtCallDef{sysfunc("feq32"), TBOOL},
ssa.OpNeq64F: sfRtCallDef{sysfunc("feq64"), TBOOL},
ssa.OpNeq32F: sfRtCallDef{sysfunc("feq32"), TBOOL},
ssa.OpLess64F: sfRtCallDef{sysfunc("fgt64"), TBOOL},
ssa.OpLess32F: sfRtCallDef{sysfunc("fgt32"), TBOOL},
ssa.OpGreater64F: sfRtCallDef{sysfunc("fgt64"), TBOOL},
ssa.OpGreater32F: sfRtCallDef{sysfunc("fgt32"), TBOOL},
ssa.OpLeq64F: sfRtCallDef{sysfunc("fge64"), TBOOL},
ssa.OpLeq32F: sfRtCallDef{sysfunc("fge32"), TBOOL},
ssa.OpGeq64F: sfRtCallDef{sysfunc("fge64"), TBOOL},
ssa.OpGeq32F: sfRtCallDef{sysfunc("fge32"), TBOOL},
ssa.OpEq64F: sfRtCallDef{sysfunc("feq64"), TBOOL},
ssa.OpEq32F: sfRtCallDef{sysfunc("feq32"), TBOOL},
ssa.OpNeq64F: sfRtCallDef{sysfunc("feq64"), TBOOL},
ssa.OpNeq32F: sfRtCallDef{sysfunc("feq32"), TBOOL},
ssa.OpLess64F: sfRtCallDef{sysfunc("fgt64"), TBOOL},
ssa.OpLess32F: sfRtCallDef{sysfunc("fgt32"), TBOOL},
ssa.OpLeq64F: sfRtCallDef{sysfunc("fge64"), TBOOL},
ssa.OpLeq32F: sfRtCallDef{sysfunc("fge32"), TBOOL},
ssa.OpCvt32to32F: sfRtCallDef{sysfunc("fint32to32"), TFLOAT32},
ssa.OpCvt32Fto32: sfRtCallDef{sysfunc("f32toint32"), TINT32},
@ -3285,10 +3277,7 @@ func init() {
// Compiler frontend optimizations emit OBYTES2STRTMP nodes
// for the backend instead of slicebytetostringtmp calls
// when not instrumenting.
slice := args[0]
ptr := s.newValue1(ssa.OpSlicePtr, s.f.Config.Types.BytePtr, slice)
len := s.newValue1(ssa.OpSliceLen, types.Types[TINT], slice)
return s.newValue2(ssa.OpStringMake, n.Type, ptr, len)
return s.newValue2(ssa.OpStringMake, n.Type, args[0], args[1])
},
all...)
}
@ -3547,7 +3536,7 @@ func init() {
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpSqrt, types.Types[TFLOAT64], args[0])
},
sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.S390X, sys.Wasm)
sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
addF("math", "Trunc",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpTrunc, types.Types[TFLOAT64], args[0])
@ -3595,8 +3584,7 @@ func init() {
s.vars[n] = s.load(types.Types[TFLOAT64], a)
return s.variable(n, types.Types[TFLOAT64])
}
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), x86HasFMA, s.sb)
v := s.load(types.Types[TBOOL], addr)
v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[TBOOL], x86HasFMA)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
@ -3661,8 +3649,7 @@ func init() {
makeRoundAMD64 := func(op ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), x86HasSSE41, s.sb)
v := s.load(types.Types[TBOOL], addr)
v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[TBOOL], x86HasSSE41)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
@ -3869,8 +3856,7 @@ func init() {
makeOnesCountAMD64 := func(op64 ssa.Op, op32 ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), x86HasPOPCNT, s.sb)
v := s.load(types.Types[TBOOL], addr)
v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[TBOOL], x86HasPOPCNT)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
@ -4229,7 +4215,7 @@ func (s *state) openDeferSave(n *Node, t *types.Type, val *ssa.Value) *ssa.Value
argTemp.Name.SetNeedzero(true)
}
if !canSSA {
a := s.addr(n, false)
a := s.addr(n)
s.move(t, addrArgTemp, a)
return addrArgTemp
}
@ -4401,7 +4387,7 @@ func (s *state) call(n *Node, k callKind) *ssa.Value {
d := tempAt(n.Pos, s.curfn, t)
s.vars[&memVar] = s.newValue1A(ssa.OpVarDef, types.TypeMem, d, s.mem())
addr := s.addr(d, false)
addr := s.addr(d)
// Must match reflect.go:deferstruct and src/runtime/runtime2.go:_defer.
// 0: siz
@ -4592,9 +4578,7 @@ func etypesign(e types.EType) int8 {
// addr converts the address of the expression n to SSA, adds it to s and returns the SSA result.
// The value that the returned Value represents is guaranteed to be non-nil.
// If bounded is true then this address does not require a nil check for its operand
// even if that would otherwise be implied.
func (s *state) addr(n *Node, bounded bool) *ssa.Value {
func (s *state) addr(n *Node) *ssa.Value {
if n.Op != ONAME {
s.pushLine(n.Pos)
defer s.popLine()
@ -4647,25 +4631,25 @@ func (s *state) addr(n *Node, bounded bool) *ssa.Value {
p := s.newValue1(ssa.OpSlicePtr, t, a)
return s.newValue2(ssa.OpPtrIndex, t, p, i)
} else { // array
a := s.addr(n.Left, bounded)
a := s.addr(n.Left)
i := s.expr(n.Right)
len := s.constInt(types.Types[TINT], n.Left.Type.NumElem())
i = s.boundsCheck(i, len, ssa.BoundsIndex, n.Bounded())
return s.newValue2(ssa.OpPtrIndex, types.NewPtr(n.Left.Type.Elem()), a, i)
}
case ODEREF:
return s.exprPtr(n.Left, bounded, n.Pos)
return s.exprPtr(n.Left, n.Bounded(), n.Pos)
case ODOT:
p := s.addr(n.Left, bounded)
p := s.addr(n.Left)
return s.newValue1I(ssa.OpOffPtr, t, n.Xoffset, p)
case ODOTPTR:
p := s.exprPtr(n.Left, bounded, n.Pos)
p := s.exprPtr(n.Left, n.Bounded(), n.Pos)
return s.newValue1I(ssa.OpOffPtr, t, n.Xoffset, p)
case OCLOSUREVAR:
return s.newValue1I(ssa.OpOffPtr, t, n.Xoffset,
s.entryNewValue0(ssa.OpGetClosurePtr, s.f.Config.Types.BytePtr))
case OCONVNOP:
addr := s.addr(n.Left, bounded)
addr := s.addr(n.Left)
return s.newValue1(ssa.OpCopy, t, addr) // ensure that addr has the right type
case OCALLFUNC, OCALLINTER, OCALLMETH:
return s.call(n, callNormal)
@ -5090,7 +5074,7 @@ func (s *state) storeArgWithBase(n *Node, t *types.Type, base *ssa.Value, off in
}
if !canSSAType(t) {
a := s.addr(n, false)
a := s.addr(n)
s.move(t, addr, a)
return
}
@ -5644,7 +5628,7 @@ func (s *state) dottype(n *Node, commaok bool) (res, resok *ssa.Value) {
// TODO: get rid of some of these temporaries.
tmp = tempAt(n.Pos, s.curfn, n.Type)
s.vars[&memVar] = s.newValue1A(ssa.OpVarDef, types.TypeMem, tmp, s.mem())
addr = s.addr(tmp, false)
addr = s.addr(tmp)
}
cond := s.newValue2(ssa.OpEqPtr, types.Types[TBOOL], itab, targetITab)
@ -6027,7 +6011,7 @@ func genssa(f *ssa.Func, pp *Progs) {
// instruction. We won't use the actual liveness map on a
// control instruction. Just mark it something that is
// preemptible.
s.pp.nextLive = LivenessIndex{-1, -1}
s.pp.nextLive = LivenessIndex{-1, -1, false}
// Emit values in block
thearch.SSAMarkMoves(&s, b)
@ -6360,20 +6344,6 @@ func (s *SSAGenState) FPJump(b, next *ssa.Block, jumps *[2][2]FloatingEQNEJump)
}
}
func AuxOffset(v *ssa.Value) (offset int64) {
if v.Aux == nil {
return 0
}
n, ok := v.Aux.(*Node)
if !ok {
v.Fatalf("bad aux type in %s\n", v.LongString())
}
if n.Class() == PAUTO {
return n.Xoffset
}
return 0
}
// AddAux adds the offset in the aux fields (AuxInt and Aux) of v to a.
func AddAux(a *obj.Addr, v *ssa.Value) {
AddAux2(a, v, v.AuxInt)
@ -6601,10 +6571,8 @@ func (s *SSAGenState) Call(v *ssa.Value) *obj.Prog {
// since it emits PCDATA for the stack map at the call (calls are safe points).
func (s *SSAGenState) PrepareCall(v *ssa.Value) {
idx := s.livenessMap.Get(v)
if !idx.Valid() {
// typedmemclr and typedmemmove are write barriers and
// deeply non-preemptible. They are unsafe points and
// hence should not have liveness maps.
if !idx.StackMapValid() {
// See Liveness.hasStackMap.
if sym, _ := v.Aux.(*obj.LSym); !(sym == typedmemclr || sym == typedmemmove) {
Fatalf("missing stack map index for %v", v.LongString())
}
@ -6672,21 +6640,21 @@ func fieldIdx(n *Node) int {
// It also exports a bunch of compiler services for the ssa backend.
type ssafn struct {
curfn *Node
strings map[string]interface{} // map from constant string to data symbols
scratchFpMem *Node // temp for floating point register / memory moves on some architectures
stksize int64 // stack size for current frame
stkptrsize int64 // prefix of stack containing pointers
log bool // print ssa debug to the stdout
strings map[string]*obj.LSym // map from constant string to data symbols
scratchFpMem *Node // temp for floating point register / memory moves on some architectures
stksize int64 // stack size for current frame
stkptrsize int64 // prefix of stack containing pointers
log bool // print ssa debug to the stdout
}
// StringData returns a symbol (a *types.Sym wrapped in an interface) which
// StringData returns a symbol which
// is the data component of a global string constant containing s.
func (e *ssafn) StringData(s string) interface{} {
func (e *ssafn) StringData(s string) *obj.LSym {
if aux, ok := e.strings[s]; ok {
return aux
}
if e.strings == nil {
e.strings = make(map[string]interface{})
e.strings = make(map[string]*obj.LSym)
}
data := stringsym(e.curfn.Pos, s)
e.strings[s] = data

View file

@ -376,7 +376,13 @@ func newnamel(pos src.XPos, s *types.Sym) *Node {
// nodSym makes a Node with Op op and with the Left field set to left
// and the Sym field set to sym. This is for ODOT and friends.
func nodSym(op Op, left *Node, sym *types.Sym) *Node {
n := nod(op, left, nil)
return nodlSym(lineno, op, left, sym)
}
// nodlSym makes a Node with position Pos, with Op op, and with the Left field set to left
// and the Sym field set to sym. This is for ODOT and friends.
func nodlSym(pos src.XPos, op Op, left *Node, sym *types.Sym) *Node {
n := nodl(pos, op, left, nil)
n.Sym = sym
return n
}
@ -923,6 +929,21 @@ func (o Op) IsSlice3() bool {
return false
}
// slicePtrLen extracts the pointer and length from a slice.
// This constructs two nodes referring to n, so n must be a cheapexpr.
func (n *Node) slicePtrLen() (ptr, len *Node) {
var init Nodes
c := cheapexpr(n, &init)
if c != n || init.Len() != 0 {
Fatalf("slicePtrLen not cheap: %v", n)
}
ptr = nod(OSPTR, n, nil)
ptr.Type = n.Type.Elem().PtrTo()
len = nod(OLEN, n, nil)
len.Type = types.Types[TINT]
return ptr, len
}
// labeledControl returns the control flow Node (for, switch, select)
// associated with the label n, if any.
func (n *Node) labeledControl() *Node {
@ -1881,18 +1902,21 @@ func itabType(itab *Node) *Node {
// ifaceData loads the data field from an interface.
// The concrete type must be known to have type t.
// It follows the pointer if !isdirectiface(t).
func ifaceData(n *Node, t *types.Type) *Node {
ptr := nodSym(OIDATA, n, nil)
func ifaceData(pos src.XPos, n *Node, t *types.Type) *Node {
if t.IsInterface() {
Fatalf("ifaceData interface: %v", t)
}
ptr := nodlSym(pos, OIDATA, n, nil)
if isdirectiface(t) {
ptr.Type = t
ptr.SetTypecheck(1)
return ptr
}
ptr.Type = types.NewPtr(t)
ptr.SetBounded(true)
ptr.SetTypecheck(1)
ind := nod(ODEREF, ptr, nil)
ind := nodl(pos, ODEREF, ptr, nil)
ind.Type = t
ind.SetTypecheck(1)
ind.SetBounded(true)
return ind
}

View file

@ -540,10 +540,14 @@ func walkTypeSwitch(sw *Node) {
caseVar = ncase.Rlist.First()
}
// For single-type cases, we initialize the case
// variable as part of the type assertion; but in
// other cases, we initialize it in the body.
singleType := ncase.List.Len() == 1 && ncase.List.First().Op == OTYPE
// For single-type cases with an interface type,
// we initialize the case variable as part of the type assertion.
// In other cases, we initialize it in the body.
var singleType *types.Type
if ncase.List.Len() == 1 && ncase.List.First().Op == OTYPE {
singleType = ncase.List.First().Type
}
caseVarInitialized := false
label := autolabel(".s")
jmp := npos(ncase.Pos, nodSym(OGOTO, nil, label))
@ -564,18 +568,27 @@ func walkTypeSwitch(sw *Node) {
continue
}
if singleType {
s.Add(n1.Type, caseVar, jmp)
if singleType != nil && singleType.IsInterface() {
s.Add(ncase.Pos, n1.Type, caseVar, jmp)
caseVarInitialized = true
} else {
s.Add(n1.Type, nil, jmp)
s.Add(ncase.Pos, n1.Type, nil, jmp)
}
}
body.Append(npos(ncase.Pos, nodSym(OLABEL, nil, label)))
if caseVar != nil && !singleType {
if caseVar != nil && !caseVarInitialized {
val := s.facename
if singleType != nil {
// We have a single concrete type. Extract the data.
if singleType.IsInterface() {
Fatalf("singleType interface should have been handled in Add")
}
val = ifaceData(ncase.Pos, s.facename, singleType)
}
l := []*Node{
nodl(ncase.Pos, ODCL, caseVar, nil),
nodl(ncase.Pos, OAS, caseVar, s.facename),
nodl(ncase.Pos, OAS, caseVar, val),
}
typecheckslice(l, ctxStmt)
body.Append(l...)
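// A hedged source-level sketch of this case (T names a hypothetical concrete type):
//
// switch v := x.(type) {
// case T:
// 	body
// }
//
// is lowered roughly as: compare x's type word against T's, jump to the case
// label on a match, and there initialize v from the interface's data word via
// ifaceData, so no second type assertion is needed for the body.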
@ -616,12 +629,12 @@ type typeClause struct {
body Nodes
}
func (s *typeSwitch) Add(typ *types.Type, caseVar *Node, jmp *Node) {
func (s *typeSwitch) Add(pos src.XPos, typ *types.Type, caseVar, jmp *Node) {
var body Nodes
if caseVar != nil {
l := []*Node{
nod(ODCL, caseVar, nil),
nod(OAS, caseVar, nil),
nodl(pos, ODCL, caseVar, nil),
nodl(pos, OAS, caseVar, nil),
}
typecheckslice(l, ctxStmt)
body.Append(l...)
@ -630,9 +643,9 @@ func (s *typeSwitch) Add(typ *types.Type, caseVar *Node, jmp *Node) {
}
// cv, ok = iface.(type)
as := nod(OAS2, nil, nil)
as := nodl(pos, OAS2, nil, nil)
as.List.Set2(caseVar, s.okname) // cv, ok =
dot := nod(ODOTTYPE, s.facename, nil)
dot := nodl(pos, ODOTTYPE, s.facename, nil)
dot.Type = typ // iface.(type)
as.Rlist.Set1(dot)
as = typecheck(as, ctxStmt)
@ -640,7 +653,7 @@ func (s *typeSwitch) Add(typ *types.Type, caseVar *Node, jmp *Node) {
body.Append(as)
// if ok { goto label }
nif := nod(OIF, nil, nil)
nif := nodl(pos, OIF, nil, nil)
nif.Left = s.okname
nif.Nbody.Set1(jmp)
body.Append(nif)

View file

@ -8,7 +8,6 @@ package gc
import (
"cmd/compile/internal/ssa"
"cmd/compile/internal/syntax"
"cmd/compile/internal/types"
"cmd/internal/obj"
"cmd/internal/objabi"
@ -188,15 +187,39 @@ func (n *Node) SetImplicit(b bool) { n.flags.set(nodeImplicit, b) }
func (n *Node) SetIsDDD(b bool) { n.flags.set(nodeIsDDD, b) }
func (n *Node) SetDiag(b bool) { n.flags.set(nodeDiag, b) }
func (n *Node) SetColas(b bool) { n.flags.set(nodeColas, b) }
func (n *Node) SetNonNil(b bool) { n.flags.set(nodeNonNil, b) }
func (n *Node) SetTransient(b bool) { n.flags.set(nodeTransient, b) }
func (n *Node) SetBounded(b bool) { n.flags.set(nodeBounded, b) }
func (n *Node) SetHasCall(b bool) { n.flags.set(nodeHasCall, b) }
func (n *Node) SetLikely(b bool) { n.flags.set(nodeLikely, b) }
func (n *Node) SetHasVal(b bool) { n.flags.set(nodeHasVal, b) }
func (n *Node) SetHasOpt(b bool) { n.flags.set(nodeHasOpt, b) }
func (n *Node) SetEmbedded(b bool) { n.flags.set(nodeEmbedded, b) }
// MarkNonNil marks a pointer n as being guaranteed non-nil,
// on all code paths, at all times.
// During conversion to SSA, non-nil pointers won't have nil checks
// inserted before dereferencing. See state.exprPtr.
func (n *Node) MarkNonNil() {
if !n.Type.IsPtr() && !n.Type.IsUnsafePtr() {
Fatalf("MarkNonNil(%v), type %v", n, n.Type)
}
n.flags.set(nodeNonNil, true)
}
// SetBounded indicates whether operation n does not need safety checks.
// When n is an index or slice operation, n does not need bounds checks.
// When n is a dereferencing operation, n does not need nil checks.
func (n *Node) SetBounded(b bool) {
switch n.Op {
case OINDEX, OSLICE, OSLICEARR, OSLICE3, OSLICE3ARR, OSLICESTR:
// No bounds checks needed.
case ODOTPTR, ODEREF:
// No nil check needed.
default:
Fatalf("SetBounded(%v)", n)
}
n.flags.set(nodeBounded, b)
}
// MarkReadonly indicates that n is an ONAME with readonly contents.
func (n *Node) MarkReadonly() {
if n.Op != ONAME {
@ -311,6 +334,10 @@ func (n *Node) pkgFuncName() string {
return p + "." + s.Name
}
// The compiler needs *Node to be assignable to cmd/compile/internal/ssa.Sym.
func (n *Node) CanBeAnSSASym() {
}
// Name holds Node fields used only by named nodes (ONAME, OTYPE, OPACK, OLABEL, some OLITERAL).
type Name struct {
Pack *Node // real package for import . names
@ -455,7 +482,7 @@ type Param struct {
// OTYPE
//
// TODO: Should Func pragmas also be stored on the Name?
Pragma syntax.Pragma
Pragma PragmaFlag
Alias bool // node is alias for Ntype (only used when type-checking ODCLTYPE)
}
@ -537,7 +564,7 @@ type Func struct {
Endlineno src.XPos
WBPos src.XPos // position of first write barrier; see SetWBPos
Pragma syntax.Pragma // go:xxx function annotations
Pragma PragmaFlag // go:xxx function annotations
flags bitset16
numDefers int // number of defer calls in the function
@ -659,10 +686,8 @@ const (
// OCALLFUNC, OCALLMETH, and OCALLINTER have the same structure.
// Prior to walk, they are: Left(List), where List is all regular arguments.
// If present, Right is an ODDDARG that holds the
// generated slice used in a call to a variadic function.
// After walk, List is a series of assignments to temporaries,
// and Rlist is an updated set of arguments, including any ODDDARG slice.
// and Rlist is an updated set of arguments.
// TODO(josharian/khr): Use Ninit instead of List for the assignments to temporaries. See CL 114797.
OCALLFUNC // Left(List/Rlist) (function call f(args))
OCALLMETH // Left(List/Rlist) (direct method call x.Method(args))
@ -790,7 +815,6 @@ const (
// misc
ODDD // func f(args ...int) or f(l...) or var a = [...]int{0, 1, 2}.
ODDDARG // func f(args ...int), introduced by escape analysis.
OINLCALL // intermediary representation of an inlined call.
OEFACE // itable and data words of an empty-interface value.
OITAB // itable word of an interface value.

View file

@ -342,13 +342,13 @@ func typeinit() {
simtype[TFUNC] = TPTR
simtype[TUNSAFEPTR] = TPTR
slice_array = int(Rnd(0, int64(Widthptr)))
slice_nel = int(Rnd(int64(slice_array)+int64(Widthptr), int64(Widthptr)))
slice_cap = int(Rnd(int64(slice_nel)+int64(Widthptr), int64(Widthptr)))
sizeof_Slice = int(Rnd(int64(slice_cap)+int64(Widthptr), int64(Widthptr)))
slicePtrOffset = 0
sliceLenOffset = Rnd(slicePtrOffset+int64(Widthptr), int64(Widthptr))
sliceCapOffset = Rnd(sliceLenOffset+int64(Widthptr), int64(Widthptr))
sizeofSlice = Rnd(sliceCapOffset+int64(Widthptr), int64(Widthptr))
// string is same as slice without the cap
sizeof_String = int(Rnd(int64(slice_nel)+int64(Widthptr), int64(Widthptr)))
sizeofString = Rnd(sliceLenOffset+int64(Widthptr), int64(Widthptr))
dowidth(types.Types[TSTRING])
dowidth(types.Idealstring)
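// Worked instance of the formulas above (illustrative): with Widthptr = 8
// (64-bit targets) the offsets come out as slicePtrOffset = 0, sliceLenOffset = 8,
// sliceCapOffset = 16, giving sizeofSlice = 24 and sizeofString = 16; with
// Widthptr = 4 they are 0, 4 and 8, giving sizeofSlice = 12 and sizeofString = 8.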

View file

@ -81,18 +81,6 @@ func walkstmtlist(s []*Node) {
}
}
func samelist(a, b []*Node) bool {
if len(a) != len(b) {
return false
}
for i, n := range a {
if n != b[i] {
return false
}
}
return true
}
func paramoutheap(fn *Node) bool {
for _, ln := range fn.Func.Dcl {
switch ln.Class() {
@ -295,16 +283,6 @@ func walkstmt(n *Node) *Node {
Fatalf("expected %v return arguments, have %v", want, got)
}
if samelist(rl, n.List.Slice()) {
// special return in disguise
// TODO(josharian, 1.12): is "special return" still relevant?
// Tests still pass w/o this. See comments on https://go-review.googlesource.com/c/go/+/118318
walkexprlist(n.List.Slice(), &n.Ninit)
n.List.Set(nil)
break
}
// move function calls out, to make reorder3's job easier.
walkexprlistsafe(n.List.Slice(), &n.Ninit)
@ -479,7 +457,7 @@ func walkexpr(n *Node, init *Nodes) *Node {
nn := nod(ODEREF, n.Name.Param.Heapaddr, nil)
nn = typecheck(nn, ctxExpr)
nn = walkexpr(nn, init)
nn.Left.SetNonNil(true)
nn.Left.MarkNonNil()
return nn
}
@ -784,7 +762,7 @@ opswitch:
if !a.isBlank() {
var_ := temp(types.NewPtr(t.Elem()))
var_.SetTypecheck(1)
var_.SetNonNil(true) // mapaccess always returns a non-nil pointer
var_.MarkNonNil() // mapaccess always returns a non-nil pointer
n.List.SetFirst(var_)
n = walkexpr(n, init)
init.Append(n)
@ -862,7 +840,6 @@ opswitch:
n.Left = cheapexpr(n.Left, init)
// byteindex widens n.Left so that the multiplication doesn't overflow.
index := nod(OLSH, byteindex(n.Left), nodintconst(3))
index.SetBounded(true)
if thearch.LinkArch.ByteOrder == binary.BigEndian {
index = nod(OADD, index, nodintconst(7))
}
@ -908,7 +885,7 @@ opswitch:
init.Append(nif)
// Build the result.
e := nod(OEFACE, tmp, ifaceData(c, types.NewPtr(types.Types[TUINT8])))
e := nod(OEFACE, tmp, ifaceData(n.Pos, c, types.NewPtr(types.Types[TUINT8])))
e.Type = toType // assign type manually, typecheck doesn't understand OEFACE.
e.SetTypecheck(1)
n = e
@ -1126,7 +1103,7 @@ opswitch:
}
}
n.Type = types.NewPtr(t.Elem())
n.SetNonNil(true) // mapaccess1* and mapassign always return non-nil pointers.
n.MarkNonNil() // mapaccess1* and mapassign always return non-nil pointers.
n = nod(ODEREF, n, nil)
n.Type = t.Elem()
n.SetTypecheck(1)
@ -1247,12 +1224,23 @@ opswitch:
// are stored with an indirection. So max bucket size is 2048+eps.
if !Isconst(hint, CTINT) ||
hint.Val().U.(*Mpint).CmpInt64(BUCKETSIZE) <= 0 {
// In case hint is larger than BUCKETSIZE runtime.makemap
// will allocate the buckets on the heap, see #20184
//
// if hint <= BUCKETSIZE {
// var bv bmap
// b = &bv
// h.buckets = b
// }
nif := nod(OIF, nod(OLE, hint, nodintconst(BUCKETSIZE)), nil)
nif.SetLikely(true)
// var bv bmap
bv := temp(bmap(t))
zero = nod(OAS, bv, nil)
zero = typecheck(zero, ctxStmt)
init.Append(zero)
nif.Nbody.Append(zero)
// b = &bv
b := nod(OADDR, bv, nil)
@ -1260,8 +1248,11 @@ opswitch:
// h.buckets = b
bsym := hmapType.Field(5).Sym // hmap.buckets see reflect.go:hmap
na := nod(OAS, nodSym(ODOT, h, bsym), b)
na = typecheck(na, ctxStmt)
init.Append(na)
nif.Nbody.Append(na)
nif = typecheck(nif, ctxStmt)
nif = walkstmt(nif)
init.Append(nif)
}
}
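// Worked instance (illustrative): for a non-escaping m := make(map[string]int, 4)
// the constant hint is <= BUCKETSIZE (8), so the branch above wires h.buckets to
// the stack temporary bv; for make(map[string]int, 100) this block is skipped at
// compile time and runtime.makemap allocates the buckets on the heap; for a
// non-constant hint n, the generated "if n <= 8" decides at run time.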
@ -1338,7 +1329,8 @@ opswitch:
Fatalf("walkexpr: invalid index %v", r)
}
// cap is constrained to [0,2^31), so it's safe to do:
// cap is constrained to [0,2^31) or [0,2^63) depending on whether
// we're on a 32-bit or 64-bit system. So it's safe to do:
//
// if uint64(len) > cap {
// if len < 0 { panicmakeslicelen() }
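// Why a single unsigned compare suffices (illustrative): a valid cap is
// non-negative and below 2^31 or 2^63, so after converting to uint64 any
// negative len (for example len = -1, which becomes 1<<64 - 1) is necessarily
// greater than cap; uint64(len) > uint64(cap) therefore catches both "len < 0"
// and "len > cap", and the inner len < 0 test only chooses which panic to raise.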
@ -1390,7 +1382,7 @@ opswitch:
fn := syslook(fnname)
m.Left = mkcall1(fn, types.Types[TUNSAFEPTR], init, typename(t.Elem()), conv(len, argtype), conv(cap, argtype))
m.Left.SetNonNil(true)
m.Left.MarkNonNil()
m.List.Set2(conv(len, types.Types[TINT]), conv(cap, types.Types[TINT]))
m = typecheck(m, ctxExpr)
@ -1414,13 +1406,15 @@ opswitch:
t := types.NewArray(types.Types[TUINT8], tmpstringbufsize)
a = nod(OADDR, temp(t), nil)
}
fn := "slicebytetostring"
if n.Op == ORUNES2STR {
fn = "slicerunetostring"
// slicerunetostring(*[32]byte, []rune) string
n = mkcall("slicerunetostring", n.Type, init, a, n.Left)
} else {
// slicebytetostring(*[32]byte, ptr *byte, n int) string
n.Left = cheapexpr(n.Left, init)
ptr, len := n.Left.slicePtrLen()
n = mkcall("slicebytetostring", n.Type, init, a, ptr, len)
}
// slicebytetostring(*[32]byte, []byte) string
// slicerunetostring(*[32]byte, []rune) string
n = mkcall(fn, n.Type, init, a, n.Left)
case OBYTES2STRTMP:
n.Left = walkexpr(n.Left, init)
@ -1429,8 +1423,10 @@ opswitch:
// to avoid a function call to slicebytetostringtmp.
break
}
// slicebytetostringtmp([]byte) string
n = mkcall("slicebytetostringtmp", n.Type, init, n.Left)
// slicebytetostringtmp(ptr *byte, n int) string
n.Left = cheapexpr(n.Left, init)
ptr, len := n.Left.slicePtrLen()
n = mkcall("slicebytetostringtmp", n.Type, init, ptr, len)
case OSTR2BYTES:
s := n.Left
@ -1721,57 +1717,56 @@ func ascompatet(nl Nodes, nr *types.Type) []*Node {
}
// package all the arguments that match a ... T parameter into a []T.
func mkdotargslice(typ *types.Type, args []*Node, init *Nodes, ddd *Node) *Node {
esc := uint16(EscUnknown)
if ddd != nil {
esc = ddd.Esc
}
func mkdotargslice(typ *types.Type, args []*Node) *Node {
var n *Node
if len(args) == 0 {
n := nodnil()
n = nodnil()
n.Type = typ
return n
} else {
n = nod(OCOMPLIT, nil, typenod(typ))
n.List.Append(args...)
n.SetImplicit(true)
}
n := nod(OCOMPLIT, nil, typenod(typ))
if ddd != nil && prealloc[ddd] != nil {
prealloc[n] = prealloc[ddd] // temporary to use
}
n.List.Set(args)
n.Esc = esc
n = typecheck(n, ctxExpr)
if n.Type == nil {
Fatalf("mkdotargslice: typecheck failed")
}
n = walkexpr(n, init)
return n
}
// fixVariadicCall rewrites calls to variadic functions to use an
// explicit ... argument if one is not already present.
func fixVariadicCall(call *Node) {
fntype := call.Left.Type
if !fntype.IsVariadic() || call.IsDDD() {
return
}
vi := fntype.NumParams() - 1
vt := fntype.Params().Field(vi).Type
args := call.List.Slice()
extra := args[vi:]
slice := mkdotargslice(vt, extra)
for i := range extra {
extra[i] = nil // allow GC
}
call.List.Set(append(args[:vi], slice))
call.SetIsDDD(true)
}
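// A hedged source-level sketch of this rewrite (sum is a hypothetical function,
// not part of the compiler):
//
// func sum(base int, xs ...int) int
//
// sum(1, 2, 3)           // no explicit ...; fixVariadicCall packages the tail
//                        // into a slice, as if the caller had written
// sum(1, []int{2, 3}...) // and marks the call as a ... call
//
// sum(1, xs...)          // already a ... call; left unchanged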
func walkCall(n *Node, init *Nodes) {
if n.Rlist.Len() != 0 {
return // already walked
}
n.Left = walkexpr(n.Left, init)
walkexprlist(n.List.Slice(), init)
params := n.Left.Type.Params()
args := n.List.Slice()
// If there's a ... parameter (which is only valid as the final
// parameter) and this is not a ... call expression,
// then assign the remaining arguments as a slice.
if nf := params.NumFields(); nf > 0 {
if last := params.Field(nf - 1); last.IsDDD() && !n.IsDDD() {
// The callsite does not use a ..., but the called function is declared
// with a final argument that has a ... . Build the slice that we will
// pass as the ... argument.
tail := args[nf-1:]
slice := mkdotargslice(last.Type, tail, init, n.Right)
// Allow immediate GC.
for i := range tail {
tail[i] = nil
}
args = append(args[:nf-1], slice)
}
}
n.Left = walkexpr(n.Left, init)
walkexprlist(args, init)
// If this is a method call, add the receiver at the beginning of the args.
if n.Op == OCALLMETH {
@ -1956,7 +1951,7 @@ func callnew(t *types.Type) *Node {
n := nod(ONEWOBJ, typename(t), nil)
n.Type = types.NewPtr(t)
n.SetTypecheck(1)
n.SetNonNil(true)
n.MarkNonNil()
return n
}
@ -2645,6 +2640,8 @@ func appendslice(n *Node, init *Nodes) *Node {
l1 := n.List.First()
l2 := n.List.Second()
l2 = cheapexpr(l2, init)
n.List.SetSecond(l2)
var nodes Nodes
@ -2682,35 +2679,45 @@ func appendslice(n *Node, init *Nodes) *Node {
if elemtype.HasHeapPointer() {
// copy(s[len(l1):], l2)
nptr1 := nod(OSLICE, s, nil)
nptr1.Type = s.Type
nptr1.SetSliceBounds(nod(OLEN, l1, nil), nil, nil)
nptr1 = cheapexpr(nptr1, &nodes)
nptr2 := l2
Curfn.Func.setWBPos(n.Pos)
// instantiate typedslicecopy(typ *type, dst any, src any) int
// instantiate typedslicecopy(typ *type, dstPtr *any, dstLen int, srcPtr *any, srcLen int) int
fn := syslook("typedslicecopy")
fn = substArgTypes(fn, l1.Type, l2.Type)
ncopy = mkcall1(fn, types.Types[TINT], &nodes, typename(elemtype), nptr1, nptr2)
fn = substArgTypes(fn, l1.Type.Elem(), l2.Type.Elem())
ptr1, len1 := nptr1.slicePtrLen()
ptr2, len2 := nptr2.slicePtrLen()
ncopy = mkcall1(fn, types.Types[TINT], &nodes, typename(elemtype), ptr1, len1, ptr2, len2)
} else if instrumenting && !compiling_runtime {
// rely on runtime to instrument copy.
// copy(s[len(l1):], l2)
nptr1 := nod(OSLICE, s, nil)
nptr1.Type = s.Type
nptr1.SetSliceBounds(nod(OLEN, l1, nil), nil, nil)
nptr1 = cheapexpr(nptr1, &nodes)
nptr2 := l2
if l2.Type.IsString() {
// instantiate func slicestringcopy(to any, fr any) int
// instantiate func slicestringcopy(toPtr *byte, toLen int, fr string) int
fn := syslook("slicestringcopy")
fn = substArgTypes(fn, l1.Type, l2.Type)
ncopy = mkcall1(fn, types.Types[TINT], &nodes, nptr1, nptr2)
ptr, len := nptr1.slicePtrLen()
str := nod(OCONVNOP, nptr2, nil)
str.Type = types.Types[TSTRING]
ncopy = mkcall1(fn, types.Types[TINT], &nodes, ptr, len, str)
} else {
// instantiate func slicecopy(to any, fr any, wid uintptr) int
fn := syslook("slicecopy")
fn = substArgTypes(fn, l1.Type, l2.Type)
ncopy = mkcall1(fn, types.Types[TINT], &nodes, nptr1, nptr2, nodintconst(elemtype.Width))
fn = substArgTypes(fn, l1.Type.Elem(), l2.Type.Elem())
ptr1, len1 := nptr1.slicePtrLen()
ptr2, len2 := nptr2.slicePtrLen()
ncopy = mkcall1(fn, types.Types[TINT], &nodes, ptr1, len1, ptr2, len2, nodintconst(elemtype.Width))
}
} else {
@ -3009,20 +3016,31 @@ func walkappend(n *Node, init *Nodes, dst *Node) *Node {
func copyany(n *Node, init *Nodes, runtimecall bool) *Node {
if n.Left.Type.Elem().HasHeapPointer() {
Curfn.Func.setWBPos(n.Pos)
fn := writebarrierfn("typedslicecopy", n.Left.Type, n.Right.Type)
return mkcall1(fn, n.Type, init, typename(n.Left.Type.Elem()), n.Left, n.Right)
fn := writebarrierfn("typedslicecopy", n.Left.Type.Elem(), n.Right.Type.Elem())
n.Left = cheapexpr(n.Left, init)
ptrL, lenL := n.Left.slicePtrLen()
n.Right = cheapexpr(n.Right, init)
ptrR, lenR := n.Right.slicePtrLen()
return mkcall1(fn, n.Type, init, typename(n.Left.Type.Elem()), ptrL, lenL, ptrR, lenR)
}
if runtimecall {
if n.Right.Type.IsString() {
fn := syslook("slicestringcopy")
fn = substArgTypes(fn, n.Left.Type, n.Right.Type)
return mkcall1(fn, n.Type, init, n.Left, n.Right)
n.Left = cheapexpr(n.Left, init)
ptr, len := n.Left.slicePtrLen()
str := nod(OCONVNOP, n.Right, nil)
str.Type = types.Types[TSTRING]
return mkcall1(fn, n.Type, init, ptr, len, str)
}
fn := syslook("slicecopy")
fn = substArgTypes(fn, n.Left.Type, n.Right.Type)
return mkcall1(fn, n.Type, init, n.Left, n.Right, nodintconst(n.Left.Type.Elem().Width))
fn = substArgTypes(fn, n.Left.Type.Elem(), n.Right.Type.Elem())
n.Left = cheapexpr(n.Left, init)
ptrL, lenL := n.Left.slicePtrLen()
n.Right = cheapexpr(n.Right, init)
ptrR, lenR := n.Right.slicePtrLen()
return mkcall1(fn, n.Type, init, ptrL, lenL, ptrR, lenR, nodintconst(n.Left.Type.Elem().Width))
}
n.Left = walkexpr(n.Left, init)
@ -3145,7 +3163,7 @@ func walkcompare(n *Node, init *Nodes) *Node {
eqtype = nod(andor, nonnil, match)
}
// Check for data equal.
eqdata := nod(eq, ifaceData(l, r.Type), r)
eqdata := nod(eq, ifaceData(n.Pos, l, r.Type), r)
// Put it all together.
expr := nod(andor, eqtype, eqdata)
n = finishcompare(n, expr, init)
@ -3361,36 +3379,15 @@ func tracecmpArg(n *Node, t *types.Type, init *Nodes) *Node {
}
func walkcompareInterface(n *Node, init *Nodes) *Node {
// ifaceeq(i1 any-1, i2 any-2) (ret bool);
if !types.Identical(n.Left.Type, n.Right.Type) {
Fatalf("ifaceeq %v %v %v", n.Op, n.Left.Type, n.Right.Type)
}
var fn *Node
if n.Left.Type.IsEmptyInterface() {
fn = syslook("efaceeq")
} else {
fn = syslook("ifaceeq")
}
n.Right = cheapexpr(n.Right, init)
n.Left = cheapexpr(n.Left, init)
lt := nod(OITAB, n.Left, nil)
rt := nod(OITAB, n.Right, nil)
ld := nod(OIDATA, n.Left, nil)
rd := nod(OIDATA, n.Right, nil)
ld.Type = types.Types[TUNSAFEPTR]
rd.Type = types.Types[TUNSAFEPTR]
ld.SetTypecheck(1)
rd.SetTypecheck(1)
call := mkcall1(fn, n.Type, init, lt, ld, rd)
// Check itable/type before full compare.
// Note: short-circuited because order matters.
eqtab, eqdata := eqinterface(n.Left, n.Right)
var cmp *Node
if n.Op == OEQ {
cmp = nod(OANDAND, nod(OEQ, lt, rt), call)
cmp = nod(OANDAND, eqtab, eqdata)
} else {
cmp = nod(OOROR, nod(ONE, lt, rt), nod(ONOT, call, nil))
eqtab.Op = ONE
cmp = nod(OOROR, eqtab, nod(ONOT, eqdata, nil))
}
return finishcompare(n, cmp, init)
}
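// A hedged sketch of the expansion built by eqinterface for i1 == i2:
//
// itab(i1) == itab(i2) && runtime.ifaceeq(itab(i1), data(i1), data(i2))
//
// (type words and efaceeq for empty interfaces); for != the word comparison
// becomes != and the data comparison is negated, so the cheap word test
// short-circuits the runtime call in both directions.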
@ -3500,27 +3497,16 @@ func walkcompareString(n *Node, init *Nodes) *Node {
// prepare for rewrite below
n.Left = cheapexpr(n.Left, init)
n.Right = cheapexpr(n.Right, init)
lstr := conv(n.Left, types.Types[TSTRING])
rstr := conv(n.Right, types.Types[TSTRING])
lptr := nod(OSPTR, lstr, nil)
rptr := nod(OSPTR, rstr, nil)
llen := conv(nod(OLEN, lstr, nil), types.Types[TUINTPTR])
rlen := conv(nod(OLEN, rstr, nil), types.Types[TUINTPTR])
fn := syslook("memequal")
fn = substArgTypes(fn, types.Types[TUINT8], types.Types[TUINT8])
r = mkcall1(fn, types.Types[TBOOL], init, lptr, rptr, llen)
eqlen, eqmem := eqstring(n.Left, n.Right)
// quick check of len before full compare for == or !=.
// memequal then tests equality up to length len.
if n.Op == OEQ {
// len(left) == len(right) && memequal(left, right, len)
r = nod(OANDAND, nod(OEQ, llen, rlen), r)
r = nod(OANDAND, eqlen, eqmem)
} else {
// len(left) != len(right) || !memequal(left, right, len)
r = nod(ONOT, r, nil)
r = nod(OOROR, nod(ONE, llen, rlen), r)
eqlen.Op = ONE
r = nod(OOROR, eqlen, nod(ONOT, eqmem, nil))
}
} else {
// sys_cmpstring(s1, s2) :: 0
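// A hedged summary of the two paths: for == and != the eqstring expansion above
// yields roughly len(s) == len(t) && runtime.memequal(ptr(s), ptr(t), len(s))
// (negated for !=), while ordered comparisons fall through here to
// runtime.cmpstring(s, t) compared against 0.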
@ -3639,7 +3625,8 @@ func usemethod(n *Node) {
// Note: Don't rely on res0.Type.String() since its formatting depends on multiple factors
// (including global variables such as numImports - was issue #19028).
if s := res0.Type.Sym; s != nil && s.Name == "Method" && s.Pkg != nil && s.Pkg.Path == "reflect" {
// Also need to check for reflect package itself (see Issue #38515).
if s := res0.Type.Sym; s != nil && s.Name == "Method" && isReflectPkg(s.Pkg) {
Curfn.Func.SetReflectMethod(true)
}
}
@ -3959,10 +3946,8 @@ func walkCheckPtrArithmetic(n *Node, init *Nodes) *Node {
n = cheapexpr(n, init)
ddd := nodl(n.Pos, ODDDARG, nil, nil)
ddd.Type = types.NewPtr(types.NewArray(types.Types[TUNSAFEPTR], int64(len(originals))))
ddd.Esc = EscNone
slice := mkdotargslice(types.NewSlice(types.Types[TUNSAFEPTR]), originals, init, ddd)
slice := mkdotargslice(types.NewSlice(types.Types[TUNSAFEPTR]), originals)
slice.Esc = EscNone
init.Append(mkcall("checkptrArithmetic", nil, init, convnop(n, types.Types[TUNSAFEPTR]), slice))
// TODO(khr): Mark backing store of slice as dead. This will allow us to reuse

View file

@ -294,18 +294,23 @@ func checkLogPath(flag, destination string) {
dest = destination
}
var loggedOpts []LoggedOpt
var loggedOpts []*LoggedOpt
var mu = sync.Mutex{} // mu protects loggedOpts.
func NewLoggedOpt(pos src.XPos, what, pass, fname string, args ...interface{}) *LoggedOpt {
pass = strings.Replace(pass, " ", "_", -1)
return &LoggedOpt{pos, pass, fname, what, args}
}
func LogOpt(pos src.XPos, what, pass, fname string, args ...interface{}) {
if Format == None {
return
}
pass = strings.Replace(pass, " ", "_", -1)
lo := NewLoggedOpt(pos, what, pass, fname, args...)
mu.Lock()
defer mu.Unlock()
// Because of concurrent calls from the back end, there is no telling what the order will be, but the slice is stable-sorted by outer Pos before use.
loggedOpts = append(loggedOpts, LoggedOpt{pos, pass, fname, what, args})
loggedOpts = append(loggedOpts, lo)
}
func Enabled() bool {
@ -321,7 +326,7 @@ func Enabled() bool {
// byPos sorts diagnostics by source position.
type byPos struct {
ctxt *obj.Link
a []LoggedOpt
a []*LoggedOpt
}
func (x byPos) Len() int { return len(x.a) }
@ -402,15 +407,9 @@ func FlushLoggedOpts(ctxt *obj.Link, slashPkgPath string) {
// For LSP, make a subdirectory for the package, and for each file foo.go, create foo.json in that subdirectory.
currentFile := ""
for _, x := range loggedOpts {
posTmp = ctxt.AllPos(x.pos, posTmp)
// Reverse posTmp to put outermost first.
l := len(posTmp)
for i := 0; i < l/2; i++ {
posTmp[i], posTmp[l-i-1] = posTmp[l-i-1], posTmp[i]
}
p0 := posTmp[0]
posTmp, p0 := x.parsePos(ctxt, posTmp)
p0f := uprootedPath(p0.Filename())
if currentFile != p0f {
if w != nil {
w.Close()
@ -429,16 +428,27 @@ func FlushLoggedOpts(ctxt *obj.Link, slashPkgPath string) {
diagnostic.Code = x.what
diagnostic.Message = target
diagnostic.Range = Range{Start: Position{p0.Line(), p0.Col()},
End: Position{p0.Line(), p0.Col()}}
diagnostic.Range = newPointRange(p0)
diagnostic.RelatedInformation = diagnostic.RelatedInformation[:0]
for i := 1; i < l; i++ {
p := posTmp[i]
loc := Location{URI: uriIfy(uprootedPath(p.Filename())),
Range: Range{Start: Position{p.Line(), p.Col()},
End: Position{p.Line(), p.Col()}}}
diagnostic.RelatedInformation = append(diagnostic.RelatedInformation, DiagnosticRelatedInformation{Location: loc, Message: "inlineLoc"})
appendInlinedPos(posTmp, &diagnostic)
// Diagnostic explanation is stored in RelatedInformation after inlining info
if len(x.target) > 1 {
switch y := x.target[1].(type) {
case []*LoggedOpt:
for _, z := range y {
posTmp, p0 := z.parsePos(ctxt, posTmp)
loc := newLocation(p0)
msg := z.what
if len(z.target) > 0 {
msg = msg + ": " + fmt.Sprint(z.target[0])
}
diagnostic.RelatedInformation = append(diagnostic.RelatedInformation, DiagnosticRelatedInformation{Location: loc, Message: msg})
appendInlinedPos(posTmp, &diagnostic)
}
}
}
encoder.Encode(diagnostic)
@ -448,3 +458,33 @@ func FlushLoggedOpts(ctxt *obj.Link, slashPkgPath string) {
}
}
}
func newPointRange(p src.Pos) Range {
return Range{Start: Position{p.Line(), p.Col()},
End: Position{p.Line(), p.Col()}}
}
func newLocation(p src.Pos) Location {
loc := Location{URI: uriIfy(uprootedPath(p.Filename())), Range: newPointRange(p)}
return loc
}
// appendInlinedPos extracts inlining information from posTmp and append it to diagnostic
func appendInlinedPos(posTmp []src.Pos, diagnostic *Diagnostic) {
for i := 1; i < len(posTmp); i++ {
p := posTmp[i]
loc := newLocation(p)
diagnostic.RelatedInformation = append(diagnostic.RelatedInformation, DiagnosticRelatedInformation{Location: loc, Message: "inlineLoc"})
}
}
func (x *LoggedOpt) parsePos(ctxt *obj.Link, posTmp []src.Pos) ([]src.Pos, src.Pos) {
posTmp = ctxt.AllPos(x.pos, posTmp)
// Reverse posTmp to put outermost first.
l := len(posTmp)
for i := 0; i < l/2; i++ {
posTmp[i], posTmp[l-i-1] = posTmp[l-i-1], posTmp[i]
}
p0 := posTmp[0]
return posTmp, p0
}

View file

@ -30,11 +30,28 @@ func foo(w, z *pair) *int {
}
return &a[0]
}
// address taking prevents closure inlining
func n() int {
foo := func() int { return 1 }
bar := &foo
x := (*bar)() + foo()
return x
}
`
func want(t *testing.T, out string, desired string) {
if !strings.Contains(out, desired) {
t.Errorf("did not see phrase %s in \n%s", desired, out)
// On Windows, Unicode escapes in the JSON output end up "normalized" elsewhere to /u....,
// so "normalize" what we're looking for to match that.
s := strings.ReplaceAll(desired, string(os.PathSeparator), "/")
if !strings.Contains(out, s) {
t.Errorf("did not see phrase %s in \n%s", s, out)
}
}
func wantN(t *testing.T, out string, desired string, n int) {
if strings.Count(out, desired) != n {
t.Errorf("expected exactly %d occurences of %s in \n%s", n, desired, out)
}
}
@ -75,7 +92,70 @@ func TestLogOpt(t *testing.T) {
})
// replace d (dir) with t ("tmpdir") and convert path separators to '/'
normalize := func(out []byte, d, t string) string {
s := string(out)
s = strings.ReplaceAll(s, d, t)
s = strings.ReplaceAll(s, string(os.PathSeparator), "/")
return s
}
// Ensure that <128 byte copies are not reported and that 128-byte copies are.
// Check at both 1 and 8-byte alignments.
t.Run("Copy", func(t *testing.T) {
const copyCode = `package x
func s128a1(x *[128]int8) [128]int8 {
return *x
}
func s127a1(x *[127]int8) [127]int8 {
return *x
}
func s16a8(x *[16]int64) [16]int64 {
return *x
}
func s15a8(x *[15]int64) [15]int64 {
return *x
}
`
copy := filepath.Join(dir, "copy.go")
if err := ioutil.WriteFile(copy, []byte(copyCode), 0644); err != nil {
t.Fatal(err)
}
outcopy := filepath.Join(dir, "copy.o")
// On not-amd64, test the host architecture and os
arches := []string{runtime.GOARCH}
goos0 := runtime.GOOS
if runtime.GOARCH == "amd64" { // Test many things with "linux" (wasm will get "js")
arches = []string{"arm", "arm64", "386", "amd64", "mips", "mips64", "ppc64le", "riscv64", "s390x", "wasm"}
goos0 = "linux"
}
for _, arch := range arches {
t.Run(arch, func(t *testing.T) {
goos := goos0
if arch == "wasm" {
goos = "js"
}
_, err := testCopy(t, dir, arch, goos, copy, outcopy)
if err != nil {
t.Error("-json=0,file://log/opt should have succeeded")
}
logged, err := ioutil.ReadFile(filepath.Join(dir, "log", "opt", "x", "copy.json"))
if err != nil {
t.Error("-json=0,file://log/opt missing expected log file")
}
slogged := normalize(logged, string(uriIfy(dir)), string(uriIfy("tmpdir")))
t.Logf("%s", slogged)
want(t, slogged, `{"range":{"start":{"line":3,"character":2},"end":{"line":3,"character":2}},"severity":3,"code":"copy","source":"go compiler","message":"128 bytes"}`)
want(t, slogged, `{"range":{"start":{"line":9,"character":2},"end":{"line":9,"character":2}},"severity":3,"code":"copy","source":"go compiler","message":"128 bytes"}`)
wantN(t, slogged, `"code":"copy"`, 2)
})
}
})
// Some architectures don't fault on nil dereference, so nilchecks are eliminated differently.
// The N-way copy test also doesn't need to run N-ways N times.
if runtime.GOARCH != "amd64" {
return
}
@ -83,14 +163,6 @@ func TestLogOpt(t *testing.T) {
t.Run("Success", func(t *testing.T) {
// This test is supposed to succeed
// replace d (dir) with t ("tmpdir") and convert path separators to '/'
normalize := func(out []byte, d, t string) string {
s := string(out)
s = strings.ReplaceAll(s, d, t)
s = strings.ReplaceAll(s, string(os.PathSeparator), "/")
return s
}
// Note 'file://' is the I-Know-What-I-Am-Doing way of specifying a file, also to deal with corner cases for Windows.
_, err := testLogOptDir(t, dir, "-json=0,file://log/opt", src, outfile)
if err != nil {
@ -103,12 +175,26 @@ func TestLogOpt(t *testing.T) {
// All this delicacy with uriIfy and filepath.Join is to get this test to work right on Windows.
slogged := normalize(logged, string(uriIfy(dir)), string(uriIfy("tmpdir")))
t.Logf("%s", slogged)
// below shows proper inlining and nilcheck
want(t, slogged, `{"range":{"start":{"line":9,"character":13},"end":{"line":9,"character":13}},"severity":3,"code":"nilcheck","source":"go compiler","message":"","relatedInformation":[{"location":{"uri":"file://tmpdir/file.go","range":{"start":{"line":4,"character":11},"end":{"line":4,"character":11}}},"message":"inlineLoc"}]}`)
// below shows proper nilcheck
want(t, slogged, `{"range":{"start":{"line":9,"character":13},"end":{"line":9,"character":13}},"severity":3,"code":"nilcheck","source":"go compiler","message":"",`+
`"relatedInformation":[{"location":{"uri":"file://tmpdir/file.go","range":{"start":{"line":4,"character":11},"end":{"line":4,"character":11}}},"message":"inlineLoc"}]}`)
want(t, slogged, `{"range":{"start":{"line":11,"character":6},"end":{"line":11,"character":6}},"severity":3,"code":"isInBounds","source":"go compiler","message":""}`)
want(t, slogged, `{"range":{"start":{"line":7,"character":6},"end":{"line":7,"character":6}},"severity":3,"code":"canInlineFunction","source":"go compiler","message":"cost: 35"}`)
want(t, slogged, `{"range":{"start":{"line":9,"character":13},"end":{"line":9,"character":13}},"severity":3,"code":"inlineCall","source":"go compiler","message":"x.bar"}`)
want(t, slogged, `{"range":{"start":{"line":8,"character":9},"end":{"line":8,"character":9}},"severity":3,"code":"inlineCall","source":"go compiler","message":"x.bar"}`)
want(t, slogged, `{"range":{"start":{"line":21,"character":21},"end":{"line":21,"character":21}},"severity":3,"code":"cannotInlineCall","source":"go compiler","message":"foo cannot be inlined (escaping closure variable)"}`)
// escape analysis explanation
want(t, slogged, `{"range":{"start":{"line":7,"character":13},"end":{"line":7,"character":13}},"severity":3,"code":"leak","source":"go compiler","message":"parameter z leaks to ~r2 with derefs=0",`+
`"relatedInformation":[`+
`{"location":{"uri":"file://tmpdir/file.go","range":{"start":{"line":9,"character":13},"end":{"line":9,"character":13}}},"message":"escflow: flow: y = z:"},`+
`{"location":{"uri":"file://tmpdir/file.go","range":{"start":{"line":9,"character":13},"end":{"line":9,"character":13}}},"message":"escflow: from y = \u003cN\u003e (assign-pair)"},`+
`{"location":{"uri":"file://tmpdir/file.go","range":{"start":{"line":9,"character":13},"end":{"line":9,"character":13}}},"message":"escflow: flow: ~r1 = y:"},`+
`{"location":{"uri":"file://tmpdir/file.go","range":{"start":{"line":4,"character":11},"end":{"line":4,"character":11}}},"message":"inlineLoc"},`+
`{"location":{"uri":"file://tmpdir/file.go","range":{"start":{"line":9,"character":13},"end":{"line":9,"character":13}}},"message":"escflow: from y.b (dot of pointer)"},`+
`{"location":{"uri":"file://tmpdir/file.go","range":{"start":{"line":4,"character":11},"end":{"line":4,"character":11}}},"message":"inlineLoc"},`+
`{"location":{"uri":"file://tmpdir/file.go","range":{"start":{"line":9,"character":13},"end":{"line":9,"character":13}}},"message":"escflow: from \u0026y.b (address-of)"},`+
`{"location":{"uri":"file://tmpdir/file.go","range":{"start":{"line":4,"character":9},"end":{"line":4,"character":9}}},"message":"inlineLoc"},`+
`{"location":{"uri":"file://tmpdir/file.go","range":{"start":{"line":9,"character":13},"end":{"line":9,"character":13}}},"message":"escflow: from ~r1 = \u003cN\u003e (assign-pair)"},`+
`{"location":{"uri":"file://tmpdir/file.go","range":{"start":{"line":9,"character":3},"end":{"line":9,"character":3}}},"message":"escflow: flow: ~r2 = ~r1:"},`+
`{"location":{"uri":"file://tmpdir/file.go","range":{"start":{"line":9,"character":3},"end":{"line":9,"character":3}}},"message":"escflow: from return (*int)(~r1) (return)"}]}`)
})
}
@ -131,3 +217,15 @@ func testLogOptDir(t *testing.T, dir, flag, src, outfile string) (string, error)
t.Logf("%s", out)
return string(out), err
}
func testCopy(t *testing.T, dir, goarch, goos, src, outfile string) (string, error) {
// Notice the specified import path "x"
run := []string{testenv.GoToolPath(t), "tool", "compile", "-p", "x", "-json=0,file://log/opt", "-o", outfile, src}
t.Log(run)
cmd := exec.Command(run[0], run[1:]...)
cmd.Dir = dir
cmd.Env = append(os.Environ(), "GOARCH="+goarch, "GOOS="+goos)
out, err := cmd.CombinedOutput()
t.Logf("%s", out)
return string(out), err
}

View file

@ -571,7 +571,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
ssa.OpPPC64ROTL, ssa.OpPPC64ROTLW,
ssa.OpPPC64MULHD, ssa.OpPPC64MULHW, ssa.OpPPC64MULHDU, ssa.OpPPC64MULHWU,
ssa.OpPPC64FMUL, ssa.OpPPC64FMULS, ssa.OpPPC64FDIV, ssa.OpPPC64FDIVS, ssa.OpPPC64FCPSGN,
ssa.OpPPC64AND, ssa.OpPPC64OR, ssa.OpPPC64ANDN, ssa.OpPPC64ORN, ssa.OpPPC64NOR, ssa.OpPPC64XOR, ssa.OpPPC64EQV:
ssa.OpPPC64AND, ssa.OpPPC64OR, ssa.OpPPC64ANDN, ssa.OpPPC64ORN, ssa.OpPPC64NOR, ssa.OpPPC64XOR, ssa.OpPPC64EQV,
ssa.OpPPC64MODUD, ssa.OpPPC64MODSD, ssa.OpPPC64MODUW, ssa.OpPPC64MODSW:
r := v.Reg()
r1 := v.Args[0].Reg()
r2 := v.Args[1].Reg()
@ -654,15 +655,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
case ssa.OpPPC64ANDCCconst:
p := s.Prog(v.Op.Asm())
p.Reg = v.Args[0].Reg()
if v.Aux != nil {
p.From.Type = obj.TYPE_CONST
p.From.Offset = gc.AuxOffset(v)
} else {
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
}
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP // discard result
@ -850,39 +844,226 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt & 3
case ssa.OpPPC64LoweredZero:
case ssa.OpPPC64LoweredQuadZero, ssa.OpPPC64LoweredQuadZeroShort:
// The LoweredQuad code generation
// generates STXV instructions on
// power9. The Short variation is used
// if no loop is generated.
// unaligned data doesn't hurt performance
// for these instructions on power8 or later
// sizes >= 64 generate a loop as follows:
// for sizes >= 64 generate a loop as follows:
// Set up loop counter in CTR, used by BC
// XXLXOR clears VS32
// XXLXOR VS32,VS32,VS32
// MOVD len/64,REG_TMP
// MOVD REG_TMP,CTR
// loop:
// STXV VS32,0(R20)
// STXV VS32,16(R20)
// STXV VS32,32(R20)
// STXV VS32,48(R20)
// ADD $64,R20
// BC 16, 0, loop
// set up loop counter in CTR, used by BC
// Bytes per iteration
ctr := v.AuxInt / 64
// Remainder bytes
rem := v.AuxInt % 64
// Only generate a loop if there is more
// than 1 iteration.
if ctr > 1 {
// Set up VS32 (V0) to hold 0s
p := s.Prog(ppc64.AXXLXOR)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS32
p.Reg = ppc64.REG_VS32
// Set up CTR loop counter
p = s.Prog(ppc64.AMOVD)
p.From.Type = obj.TYPE_CONST
p.From.Offset = ctr
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP
p = s.Prog(ppc64.AMOVD)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REGTMP
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_CTR
// Don't generate padding for
// loops with few iterations.
if ctr > 3 {
p = s.Prog(obj.APCALIGN)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 16
}
// generate 4 STXVs to zero 64 bytes
var top *obj.Prog
p = s.Prog(ppc64.ASTXV)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
// Save the top of loop
if top == nil {
top = p
}
p = s.Prog(ppc64.ASTXV)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.To.Offset = 16
p = s.Prog(ppc64.ASTXV)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.To.Offset = 32
p = s.Prog(ppc64.ASTXV)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.To.Offset = 48
// Increment address for the
// 64 bytes just zeroed.
p = s.Prog(ppc64.AADD)
p.Reg = v.Args[0].Reg()
p.From.Type = obj.TYPE_CONST
p.From.Offset = 64
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Args[0].Reg()
// Branch back to top of loop
// based on CTR
// BC with BO_BCTR generates bdnz
p = s.Prog(ppc64.ABC)
p.From.Type = obj.TYPE_CONST
p.From.Offset = ppc64.BO_BCTR
p.Reg = ppc64.REG_R0
p.To.Type = obj.TYPE_BRANCH
gc.Patch(p, top)
}
// When ctr == 1 the loop was not generated but
// there are at least 64 bytes to clear, so add
// that to the remainder to generate the code
// to clear those doublewords
if ctr == 1 {
rem += 64
}
// Clear the remainder starting at offset zero
offset := int64(0)
if rem >= 16 && ctr <= 1 {
// If the XXLXOR hasn't already been
// generated, do it here to initialize
// VS32 (V0) to 0.
p := s.Prog(ppc64.AXXLXOR)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS32
p.Reg = ppc64.REG_VS32
}
// Generate STXV for 32 or 64
// bytes.
for rem >= 32 {
p := s.Prog(ppc64.ASTXV)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.To.Offset = offset
p = s.Prog(ppc64.ASTXV)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.To.Offset = offset + 16
offset += 32
rem -= 32
}
// Generate 16 bytes
if rem >= 16 {
p := s.Prog(ppc64.ASTXV)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.To.Offset = offset
offset += 16
rem -= 16
}
// first clear as many doublewords as possible
// then clear remaining sizes as available
for rem > 0 {
op, size := ppc64.AMOVB, int64(1)
switch {
case rem >= 8:
op, size = ppc64.AMOVD, 8
case rem >= 4:
op, size = ppc64.AMOVW, 4
case rem >= 2:
op, size = ppc64.AMOVH, 2
}
p := s.Prog(op)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_R0
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.To.Offset = offset
rem -= size
offset += size
}
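// Worked instance (illustrative): for v.AuxInt = 200, ctr = 200/64 = 3 and
// rem = 200%64 = 8, so the loop above zeroes 192 bytes with four STXVs per
// iteration (advancing R20 by 64 each time) and a single MOVD R0,0(R20)
// then clears the final 8 bytes at the advanced address.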
case ssa.OpPPC64LoweredZero, ssa.OpPPC64LoweredZeroShort:
// Unaligned data doesn't hurt performance
// for these instructions on power8.
// For sizes >= 64 generate a loop as follows:
// Set up loop counter in CTR, used by BC
// XXLXOR VS32,VS32,VS32
// MOVD len/32,REG_TMP
// MOVD REG_TMP,CTR
// MOVD $16,REG_TMP
// loop:
// STXVD2X VS32,(R0)(R3)
// STXVD2X VS32,(R31)(R3)
// ADD $32,R3
// STXVD2X VS32,(R0)(R20)
// STXVD2X VS32,(R31)(R20)
// ADD $32,R20
// BC 16, 0, loop
//
// any remainder is done as described below
// for sizes < 64 bytes, first clear as many doublewords as possible,
// then handle the remainder
// MOVD R0,(R3)
// MOVD R0,8(R3)
// MOVD R0,(R20)
// MOVD R0,8(R20)
// .... etc.
//
// the remainder bytes are cleared using one or more
// of the following instructions with the appropriate
// offsets depending which instructions are needed
//
// MOVW R0,n1(R3) 4 bytes
// MOVH R0,n2(R3) 2 bytes
// MOVB R0,n3(R3) 1 byte
// MOVW R0,n1(R20) 4 bytes
// MOVH R0,n2(R20) 2 bytes
// MOVB R0,n3(R20) 1 byte
//
// 7 bytes: MOVW, MOVH, MOVB
// 6 bytes: MOVW, MOVH
@ -926,10 +1107,19 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP
// Don't add padding for alignment
// with few loop iterations.
if ctr > 3 {
p = s.Prog(obj.APCALIGN)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 16
}
// generate 2 STXVD2Xs to store 16 bytes
// when this is a loop then the top must be saved
var top *obj.Prog
// This is the top of loop
p = s.Prog(ppc64.ASTXVD2X)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
@ -940,7 +1130,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
if top == nil {
top = p
}
p = s.Prog(ppc64.ASTXVD2X)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
@ -1001,8 +1190,9 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
offset += size
}
case ssa.OpPPC64LoweredMove:
case ssa.OpPPC64LoweredMove, ssa.OpPPC64LoweredMoveShort:
bytesPerLoop := int64(32)
// This will be used when moving more
// than 8 bytes. Moves start with
// as many 8 byte moves as possible, then
@ -1019,34 +1209,34 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// MOVD REG_TMP,CTR
// MOVD $16,REG_TMP
// top:
// LXVD2X (R0)(R4),VS32
// LXVD2X (R31)(R4),VS33
// ADD $32,R4
// STXVD2X VS32,(R0)(R3)
// STXVD2X VS33,(R31)(R4)
// ADD $32,R3
// LXVD2X (R0)(R21),VS32
// LXVD2X (R31)(R21),VS33
// ADD $32,R21
// STXVD2X VS32,(R0)(R20)
// STXVD2X VS33,(R31)(R20)
// ADD $32,R20
// BC 16,0,top
// Bytes not moved by this loop are moved
// with a combination of the following instructions,
// starting with the largest sizes and generating as
// many as needed, using the appropriate offset value.
// MOVD n(R4),R14
// MOVD R14,n(R3)
// MOVW n1(R4),R14
// MOVW R14,n1(R3)
// MOVH n2(R4),R14
// MOVH R14,n2(R3)
// MOVB n3(R4),R14
// MOVB R14,n3(R3)
// MOVD n(R21),R31
// MOVD R31,n(R20)
// MOVW n1(R21),R31
// MOVW R31,n1(R20)
// MOVH n2(R21),R31
// MOVH R31,n2(R20)
// MOVB n3(R21),R31
// MOVB R31,n3(R20)
// Each loop iteration moves 32 bytes
ctr := v.AuxInt / 32
ctr := v.AuxInt / bytesPerLoop
// Remainder after the loop
rem := v.AuxInt % 32
rem := v.AuxInt % bytesPerLoop
dst_reg := v.Args[0].Reg()
src_reg := v.Args[1].Reg()
dstReg := v.Args[0].Reg()
srcReg := v.Args[1].Reg()
// The set of registers used here, must match the clobbered reg list
// in PPC64Ops.go.
@ -1076,57 +1266,65 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP
// Don't add padding for
// alignment with small iteration
// counts.
if ctr > 3 {
p = s.Prog(obj.APCALIGN)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 16
}
// Generate 16 byte loads and stores.
// Use temp register for index (16)
// on the second one.
p = s.Prog(ppc64.ALXVD2X)
p.From.Type = obj.TYPE_MEM
p.From.Reg = src_reg
p.From.Reg = srcReg
p.From.Index = ppc64.REGZERO
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS32
if top == nil {
top = p
}
p = s.Prog(ppc64.ALXVD2X)
p.From.Type = obj.TYPE_MEM
p.From.Reg = src_reg
p.From.Reg = srcReg
p.From.Index = ppc64.REGTMP
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS33
// increment the src reg for next iteration
p = s.Prog(ppc64.AADD)
p.Reg = src_reg
p.Reg = srcReg
p.From.Type = obj.TYPE_CONST
p.From.Offset = 32
p.From.Offset = bytesPerLoop
p.To.Type = obj.TYPE_REG
p.To.Reg = src_reg
p.To.Reg = srcReg
// generate 16 byte stores
p = s.Prog(ppc64.ASTXVD2X)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
p.To.Reg = dst_reg
p.To.Reg = dstReg
p.To.Index = ppc64.REGZERO
p = s.Prog(ppc64.ASTXVD2X)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS33
p.To.Type = obj.TYPE_MEM
p.To.Reg = dst_reg
p.To.Reg = dstReg
p.To.Index = ppc64.REGTMP
// increment the dst reg for next iteration
p = s.Prog(ppc64.AADD)
p.Reg = dst_reg
p.Reg = dstReg
p.From.Type = obj.TYPE_CONST
p.From.Offset = 32
p.From.Offset = bytesPerLoop
p.To.Type = obj.TYPE_REG
p.To.Reg = dst_reg
p.To.Reg = dstReg
// BC with BO_BCTR generates bdnz to branch on nonzero CTR
// to loop top.
@ -1137,7 +1335,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_BRANCH
gc.Patch(p, top)
// src_reg and dst_reg were incremented in the loop, so
// srcReg and dstReg were incremented in the loop, so
// later instructions start with offset 0.
offset = int64(0)
}
@ -1145,7 +1343,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// No loop was generated for one iteration, so
// add 32 bytes to the remainder to move those bytes.
if ctr == 1 {
rem += 32
rem += bytesPerLoop
}
if rem >= 16 {
@ -1154,7 +1352,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// on the second one.
p := s.Prog(ppc64.ALXVD2X)
p.From.Type = obj.TYPE_MEM
p.From.Reg = src_reg
p.From.Reg = srcReg
p.From.Index = ppc64.REGZERO
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS32
@ -1163,7 +1361,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
p.To.Reg = dst_reg
p.To.Reg = dstReg
p.To.Index = ppc64.REGZERO
offset = 16
@ -1171,18 +1369,15 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
if rem >= 16 {
// Use REGTMP as index reg
p = s.Prog(ppc64.AMOVD)
p := s.Prog(ppc64.AMOVD)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 16
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP
// Generate 16 byte loads and stores.
// Use temp register for index (16)
// on the second one.
p = s.Prog(ppc64.ALXVD2X)
p.From.Type = obj.TYPE_MEM
p.From.Reg = src_reg
p.From.Reg = srcReg
p.From.Index = ppc64.REGTMP
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS32
@ -1191,7 +1386,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
p.To.Reg = dst_reg
p.To.Reg = dstReg
p.To.Index = ppc64.REGTMP
offset = 32
@ -1214,17 +1409,284 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// Load
p := s.Prog(op)
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_R14
p.To.Reg = ppc64.REGTMP
p.From.Type = obj.TYPE_MEM
p.From.Reg = src_reg
p.From.Reg = srcReg
p.From.Offset = offset
// Store
p = s.Prog(op)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_R14
p.From.Reg = ppc64.REGTMP
p.To.Type = obj.TYPE_MEM
p.To.Reg = dst_reg
p.To.Reg = dstReg
p.To.Offset = offset
rem -= size
offset += size
}
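// Worked instance (illustrative): for v.AuxInt = 71, ctr = 71/32 = 2 and
// rem = 71%32 = 7, so the loop above copies 64 bytes in two iterations, and
// the remaining 7 bytes go through REGTMP as a MOVW, MOVH and MOVB at offsets
// 0, 4 and 6 from the advanced src/dst registers.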
case ssa.OpPPC64LoweredQuadMove, ssa.OpPPC64LoweredQuadMoveShort:
bytesPerLoop := int64(64)
// This is used when moving more
// than 8 bytes on power9. Moves start with
// as many 8 byte moves as possible, then
// 4, 2, or 1 byte(s) as remaining. This will
// work and be efficient for power8 or later.
// If there are 64 or more bytes, then a
// loop is generated to move 64 bytes and
// update the src and dst addresses on each
// iteration. When < 64 bytes, the appropriate
// number of moves are generated based on the
// size.
// When moving >= 64 bytes a loop is used
// MOVD len/64,REG_TMP
// MOVD REG_TMP,CTR
// top:
// LXV 0(R21),VS32
// LXV 16(R21),VS33
// STXV VS32,0(R20)
// STXV VS33,16(R20)
// LXV 32(R21),VS32
// LXV 48(R21),VS33
// STXV VS32,32(R20)
// STXV VS33,48(R20)
// ADD $64,R21
// ADD $64,R20
// BC 16,0,top
// Bytes not moved by this loop are moved
// with a combination of the following instructions,
// starting with the largest sizes and generating as
// many as needed, using the appropriate offset value.
// MOVD n(R21),R31
// MOVD R31,n(R20)
// MOVW n1(R21),R31
// MOVW R31,n1(R20)
// MOVH n2(R21),R31
// MOVH R31,n2(R20)
// MOVB n3(R21),R31
// MOVB R31,n3(R20)
// Each loop iteration moves 64 bytes
ctr := v.AuxInt / bytesPerLoop
// Remainder after the loop
rem := v.AuxInt % bytesPerLoop
dstReg := v.Args[0].Reg()
srcReg := v.Args[1].Reg()
offset := int64(0)
// top of the loop
var top *obj.Prog
// Only generate looping code when the loop counter is > 1, i.e. for moves of 128 bytes or more
if ctr > 1 {
// Set up the CTR
p := s.Prog(ppc64.AMOVD)
p.From.Type = obj.TYPE_CONST
p.From.Offset = ctr
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP
p = s.Prog(ppc64.AMOVD)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REGTMP
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_CTR
p = s.Prog(obj.APCALIGN)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 16
// Generate 16 byte loads and stores.
p = s.Prog(ppc64.ALXV)
p.From.Type = obj.TYPE_MEM
p.From.Reg = srcReg
p.From.Offset = offset
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS32
if top == nil {
top = p
}
p = s.Prog(ppc64.ALXV)
p.From.Type = obj.TYPE_MEM
p.From.Reg = srcReg
p.From.Offset = offset + 16
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS33
// generate 16 byte stores
p = s.Prog(ppc64.ASTXV)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
p.To.Reg = dstReg
p.To.Offset = offset
p = s.Prog(ppc64.ASTXV)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS33
p.To.Type = obj.TYPE_MEM
p.To.Reg = dstReg
p.To.Offset = offset + 16
// Generate 16 byte loads and stores.
p = s.Prog(ppc64.ALXV)
p.From.Type = obj.TYPE_MEM
p.From.Reg = srcReg
p.From.Offset = offset + 32
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS32
p = s.Prog(ppc64.ALXV)
p.From.Type = obj.TYPE_MEM
p.From.Reg = srcReg
p.From.Offset = offset + 48
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS33
// generate 16 byte stores
p = s.Prog(ppc64.ASTXV)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
p.To.Reg = dstReg
p.To.Offset = offset + 32
p = s.Prog(ppc64.ASTXV)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS33
p.To.Type = obj.TYPE_MEM
p.To.Reg = dstReg
p.To.Offset = offset + 48
// increment the src reg for next iteration
p = s.Prog(ppc64.AADD)
p.Reg = srcReg
p.From.Type = obj.TYPE_CONST
p.From.Offset = bytesPerLoop
p.To.Type = obj.TYPE_REG
p.To.Reg = srcReg
// increment the dst reg for next iteration
p = s.Prog(ppc64.AADD)
p.Reg = dstReg
p.From.Type = obj.TYPE_CONST
p.From.Offset = bytesPerLoop
p.To.Type = obj.TYPE_REG
p.To.Reg = dstReg
// BC with BO_BCTR generates bdnz to branch on nonzero CTR
// to loop top.
p = s.Prog(ppc64.ABC)
p.From.Type = obj.TYPE_CONST
p.From.Offset = ppc64.BO_BCTR
p.Reg = ppc64.REG_R0
p.To.Type = obj.TYPE_BRANCH
gc.Patch(p, top)
// srcReg and dstReg were incremented in the loop, so
// later instructions start with offset 0.
offset = int64(0)
}
// No loop was generated for one iteration, so
// add 64 bytes to the remainder to move those bytes.
if ctr == 1 {
rem += bytesPerLoop
}
if rem >= 32 {
p := s.Prog(ppc64.ALXV)
p.From.Type = obj.TYPE_MEM
p.From.Reg = srcReg
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS32
p = s.Prog(ppc64.ALXV)
p.From.Type = obj.TYPE_MEM
p.From.Reg = srcReg
p.From.Offset = 16
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS33
p = s.Prog(ppc64.ASTXV)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
p.To.Reg = dstReg
p = s.Prog(ppc64.ASTXV)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS33
p.To.Type = obj.TYPE_MEM
p.To.Reg = dstReg
p.To.Offset = 16
offset = 32
rem -= 32
}
if rem >= 16 {
// Generate 16 byte loads and stores.
p := s.Prog(ppc64.ALXV)
p.From.Type = obj.TYPE_MEM
p.From.Reg = srcReg
p.From.Offset = offset
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS32
p = s.Prog(ppc64.ASTXV)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
p.To.Reg = dstReg
p.To.Offset = offset
offset += 16
rem -= 16
if rem >= 16 {
p := s.Prog(ppc64.ALXV)
p.From.Type = obj.TYPE_MEM
p.From.Reg = srcReg
p.From.Offset = offset
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_VS32
p = s.Prog(ppc64.ASTXV)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_VS32
p.To.Type = obj.TYPE_MEM
p.To.Reg = dstReg
p.To.Offset = offset
offset += 16
rem -= 16
}
}
// Generate all the remaining load and store pairs, starting with
// as many 8 byte moves as possible, then 4, 2, 1.
for rem > 0 {
op, size := ppc64.AMOVB, int64(1)
switch {
case rem >= 8:
op, size = ppc64.AMOVD, 8
case rem >= 4:
op, size = ppc64.AMOVW, 4
case rem >= 2:
op, size = ppc64.AMOVH, 2
}
// Load
p := s.Prog(op)
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP
p.From.Type = obj.TYPE_MEM
p.From.Reg = srcReg
p.From.Offset = offset
// Store
p = s.Prog(op)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REGTMP
p.To.Type = obj.TYPE_MEM
p.To.Reg = dstReg
p.To.Offset = offset
rem -= size
offset += size
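Taken together, this case carves a quad move into a 64-byte loop (emitted only when more than one iteration is needed) followed by straight-line 32-, 16-, 8-, 4-, 2- and 1-byte copies. Below is a minimal standalone Go sketch of that size decomposition; quadMovePlan is an illustrative name, not compiler code. For a 100-byte move it yields no loop iterations and tail copies of 32, 16, 16, 8, 8, 8, 8 and 4 bytes.

// quadMovePlan mirrors the decomposition above: it reports how many 64-byte
// loop iterations would be emitted and the sizes of the remaining
// straight-line copies, largest first.
func quadMovePlan(size int64) (ctr int64, tail []int64) {
	const bytesPerLoop = 64
	ctr = size / bytesPerLoop
	rem := size % bytesPerLoop
	if ctr <= 1 {
		// A loop is emitted only for two or more iterations; a single
		// iteration is folded into the straight-line remainder instead.
		rem += ctr * bytesPerLoop
		ctr = 0
	}
	if rem >= 32 {
		tail = append(tail, 32) // two LXV/STXV pairs covering 32 bytes
		rem -= 32
	}
	if rem >= 16 {
		tail = append(tail, 16) // single LXV/STXV
		rem -= 16
		if rem >= 16 {
			tail = append(tail, 16)
			rem -= 16
		}
	}
	for rem > 0 {
		s := int64(1) // MOVB
		switch {
		case rem >= 8:
			s = 8 // MOVD
		case rem >= 4:
			s = 4 // MOVW
		case rem >= 2:
			s = 2 // MOVH
		}
		tail = append(tail, s)
		rem -= s
	}
	return ctr, tail
}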

@@ -577,6 +577,21 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
}
}
var blockBranch = [...]obj.As{
ssa.BlockRISCV64BEQ: riscv.ABEQ,
ssa.BlockRISCV64BEQZ: riscv.ABEQZ,
ssa.BlockRISCV64BGE: riscv.ABGE,
ssa.BlockRISCV64BGEU: riscv.ABGEU,
ssa.BlockRISCV64BGEZ: riscv.ABGEZ,
ssa.BlockRISCV64BGTZ: riscv.ABGTZ,
ssa.BlockRISCV64BLEZ: riscv.ABLEZ,
ssa.BlockRISCV64BLT: riscv.ABLT,
ssa.BlockRISCV64BLTU: riscv.ABLTU,
ssa.BlockRISCV64BLTZ: riscv.ABLTZ,
ssa.BlockRISCV64BNE: riscv.ABNE,
ssa.BlockRISCV64BNEZ: riscv.ABNEZ,
}
func ssaGenBlock(s *gc.SSAGenState, b, next *ssa.Block) {
s.SetPos(b.Pos)
@@ -610,27 +625,44 @@ func ssaGenBlock(s *gc.SSAGenState, b, next *ssa.Block) {
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
p.To.Sym = b.Aux.(*obj.LSym)
case ssa.BlockRISCV64BNE:
case ssa.BlockRISCV64BEQ, ssa.BlockRISCV64BEQZ, ssa.BlockRISCV64BNE, ssa.BlockRISCV64BNEZ,
ssa.BlockRISCV64BLT, ssa.BlockRISCV64BLEZ, ssa.BlockRISCV64BGE, ssa.BlockRISCV64BGEZ,
ssa.BlockRISCV64BLTZ, ssa.BlockRISCV64BGTZ, ssa.BlockRISCV64BLTU, ssa.BlockRISCV64BGEU:
as := blockBranch[b.Kind]
invAs := riscv.InvertBranch(as)
var p *obj.Prog
switch next {
case b.Succs[0].Block():
p = s.Br(riscv.ABNE, b.Succs[1].Block())
p.As = riscv.InvertBranch(p.As)
p = s.Br(invAs, b.Succs[1].Block())
case b.Succs[1].Block():
p = s.Br(riscv.ABNE, b.Succs[0].Block())
p = s.Br(as, b.Succs[0].Block())
default:
if b.Likely != ssa.BranchUnlikely {
p = s.Br(riscv.ABNE, b.Succs[0].Block())
p = s.Br(as, b.Succs[0].Block())
s.Br(obj.AJMP, b.Succs[1].Block())
} else {
p = s.Br(riscv.ABNE, b.Succs[1].Block())
p.As = riscv.InvertBranch(p.As)
p = s.Br(invAs, b.Succs[1].Block())
s.Br(obj.AJMP, b.Succs[0].Block())
}
}
p.Reg = b.Controls[0].Reg()
p.From.Type = obj.TYPE_REG
p.From.Reg = riscv.REG_ZERO
switch b.Kind {
case ssa.BlockRISCV64BEQ, ssa.BlockRISCV64BNE, ssa.BlockRISCV64BLT, ssa.BlockRISCV64BGE, ssa.BlockRISCV64BLTU, ssa.BlockRISCV64BGEU:
if b.NumControls() != 2 {
b.Fatalf("Unexpected number of controls (%d != 2): %s", b.NumControls(), b.LongString())
}
p.From.Reg = b.Controls[0].Reg()
p.Reg = b.Controls[1].Reg()
case ssa.BlockRISCV64BEQZ, ssa.BlockRISCV64BNEZ, ssa.BlockRISCV64BGEZ, ssa.BlockRISCV64BLEZ, ssa.BlockRISCV64BLTZ, ssa.BlockRISCV64BGTZ:
if b.NumControls() != 1 {
b.Fatalf("Unexpected number of controls (%d != 1): %s", b.NumControls(), b.LongString())
}
p.From.Reg = b.Controls[0].Reg()
}
default:
b.Fatalf("Unhandled block: %s", b.LongString())

@@ -234,13 +234,19 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// 2-address opcode arithmetic
case ssa.OpS390XMULLD, ssa.OpS390XMULLW,
ssa.OpS390XMULHD, ssa.OpS390XMULHDU,
ssa.OpS390XFADDS, ssa.OpS390XFADD, ssa.OpS390XFSUBS, ssa.OpS390XFSUB,
ssa.OpS390XFMULS, ssa.OpS390XFMUL, ssa.OpS390XFDIVS, ssa.OpS390XFDIV:
r := v.Reg()
if r != v.Args[0].Reg() {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
}
opregreg(s, v.Op.Asm(), r, v.Args[1].Reg())
case ssa.OpS390XFSUBS, ssa.OpS390XFSUB,
ssa.OpS390XFADDS, ssa.OpS390XFADD:
r := v.Reg0()
if r != v.Args[0].Reg() {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
}
opregreg(s, v.Op.Asm(), r, v.Args[1].Reg())
case ssa.OpS390XMLGR:
// MLGR Rx R3 -> R2:R3
r0 := v.Args[0].Reg()

@@ -87,6 +87,13 @@ func addressingModes(f *Func) {
v.resetArgs()
v.Op = c
v.AddArgs(tmp...)
if needSplit[c] {
// It turns out that some of the combined instructions have faster two-instruction equivalents,
// but not the two instructions that led to them being combined here. For example
// (CMPBconstload c (ADDQ x y)) -> (CMPBconstloadidx1 c x y) -> (CMPB c (MOVBloadidx1 x y))
// The final pair of instructions turns out to be notably faster, at least in some benchmarks.
f.Config.splitLoad(v)
}
}
}
}
@@ -101,6 +108,26 @@ func init() {
}
}
// needSplit contains instructions that should be postprocessed by splitLoad
// into a more-efficient two-instruction form.
var needSplit = map[Op]bool{
OpAMD64CMPBloadidx1: true,
OpAMD64CMPWloadidx1: true,
OpAMD64CMPLloadidx1: true,
OpAMD64CMPQloadidx1: true,
OpAMD64CMPWloadidx2: true,
OpAMD64CMPLloadidx4: true,
OpAMD64CMPQloadidx8: true,
OpAMD64CMPBconstloadidx1: true,
OpAMD64CMPWconstloadidx1: true,
OpAMD64CMPLconstloadidx1: true,
OpAMD64CMPQconstloadidx1: true,
OpAMD64CMPWconstloadidx2: true,
OpAMD64CMPLconstloadidx4: true,
OpAMD64CMPQconstloadidx8: true,
}
// For each entry k, v in this map, if we have a value x with:
// x.Op == k[0]
// x.Args[0].Op == k[1]
@@ -162,6 +189,8 @@ var combine = map[[2]Op]Op{
[2]Op{OpAMD64MOVQstoreconst, OpAMD64LEAQ1}: OpAMD64MOVQstoreconstidx1,
[2]Op{OpAMD64MOVQstoreconst, OpAMD64LEAQ8}: OpAMD64MOVQstoreconstidx8,
// These instructions are re-split differently for performance, see needSplit above.
// TODO if 386 versions are created, also update needSplit and gen/386splitload.rules
[2]Op{OpAMD64CMPBload, OpAMD64ADDQ}: OpAMD64CMPBloadidx1,
[2]Op{OpAMD64CMPWload, OpAMD64ADDQ}: OpAMD64CMPWloadidx1,
[2]Op{OpAMD64CMPLload, OpAMD64ADDQ}: OpAMD64CMPLloadidx1,
@@ -188,6 +217,110 @@ var combine = map[[2]Op]Op{
[2]Op{OpAMD64CMPQconstload, OpAMD64LEAQ1}: OpAMD64CMPQconstloadidx1,
[2]Op{OpAMD64CMPQconstload, OpAMD64LEAQ8}: OpAMD64CMPQconstloadidx8,
[2]Op{OpAMD64ADDLload, OpAMD64ADDQ}: OpAMD64ADDLloadidx1,
[2]Op{OpAMD64ADDQload, OpAMD64ADDQ}: OpAMD64ADDQloadidx1,
[2]Op{OpAMD64SUBLload, OpAMD64ADDQ}: OpAMD64SUBLloadidx1,
[2]Op{OpAMD64SUBQload, OpAMD64ADDQ}: OpAMD64SUBQloadidx1,
[2]Op{OpAMD64ANDLload, OpAMD64ADDQ}: OpAMD64ANDLloadidx1,
[2]Op{OpAMD64ANDQload, OpAMD64ADDQ}: OpAMD64ANDQloadidx1,
[2]Op{OpAMD64ORLload, OpAMD64ADDQ}: OpAMD64ORLloadidx1,
[2]Op{OpAMD64ORQload, OpAMD64ADDQ}: OpAMD64ORQloadidx1,
[2]Op{OpAMD64XORLload, OpAMD64ADDQ}: OpAMD64XORLloadidx1,
[2]Op{OpAMD64XORQload, OpAMD64ADDQ}: OpAMD64XORQloadidx1,
[2]Op{OpAMD64ADDLload, OpAMD64LEAQ1}: OpAMD64ADDLloadidx1,
[2]Op{OpAMD64ADDLload, OpAMD64LEAQ4}: OpAMD64ADDLloadidx4,
[2]Op{OpAMD64ADDLload, OpAMD64LEAQ8}: OpAMD64ADDLloadidx8,
[2]Op{OpAMD64ADDQload, OpAMD64LEAQ1}: OpAMD64ADDQloadidx1,
[2]Op{OpAMD64ADDQload, OpAMD64LEAQ8}: OpAMD64ADDQloadidx8,
[2]Op{OpAMD64SUBLload, OpAMD64LEAQ1}: OpAMD64SUBLloadidx1,
[2]Op{OpAMD64SUBLload, OpAMD64LEAQ4}: OpAMD64SUBLloadidx4,
[2]Op{OpAMD64SUBLload, OpAMD64LEAQ8}: OpAMD64SUBLloadidx8,
[2]Op{OpAMD64SUBQload, OpAMD64LEAQ1}: OpAMD64SUBQloadidx1,
[2]Op{OpAMD64SUBQload, OpAMD64LEAQ8}: OpAMD64SUBQloadidx8,
[2]Op{OpAMD64ANDLload, OpAMD64LEAQ1}: OpAMD64ANDLloadidx1,
[2]Op{OpAMD64ANDLload, OpAMD64LEAQ4}: OpAMD64ANDLloadidx4,
[2]Op{OpAMD64ANDLload, OpAMD64LEAQ8}: OpAMD64ANDLloadidx8,
[2]Op{OpAMD64ANDQload, OpAMD64LEAQ1}: OpAMD64ANDQloadidx1,
[2]Op{OpAMD64ANDQload, OpAMD64LEAQ8}: OpAMD64ANDQloadidx8,
[2]Op{OpAMD64ORLload, OpAMD64LEAQ1}: OpAMD64ORLloadidx1,
[2]Op{OpAMD64ORLload, OpAMD64LEAQ4}: OpAMD64ORLloadidx4,
[2]Op{OpAMD64ORLload, OpAMD64LEAQ8}: OpAMD64ORLloadidx8,
[2]Op{OpAMD64ORQload, OpAMD64LEAQ1}: OpAMD64ORQloadidx1,
[2]Op{OpAMD64ORQload, OpAMD64LEAQ8}: OpAMD64ORQloadidx8,
[2]Op{OpAMD64XORLload, OpAMD64LEAQ1}: OpAMD64XORLloadidx1,
[2]Op{OpAMD64XORLload, OpAMD64LEAQ4}: OpAMD64XORLloadidx4,
[2]Op{OpAMD64XORLload, OpAMD64LEAQ8}: OpAMD64XORLloadidx8,
[2]Op{OpAMD64XORQload, OpAMD64LEAQ1}: OpAMD64XORQloadidx1,
[2]Op{OpAMD64XORQload, OpAMD64LEAQ8}: OpAMD64XORQloadidx8,
[2]Op{OpAMD64ADDLmodify, OpAMD64ADDQ}: OpAMD64ADDLmodifyidx1,
[2]Op{OpAMD64ADDQmodify, OpAMD64ADDQ}: OpAMD64ADDQmodifyidx1,
[2]Op{OpAMD64SUBLmodify, OpAMD64ADDQ}: OpAMD64SUBLmodifyidx1,
[2]Op{OpAMD64SUBQmodify, OpAMD64ADDQ}: OpAMD64SUBQmodifyidx1,
[2]Op{OpAMD64ANDLmodify, OpAMD64ADDQ}: OpAMD64ANDLmodifyidx1,
[2]Op{OpAMD64ANDQmodify, OpAMD64ADDQ}: OpAMD64ANDQmodifyidx1,
[2]Op{OpAMD64ORLmodify, OpAMD64ADDQ}: OpAMD64ORLmodifyidx1,
[2]Op{OpAMD64ORQmodify, OpAMD64ADDQ}: OpAMD64ORQmodifyidx1,
[2]Op{OpAMD64XORLmodify, OpAMD64ADDQ}: OpAMD64XORLmodifyidx1,
[2]Op{OpAMD64XORQmodify, OpAMD64ADDQ}: OpAMD64XORQmodifyidx1,
[2]Op{OpAMD64ADDLmodify, OpAMD64LEAQ1}: OpAMD64ADDLmodifyidx1,
[2]Op{OpAMD64ADDLmodify, OpAMD64LEAQ4}: OpAMD64ADDLmodifyidx4,
[2]Op{OpAMD64ADDLmodify, OpAMD64LEAQ8}: OpAMD64ADDLmodifyidx8,
[2]Op{OpAMD64ADDQmodify, OpAMD64LEAQ1}: OpAMD64ADDQmodifyidx1,
[2]Op{OpAMD64ADDQmodify, OpAMD64LEAQ8}: OpAMD64ADDQmodifyidx8,
[2]Op{OpAMD64SUBLmodify, OpAMD64LEAQ1}: OpAMD64SUBLmodifyidx1,
[2]Op{OpAMD64SUBLmodify, OpAMD64LEAQ4}: OpAMD64SUBLmodifyidx4,
[2]Op{OpAMD64SUBLmodify, OpAMD64LEAQ8}: OpAMD64SUBLmodifyidx8,
[2]Op{OpAMD64SUBQmodify, OpAMD64LEAQ1}: OpAMD64SUBQmodifyidx1,
[2]Op{OpAMD64SUBQmodify, OpAMD64LEAQ8}: OpAMD64SUBQmodifyidx8,
[2]Op{OpAMD64ANDLmodify, OpAMD64LEAQ1}: OpAMD64ANDLmodifyidx1,
[2]Op{OpAMD64ANDLmodify, OpAMD64LEAQ4}: OpAMD64ANDLmodifyidx4,
[2]Op{OpAMD64ANDLmodify, OpAMD64LEAQ8}: OpAMD64ANDLmodifyidx8,
[2]Op{OpAMD64ANDQmodify, OpAMD64LEAQ1}: OpAMD64ANDQmodifyidx1,
[2]Op{OpAMD64ANDQmodify, OpAMD64LEAQ8}: OpAMD64ANDQmodifyidx8,
[2]Op{OpAMD64ORLmodify, OpAMD64LEAQ1}: OpAMD64ORLmodifyidx1,
[2]Op{OpAMD64ORLmodify, OpAMD64LEAQ4}: OpAMD64ORLmodifyidx4,
[2]Op{OpAMD64ORLmodify, OpAMD64LEAQ8}: OpAMD64ORLmodifyidx8,
[2]Op{OpAMD64ORQmodify, OpAMD64LEAQ1}: OpAMD64ORQmodifyidx1,
[2]Op{OpAMD64ORQmodify, OpAMD64LEAQ8}: OpAMD64ORQmodifyidx8,
[2]Op{OpAMD64XORLmodify, OpAMD64LEAQ1}: OpAMD64XORLmodifyidx1,
[2]Op{OpAMD64XORLmodify, OpAMD64LEAQ4}: OpAMD64XORLmodifyidx4,
[2]Op{OpAMD64XORLmodify, OpAMD64LEAQ8}: OpAMD64XORLmodifyidx8,
[2]Op{OpAMD64XORQmodify, OpAMD64LEAQ1}: OpAMD64XORQmodifyidx1,
[2]Op{OpAMD64XORQmodify, OpAMD64LEAQ8}: OpAMD64XORQmodifyidx8,
[2]Op{OpAMD64ADDLconstmodify, OpAMD64ADDQ}: OpAMD64ADDLconstmodifyidx1,
[2]Op{OpAMD64ADDQconstmodify, OpAMD64ADDQ}: OpAMD64ADDQconstmodifyidx1,
[2]Op{OpAMD64ANDLconstmodify, OpAMD64ADDQ}: OpAMD64ANDLconstmodifyidx1,
[2]Op{OpAMD64ANDQconstmodify, OpAMD64ADDQ}: OpAMD64ANDQconstmodifyidx1,
[2]Op{OpAMD64ORLconstmodify, OpAMD64ADDQ}: OpAMD64ORLconstmodifyidx1,
[2]Op{OpAMD64ORQconstmodify, OpAMD64ADDQ}: OpAMD64ORQconstmodifyidx1,
[2]Op{OpAMD64XORLconstmodify, OpAMD64ADDQ}: OpAMD64XORLconstmodifyidx1,
[2]Op{OpAMD64XORQconstmodify, OpAMD64ADDQ}: OpAMD64XORQconstmodifyidx1,
[2]Op{OpAMD64ADDLconstmodify, OpAMD64LEAQ1}: OpAMD64ADDLconstmodifyidx1,
[2]Op{OpAMD64ADDLconstmodify, OpAMD64LEAQ4}: OpAMD64ADDLconstmodifyidx4,
[2]Op{OpAMD64ADDLconstmodify, OpAMD64LEAQ8}: OpAMD64ADDLconstmodifyidx8,
[2]Op{OpAMD64ADDQconstmodify, OpAMD64LEAQ1}: OpAMD64ADDQconstmodifyidx1,
[2]Op{OpAMD64ADDQconstmodify, OpAMD64LEAQ8}: OpAMD64ADDQconstmodifyidx8,
[2]Op{OpAMD64ANDLconstmodify, OpAMD64LEAQ1}: OpAMD64ANDLconstmodifyidx1,
[2]Op{OpAMD64ANDLconstmodify, OpAMD64LEAQ4}: OpAMD64ANDLconstmodifyidx4,
[2]Op{OpAMD64ANDLconstmodify, OpAMD64LEAQ8}: OpAMD64ANDLconstmodifyidx8,
[2]Op{OpAMD64ANDQconstmodify, OpAMD64LEAQ1}: OpAMD64ANDQconstmodifyidx1,
[2]Op{OpAMD64ANDQconstmodify, OpAMD64LEAQ8}: OpAMD64ANDQconstmodifyidx8,
[2]Op{OpAMD64ORLconstmodify, OpAMD64LEAQ1}: OpAMD64ORLconstmodifyidx1,
[2]Op{OpAMD64ORLconstmodify, OpAMD64LEAQ4}: OpAMD64ORLconstmodifyidx4,
[2]Op{OpAMD64ORLconstmodify, OpAMD64LEAQ8}: OpAMD64ORLconstmodifyidx8,
[2]Op{OpAMD64ORQconstmodify, OpAMD64LEAQ1}: OpAMD64ORQconstmodifyidx1,
[2]Op{OpAMD64ORQconstmodify, OpAMD64LEAQ8}: OpAMD64ORQconstmodifyidx8,
[2]Op{OpAMD64XORLconstmodify, OpAMD64LEAQ1}: OpAMD64XORLconstmodifyidx1,
[2]Op{OpAMD64XORLconstmodify, OpAMD64LEAQ4}: OpAMD64XORLconstmodifyidx4,
[2]Op{OpAMD64XORLconstmodify, OpAMD64LEAQ8}: OpAMD64XORLconstmodifyidx8,
[2]Op{OpAMD64XORQconstmodify, OpAMD64LEAQ1}: OpAMD64XORQconstmodifyidx1,
[2]Op{OpAMD64XORQconstmodify, OpAMD64LEAQ8}: OpAMD64XORQconstmodifyidx8,
// 386
[2]Op{Op386MOVBload, Op386ADDL}: Op386MOVBloadidx1,
[2]Op{Op386MOVWload, Op386ADDL}: Op386MOVWloadidx1,

@@ -124,15 +124,8 @@ func (b *Block) LongString() string {
if b.Aux != nil {
s += fmt.Sprintf(" {%s}", b.Aux)
}
if t := b.Kind.AuxIntType(); t != "" {
switch t {
case "Int8":
s += fmt.Sprintf(" [%v]", int8(b.AuxInt))
case "UInt8":
s += fmt.Sprintf(" [%v]", uint8(b.AuxInt))
default:
s += fmt.Sprintf(" [%v]", b.AuxInt)
}
if t := b.AuxIntString(); t != "" {
s += fmt.Sprintf(" [%s]", t)
}
for _, c := range b.ControlValues() {
s += fmt.Sprintf(" %s", c)
@@ -263,6 +256,17 @@ func (b *Block) resetWithControl2(kind BlockKind, v, w *Value) {
w.Uses++
}
// truncateValues truncates b.Values at the ith element, zeroing subsequent elements.
// The values in b.Values after i must already have had their args reset,
// to maintain correct value uses counts.
func (b *Block) truncateValues(i int) {
tail := b.Values[i:]
for j := range tail {
tail[j] = nil
}
b.Values = b.Values[:i]
}
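A minimal sketch of the intended call pattern, with keep standing in for whatever liveness test the caller applies (compare the deadcode and flagalloc callers further down): compact surviving values to the front of b.Values, reset the args of the dropped ones, then let truncateValues nil out the tail for the garbage collector.

i := 0
for _, v := range b.Values {
	if !keep(v) { // keep is a placeholder for the caller's liveness test
		v.resetArgs() // dropped values must have their args reset first
		continue
	}
	b.Values[i] = v
	i++
}
b.truncateValues(i)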
// AddEdgeTo adds an edge from block b to block c. Used during building of the
// SSA graph; do not use on an already-completed SSA graph.
func (b *Block) AddEdgeTo(c *Block) {
@@ -341,6 +345,19 @@ func (b *Block) LackingPos() bool {
return true
}
func (b *Block) AuxIntString() string {
switch b.Kind.AuxIntType() {
case "int8":
return fmt.Sprintf("%v", int8(b.AuxInt))
case "uint8":
return fmt.Sprintf("%v", uint8(b.AuxInt))
default: // type specified but not implemented - print as int64
return fmt.Sprintf("%v", b.AuxInt)
case "": // no aux int type
return ""
}
}
func (b *Block) Logf(msg string, args ...interface{}) { b.Func.Logf(msg, args...) }
func (b *Block) Log() bool { return b.Func.Log() }
func (b *Block) Fatalf(msg string, args ...interface{}) { b.Func.Fatalf(msg, args...) }

@@ -5,6 +5,7 @@
package ssa
import (
"cmd/internal/obj/s390x"
"math"
"math/bits"
)
@@ -119,6 +120,7 @@ func checkFunc(f *Func) {
// Check to make sure aux values make sense.
canHaveAux := false
canHaveAuxInt := false
// TODO: enforce types of Aux in this switch (like auxString does below)
switch opcodeTable[v.Op].auxType {
case auxNone:
case auxBool:
@@ -158,7 +160,12 @@ func checkFunc(f *Func) {
if math.IsNaN(v.AuxFloat()) {
f.Fatalf("value %v has an AuxInt that encodes a NaN", v)
}
case auxString, auxSym, auxTyp, auxArchSpecific:
case auxString:
if _, ok := v.Aux.(string); !ok {
f.Fatalf("value %v has Aux type %T, want string", v, v.Aux)
}
canHaveAux = true
case auxSym, auxTyp:
canHaveAux = true
case auxSymOff, auxSymValAndOff, auxTypSize:
canHaveAuxInt = true
@@ -168,6 +175,16 @@ func checkFunc(f *Func) {
f.Fatalf("bad type %T for CCop in %v", v.Aux, v)
}
canHaveAux = true
case auxS390XCCMask:
if _, ok := v.Aux.(s390x.CCMask); !ok {
f.Fatalf("bad type %T for S390XCCMask in %v", v.Aux, v)
}
canHaveAux = true
case auxS390XRotateParams:
if _, ok := v.Aux.(s390x.RotateParams); !ok {
f.Fatalf("bad type %T for S390XRotateParams in %v", v.Aux, v)
}
canHaveAux = true
default:
f.Fatalf("unknown aux type for %s", v.Op)
}

@@ -55,7 +55,7 @@ func Compile(f *Func) {
if f.Log() {
printFunc(f)
}
f.HTMLWriter.WriteFunc("start", "start", f)
f.HTMLWriter.WritePhase("start", "start")
if BuildDump != "" && BuildDump == f.Name {
f.dumpFile("build")
}
@@ -111,7 +111,7 @@ func Compile(f *Func) {
f.Logf(" pass %s end %s\n", p.name, stats)
printFunc(f)
}
f.HTMLWriter.WriteFunc(phaseName, fmt.Sprintf("%s <span class=\"stats\">%s</span>", phaseName, stats), f)
f.HTMLWriter.WritePhase(phaseName, fmt.Sprintf("%s <span class=\"stats\">%s</span>", phaseName, stats))
}
if p.time || p.mem {
// Surround timing information w/ enough context to allow comparisons.
@@ -136,6 +136,11 @@ func Compile(f *Func) {
}
}
if f.HTMLWriter != nil {
// Ensure we write any pending phases to the html
f.HTMLWriter.flushPhases()
}
if f.ruleMatches != nil {
var keys []string
for key := range f.ruleMatches {

@@ -135,7 +135,7 @@ type Frontend interface {
Logger
// StringData returns a symbol pointing to the given string's contents.
StringData(string) interface{} // returns *gc.Sym
StringData(string) *obj.LSym
// Auto returns a Node for an auto variable of the given type.
// The SSA compiler uses this function to allocate space for spills.

@@ -242,8 +242,9 @@ func deadcode(f *Func) {
f.NamedValues[name] = values[:j]
}
}
for k := len(f.Names) - 1; k >= i; k-- {
f.Names[k] = LocalSlot{}
clearNames := f.Names[i:]
for j := range clearNames {
clearNames[j] = LocalSlot{}
}
f.Names = f.Names[:i]
@@ -295,12 +296,7 @@ func deadcode(f *Func) {
f.freeValue(v)
}
}
// aid GC
tail := b.Values[i:]
for j := range tail {
tail[j] = nil
}
b.Values = b.Values[:i]
b.truncateValues(i)
}
// Remove dead blocks from WBLoads list.
@@ -311,8 +307,9 @@ func deadcode(f *Func) {
i++
}
}
for j := i; j < len(f.WBLoads); j++ {
f.WBLoads[j] = nil
clearWBLoads := f.WBLoads[i:]
for j := range clearWBLoads {
clearWBLoads[j] = nil
}
f.WBLoads = f.WBLoads[:i]

@@ -95,6 +95,8 @@ var optimizedLibs = (!strings.Contains(gogcflags, "-N") && !strings.Contains(gog
// go test debug_test.go -args -u -d
func TestNexting(t *testing.T) {
testenv.SkipFlaky(t, 37404)
skipReasons := "" // Many possible skip reasons, list all that apply
if testing.Short() {
skipReasons = "not run in short mode; "

@@ -90,7 +90,7 @@ func (d *DummyAuto) IsAutoTmp() bool {
return true
}
func (DummyFrontend) StringData(s string) interface{} {
func (DummyFrontend) StringData(s string) *obj.LSym {
return nil
}
func (DummyFrontend) Auto(pos src.XPos, t *types.Type) GCNode {

@@ -106,7 +106,7 @@ func flagalloc(f *Func) {
}
// Add flag spill and recomputation where they are needed.
// TODO: Remove original instructions if they are never used.
var remove []*Value // values that should be checked for possible removal
var oldSched []*Value
for _, b := range f.Blocks {
oldSched = append(oldSched[:0], b.Values...)
@@ -131,6 +131,7 @@ func flagalloc(f *Func) {
// If v will be spilled, and v uses memory, then we must split it
// into a load + a flag generator.
if spill[v.ID] && v.MemoryArg() != nil {
remove = append(remove, v)
if !f.Config.splitLoad(v) {
f.Fatalf("can't split flag generator: %s", v.LongString())
}
@@ -164,6 +165,7 @@ func flagalloc(f *Func) {
for i, v := range b.ControlValues() {
if v != flag && v.Type.IsFlags() {
// Recalculate control value.
remove = append(remove, v)
c := copyFlags(v, b)
b.ReplaceControl(i, c)
flag = v
@@ -172,12 +174,15 @@ func flagalloc(f *Func) {
if v := end[b.ID]; v != nil && v != flag {
// Need to reissue flag generator for use by
// subsequent blocks.
remove = append(remove, v)
copyFlags(v, b)
// Note: this flag generator is not properly linked up
// with the flag users. This breaks the SSA representation.
// We could fix up the users with another pass, but for now
// we'll just leave it. (Regalloc has the same issue for
// we'll just leave it. (Regalloc has the same issue for
// standard regs, and it runs next.)
// For this reason, take care not to add this flag
// generator to the remove list.
}
}
@@ -185,6 +190,58 @@ func flagalloc(f *Func) {
for _, b := range f.Blocks {
b.FlagsLiveAtEnd = end[b.ID] != nil
}
const go115flagallocdeadcode = true
if !go115flagallocdeadcode {
return
}
// Remove any now-dead values.
// The number of values to remove is likely small,
// and removing them requires processing all values in a block,
// so minimize the number of blocks that we touch.
// Shrink remove to contain only dead values, and clobber those dead values.
for i := 0; i < len(remove); i++ {
v := remove[i]
if v.Uses == 0 {
v.reset(OpInvalid)
continue
}
// Remove v.
last := len(remove) - 1
remove[i] = remove[last]
remove[last] = nil
remove = remove[:last]
i-- // reprocess value at i
}
if len(remove) == 0 {
return
}
removeBlocks := f.newSparseSet(f.NumBlocks())
defer f.retSparseSet(removeBlocks)
for _, v := range remove {
removeBlocks.add(v.Block.ID)
}
// Process affected blocks, preserving value order.
for _, b := range f.Blocks {
if !removeBlocks.contains(b.ID) {
continue
}
i := 0
for j := 0; j < len(b.Values); j++ {
v := b.Values[j]
if v.Op == OpInvalid {
continue
}
b.Values[i] = v
i++
}
b.truncateValues(i)
}
}
func (v *Value) clobbersFlags() bool {

@@ -20,6 +20,7 @@ const (
fuseTypePlain fuseType = 1 << iota
fuseTypeIf
fuseTypeIntInRange
fuseTypeShortCircuit
)
// fuse simplifies control flow by joining basic blocks.
@@ -38,6 +39,9 @@ func fuse(f *Func, typ fuseType) {
if typ&fuseTypePlain != 0 {
changed = fuseBlockPlain(b) || changed
}
if typ&fuseTypeShortCircuit != 0 {
changed = shortcircuitBlock(b) || changed
}
}
if changed {
f.invalidateCFG()

@@ -531,13 +531,13 @@ func init() {
// There are three of these functions so that they can have three different register inputs.
// When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
// default registers to match so we don't need to copy registers around unnecessarily.
{name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{dx, bx}}, typ: "Mem"}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{cx, dx}}, typ: "Mem"}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{ax, cx}}, typ: "Mem"}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{dx, bx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{cx, dx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{ax, cx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
// Extend ops are the same as Bounds ops except the indexes are 64-bit.
{name: "LoweredPanicExtendA", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, dx, bx}}, typ: "Mem"}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
{name: "LoweredPanicExtendB", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, cx, dx}}, typ: "Mem"}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
{name: "LoweredPanicExtendC", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, ax, cx}}, typ: "Mem"}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
{name: "LoweredPanicExtendA", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, dx, bx}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
{name: "LoweredPanicExtendB", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, cx, dx}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
{name: "LoweredPanicExtendC", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, ax, cx}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
// Constant flag values. For any comparison, there are 5 possible
// outcomes: the three from the signed total order (<,==,>) and the

@@ -4,6 +4,8 @@
// See the top of AMD64splitload.rules for discussion of these rules.
(CMP(L|W|B)load {sym} [off] ptr x mem) -> (CMP(L|W|B) (MOV(L|W|B)load {sym} [off] ptr mem) x)
(CMP(L|W|B)load {sym} [off] ptr x mem) => (CMP(L|W|B) (MOV(L|W|B)load {sym} [off] ptr mem) x)
(CMP(L|W|B)constload {sym} [vo] ptr mem) -> (CMP(L|W|B)const (MOV(L|W|B)load {sym} [offOnly(vo)] ptr mem) [valOnly(vo)])
(CMPLconstload {sym} [vo] ptr mem) => (CMPLconst (MOVLload {sym} [vo.Off32()] ptr mem) [vo.Val32()])
(CMPWconstload {sym} [vo] ptr mem) => (CMPWconst (MOVWload {sym} [vo.Off32()] ptr mem) [vo.Val16()])
(CMPBconstload {sym} [vo] ptr mem) => (CMPBconst (MOVBload {sym} [vo.Off32()] ptr mem) [vo.Val8()])

@@ -3,311 +3,302 @@
// license that can be found in the LICENSE file.
// Lowering arithmetic
(Add(64|32|16|8) ...) -> (ADD(Q|L|L|L) ...)
(AddPtr ...) -> (ADDQ ...)
(Add(32|64)F ...) -> (ADDS(S|D) ...)
(Add(64|32|16|8) ...) => (ADD(Q|L|L|L) ...)
(AddPtr ...) => (ADDQ ...)
(Add(32|64)F ...) => (ADDS(S|D) ...)
(Sub(64|32|16|8) ...) -> (SUB(Q|L|L|L) ...)
(SubPtr ...) -> (SUBQ ...)
(Sub(32|64)F ...) -> (SUBS(S|D) ...)
(Sub(64|32|16|8) ...) => (SUB(Q|L|L|L) ...)
(SubPtr ...) => (SUBQ ...)
(Sub(32|64)F ...) => (SUBS(S|D) ...)
(Mul(64|32|16|8) ...) -> (MUL(Q|L|L|L) ...)
(Mul(32|64)F ...) -> (MULS(S|D) ...)
(Mul(64|32|16|8) ...) => (MUL(Q|L|L|L) ...)
(Mul(32|64)F ...) => (MULS(S|D) ...)
(Select0 (Mul64uover x y)) -> (Select0 <typ.UInt64> (MULQU x y))
(Select0 (Mul32uover x y)) -> (Select0 <typ.UInt32> (MULLU x y))
(Select1 (Mul(64|32)uover x y)) -> (SETO (Select1 <types.TypeFlags> (MUL(Q|L)U x y)))
(Select0 (Mul64uover x y)) => (Select0 <typ.UInt64> (MULQU x y))
(Select0 (Mul32uover x y)) => (Select0 <typ.UInt32> (MULLU x y))
(Select1 (Mul(64|32)uover x y)) => (SETO (Select1 <types.TypeFlags> (MUL(Q|L)U x y)))
(Hmul(64|32) ...) -> (HMUL(Q|L) ...)
(Hmul(64|32)u ...) -> (HMUL(Q|L)U ...)
(Hmul(64|32) ...) => (HMUL(Q|L) ...)
(Hmul(64|32)u ...) => (HMUL(Q|L)U ...)
(Div(64|32|16) [a] x y) -> (Select0 (DIV(Q|L|W) [a] x y))
(Div8 x y) -> (Select0 (DIVW (SignExt8to16 x) (SignExt8to16 y)))
(Div(64|32|16)u x y) -> (Select0 (DIV(Q|L|W)U x y))
(Div8u x y) -> (Select0 (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y)))
(Div(32|64)F ...) -> (DIVS(S|D) ...)
(Div(64|32|16) [a] x y) => (Select0 (DIV(Q|L|W) [a] x y))
(Div8 x y) => (Select0 (DIVW (SignExt8to16 x) (SignExt8to16 y)))
(Div(64|32|16)u x y) => (Select0 (DIV(Q|L|W)U x y))
(Div8u x y) => (Select0 (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y)))
(Div(32|64)F ...) => (DIVS(S|D) ...)
(Select0 (Add64carry x y c)) ->
(Select0 (Add64carry x y c)) =>
(Select0 <typ.UInt64> (ADCQ x y (Select1 <types.TypeFlags> (NEGLflags c))))
(Select1 (Add64carry x y c)) ->
(Select1 (Add64carry x y c)) =>
(NEGQ <typ.UInt64> (SBBQcarrymask <typ.UInt64> (Select1 <types.TypeFlags> (ADCQ x y (Select1 <types.TypeFlags> (NEGLflags c))))))
(Select0 (Sub64borrow x y c)) ->
(Select0 (Sub64borrow x y c)) =>
(Select0 <typ.UInt64> (SBBQ x y (Select1 <types.TypeFlags> (NEGLflags c))))
(Select1 (Sub64borrow x y c)) ->
(Select1 (Sub64borrow x y c)) =>
(NEGQ <typ.UInt64> (SBBQcarrymask <typ.UInt64> (Select1 <types.TypeFlags> (SBBQ x y (Select1 <types.TypeFlags> (NEGLflags c))))))
// Optimize ADCQ and friends
(ADCQ x (MOVQconst [c]) carry) && is32Bit(c) -> (ADCQconst x [c] carry)
(ADCQ x y (FlagEQ)) -> (ADDQcarry x y)
(ADCQconst x [c] (FlagEQ)) -> (ADDQconstcarry x [c])
(ADDQcarry x (MOVQconst [c])) && is32Bit(c) -> (ADDQconstcarry x [c])
(SBBQ x (MOVQconst [c]) borrow) && is32Bit(c) -> (SBBQconst x [c] borrow)
(SBBQ x y (FlagEQ)) -> (SUBQborrow x y)
(SBBQconst x [c] (FlagEQ)) -> (SUBQconstborrow x [c])
(SUBQborrow x (MOVQconst [c])) && is32Bit(c) -> (SUBQconstborrow x [c])
(Select1 (NEGLflags (MOVQconst [0]))) -> (FlagEQ)
(Select1 (NEGLflags (NEGQ (SBBQcarrymask x)))) -> x
(ADCQ x (MOVQconst [c]) carry) && is32Bit(c) => (ADCQconst x [int32(c)] carry)
(ADCQ x y (FlagEQ)) => (ADDQcarry x y)
(ADCQconst x [c] (FlagEQ)) => (ADDQconstcarry x [c])
(ADDQcarry x (MOVQconst [c])) && is32Bit(c) => (ADDQconstcarry x [int32(c)])
(SBBQ x (MOVQconst [c]) borrow) && is32Bit(c) => (SBBQconst x [int32(c)] borrow)
(SBBQ x y (FlagEQ)) => (SUBQborrow x y)
(SBBQconst x [c] (FlagEQ)) => (SUBQconstborrow x [c])
(SUBQborrow x (MOVQconst [c])) && is32Bit(c) => (SUBQconstborrow x [int32(c)])
(Select1 (NEGLflags (MOVQconst [0]))) => (FlagEQ)
(Select1 (NEGLflags (NEGQ (SBBQcarrymask x)))) => x
(Mul64uhilo ...) -> (MULQU2 ...)
(Div128u ...) -> (DIVQU2 ...)
(Mul64uhilo ...) => (MULQU2 ...)
(Div128u ...) => (DIVQU2 ...)
(Avg64u ...) -> (AVGQU ...)
(Avg64u ...) => (AVGQU ...)
(Mod(64|32|16) [a] x y) -> (Select1 (DIV(Q|L|W) [a] x y))
(Mod8 x y) -> (Select1 (DIVW (SignExt8to16 x) (SignExt8to16 y)))
(Mod(64|32|16)u x y) -> (Select1 (DIV(Q|L|W)U x y))
(Mod8u x y) -> (Select1 (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y)))
(Mod(64|32|16) [a] x y) => (Select1 (DIV(Q|L|W) [a] x y))
(Mod8 x y) => (Select1 (DIVW (SignExt8to16 x) (SignExt8to16 y)))
(Mod(64|32|16)u x y) => (Select1 (DIV(Q|L|W)U x y))
(Mod8u x y) => (Select1 (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y)))
(And(64|32|16|8) ...) -> (AND(Q|L|L|L) ...)
(Or(64|32|16|8) ...) -> (OR(Q|L|L|L) ...)
(Xor(64|32|16|8) ...) -> (XOR(Q|L|L|L) ...)
(Com(64|32|16|8) ...) -> (NOT(Q|L|L|L) ...)
(And(64|32|16|8) ...) => (AND(Q|L|L|L) ...)
(Or(64|32|16|8) ...) => (OR(Q|L|L|L) ...)
(Xor(64|32|16|8) ...) => (XOR(Q|L|L|L) ...)
(Com(64|32|16|8) ...) => (NOT(Q|L|L|L) ...)
(Neg(64|32|16|8) ...) -> (NEG(Q|L|L|L) ...)
(Neg32F x) -> (PXOR x (MOVSSconst <typ.Float32> [auxFrom32F(float32(math.Copysign(0, -1)))]))
(Neg64F x) -> (PXOR x (MOVSDconst <typ.Float64> [auxFrom64F(math.Copysign(0, -1))]))
(Neg(64|32|16|8) ...) => (NEG(Q|L|L|L) ...)
(Neg32F x) => (PXOR x (MOVSSconst <typ.Float32> [float32(math.Copysign(0, -1))]))
(Neg64F x) => (PXOR x (MOVSDconst <typ.Float64> [math.Copysign(0, -1)]))
// Lowering boolean ops
(AndB ...) -> (ANDL ...)
(OrB ...) -> (ORL ...)
(Not x) -> (XORLconst [1] x)
(AndB ...) => (ANDL ...)
(OrB ...) => (ORL ...)
(Not x) => (XORLconst [1] x)
// Lowering pointer arithmetic
(OffPtr [off] ptr) && is32Bit(off) -> (ADDQconst [off] ptr)
(OffPtr [off] ptr) -> (ADDQ (MOVQconst [off]) ptr)
(OffPtr [off] ptr) && is32Bit(off) => (ADDQconst [int32(off)] ptr)
(OffPtr [off] ptr) => (ADDQ (MOVQconst [off]) ptr)
// Lowering other arithmetic
(Ctz64 <t> x) -> (CMOVQEQ (Select0 <t> (BSFQ x)) (MOVQconst <t> [64]) (Select1 <types.TypeFlags> (BSFQ x)))
(Ctz32 x) -> (Select0 (BSFQ (BTSQconst <typ.UInt64> [32] x)))
(Ctz16 x) -> (BSFL (BTSLconst <typ.UInt32> [16] x))
(Ctz8 x) -> (BSFL (BTSLconst <typ.UInt32> [ 8] x))
(Ctz64 <t> x) => (CMOVQEQ (Select0 <t> (BSFQ x)) (MOVQconst <t> [64]) (Select1 <types.TypeFlags> (BSFQ x)))
(Ctz32 x) => (Select0 (BSFQ (BTSQconst <typ.UInt64> [32] x)))
(Ctz16 x) => (BSFL (BTSLconst <typ.UInt32> [16] x))
(Ctz8 x) => (BSFL (BTSLconst <typ.UInt32> [ 8] x))
(Ctz64NonZero x) -> (Select0 (BSFQ x))
(Ctz32NonZero ...) -> (BSFL ...)
(Ctz16NonZero ...) -> (BSFL ...)
(Ctz8NonZero ...) -> (BSFL ...)
(Ctz64NonZero x) => (Select0 (BSFQ x))
(Ctz32NonZero ...) => (BSFL ...)
(Ctz16NonZero ...) => (BSFL ...)
(Ctz8NonZero ...) => (BSFL ...)
// BitLen64 of a 64 bit value x requires checking whether x == 0, since BSRQ is undefined when x == 0.
// However, for zero-extended values, we can cheat a bit, and calculate
// BSR(x<<1 + 1), which is guaranteed to be non-zero, and which conveniently
// places the index of the highest set bit where we want it.
(BitLen64 <t> x) -> (ADDQconst [1] (CMOVQEQ <t> (Select0 <t> (BSRQ x)) (MOVQconst <t> [-1]) (Select1 <types.TypeFlags> (BSRQ x))))
(BitLen32 x) -> (Select0 (BSRQ (LEAQ1 <typ.UInt64> [1] (MOVLQZX <typ.UInt64> x) (MOVLQZX <typ.UInt64> x))))
(BitLen16 x) -> (BSRL (LEAL1 <typ.UInt32> [1] (MOVWQZX <typ.UInt32> x) (MOVWQZX <typ.UInt32> x)))
(BitLen8 x) -> (BSRL (LEAL1 <typ.UInt32> [1] (MOVBQZX <typ.UInt32> x) (MOVBQZX <typ.UInt32> x)))
(BitLen64 <t> x) => (ADDQconst [1] (CMOVQEQ <t> (Select0 <t> (BSRQ x)) (MOVQconst <t> [-1]) (Select1 <types.TypeFlags> (BSRQ x))))
(BitLen32 x) => (Select0 (BSRQ (LEAQ1 <typ.UInt64> [1] (MOVLQZX <typ.UInt64> x) (MOVLQZX <typ.UInt64> x))))
(BitLen16 x) => (BSRL (LEAL1 <typ.UInt32> [1] (MOVWQZX <typ.UInt32> x) (MOVWQZX <typ.UInt32> x)))
(BitLen8 x) => (BSRL (LEAL1 <typ.UInt32> [1] (MOVBQZX <typ.UInt32> x) (MOVBQZX <typ.UInt32> x)))
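As a quick numeric check of the cheat described above: bits.Len reports the index of the highest set bit plus one, so BSR(y) corresponds to bits.Len64(y)-1, and the standalone snippet below prints matching values for BitLen32(x) and BSR(x<<1 + 1), including at x = 0 where the +1 keeps the BSR argument nonzero.

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	for _, x := range []uint32{0, 5, 1 << 31} {
		bsr := bits.Len64(uint64(x)<<1+1) - 1 // index of the highest set bit of x<<1 + 1
		fmt.Println(x, bits.Len32(x), bsr)    // the last two columns are always equal
	}
}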
(Bswap(64|32) ...) -> (BSWAP(Q|L) ...)
(Bswap(64|32) ...) => (BSWAP(Q|L) ...)
(PopCount(64|32) ...) -> (POPCNT(Q|L) ...)
(PopCount16 x) -> (POPCNTL (MOVWQZX <typ.UInt32> x))
(PopCount8 x) -> (POPCNTL (MOVBQZX <typ.UInt32> x))
(PopCount(64|32) ...) => (POPCNT(Q|L) ...)
(PopCount16 x) => (POPCNTL (MOVWQZX <typ.UInt32> x))
(PopCount8 x) => (POPCNTL (MOVBQZX <typ.UInt32> x))
(Sqrt ...) -> (SQRTSD ...)
(Sqrt ...) => (SQRTSD ...)
(RoundToEven x) -> (ROUNDSD [0] x)
(Floor x) -> (ROUNDSD [1] x)
(Ceil x) -> (ROUNDSD [2] x)
(Trunc x) -> (ROUNDSD [3] x)
(FMA x y z) -> (VFMADD231SD z x y)
(RoundToEven x) => (ROUNDSD [0] x)
(Floor x) => (ROUNDSD [1] x)
(Ceil x) => (ROUNDSD [2] x)
(Trunc x) => (ROUNDSD [3] x)
(FMA x y z) => (VFMADD231SD z x y)
// Lowering extension
// Note: we always extend to 64 bits even though some ops don't need that many result bits.
(SignExt8to16 ...) -> (MOVBQSX ...)
(SignExt8to32 ...) -> (MOVBQSX ...)
(SignExt8to64 ...) -> (MOVBQSX ...)
(SignExt16to32 ...) -> (MOVWQSX ...)
(SignExt16to64 ...) -> (MOVWQSX ...)
(SignExt32to64 ...) -> (MOVLQSX ...)
(SignExt8to16 ...) => (MOVBQSX ...)
(SignExt8to32 ...) => (MOVBQSX ...)
(SignExt8to64 ...) => (MOVBQSX ...)
(SignExt16to32 ...) => (MOVWQSX ...)
(SignExt16to64 ...) => (MOVWQSX ...)
(SignExt32to64 ...) => (MOVLQSX ...)
(ZeroExt8to16 ...) -> (MOVBQZX ...)
(ZeroExt8to32 ...) -> (MOVBQZX ...)
(ZeroExt8to64 ...) -> (MOVBQZX ...)
(ZeroExt16to32 ...) -> (MOVWQZX ...)
(ZeroExt16to64 ...) -> (MOVWQZX ...)
(ZeroExt32to64 ...) -> (MOVLQZX ...)
(ZeroExt8to16 ...) => (MOVBQZX ...)
(ZeroExt8to32 ...) => (MOVBQZX ...)
(ZeroExt8to64 ...) => (MOVBQZX ...)
(ZeroExt16to32 ...) => (MOVWQZX ...)
(ZeroExt16to64 ...) => (MOVWQZX ...)
(ZeroExt32to64 ...) => (MOVLQZX ...)
(Slicemask <t> x) -> (SARQconst (NEGQ <t> x) [63])
(Slicemask <t> x) => (SARQconst (NEGQ <t> x) [63])
(SpectreIndex <t> x y) -> (CMOVQCC x (MOVQconst [0]) (CMPQ x y))
(SpectreSliceIndex <t> x y) -> (CMOVQHI x (MOVQconst [0]) (CMPQ x y))
(SpectreIndex <t> x y) => (CMOVQCC x (MOVQconst [0]) (CMPQ x y))
(SpectreSliceIndex <t> x y) => (CMOVQHI x (MOVQconst [0]) (CMPQ x y))
// Lowering truncation
// Because we ignore high parts of registers, truncates are just copies.
(Trunc16to8 ...) -> (Copy ...)
(Trunc32to8 ...) -> (Copy ...)
(Trunc32to16 ...) -> (Copy ...)
(Trunc64to8 ...) -> (Copy ...)
(Trunc64to16 ...) -> (Copy ...)
(Trunc64to32 ...) -> (Copy ...)
(Trunc16to8 ...) => (Copy ...)
(Trunc32to8 ...) => (Copy ...)
(Trunc32to16 ...) => (Copy ...)
(Trunc64to8 ...) => (Copy ...)
(Trunc64to16 ...) => (Copy ...)
(Trunc64to32 ...) => (Copy ...)
// Lowering float <-> int
(Cvt32to32F ...) -> (CVTSL2SS ...)
(Cvt32to64F ...) -> (CVTSL2SD ...)
(Cvt64to32F ...) -> (CVTSQ2SS ...)
(Cvt64to64F ...) -> (CVTSQ2SD ...)
(Cvt32to32F ...) => (CVTSL2SS ...)
(Cvt32to64F ...) => (CVTSL2SD ...)
(Cvt64to32F ...) => (CVTSQ2SS ...)
(Cvt64to64F ...) => (CVTSQ2SD ...)
(Cvt32Fto32 ...) -> (CVTTSS2SL ...)
(Cvt32Fto64 ...) -> (CVTTSS2SQ ...)
(Cvt64Fto32 ...) -> (CVTTSD2SL ...)
(Cvt64Fto64 ...) -> (CVTTSD2SQ ...)
(Cvt32Fto32 ...) => (CVTTSS2SL ...)
(Cvt32Fto64 ...) => (CVTTSS2SQ ...)
(Cvt64Fto32 ...) => (CVTTSD2SL ...)
(Cvt64Fto64 ...) => (CVTTSD2SQ ...)
(Cvt32Fto64F ...) -> (CVTSS2SD ...)
(Cvt64Fto32F ...) -> (CVTSD2SS ...)
(Cvt32Fto64F ...) => (CVTSS2SD ...)
(Cvt64Fto32F ...) => (CVTSD2SS ...)
(Round(32|64)F ...) -> (Copy ...)
(Round(32|64)F ...) => (Copy ...)
(CvtBoolToUint8 ...) -> (Copy ...)
(CvtBoolToUint8 ...) => (Copy ...)
// Lowering shifts
// Unsigned shifts need to return 0 if shift amount is >= width of shifted value.
// result = (arg << shift) & (shift >= argbits ? 0 : 0xffffffffffffffff)
(Lsh64x(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMP(Q|L|W|B)const y [64])))
(Lsh32x(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
(Lsh16x(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
(Lsh8x(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
(Lsh64x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMP(Q|L|W|B)const y [64])))
(Lsh32x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
(Lsh16x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
(Lsh8x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) -> (SHLQ x y)
(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) -> (SHLL x y)
(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) -> (SHLL x y)
(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) -> (SHLL x y)
(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLQ x y)
(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLL x y)
(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLL x y)
(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLL x y)
(Rsh64Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (ANDQ (SHRQ <t> x y) (SBBQcarrymask <t> (CMP(Q|L|W|B)const y [64])))
(Rsh32Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
(Rsh16Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (ANDL (SHRW <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [16])))
(Rsh8Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (ANDL (SHRB <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [8])))
(Rsh64Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDQ (SHRQ <t> x y) (SBBQcarrymask <t> (CMP(Q|L|W|B)const y [64])))
(Rsh32Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
(Rsh16Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRW <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [16])))
(Rsh8Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRB <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [8])))
(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) -> (SHRQ x y)
(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) -> (SHRL x y)
(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) -> (SHRW x y)
(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) -> (SHRB x y)
(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRQ x y)
(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRL x y)
(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRW x y)
(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRB x y)
// Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value.
// We implement this by setting the shift value to -1 (all ones) if the shift value is >= width.
(Rsh64x(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (SARQ <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [64])))))
(Rsh32x(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (SARL <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [32])))))
(Rsh16x(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (SARW <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [16])))))
(Rsh8x(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (SARB <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [8])))))
(Rsh64x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARQ <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [64])))))
(Rsh32x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARL <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [32])))))
(Rsh16x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARW <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [16])))))
(Rsh8x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARB <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [8])))))
(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) -> (SARQ x y)
(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) -> (SARL x y)
(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) -> (SARW x y)
(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) -> (SARB x y)
(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SARQ x y)
(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SARL x y)
(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SARW x y)
(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SARB x y)
// Lowering comparisons
(Less(64|32|16|8) x y) -> (SETL (CMP(Q|L|W|B) x y))
(Less(64|32|16|8)U x y) -> (SETB (CMP(Q|L|W|B) x y))
// Use SETGF with reversed operands to dodge NaN case
(Less(32|64)F x y) -> (SETGF (UCOMIS(S|D) y x))
(Leq(64|32|16|8) x y) -> (SETLE (CMP(Q|L|W|B) x y))
(Leq(64|32|16|8)U x y) -> (SETBE (CMP(Q|L|W|B) x y))
// Use SETGEF with reversed operands to dodge NaN case
(Leq(32|64)F x y) -> (SETGEF (UCOMIS(S|D) y x))
// Lowering integer comparisons
(Less(64|32|16|8) x y) => (SETL (CMP(Q|L|W|B) x y))
(Less(64|32|16|8)U x y) => (SETB (CMP(Q|L|W|B) x y))
(Leq(64|32|16|8) x y) => (SETLE (CMP(Q|L|W|B) x y))
(Leq(64|32|16|8)U x y) => (SETBE (CMP(Q|L|W|B) x y))
(Eq(Ptr|64|32|16|8|B) x y) => (SETEQ (CMP(Q|Q|L|W|B|B) x y))
(Neq(Ptr|64|32|16|8|B) x y) => (SETNE (CMP(Q|Q|L|W|B|B) x y))
// Lowering floating point comparisons
// Note Go assembler gets UCOMISx operand order wrong, but it is right here
// Bug is accommodated at generation of assembly language.
(Greater(32|64)F x y) -> (SETGF (UCOMIS(S|D) x y))
// Note Go assembler gets UCOMISx operand order wrong, but it is right here
// Bug is accommodated at generation of assembly language.
(Geq(32|64)F x y) -> (SETGEF (UCOMIS(S|D) x y))
(Eq(64|32|16|8|B) x y) -> (SETEQ (CMP(Q|L|W|B|B) x y))
(EqPtr x y) -> (SETEQ (CMPQ x y))
(Eq(32|64)F x y) -> (SETEQF (UCOMIS(S|D) x y))
(Neq(64|32|16|8|B) x y) -> (SETNE (CMP(Q|L|W|B|B) x y))
(NeqPtr x y) -> (SETNE (CMPQ x y))
(Neq(32|64)F x y) -> (SETNEF (UCOMIS(S|D) x y))
// and the operands are reversed when generating assembly language.
(Eq(32|64)F x y) => (SETEQF (UCOMIS(S|D) x y))
(Neq(32|64)F x y) => (SETNEF (UCOMIS(S|D) x y))
// Use SETGF/SETGEF with reversed operands to dodge NaN case.
(Less(32|64)F x y) => (SETGF (UCOMIS(S|D) y x))
(Leq(32|64)F x y) => (SETGEF (UCOMIS(S|D) y x))
// Lowering loads
(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) -> (MOVQload ptr mem)
(Load <t> ptr mem) && is32BitInt(t) -> (MOVLload ptr mem)
(Load <t> ptr mem) && is16BitInt(t) -> (MOVWload ptr mem)
(Load <t> ptr mem) && (t.IsBoolean() || is8BitInt(t)) -> (MOVBload ptr mem)
(Load <t> ptr mem) && is32BitFloat(t) -> (MOVSSload ptr mem)
(Load <t> ptr mem) && is64BitFloat(t) -> (MOVSDload ptr mem)
(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVQload ptr mem)
(Load <t> ptr mem) && is32BitInt(t) => (MOVLload ptr mem)
(Load <t> ptr mem) && is16BitInt(t) => (MOVWload ptr mem)
(Load <t> ptr mem) && (t.IsBoolean() || is8BitInt(t)) => (MOVBload ptr mem)
(Load <t> ptr mem) && is32BitFloat(t) => (MOVSSload ptr mem)
(Load <t> ptr mem) && is64BitFloat(t) => (MOVSDload ptr mem)
// Lowering stores
// These more-specific FP versions of Store pattern should come first.
(Store {t} ptr val mem) && t.(*types.Type).Size() == 8 && is64BitFloat(val.Type) -> (MOVSDstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 4 && is32BitFloat(val.Type) -> (MOVSSstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVSDstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVSSstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 8 -> (MOVQstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 4 -> (MOVLstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 2 -> (MOVWstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 1 -> (MOVBstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 8 => (MOVQstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 4 => (MOVLstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 2 => (MOVWstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
// Lowering moves
(Move [0] _ _ mem) -> mem
(Move [1] dst src mem) -> (MOVBstore dst (MOVBload src mem) mem)
(Move [2] dst src mem) -> (MOVWstore dst (MOVWload src mem) mem)
(Move [4] dst src mem) -> (MOVLstore dst (MOVLload src mem) mem)
(Move [8] dst src mem) -> (MOVQstore dst (MOVQload src mem) mem)
(Move [16] dst src mem) && config.useSSE -> (MOVOstore dst (MOVOload src mem) mem)
(Move [16] dst src mem) && !config.useSSE ->
(Move [0] _ _ mem) => mem
(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem)
(Move [2] dst src mem) => (MOVWstore dst (MOVWload src mem) mem)
(Move [4] dst src mem) => (MOVLstore dst (MOVLload src mem) mem)
(Move [8] dst src mem) => (MOVQstore dst (MOVQload src mem) mem)
(Move [16] dst src mem) && config.useSSE => (MOVOstore dst (MOVOload src mem) mem)
(Move [16] dst src mem) && !config.useSSE =>
(MOVQstore [8] dst (MOVQload [8] src mem)
(MOVQstore dst (MOVQload src mem) mem))
(Move [32] dst src mem) ->
(Move [32] dst src mem) =>
(Move [16]
(OffPtr <dst.Type> dst [16])
(OffPtr <src.Type> src [16])
(Move [16] dst src mem))
(Move [48] dst src mem) && config.useSSE ->
(Move [48] dst src mem) && config.useSSE =>
(Move [32]
(OffPtr <dst.Type> dst [16])
(OffPtr <src.Type> src [16])
(Move [16] dst src mem))
(Move [64] dst src mem) && config.useSSE ->
(Move [64] dst src mem) && config.useSSE =>
(Move [32]
(OffPtr <dst.Type> dst [32])
(OffPtr <src.Type> src [32])
(Move [32] dst src mem))
(Move [3] dst src mem) ->
(Move [3] dst src mem) =>
(MOVBstore [2] dst (MOVBload [2] src mem)
(MOVWstore dst (MOVWload src mem) mem))
(Move [5] dst src mem) ->
(Move [5] dst src mem) =>
(MOVBstore [4] dst (MOVBload [4] src mem)
(MOVLstore dst (MOVLload src mem) mem))
(Move [6] dst src mem) ->
(Move [6] dst src mem) =>
(MOVWstore [4] dst (MOVWload [4] src mem)
(MOVLstore dst (MOVLload src mem) mem))
(Move [7] dst src mem) ->
(Move [7] dst src mem) =>
(MOVLstore [3] dst (MOVLload [3] src mem)
(MOVLstore dst (MOVLload src mem) mem))
(Move [9] dst src mem) ->
(Move [9] dst src mem) =>
(MOVBstore [8] dst (MOVBload [8] src mem)
(MOVQstore dst (MOVQload src mem) mem))
(Move [10] dst src mem) ->
(Move [10] dst src mem) =>
(MOVWstore [8] dst (MOVWload [8] src mem)
(MOVQstore dst (MOVQload src mem) mem))
(Move [12] dst src mem) ->
(Move [12] dst src mem) =>
(MOVLstore [8] dst (MOVLload [8] src mem)
(MOVQstore dst (MOVQload src mem) mem))
(Move [s] dst src mem) && s == 11 || s >= 13 && s <= 15 ->
(MOVQstore [s-8] dst (MOVQload [s-8] src mem)
(Move [s] dst src mem) && s == 11 || s >= 13 && s <= 15 =>
(MOVQstore [int32(s-8)] dst (MOVQload [int32(s-8)] src mem)
(MOVQstore dst (MOVQload src mem) mem))
// Adjust moves to be a multiple of 16 bytes.
(Move [s] dst src mem)
&& s > 16 && s%16 != 0 && s%16 <= 8 ->
&& s > 16 && s%16 != 0 && s%16 <= 8 =>
(Move [s-s%16]
(OffPtr <dst.Type> dst [s%16])
(OffPtr <src.Type> src [s%16])
(MOVQstore dst (MOVQload src mem) mem))
(Move [s] dst src mem)
&& s > 16 && s%16 != 0 && s%16 > 8 && config.useSSE ->
&& s > 16 && s%16 != 0 && s%16 > 8 && config.useSSE =>
(Move [s-s%16]
(OffPtr <dst.Type> dst [s%16])
(OffPtr <src.Type> src [s%16])
(MOVOstore dst (MOVOload src mem) mem))
(Move [s] dst src mem)
&& s > 16 && s%16 != 0 && s%16 > 8 && !config.useSSE ->
&& s > 16 && s%16 != 0 && s%16 > 8 && !config.useSSE =>
(Move [s-s%16]
(OffPtr <dst.Type> dst [s%16])
(OffPtr <src.Type> src [s%16])
@@ -317,81 +308,75 @@
// Medium copying uses a duff device.
(Move [s] dst src mem)
&& s > 64 && s <= 16*64 && s%16 == 0
&& !config.noDuffDevice ->
(DUFFCOPY [14*(64-s/16)] dst src mem)
// 14 and 64 are magic constants. 14 is the number of bytes to encode:
// MOVUPS (SI), X0
// ADDQ $16, SI
// MOVUPS X0, (DI)
// ADDQ $16, DI
// and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.
&& !config.noDuffDevice && logLargeCopy(v, s) =>
(DUFFCOPY [s] dst src mem)
// Large copying uses REP MOVSQ.
(Move [s] dst src mem) && (s > 16*64 || config.noDuffDevice) && s%8 == 0 ->
(Move [s] dst src mem) && (s > 16*64 || config.noDuffDevice) && s%8 == 0 && logLargeCopy(v, s) =>
(REPMOVSQ dst src (MOVQconst [s/8]) mem)
// Lowering Zero instructions
(Zero [0] _ mem) -> mem
(Zero [1] destptr mem) -> (MOVBstoreconst [0] destptr mem)
(Zero [2] destptr mem) -> (MOVWstoreconst [0] destptr mem)
(Zero [4] destptr mem) -> (MOVLstoreconst [0] destptr mem)
(Zero [8] destptr mem) -> (MOVQstoreconst [0] destptr mem)
(Zero [0] _ mem) => mem
(Zero [1] destptr mem) => (MOVBstoreconst [makeValAndOff32(0,0)] destptr mem)
(Zero [2] destptr mem) => (MOVWstoreconst [makeValAndOff32(0,0)] destptr mem)
(Zero [4] destptr mem) => (MOVLstoreconst [makeValAndOff32(0,0)] destptr mem)
(Zero [8] destptr mem) => (MOVQstoreconst [makeValAndOff32(0,0)] destptr mem)
(Zero [3] destptr mem) ->
(MOVBstoreconst [makeValAndOff(0,2)] destptr
(MOVWstoreconst [0] destptr mem))
(Zero [5] destptr mem) ->
(MOVBstoreconst [makeValAndOff(0,4)] destptr
(MOVLstoreconst [0] destptr mem))
(Zero [6] destptr mem) ->
(MOVWstoreconst [makeValAndOff(0,4)] destptr
(MOVLstoreconst [0] destptr mem))
(Zero [7] destptr mem) ->
(MOVLstoreconst [makeValAndOff(0,3)] destptr
(MOVLstoreconst [0] destptr mem))
(Zero [3] destptr mem) =>
(MOVBstoreconst [makeValAndOff32(0,2)] destptr
(MOVWstoreconst [makeValAndOff32(0,0)] destptr mem))
(Zero [5] destptr mem) =>
(MOVBstoreconst [makeValAndOff32(0,4)] destptr
(MOVLstoreconst [makeValAndOff32(0,0)] destptr mem))
(Zero [6] destptr mem) =>
(MOVWstoreconst [makeValAndOff32(0,4)] destptr
(MOVLstoreconst [makeValAndOff32(0,0)] destptr mem))
(Zero [7] destptr mem) =>
(MOVLstoreconst [makeValAndOff32(0,3)] destptr
(MOVLstoreconst [makeValAndOff32(0,0)] destptr mem))
// Strip off any fractional word zeroing.
(Zero [s] destptr mem) && s%8 != 0 && s > 8 && !config.useSSE ->
(Zero [s] destptr mem) && s%8 != 0 && s > 8 && !config.useSSE =>
(Zero [s-s%8] (OffPtr <destptr.Type> destptr [s%8])
(MOVQstoreconst [0] destptr mem))
(MOVQstoreconst [makeValAndOff32(0,0)] destptr mem))
// Zero small numbers of words directly.
(Zero [16] destptr mem) && !config.useSSE ->
(MOVQstoreconst [makeValAndOff(0,8)] destptr
(MOVQstoreconst [0] destptr mem))
(Zero [24] destptr mem) && !config.useSSE ->
(MOVQstoreconst [makeValAndOff(0,16)] destptr
(MOVQstoreconst [makeValAndOff(0,8)] destptr
(MOVQstoreconst [0] destptr mem)))
(Zero [32] destptr mem) && !config.useSSE ->
(MOVQstoreconst [makeValAndOff(0,24)] destptr
(MOVQstoreconst [makeValAndOff(0,16)] destptr
(MOVQstoreconst [makeValAndOff(0,8)] destptr
(MOVQstoreconst [0] destptr mem))))
(Zero [16] destptr mem) && !config.useSSE =>
(MOVQstoreconst [makeValAndOff32(0,8)] destptr
(MOVQstoreconst [makeValAndOff32(0,0)] destptr mem))
(Zero [24] destptr mem) && !config.useSSE =>
(MOVQstoreconst [makeValAndOff32(0,16)] destptr
(MOVQstoreconst [makeValAndOff32(0,8)] destptr
(MOVQstoreconst [makeValAndOff32(0,0)] destptr mem)))
(Zero [32] destptr mem) && !config.useSSE =>
(MOVQstoreconst [makeValAndOff32(0,24)] destptr
(MOVQstoreconst [makeValAndOff32(0,16)] destptr
(MOVQstoreconst [makeValAndOff32(0,8)] destptr
(MOVQstoreconst [makeValAndOff32(0,0)] destptr mem))))
(Zero [s] destptr mem) && s > 8 && s < 16 && config.useSSE ->
(MOVQstoreconst [makeValAndOff(0,s-8)] destptr
(MOVQstoreconst [0] destptr mem))
(Zero [s] destptr mem) && s > 8 && s < 16 && config.useSSE =>
(MOVQstoreconst [makeValAndOff32(0,int32(s-8))] destptr
(MOVQstoreconst [makeValAndOff32(0,0)] destptr mem))
// Adjust zeros to be a multiple of 16 bytes.
(Zero [s] destptr mem) && s%16 != 0 && s > 16 && s%16 > 8 && config.useSSE ->
(Zero [s] destptr mem) && s%16 != 0 && s > 16 && s%16 > 8 && config.useSSE =>
(Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16])
(MOVOstore destptr (MOVOconst [0]) mem))
(Zero [s] destptr mem) && s%16 != 0 && s > 16 && s%16 <= 8 && config.useSSE ->
(Zero [s] destptr mem) && s%16 != 0 && s > 16 && s%16 <= 8 && config.useSSE =>
(Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16])
(MOVQstoreconst [0] destptr mem))
(MOVQstoreconst [makeValAndOff32(0,0)] destptr mem))
(Zero [16] destptr mem) && config.useSSE ->
(Zero [16] destptr mem) && config.useSSE =>
(MOVOstore destptr (MOVOconst [0]) mem)
(Zero [32] destptr mem) && config.useSSE ->
(Zero [32] destptr mem) && config.useSSE =>
(MOVOstore (OffPtr <destptr.Type> destptr [16]) (MOVOconst [0])
(MOVOstore destptr (MOVOconst [0]) mem))
(Zero [48] destptr mem) && config.useSSE ->
(Zero [48] destptr mem) && config.useSSE =>
(MOVOstore (OffPtr <destptr.Type> destptr [32]) (MOVOconst [0])
(MOVOstore (OffPtr <destptr.Type> destptr [16]) (MOVOconst [0])
(MOVOstore destptr (MOVOconst [0]) mem)))
(Zero [64] destptr mem) && config.useSSE ->
(Zero [64] destptr mem) && config.useSSE =>
(MOVOstore (OffPtr <destptr.Type> destptr [48]) (MOVOconst [0])
(MOVOstore (OffPtr <destptr.Type> destptr [32]) (MOVOconst [0])
(MOVOstore (OffPtr <destptr.Type> destptr [16]) (MOVOconst [0])
@ -399,24 +384,24 @@
// Medium zeroing uses a duff device.
(Zero [s] destptr mem)
&& s > 64 && s <= 1024 && s%16 == 0 && !config.noDuffDevice ->
&& s > 64 && s <= 1024 && s%16 == 0 && !config.noDuffDevice =>
(DUFFZERO [s] destptr (MOVOconst [0]) mem)
// Large zeroing uses REP STOSQ.
(Zero [s] destptr mem)
&& (s > 1024 || (config.noDuffDevice && s > 64 || !config.useSSE && s > 32))
&& s%8 == 0 ->
&& s%8 == 0 =>
(REPSTOSQ destptr (MOVQconst [s/8]) (MOVQconst [0]) mem)
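Taken together, the Zero rules above pick a strategy by size. A rough Go sketch of that decision (not the compiler's own code; useSSE and noDuffDevice stand in for the config flags in the rule conditions, and the fractional-size rules above peel the unaligned tail first, so the larger cases only ever see aligned sizes):

package main

import "fmt"

// zeroStrategy sketches the AMD64 Zero lowering above for an already
// aligned size: tiny sizes become individual constant stores, mid sizes
// use 16-byte MOVO stores (8-byte MOVQ stores without SSE), 64 < s <= 1024
// uses Duff's device, and everything else falls back to REP STOSQ.
func zeroStrategy(s int64, useSSE, noDuffDevice bool) string {
	switch {
	case s <= 8:
		return "constant stores"
	case !useSSE && s <= 32:
		return "MOVQ constant stores"
	case !useSSE:
		return "REP STOSQ"
	case s <= 64:
		return "MOVO (16-byte) stores"
	case s <= 1024 && !noDuffDevice:
		return "DUFFZERO"
	default:
		return "REP STOSQ"
	}
}

func main() {
	for _, s := range []int64{8, 48, 256, 4096} {
		fmt.Println(s, zeroStrategy(s, true, false))
	}
}
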
// Lowering constants
(Const8 ...) -> (MOVLconst ...)
(Const16 ...) -> (MOVLconst ...)
(Const32 ...) -> (MOVLconst ...)
(Const64 ...) -> (MOVQconst ...)
(Const32F ...) -> (MOVSSconst ...)
(Const64F ...) -> (MOVSDconst ...)
(ConstNil ...) -> (MOVQconst ...)
(ConstBool ...) -> (MOVLconst ...)
(Const8 [c]) => (MOVLconst [int32(c)])
(Const16 [c]) => (MOVLconst [int32(c)])
(Const32 ...) => (MOVLconst ...)
(Const64 ...) => (MOVQconst ...)
(Const32F ...) => (MOVSSconst ...)
(Const64F ...) => (MOVSDconst ...)
(ConstNil ) => (MOVQconst [0])
(ConstBool [c]) => (MOVLconst [int32(b2i(c))])
// Lowering calls
(StaticCall ...) -> (CALLstatic ...)
@ -478,6 +463,8 @@
(GetClosurePtr ...) -> (LoweredGetClosurePtr ...)
(GetCallerPC ...) -> (LoweredGetCallerPC ...)
(GetCallerSP ...) -> (LoweredGetCallerSP ...)
(HasCPUFeature {s}) -> (SETNE (CMPQconst [0] (LoweredHasCPUFeature {s})))
(Addr ...) -> (LEAQ ...)
(LocalAddr {sym} base _) -> (LEAQ {sym} base)
@ -578,6 +565,10 @@
(SETB (TEST(Q|L|W|B) x x)) -> (ConstBool [0])
(SETAE (TEST(Q|L|W|B) x x)) -> (ConstBool [1])
// x & 1 != 0 -> x & 1
(SETNE (TEST(B|W)const [1] x)) => (AND(L|L)const [1] x)
(SETB (BT(L|Q)const [0] x)) => (AND(L|Q)const [1] x)
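A tiny plain-Go illustration of the identity behind the two rewrites above: when the only use of the comparison is its 0/1 result, testing the low bit and then materializing SETNE is the same as keeping the low bit itself.

package main

import "fmt"

// lowBit keeps only bit 0: the value of "x&1 != 0" as a 0/1 integer is the
// bit itself, which is what SETNE(TESTconst [1] x) => ANDconst [1] x exploits.
func lowBit(x uint64) uint64 {
	return x & 1
}

func main() {
	fmt.Println(lowBit(6), lowBit(7)) // 0 1
}
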
// Recognize bit tests: a&(1<<b) != 0 for b suitably bounded
// Note that BTx instructions use the carry bit, so we need to convert tests for zero flag
// into tests for carry flags.
@ -2153,11 +2144,14 @@
(CMP(Q|L|W|B) l:(MOV(Q|L|W|B)load {sym} [off] ptr mem) x) && canMergeLoad(v, l) && clobber(l) -> (CMP(Q|L|W|B)load {sym} [off] ptr x mem)
(CMP(Q|L|W|B) x l:(MOV(Q|L|W|B)load {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) -> (InvertFlags (CMP(Q|L|W|B)load {sym} [off] ptr x mem))
(CMP(Q|L|W|B)const l:(MOV(Q|L|W|B)load {sym} [off] ptr mem) [c])
(CMP(Q|L)const l:(MOV(Q|L)load {sym} [off] ptr mem) [c])
&& l.Uses == 1
&& validValAndOff(c, off)
&& clobber(l) ->
@l.Block (CMP(Q|L|W|B)constload {sym} [makeValAndOff(c,off)] ptr mem)
&& clobber(l) =>
@l.Block (CMP(Q|L)constload {sym} [makeValAndOff32(c,off)] ptr mem)
(CMP(W|B)const l:(MOV(W|B)load {sym} [off] ptr mem) [c])
&& l.Uses == 1
&& clobber(l) =>
@l.Block (CMP(W|B)constload {sym} [makeValAndOff32(int32(c),off)] ptr mem)
(CMPQload {sym} [off] ptr (MOVQconst [c]) mem) && validValAndOff(c,off) -> (CMPQconstload {sym} [makeValAndOff(c,off)] ptr mem)
(CMPLload {sym} [off] ptr (MOVLconst [c]) mem) && validValAndOff(c,off) -> (CMPLconstload {sym} [makeValAndOff(c,off)] ptr mem)
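The makeValAndOff32 aux values used throughout these rules pack a small constant and an offset into a single int64 AuxInt. A minimal sketch of that idea, assuming the usual layout of value in the high 32 bits and offset in the low 32 bits (the type and helper names here are illustrative, not the compiler's own):

package main

import "fmt"

// valAndOff packs a 32-bit value and a 32-bit offset into one int64,
// mirroring what makeValAndOff32 is used for in the rules above
// (assumed layout: value in the high half, offset in the low half).
type valAndOff int64

func makeValAndOff(val, off int32) valAndOff {
	return valAndOff(int64(val)<<32 | int64(uint32(off)))
}

func (x valAndOff) val() int32 { return int32(int64(x) >> 32) }
func (x valAndOff) off() int32 { return int32(int64(x)) }

func main() {
	vo := makeValAndOff(0, 8)
	fmt.Println(vo.val(), vo.off()) // 0 8
}
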

View file

@ -136,10 +136,11 @@ func init() {
readflags = regInfo{inputs: nil, outputs: gponly}
flagsgpax = regInfo{inputs: nil, clobbers: ax, outputs: []regMask{gp &^ ax}}
gpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: gponly}
gp21load = regInfo{inputs: []regMask{gp, gpspsb, 0}, outputs: gponly}
gploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: gponly}
gp21pax = regInfo{inputs: []regMask{gp &^ ax, gp}, outputs: []regMask{gp &^ ax}, clobbers: ax}
gpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: gponly}
gp21load = regInfo{inputs: []regMask{gp, gpspsb, 0}, outputs: gponly}
gploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: gponly}
gp21loadidx = regInfo{inputs: []regMask{gp, gpspsb, gpsp, 0}, outputs: gponly}
gp21pax = regInfo{inputs: []regMask{gp &^ ax, gp}, outputs: []regMask{gp &^ ax}, clobbers: ax}
gpstore = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
gpstoreconst = regInfo{inputs: []regMask{gpspsb, 0}}
@ -409,6 +410,32 @@ func init() {
{name: "XORQload", argLength: 3, reg: gp21load, asm: "XORQ", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
{name: "XORLload", argLength: 3, reg: gp21load, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
{name: "ADDLloadidx1", argLength: 4, reg: gp21loadidx, asm: "ADDL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
{name: "ADDLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ADDL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
{name: "ADDLloadidx8", argLength: 4, reg: gp21loadidx, asm: "ADDL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
{name: "ADDQloadidx1", argLength: 4, reg: gp21loadidx, asm: "ADDQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
{name: "ADDQloadidx8", argLength: 4, reg: gp21loadidx, asm: "ADDQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
{name: "SUBLloadidx1", argLength: 4, reg: gp21loadidx, asm: "SUBL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
{name: "SUBLloadidx4", argLength: 4, reg: gp21loadidx, asm: "SUBL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
{name: "SUBLloadidx8", argLength: 4, reg: gp21loadidx, asm: "SUBL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
{name: "SUBQloadidx1", argLength: 4, reg: gp21loadidx, asm: "SUBQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
{name: "SUBQloadidx8", argLength: 4, reg: gp21loadidx, asm: "SUBQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
{name: "ANDLloadidx1", argLength: 4, reg: gp21loadidx, asm: "ANDL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
{name: "ANDLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ANDL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
{name: "ANDLloadidx8", argLength: 4, reg: gp21loadidx, asm: "ANDL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
{name: "ANDQloadidx1", argLength: 4, reg: gp21loadidx, asm: "ANDQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
{name: "ANDQloadidx8", argLength: 4, reg: gp21loadidx, asm: "ANDQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
{name: "ORLloadidx1", argLength: 4, reg: gp21loadidx, asm: "ORL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
{name: "ORLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ORL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
{name: "ORLloadidx8", argLength: 4, reg: gp21loadidx, asm: "ORL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
{name: "ORQloadidx1", argLength: 4, reg: gp21loadidx, asm: "ORQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
{name: "ORQloadidx8", argLength: 4, reg: gp21loadidx, asm: "ORQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
{name: "XORLloadidx1", argLength: 4, reg: gp21loadidx, asm: "XORL", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
{name: "XORLloadidx4", argLength: 4, reg: gp21loadidx, asm: "XORL", scale: 4, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+4*arg2+auxint+aux, arg3 = mem
{name: "XORLloadidx8", argLength: 4, reg: gp21loadidx, asm: "XORL", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
{name: "XORQloadidx1", argLength: 4, reg: gp21loadidx, asm: "XORQ", scale: 1, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+ arg2+auxint+aux, arg3 = mem
{name: "XORQloadidx8", argLength: 4, reg: gp21loadidx, asm: "XORQ", scale: 8, aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+8*arg2+auxint+aux, arg3 = mem
// direct binary-op on memory (read-modify-write)
{name: "ADDQmodify", argLength: 3, reg: gpstore, asm: "ADDQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) += arg1, arg2=mem
{name: "SUBQmodify", argLength: 3, reg: gpstore, asm: "SUBQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) -= arg1, arg2=mem
@ -421,6 +448,53 @@ func init() {
{name: "ORLmodify", argLength: 3, reg: gpstore, asm: "ORL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) |= arg1, arg2=mem
{name: "XORLmodify", argLength: 3, reg: gpstore, asm: "XORL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) ^= arg1, arg2=mem
{name: "ADDQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ADDQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) += arg2, arg3=mem
{name: "ADDQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ADDQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) += arg2, arg3=mem
{name: "SUBQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "SUBQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) -= arg2, arg3=mem
{name: "SUBQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "SUBQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) -= arg2, arg3=mem
{name: "ANDQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ANDQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) &= arg2, arg3=mem
{name: "ANDQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ANDQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) &= arg2, arg3=mem
{name: "ORQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ORQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) |= arg2, arg3=mem
{name: "ORQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ORQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) |= arg2, arg3=mem
{name: "XORQmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "XORQ", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) ^= arg2, arg3=mem
{name: "XORQmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "XORQ", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) ^= arg2, arg3=mem
{name: "ADDLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ADDL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) += arg2, arg3=mem
{name: "ADDLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ADDL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+auxint+aux) += arg2, arg3=mem
{name: "ADDLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ADDL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) += arg2, arg3=mem
{name: "SUBLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "SUBL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) -= arg2, arg3=mem
{name: "SUBLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "SUBL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+auxint+aux) -= arg2, arg3=mem
{name: "SUBLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "SUBL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) -= arg2, arg3=mem
{name: "ANDLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ANDL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) &= arg2, arg3=mem
{name: "ANDLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ANDL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+auxint+aux) &= arg2, arg3=mem
{name: "ANDLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ANDL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) &= arg2, arg3=mem
{name: "ORLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "ORL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) |= arg2, arg3=mem
{name: "ORLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ORL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+auxint+aux) |= arg2, arg3=mem
{name: "ORLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "ORL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) |= arg2, arg3=mem
{name: "XORLmodifyidx1", argLength: 4, reg: gpstoreidx, asm: "XORL", scale: 1, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+auxint+aux) ^= arg2, arg3=mem
{name: "XORLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "XORL", scale: 4, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+auxint+aux) ^= arg2, arg3=mem
{name: "XORLmodifyidx8", argLength: 4, reg: gpstoreidx, asm: "XORL", scale: 8, aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+auxint+aux) ^= arg2, arg3=mem
{name: "ADDQconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ADDQ", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+ValAndOff(AuxInt).Off()+aux) += ValAndOff(AuxInt).Val(), arg2=mem
{name: "ADDQconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ADDQ", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+ValAndOff(AuxInt).Off()+aux) += ValAndOff(AuxInt).Val(), arg2=mem
{name: "ANDQconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ANDQ", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+ValAndOff(AuxInt).Off()+aux) &= ValAndOff(AuxInt).Val(), arg2=mem
{name: "ANDQconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ANDQ", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+ValAndOff(AuxInt).Off()+aux) &= ValAndOff(AuxInt).Val(), arg2=mem
{name: "ORQconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ORQ", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+ValAndOff(AuxInt).Off()+aux) |= ValAndOff(AuxInt).Val(), arg2=mem
{name: "ORQconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ORQ", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+ValAndOff(AuxInt).Off()+aux) |= ValAndOff(AuxInt).Val(), arg2=mem
{name: "XORQconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "XORQ", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+ValAndOff(AuxInt).Off()+aux) ^= ValAndOff(AuxInt).Val(), arg2=mem
{name: "XORQconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "XORQ", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+ValAndOff(AuxInt).Off()+aux) ^= ValAndOff(AuxInt).Val(), arg2=mem
{name: "ADDLconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ADDL", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+ValAndOff(AuxInt).Off()+aux) += ValAndOff(AuxInt).Val(), arg2=mem
{name: "ADDLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ADDL", scale: 4, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+ValAndOff(AuxInt).Off()+aux) += ValAndOff(AuxInt).Val(), arg2=mem
{name: "ADDLconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ADDL", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+ValAndOff(AuxInt).Off()+aux) += ValAndOff(AuxInt).Val(), arg2=mem
{name: "ANDLconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ANDL", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+ValAndOff(AuxInt).Off()+aux) &= ValAndOff(AuxInt).Val(), arg2=mem
{name: "ANDLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ANDL", scale: 4, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+ValAndOff(AuxInt).Off()+aux) &= ValAndOff(AuxInt).Val(), arg2=mem
{name: "ANDLconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ANDL", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+ValAndOff(AuxInt).Off()+aux) &= ValAndOff(AuxInt).Val(), arg2=mem
{name: "ORLconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "ORL", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+ValAndOff(AuxInt).Off()+aux) |= ValAndOff(AuxInt).Val(), arg2=mem
{name: "ORLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ORL", scale: 4, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+ValAndOff(AuxInt).Off()+aux) |= ValAndOff(AuxInt).Val(), arg2=mem
{name: "ORLconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "ORL", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+ValAndOff(AuxInt).Off()+aux) |= ValAndOff(AuxInt).Val(), arg2=mem
{name: "XORLconstmodifyidx1", argLength: 3, reg: gpstoreconstidx, asm: "XORL", scale: 1, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+1*arg1+ValAndOff(AuxInt).Off()+aux) ^= ValAndOff(AuxInt).Val(), arg2=mem
{name: "XORLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "XORL", scale: 4, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+4*arg1+ValAndOff(AuxInt).Off()+aux) ^= ValAndOff(AuxInt).Val(), arg2=mem
{name: "XORLconstmodifyidx8", argLength: 3, reg: gpstoreconstidx, asm: "XORL", scale: 8, aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+8*arg1+ValAndOff(AuxInt).Off()+aux) ^= ValAndOff(AuxInt).Val(), arg2=mem
// unary ops
{name: "NEGQ", argLength: 1, reg: gp11, asm: "NEGQ", resultInArg0: true, clobberFlags: true}, // -arg0
{name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true, clobberFlags: true}, // -arg0
@ -681,7 +755,7 @@ func init() {
// arg0 = destination pointer
// arg1 = source pointer
// arg2 = mem
// auxint = offset from duffcopy symbol to call
// auxint = # of bytes to copy, must be multiple of 16
// returns memory
{
name: "DUFFCOPY",
@ -738,12 +812,14 @@ func init() {
// It saves all GP registers if necessary, but may clobber others.
{name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("DI"), buildReg("AX CX DX BX BP SI R8 R9")}, clobbers: callerSave &^ gp}, clobberFlags: true, aux: "Sym", symEffect: "None"},
{name: "LoweredHasCPUFeature", argLength: 0, reg: gp01, rematerializeable: true, typ: "UInt64", aux: "Sym", symEffect: "None"},
// There are three of these functions so that they can have three different register inputs.
// When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
// default registers to match so we don't need to copy registers around unnecessarily.
{name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{dx, bx}}, typ: "Mem"}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
{name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{cx, dx}}, typ: "Mem"}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
{name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{ax, cx}}, typ: "Mem"}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
{name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{dx, bx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
{name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{cx, dx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
{name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{ax, cx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
// Constant flag values. For any comparison, there are 5 possible
// outcomes: the three from the signed total order (<,==,>) and the

View file

@ -2,25 +2,44 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This file contains rules used by flagalloc to split
// a flag-generating merged load op into separate load and op.
// This file contains rules used by flagalloc and addressingmodes to
// split a flag-generating merged load op into separate load and op.
// Unlike with the other rules files, not all of these
// rules will be applied to all values.
// Rather, flagalloc will request for rules to be applied
// to a particular problematic value.
// These are often the exact inverse of rules in AMD64.rules,
// only with the conditions removed.
//
// For addressingmodes, certain single instructions are slower than the two instruction
// split generated here (which is different from the inputs to addressingmodes).
// For example:
// (CMPBconstload c (ADDQ x y)) -> (CMPBconstloadidx1 c x y) -> (CMPB c (MOVBloadidx1 x y))
(CMP(Q|L|W|B)load {sym} [off] ptr x mem) -> (CMP(Q|L|W|B) (MOV(Q|L|W|B)load {sym} [off] ptr mem) x)
(CMP(Q|L|W|B)load {sym} [off] ptr x mem) => (CMP(Q|L|W|B) (MOV(Q|L|W|B)load {sym} [off] ptr mem) x)
(CMP(Q|L|W|B)constload {sym} [vo] ptr mem) -> (CMP(Q|L|W|B)const (MOV(Q|L|W|B)load {sym} [offOnly(vo)] ptr mem) [valOnly(vo)])
(CMP(Q|L|W|B)constload {sym} [vo] ptr mem) && vo.Val() == 0 => (TEST(Q|L|W|B) x:(MOV(Q|L|W|B)load {sym} [vo.Off32()] ptr mem) x)
(CMP(Q|L|W|B)loadidx1 {sym} [off] ptr idx x mem) -> (CMP(Q|L|W|B) (MOV(Q|L|W|B)loadidx1 {sym} [off] ptr idx mem) x)
(CMPQloadidx8 {sym} [off] ptr idx x mem) -> (CMPQ (MOVQloadidx8 {sym} [off] ptr idx mem) x)
(CMPLloadidx4 {sym} [off] ptr idx x mem) -> (CMPL (MOVLloadidx4 {sym} [off] ptr idx mem) x)
(CMPWloadidx2 {sym} [off] ptr idx x mem) -> (CMPW (MOVWloadidx2 {sym} [off] ptr idx mem) x)
(CMPQconstload {sym} [vo] ptr mem) && vo.Val() != 0 => (CMPQconst (MOVQload {sym} [vo.Off32()] ptr mem) [vo.Val32()])
(CMPLconstload {sym} [vo] ptr mem) && vo.Val() != 0 => (CMPLconst (MOVLload {sym} [vo.Off32()] ptr mem) [vo.Val32()])
(CMPWconstload {sym} [vo] ptr mem) && vo.Val() != 0 => (CMPWconst (MOVWload {sym} [vo.Off32()] ptr mem) [vo.Val16()])
(CMPBconstload {sym} [vo] ptr mem) && vo.Val() != 0 => (CMPBconst (MOVBload {sym} [vo.Off32()] ptr mem) [vo.Val8()])
(CMP(Q|L|W|B)constloadidx1 {sym} [vo] ptr idx mem) -> (CMP(Q|L|W|B)const (MOV(Q|L|W|B)loadidx1 {sym} [offOnly(vo)] ptr idx mem) [valOnly(vo)])
(CMPQconstloadidx8 {sym} [vo] ptr idx mem) -> (CMPQconst (MOVQloadidx8 {sym} [offOnly(vo)] ptr idx mem) [valOnly(vo)])
(CMPLconstloadidx4 {sym} [vo] ptr idx mem) -> (CMPLconst (MOVLloadidx4 {sym} [offOnly(vo)] ptr idx mem) [valOnly(vo)])
(CMPWconstloadidx2 {sym} [vo] ptr idx mem) -> (CMPWconst (MOVWloadidx2 {sym} [offOnly(vo)] ptr idx mem) [valOnly(vo)])
(CMP(Q|L|W|B)loadidx1 {sym} [off] ptr idx x mem) => (CMP(Q|L|W|B) (MOV(Q|L|W|B)loadidx1 {sym} [off] ptr idx mem) x)
(CMPQloadidx8 {sym} [off] ptr idx x mem) => (CMPQ (MOVQloadidx8 {sym} [off] ptr idx mem) x)
(CMPLloadidx4 {sym} [off] ptr idx x mem) => (CMPL (MOVLloadidx4 {sym} [off] ptr idx mem) x)
(CMPWloadidx2 {sym} [off] ptr idx x mem) => (CMPW (MOVWloadidx2 {sym} [off] ptr idx mem) x)
(CMP(Q|L|W|B)constloadidx1 {sym} [vo] ptr idx mem) && vo.Val() == 0 => (TEST(Q|L|W|B) x:(MOV(Q|L|W|B)loadidx1 {sym} [vo.Off32()] ptr idx mem) x)
(CMPQconstloadidx8 {sym} [vo] ptr idx mem) && vo.Val() == 0 => (TESTQ x:(MOVQloadidx8 {sym} [vo.Off32()] ptr idx mem) x)
(CMPLconstloadidx4 {sym} [vo] ptr idx mem) && vo.Val() == 0 => (TESTL x:(MOVLloadidx4 {sym} [vo.Off32()] ptr idx mem) x)
(CMPWconstloadidx2 {sym} [vo] ptr idx mem) && vo.Val() == 0 => (TESTW x:(MOVWloadidx2 {sym} [vo.Off32()] ptr idx mem) x)
(CMPQconstloadidx1 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPQconst (MOVQloadidx1 {sym} [vo.Off32()] ptr idx mem) [vo.Val32()])
(CMPLconstloadidx1 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPLconst (MOVLloadidx1 {sym} [vo.Off32()] ptr idx mem) [vo.Val32()])
(CMPWconstloadidx1 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPWconst (MOVWloadidx1 {sym} [vo.Off32()] ptr idx mem) [vo.Val16()])
(CMPBconstloadidx1 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPBconst (MOVBloadidx1 {sym} [vo.Off32()] ptr idx mem) [vo.Val8()])
(CMPQconstloadidx8 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPQconst (MOVQloadidx8 {sym} [vo.Off32()] ptr idx mem) [vo.Val32()])
(CMPLconstloadidx4 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPLconst (MOVLloadidx4 {sym} [vo.Off32()] ptr idx mem) [vo.Val32()])
(CMPWconstloadidx2 {sym} [vo] ptr idx mem) && vo.Val() != 0 => (CMPWconst (MOVWloadidx2 {sym} [vo.Off32()] ptr idx mem) [vo.Val16()])

View file

@ -243,10 +243,6 @@
(Leq16U x y) -> (LessEqualU (CMP (ZeroExt16to32 x) (ZeroExt16to32 y)))
(Leq32U x y) -> (LessEqualU (CMP x y))
(Greater(32|64)F x y) -> (GreaterThan (CMP(F|D) x y))
(Geq(32|64)F x y) -> (GreaterEqual (CMP(F|D) x y))
(OffPtr [off] ptr:(SP)) -> (MOVWaddr [off] ptr)
(OffPtr [off] ptr) -> (ADDconst [off] ptr)
@ -338,12 +334,12 @@
// 8 and 128 are magic constants, see runtime/mkduff.go
(Move [s] {t} dst src mem)
&& s%4 == 0 && s > 4 && s <= 512
&& t.(*types.Type).Alignment()%4 == 0 && !config.noDuffDevice ->
&& t.(*types.Type).Alignment()%4 == 0 && !config.noDuffDevice && logLargeCopy(v, s) ->
(DUFFCOPY [8 * (128 - s/4)] dst src mem)
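A small arithmetic sketch of the auxint in the DUFFCOPY rule above, assuming (per the "magic constants" comment and runtime/mkduff.go) that each 4-byte word copied costs 8 bytes of duffcopy instructions and the routine contains 128 such units:

package main

import "fmt"

// duffcopyOffset computes the 8*(128 - s/4) entry offset used above:
// jumping that far into duffcopy leaves exactly s/4 word-copy units
// (8 bytes of instructions each) left to execute.
func duffcopyOffset(s int64) int64 {
	return 8 * (128 - s/4)
}

func main() {
	fmt.Println(duffcopyOffset(512)) // 0    (run the whole routine)
	fmt.Println(duffcopyOffset(8))   // 1008 (only the last two units)
}
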
// Large move uses a loop
(Move [s] {t} dst src mem)
&& (s > 512 || config.noDuffDevice) || t.(*types.Type).Alignment()%4 != 0 ->
&& ((s > 512 || config.noDuffDevice) || t.(*types.Type).Alignment()%4 != 0) && logLargeCopy(v, s) ->
(LoweredMove [t.(*types.Type).Alignment()]
dst
src

File diff suppressed because it is too large

View file

@ -351,7 +351,7 @@ func init() {
{name: "UBFX", argLength: 1, reg: gp11, asm: "UBFX", aux: "ARM64BitField"},
// moves
{name: "MOVDconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVD", typ: "UInt64", rematerializeable: true}, // 32 low bits of auxint
{name: "MOVDconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVD", typ: "UInt64", rematerializeable: true}, // 64 bits from auxint
{name: "FMOVSconst", argLength: 0, reg: fp01, aux: "Float64", asm: "FMOVS", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float
{name: "FMOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "FMOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float
@ -675,9 +675,9 @@ func init() {
// There are three of these functions so that they can have three different register inputs.
// When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
// default registers to match so we don't need to copy registers around unnecessarily.
{name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem"}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
{name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem"}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
{name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r0, r1}}, typ: "Mem"}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
{name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
{name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
{name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r0, r1}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go).
}
blocks := []blockData{
@ -691,12 +691,12 @@ func init() {
{name: "ULE", controls: 1},
{name: "UGT", controls: 1},
{name: "UGE", controls: 1},
{name: "Z", controls: 1}, // Control == 0 (take a register instead of flags)
{name: "NZ", controls: 1}, // Control != 0
{name: "ZW", controls: 1}, // Control == 0, 32-bit
{name: "NZW", controls: 1}, // Control != 0, 32-bit
{name: "TBZ", controls: 1}, // Control & (1 << Aux.(int64)) == 0
{name: "TBNZ", controls: 1}, // Control & (1 << Aux.(int64)) != 0
{name: "Z", controls: 1}, // Control == 0 (take a register instead of flags)
{name: "NZ", controls: 1}, // Control != 0
{name: "ZW", controls: 1}, // Control == 0, 32-bit
{name: "NZW", controls: 1}, // Control != 0, 32-bit
{name: "TBZ", controls: 1, aux: "Int64"}, // Control & (1 << AuxInt) == 0
{name: "TBNZ", controls: 1, aux: "Int64"}, // Control & (1 << AuxInt) != 0
{name: "FLT", controls: 1},
{name: "FLE", controls: 1},
{name: "FGT", controls: 1},

View file

@ -542,13 +542,13 @@ func init() {
// There are three of these functions so that they can have three different register inputs.
// When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
// default registers to match so we don't need to copy registers around unnecessarily.
{name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem"}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem"}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r0, r1}}, typ: "Mem"}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r0, r1}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
// Extend ops are the same as Bounds ops except the indexes are 64-bit.
{name: "LoweredPanicExtendA", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r2, r3}}, typ: "Mem"}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
{name: "LoweredPanicExtendB", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r1, r2}}, typ: "Mem"}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
{name: "LoweredPanicExtendC", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r0, r1}}, typ: "Mem"}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
{name: "LoweredPanicExtendA", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r2, r3}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
{name: "LoweredPanicExtendB", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r1, r2}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
{name: "LoweredPanicExtendC", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{r4, r0, r1}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
// Constant flag values. For any comparison, there are 5 possible
// outcomes: the three from the signed total order (<,==,>) and the

File diff suppressed because it is too large

View file

@ -2,220 +2,217 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
(Add(Ptr|64|32|16|8) ...) -> (ADDV ...)
(Add(32|64)F ...) -> (ADD(F|D) ...)
(Add(Ptr|64|32|16|8) ...) => (ADDV ...)
(Add(32|64)F ...) => (ADD(F|D) ...)
(Sub(Ptr|64|32|16|8) ...) -> (SUBV ...)
(Sub(32|64)F ...) -> (SUB(F|D) ...)
(Sub(Ptr|64|32|16|8) ...) => (SUBV ...)
(Sub(32|64)F ...) => (SUB(F|D) ...)
(Mul(64|32|16|8) x y) -> (Select1 (MULVU x y))
(Mul(32|64)F ...) -> (MUL(F|D) ...)
(Mul64uhilo ...) -> (MULVU ...)
(Mul(64|32|16|8) x y) => (Select1 (MULVU x y))
(Mul(32|64)F ...) => (MUL(F|D) ...)
(Mul64uhilo ...) => (MULVU ...)
(Select0 (Mul64uover x y)) -> (Select1 <typ.UInt64> (MULVU x y))
(Select1 (Mul64uover x y)) -> (SGTU <typ.Bool> (Select0 <typ.UInt64> (MULVU x y)) (MOVVconst <typ.UInt64> [0]))
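The two Mul64uover rules above take the result from the low half of the double-width product and derive the overflow flag from "high half != 0". The same reasoning in plain Go, using math/bits:

package main

import (
	"fmt"
	"math/bits"
)

// mul64Overflows mirrors the Mul64uover lowering above: the unsigned
// product overflows exactly when the high half of the 128-bit product is
// nonzero; the usable result is the low half.
func mul64Overflows(x, y uint64) (uint64, bool) {
	hi, lo := bits.Mul64(x, y)
	return lo, hi != 0
}

func main() {
	fmt.Println(mul64Overflows(1<<32, 1<<32)) // 0 true
	fmt.Println(mul64Overflows(3, 5))         // 15 false
}
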
(Hmul64 x y) -> (Select0 (MULV x y))
(Hmul64u x y) -> (Select0 (MULVU x y))
(Hmul32 x y) -> (SRAVconst (Select1 <typ.Int64> (MULV (SignExt32to64 x) (SignExt32to64 y))) [32])
(Hmul32u x y) -> (SRLVconst (Select1 <typ.UInt64> (MULVU (ZeroExt32to64 x) (ZeroExt32to64 y))) [32])
(Hmul64 x y) => (Select0 (MULV x y))
(Hmul64u x y) => (Select0 (MULVU x y))
(Hmul32 x y) => (SRAVconst (Select1 <typ.Int64> (MULV (SignExt32to64 x) (SignExt32to64 y))) [32])
(Hmul32u x y) => (SRLVconst (Select1 <typ.UInt64> (MULVU (ZeroExt32to64 x) (ZeroExt32to64 y))) [32])
(Div64 x y) -> (Select1 (DIVV x y))
(Div64u x y) -> (Select1 (DIVVU x y))
(Div32 x y) -> (Select1 (DIVV (SignExt32to64 x) (SignExt32to64 y)))
(Div32u x y) -> (Select1 (DIVVU (ZeroExt32to64 x) (ZeroExt32to64 y)))
(Div16 x y) -> (Select1 (DIVV (SignExt16to64 x) (SignExt16to64 y)))
(Div16u x y) -> (Select1 (DIVVU (ZeroExt16to64 x) (ZeroExt16to64 y)))
(Div8 x y) -> (Select1 (DIVV (SignExt8to64 x) (SignExt8to64 y)))
(Div8u x y) -> (Select1 (DIVVU (ZeroExt8to64 x) (ZeroExt8to64 y)))
(Div(32|64)F ...) -> (DIV(F|D) ...)
(Div64 x y) => (Select1 (DIVV x y))
(Div64u x y) => (Select1 (DIVVU x y))
(Div32 x y) => (Select1 (DIVV (SignExt32to64 x) (SignExt32to64 y)))
(Div32u x y) => (Select1 (DIVVU (ZeroExt32to64 x) (ZeroExt32to64 y)))
(Div16 x y) => (Select1 (DIVV (SignExt16to64 x) (SignExt16to64 y)))
(Div16u x y) => (Select1 (DIVVU (ZeroExt16to64 x) (ZeroExt16to64 y)))
(Div8 x y) => (Select1 (DIVV (SignExt8to64 x) (SignExt8to64 y)))
(Div8u x y) => (Select1 (DIVVU (ZeroExt8to64 x) (ZeroExt8to64 y)))
(Div(32|64)F ...) => (DIV(F|D) ...)
(Mod64 x y) -> (Select0 (DIVV x y))
(Mod64u x y) -> (Select0 (DIVVU x y))
(Mod32 x y) -> (Select0 (DIVV (SignExt32to64 x) (SignExt32to64 y)))
(Mod32u x y) -> (Select0 (DIVVU (ZeroExt32to64 x) (ZeroExt32to64 y)))
(Mod16 x y) -> (Select0 (DIVV (SignExt16to64 x) (SignExt16to64 y)))
(Mod16u x y) -> (Select0 (DIVVU (ZeroExt16to64 x) (ZeroExt16to64 y)))
(Mod8 x y) -> (Select0 (DIVV (SignExt8to64 x) (SignExt8to64 y)))
(Mod8u x y) -> (Select0 (DIVVU (ZeroExt8to64 x) (ZeroExt8to64 y)))
(Mod64 x y) => (Select0 (DIVV x y))
(Mod64u x y) => (Select0 (DIVVU x y))
(Mod32 x y) => (Select0 (DIVV (SignExt32to64 x) (SignExt32to64 y)))
(Mod32u x y) => (Select0 (DIVVU (ZeroExt32to64 x) (ZeroExt32to64 y)))
(Mod16 x y) => (Select0 (DIVV (SignExt16to64 x) (SignExt16to64 y)))
(Mod16u x y) => (Select0 (DIVVU (ZeroExt16to64 x) (ZeroExt16to64 y)))
(Mod8 x y) => (Select0 (DIVV (SignExt8to64 x) (SignExt8to64 y)))
(Mod8u x y) => (Select0 (DIVVU (ZeroExt8to64 x) (ZeroExt8to64 y)))
// (x + y) / 2 with x>=y -> (x - y) / 2 + y
(Avg64u <t> x y) -> (ADDV (SRLVconst <t> (SUBV <t> x y) [1]) y)
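A quick check of the identity the Avg64u comment above relies on: for x >= y, (x+y)/2 equals (x-y)/2 + y, and the right-hand side never overflows even when x+y would.

package main

import "fmt"

// avg64u computes the unsigned average without forming x+y, exactly as the
// Avg64u rule above does: (x-y)>>1 + y, valid when x >= y.
func avg64u(x, y uint64) uint64 {
	return (x-y)/2 + y
}

func main() {
	const max = ^uint64(0)
	fmt.Println(avg64u(max, max-2)) // max-1, even though max+(max-2) overflows
}
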
(And(64|32|16|8) ...) -> (AND ...)
(Or(64|32|16|8) ...) -> (OR ...)
(Xor(64|32|16|8) ...) -> (XOR ...)
(And(64|32|16|8) ...) => (AND ...)
(Or(64|32|16|8) ...) => (OR ...)
(Xor(64|32|16|8) ...) => (XOR ...)
// shifts
// hardware instruction uses only the low 6 bits of the shift
// we compare to 64 to ensure Go semantics for large shifts
(Lsh64x64 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
(Lsh64x32 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
(Lsh64x16 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
(Lsh64x8 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
(Lsh64x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
(Lsh64x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
(Lsh64x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
(Lsh64x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
(Lsh32x64 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
(Lsh32x32 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
(Lsh32x16 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
(Lsh32x8 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
(Lsh32x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
(Lsh32x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
(Lsh32x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
(Lsh32x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
(Lsh16x64 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
(Lsh16x32 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
(Lsh16x16 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
(Lsh16x8 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
(Lsh16x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
(Lsh16x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
(Lsh16x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
(Lsh16x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
(Lsh8x64 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
(Lsh8x32 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
(Lsh8x16 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
(Lsh8x8 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
(Lsh8x64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SLLV <t> x y))
(Lsh8x32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SLLV <t> x (ZeroExt32to64 y)))
(Lsh8x16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SLLV <t> x (ZeroExt16to64 y)))
(Lsh8x8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SLLV <t> x (ZeroExt8to64 y)))
(Rsh64Ux64 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> x y))
(Rsh64Ux32 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> x (ZeroExt32to64 y)))
(Rsh64Ux16 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> x (ZeroExt16to64 y)))
(Rsh64Ux8 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> x (ZeroExt8to64 y)))
(Rsh64Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> x y))
(Rsh64Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> x (ZeroExt32to64 y)))
(Rsh64Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> x (ZeroExt16to64 y)))
(Rsh64Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> x (ZeroExt8to64 y)))
(Rsh32Ux64 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt32to64 x) y))
(Rsh32Ux32 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt32to64 y)))
(Rsh32Ux16 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt16to64 y)))
(Rsh32Ux8 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt8to64 y)))
(Rsh32Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt32to64 x) y))
(Rsh32Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt32to64 y)))
(Rsh32Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt16to64 y)))
(Rsh32Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt32to64 x) (ZeroExt8to64 y)))
(Rsh16Ux64 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt16to64 x) y))
(Rsh16Ux32 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt32to64 y)))
(Rsh16Ux16 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt16to64 y)))
(Rsh16Ux8 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt8to64 y)))
(Rsh16Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt16to64 x) y))
(Rsh16Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt32to64 y)))
(Rsh16Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt16to64 y)))
(Rsh16Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt16to64 x) (ZeroExt8to64 y)))
(Rsh8Ux64 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt8to64 x) y))
(Rsh8Ux32 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt32to64 y)))
(Rsh8Ux16 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt16to64 y)))
(Rsh8Ux8 <t> x y) -> (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt8to64 y)))
(Rsh8Ux64 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) y)) (SRLV <t> (ZeroExt8to64 x) y))
(Rsh8Ux32 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt32to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt32to64 y)))
(Rsh8Ux16 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt16to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt16to64 y)))
(Rsh8Ux8 <t> x y) => (AND (NEGV <t> (SGTU (MOVVconst <typ.UInt64> [64]) (ZeroExt8to64 y))) (SRLV <t> (ZeroExt8to64 x) (ZeroExt8to64 y)))
(Rsh64x64 <t> x y) -> (SRAV x (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
(Rsh64x32 <t> x y) -> (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
(Rsh64x16 <t> x y) -> (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
(Rsh64x8 <t> x y) -> (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
(Rsh64x64 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
(Rsh64x32 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
(Rsh64x16 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
(Rsh64x8 <t> x y) => (SRAV x (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
(Rsh32x64 <t> x y) -> (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
(Rsh32x32 <t> x y) -> (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
(Rsh32x16 <t> x y) -> (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
(Rsh32x8 <t> x y) -> (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
(Rsh32x64 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
(Rsh32x32 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
(Rsh32x16 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
(Rsh32x8 <t> x y) => (SRAV (SignExt32to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
(Rsh16x64 <t> x y) -> (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
(Rsh16x32 <t> x y) -> (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
(Rsh16x16 <t> x y) -> (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
(Rsh16x8 <t> x y) -> (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
(Rsh16x64 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
(Rsh16x32 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
(Rsh16x16 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
(Rsh16x8 <t> x y) => (SRAV (SignExt16to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
(Rsh8x64 <t> x y) -> (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
(Rsh8x32 <t> x y) -> (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
(Rsh8x16 <t> x y) -> (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
(Rsh8x8 <t> x y) -> (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
(Rsh8x64 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU y (MOVVconst <typ.UInt64> [63]))) y))
(Rsh8x32 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt32to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt32to64 y)))
(Rsh8x16 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt16to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt16to64 y)))
(Rsh8x8 <t> x y) => (SRAV (SignExt8to64 x) (OR <t> (NEGV <t> (SGTU (ZeroExt8to64 y) (MOVVconst <typ.UInt64> [63]))) (ZeroExt8to64 y)))
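A plain-Go restatement of the semantics the shift rules above implement: Go requires an unsigned or left shift by 64 or more to produce 0, while SLLV/SRLV only look at the low six bits of the count, so the lowering masks the result with (count < 64 ? all-ones : 0), the NEGV(SGTU 64, y) term. (The signed right-shift rules instead clamp the count to 63, which is what the OR/SGTU-63 term does.)

package main

import "fmt"

// lsh64 spells out the unsigned-shift lowering above: shift by the low six
// bits of the count, then zero the result when the count is 64 or more.
func lsh64(x, s uint64) uint64 {
	var mask uint64
	if s < 64 {
		mask = ^uint64(0)
	}
	return (x << (s & 63)) & mask
}

func main() {
	fmt.Println(lsh64(1, 3))  // 8
	fmt.Println(lsh64(1, 64)) // 0, not 1
}
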
// rotates
(RotateLeft8 <t> x (MOVVconst [c])) -> (Or8 (Lsh8x64 <t> x (MOVVconst [c&7])) (Rsh8Ux64 <t> x (MOVVconst [-c&7])))
(RotateLeft16 <t> x (MOVVconst [c])) -> (Or16 (Lsh16x64 <t> x (MOVVconst [c&15])) (Rsh16Ux64 <t> x (MOVVconst [-c&15])))
(RotateLeft32 <t> x (MOVVconst [c])) -> (Or32 (Lsh32x64 <t> x (MOVVconst [c&31])) (Rsh32Ux64 <t> x (MOVVconst [-c&31])))
(RotateLeft64 <t> x (MOVVconst [c])) -> (Or64 (Lsh64x64 <t> x (MOVVconst [c&63])) (Rsh64Ux64 <t> x (MOVVconst [-c&63])))
(RotateLeft8 <t> x (MOVVconst [c])) => (Or8 (Lsh8x64 <t> x (MOVVconst [c&7])) (Rsh8Ux64 <t> x (MOVVconst [-c&7])))
(RotateLeft16 <t> x (MOVVconst [c])) => (Or16 (Lsh16x64 <t> x (MOVVconst [c&15])) (Rsh16Ux64 <t> x (MOVVconst [-c&15])))
(RotateLeft32 <t> x (MOVVconst [c])) => (Or32 (Lsh32x64 <t> x (MOVVconst [c&31])) (Rsh32Ux64 <t> x (MOVVconst [-c&31])))
(RotateLeft64 <t> x (MOVVconst [c])) => (Or64 (Lsh64x64 <t> x (MOVVconst [c&63])) (Rsh64Ux64 <t> x (MOVVconst [-c&63])))
// unary ops
(Neg(64|32|16|8) ...) -> (NEGV ...)
(Neg(32|64)F ...) -> (NEG(F|D) ...)
(Neg(64|32|16|8) ...) => (NEGV ...)
(Neg(32|64)F ...) => (NEG(F|D) ...)
(Com(64|32|16|8) x) -> (NOR (MOVVconst [0]) x)
(Com(64|32|16|8) x) => (NOR (MOVVconst [0]) x)
(Sqrt ...) -> (SQRTD ...)
(Sqrt ...) => (SQRTD ...)
// boolean ops -- booleans are represented with 0=false, 1=true
(AndB ...) -> (AND ...)
(OrB ...) -> (OR ...)
(EqB x y) -> (XOR (MOVVconst [1]) (XOR <typ.Bool> x y))
(NeqB ...) -> (XOR ...)
(Not x) -> (XORconst [1] x)
(AndB ...) => (AND ...)
(OrB ...) => (OR ...)
(EqB x y) => (XOR (MOVVconst [1]) (XOR <typ.Bool> x y))
(NeqB ...) => (XOR ...)
(Not x) => (XORconst [1] x)
// constants
(Const(64|32|16|8) ...) -> (MOVVconst ...)
(Const(32|64)F ...) -> (MOV(F|D)const ...)
(ConstNil) -> (MOVVconst [0])
(ConstNil) => (MOVVconst [0])
(ConstBool ...) -> (MOVVconst ...)
(Slicemask <t> x) -> (SRAVconst (NEGV <t> x) [63])
(Slicemask <t> x) => (SRAVconst (NEGV <t> x) [63])
// truncations
// Because we ignore high parts of registers, truncates are just copies.
(Trunc16to8 ...) -> (Copy ...)
(Trunc32to8 ...) -> (Copy ...)
(Trunc32to16 ...) -> (Copy ...)
(Trunc64to8 ...) -> (Copy ...)
(Trunc64to16 ...) -> (Copy ...)
(Trunc64to32 ...) -> (Copy ...)
(Trunc16to8 ...) => (Copy ...)
(Trunc32to8 ...) => (Copy ...)
(Trunc32to16 ...) => (Copy ...)
(Trunc64to8 ...) => (Copy ...)
(Trunc64to16 ...) => (Copy ...)
(Trunc64to32 ...) => (Copy ...)
// Zero-/Sign-extensions
(ZeroExt8to16 ...) -> (MOVBUreg ...)
(ZeroExt8to32 ...) -> (MOVBUreg ...)
(ZeroExt16to32 ...) -> (MOVHUreg ...)
(ZeroExt8to64 ...) -> (MOVBUreg ...)
(ZeroExt16to64 ...) -> (MOVHUreg ...)
(ZeroExt32to64 ...) -> (MOVWUreg ...)
(ZeroExt8to16 ...) => (MOVBUreg ...)
(ZeroExt8to32 ...) => (MOVBUreg ...)
(ZeroExt16to32 ...) => (MOVHUreg ...)
(ZeroExt8to64 ...) => (MOVBUreg ...)
(ZeroExt16to64 ...) => (MOVHUreg ...)
(ZeroExt32to64 ...) => (MOVWUreg ...)
(SignExt8to16 ...) -> (MOVBreg ...)
(SignExt8to32 ...) -> (MOVBreg ...)
(SignExt16to32 ...) -> (MOVHreg ...)
(SignExt8to64 ...) -> (MOVBreg ...)
(SignExt16to64 ...) -> (MOVHreg ...)
(SignExt32to64 ...) -> (MOVWreg ...)
(SignExt8to16 ...) => (MOVBreg ...)
(SignExt8to32 ...) => (MOVBreg ...)
(SignExt16to32 ...) => (MOVHreg ...)
(SignExt8to64 ...) => (MOVBreg ...)
(SignExt16to64 ...) => (MOVHreg ...)
(SignExt32to64 ...) => (MOVWreg ...)
// float <-> int conversion
(Cvt32to32F ...) -> (MOVWF ...)
(Cvt32to64F ...) -> (MOVWD ...)
(Cvt64to32F ...) -> (MOVVF ...)
(Cvt64to64F ...) -> (MOVVD ...)
(Cvt32Fto32 ...) -> (TRUNCFW ...)
(Cvt64Fto32 ...) -> (TRUNCDW ...)
(Cvt32Fto64 ...) -> (TRUNCFV ...)
(Cvt64Fto64 ...) -> (TRUNCDV ...)
(Cvt32Fto64F ...) -> (MOVFD ...)
(Cvt64Fto32F ...) -> (MOVDF ...)
(Cvt32to32F ...) => (MOVWF ...)
(Cvt32to64F ...) => (MOVWD ...)
(Cvt64to32F ...) => (MOVVF ...)
(Cvt64to64F ...) => (MOVVD ...)
(Cvt32Fto32 ...) => (TRUNCFW ...)
(Cvt64Fto32 ...) => (TRUNCDW ...)
(Cvt32Fto64 ...) => (TRUNCFV ...)
(Cvt64Fto64 ...) => (TRUNCDV ...)
(Cvt32Fto64F ...) => (MOVFD ...)
(Cvt64Fto32F ...) => (MOVDF ...)
(CvtBoolToUint8 ...) -> (Copy ...)
(CvtBoolToUint8 ...) => (Copy ...)
(Round(32|64)F ...) -> (Copy ...)
(Round(32|64)F ...) => (Copy ...)
// comparisons
(Eq8 x y) -> (SGTU (MOVVconst [1]) (XOR (ZeroExt8to64 x) (ZeroExt8to64 y)))
(Eq16 x y) -> (SGTU (MOVVconst [1]) (XOR (ZeroExt16to64 x) (ZeroExt16to64 y)))
(Eq32 x y) -> (SGTU (MOVVconst [1]) (XOR (ZeroExt32to64 x) (ZeroExt32to64 y)))
(Eq64 x y) -> (SGTU (MOVVconst [1]) (XOR x y))
(EqPtr x y) -> (SGTU (MOVVconst [1]) (XOR x y))
(Eq(32|64)F x y) -> (FPFlagTrue (CMPEQ(F|D) x y))
(Eq8 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt8to64 x) (ZeroExt8to64 y)))
(Eq16 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt16to64 x) (ZeroExt16to64 y)))
(Eq32 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt32to64 x) (ZeroExt32to64 y)))
(Eq64 x y) => (SGTU (MOVVconst [1]) (XOR x y))
(EqPtr x y) => (SGTU (MOVVconst [1]) (XOR x y))
(Eq(32|64)F x y) => (FPFlagTrue (CMPEQ(F|D) x y))
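// Illustrative note (added for clarity, not part of the original rules): equality is
// phrased as an unsigned comparison against 1, e.g. Eq64(x,y) becomes SGTU(1, x^y),
// which is 1 exactly when x^y == 0, i.e. when x == y.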
(Neq8 x y) -> (SGTU (XOR (ZeroExt8to64 x) (ZeroExt8to64 y)) (MOVVconst [0]))
(Neq16 x y) -> (SGTU (XOR (ZeroExt16to32 x) (ZeroExt16to64 y)) (MOVVconst [0]))
(Neq32 x y) -> (SGTU (XOR (ZeroExt32to64 x) (ZeroExt32to64 y)) (MOVVconst [0]))
(Neq64 x y) -> (SGTU (XOR x y) (MOVVconst [0]))
(NeqPtr x y) -> (SGTU (XOR x y) (MOVVconst [0]))
(Neq(32|64)F x y) -> (FPFlagFalse (CMPEQ(F|D) x y))
(Neq8 x y) => (SGTU (XOR (ZeroExt8to64 x) (ZeroExt8to64 y)) (MOVVconst [0]))
(Neq16 x y) => (SGTU (XOR (ZeroExt16to32 x) (ZeroExt16to64 y)) (MOVVconst [0]))
(Neq32 x y) => (SGTU (XOR (ZeroExt32to64 x) (ZeroExt32to64 y)) (MOVVconst [0]))
(Neq64 x y) => (SGTU (XOR x y) (MOVVconst [0]))
(NeqPtr x y) => (SGTU (XOR x y) (MOVVconst [0]))
(Neq(32|64)F x y) => (FPFlagFalse (CMPEQ(F|D) x y))
(Less8 x y) -> (SGT (SignExt8to64 y) (SignExt8to64 x))
(Less16 x y) -> (SGT (SignExt16to64 y) (SignExt16to64 x))
(Less32 x y) -> (SGT (SignExt32to64 y) (SignExt32to64 x))
(Less64 x y) -> (SGT y x)
(Less(32|64)F x y) -> (FPFlagTrue (CMPGT(F|D) y x)) // reverse operands to work around NaN
(Less8 x y) => (SGT (SignExt8to64 y) (SignExt8to64 x))
(Less16 x y) => (SGT (SignExt16to64 y) (SignExt16to64 x))
(Less32 x y) => (SGT (SignExt32to64 y) (SignExt32to64 x))
(Less64 x y) => (SGT y x)
(Less(32|64)F x y) => (FPFlagTrue (CMPGT(F|D) y x)) // reverse operands to work around NaN
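// Illustrative note (added for clarity, not part of the original rules): swapping the
// operands presumably lets the greater-than style CMP ops express x < y as y > x;
// IEEE ordered comparisons report false when either operand is NaN, so Less with a
// NaN input still correctly lowers to false.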
(Less8U x y) -> (SGTU (ZeroExt8to64 y) (ZeroExt8to64 x))
(Less16U x y) -> (SGTU (ZeroExt16to64 y) (ZeroExt16to64 x))
(Less32U x y) -> (SGTU (ZeroExt32to64 y) (ZeroExt32to64 x))
(Less64U x y) -> (SGTU y x)
(Less8U x y) => (SGTU (ZeroExt8to64 y) (ZeroExt8to64 x))
(Less16U x y) => (SGTU (ZeroExt16to64 y) (ZeroExt16to64 x))
(Less32U x y) => (SGTU (ZeroExt32to64 y) (ZeroExt32to64 x))
(Less64U x y) => (SGTU y x)
(Leq8 x y) -> (XOR (MOVVconst [1]) (SGT (SignExt8to64 x) (SignExt8to64 y)))
(Leq16 x y) -> (XOR (MOVVconst [1]) (SGT (SignExt16to64 x) (SignExt16to64 y)))
(Leq32 x y) -> (XOR (MOVVconst [1]) (SGT (SignExt32to64 x) (SignExt32to64 y)))
(Leq64 x y) -> (XOR (MOVVconst [1]) (SGT x y))
(Leq(32|64)F x y) -> (FPFlagTrue (CMPGE(F|D) y x)) // reverse operands to work around NaN
(Leq8 x y) => (XOR (MOVVconst [1]) (SGT (SignExt8to64 x) (SignExt8to64 y)))
(Leq16 x y) => (XOR (MOVVconst [1]) (SGT (SignExt16to64 x) (SignExt16to64 y)))
(Leq32 x y) => (XOR (MOVVconst [1]) (SGT (SignExt32to64 x) (SignExt32to64 y)))
(Leq64 x y) => (XOR (MOVVconst [1]) (SGT x y))
(Leq(32|64)F x y) => (FPFlagTrue (CMPGE(F|D) y x)) // reverse operands to work around NaN
(Leq8U x y) -> (XOR (MOVVconst [1]) (SGTU (ZeroExt8to64 x) (ZeroExt8to64 y)))
(Leq16U x y) -> (XOR (MOVVconst [1]) (SGTU (ZeroExt16to64 x) (ZeroExt16to64 y)))
(Leq32U x y) -> (XOR (MOVVconst [1]) (SGTU (ZeroExt32to64 x) (ZeroExt32to64 y)))
(Leq64U x y) -> (XOR (MOVVconst [1]) (SGTU x y))
(Greater(32|64)F x y) -> (FPFlagTrue (CMPGT(F|D) x y))
(Geq(32|64)F x y) -> (FPFlagTrue (CMPGE(F|D) x y))
(Leq8U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt8to64 x) (ZeroExt8to64 y)))
(Leq16U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt16to64 x) (ZeroExt16to64 y)))
(Leq32U x y) => (XOR (MOVVconst [1]) (SGTU (ZeroExt32to64 x) (ZeroExt32to64 y)))
(Leq64U x y) => (XOR (MOVVconst [1]) (SGTU x y))
(OffPtr [off] ptr:(SP)) -> (MOVVaddr [off] ptr)
(OffPtr [off] ptr) -> (ADDVconst [off] ptr)
@@ -224,70 +221,70 @@
(LocalAddr {sym} base _) -> (MOVVaddr {sym} base)
// loads
(Load <t> ptr mem) && t.IsBoolean() -> (MOVBUload ptr mem)
(Load <t> ptr mem) && (is8BitInt(t) && isSigned(t)) -> (MOVBload ptr mem)
(Load <t> ptr mem) && (is8BitInt(t) && !isSigned(t)) -> (MOVBUload ptr mem)
(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) -> (MOVHload ptr mem)
(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) -> (MOVHUload ptr mem)
(Load <t> ptr mem) && (is32BitInt(t) && isSigned(t)) -> (MOVWload ptr mem)
(Load <t> ptr mem) && (is32BitInt(t) && !isSigned(t)) -> (MOVWUload ptr mem)
(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) -> (MOVVload ptr mem)
(Load <t> ptr mem) && is32BitFloat(t) -> (MOVFload ptr mem)
(Load <t> ptr mem) && is64BitFloat(t) -> (MOVDload ptr mem)
(Load <t> ptr mem) && t.IsBoolean() => (MOVBUload ptr mem)
(Load <t> ptr mem) && (is8BitInt(t) && isSigned(t)) => (MOVBload ptr mem)
(Load <t> ptr mem) && (is8BitInt(t) && !isSigned(t)) => (MOVBUload ptr mem)
(Load <t> ptr mem) && (is16BitInt(t) && isSigned(t)) => (MOVHload ptr mem)
(Load <t> ptr mem) && (is16BitInt(t) && !isSigned(t)) => (MOVHUload ptr mem)
(Load <t> ptr mem) && (is32BitInt(t) && isSigned(t)) => (MOVWload ptr mem)
(Load <t> ptr mem) && (is32BitInt(t) && !isSigned(t)) => (MOVWUload ptr mem)
(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVVload ptr mem)
(Load <t> ptr mem) && is32BitFloat(t) => (MOVFload ptr mem)
(Load <t> ptr mem) && is64BitFloat(t) => (MOVDload ptr mem)
// stores
(Store {t} ptr val mem) && t.(*types.Type).Size() == 1 -> (MOVBstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 2 -> (MOVHstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 4 && !is32BitFloat(val.Type) -> (MOVWstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 8 && !is64BitFloat(val.Type) -> (MOVVstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 4 && is32BitFloat(val.Type) -> (MOVFstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 8 && is64BitFloat(val.Type) -> (MOVDstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 8 && !is64BitFloat(val.Type) => (MOVVstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVFstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVDstore ptr val mem)
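// Illustrative note (added for clarity, not part of the original rules): Store
// dispatches on the value's static size and floatness, e.g. an 8-byte integer or
// pointer becomes MOVVstore while an 8-byte float becomes MOVDstore.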
// zeroing
(Zero [0] _ mem) -> mem
(Zero [1] ptr mem) -> (MOVBstore ptr (MOVVconst [0]) mem)
(Zero [2] {t} ptr mem) && t.(*types.Type).Alignment()%2 == 0 ->
(Zero [0] _ mem) => mem
(Zero [1] ptr mem) => (MOVBstore ptr (MOVVconst [0]) mem)
(Zero [2] {t} ptr mem) && t.Alignment()%2 == 0 =>
(MOVHstore ptr (MOVVconst [0]) mem)
(Zero [2] ptr mem) ->
(Zero [2] ptr mem) =>
(MOVBstore [1] ptr (MOVVconst [0])
(MOVBstore [0] ptr (MOVVconst [0]) mem))
(Zero [4] {t} ptr mem) && t.(*types.Type).Alignment()%4 == 0 ->
(Zero [4] {t} ptr mem) && t.Alignment()%4 == 0 =>
(MOVWstore ptr (MOVVconst [0]) mem)
(Zero [4] {t} ptr mem) && t.(*types.Type).Alignment()%2 == 0 ->
(Zero [4] {t} ptr mem) && t.Alignment()%2 == 0 =>
(MOVHstore [2] ptr (MOVVconst [0])
(MOVHstore [0] ptr (MOVVconst [0]) mem))
(Zero [4] ptr mem) ->
(Zero [4] ptr mem) =>
(MOVBstore [3] ptr (MOVVconst [0])
(MOVBstore [2] ptr (MOVVconst [0])
(MOVBstore [1] ptr (MOVVconst [0])
(MOVBstore [0] ptr (MOVVconst [0]) mem))))
(Zero [8] {t} ptr mem) && t.(*types.Type).Alignment()%8 == 0 ->
(Zero [8] {t} ptr mem) && t.Alignment()%8 == 0 =>
(MOVVstore ptr (MOVVconst [0]) mem)
(Zero [8] {t} ptr mem) && t.(*types.Type).Alignment()%4 == 0 ->
(Zero [8] {t} ptr mem) && t.Alignment()%4 == 0 =>
(MOVWstore [4] ptr (MOVVconst [0])
(MOVWstore [0] ptr (MOVVconst [0]) mem))
(Zero [8] {t} ptr mem) && t.(*types.Type).Alignment()%2 == 0 ->
(Zero [8] {t} ptr mem) && t.Alignment()%2 == 0 =>
(MOVHstore [6] ptr (MOVVconst [0])
(MOVHstore [4] ptr (MOVVconst [0])
(MOVHstore [2] ptr (MOVVconst [0])
(MOVHstore [0] ptr (MOVVconst [0]) mem))))
(Zero [3] ptr mem) ->
(Zero [3] ptr mem) =>
(MOVBstore [2] ptr (MOVVconst [0])
(MOVBstore [1] ptr (MOVVconst [0])
(MOVBstore [0] ptr (MOVVconst [0]) mem)))
(Zero [6] {t} ptr mem) && t.(*types.Type).Alignment()%2 == 0 ->
(Zero [6] {t} ptr mem) && t.Alignment()%2 == 0 =>
(MOVHstore [4] ptr (MOVVconst [0])
(MOVHstore [2] ptr (MOVVconst [0])
(MOVHstore [0] ptr (MOVVconst [0]) mem)))
(Zero [12] {t} ptr mem) && t.(*types.Type).Alignment()%4 == 0 ->
(Zero [12] {t} ptr mem) && t.Alignment()%4 == 0 =>
(MOVWstore [8] ptr (MOVVconst [0])
(MOVWstore [4] ptr (MOVVconst [0])
(MOVWstore [0] ptr (MOVVconst [0]) mem)))
(Zero [16] {t} ptr mem) && t.(*types.Type).Alignment()%8 == 0 ->
(Zero [16] {t} ptr mem) && t.Alignment()%8 == 0 =>
(MOVVstore [8] ptr (MOVVconst [0])
(MOVVstore [0] ptr (MOVVconst [0]) mem))
(Zero [24] {t} ptr mem) && t.(*types.Type).Alignment()%8 == 0 ->
(Zero [24] {t} ptr mem) && t.Alignment()%8 == 0 =>
(MOVVstore [16] ptr (MOVVconst [0])
(MOVVstore [8] ptr (MOVVconst [0])
(MOVVstore [0] ptr (MOVVconst [0]) mem)))
@@ -296,70 +293,70 @@
// 8, and 128 are magic constants, see runtime/mkduff.go
(Zero [s] {t} ptr mem)
&& s%8 == 0 && s > 24 && s <= 8*128
&& t.(*types.Type).Alignment()%8 == 0 && !config.noDuffDevice ->
&& t.Alignment()%8 == 0 && !config.noDuffDevice =>
(DUFFZERO [8 * (128 - s/8)] ptr mem)
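// Illustrative arithmetic (added for clarity, not part of the original rules): the
// AuxInt picks the Duff entry point, e.g. s = 1024 gives 8*(128-128) = 0 (run all
// 128 steps), while s = 32 gives 8*(128-4) = 992, entering just before the last 4 steps.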
// large or unaligned zeroing uses a loop
(Zero [s] {t} ptr mem)
&& (s > 8*128 || config.noDuffDevice) || t.(*types.Type).Alignment()%8 != 0 ->
(LoweredZero [t.(*types.Type).Alignment()]
&& (s > 8*128 || config.noDuffDevice) || t.Alignment()%8 != 0 =>
(LoweredZero [t.Alignment()]
ptr
(ADDVconst <ptr.Type> ptr [s-moveSize(t.(*types.Type).Alignment(), config)])
(ADDVconst <ptr.Type> ptr [s-moveSize(t.Alignment(), config)])
mem)
// moves
(Move [0] _ _ mem) -> mem
(Move [1] dst src mem) -> (MOVBstore dst (MOVBload src mem) mem)
(Move [2] {t} dst src mem) && t.(*types.Type).Alignment()%2 == 0 ->
(Move [0] _ _ mem) => mem
(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem)
(Move [2] {t} dst src mem) && t.Alignment()%2 == 0 =>
(MOVHstore dst (MOVHload src mem) mem)
(Move [2] dst src mem) ->
(Move [2] dst src mem) =>
(MOVBstore [1] dst (MOVBload [1] src mem)
(MOVBstore dst (MOVBload src mem) mem))
(Move [4] {t} dst src mem) && t.(*types.Type).Alignment()%4 == 0 ->
(Move [4] {t} dst src mem) && t.Alignment()%4 == 0 =>
(MOVWstore dst (MOVWload src mem) mem)
(Move [4] {t} dst src mem) && t.(*types.Type).Alignment()%2 == 0 ->
(Move [4] {t} dst src mem) && t.Alignment()%2 == 0 =>
(MOVHstore [2] dst (MOVHload [2] src mem)
(MOVHstore dst (MOVHload src mem) mem))
(Move [4] dst src mem) ->
(Move [4] dst src mem) =>
(MOVBstore [3] dst (MOVBload [3] src mem)
(MOVBstore [2] dst (MOVBload [2] src mem)
(MOVBstore [1] dst (MOVBload [1] src mem)
(MOVBstore dst (MOVBload src mem) mem))))
(Move [8] {t} dst src mem) && t.(*types.Type).Alignment()%8 == 0 ->
(Move [8] {t} dst src mem) && t.Alignment()%8 == 0 =>
(MOVVstore dst (MOVVload src mem) mem)
(Move [8] {t} dst src mem) && t.(*types.Type).Alignment()%4 == 0 ->
(Move [8] {t} dst src mem) && t.Alignment()%4 == 0 =>
(MOVWstore [4] dst (MOVWload [4] src mem)
(MOVWstore dst (MOVWload src mem) mem))
(Move [8] {t} dst src mem) && t.(*types.Type).Alignment()%2 == 0 ->
(Move [8] {t} dst src mem) && t.Alignment()%2 == 0 =>
(MOVHstore [6] dst (MOVHload [6] src mem)
(MOVHstore [4] dst (MOVHload [4] src mem)
(MOVHstore [2] dst (MOVHload [2] src mem)
(MOVHstore dst (MOVHload src mem) mem))))
(Move [3] dst src mem) ->
(Move [3] dst src mem) =>
(MOVBstore [2] dst (MOVBload [2] src mem)
(MOVBstore [1] dst (MOVBload [1] src mem)
(MOVBstore dst (MOVBload src mem) mem)))
(Move [6] {t} dst src mem) && t.(*types.Type).Alignment()%2 == 0 ->
(Move [6] {t} dst src mem) && t.Alignment()%2 == 0 =>
(MOVHstore [4] dst (MOVHload [4] src mem)
(MOVHstore [2] dst (MOVHload [2] src mem)
(MOVHstore dst (MOVHload src mem) mem)))
(Move [12] {t} dst src mem) && t.(*types.Type).Alignment()%4 == 0 ->
(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 =>
(MOVWstore [8] dst (MOVWload [8] src mem)
(MOVWstore [4] dst (MOVWload [4] src mem)
(MOVWstore dst (MOVWload src mem) mem)))
(Move [16] {t} dst src mem) && t.(*types.Type).Alignment()%8 == 0 ->
(Move [16] {t} dst src mem) && t.Alignment()%8 == 0 =>
(MOVVstore [8] dst (MOVVload [8] src mem)
(MOVVstore dst (MOVVload src mem) mem))
(Move [24] {t} dst src mem) && t.(*types.Type).Alignment()%8 == 0 ->
(Move [24] {t} dst src mem) && t.Alignment()%8 == 0 =>
(MOVVstore [16] dst (MOVVload [16] src mem)
(MOVVstore [8] dst (MOVVload [8] src mem)
(MOVVstore dst (MOVVload src mem) mem)))
// medium move uses a duff device
(Move [s] {t} dst src mem)
&& s%8 == 0 && s >= 24 && s <= 8*128 && t.(*types.Type).Alignment()%8 == 0
&& !config.noDuffDevice ->
&& s%8 == 0 && s >= 24 && s <= 8*128 && t.Alignment()%8 == 0
&& !config.noDuffDevice && logLargeCopy(v, s) =>
(DUFFCOPY [16 * (128 - s/8)] dst src mem)
// 16 and 128 are magic constants. 16 is the number of bytes to encode:
// MOVV (R1), R23
@@ -370,17 +367,17 @@
// large or unaligned move uses a loop
(Move [s] {t} dst src mem)
&& s > 24 || t.(*types.Type).Alignment()%8 != 0 ->
(LoweredMove [t.(*types.Type).Alignment()]
&& s > 24 && logLargeCopy(v, s) || t.Alignment()%8 != 0 =>
(LoweredMove [t.Alignment()]
dst
src
(ADDVconst <src.Type> src [s-moveSize(t.(*types.Type).Alignment(), config)])
(ADDVconst <src.Type> src [s-moveSize(t.Alignment(), config)])
mem)
// calls
(StaticCall ...) -> (CALLstatic ...)
(ClosureCall ...) -> (CALLclosure ...)
(InterCall ...) -> (CALLinter ...)
(StaticCall ...) => (CALLstatic ...)
(ClosureCall ...) => (CALLclosure ...)
(InterCall ...) => (CALLinter ...)
// atomic intrinsics
(AtomicLoad8 ...) -> (LoweredAtomicLoad8 ...)
@@ -403,48 +400,48 @@
(AtomicCompareAndSwap64 ...) -> (LoweredAtomicCas64 ...)
// checks
(NilCheck ...) -> (LoweredNilCheck ...)
(IsNonNil ptr) -> (SGTU ptr (MOVVconst [0]))
(IsInBounds idx len) -> (SGTU len idx)
(IsSliceInBounds idx len) -> (XOR (MOVVconst [1]) (SGTU idx len))
(NilCheck ...) => (LoweredNilCheck ...)
(IsNonNil ptr) => (SGTU ptr (MOVVconst [0]))
(IsInBounds idx len) => (SGTU len idx)
(IsSliceInBounds idx len) => (XOR (MOVVconst [1]) (SGTU idx len))
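// Illustrative note (added for clarity, not part of the original rules): bounds checks
// are unsigned tests, e.g. IsInBounds is idx <u len (SGTU len idx) and IsSliceInBounds
// is idx <=u len written as the negation 1 ^ (idx >u len).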
// pseudo-ops
(GetClosurePtr ...) -> (LoweredGetClosurePtr ...)
(GetCallerSP ...) -> (LoweredGetCallerSP ...)
(GetCallerPC ...) -> (LoweredGetCallerPC ...)
(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
(GetCallerSP ...) => (LoweredGetCallerSP ...)
(GetCallerPC ...) => (LoweredGetCallerPC ...)
(If cond yes no) -> (NE cond yes no)
(If cond yes no) => (NE cond yes no)
// Write barrier.
(WB ...) -> (LoweredWB ...)
(WB ...) => (LoweredWB ...)
(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 -> (LoweredPanicBoundsA [kind] x y mem)
(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 -> (LoweredPanicBoundsB [kind] x y mem)
(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 -> (LoweredPanicBoundsC [kind] x y mem)
(PanicBounds [kind] x y mem) && boundsABI(kind) == 0 => (LoweredPanicBoundsA [kind] x y mem)
(PanicBounds [kind] x y mem) && boundsABI(kind) == 1 => (LoweredPanicBoundsB [kind] x y mem)
(PanicBounds [kind] x y mem) && boundsABI(kind) == 2 => (LoweredPanicBoundsC [kind] x y mem)
// Optimizations
// Absorb boolean tests into block
(NE (FPFlagTrue cmp) yes no) -> (FPT cmp yes no)
(NE (FPFlagFalse cmp) yes no) -> (FPF cmp yes no)
(EQ (FPFlagTrue cmp) yes no) -> (FPF cmp yes no)
(EQ (FPFlagFalse cmp) yes no) -> (FPT cmp yes no)
(NE (XORconst [1] cmp:(SGT _ _)) yes no) -> (EQ cmp yes no)
(NE (XORconst [1] cmp:(SGTU _ _)) yes no) -> (EQ cmp yes no)
(NE (XORconst [1] cmp:(SGTconst _)) yes no) -> (EQ cmp yes no)
(NE (XORconst [1] cmp:(SGTUconst _)) yes no) -> (EQ cmp yes no)
(EQ (XORconst [1] cmp:(SGT _ _)) yes no) -> (NE cmp yes no)
(EQ (XORconst [1] cmp:(SGTU _ _)) yes no) -> (NE cmp yes no)
(EQ (XORconst [1] cmp:(SGTconst _)) yes no) -> (NE cmp yes no)
(EQ (XORconst [1] cmp:(SGTUconst _)) yes no) -> (NE cmp yes no)
(NE (SGTUconst [1] x) yes no) -> (EQ x yes no)
(EQ (SGTUconst [1] x) yes no) -> (NE x yes no)
(NE (SGTU x (MOVVconst [0])) yes no) -> (NE x yes no)
(EQ (SGTU x (MOVVconst [0])) yes no) -> (EQ x yes no)
(NE (SGTconst [0] x) yes no) -> (LTZ x yes no)
(EQ (SGTconst [0] x) yes no) -> (GEZ x yes no)
(NE (SGT x (MOVVconst [0])) yes no) -> (GTZ x yes no)
(EQ (SGT x (MOVVconst [0])) yes no) -> (LEZ x yes no)
(NE (FPFlagTrue cmp) yes no) => (FPT cmp yes no)
(NE (FPFlagFalse cmp) yes no) => (FPF cmp yes no)
(EQ (FPFlagTrue cmp) yes no) => (FPF cmp yes no)
(EQ (FPFlagFalse cmp) yes no) => (FPT cmp yes no)
(NE (XORconst [1] cmp:(SGT _ _)) yes no) => (EQ cmp yes no)
(NE (XORconst [1] cmp:(SGTU _ _)) yes no) => (EQ cmp yes no)
(NE (XORconst [1] cmp:(SGTconst _)) yes no) => (EQ cmp yes no)
(NE (XORconst [1] cmp:(SGTUconst _)) yes no) => (EQ cmp yes no)
(EQ (XORconst [1] cmp:(SGT _ _)) yes no) => (NE cmp yes no)
(EQ (XORconst [1] cmp:(SGTU _ _)) yes no) => (NE cmp yes no)
(EQ (XORconst [1] cmp:(SGTconst _)) yes no) => (NE cmp yes no)
(EQ (XORconst [1] cmp:(SGTUconst _)) yes no) => (NE cmp yes no)
(NE (SGTUconst [1] x) yes no) => (EQ x yes no)
(EQ (SGTUconst [1] x) yes no) => (NE x yes no)
(NE (SGTU x (MOVVconst [0])) yes no) => (NE x yes no)
(EQ (SGTU x (MOVVconst [0])) yes no) => (EQ x yes no)
(NE (SGTconst [0] x) yes no) => (LTZ x yes no)
(EQ (SGTconst [0] x) yes no) => (GEZ x yes no)
(NE (SGT x (MOVVconst [0])) yes no) => (GTZ x yes no)
(EQ (SGT x (MOVVconst [0])) yes no) => (LEZ x yes no)
// fold offset into address
(ADDVconst [off1] (MOVVaddr [off2] {sym} ptr)) -> (MOVVaddr [off1+off2] {sym} ptr)
@@ -512,178 +509,178 @@
(MOVVstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
// store zero
(MOVBstore [off] {sym} ptr (MOVVconst [0]) mem) -> (MOVBstorezero [off] {sym} ptr mem)
(MOVHstore [off] {sym} ptr (MOVVconst [0]) mem) -> (MOVHstorezero [off] {sym} ptr mem)
(MOVWstore [off] {sym} ptr (MOVVconst [0]) mem) -> (MOVWstorezero [off] {sym} ptr mem)
(MOVVstore [off] {sym} ptr (MOVVconst [0]) mem) -> (MOVVstorezero [off] {sym} ptr mem)
(MOVBstore [off] {sym} ptr (MOVVconst [0]) mem) => (MOVBstorezero [off] {sym} ptr mem)
(MOVHstore [off] {sym} ptr (MOVVconst [0]) mem) => (MOVHstorezero [off] {sym} ptr mem)
(MOVWstore [off] {sym} ptr (MOVVconst [0]) mem) => (MOVWstorezero [off] {sym} ptr mem)
(MOVVstore [off] {sym} ptr (MOVVconst [0]) mem) => (MOVVstorezero [off] {sym} ptr mem)
// don't extend after proper load
(MOVBreg x:(MOVBload _ _)) -> (MOVVreg x)
(MOVBUreg x:(MOVBUload _ _)) -> (MOVVreg x)
(MOVHreg x:(MOVBload _ _)) -> (MOVVreg x)
(MOVHreg x:(MOVBUload _ _)) -> (MOVVreg x)
(MOVHreg x:(MOVHload _ _)) -> (MOVVreg x)
(MOVHUreg x:(MOVBUload _ _)) -> (MOVVreg x)
(MOVHUreg x:(MOVHUload _ _)) -> (MOVVreg x)
(MOVWreg x:(MOVBload _ _)) -> (MOVVreg x)
(MOVWreg x:(MOVBUload _ _)) -> (MOVVreg x)
(MOVWreg x:(MOVHload _ _)) -> (MOVVreg x)
(MOVWreg x:(MOVHUload _ _)) -> (MOVVreg x)
(MOVWreg x:(MOVWload _ _)) -> (MOVVreg x)
(MOVWUreg x:(MOVBUload _ _)) -> (MOVVreg x)
(MOVWUreg x:(MOVHUload _ _)) -> (MOVVreg x)
(MOVWUreg x:(MOVWUload _ _)) -> (MOVVreg x)
(MOVBreg x:(MOVBload _ _)) => (MOVVreg x)
(MOVBUreg x:(MOVBUload _ _)) => (MOVVreg x)
(MOVHreg x:(MOVBload _ _)) => (MOVVreg x)
(MOVHreg x:(MOVBUload _ _)) => (MOVVreg x)
(MOVHreg x:(MOVHload _ _)) => (MOVVreg x)
(MOVHUreg x:(MOVBUload _ _)) => (MOVVreg x)
(MOVHUreg x:(MOVHUload _ _)) => (MOVVreg x)
(MOVWreg x:(MOVBload _ _)) => (MOVVreg x)
(MOVWreg x:(MOVBUload _ _)) => (MOVVreg x)
(MOVWreg x:(MOVHload _ _)) => (MOVVreg x)
(MOVWreg x:(MOVHUload _ _)) => (MOVVreg x)
(MOVWreg x:(MOVWload _ _)) => (MOVVreg x)
(MOVWUreg x:(MOVBUload _ _)) => (MOVVreg x)
(MOVWUreg x:(MOVHUload _ _)) => (MOVVreg x)
(MOVWUreg x:(MOVWUload _ _)) => (MOVVreg x)
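// Illustrative note (added for clarity, not part of the original rules): the loads
// above presumably already produce properly extended 64-bit values (e.g. MOVBload
// sign-extends the loaded byte), so the explicit extension collapses to MOVVreg,
// which in turn disappears via the single-use MOVVnop rule further down.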
// fold double extensions
(MOVBreg x:(MOVBreg _)) -> (MOVVreg x)
(MOVBUreg x:(MOVBUreg _)) -> (MOVVreg x)
(MOVHreg x:(MOVBreg _)) -> (MOVVreg x)
(MOVHreg x:(MOVBUreg _)) -> (MOVVreg x)
(MOVHreg x:(MOVHreg _)) -> (MOVVreg x)
(MOVHUreg x:(MOVBUreg _)) -> (MOVVreg x)
(MOVHUreg x:(MOVHUreg _)) -> (MOVVreg x)
(MOVWreg x:(MOVBreg _)) -> (MOVVreg x)
(MOVWreg x:(MOVBUreg _)) -> (MOVVreg x)
(MOVWreg x:(MOVHreg _)) -> (MOVVreg x)
(MOVWreg x:(MOVWreg _)) -> (MOVVreg x)
(MOVWUreg x:(MOVBUreg _)) -> (MOVVreg x)
(MOVWUreg x:(MOVHUreg _)) -> (MOVVreg x)
(MOVWUreg x:(MOVWUreg _)) -> (MOVVreg x)
(MOVBreg x:(MOVBreg _)) => (MOVVreg x)
(MOVBUreg x:(MOVBUreg _)) => (MOVVreg x)
(MOVHreg x:(MOVBreg _)) => (MOVVreg x)
(MOVHreg x:(MOVBUreg _)) => (MOVVreg x)
(MOVHreg x:(MOVHreg _)) => (MOVVreg x)
(MOVHUreg x:(MOVBUreg _)) => (MOVVreg x)
(MOVHUreg x:(MOVHUreg _)) => (MOVVreg x)
(MOVWreg x:(MOVBreg _)) => (MOVVreg x)
(MOVWreg x:(MOVBUreg _)) => (MOVVreg x)
(MOVWreg x:(MOVHreg _)) => (MOVVreg x)
(MOVWreg x:(MOVWreg _)) => (MOVVreg x)
(MOVWUreg x:(MOVBUreg _)) => (MOVVreg x)
(MOVWUreg x:(MOVHUreg _)) => (MOVVreg x)
(MOVWUreg x:(MOVWUreg _)) => (MOVVreg x)
// don't extend before store
(MOVBstore [off] {sym} ptr (MOVBreg x) mem) -> (MOVBstore [off] {sym} ptr x mem)
(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) -> (MOVBstore [off] {sym} ptr x mem)
(MOVBstore [off] {sym} ptr (MOVHreg x) mem) -> (MOVBstore [off] {sym} ptr x mem)
(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) -> (MOVBstore [off] {sym} ptr x mem)
(MOVBstore [off] {sym} ptr (MOVWreg x) mem) -> (MOVBstore [off] {sym} ptr x mem)
(MOVBstore [off] {sym} ptr (MOVWUreg x) mem) -> (MOVBstore [off] {sym} ptr x mem)
(MOVHstore [off] {sym} ptr (MOVHreg x) mem) -> (MOVHstore [off] {sym} ptr x mem)
(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) -> (MOVHstore [off] {sym} ptr x mem)
(MOVHstore [off] {sym} ptr (MOVWreg x) mem) -> (MOVHstore [off] {sym} ptr x mem)
(MOVHstore [off] {sym} ptr (MOVWUreg x) mem) -> (MOVHstore [off] {sym} ptr x mem)
(MOVWstore [off] {sym} ptr (MOVWreg x) mem) -> (MOVWstore [off] {sym} ptr x mem)
(MOVWstore [off] {sym} ptr (MOVWUreg x) mem) -> (MOVWstore [off] {sym} ptr x mem)
(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
(MOVBstore [off] {sym} ptr (MOVWreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
(MOVBstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
(MOVHstore [off] {sym} ptr (MOVWreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
(MOVHstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
(MOVWstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
// if a register move has only 1 use, just use the same register without emitting an instruction
// MOVVnop doesn't emit instruction, only for ensuring the type.
(MOVVreg x) && x.Uses == 1 -> (MOVVnop x)
(MOVVreg x) && x.Uses == 1 => (MOVVnop x)
// fold constants into arithmetic ops
(ADDV x (MOVVconst [c])) && is32Bit(c) -> (ADDVconst [c] x)
(SUBV x (MOVVconst [c])) && is32Bit(c) -> (SUBVconst [c] x)
(AND x (MOVVconst [c])) && is32Bit(c) -> (ANDconst [c] x)
(OR x (MOVVconst [c])) && is32Bit(c) -> (ORconst [c] x)
(XOR x (MOVVconst [c])) && is32Bit(c) -> (XORconst [c] x)
(NOR x (MOVVconst [c])) && is32Bit(c) -> (NORconst [c] x)
(ADDV x (MOVVconst [c])) && is32Bit(c) => (ADDVconst [c] x)
(SUBV x (MOVVconst [c])) && is32Bit(c) => (SUBVconst [c] x)
(AND x (MOVVconst [c])) && is32Bit(c) => (ANDconst [c] x)
(OR x (MOVVconst [c])) && is32Bit(c) => (ORconst [c] x)
(XOR x (MOVVconst [c])) && is32Bit(c) => (XORconst [c] x)
(NOR x (MOVVconst [c])) && is32Bit(c) => (NORconst [c] x)
(SLLV _ (MOVVconst [c])) && uint64(c)>=64 -> (MOVVconst [0])
(SRLV _ (MOVVconst [c])) && uint64(c)>=64 -> (MOVVconst [0])
(SRAV x (MOVVconst [c])) && uint64(c)>=64 -> (SRAVconst x [63])
(SLLV x (MOVVconst [c])) -> (SLLVconst x [c])
(SRLV x (MOVVconst [c])) -> (SRLVconst x [c])
(SRAV x (MOVVconst [c])) -> (SRAVconst x [c])
(SLLV _ (MOVVconst [c])) && uint64(c)>=64 => (MOVVconst [0])
(SRLV _ (MOVVconst [c])) && uint64(c)>=64 => (MOVVconst [0])
(SRAV x (MOVVconst [c])) && uint64(c)>=64 => (SRAVconst x [63])
(SLLV x (MOVVconst [c])) => (SLLVconst x [c])
(SRLV x (MOVVconst [c])) => (SRLVconst x [c])
(SRAV x (MOVVconst [c])) => (SRAVconst x [c])
(SGT (MOVVconst [c]) x) && is32Bit(c) -> (SGTconst [c] x)
(SGTU (MOVVconst [c]) x) && is32Bit(c) -> (SGTUconst [c] x)
(SGT (MOVVconst [c]) x) && is32Bit(c) => (SGTconst [c] x)
(SGTU (MOVVconst [c]) x) && is32Bit(c) => (SGTUconst [c] x)
// mul by constant
(Select1 (MULVU x (MOVVconst [-1]))) -> (NEGV x)
(Select1 (MULVU _ (MOVVconst [0]))) -> (MOVVconst [0])
(Select1 (MULVU x (MOVVconst [1]))) -> x
(Select1 (MULVU x (MOVVconst [c]))) && isPowerOfTwo(c) -> (SLLVconst [log2(c)] x)
(Select1 (MULVU x (MOVVconst [-1]))) => (NEGV x)
(Select1 (MULVU _ (MOVVconst [0]))) => (MOVVconst [0])
(Select1 (MULVU x (MOVVconst [1]))) => x
(Select1 (MULVU x (MOVVconst [c]))) && isPowerOfTwo(c) => (SLLVconst [log2(c)] x)
// div by constant
(Select1 (DIVVU x (MOVVconst [1]))) -> x
(Select1 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo(c) -> (SRLVconst [log2(c)] x)
(Select0 (DIVVU _ (MOVVconst [1]))) -> (MOVVconst [0]) // mod
(Select0 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo(c) -> (ANDconst [c-1] x) // mod
(Select1 (DIVVU x (MOVVconst [1]))) => x
(Select1 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo(c) => (SRLVconst [log2(c)] x)
(Select0 (DIVVU _ (MOVVconst [1]))) => (MOVVconst [0]) // mod
(Select0 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo(c) => (ANDconst [c-1] x) // mod
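// Illustrative note (added for clarity, not part of the original rules): for a power
// of two c these are the usual strength reductions, e.g. with c = 8: x*8 becomes x<<3,
// unsigned x/8 becomes x>>3, and unsigned x%8 becomes x&7.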
// generic simplifications
(ADDV x (NEGV y)) -> (SUBV x y)
(SUBV x x) -> (MOVVconst [0])
(SUBV (MOVVconst [0]) x) -> (NEGV x)
(AND x x) -> x
(OR x x) -> x
(XOR x x) -> (MOVVconst [0])
(ADDV x (NEGV y)) => (SUBV x y)
(SUBV x x) => (MOVVconst [0])
(SUBV (MOVVconst [0]) x) => (NEGV x)
(AND x x) => x
(OR x x) => x
(XOR x x) => (MOVVconst [0])
// remove redundant *const ops
(ADDVconst [0] x) -> x
(SUBVconst [0] x) -> x
(ANDconst [0] _) -> (MOVVconst [0])
(ANDconst [-1] x) -> x
(ORconst [0] x) -> x
(ORconst [-1] _) -> (MOVVconst [-1])
(XORconst [0] x) -> x
(XORconst [-1] x) -> (NORconst [0] x)
(ADDVconst [0] x) => x
(SUBVconst [0] x) => x
(ANDconst [0] _) => (MOVVconst [0])
(ANDconst [-1] x) => x
(ORconst [0] x) => x
(ORconst [-1] _) => (MOVVconst [-1])
(XORconst [0] x) => x
(XORconst [-1] x) => (NORconst [0] x)
// generic constant folding
(ADDVconst [c] (MOVVconst [d])) -> (MOVVconst [c+d])
(ADDVconst [c] (ADDVconst [d] x)) && is32Bit(c+d) -> (ADDVconst [c+d] x)
(ADDVconst [c] (SUBVconst [d] x)) && is32Bit(c-d) -> (ADDVconst [c-d] x)
(SUBVconst [c] (MOVVconst [d])) -> (MOVVconst [d-c])
(SUBVconst [c] (SUBVconst [d] x)) && is32Bit(-c-d) -> (ADDVconst [-c-d] x)
(SUBVconst [c] (ADDVconst [d] x)) && is32Bit(-c+d) -> (ADDVconst [-c+d] x)
(SLLVconst [c] (MOVVconst [d])) -> (MOVVconst [d<<uint64(c)])
(SRLVconst [c] (MOVVconst [d])) -> (MOVVconst [int64(uint64(d)>>uint64(c))])
(SRAVconst [c] (MOVVconst [d])) -> (MOVVconst [d>>uint64(c)])
(Select1 (MULVU (MOVVconst [c]) (MOVVconst [d]))) -> (MOVVconst [c*d])
(Select1 (DIVV (MOVVconst [c]) (MOVVconst [d]))) -> (MOVVconst [c/d])
(Select1 (DIVVU (MOVVconst [c]) (MOVVconst [d]))) -> (MOVVconst [int64(uint64(c)/uint64(d))])
(Select0 (DIVV (MOVVconst [c]) (MOVVconst [d]))) -> (MOVVconst [c%d]) // mod
(Select0 (DIVVU (MOVVconst [c]) (MOVVconst [d]))) -> (MOVVconst [int64(uint64(c)%uint64(d))]) // mod
(ANDconst [c] (MOVVconst [d])) -> (MOVVconst [c&d])
(ANDconst [c] (ANDconst [d] x)) -> (ANDconst [c&d] x)
(ORconst [c] (MOVVconst [d])) -> (MOVVconst [c|d])
(ORconst [c] (ORconst [d] x)) && is32Bit(c|d) -> (ORconst [c|d] x)
(XORconst [c] (MOVVconst [d])) -> (MOVVconst [c^d])
(XORconst [c] (XORconst [d] x)) && is32Bit(c^d) -> (XORconst [c^d] x)
(NORconst [c] (MOVVconst [d])) -> (MOVVconst [^(c|d)])
(NEGV (MOVVconst [c])) -> (MOVVconst [-c])
(MOVBreg (MOVVconst [c])) -> (MOVVconst [int64(int8(c))])
(MOVBUreg (MOVVconst [c])) -> (MOVVconst [int64(uint8(c))])
(MOVHreg (MOVVconst [c])) -> (MOVVconst [int64(int16(c))])
(MOVHUreg (MOVVconst [c])) -> (MOVVconst [int64(uint16(c))])
(MOVWreg (MOVVconst [c])) -> (MOVVconst [int64(int32(c))])
(MOVWUreg (MOVVconst [c])) -> (MOVVconst [int64(uint32(c))])
(MOVVreg (MOVVconst [c])) -> (MOVVconst [c])
(ADDVconst [c] (MOVVconst [d])) => (MOVVconst [c+d])
(ADDVconst [c] (ADDVconst [d] x)) && is32Bit(c+d) => (ADDVconst [c+d] x)
(ADDVconst [c] (SUBVconst [d] x)) && is32Bit(c-d) => (ADDVconst [c-d] x)
(SUBVconst [c] (MOVVconst [d])) => (MOVVconst [d-c])
(SUBVconst [c] (SUBVconst [d] x)) && is32Bit(-c-d) => (ADDVconst [-c-d] x)
(SUBVconst [c] (ADDVconst [d] x)) && is32Bit(-c+d) => (ADDVconst [-c+d] x)
(SLLVconst [c] (MOVVconst [d])) => (MOVVconst [d<<uint64(c)])
(SRLVconst [c] (MOVVconst [d])) => (MOVVconst [int64(uint64(d)>>uint64(c))])
(SRAVconst [c] (MOVVconst [d])) => (MOVVconst [d>>uint64(c)])
(Select1 (MULVU (MOVVconst [c]) (MOVVconst [d]))) => (MOVVconst [c*d])
(Select1 (DIVV (MOVVconst [c]) (MOVVconst [d]))) => (MOVVconst [c/d])
(Select1 (DIVVU (MOVVconst [c]) (MOVVconst [d]))) => (MOVVconst [int64(uint64(c)/uint64(d))])
(Select0 (DIVV (MOVVconst [c]) (MOVVconst [d]))) => (MOVVconst [c%d]) // mod
(Select0 (DIVVU (MOVVconst [c]) (MOVVconst [d]))) => (MOVVconst [int64(uint64(c)%uint64(d))]) // mod
(ANDconst [c] (MOVVconst [d])) => (MOVVconst [c&d])
(ANDconst [c] (ANDconst [d] x)) => (ANDconst [c&d] x)
(ORconst [c] (MOVVconst [d])) => (MOVVconst [c|d])
(ORconst [c] (ORconst [d] x)) && is32Bit(c|d) => (ORconst [c|d] x)
(XORconst [c] (MOVVconst [d])) => (MOVVconst [c^d])
(XORconst [c] (XORconst [d] x)) && is32Bit(c^d) => (XORconst [c^d] x)
(NORconst [c] (MOVVconst [d])) => (MOVVconst [^(c|d)])
(NEGV (MOVVconst [c])) => (MOVVconst [-c])
(MOVBreg (MOVVconst [c])) => (MOVVconst [int64(int8(c))])
(MOVBUreg (MOVVconst [c])) => (MOVVconst [int64(uint8(c))])
(MOVHreg (MOVVconst [c])) => (MOVVconst [int64(int16(c))])
(MOVHUreg (MOVVconst [c])) => (MOVVconst [int64(uint16(c))])
(MOVWreg (MOVVconst [c])) => (MOVVconst [int64(int32(c))])
(MOVWUreg (MOVVconst [c])) => (MOVVconst [int64(uint32(c))])
(MOVVreg (MOVVconst [c])) => (MOVVconst [c])
(LoweredAtomicStore32 ptr (MOVVconst [0]) mem) -> (LoweredAtomicStorezero32 ptr mem)
(LoweredAtomicStore64 ptr (MOVVconst [0]) mem) -> (LoweredAtomicStorezero64 ptr mem)
(LoweredAtomicAdd32 ptr (MOVVconst [c]) mem) && is32Bit(c) -> (LoweredAtomicAddconst32 [c] ptr mem)
(LoweredAtomicAdd64 ptr (MOVVconst [c]) mem) && is32Bit(c) -> (LoweredAtomicAddconst64 [c] ptr mem)
// constant comparisons
(SGTconst [c] (MOVVconst [d])) && c>d -> (MOVVconst [1])
(SGTconst [c] (MOVVconst [d])) && c<=d -> (MOVVconst [0])
(SGTUconst [c] (MOVVconst [d])) && uint64(c)>uint64(d) -> (MOVVconst [1])
(SGTUconst [c] (MOVVconst [d])) && uint64(c)<=uint64(d) -> (MOVVconst [0])
(SGTconst [c] (MOVVconst [d])) && c>d => (MOVVconst [1])
(SGTconst [c] (MOVVconst [d])) && c<=d => (MOVVconst [0])
(SGTUconst [c] (MOVVconst [d])) && uint64(c)>uint64(d) => (MOVVconst [1])
(SGTUconst [c] (MOVVconst [d])) && uint64(c)<=uint64(d) => (MOVVconst [0])
// other known comparisons
(SGTconst [c] (MOVBreg _)) && 0x7f < c -> (MOVVconst [1])
(SGTconst [c] (MOVBreg _)) && c <= -0x80 -> (MOVVconst [0])
(SGTconst [c] (MOVBUreg _)) && 0xff < c -> (MOVVconst [1])
(SGTconst [c] (MOVBUreg _)) && c < 0 -> (MOVVconst [0])
(SGTUconst [c] (MOVBUreg _)) && 0xff < uint64(c) -> (MOVVconst [1])
(SGTconst [c] (MOVHreg _)) && 0x7fff < c -> (MOVVconst [1])
(SGTconst [c] (MOVHreg _)) && c <= -0x8000 -> (MOVVconst [0])
(SGTconst [c] (MOVHUreg _)) && 0xffff < c -> (MOVVconst [1])
(SGTconst [c] (MOVHUreg _)) && c < 0 -> (MOVVconst [0])
(SGTUconst [c] (MOVHUreg _)) && 0xffff < uint64(c) -> (MOVVconst [1])
(SGTconst [c] (MOVWUreg _)) && c < 0 -> (MOVVconst [0])
(SGTconst [c] (ANDconst [m] _)) && 0 <= m && m < c -> (MOVVconst [1])
(SGTUconst [c] (ANDconst [m] _)) && uint64(m) < uint64(c) -> (MOVVconst [1])
(SGTconst [c] (SRLVconst _ [d])) && 0 <= c && 0 < d && d <= 63 && 0xffffffffffffffff>>uint64(d) < uint64(c) -> (MOVVconst [1])
(SGTUconst [c] (SRLVconst _ [d])) && 0 < d && d <= 63 && 0xffffffffffffffff>>uint64(d) < uint64(c) -> (MOVVconst [1])
(SGTconst [c] (MOVBreg _)) && 0x7f < c => (MOVVconst [1])
(SGTconst [c] (MOVBreg _)) && c <= -0x80 => (MOVVconst [0])
(SGTconst [c] (MOVBUreg _)) && 0xff < c => (MOVVconst [1])
(SGTconst [c] (MOVBUreg _)) && c < 0 => (MOVVconst [0])
(SGTUconst [c] (MOVBUreg _)) && 0xff < uint64(c) => (MOVVconst [1])
(SGTconst [c] (MOVHreg _)) && 0x7fff < c => (MOVVconst [1])
(SGTconst [c] (MOVHreg _)) && c <= -0x8000 => (MOVVconst [0])
(SGTconst [c] (MOVHUreg _)) && 0xffff < c => (MOVVconst [1])
(SGTconst [c] (MOVHUreg _)) && c < 0 => (MOVVconst [0])
(SGTUconst [c] (MOVHUreg _)) && 0xffff < uint64(c) => (MOVVconst [1])
(SGTconst [c] (MOVWUreg _)) && c < 0 => (MOVVconst [0])
(SGTconst [c] (ANDconst [m] _)) && 0 <= m && m < c => (MOVVconst [1])
(SGTUconst [c] (ANDconst [m] _)) && uint64(m) < uint64(c) => (MOVVconst [1])
(SGTconst [c] (SRLVconst _ [d])) && 0 <= c && 0 < d && d <= 63 && 0xffffffffffffffff>>uint64(d) < uint64(c) => (MOVVconst [1])
(SGTUconst [c] (SRLVconst _ [d])) && 0 < d && d <= 63 && 0xffffffffffffffff>>uint64(d) < uint64(c) => (MOVVconst [1])
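// Illustrative note (added for clarity, not part of the original rules): these fold
// when the operand's range is statically known, e.g. a MOVBUreg value lies in [0,255],
// so SGTconst [c] with c > 0xff is always 1, and with c < 0 always 0.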
// absorb constants into branches
(EQ (MOVVconst [0]) yes no) -> (First yes no)
(EQ (MOVVconst [c]) yes no) && c != 0 -> (First no yes)
(NE (MOVVconst [0]) yes no) -> (First no yes)
(NE (MOVVconst [c]) yes no) && c != 0 -> (First yes no)
(LTZ (MOVVconst [c]) yes no) && c < 0 -> (First yes no)
(LTZ (MOVVconst [c]) yes no) && c >= 0 -> (First no yes)
(LEZ (MOVVconst [c]) yes no) && c <= 0 -> (First yes no)
(LEZ (MOVVconst [c]) yes no) && c > 0 -> (First no yes)
(GTZ (MOVVconst [c]) yes no) && c > 0 -> (First yes no)
(GTZ (MOVVconst [c]) yes no) && c <= 0 -> (First no yes)
(GEZ (MOVVconst [c]) yes no) && c >= 0 -> (First yes no)
(GEZ (MOVVconst [c]) yes no) && c < 0 -> (First no yes)
(EQ (MOVVconst [0]) yes no) => (First yes no)
(EQ (MOVVconst [c]) yes no) && c != 0 => (First no yes)
(NE (MOVVconst [0]) yes no) => (First no yes)
(NE (MOVVconst [c]) yes no) && c != 0 => (First yes no)
(LTZ (MOVVconst [c]) yes no) && c < 0 => (First yes no)
(LTZ (MOVVconst [c]) yes no) && c >= 0 => (First no yes)
(LEZ (MOVVconst [c]) yes no) && c <= 0 => (First yes no)
(LEZ (MOVVconst [c]) yes no) && c > 0 => (First no yes)
(GTZ (MOVVconst [c]) yes no) && c > 0 => (First yes no)
(GTZ (MOVVconst [c]) yes no) && c <= 0 => (First no yes)
(GEZ (MOVVconst [c]) yes no) && c >= 0 => (First yes no)
(GEZ (MOVVconst [c]) yes no) && c < 0 => (First no yes)


@@ -450,9 +450,9 @@ func init() {
// There are three of these functions so that they can have three different register inputs.
// When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
// default registers to match so we don't need to copy registers around unnecessarily.
{name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r3, r4}}, typ: "Mem"}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem"}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem"}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r3, r4}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
}
blocks := []blockData{
