cmd/link: support 32b TLS_LE offsets on PPC64
When using the GCC thread sanitizer, it links in additional code which uses TLS, and this causes us to exceed the range of the 16 bit TLS relocation used by statically compiled Go code.

Rewrite objabi.R_POWER_TLS_LE to handle 32b offsets when linking internally or externally into an ELF binary. The ELF relocation translation is changed to generate a pair of R_PPC64_TPREL16_HA/LO relocations instead of a single R_PPC64_TPREL16.

Likewise, updating the above exposed some behavioral differences in GNU ld, which can rewrite TLS sequences. It expects the sequence to generate a valid TLS address, not an offset; this was exposed when compiling PIC code. The proper fix is to generate the full TLS address in the destination register of the "MOVD tlsaddr, $Rx" pseudo-op. This removes the need to insert special objabi.R_POWER_TLS relocations elsewhere.

Unfortunately, XCOFF (used by AIX) doesn't appear to support 32 bit offsets, so we rewrite this back into a 16b relocation when externally linking a static binary.

Fixes #45040

Change-Id: I1ee9afd0b427cd79888032aa1f60d3e265073e1d
Reviewed-on: https://go-review.googlesource.com/c/go/+/302209
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Go Bot <gobot@golang.org>
Parent: d948b8633d
Commit: 3e5bba0a44
5 changed files with 76 additions and 52 deletions
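The local-exec change boils down to splitting a 32-bit offset from the thread pointer across an addis/addi pair instead of patching one 16-bit immediate. A minimal sketch of that split, using the same (v + 0x8000) >> 16 high half and v & 0xffff low half that archreloc applies below (the helper name and example offset are illustrative only, not part of the change):

// Sketch only: how a 32-bit offset v from the thread pointer (R13) is split
// across the addis/addi pair covered by the new 8-byte R_POWER_TLS_LE.
package main

import "fmt"

func splitTPREL(v int32) (ha, lo uint16) {
	// addis shifts its (signed) immediate left 16 bits; addi sign-extends its
	// immediate. Adding 0x8000 before taking the high half compensates for
	// the sign extension of the low half, so the pair reconstructs v exactly.
	ha = uint16((v + 0x8000) >> 16)
	lo = uint16(v)
	return
}

func main() {
	ha, lo := splitTPREL(-0x12340) // illustrative offset
	// addis Rx, R13, ha
	// addi  Rx, Rx, lo
	fmt.Printf("ha=%#x lo=%#x\n", ha, lo)
}

With a single 16-bit immediate the old form capped offsets at roughly +/-32KiB, which the extra TLS pulled in by the thread sanitizer exceeds.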
@@ -229,8 +229,8 @@ var optab = []Optab{
 	{as: AMOVD, a1: C_LACON, a6: C_REG, type_: 26, size: 8},
 	{as: AMOVD, a1: C_SOREG, a6: C_REG, type_: 8, size: 4},
 	{as: AMOVD, a1: C_LOREG, a6: C_REG, type_: 36, size: 8},
-	{as: AMOVD, a1: C_TLS_LE, a6: C_REG, type_: 79, size: 4},
-	{as: AMOVD, a1: C_TLS_IE, a6: C_REG, type_: 80, size: 8},
+	{as: AMOVD, a1: C_TLS_LE, a6: C_REG, type_: 79, size: 8},
+	{as: AMOVD, a1: C_TLS_IE, a6: C_REG, type_: 80, size: 12},
 	{as: AMOVD, a1: C_TOCADDR, a6: C_REG, type_: 95, size: 8},
 	{as: AMOVD, a1: C_SPR, a6: C_REG, type_: 66, size: 4},
 	{as: AMOVD, a1: C_REG, a6: C_ADDR, type_: 74, size: 8},
@@ -2500,18 +2500,6 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 			if v != 0 {
 				c.ctxt.Diag("illegal indexed instruction\n%v", p)
 			}
-			if c.ctxt.Flag_shared && r == REG_R13 {
-				rel := obj.Addrel(c.cursym)
-				rel.Off = int32(c.pc)
-				rel.Siz = 4
-				// This (and the matching part in the load case
-				// below) are the only places in the ppc64 toolchain
-				// that knows the name of the tls variable. Possibly
-				// we could add some assembly syntax so that the name
-				// of the variable does not have to be assumed.
-				rel.Sym = c.ctxt.Lookup("runtime.tls_g")
-				rel.Type = objabi.R_POWER_TLS
-			}
 			o1 = AOP_RRR(c.opstorex(p.As), uint32(p.From.Reg), uint32(p.To.Index), uint32(r))
 		} else {
 			if int32(int16(v)) != v {
@@ -2536,13 +2524,6 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 			if v != 0 {
 				c.ctxt.Diag("illegal indexed instruction\n%v", p)
 			}
-			if c.ctxt.Flag_shared && r == REG_R13 {
-				rel := obj.Addrel(c.cursym)
-				rel.Off = int32(c.pc)
-				rel.Siz = 4
-				rel.Sym = c.ctxt.Lookup("runtime.tls_g")
-				rel.Type = objabi.R_POWER_TLS
-			}
 			o1 = AOP_RRR(c.oploadx(p.As), uint32(p.To.Reg), uint32(p.From.Index), uint32(r))
 		} else {
 			if int32(int16(v)) != v {
@@ -3511,10 +3492,11 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		if p.From.Offset != 0 {
 			c.ctxt.Diag("invalid offset against tls var %v", p)
 		}
-		o1 = AOP_IRR(OP_ADDI, uint32(p.To.Reg), REGZERO, 0)
+		o1 = AOP_IRR(OP_ADDIS, uint32(p.To.Reg), REG_R13, 0)
+		o2 = AOP_IRR(OP_ADDI, uint32(p.To.Reg), uint32(p.To.Reg), 0)
 		rel := obj.Addrel(c.cursym)
 		rel.Off = int32(c.pc)
-		rel.Siz = 4
+		rel.Siz = 8
 		rel.Sym = p.From.Sym
 		rel.Type = objabi.R_POWER_TLS_LE
 
@@ -3524,11 +3506,17 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		}
 		o1 = AOP_IRR(OP_ADDIS, uint32(p.To.Reg), REG_R2, 0)
 		o2 = AOP_IRR(c.opload(AMOVD), uint32(p.To.Reg), uint32(p.To.Reg), 0)
+		o3 = AOP_RRR(OP_ADD, uint32(p.To.Reg), uint32(p.To.Reg), REG_R13)
 		rel := obj.Addrel(c.cursym)
 		rel.Off = int32(c.pc)
 		rel.Siz = 8
 		rel.Sym = p.From.Sym
 		rel.Type = objabi.R_POWER_TLS_IE
+		rel = obj.Addrel(c.cursym)
+		rel.Off = int32(c.pc) + 8
+		rel.Siz = 4
+		rel.Sym = p.From.Sym
+		rel.Type = objabi.R_POWER_TLS
 
 	case 81:
 		v := c.vregoff(&p.To)
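For reference, the two instruction words case 79 now emits can be reproduced with a tiny encoder. This is a sketch that assumes AOP_IRR packs op | rt<<21 | ra<<16 | imm&0xffff; the opcode constants and the choice of R3 as destination are illustrative:

// Sketch: encode the addis/addi pair emitted for a TLS LE "MOVD sym(SB), R3".
// Assumes AOP_IRR-style packing of op | rt<<21 | ra<<16 | imm&0xffff.
package main

import "fmt"

const (
	opADDIS = uint32(15) << 26 // primary opcode 15: addis
	opADDI  = uint32(14) << 26 // primary opcode 14: addi
	regR13  = uint32(13)       // thread pointer
)

func aopIRR(op, rt, ra, imm uint32) uint32 {
	return op | (rt&31)<<21 | (ra&31)<<16 | imm&0xFFFF
}

func main() {
	rt := uint32(3)                      // destination register of the MOVD pseudo-op
	o1 := aopIRR(opADDIS, rt, regR13, 0) // addis R3, R13, 0
	o2 := aopIRR(opADDI, rt, rt, 0)      // addi  R3, R3, 0
	fmt.Printf("%08x %08x\n", o1, o2)    // 3c6d0000 38630000
}

Both immediates are left at zero; R_POWER_TLS_LE fills them in when linking internally, and the R_PPC64_TPREL16_HA/LO pair does so when linking externally into an ELF binary.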
@@ -167,8 +167,8 @@ const (
 
 	// R_POWER_TLS_LE is used to implement the "local exec" model for tls
 	// access. It resolves to the offset of the thread-local symbol from the
-	// thread pointer (R13) and inserts this value into the low 16 bits of an
-	// instruction word.
+	// thread pointer (R13) and is split across a pair of instructions to
+	// support a 32 bit displacement.
 	R_POWER_TLS_LE
 
 	// R_POWER_TLS_IE is used to implement the "initial exec" model for tls access. It
@@ -178,10 +178,12 @@ const (
 	// symbol from the thread pointer (R13)).
 	R_POWER_TLS_IE
 
-	// R_POWER_TLS marks an X-form instruction such as "MOVD 0(R13)(R31*1), g" as
-	// accessing a particular thread-local symbol. It does not affect code generation
-	// but is used by the system linker when relaxing "initial exec" model code to
-	// "local exec" model code.
+	// R_POWER_TLS marks an X-form instruction such as "ADD R3,R13,R4" as completing
+	// a sequence of GOT-relative relocations to compute a TLS address. This can be
+	// used by the system linker to rewrite the GOT-relative TLS relocation into a
+	// simpler thread-pointer relative relocation. See table 3.26 and 3.28 in the
+	// ppc64 elfv2 1.4 ABI on this transformation. Likewise, the second argument
+	// (usually called RB in X-form instructions) is assumed to be R13.
 	R_POWER_TLS
 
 	// R_ADDRPOWER_DS is similar to R_ADDRPOWER above, but assumes the second
@@ -418,6 +418,7 @@ func xcoffreloc1(arch *sys.Arch, out *ld.OutBuf, ldr *loader.Loader, s loader.Sy
 		emitReloc(ld.XCOFF_R_TOCU|(0x0F<<8), 2)
 		emitReloc(ld.XCOFF_R_TOCL|(0x0F<<8), 6)
 	case objabi.R_POWER_TLS_LE:
+		// This only supports 16b relocations. It is fixed up in archreloc.
 		emitReloc(ld.XCOFF_R_TLS_LE|0x0F<<8, 2)
 	case objabi.R_CALLPOWER:
 		if r.Size != 4 {
@@ -458,7 +459,10 @@ func elfreloc1(ctxt *ld.Link, out *ld.OutBuf, ldr *loader.Loader, s loader.Sym,
 	case objabi.R_POWER_TLS:
 		out.Write64(uint64(elf.R_PPC64_TLS) | uint64(elfsym)<<32)
 	case objabi.R_POWER_TLS_LE:
-		out.Write64(uint64(elf.R_PPC64_TPREL16) | uint64(elfsym)<<32)
+		out.Write64(uint64(elf.R_PPC64_TPREL16_HA) | uint64(elfsym)<<32)
+		out.Write64(uint64(r.Xadd))
+		out.Write64(uint64(sectoff + 4))
+		out.Write64(uint64(elf.R_PPC64_TPREL16_LO) | uint64(elfsym)<<32)
 	case objabi.R_POWER_TLS_IE:
 		out.Write64(uint64(elf.R_PPC64_GOT_TPREL16_HA) | uint64(elfsym)<<32)
 		out.Write64(uint64(r.Xadd))
@@ -797,11 +801,25 @@ func archreloc(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, r loade
 		if !target.IsAIX() {
 			return val, nExtReloc, false
 		}
-	case objabi.R_POWER_TLS, objabi.R_POWER_TLS_LE, objabi.R_POWER_TLS_IE:
-		// check Outer is nil, Type is TLSBSS?
+	case objabi.R_POWER_TLS:
 		nExtReloc = 1
-		if rt == objabi.R_POWER_TLS_IE {
-			nExtReloc = 2 // need two ELF relocations, see elfreloc1
+		return val, nExtReloc, true
+	case objabi.R_POWER_TLS_LE, objabi.R_POWER_TLS_IE:
+		if target.IsAIX() && rt == objabi.R_POWER_TLS_LE {
+			// Fixup val, an addis/addi pair of instructions, which generate a 32b displacement
+			// from the threadpointer (R13), into a 16b relocation. XCOFF only supports 16b
+			// TLS LE relocations. Likewise, verify this is an addis/addi sequence.
+			const expectedOpcodes = 0x3C00000038000000
+			const expectedOpmasks = 0xFC000000FC000000
+			if uint64(val)&expectedOpmasks != expectedOpcodes {
+				ldr.Errorf(s, "relocation for %s+%d is not an addis/addi pair: %16x", ldr.SymName(rs), r.Off(), uint64(val))
+			}
+			nval := (int64(uint32(0x380d0000)) | val&0x03e00000) << 32 // addi rX, r13, $0
+			nval |= int64(0x60000000)                                  // nop
+			val = nval
+			nExtReloc = 1
+		} else {
+			nExtReloc = 2
 		}
 		return val, nExtReloc, true
 	case objabi.R_ADDRPOWER,
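The AIX branch above is dense, so here is a standalone sketch of the same fixup (the helper name collapseForXCOFF is hypothetical; archreloc does this rewrite in place). The mask isolates the two primary opcode fields, the check confirms an addis/addi pair, and the rewrite keeps the addi's destination register while folding the pair into addi rX, r13, 0 followed by a nop:

// Sketch of the XCOFF fallback: collapse the 8-byte addis/addi pair back into
// a single 16b-relocatable "addi rX, r13, 0 ; nop". Helper name is hypothetical.
package main

import "fmt"

func collapseForXCOFF(val uint64) (uint64, error) {
	const (
		expectedOpcodes = uint64(0x3C00000038000000) // addis (op 15) then addi (op 14)
		expectedOpmasks = uint64(0xFC000000FC000000) // primary opcode fields only
	)
	if val&expectedOpmasks != expectedOpcodes {
		return 0, fmt.Errorf("not an addis/addi pair: %016x", val)
	}
	nval := (uint64(0x380d0000) | val&0x03e00000) << 32 // addi rX, r13, 0 in the first slot
	nval |= 0x60000000                                  // nop in the second slot
	return nval, nil
}

func main() {
	pair := uint64(0x3C6D000038630000) // addis R3,R13,0 ; addi R3,R3,0
	fixed, err := collapseForXCOFF(pair)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%016x\n", fixed) // 386d000060000000: addi R3,R13,0 ; nop
}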
@@ -855,10 +873,26 @@ func archreloc(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, r loade
 			// the TLS.
 			v -= 0x800
 		}
-		if int64(int16(v)) != v {
+
+		var o1, o2 uint32
+		if int64(int32(v)) != v {
 			ldr.Errorf(s, "TLS offset out of range %d", v)
 		}
-		return (val &^ 0xffff) | (v & 0xffff), nExtReloc, true
+		if target.IsBigEndian() {
+			o1 = uint32(val >> 32)
+			o2 = uint32(val)
+		} else {
+			o1 = uint32(val)
+			o2 = uint32(val >> 32)
+		}
+
+		o1 |= uint32(((v + 0x8000) >> 16) & 0xFFFF)
+		o2 |= uint32(v & 0xFFFF)
+
+		if target.IsBigEndian() {
+			return int64(o1)<<32 | int64(o2), nExtReloc, true
+		}
+		return int64(o2)<<32 | int64(o1), nExtReloc, true
 	}
 
 	return val, nExtReloc, false
@@ -36,8 +36,8 @@
 // racecalladdr.
 //
 // The sequence used to get the race ctx:
-//	MOVD runtime·tls_g(SB), R10 // offset to TLS
-//	MOVD 0(R13)(R10*1), g // R13=TLS for this thread, g = R30
+//	MOVD runtime·tls_g(SB), R10 // Address of TLS variable
+//	MOVD 0(R10), g // g = R30
 //	MOVD g_racectx(g), R3 // racectx == ThreadState
 
 // func runtime·RaceRead(addr uintptr)
@@ -137,7 +137,7 @@ TEXT runtime·racewriterangepc1(SB), NOSPLIT, $0-24
 // Otherwise, setup goroutine context and invoke racecall. Other arguments already set.
 TEXT racecalladdr<>(SB), NOSPLIT, $0-0
 	MOVD	runtime·tls_g(SB), R10
-	MOVD	0(R13)(R10*1), g
+	MOVD	0(R10), g
 	MOVD	g_racectx(g), R3	// goroutine context
 	// Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend).
 	MOVD	runtime·racearenastart(SB), R9
@@ -173,7 +173,7 @@ TEXT runtime·racefuncenter(SB), NOSPLIT, $0-8
 // R11 = caller's return address
 TEXT racefuncenter<>(SB), NOSPLIT, $0-0
 	MOVD	runtime·tls_g(SB), R10
-	MOVD	0(R13)(R10*1), g
+	MOVD	0(R10), g
 	MOVD	g_racectx(g), R3	// goroutine racectx aka *ThreadState
 	MOVD	R8, R4	// caller pc set by caller in R8
 	// void __tsan_func_enter(ThreadState *thr, void *pc);
@@ -185,7 +185,7 @@ TEXT racefuncenter<>(SB), NOSPLIT, $0-0
 // Called from Go instrumented code.
 TEXT runtime·racefuncexit(SB), NOSPLIT, $0-0
 	MOVD	runtime·tls_g(SB), R10
-	MOVD	0(R13)(R10*1), g
+	MOVD	0(R10), g
 	MOVD	g_racectx(g), R3	// goroutine racectx aka *ThreadState
 	// void __tsan_func_exit(ThreadState *thr);
 	MOVD	$__tsan_func_exit(SB), R8
@@ -380,7 +380,7 @@ racecallatomic_data:
 racecallatomic_ok:
 	// Addr is within the good range, call the atomic function.
 	MOVD	runtime·tls_g(SB), R10
-	MOVD	0(R13)(R10*1), g
+	MOVD	0(R10), g
 	MOVD	g_racectx(g), R3	// goroutine racectx aka *ThreadState
 	MOVD	R8, R5	// pc is the function called
 	MOVD	(R1), R4	// caller pc from stack
@@ -394,7 +394,7 @@ racecallatomic_ignore:
 	MOVD	R6, R17	// save the original arg list addr
 	MOVD	$__tsan_go_ignore_sync_begin(SB), R8 // func addr to call
 	MOVD	runtime·tls_g(SB), R10
-	MOVD	0(R13)(R10*1), g
+	MOVD	0(R10), g
 	MOVD	g_racectx(g), R3	// goroutine context
 	BL	racecall<>(SB)
 	MOVD	R15, R8	// restore the original function
@@ -402,7 +402,7 @@ racecallatomic_ignore:
 	// Call the atomic function.
 	// racecall will call LLVM race code which might clobber r30 (g)
 	MOVD	runtime·tls_g(SB), R10
-	MOVD	0(R13)(R10*1), g
+	MOVD	0(R10), g
 
 	MOVD	g_racectx(g), R3
 	MOVD	R8, R4	// pc being called same TODO as above
@@ -434,7 +434,7 @@ TEXT racecall<>(SB), NOSPLIT, $0-0
 	MOVD	R10, 16(R1)	// C ABI
 	// Get info from the current goroutine
 	MOVD	runtime·tls_g(SB), R10	// g offset in TLS
-	MOVD	0(R13)(R10*1), g	// R13 = current TLS
+	MOVD	0(R10), g
 	MOVD	g_m(g), R7	// m for g
 	MOVD	R1, R16	// callee-saved, preserved across C call
 	MOVD	m_g0(R7), R10	// g0 for m
@@ -448,7 +448,7 @@ call:
 	XOR	R0, R0	// clear R0 on return from Clang
 	MOVD	R16, R1	// restore R1; R16 nonvol in Clang
 	MOVD	runtime·tls_g(SB), R10	// find correct g
-	MOVD	0(R13)(R10*1), g
+	MOVD	0(R10), g
 	MOVD	16(R1), R10	// LR was saved away, restore for return
 	MOVD	R10, LR
 	RET
@@ -469,7 +469,7 @@ TEXT runtime·racecallbackthunk(SB), NOSPLIT, $-8
 	// g0 TODO: Don't modify g here since R30 is nonvolatile
 	MOVD	g, R9
 	MOVD	runtime·tls_g(SB), R10
-	MOVD	0(R13)(R10*1), g
+	MOVD	0(R10), g
 	MOVD	g_m(g), R3
 	MOVD	m_p(R3), R3
 	MOVD	p_raceprocctx(R3), R3
@@ -527,7 +527,7 @@ rest:
 	MOVD	R4, FIXED_FRAME+8(R1)
 
 	MOVD	runtime·tls_g(SB), R10
-	MOVD	0(R13)(R10*1), g
+	MOVD	0(R10), g
 
 	MOVD	g_m(g), R7
 	MOVD	m_g0(R7), R8
@@ -540,7 +540,7 @@ rest:
 
 	// All registers are clobbered after Go code, reload.
 	MOVD	runtime·tls_g(SB), R10
-	MOVD	0(R13)(R10*1), g
+	MOVD	0(R10), g
 
 	MOVD	g_m(g), R7
 	MOVD	m_curg(R7), g	// restore g = m->curg
@@ -29,7 +29,7 @@ TEXT runtime·save_g(SB),NOSPLIT|NOFRAME,$0-0
 	BEQ	nocgo
 #endif
 	MOVD	runtime·tls_g(SB), R31
-	MOVD	g, 0(R13)(R31*1)
+	MOVD	g, 0(R31)
 
 nocgo:
 	RET
@@ -45,7 +45,7 @@ nocgo:
 // NOTE: _cgo_topofstack assumes this only clobbers g (R30), and R31.
 TEXT runtime·load_g(SB),NOSPLIT|NOFRAME,$0-0
 	MOVD	runtime·tls_g(SB), R31
-	MOVD	0(R13)(R31*1), g
+	MOVD	0(R31), g
 	RET
 
 GLOBL runtime·tls_g+0(SB), TLSBSS+DUPOK, $8