From 700751006e14ecd6b1d40856e2eb58494ab8556b Mon Sep 17 00:00:00 2001
From: rickyleung
Date: Fri, 10 May 2024 22:00:14 +0800
Subject: [PATCH] Support stack clash protection

(cherry picked from commit 4f4298791f15f26e0649f57c6edfd999af51ec41)
---
 ...acksave-stackrestore-in-IRTranslator.patch | 315 ++
 ...tion-of-locals-and-stack-realignment.patch | 546 ++++
 ...Stack-probing-for-function-prologues.patch | 2652 +++++++++++++++++
 ...-for-dynamic-allocas-in-SelectionDAG.patch | 744 +++++
 ...ng-for-dynamic-allocas-in-GlobalISel.patch | 496 +++
 ...-for-stack-clash-protection-backport.patch | 177 ++
 llvm.spec | 12 +-
 7 files changed, 4941 insertions(+), 1 deletion(-)
 create mode 100644 0021-Backport-GlobalISel-Don-t-expand-stacksave-stackrestore-in-IRTranslator.patch
 create mode 100644 0022-Backport-AArch64-Refactor-allocation-of-locals-and-stack-realignment.patch
 create mode 100644 0023-Backport-AArch64-Stack-probing-for-function-prologues.patch
 create mode 100644 0024-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-SelectionDAG.patch
 create mode 100644 0025-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-GlobalISel.patch
 create mode 100644 0026-Update-testcase-for-stack-clash-protection-backport.patch

diff --git a/0021-Backport-GlobalISel-Don-t-expand-stacksave-stackrestore-in-IRTranslator.patch b/0021-Backport-GlobalISel-Don-t-expand-stacksave-stackrestore-in-IRTranslator.patch
new file mode 100644
index 0000000..3fbe2e7
--- /dev/null
+++ b/0021-Backport-GlobalISel-Don-t-expand-stacksave-stackrestore-in-IRTranslator.patch
@@ -0,0 +1,315 @@
+From 7aeecae6393d5c3333beec64ad343ed1cabe75e4 Mon Sep 17 00:00:00 2001
+From: Matt Arsenault
+Date: Sat, 29 Jul 2023 19:12:24 -0400
+Subject: [PATCH 1/7] GlobalISel: Don't expand stacksave/stackrestore in
+ IRTranslator
+
+In some (likely invalid edge cases anyway), it's not correct to
+directly copy the stack pointer register.
+--- + .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 2 + + llvm/include/llvm/Support/TargetOpcodes.def | 6 +++ + llvm/include/llvm/Target/GenericOpcodes.td | 12 ++++++ + llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 25 ++---------- + .../CodeGen/GlobalISel/LegalizerHelper.cpp | 26 +++++++++++++ + .../AArch64/GISel/AArch64LegalizerInfo.cpp | 4 +- + llvm/lib/Target/X86/X86LegalizerInfo.cpp | 4 ++ + .../AArch64/GlobalISel/arm64-irtranslator.ll | 4 +- + .../GlobalISel/legalizer-info-validation.mir | 10 ++++- + .../GlobalISel/stacksave-stackrestore.ll | 35 +++++++++++++++++ + .../X86/GlobalISel/stacksave-stackrestore.ll | 39 +++++++++++++++++++ + 11 files changed, 141 insertions(+), 26 deletions(-) + create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll + create mode 100644 llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll + +diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +index a568edd0e640..9288091874cf 100644 +--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h ++++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +@@ -401,6 +401,8 @@ public: + LegalizeResult lowerExtractInsertVectorElt(MachineInstr &MI); + LegalizeResult lowerShuffleVector(MachineInstr &MI); + LegalizeResult lowerDynStackAlloc(MachineInstr &MI); ++ LegalizeResult lowerStackSave(MachineInstr &MI); ++ LegalizeResult lowerStackRestore(MachineInstr &MI); + LegalizeResult lowerExtract(MachineInstr &MI); + LegalizeResult lowerInsert(MachineInstr &MI); + LegalizeResult lowerSADDO_SSUBO(MachineInstr &MI); +diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def +index 186bea75ae96..c92ce6dc701c 100644 +--- a/llvm/include/llvm/Support/TargetOpcodes.def ++++ b/llvm/include/llvm/Support/TargetOpcodes.def +@@ -763,6 +763,12 @@ HANDLE_TARGET_OPCODE(G_JUMP_TABLE) + /// Generic dynamic stack allocation. + HANDLE_TARGET_OPCODE(G_DYN_STACKALLOC) + ++/// Generic stack pointer save. ++HANDLE_TARGET_OPCODE(G_STACKSAVE) ++ ++/// Generic stack pointer restore. ++HANDLE_TARGET_OPCODE(G_STACKRESTORE) ++ + /// Strict floating point instructions. + HANDLE_TARGET_OPCODE(G_STRICT_FADD) + HANDLE_TARGET_OPCODE(G_STRICT_FSUB) +diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td +index 00d56d1c4bd5..e8cfaeab3cd8 100644 +--- a/llvm/include/llvm/Target/GenericOpcodes.td ++++ b/llvm/include/llvm/Target/GenericOpcodes.td +@@ -225,6 +225,18 @@ def G_DYN_STACKALLOC : GenericInstruction { + let hasSideEffects = true; + } + ++def G_STACKSAVE : GenericInstruction { ++ let OutOperandList = (outs ptype0:$dst); ++ let InOperandList = (ins); ++ let hasSideEffects = true; ++} ++ ++def G_STACKRESTORE : GenericInstruction { ++ let OutOperandList = (outs); ++ let InOperandList = (ins ptype0:$src); ++ let hasSideEffects = true; ++} ++ + def G_FREEZE : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); +diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +index 9a67a8d05a4d..e4b837c6b8ce 100644 +--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp ++++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +@@ -2229,31 +2229,12 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, + return true; + } + case Intrinsic::stacksave: { +- // Save the stack pointer to the location provided by the intrinsic. 
+- Register Reg = getOrCreateVReg(CI); +- Register StackPtr = MF->getSubtarget() +- .getTargetLowering() +- ->getStackPointerRegisterToSaveRestore(); +- +- // If the target doesn't specify a stack pointer, then fall back. +- if (!StackPtr) +- return false; +- +- MIRBuilder.buildCopy(Reg, StackPtr); ++ MIRBuilder.buildInstr(TargetOpcode::G_STACKSAVE, {getOrCreateVReg(CI)}, {}); + return true; + } + case Intrinsic::stackrestore: { +- // Restore the stack pointer from the location provided by the intrinsic. +- Register Reg = getOrCreateVReg(*CI.getArgOperand(0)); +- Register StackPtr = MF->getSubtarget() +- .getTargetLowering() +- ->getStackPointerRegisterToSaveRestore(); +- +- // If the target doesn't specify a stack pointer, then fall back. +- if (!StackPtr) +- return false; +- +- MIRBuilder.buildCopy(StackPtr, Reg); ++ MIRBuilder.buildInstr(TargetOpcode::G_STACKRESTORE, {}, ++ {getOrCreateVReg(*CI.getArgOperand(0))}); + return true; + } + case Intrinsic::cttz: +diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +index f0da0d88140f..75d9789be4d0 100644 +--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp ++++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +@@ -3503,6 +3503,10 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { + return lowerShuffleVector(MI); + case G_DYN_STACKALLOC: + return lowerDynStackAlloc(MI); ++ case G_STACKSAVE: ++ return lowerStackSave(MI); ++ case G_STACKRESTORE: ++ return lowerStackRestore(MI); + case G_EXTRACT: + return lowerExtract(MI); + case G_INSERT: +@@ -6810,6 +6814,28 @@ LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) { + return Legalized; + } + ++LegalizerHelper::LegalizeResult ++LegalizerHelper::lowerStackSave(MachineInstr &MI) { ++ Register StackPtr = TLI.getStackPointerRegisterToSaveRestore(); ++ if (!StackPtr) ++ return UnableToLegalize; ++ ++ MIRBuilder.buildCopy(MI.getOperand(0), StackPtr); ++ MI.eraseFromParent(); ++ return Legalized; ++} ++ ++LegalizerHelper::LegalizeResult ++LegalizerHelper::lowerStackRestore(MachineInstr &MI) { ++ Register StackPtr = TLI.getStackPointerRegisterToSaveRestore(); ++ if (!StackPtr) ++ return UnableToLegalize; ++ ++ MIRBuilder.buildCopy(StackPtr, MI.getOperand(0)); ++ MI.eraseFromParent(); ++ return Legalized; ++} ++ + LegalizerHelper::LegalizeResult + LegalizerHelper::lowerExtract(MachineInstr &MI) { + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); +diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +index d905da4eaec3..f0130a0be29d 100644 +--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp ++++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +@@ -797,7 +797,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) + return Query.Types[0] == p0 && Query.Types[1] == s64; + }); + +- getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower(); ++ getActionDefinitionsBuilder({G_DYN_STACKALLOC, ++ G_STACKSAVE, ++ G_STACKRESTORE}).lower(); + + if (ST.hasMOPS()) { + // G_BZERO is not supported. 
Currently it is only emitted by +diff --git a/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/X86LegalizerInfo.cpp +index a4a247f85f3d..104461cff0a9 100644 +--- a/llvm/lib/Target/X86/X86LegalizerInfo.cpp ++++ b/llvm/lib/Target/X86/X86LegalizerInfo.cpp +@@ -528,6 +528,10 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, + // memory intrinsics + getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall(); + ++ getActionDefinitionsBuilder({G_DYN_STACKALLOC, ++ G_STACKSAVE, ++ G_STACKRESTORE}).lower(); ++ + // fp intrinsics + getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN) + .scalarize(0) +diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +index 5f3544add398..575cd6b874e3 100644 +--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll ++++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +@@ -2392,8 +2392,8 @@ declare ptr @llvm.stacksave() + declare void @llvm.stackrestore(ptr) + define void @test_stacksaverestore() { + ; CHECK-LABEL: name: test_stacksaverestore +- ; CHECK: [[SAVE:%[0-9]+]]:_(p0) = COPY $sp +- ; CHECK-NEXT: $sp = COPY [[SAVE]](p0) ++ ; CHECK: [[SAVE:%[0-9]+]]:_(p0) = G_STACKSAVE ++ ; CHECK-NEXT: G_STACKRESTORE [[SAVE]] + ; CHECK-NEXT: RET_ReallyLR + %sp = call ptr @llvm.stacksave() + call void @llvm.stackrestore(ptr %sp) +diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +index b4fe73d29fa6..461161f5b338 100644 +--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir ++++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +@@ -641,7 +641,15 @@ + # DEBUG-NEXT: G_JUMP_TABLE (opcode {{[0-9]+}}): 1 type index, 0 imm indices + # DEBUG-NEXT: .. the first uncovered type index: 1, OK + # DEBUG-NEXT: .. the first uncovered imm index: 0, OK +-# DEBUG-NEXT: G_DYN_STACKALLOC (opcode {{[0-9]+}}): 2 type indices, 0 imm indices ++# DEBUG-NEXT: G_DYN_STACKALLOC (opcode [[DYN_STACKALLOC:[0-9]+]]): 2 type indices, 0 imm indices ++# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected ++# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected ++# DEBUG-NEXT: G_STACKSAVE (opcode {{[0-9]+}}): 1 type index, 0 imm indices ++# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[DYN_STACKALLOC]] ++# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected ++# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected ++# DEBUG-NEXT: G_STACKRESTORE (opcode {{[0-9]+}}): 1 type index, 0 imm indices ++# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[DYN_STACKALLOC]] + # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected + # DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected + # DEBUG-NEXT: G_STRICT_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices +diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll +new file mode 100644 +index 000000000000..16bf85af9c17 +--- /dev/null ++++ b/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll +@@ -0,0 +1,35 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ++; RUN: llc -global-isel=1 -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s ++ ++declare void @use_addr(ptr) ++declare ptr @llvm.stacksave.p0() ++declare void @llvm.stackrestore.p0(ptr) ++ ++define void @test_scoped_alloca(i64 %n) { ++; CHECK-LABEL: test_scoped_alloca: ++; CHECK: // %bb.0: ++; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill ++; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 32 ++; CHECK-NEXT: .cfi_offset w19, -16 ++; CHECK-NEXT: .cfi_offset w30, -24 ++; CHECK-NEXT: .cfi_offset w29, -32 ++; CHECK-NEXT: add x9, x0, #15 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 ++; CHECK-NEXT: mov x19, sp ++; CHECK-NEXT: sub x0, x8, x9 ++; CHECK-NEXT: mov sp, x0 ++; CHECK-NEXT: bl use_addr ++; CHECK-NEXT: mov sp, x19 ++; CHECK-NEXT: mov sp, x29 ++; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload ++; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ++; CHECK-NEXT: ret ++ %sp = call ptr @llvm.stacksave.p0() ++ %addr = alloca i8, i64 %n ++ call void @use_addr(ptr %addr) ++ call void @llvm.stackrestore.p0(ptr %sp) ++ ret void ++} +diff --git a/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll +new file mode 100644 +index 000000000000..e86c04ee22db +--- /dev/null ++++ b/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll +@@ -0,0 +1,39 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ++; RUN: llc -global-isel=1 -mtriple=x86_64-linux-gnu -o - %s | FileCheck %s ++ ++declare void @use_addr(ptr) ++declare ptr @llvm.stacksave.p0() ++declare void @llvm.stackrestore.p0(ptr) ++ ++define void @test_scoped_alloca(i64 %n) { ++; CHECK-LABEL: test_scoped_alloca: ++; CHECK: # %bb.0: ++; CHECK-NEXT: pushq %rbp ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset %rbp, -16 ++; CHECK-NEXT: movq %rsp, %rbp ++; CHECK-NEXT: .cfi_def_cfa_register %rbp ++; CHECK-NEXT: pushq %rbx ++; CHECK-NEXT: pushq %rax ++; CHECK-NEXT: .cfi_offset %rbx, -24 ++; CHECK-NEXT: movq %rsp, %rbx ++; CHECK-NEXT: movq %rsp, %rax ++; CHECK-NEXT: imulq $1, %rdi, %rcx ++; CHECK-NEXT: addq $15, %rcx ++; CHECK-NEXT: andq $-16, %rcx ++; CHECK-NEXT: subq %rcx, %rax ++; CHECK-NEXT: movq %rax, %rsp ++; CHECK-NEXT: movq %rax, %rdi ++; CHECK-NEXT: callq use_addr ++; CHECK-NEXT: movq %rbx, %rsp ++; CHECK-NEXT: leaq -8(%rbp), %rsp ++; CHECK-NEXT: popq %rbx ++; CHECK-NEXT: popq %rbp ++; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ++; CHECK-NEXT: retq ++ %sp = call ptr @llvm.stacksave.p0() ++ %addr = alloca i8, i64 %n ++ call void @use_addr(ptr %addr) ++ call void @llvm.stackrestore.p0(ptr %sp) ++ ret void ++} +-- +2.42.0.windows.2 + diff --git a/0022-Backport-AArch64-Refactor-allocation-of-locals-and-stack-realignment.patch b/0022-Backport-AArch64-Refactor-allocation-of-locals-and-stack-realignment.patch new file mode 100644 index 0000000..6fefb9c --- 
/dev/null +++ b/0022-Backport-AArch64-Refactor-allocation-of-locals-and-stack-realignment.patch @@ -0,0 +1,546 @@ +From 8db377e2a22d83637171008b6c8723f1869a2926 Mon Sep 17 00:00:00 2001 +From: rickyleung +Date: Tue, 7 May 2024 21:24:49 +0800 +Subject: [PATCH 3/7] [backport][AArch64] Refactor allocation of locals and + stack realignment + +Reference: https://github.com/wc00862805aj/llvm-project/commit/dedf2c6bb5193652f6ad7d9ff9e676624c2485b7? + +Factor out some stack allocation in a separate function. This patch +splits out the generic portion of a larger refactoring done as a part of +stack clash protection support. + +The patch is almost, but not quite NFC. The only difference should +be that where we have adjacent allocation of stack space +for local SVE objects and non-local SVE objects the order +of `sub sp, ...` and `addvl sp, ...` instructions is reversed, because now +it's done with a single call to `emitFrameOffset` and it happens +add/subtract the fixed part before the scalable part, e.g. + + addvl sp, sp, #-2 + sub sp, sp, llvm#16, lsl llvm#12 + sub sp, sp, llvm#16 + +becomes + + sub sp, sp, llvm#16, lsl llvm#12 + sub sp, sp, llvm#16 + addvl sp, sp, #-2 +--- + .../Target/AArch64/AArch64FrameLowering.cpp | 114 +++++++++--------- + .../lib/Target/AArch64/AArch64FrameLowering.h | 5 + + .../AArch64/framelayout-sve-basepointer.mir | 4 +- + .../framelayout-sve-fixed-width-access.mir | 2 +- + .../framelayout-sve-scavengingslot.mir | 4 +- + llvm/test/CodeGen/AArch64/framelayout-sve.mir | 54 ++++----- + .../AArch64/spill-stack-realignment.mir | 2 +- + llvm/test/CodeGen/AArch64/stack-guard-sve.ll | 4 +- + .../AArch64/sve-calling-convention-mixed.ll | 4 +- + .../CodeGen/AArch64/sve-fixed-length-fp128.ll | 4 +- + 10 files changed, 103 insertions(+), 94 deletions(-) + +diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +index 4d5676f34101..eeb6185fa36d 100644 +--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp ++++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +@@ -300,6 +300,7 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF, + static bool produceCompactUnwindFrame(MachineFunction &MF); + static bool needsWinCFI(const MachineFunction &MF); + static StackOffset getSVEStackSize(const MachineFunction &MF); ++static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB); + static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF); + + /// Returns true if a homogeneous prolog or epilog code can be emitted +@@ -671,6 +672,44 @@ void AArch64FrameLowering::emitCalleeSavedSVERestores( + emitCalleeSavedRestores(MBB, MBBI, true); + } + ++void AArch64FrameLowering::allocateStackSpace( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ bool NeedsRealignment, StackOffset AllocSize, bool NeedsWinCFI, ++ bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset) const { ++ ++ if (!AllocSize) ++ return; ++ ++ DebugLoc DL; ++ MachineFunction &MF = *MBB.getParent(); ++ const AArch64Subtarget &Subtarget = MF.getSubtarget(); ++ const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); ++ AArch64FunctionInfo &AFI = *MF.getInfo(); ++ const MachineFrameInfo &MFI = MF.getFrameInfo(); ++ ++ Register TargetReg = ++ NeedsRealignment ? 
findScratchNonCalleeSaveRegister(&MBB) : AArch64::SP; ++ // SUB Xd/SP, SP, AllocSize ++ emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII, ++ MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI, ++ EmitCFI, InitialOffset); ++ ++ if (NeedsRealignment) { ++ const int64_t MaxAlign = MFI.getMaxAlign().value(); ++ const uint64_t AndMask = ~(MaxAlign - 1); ++ // AND SP, Xd, 0b11111...0000 ++ BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP) ++ .addReg(TargetReg, RegState::Kill) ++ .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) ++ .setMIFlags(MachineInstr::FrameSetup); ++ AFI.setStackRealigned(true); ++ ++ // No need for SEH instructions here; if we're realigning the stack, ++ // we've set a frame pointer and already finished the SEH prologue. ++ assert(!NeedsWinCFI); ++ } ++} ++ + static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) { + switch (Reg.id()) { + default: +@@ -1769,7 +1808,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, + } + } + +- StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {}; ++ StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize; + MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI; + + // Process the SVE callee-saves to determine what space needs to be +@@ -1782,67 +1821,32 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, + ++MBBI; + CalleeSavesEnd = MBBI; + +- AllocateBefore = StackOffset::getScalable(CalleeSavedSize); +- AllocateAfter = SVEStackSize - AllocateBefore; ++ SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize); ++ SVELocalsSize = SVEStackSize - SVECalleeSavesSize; + } + + // Allocate space for the callee saves (if any). +- emitFrameOffset( +- MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII, +- MachineInstr::FrameSetup, false, false, nullptr, +- EmitAsyncCFI && !HasFP && AllocateBefore, +- StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); ++ StackOffset CFAOffset = ++ StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); ++ allocateStackSpace(MBB, CalleeSavesBegin, false, SVECalleeSavesSize, false, ++ nullptr, EmitAsyncCFI && !HasFP, CFAOffset); ++ CFAOffset += SVECalleeSavesSize; + + if (EmitAsyncCFI) + emitCalleeSavedSVELocations(MBB, CalleeSavesEnd); + +- // Finally allocate remaining SVE stack space. +- emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP, +- -AllocateAfter, TII, MachineInstr::FrameSetup, false, false, +- nullptr, EmitAsyncCFI && !HasFP && AllocateAfter, +- AllocateBefore + StackOffset::getFixed( +- (int64_t)MFI.getStackSize() - NumBytes)); +- +- // Allocate space for the rest of the frame. +- if (NumBytes) { +- unsigned scratchSPReg = AArch64::SP; +- +- if (NeedsRealignment) { +- scratchSPReg = findScratchNonCalleeSaveRegister(&MBB); +- assert(scratchSPReg != AArch64::NoRegister); +- } +- +- // If we're a leaf function, try using the red zone. +- if (!canUseRedZone(MF)) { +- // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have +- // the correct value here, as NumBytes also includes padding bytes, +- // which shouldn't be counted here. 
+- emitFrameOffset( +- MBB, MBBI, DL, scratchSPReg, AArch64::SP, +- StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup, +- false, NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, +- SVEStackSize + +- StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); +- } +- if (NeedsRealignment) { +- assert(MFI.getMaxAlign() > Align(1)); +- assert(scratchSPReg != AArch64::SP); +- +- // SUB X9, SP, NumBytes +- // -- X9 is temporary register, so shouldn't contain any live data here, +- // -- free to use. This is already produced by emitFrameOffset above. +- // AND SP, X9, 0b11111...0000 +- uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1); +- +- BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) +- .addReg(scratchSPReg, RegState::Kill) +- .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)); +- AFI->setStackRealigned(true); +- +- // No need for SEH instructions here; if we're realigning the stack, +- // we've set a frame pointer and already finished the SEH prologue. +- assert(!NeedsWinCFI); +- } ++ // Allocate space for the rest of the frame including SVE locals. Align the ++ // stack as necessary. ++ assert(!(canUseRedZone(MF) && NeedsRealignment) && ++ "Cannot use redzone with stack realignment"); ++ if (!canUseRedZone(MF)) { ++ // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have ++ // the correct value here, as NumBytes also includes padding bytes, ++ // which shouldn't be counted here. ++ allocateStackSpace(MBB, CalleeSavesEnd, NeedsRealignment, ++ SVELocalsSize + StackOffset::getFixed(NumBytes), ++ NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, ++ CFAOffset); + } + + // If we need a base pointer, set it up here. It's whatever the value of the +diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h +index 147b5c181be5..f3313f3b53ff 100644 +--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h ++++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h +@@ -150,6 +150,11 @@ private: + MachineBasicBlock::iterator MBBI) const; + void emitCalleeSavedSVERestores(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const; ++ void allocateStackSpace(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ bool NeedsRealignment, StackOffset AllocSize, ++ bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI, ++ StackOffset InitialOffset) const; + + /// Emit target zero call-used regs. 
+ void emitZeroCallUsedRegs(BitVector RegsToZero, +diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir +index 623c0f240be4..265c474fbc5d 100644 +--- a/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir ++++ b/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir +@@ -4,8 +4,8 @@ + name: hasBasepointer + # CHECK-LABEL: name: hasBasepointer + # CHECK: bb.0: +-# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 +-# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 ++# CHECK: $sp = frame-setup SUBXri $sp, 16, 0 ++# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 + # CHECK-NEXT: $x19 = ADDXri $sp, 0, 0 + # CHECK: STRXui $x0, $x19, 0 + tracksRegLiveness: true +diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-fixed-width-access.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-fixed-width-access.mir +index e367a380f8ba..35fd7ca77d5c 100644 +--- a/llvm/test/CodeGen/AArch64/framelayout-sve-fixed-width-access.mir ++++ b/llvm/test/CodeGen/AArch64/framelayout-sve-fixed-width-access.mir +@@ -7,9 +7,9 @@ + ; CHECK: // %bb.0: // %entry + ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill + ; CHECK-NEXT: mov x29, sp ++ ; CHECK-NEXT: sub sp, sp, #2064 + ; CHECK-NEXT: addvl sp, sp, #-32 + ; CHECK-NEXT: addvl sp, sp, #-28 +- ; CHECK-NEXT: sub sp, sp, #2064 + ; CHECK-NEXT: ldr x8, [sp, #2048] + ; CHECK-NEXT: addvl sp, sp, #31 + ; CHECK-NEXT: addvl sp, sp, #29 +diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir +index d54f67634d02..680f9c335c25 100644 +--- a/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir ++++ b/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir +@@ -4,9 +4,9 @@ + name: LateScavengingSlot + # CHECK-LABEL: name: LateScavengingSlot + # CHECK: bb.0: +-# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 +-# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 8, 12 ++# CHECK: $sp = frame-setup SUBXri $sp, 8, 12 + # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 ++# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 + # CHECK: STRXui killed $[[SCRATCH:x[0-9]+]], $sp, 0 + # CHECK-NEXT: $[[SCRATCH]] = ADDVL_XXI $fp, -1 + # CHECK-NEXT: STRXui $x0, killed $[[SCRATCH]], 0 +diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir +index 7c87587c6dc4..8b657c95bfc7 100644 +--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir ++++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir +@@ -60,10 +60,10 @@ + # CHECK-NEXT: $sp = frame-setup STRXpre killed $[[SCRATCH:[a-z0-9]+]], $sp, -16 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +-# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 +-# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 + # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +-# CHECK-NEXT: CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 ++# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 32 ++# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 ++# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 + + # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2 + # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 32 +@@ -77,7 +77,7 @@ + # ASM-LABEL: test_allocate_sve: + # ASM: 
.cfi_def_cfa_offset 16 + # ASM-NEXT: .cfi_offset w29, -16 +-# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ++# ASM: .cfi_def_cfa_offset 32 + # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 16 * VG + # ASM: .cfi_def_cfa wsp, 32 + # ASM: .cfi_def_cfa_offset 16 +@@ -87,7 +87,7 @@ + # + # UNWINDINFO: DW_CFA_def_cfa_offset: +16 + # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +-# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus ++# UNWINDINFO: DW_CFA_def_cfa_offset: +32 + # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus + # UNWINDINFO: DW_CFA_def_cfa: reg31 +32 + # UNWINDINFO: DW_CFA_def_cfa_offset: +16 +@@ -125,9 +125,9 @@ body: | + # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w20, -8 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w21, -16 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -32 +-# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 +-# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 + # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 ++# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 48 ++# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 + # + # CHECK-NEXT: $x20 = IMPLICIT_DEF +@@ -149,7 +149,7 @@ body: | + # ASM: .cfi_offset w20, -8 + # ASM-NEXT: .cfi_offset w21, -16 + # ASM-NEXT: .cfi_offset w29, -32 +-# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 16 * VG ++# ASM: .cfi_def_cfa_offset 48 + # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 16 * VG + # + # ASM: .cfi_def_cfa wsp, 48 +@@ -164,7 +164,7 @@ body: | + # UNWINDINFO: DW_CFA_offset: reg20 -8 + # UNWINDINFO-NEXT: DW_CFA_offset: reg21 -16 + # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -32 +-# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus ++# UNWINDINFO: DW_CFA_def_cfa_offset: +48 + # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +48, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus + # + # UNWINDINFO: DW_CFA_def_cfa: reg31 +48 +@@ -205,9 +205,9 @@ body: | + # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +-# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 + # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 +-# CHECK-NEXT: $sp = ANDXri killed $[[TMP]] ++# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2 ++# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]] + # CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 + # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 + # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 +@@ -267,9 +267,9 @@ body: | + # CHECK-NEXT: $sp = frame-setup STRXpre killed $[[SCRATCH:[a-z0-9]+]], $sp, -16 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 + # CHECK-NEXT: frame-setup 
CFI_INSTRUCTION offset $w29, -16 +-# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 +-# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 + # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 ++# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 32 ++# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 + + # CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 16 +@@ -292,7 +292,7 @@ body: | + # ASM-LABEL: test_address_sve: + # ASM: .cfi_def_cfa_offset 16 + # ASM-NEXT: .cfi_offset w29, -16 +-# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ++# ASM: .cfi_def_cfa_offset 32 + # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 24 * VG + # + # ASM: .cfi_def_cfa wsp, 32 +@@ -302,7 +302,7 @@ body: | + # + # UNWINDINFO: DW_CFA_def_cfa_offset: +16 + # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +-# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus ++# UNWINDINFO: DW_CFA_def_cfa_offset: +32 + # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus + # + # UNWINDINFO: DW_CFA_def_cfa: reg31 +32 +@@ -353,8 +353,8 @@ body: | + # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +-# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 + # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 ++# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 + + # CHECK-NEXT: STR_ZXI $z0, $fp, -1 + # CHECK-NEXT: STR_ZXI $z1, $fp, -2 +@@ -429,9 +429,9 @@ body: | + # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 + +-# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 +-# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 + # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 ++# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 32 ++# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 + # CHECK: $[[TMP:x[0-9]+]] = ADDVL_XXI $sp, 1 + # CHECK-NEXT: $x0 = LDRXui killed $[[TMP]], 4 +@@ -448,7 +448,7 @@ body: | + # ASM-LABEL: test_stack_arg_sve: + # ASM: .cfi_def_cfa_offset 16 + # ASM-NEXT: .cfi_offset w29, -16 +-# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ++# ASM: .cfi_def_cfa_offset 32 + # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG + # + # ASM: .cfi_def_cfa wsp, 32 +@@ -458,7 +458,7 @@ body: | + + # UNWINDINFO: DW_CFA_def_cfa_offset: +16 + # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +-# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus ++# UNWINDINFO: DW_CFA_def_cfa_offset: +32 + # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, 
DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus + # + # UNWINDINFO: DW_CFA_def_cfa: reg31 +32 +@@ -640,8 +640,8 @@ body: | + # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w19, -16 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -24 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -32 +-# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 + # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 ++# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 + # CHECK-NEXT: $x19 = ADDXri $sp, 0, 0 + # CHECK-NEXT: STRXui $xzr, $x19, 0 + # CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 +@@ -863,9 +863,9 @@ body: | + # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4d, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4e, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 +-# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 +-# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 + # CHECK: $sp = frame-setup SUBXri $sp, 32, 0 ++# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 ++# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 + + # CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 +@@ -916,7 +916,7 @@ body: | + # ASM-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 32 - 48 * VG + # ASM-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 32 - 56 * VG + # ASM-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 32 - 64 * VG +-# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG ++# ASM: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 64 + 144 * VG + # ASM: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 64 + 152 * VG + # + # ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG +@@ -950,7 +950,7 @@ body: | + # UNWINDINFO-NEXT: DW_CFA_expression: reg77 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -48, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus + # UNWINDINFO-NEXT: DW_CFA_expression: reg78 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -56, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus + # UNWINDINFO-NEXT: DW_CFA_expression: reg79 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -64, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +-# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +152, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus ++# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +64, DW_OP_plus, DW_OP_consts +144, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus + # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +64, DW_OP_plus, DW_OP_consts +152, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus + # + # UNWINDINFO: DW_CFA_def_cfa_expression: 
DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +152, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +@@ -1031,9 +1031,9 @@ body: | + # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 + # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 +-# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 + # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 +-# CHECK-NEXT: $sp = ANDXri killed $[[TMP]] ++# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -1 ++# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]] + + # CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18 + # CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4 +diff --git a/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir b/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir +index 1b9411d07f43..f6fc627ac2d3 100644 +--- a/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir ++++ b/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir +@@ -21,7 +21,7 @@ stack: + - { id: 1, size: 4, alignment: 4, local-offset: -68 } + + # CHECK: body: +-# CHECK: $sp = ANDXri killed ${{x[0-9]+}}, 7865 ++# CHECK: $sp = frame-setup ANDXri killed ${{x[0-9]+}}, 7865 + # CHECK: STRSui $s0, $sp, 0 + # CHECK: STRSui $s0, $fp, 7 + body: | +diff --git a/llvm/test/CodeGen/AArch64/stack-guard-sve.ll b/llvm/test/CodeGen/AArch64/stack-guard-sve.ll +index 1672a7eb8739..5acbb22bf1ab 100644 +--- a/llvm/test/CodeGen/AArch64/stack-guard-sve.ll ++++ b/llvm/test/CodeGen/AArch64/stack-guard-sve.ll +@@ -148,9 +148,9 @@ entry: + + ; CHECK-LABEL: local_stack_alloc: + ; CHECK: mov x29, sp +-; CHECK: addvl sp, sp, #-2 + ; CHECK: sub sp, sp, #16, lsl #12 + ; CHECK: sub sp, sp, #16 ++; CHECK: addvl sp, sp, #-2 + + ; Stack guard is placed below the SVE stack area (and above all fixed-width objects) + ; CHECK-DAG: add [[STACK_GUARD_SPILL_PART_LOC:x[0-9]+]], sp, #8, lsl #12 +@@ -198,9 +198,9 @@ entry: + + ; CHECK-LABEL: local_stack_alloc_strong: + ; CHECK: mov x29, sp +-; CHECK: addvl sp, sp, #-3 + ; CHECK: sub sp, sp, #16, lsl #12 + ; CHECK: sub sp, sp, #16 ++; CHECK: addvl sp, sp, #-3 + + ; Stack guard is placed at the top of the SVE stack area + ; CHECK-DAG: ldr [[STACK_GUARD:x[0-9]+]], [{{x[0-9]+}}, :lo12:__stack_chk_guard] +diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +index a97649523565..235364ac2321 100644 +--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll ++++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +@@ -56,8 +56,8 @@ define float @foo2(ptr %x0, ptr %x1) nounwind { + ; CHECK-LABEL: foo2: + ; CHECK: // %bb.0: // %entry + ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +-; CHECK-NEXT: addvl sp, sp, #-4 + ; CHECK-NEXT: sub sp, sp, #16 ++; CHECK-NEXT: addvl sp, sp, #-4 + ; CHECK-NEXT: ptrue p0.b + ; CHECK-NEXT: add x8, sp, #16 + ; CHECK-NEXT: ld4d { z1.d - z4.d }, p0/z, [x0] +@@ -699,8 +699,8 @@ define void @verify_all_operands_are_initialised() { + ; CHECK-LABEL: verify_all_operands_are_initialised: + ; CHECK: // %bb.0: + ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +-; CHECK-NEXT: addvl sp, sp, #-1 + ; CHECK-NEXT: sub sp, sp, #16 ++; CHECK-NEXT: addvl sp, sp, #-1 + ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG + ; CHECK-NEXT: .cfi_offset w30, -8 + ; CHECK-NEXT: .cfi_offset w29, -16 +diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll +index 31ff9287046c..b3529549c22b 100644 +--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll ++++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll +@@ -9,8 +9,8 @@ define void @fcvt_v4f64_v4f128(ptr %a, ptr %b) vscale_range(2,0) #0 { + ; CHECK: // %bb.0: + ; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill + ; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill +-; CHECK-NEXT: addvl sp, sp, #-2 + ; CHECK-NEXT: sub sp, sp, #48 ++; CHECK-NEXT: addvl sp, sp, #-2 + ; CHECK-NEXT: ptrue p0.d, vl4 + ; CHECK-NEXT: add x8, sp, #48 + ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +@@ -59,8 +59,8 @@ define void @fcvt_v4f128_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { + ; CHECK: // %bb.0: + ; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill + ; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill +-; CHECK-NEXT: addvl sp, sp, #-2 + ; CHECK-NEXT: sub sp, sp, #128 ++; CHECK-NEXT: addvl sp, sp, #-2 + ; CHECK-NEXT: ldr q1, [x0, #64] + ; CHECK-NEXT: mov x19, x1 + ; CHECK-NEXT: ldr q0, [x0, #80] +-- +2.42.0.windows.2 + diff --git a/0023-Backport-AArch64-Stack-probing-for-function-prologues.patch b/0023-Backport-AArch64-Stack-probing-for-function-prologues.patch new file mode 100644 index 0000000..ff40b08 --- /dev/null +++ b/0023-Backport-AArch64-Stack-probing-for-function-prologues.patch @@ -0,0 +1,2652 @@ +From 3a9ddc2f95926a75a9b436ad4dfd4070f535a113 Mon Sep 17 00:00:00 2001 +From: rickyleung +Date: Tue, 7 May 2024 21:25:52 +0800 +Subject: [PATCH 4/7] [backport][AArch64] Stack probing for function prologues + +Reference: https://github.com/llvm/llvm-project/commit/cc944f502f1ee20d73ff88c2c86cc909f12caadb + +This adds code to AArch64 function prologues to protect against stack +clash attacks by probing (writing to) the stack at regular enough +intervals to ensure that the guard page cannot be skipped over. + +The patch depends on and maintains the following invariants: + +Upon function entry the caller guarantees that it has probed the stack +(e.g. performed a store) at some address [sp, #N], where`0 <= N <= +1024`. This invariant comes from a requirement for compatibility with +GCC. Any address range in the allocated stack, no smaller than +stack-probe-size bytes contains at least one probe At any time the stack +pointer is above or in the guard page Probes are performed in +descreasing address order +The stack-probe-size is a function attribute that can be set by a +platform to correspond to the guard page size. + +By default, the stack probe size is 4KiB, which is a safe default as +this is the smallest possible page size for AArch64. Linux uses a 64KiB +guard for AArch64, so this can be overridden by the stack-probe-size +function attribute. + +For small frames without a frame pointer (<= 240 bytes), no probes are +needed. + +For larger frame sizes, LLVM always stores x29 to the stack. This serves +as an implicit stack probe. Thus, while allocating stack objects the +compiler assumes that the stack has been probed at [sp]. 
+ +There are multiple probing sequences that can be emitted, depending on +the size of the stack allocation: + +A straight-line sequence of subtracts and stores, used when the +allocation size is smaller than 5 guard pages. A loop allocating and +probing one page size per iteration, plus at most a single probe to deal +with the remainder, used when the allocation size is larger but still +known at compile time. A loop which moves the SP down to the target +value held in a register (or a loop, moving a scratch register to the +target value help in SP), used when the allocation size is not known at +compile-time, such as when allocating space for SVE values, or when +over-aligning the stack. This is emitted in AArch64InstrInfo because it +will also be used for dynamic allocas in a future patch. A single probe +where the amount of stack adjustment is unknown, but is known to be less +than or equal to a page size. + +--------- + +Co-authored-by: Oliver Stannard +--- + .../Target/AArch64/AArch64FrameLowering.cpp | 335 +++++++- + .../lib/Target/AArch64/AArch64FrameLowering.h | 17 +- + .../Target/AArch64/AArch64ISelLowering.cpp | 6 + + llvm/lib/Target/AArch64/AArch64ISelLowering.h | 10 + + llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 90 +++ + llvm/lib/Target/AArch64/AArch64InstrInfo.h | 6 + + llvm/lib/Target/AArch64/AArch64InstrInfo.td | 24 +- + .../AArch64/AArch64MachineFunctionInfo.cpp | 43 +- + .../AArch64/AArch64MachineFunctionInfo.h | 6 + + .../test/CodeGen/AArch64/stack-probing-64k.ll | 392 ++++++++++ + .../AArch64/stack-probing-last-in-block.mir | 146 ++++ + .../test/CodeGen/AArch64/stack-probing-sve.ll | 724 ++++++++++++++++++ + llvm/test/CodeGen/AArch64/stack-probing.ll | 539 +++++++++++++ + 13 files changed, 2300 insertions(+), 38 deletions(-) + create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-64k.ll + create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir + create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-sve.ll + create mode 100644 llvm/test/CodeGen/AArch64/stack-probing.ll + +diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +index eeb6185fa36d..af019ab23770 100644 +--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp ++++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +@@ -672,10 +672,18 @@ void AArch64FrameLowering::emitCalleeSavedSVERestores( + emitCalleeSavedRestores(MBB, MBBI, true); + } + ++// Return the maximum possible number of bytes for `Size` due to the ++// architectural limit on the size of a SVE register. ++static int64_t upperBound(StackOffset Size) { ++ static const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16; ++ return Size.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE + Size.getFixed(); ++} ++ + void AArch64FrameLowering::allocateStackSpace( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- bool NeedsRealignment, StackOffset AllocSize, bool NeedsWinCFI, +- bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset) const { ++ int64_t RealignmentPadding, StackOffset AllocSize, bool NeedsWinCFI, ++ bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset, ++ bool FollowupAllocs) const { + + if (!AllocSize) + return; +@@ -687,27 +695,129 @@ void AArch64FrameLowering::allocateStackSpace( + AArch64FunctionInfo &AFI = *MF.getInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + +- Register TargetReg = +- NeedsRealignment ? 
findScratchNonCalleeSaveRegister(&MBB) : AArch64::SP; +- // SUB Xd/SP, SP, AllocSize ++ const int64_t MaxAlign = MFI.getMaxAlign().value(); ++ const uint64_t AndMask = ~(MaxAlign - 1); ++ ++ if (!Subtarget.getTargetLowering()->hasInlineStackProbe(MF)) { ++ Register TargetReg = RealignmentPadding ++ ? findScratchNonCalleeSaveRegister(&MBB) ++ : AArch64::SP; ++ // SUB Xd/SP, SP, AllocSize ++ emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII, ++ MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI, ++ EmitCFI, InitialOffset); ++ ++ if (RealignmentPadding) { ++ // AND SP, X9, 0b11111...0000 ++ BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP) ++ .addReg(TargetReg, RegState::Kill) ++ .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) ++ .setMIFlags(MachineInstr::FrameSetup); ++ AFI.setStackRealigned(true); ++ ++ // No need for SEH instructions here; if we're realigning the stack, ++ // we've set a frame pointer and already finished the SEH prologue. ++ assert(!NeedsWinCFI); ++ } ++ return; ++ } ++ ++ // ++ // Stack probing allocation. ++ // ++ ++ // Fixed length allocation. If we don't need to re-align the stack and don't ++ // have SVE objects, we can use a more efficient sequence for stack probing. ++ if (AllocSize.getScalable() == 0 && RealignmentPadding == 0) { ++ Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB); ++ assert(ScratchReg != AArch64::NoRegister); ++ BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC)) ++ .addDef(ScratchReg) ++ .addImm(AllocSize.getFixed()) ++ .addImm(InitialOffset.getFixed()) ++ .addImm(InitialOffset.getScalable()); ++ // The fixed allocation may leave unprobed bytes at the top of the ++ // stack. If we have subsequent alocation (e.g. if we have variable-sized ++ // objects), we need to issue an extra probe, so these allocations start in ++ // a known state. ++ if (FollowupAllocs) { ++ // STR XZR, [SP] ++ BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui)) ++ .addReg(AArch64::XZR) ++ .addReg(AArch64::SP) ++ .addImm(0) ++ .setMIFlags(MachineInstr::FrameSetup); ++ } ++ ++ return; ++ } ++ ++ // Variable length allocation. ++ ++ // If the (unknown) allocation size cannot exceed the probe size, decrement ++ // the stack pointer right away. ++ int64_t ProbeSize = AFI.getStackProbeSize(); ++ if (upperBound(AllocSize) + RealignmentPadding <= ProbeSize) { ++ Register ScratchReg = RealignmentPadding ++ ? findScratchNonCalleeSaveRegister(&MBB) ++ : AArch64::SP; ++ assert(ScratchReg != AArch64::NoRegister); ++ // SUB Xd, SP, AllocSize ++ emitFrameOffset(MBB, MBBI, DL, ScratchReg, AArch64::SP, -AllocSize, &TII, ++ MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI, ++ EmitCFI, InitialOffset); ++ if (RealignmentPadding) { ++ // AND SP, Xn, 0b11111...0000 ++ BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP) ++ .addReg(ScratchReg, RegState::Kill) ++ .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) ++ .setMIFlags(MachineInstr::FrameSetup); ++ AFI.setStackRealigned(true); ++ } ++ if (FollowupAllocs || upperBound(AllocSize) + RealignmentPadding > ++ AArch64::StackProbeMaxUnprobedStack) { ++ // STR XZR, [SP] ++ BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui)) ++ .addReg(AArch64::XZR) ++ .addReg(AArch64::SP) ++ .addImm(0) ++ .setMIFlags(MachineInstr::FrameSetup); ++ } ++ return; ++ } ++ ++ // Emit a variable-length allocation probing loop. ++ // TODO: As an optimisation, the loop can be "unrolled" into a few parts, ++ // each of them guaranteed to adjust the stack by less than the probe size. 
++ Register TargetReg = findScratchNonCalleeSaveRegister(&MBB); ++ assert(TargetReg != AArch64::NoRegister); ++ // SUB Xd, SP, AllocSize + emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII, + MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI, + EmitCFI, InitialOffset); + +- if (NeedsRealignment) { +- const int64_t MaxAlign = MFI.getMaxAlign().value(); +- const uint64_t AndMask = ~(MaxAlign - 1); +- // AND SP, Xd, 0b11111...0000 +- BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP) ++ if (RealignmentPadding) { ++ // AND Xn, Xn, 0b11111...0000 ++ BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), TargetReg) + .addReg(TargetReg, RegState::Kill) + .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) + .setMIFlags(MachineInstr::FrameSetup); +- AFI.setStackRealigned(true); ++ } + +- // No need for SEH instructions here; if we're realigning the stack, +- // we've set a frame pointer and already finished the SEH prologue. +- assert(!NeedsWinCFI); ++ BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR)) ++ .addReg(TargetReg); ++ if (EmitCFI) { ++ // Set the CFA register back to SP. ++ unsigned Reg = ++ Subtarget.getRegisterInfo()->getDwarfRegNum(AArch64::SP, true); ++ unsigned CFIIndex = ++ MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); ++ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) ++ .addCFIIndex(CFIIndex) ++ .setMIFlags(MachineInstr::FrameSetup); + } ++ if (RealignmentPadding) ++ AFI.setStackRealigned(true); + } + + static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) { +@@ -893,9 +1003,11 @@ bool AArch64FrameLowering::canUseAsPrologue( + MachineBasicBlock *TmpMBB = const_cast(&MBB); + const AArch64Subtarget &Subtarget = MF->getSubtarget(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); ++ const AArch64TargetLowering *TLI = Subtarget.getTargetLowering(); + +- // Don't need a scratch register if we're not going to re-align the stack. +- if (!RegInfo->hasStackRealignment(*MF)) ++ // Don't need a scratch register if we're not going to re-align the stack or ++ // emit stack probes. ++ if (!RegInfo->hasStackRealignment(*MF) && TLI->hasInlineStackProbe(*MF)) + return true; + // Otherwise, we can use any block as long as it has a scratch register + // available. +@@ -905,15 +1017,11 @@ bool AArch64FrameLowering::canUseAsPrologue( + static bool windowsRequiresStackProbe(MachineFunction &MF, + uint64_t StackSizeInBytes) { + const AArch64Subtarget &Subtarget = MF.getSubtarget(); +- if (!Subtarget.isTargetWindows()) +- return false; +- const Function &F = MF.getFunction(); ++ const AArch64FunctionInfo &MFI = *MF.getInfo(); + // TODO: When implementing stack protectors, take that into account + // for the probe threshold. +- unsigned StackProbeSize = +- F.getFnAttributeAsParsedInteger("stack-probe-size", 4096); +- return (StackSizeInBytes >= StackProbeSize) && +- !F.hasFnAttribute("no-stack-arg-probe"); ++ return Subtarget.isTargetWindows() && MFI.hasStackProbing() && ++ StackSizeInBytes >= uint64_t(MFI.getStackProbeSize()); + } + + static bool needsWinCFI(const MachineFunction &MF) { +@@ -1678,7 +1786,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, + // Alignment is required for the parent frame, not the funclet + const bool NeedsRealignment = + NumBytes && !IsFunclet && RegInfo->hasStackRealignment(MF); +- int64_t RealignmentPadding = ++ const int64_t RealignmentPadding = + (NeedsRealignment && MFI.getMaxAlign() > Align(16)) + ? 
MFI.getMaxAlign().value() - 16 + : 0; +@@ -1814,6 +1922,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, + // Process the SVE callee-saves to determine what space needs to be + // allocated. + if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { ++ LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize ++ << "\n"); + // Find callee save instructions in frame. + CalleeSavesBegin = MBBI; + assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); +@@ -1828,8 +1938,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, + // Allocate space for the callee saves (if any). + StackOffset CFAOffset = + StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); +- allocateStackSpace(MBB, CalleeSavesBegin, false, SVECalleeSavesSize, false, +- nullptr, EmitAsyncCFI && !HasFP, CFAOffset); ++ StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes); ++ allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false, ++ nullptr, EmitAsyncCFI && !HasFP, CFAOffset, ++ MFI.hasVarSizedObjects() || LocalsSize); + CFAOffset += SVECalleeSavesSize; + + if (EmitAsyncCFI) +@@ -1843,10 +1955,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, + // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have + // the correct value here, as NumBytes also includes padding bytes, + // which shouldn't be counted here. +- allocateStackSpace(MBB, CalleeSavesEnd, NeedsRealignment, ++ allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding, + SVELocalsSize + StackOffset::getFixed(NumBytes), + NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, +- CFAOffset); ++ CFAOffset, MFI.hasVarSizedObjects()); + } + + // If we need a base pointer, set it up here. It's whatever the value of the +@@ -4028,3 +4140,170 @@ void AArch64FrameLowering::orderFrameObjects( + dbgs() << "\n"; + }); + } ++ ++/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at ++/// least every ProbeSize bytes. Returns an iterator of the first instruction ++/// after the loop. The difference between SP and TargetReg must be an exact ++/// multiple of ProbeSize. ++MachineBasicBlock::iterator ++AArch64FrameLowering::inlineStackProbeLoopExactMultiple( ++ MachineBasicBlock::iterator MBBI, int64_t ProbeSize, ++ Register TargetReg) const { ++ MachineBasicBlock &MBB = *MBBI->getParent(); ++ MachineFunction &MF = *MBB.getParent(); ++ const AArch64InstrInfo *TII = ++ MF.getSubtarget().getInstrInfo(); ++ DebugLoc DL = MBB.findDebugLoc(MBBI); ++ ++ MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); ++ MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); ++ MF.insert(MBBInsertPoint, LoopMBB); ++ MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); ++ MF.insert(MBBInsertPoint, ExitMBB); ++ ++ // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable ++ // in SUB). 
++ emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP, ++ StackOffset::getFixed(-ProbeSize), TII, ++ MachineInstr::FrameSetup); ++ // STR XZR, [SP] ++ BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui)) ++ .addReg(AArch64::XZR) ++ .addReg(AArch64::SP) ++ .addImm(0) ++ .setMIFlags(MachineInstr::FrameSetup); ++ // CMP SP, TargetReg ++ BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64), ++ AArch64::XZR) ++ .addReg(AArch64::SP) ++ .addReg(TargetReg) ++ .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) ++ .setMIFlags(MachineInstr::FrameSetup); ++ // B.CC Loop ++ BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc)) ++ .addImm(AArch64CC::NE) ++ .addMBB(LoopMBB) ++ .setMIFlags(MachineInstr::FrameSetup); ++ ++ LoopMBB->addSuccessor(ExitMBB); ++ LoopMBB->addSuccessor(LoopMBB); ++ // Synthesize the exit MBB. ++ ExitMBB->splice(ExitMBB->end(), &MBB, MBBI, MBB.end()); ++ ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); ++ MBB.addSuccessor(LoopMBB); ++ // Update liveins. ++ recomputeLiveIns(*LoopMBB); ++ recomputeLiveIns(*ExitMBB); ++ ++ return ExitMBB->begin(); ++} ++ ++void AArch64FrameLowering::inlineStackProbeFixed( ++ MachineBasicBlock::iterator MBBI, Register ScratchReg, int64_t FrameSize, ++ StackOffset CFAOffset) const { ++ MachineBasicBlock *MBB = MBBI->getParent(); ++ MachineFunction &MF = *MBB->getParent(); ++ const AArch64InstrInfo *TII = ++ MF.getSubtarget().getInstrInfo(); ++ AArch64FunctionInfo *AFI = MF.getInfo(); ++ bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF); ++ bool HasFP = hasFP(MF); ++ ++ DebugLoc DL; ++ int64_t ProbeSize = MF.getInfo()->getStackProbeSize(); ++ int64_t NumBlocks = FrameSize / ProbeSize; ++ int64_t ResidualSize = FrameSize % ProbeSize; ++ ++ LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, " ++ << NumBlocks << " blocks of " << ProbeSize ++ << " bytes, plus " << ResidualSize << " bytes\n"); ++ ++ // Decrement SP by NumBlock * ProbeSize bytes, with either unrolled or ++ // ordinary loop. ++ if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) { ++ for (int i = 0; i < NumBlocks; ++i) { ++ // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not ++ // encodable in a SUB). ++ emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP, ++ StackOffset::getFixed(-ProbeSize), TII, ++ MachineInstr::FrameSetup, false, false, nullptr, ++ EmitAsyncCFI && !HasFP, CFAOffset); ++ CFAOffset += StackOffset::getFixed(ProbeSize); ++ // STR XZR, [SP] ++ BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui)) ++ .addReg(AArch64::XZR) ++ .addReg(AArch64::SP) ++ .addImm(0) ++ .setMIFlags(MachineInstr::FrameSetup); ++ } ++ } else if (NumBlocks != 0) { ++ // SUB ScratchReg, SP, #FrameSize (or equivalent if FrameSize is not ++ // encodable in ADD). ScrathReg may temporarily become the CFA register. ++ emitFrameOffset(*MBB, MBBI, DL, ScratchReg, AArch64::SP, ++ StackOffset::getFixed(-ProbeSize * NumBlocks), TII, ++ MachineInstr::FrameSetup, false, false, nullptr, ++ EmitAsyncCFI && !HasFP, CFAOffset); ++ CFAOffset += StackOffset::getFixed(ProbeSize * NumBlocks); ++ MBBI = inlineStackProbeLoopExactMultiple(MBBI, ProbeSize, ScratchReg); ++ MBB = MBBI->getParent(); ++ if (EmitAsyncCFI && !HasFP) { ++ // Set the CFA register back to SP. 
++ const AArch64RegisterInfo &RegInfo = ++ *MF.getSubtarget().getRegisterInfo(); ++ unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true); ++ unsigned CFIIndex = ++ MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); ++ BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) ++ .addCFIIndex(CFIIndex) ++ .setMIFlags(MachineInstr::FrameSetup); ++ } ++ } ++ ++ if (ResidualSize != 0) { ++ // SUB SP, SP, #ResidualSize (or equivalent if ResidualSize is not encodable ++ // in SUB). ++ emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP, ++ StackOffset::getFixed(-ResidualSize), TII, ++ MachineInstr::FrameSetup, false, false, nullptr, ++ EmitAsyncCFI && !HasFP, CFAOffset); ++ if (ResidualSize > AArch64::StackProbeMaxUnprobedStack) { ++ // STR XZR, [SP] ++ BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui)) ++ .addReg(AArch64::XZR) ++ .addReg(AArch64::SP) ++ .addImm(0) ++ .setMIFlags(MachineInstr::FrameSetup); ++ } ++ } ++} ++ ++void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF, ++ MachineBasicBlock &MBB) const { ++ // Get the instructions that need to be replaced. We emit at most two of ++ // these. Remember them in order to avoid complications coming from the need ++ // to traverse the block while potentially creating more blocks. ++ SmallVector ToReplace; ++ for (MachineInstr &MI : MBB) ++ if (MI.getOpcode() == AArch64::PROBED_STACKALLOC || ++ MI.getOpcode() == AArch64::PROBED_STACKALLOC_VAR) ++ ToReplace.push_back(&MI); ++ ++ for (MachineInstr *MI : ToReplace) { ++ if (MI->getOpcode() == AArch64::PROBED_STACKALLOC) { ++ Register ScratchReg = MI->getOperand(0).getReg(); ++ int64_t FrameSize = MI->getOperand(1).getImm(); ++ StackOffset CFAOffset = StackOffset::get(MI->getOperand(2).getImm(), ++ MI->getOperand(3).getImm()); ++ inlineStackProbeFixed(MI->getIterator(), ScratchReg, FrameSize, ++ CFAOffset); ++ } else { ++ assert(MI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR && ++ "Stack probe pseudo-instruction expected"); ++ const AArch64InstrInfo *TII = ++ MI->getMF()->getSubtarget().getInstrInfo(); ++ Register TargetReg = MI->getOperand(0).getReg(); ++ (void)TII->probedStackAlloc(MI->getIterator(), TargetReg, true); ++ } ++ MI->eraseFromParent(); ++ } ++} +\ No newline at end of file +diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h +index f3313f3b53ff..941af03a78b7 100644 +--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h ++++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h +@@ -152,13 +152,26 @@ private: + MachineBasicBlock::iterator MBBI) const; + void allocateStackSpace(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, +- bool NeedsRealignment, StackOffset AllocSize, ++ int64_t RealignmentPadding, StackOffset AllocSize, + bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI, +- StackOffset InitialOffset) const; ++ StackOffset InitialOffset, bool FollowupAllocs) const; + + /// Emit target zero call-used regs. 
+ void emitZeroCallUsedRegs(BitVector RegsToZero, + MachineBasicBlock &MBB) const override; ++ ++ /// Replace a StackProbe stub (if any) with the actual probe code inline ++ void inlineStackProbe(MachineFunction &MF, ++ MachineBasicBlock &PrologueMBB) const override; ++ ++ void inlineStackProbeFixed(MachineBasicBlock::iterator MBBI, ++ Register ScratchReg, int64_t FrameSize, ++ StackOffset CFAOffset) const; ++ ++ MachineBasicBlock::iterator ++ inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI, ++ int64_t NegProbeSize, ++ Register TargetReg) const; + }; + + } // End llvm namespace +diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +index 6e721b937846..082043420fb9 100644 +--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp ++++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +@@ -26051,3 +26051,9 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const { + } + return true; + } ++ ++bool AArch64TargetLowering::hasInlineStackProbe( ++ const MachineFunction &MF) const { ++ return !Subtarget->isTargetWindows() && ++ MF.getInfo()->hasStackProbing(); ++} +\ No newline at end of file +diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h +index aca45f113e73..643d363e234a 100644 +--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h ++++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h +@@ -508,6 +508,13 @@ const unsigned RoundingBitsPos = 22; + const ArrayRef getGPRArgRegs(); + const ArrayRef getFPRArgRegs(); + ++/// Maximum allowed number of unprobed bytes above SP at an ABI ++/// boundary. ++const unsigned StackProbeMaxUnprobedStack = 1024; ++ ++/// Maximum number of iterations to unroll for a constant size probing loop. ++const unsigned StackProbeMaxLoopUnroll = 4; ++ + } // namespace AArch64 + + class AArch64Subtarget; +@@ -942,6 +949,9 @@ public: + // used for 64bit and 128bit vectors as well. + bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const; + ++ /// True if stack clash protection is enabled for this functions. ++ bool hasInlineStackProbe(const MachineFunction &MF) const override; ++ + private: + /// Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when generating code for different targets. 
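
As a rough illustration (not part of the backported patch itself), the two constants introduced above — StackProbeMaxUnprobedStack = 1024 and StackProbeMaxLoopUnroll = 4 — are what drive the fixed-size probing strategy implemented by inlineStackProbeFixed earlier in this patch: a constant frame is split into NumBlocks = FrameSize / ProbeSize full probe-sized blocks plus a residual of FrameSize % ProbeSize bytes; up to four blocks are emitted as an unrolled "SUB sp / STR xzr, [sp]" sequence, more than four fall back to the probing loop, and the residual only gets its own trailing probe when it could leave more than 1024 unprobed bytes above SP. The standalone C++ sketch below (the helper name describeProbePlan is invented for illustration and does not exist in LLVM) reproduces just that decision logic:

    // Minimal sketch of the probing plan for a constant-sized frame,
    // mirroring the NumBlocks / ResidualSize split in inlineStackProbeFixed.
    #include <cstdint>
    #include <cstdio>

    namespace {
    constexpr unsigned StackProbeMaxUnprobedStack = 1024; // max bytes left unprobed above SP
    constexpr unsigned StackProbeMaxLoopUnroll = 4;        // max unrolled probe blocks

    void describeProbePlan(int64_t FrameSize, int64_t ProbeSize) {
      const int64_t NumBlocks = FrameSize / ProbeSize;     // full probe-sized blocks
      const int64_t ResidualSize = FrameSize % ProbeSize;  // leftover allocation

      if (NumBlocks <= StackProbeMaxLoopUnroll)
        std::printf("%lld block(s) of %lld bytes: unrolled SUB sp / STR xzr, [sp]\n",
                    (long long)NumBlocks, (long long)ProbeSize);
      else
        std::printf("%lld blocks of %lld bytes: probing loop via a scratch register\n",
                    (long long)NumBlocks, (long long)ProbeSize);

      // The residual allocation only needs its own probe if it could leave
      // more than StackProbeMaxUnprobedStack bytes unprobed above SP.
      std::printf("residual %lld bytes: %s\n", (long long)ResidualSize,
                  ResidualSize > StackProbeMaxUnprobedStack ? "extra probe"
                                                            : "no extra probe");
    }
    } // namespace

    int main() {
      // Frame sizes loosely mirroring cases from stack-probing-64k.ll
      // (64 KiB probe size): one block plus a residual needing a probe,
      // and five blocks, which is large enough to require the loop.
      describeProbePlan(66576, 65536);
      describeProbePlan(327680, 65536);
    }

The two sample frame sizes correspond roughly to the static_66576 and static_327680 functions in the stack-probing-64k.ll test added later in this patch.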
+diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +index 0691e07a639b..b3b42a97e8c9 100644 +--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp ++++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +@@ -11,6 +11,7 @@ + //===----------------------------------------------------------------------===// + + #include "AArch64InstrInfo.h" ++#include "AArch64ExpandImm.h" + #include "AArch64MachineFunctionInfo.h" + #include "AArch64Subtarget.h" + #include "MCTargetDesc/AArch64AddressingModes.h" +@@ -18,6 +19,7 @@ + #include "llvm/ADT/ArrayRef.h" + #include "llvm/ADT/STLExtras.h" + #include "llvm/ADT/SmallVector.h" ++#include "llvm/CodeGen/LivePhysRegs.h" + #include "llvm/CodeGen/MachineBasicBlock.h" + #include "llvm/CodeGen/MachineCombinerPattern.h" + #include "llvm/CodeGen/MachineFrameInfo.h" +@@ -8428,6 +8430,94 @@ unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) { + return AArch64::BLR; + } + ++MachineBasicBlock::iterator ++AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI, ++ Register TargetReg, bool FrameSetup) const { ++ assert(TargetReg != AArch64::SP && "New top of stack cannot aleady be in SP"); ++ ++ MachineBasicBlock &MBB = *MBBI->getParent(); ++ MachineFunction &MF = *MBB.getParent(); ++ const AArch64InstrInfo *TII = ++ MF.getSubtarget().getInstrInfo(); ++ int64_t ProbeSize = MF.getInfo()->getStackProbeSize(); ++ DebugLoc DL = MBB.findDebugLoc(MBBI); ++ ++ MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); ++ MachineBasicBlock *LoopTestMBB = ++ MF.CreateMachineBasicBlock(MBB.getBasicBlock()); ++ MF.insert(MBBInsertPoint, LoopTestMBB); ++ MachineBasicBlock *LoopBodyMBB = ++ MF.CreateMachineBasicBlock(MBB.getBasicBlock()); ++ MF.insert(MBBInsertPoint, LoopBodyMBB); ++ MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); ++ MF.insert(MBBInsertPoint, ExitMBB); ++ MachineInstr::MIFlag Flags = ++ FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags; ++ ++ // LoopTest: ++ // SUB SP, SP, #ProbeSize ++ emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP, ++ AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags); ++ ++ // CMP SP, TargetReg ++ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64), ++ AArch64::XZR) ++ .addReg(AArch64::SP) ++ .addReg(TargetReg) ++ .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) ++ .setMIFlags(Flags); ++ ++ // B. 
LoopExit ++ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc)) ++ .addImm(AArch64CC::LE) ++ .addMBB(ExitMBB) ++ .setMIFlags(Flags); ++ ++ // STR XZR, [SP] ++ BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui)) ++ .addReg(AArch64::XZR) ++ .addReg(AArch64::SP) ++ .addImm(0) ++ .setMIFlags(Flags); ++ ++ // B loop ++ BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B)) ++ .addMBB(LoopTestMBB) ++ .setMIFlags(Flags); ++ ++ // LoopExit: ++ // MOV SP, TargetReg ++ BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP) ++ .addReg(TargetReg) ++ .addImm(0) ++ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) ++ .setMIFlags(Flags); ++ ++ // STR XZR, [SP] ++ BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::STRXui)) ++ .addReg(AArch64::XZR) ++ .addReg(AArch64::SP) ++ .addImm(0) ++ .setMIFlags(Flags); ++ ++ ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end()); ++ ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); ++ ++ LoopTestMBB->addSuccessor(ExitMBB); ++ LoopTestMBB->addSuccessor(LoopBodyMBB); ++ LoopBodyMBB->addSuccessor(LoopTestMBB); ++ MBB.addSuccessor(LoopTestMBB); ++ ++ // Update liveins. ++ if (MF.getRegInfo().reservedRegsFrozen()) { ++ recomputeLiveIns(*LoopTestMBB); ++ recomputeLiveIns(*LoopBodyMBB); ++ recomputeLiveIns(*ExitMBB); ++ } ++ ++ return ExitMBB->begin(); ++} ++ + #define GET_INSTRINFO_HELPERS + #define GET_INSTRMAP_INFO + #include "AArch64GenInstrInfo.inc" +diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h +index 20210a96d67a..7e84b86fc52c 100644 +--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h ++++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h +@@ -340,6 +340,12 @@ public: + static void decomposeStackOffsetForDwarfOffsets(const StackOffset &Offset, + int64_t &ByteSized, + int64_t &VGSized); ++ // Decrement the SP, issuing probes along the way. `TargetReg` is the new top ++ // of the stack. `FrameSetup` is passed as true, if the allocation is a part ++ // of constructing the activation frame of a function. ++ MachineBasicBlock::iterator probedStackAlloc(MachineBasicBlock::iterator MBBI, ++ Register TargetReg, ++ bool FrameSetup) const; + #define GET_INSTRINFO_HELPER_DECLS + #include "AArch64GenInstrInfo.inc" + +diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td +index 9e72d37880c5..09980c2f45e6 100644 +--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td ++++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td +@@ -880,7 +880,8 @@ include "SMEInstrFormats.td" + // Miscellaneous instructions. + //===----------------------------------------------------------------------===// + +-let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { ++let hasSideEffects = 1, isCodeGenOnly = 1 in { ++let Defs = [SP], Uses = [SP] in { + // We set Sched to empty list because we expect these instructions to simply get + // removed in most cases. + def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), +@@ -889,7 +890,26 @@ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(AArch64callseq_end timm:$amt1, timm:$amt2)]>, + Sched<[]>; +-} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 ++} ++ ++let Defs = [SP, NZCV], Uses = [SP] in { ++// Probed stack allocation of a constant size, used in function prologues when ++// stack-clash protection is enabled. 
++def PROBED_STACKALLOC : Pseudo<(outs GPR64:$scratch), ++ (ins i64imm:$stacksize, i64imm:$fixed_offset, ++ i64imm:$scalable_offset), ++ []>, ++ Sched<[]>; ++ ++// Probed stack allocation of a variable size, used in function prologues when ++// stack-clash protection is enabled. ++def PROBED_STACKALLOC_VAR : Pseudo<(outs), ++ (ins GPR64sp:$target), ++ []>, ++ Sched<[]>; ++ ++} // Defs = [SP, NZCV], Uses = [SP] in ++} // hasSideEffects = 1, isCodeGenOnly = 1 + + let isReMaterializable = 1, isCodeGenOnly = 1 in { + // FIXME: The following pseudo instructions are only needed because remat +diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +index 961a19317d66..0bef3c2d2483 100644 +--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp ++++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +@@ -97,14 +97,45 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F, + if (const auto *BTE = mdconst::extract_or_null( + F.getParent()->getModuleFlag("branch-target-enforcement"))) + BranchTargetEnforcement = BTE->getZExtValue(); +- return; ++ } else { ++ const StringRef BTIEnable = ++ F.getFnAttribute("branch-target-enforcement").getValueAsString(); ++ assert(BTIEnable.equals_insensitive("true") || ++ BTIEnable.equals_insensitive("false")); ++ BranchTargetEnforcement = BTIEnable.equals_insensitive("true"); + } + +- const StringRef BTIEnable = +- F.getFnAttribute("branch-target-enforcement").getValueAsString(); +- assert(BTIEnable.equals_insensitive("true") || +- BTIEnable.equals_insensitive("false")); +- BranchTargetEnforcement = BTIEnable.equals_insensitive("true"); ++ // The default stack probe size is 4096 if the function has no ++ // stack-probe-size attribute. This is a safe default because it is the ++ // smallest possible guard page size. ++ uint64_t ProbeSize = 4096; ++ if (F.hasFnAttribute("stack-probe-size")) ++ ProbeSize = F.getFnAttributeAsParsedInteger("stack-probe-size"); ++ else if (const auto *PS = mdconst::extract_or_null( ++ F.getParent()->getModuleFlag("stack-probe-size"))) ++ ProbeSize = PS->getZExtValue(); ++ assert(int64_t(ProbeSize) > 0 && "Invalid stack probe size"); ++ ++ if (STI->isTargetWindows()) { ++ if (!F.hasFnAttribute("no-stack-arg-probe")) ++ StackProbeSize = ProbeSize; ++ } else { ++ // Round down to the stack alignment. ++ uint64_t StackAlign = ++ STI->getFrameLowering()->getTransientStackAlign().value(); ++ ProbeSize = std::max(StackAlign, ProbeSize & ~(StackAlign - 1U)); ++ StringRef ProbeKind; ++ if (F.hasFnAttribute("probe-stack")) ++ ProbeKind = F.getFnAttribute("probe-stack").getValueAsString(); ++ else if (const auto *PS = dyn_cast_or_null( ++ F.getParent()->getModuleFlag("probe-stack"))) ++ ProbeKind = PS->getString(); ++ if (ProbeKind.size()) { ++ if (ProbeKind != "inline-asm") ++ report_fatal_error("Unsupported stack probing method"); ++ StackProbeSize = ProbeSize; ++ } ++ } + } + + MachineFunctionInfo *AArch64FunctionInfo::clone( +diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +index d82fb436925e..d50011594eb1 100644 +--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h ++++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +@@ -192,6 +192,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { + /// True if the function need asynchronous unwind information. 
+ mutable std::optional NeedsAsyncDwarfUnwindInfo; + ++ int64_t StackProbeSize = 0; ++ + public: + AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI); + +@@ -447,6 +449,10 @@ public: + bool needsDwarfUnwindInfo(const MachineFunction &MF) const; + bool needsAsyncDwarfUnwindInfo(const MachineFunction &MF) const; + ++ bool hasStackProbing() const { return StackProbeSize != 0; } ++ ++ int64_t getStackProbeSize() const { return StackProbeSize; } ++ + private: + // Hold the lists of LOHs. + MILOHContainer LOHContainerSet; +diff --git a/llvm/test/CodeGen/AArch64/stack-probing-64k.ll b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll +new file mode 100644 +index 000000000000..0a3198fc520e +--- /dev/null ++++ b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll +@@ -0,0 +1,392 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s ++; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s ++ ++; Tests for prolog sequences for stack probing, when using a 64KiB stack guard. ++ ++; 64k bytes is the largest frame we can probe in one go. ++define void @static_65536(ptr %out) #0 { ++; CHECK-LABEL: static_65536: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: .cfi_def_cfa_offset 65552 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 65536, align 1 ++ store i8* %v, ptr %out, align 8 ++ ret void ++} ++ ++; 64k+16 bytes, still needs just one probe. ++define void @static_65552(ptr %out) #0 { ++; CHECK-LABEL: static_65552: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: .cfi_def_cfa_offset 65552 ++; CHECK-NEXT: str xzr, [sp], #-16 ++; CHECK-NEXT: .cfi_def_cfa_offset 65568 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: .cfi_def_cfa_offset 32 ++; CHECK-NEXT: add sp, sp, #16 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 65552, align 1 ++ store i8* %v, ptr %out, align 8 ++ ret void ++} ++ ++; 64k+1024 bytes, the largest frame which needs just one probe. ++define void @static_66560(ptr %out) #0 { ++; CHECK-LABEL: static_66560: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: .cfi_def_cfa_offset 65552 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #1024 ++; CHECK-NEXT: .cfi_def_cfa_offset 66576 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: .cfi_def_cfa_offset 1040 ++; CHECK-NEXT: add sp, sp, #1024 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 66560, align 1 ++ store i8* %v, ptr %out, align 8 ++ ret void ++} ++ ++; 64k+1024+16 bytes, the smallest frame which needs two probes. ++define void @static_66576(ptr %out) #0 { ++; CHECK-LABEL: static_66576: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: .cfi_def_cfa_offset 65552 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #1040 ++; CHECK-NEXT: .cfi_def_cfa_offset 66592 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: .cfi_def_cfa_offset 1056 ++; CHECK-NEXT: add sp, sp, #1040 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 66576, align 1 ++ store i8* %v, ptr %out, align 8 ++ ret void ++} ++ ++; 2*64k+1024, the largest frame needing two probes. ++define void @static_132096(ptr %out) #0 { ++; CHECK-LABEL: static_132096: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: .cfi_def_cfa_offset 65552 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: .cfi_def_cfa_offset 131088 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #1024 ++; CHECK-NEXT: .cfi_def_cfa_offset 132112 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #32, lsl #12 // =131072 ++; CHECK-NEXT: .cfi_def_cfa_offset 1040 ++; CHECK-NEXT: add sp, sp, #1024 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 132096, align 1 ++ store i8* %v, ptr %out, align 8 ++ ret void ++} ++ ++; 5*64k-16, the largest frame probed without a loop. ++define void @static_327664(ptr %out) #0 { ++; CHECK-LABEL: static_327664: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: .cfi_def_cfa_offset 65552 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: .cfi_def_cfa_offset 131088 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: .cfi_def_cfa_offset 196624 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: .cfi_def_cfa_offset 262160 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #15, lsl #12 // =61440 ++; CHECK-NEXT: .cfi_def_cfa_offset 323600 ++; CHECK-NEXT: sub sp, sp, #4080 ++; CHECK-NEXT: .cfi_def_cfa_offset 327680 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #79, lsl #12 // =323584 ++; CHECK-NEXT: .cfi_def_cfa_offset 4096 ++; CHECK-NEXT: add sp, sp, #4080 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 327664, align 1 ++ store i8* %v, ptr %out, align 8 ++ ret void ++} ++ ++; 5*64k, smallest frame probed with a loop. ++define void @static_327680(ptr %out) #0 { ++; CHECK-LABEL: static_327680: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680 ++; CHECK-NEXT: .cfi_def_cfa w9, 327696 ++; CHECK-NEXT: .LBB6_1: // %entry ++; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: cmp sp, x9 ++; CHECK-NEXT: b.ne .LBB6_1 ++; CHECK-NEXT: // %bb.2: // %entry ++; CHECK-NEXT: .cfi_def_cfa_register wsp ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 327680, align 1 ++ store i8* %v, ptr %out, align 8 ++ ret void ++} ++ ++; 5*64k+1024, large enough to use a loop, but not a multiple of 64KiB ++; so has a reminder, but no extra probe. ++define void @static_328704(ptr %out) #0 { ++; CHECK-LABEL: static_328704: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680 ++; CHECK-NEXT: .cfi_def_cfa w9, 327696 ++; CHECK-NEXT: .LBB7_1: // %entry ++; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: cmp sp, x9 ++; CHECK-NEXT: b.ne .LBB7_1 ++; CHECK-NEXT: // %bb.2: // %entry ++; CHECK-NEXT: .cfi_def_cfa_register wsp ++; CHECK-NEXT: sub sp, sp, #1024 ++; CHECK-NEXT: .cfi_def_cfa_offset 328720 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680 ++; CHECK-NEXT: .cfi_def_cfa_offset 1040 ++; CHECK-NEXT: add sp, sp, #1024 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 328704, align 1 ++ store i8* %v, ptr %out, align 8 ++ ret void ++} ++ ++; 5*64k+1040, large enough to use a loop, has a reminder and ++; an extra probe. ++define void @static_328720(ptr %out) #0 { ++; CHECK-LABEL: static_328720: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680 ++; CHECK-NEXT: .cfi_def_cfa w9, 327696 ++; CHECK-NEXT: .LBB8_1: // %entry ++; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: cmp sp, x9 ++; CHECK-NEXT: b.ne .LBB8_1 ++; CHECK-NEXT: // %bb.2: // %entry ++; CHECK-NEXT: .cfi_def_cfa_register wsp ++; CHECK-NEXT: sub sp, sp, #1040 ++; CHECK-NEXT: .cfi_def_cfa_offset 328736 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680 ++; CHECK-NEXT: .cfi_def_cfa_offset 1056 ++; CHECK-NEXT: add sp, sp, #1040 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 328720, align 1 ++ store i8* %v, ptr %out, align 8 ++ ret void ++} ++ ++; A small allocation, but with a very large alignment requirement. We do this ++; by moving SP far enough that a sufficiently-aligned block will exist ++; somewhere in the stack frame, so must probe the whole of that larger SP move. ++define void @static_16_align_131072(ptr %out) #0 { ++; CHECK-LABEL: static_16_align_131072: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 16 ++; CHECK-NEXT: .cfi_offset w30, -8 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub x9, sp, #31, lsl #12 // =126976 ++; CHECK-NEXT: sub x9, x9, #4080 ++; CHECK-NEXT: and x9, x9, #0xfffffffffffe0000 ++; CHECK-NEXT: .LBB9_1: // %entry ++; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: cmp sp, x9 ++; CHECK-NEXT: b.le .LBB9_3 ++; CHECK-NEXT: // %bb.2: // %entry ++; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: b .LBB9_1 ++; CHECK-NEXT: .LBB9_3: // %entry ++; CHECK-NEXT: mov sp, x9 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: mov sp, x29 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w30 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 16, align 131072 ++ store i8* %v, ptr %out, align 8 ++ ret void ++} ++ ++; A small allocation, but with a very large alignment requirement which ++; is nevertheless small enough as to not need a loop. ++define void @static_16_align_8192(ptr %out) #0 { ++; CHECK-LABEL: static_16_align_8192: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 16 ++; CHECK-NEXT: .cfi_offset w30, -8 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: sub x9, x9, #4080 ++; CHECK-NEXT: and sp, x9, #0xffffffffffffe000 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: mov sp, x29 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w30 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 16, align 8192 ++ store i8* %v, ptr %out, align 8 ++ ret void ++} ++ ++; A large allocation with a very large alignment requirement which ++; is nevertheless small enough as to not need a loop. ++define void @static_32752_align_32k(ptr %out) #0 { ++; CHECK-LABEL: static_32752_align_32k: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 16 ++; CHECK-NEXT: .cfi_offset w30, -8 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub x9, sp, #7, lsl #12 // =28672 ++; CHECK-NEXT: sub x9, x9, #4080 ++; CHECK-NEXT: and sp, x9, #0xffffffffffff8000 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: mov sp, x29 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w30 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 32752, align 32768 ++ store i8* %v, ptr %out, align 8 ++ ret void ++} ++ ++attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" } +\ No newline at end of file +diff --git a/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir +new file mode 100644 +index 000000000000..a8a21ab330ba +--- /dev/null ++++ b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir +@@ -0,0 +1,146 @@ ++# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 ++# RUN: llc -run-pass=prologepilog %s -o - | FileCheck %s ++# Regression test for a crash when the probing instruction ++# to replace is last in the block. ++--- | ++ source_filename = "tt.ll" ++ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ++ target triple = "aarch64-linux" ++ ++ declare i1 @g(ptr) ++ ++ define void @f(ptr %out) #0 { ++ entry: ++ %p = alloca i32, i32 50000, align 4 ++ br label %loop ++ ++ loop: ; preds = %loop, %entry ++ %c = call i1 @g(ptr %p) ++ br i1 %c, label %loop, label %exit ++ ++ exit: ; preds = %loop ++ ret void ++ } ++ ++ attributes #0 = { uwtable "frame-pointer"="none" "probe-stack"="inline-asm" "target-features"="+sve" } ++ ++... 
++--- ++name: f ++alignment: 4 ++exposesReturnsTwice: false ++legalized: false ++regBankSelected: false ++selected: false ++failedISel: false ++tracksRegLiveness: true ++hasWinCFI: false ++callsEHReturn: false ++callsUnwindInit: false ++hasEHCatchret: false ++hasEHScopes: false ++hasEHFunclets: false ++isOutlined: false ++debugInstrRef: false ++failsVerification: false ++tracksDebugUserValues: true ++registers: [] ++liveins: [] ++frameInfo: ++ isFrameAddressTaken: false ++ isReturnAddressTaken: false ++ hasStackMap: false ++ hasPatchPoint: false ++ stackSize: 0 ++ offsetAdjustment: 0 ++ maxAlignment: 4 ++ adjustsStack: true ++ hasCalls: true ++ stackProtector: '' ++ functionContext: '' ++ maxCallFrameSize: 0 ++ cvBytesOfCalleeSavedRegisters: 0 ++ hasOpaqueSPAdjustment: false ++ hasVAStart: false ++ hasMustTailInVarArgFunc: false ++ hasTailCall: false ++ localFrameSize: 200000 ++ savePoint: '' ++ restorePoint: '' ++fixedStack: [] ++stack: ++ - { id: 0, name: p, type: default, offset: 0, size: 200000, alignment: 4, ++ stack-id: default, callee-saved-register: '', callee-saved-restored: true, ++ local-offset: -200000, debug-info-variable: '', debug-info-expression: '', ++ debug-info-location: '' } ++entry_values: [] ++callSites: [] ++debugValueSubstitutions: [] ++constants: [] ++machineFunctionInfo: {} ++body: | ++ ; CHECK-LABEL: name: f ++ ; CHECK: bb.0.entry: ++ ; CHECK-NEXT: successors: %bb.3(0x80000000) ++ ; CHECK-NEXT: liveins: $lr, $fp ++ ; CHECK-NEXT: {{ $}} ++ ; CHECK-NEXT: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.2), (store (s64) into %stack.1) ++ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 ++ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 ++ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 ++ ; CHECK-NEXT: $x9 = frame-setup SUBXri $sp, 48, 12 ++ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w9, 196624 ++ ; CHECK-NEXT: {{ $}} ++ ; CHECK-NEXT: bb.3.entry: ++ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000) ++ ; CHECK-NEXT: liveins: $x9 ++ ; CHECK-NEXT: {{ $}} ++ ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1, 12 ++ ; CHECK-NEXT: frame-setup STRXui $xzr, $sp, 0 ++ ; CHECK-NEXT: $xzr = frame-setup SUBSXrx64 $sp, $x9, 24, implicit-def $nzcv ++ ; CHECK-NEXT: frame-setup Bcc 1, %bb.3, implicit $nzcv ++ ; CHECK-NEXT: {{ $}} ++ ; CHECK-NEXT: bb.4.entry: ++ ; CHECK-NEXT: successors: %bb.1(0x80000000) ++ ; CHECK-NEXT: {{ $}} ++ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $wsp ++ ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 3392, 0 ++ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 200016 ++ ; CHECK-NEXT: frame-setup STRXui $xzr, $sp, 0 ++ ; CHECK-NEXT: {{ $}} ++ ; CHECK-NEXT: bb.1.loop: ++ ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ++ ; CHECK-NEXT: {{ $}} ++ ; CHECK-NEXT: $x0 = ADDXri $sp, 0, 0 ++ ; CHECK-NEXT: BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 ++ ; CHECK-NEXT: TBNZW killed renamable $w0, 0, %bb.1 ++ ; CHECK-NEXT: B %bb.2 ++ ; CHECK-NEXT: {{ $}} ++ ; CHECK-NEXT: bb.2.exit: ++ ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 48, 12 ++ ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 3408 ++ ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 3392, 0 ++ ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 ++ ; CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.1) ++ ; CHECK-NEXT: 
frame-destroy CFI_INSTRUCTION def_cfa_offset 0 ++ ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30 ++ ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 ++ ; CHECK-NEXT: RET_ReallyLR ++ bb.0.entry: ++ successors: %bb.1(0x80000000) ++ ++ ++ bb.1.loop: ++ successors: %bb.1(0x7c000000), %bb.2(0x04000000) ++ ++ ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp ++ $x0 = ADDXri %stack.0.p, 0, 0 ++ BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 ++ ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp ++ TBNZW killed renamable $w0, 0, %bb.1 ++ B %bb.2 ++ ++ bb.2.exit: ++ RET_ReallyLR ++ ++... +\ No newline at end of file +diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll +new file mode 100644 +index 000000000000..e765d071e722 +--- /dev/null ++++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll +@@ -0,0 +1,724 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s ++; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s ++ ++; Test prolog sequences for stack probing when SVE objects are involved. ++ ++; The space for SVE objects needs probing in the general case, because ++; the stack adjustment may happen to be too big (i.e. greater than the ++; probe size) to allocate with a single `addvl`. ++; When we do know that the stack adjustment cannot exceed the probe size ++; we can avoid emitting a probe loop and emit a simple `addvl; str` ++; sequence instead. ++ ++define void @sve_1_vector(ptr %out) #0 { ++; CHECK-LABEL: sve_1_vector: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: addvl sp, sp, #-1 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ++; CHECK-NEXT: addvl sp, sp, #1 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %vec = alloca , align 16 ++ ret void ++} ++ ++; As above, but with 4 SVE vectors of stack space. ++define void @sve_4_vector(ptr %out) #0 { ++; CHECK-LABEL: sve_4_vector: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: addvl sp, sp, #-4 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ++; CHECK-NEXT: addvl sp, sp, #4 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %vec1 = alloca , align 16 ++ %vec2 = alloca , align 16 ++ %vec3 = alloca , align 16 ++ %vec4 = alloca , align 16 ++ ret void ++} ++ ++; As above, but with 16 SVE vectors of stack space. ++; The stack adjustment is less than or equal to 16 x 256 = 4096, so ++; we can allocate the locals at once. ++define void @sve_16_vector(ptr %out) #0 { ++; CHECK-LABEL: sve_16_vector: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: addvl sp, sp, #-16 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: addvl sp, sp, #16 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %vec1 = alloca , align 16 ++ %vec2 = alloca , align 16 ++ %vec3 = alloca , align 16 ++ %vec4 = alloca , align 16 ++ %vec5 = alloca , align 16 ++ %vec6 = alloca , align 16 ++ %vec7 = alloca , align 16 ++ %vec8 = alloca , align 16 ++ %vec9 = alloca , align 16 ++ %vec10 = alloca , align 16 ++ %vec11 = alloca , align 16 ++ %vec12 = alloca , align 16 ++ %vec13 = alloca , align 16 ++ %vec14 = alloca , align 16 ++ %vec15 = alloca , align 16 ++ %vec16 = alloca , align 16 ++ ret void ++} ++ ++; As above, but with 17 SVE vectors of stack space. Now we need ++; a probing loops since stack adjustment may be greater than ++; the probe size (17 x 256 = 4354 bytes) ++; TODO: Allocating `k*16+r` SVE vectors can be unrolled into ++; emiting the `k + r` sequences of `addvl sp, sp, #-N; str xzr, [sp]` ++define void @sve_17_vector(ptr %out) #0 { ++; CHECK-LABEL: sve_17_vector: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: addvl x9, sp, #-17 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG ++; CHECK-NEXT: .LBB3_1: // %entry ++; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: cmp sp, x9 ++; CHECK-NEXT: b.le .LBB3_3 ++; CHECK-NEXT: // %bb.2: // %entry ++; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: b .LBB3_1 ++; CHECK-NEXT: .LBB3_3: // %entry ++; CHECK-NEXT: mov sp, x9 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: .cfi_def_cfa_register wsp ++; CHECK-NEXT: addvl sp, sp, #17 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %vec1 = alloca , align 16 ++ %vec2 = alloca , align 16 ++ %vec3 = alloca , align 16 ++ %vec4 = alloca , align 16 ++ %vec5 = alloca , align 16 ++ %vec6 = alloca , align 16 ++ %vec7 = alloca , align 16 ++ %vec8 = alloca , align 16 ++ %vec9 = alloca , align 16 ++ %vec10 = alloca , align 16 ++ %vec11 = alloca , align 16 ++ %vec12 = alloca , align 16 ++ %vec13 = alloca , align 16 ++ %vec14 = alloca , align 16 ++ %vec15 = alloca , align 16 ++ %vec16 = alloca , align 16 ++ %vec17 = alloca , align 16 ++ ret void ++} ++ ++; Space for callee-saved SVE register is allocated similarly to allocating ++; space for SVE locals. When we know the stack adjustment cannot exceed the ++; probe size we can skip the explict probe, since saving SVE registers serves ++; as an implicit probe. ++define void @sve_1v_csr( %a) #0 { ++; CHECK-LABEL: sve_1v_csr: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: addvl sp, sp, #-1 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ++; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill ++; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ++; CHECK-NEXT: //APP ++; CHECK-NEXT: //NO_APP ++; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload ++; CHECK-NEXT: addvl sp, sp, #1 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: .cfi_restore z8 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ call void asm sideeffect "", "~{z8}" () ++ ret void ++} ++ ++define void @sve_4v_csr( %a) #0 { ++; CHECK-LABEL: sve_4v_csr: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: addvl sp, sp, #-4 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ++; CHECK-NEXT: str z11, [sp] // 16-byte Folded Spill ++; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG ++; CHECK-NEXT: //APP ++; CHECK-NEXT: //NO_APP ++; CHECK-NEXT: ldr z11, [sp] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: addvl sp, sp, #4 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: .cfi_restore z8 ++; CHECK-NEXT: .cfi_restore z9 ++; CHECK-NEXT: .cfi_restore z10 ++; CHECK-NEXT: .cfi_restore z11 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11}" () ++ ret void ++} ++ ++define void @sve_16v_csr( %a) #0 { ++; CHECK-LABEL: sve_16v_csr: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: addvl sp, sp, #-16 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill ++; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ++; CHECK-NEXT: //APP ++; CHECK-NEXT: //NO_APP ++; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: addvl 
sp, sp, #16 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: .cfi_restore z8 ++; CHECK-NEXT: .cfi_restore z9 ++; CHECK-NEXT: .cfi_restore z10 ++; CHECK-NEXT: .cfi_restore z11 ++; CHECK-NEXT: .cfi_restore z12 ++; CHECK-NEXT: .cfi_restore z13 ++; CHECK-NEXT: .cfi_restore z14 ++; CHECK-NEXT: .cfi_restore z15 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" () ++ ret void ++} ++ ++define void @sve_1p_csr( %a) #0 { ++; CHECK-LABEL: sve_1p_csr: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: addvl sp, sp, #-1 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ++; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ++; CHECK-NEXT: //APP ++; CHECK-NEXT: //NO_APP ++; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ++; CHECK-NEXT: addvl sp, sp, #1 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ call void asm sideeffect "", "~{p8}" () ++ ret void ++} ++ ++define void @sve_4p_csr( %a) #0 { ++; CHECK-LABEL: sve_4p_csr: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: addvl sp, sp, #-1 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ++; CHECK-NEXT: str p11, [sp, #4, mul vl] // 2-byte Folded Spill ++; CHECK-NEXT: str p10, [sp, #5, mul vl] // 2-byte Folded Spill ++; CHECK-NEXT: str p9, [sp, #6, mul vl] // 2-byte Folded Spill ++; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ++; CHECK-NEXT: //APP ++; CHECK-NEXT: //NO_APP ++; CHECK-NEXT: ldr p11, [sp, #4, mul vl] // 2-byte Folded Reload ++; CHECK-NEXT: ldr p10, [sp, #5, mul vl] // 2-byte Folded Reload ++; CHECK-NEXT: ldr p9, [sp, #6, mul vl] // 2-byte Folded Reload ++; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ++; CHECK-NEXT: addvl sp, sp, #1 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ call void asm sideeffect "", "~{p8},~{p9},~{p10},~{p11}" () ++ ret void ++} ++ ++define void @sve_16v_1p_csr( %a) #0 { ++; CHECK-LABEL: sve_16v_1p_csr: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: addvl x9, sp, #-17 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG ++; CHECK-NEXT: .LBB9_1: // %entry ++; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: cmp sp, x9 ++; CHECK-NEXT: b.le .LBB9_3 ++; CHECK-NEXT: // %bb.2: // %entry ++; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: b .LBB9_1 ++; CHECK-NEXT: .LBB9_3: // %entry ++; CHECK-NEXT: mov sp, x9 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: .cfi_def_cfa_register wsp ++; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ++; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ++; CHECK-NEXT: //APP ++; CHECK-NEXT: //NO_APP ++; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ++; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z16, [sp, #8, mul 
vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: addvl sp, sp, #17 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: .cfi_restore z8 ++; CHECK-NEXT: .cfi_restore z9 ++; CHECK-NEXT: .cfi_restore z10 ++; CHECK-NEXT: .cfi_restore z11 ++; CHECK-NEXT: .cfi_restore z12 ++; CHECK-NEXT: .cfi_restore z13 ++; CHECK-NEXT: .cfi_restore z14 ++; CHECK-NEXT: .cfi_restore z15 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ call void asm sideeffect "", "~{p8},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" () ++ ret void ++} ++ ++; A SVE vector and a 16-byte fixed size object. ++define void @sve_1_vector_16_arr(ptr %out) #0 { ++; CHECK-LABEL: sve_1_vector_16_arr: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub sp, sp, #16 ++; CHECK-NEXT: .cfi_def_cfa_offset 32 ++; CHECK-NEXT: addvl sp, sp, #-1 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG ++; CHECK-NEXT: addvl sp, sp, #1 ++; CHECK-NEXT: .cfi_def_cfa wsp, 32 ++; CHECK-NEXT: add sp, sp, #16 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %vec = alloca , align 16 ++ %arr = alloca i8, i64 16, align 1 ++ ret void ++} ++ ++; A large SVE stack object and a large stack slot, both of which need probing. ++; TODO: This could be optimised by combining the fixed-size offset into the ++; loop. ++define void @sve_1_vector_4096_arr(ptr %out) #0 { ++; CHECK-LABEL: sve_1_vector_4096_arr: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub x9, sp, #3, lsl #12 // =12288 ++; CHECK-NEXT: .cfi_def_cfa w9, 12304 ++; CHECK-NEXT: addvl x9, x9, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 256 * VG ++; CHECK-NEXT: addvl x9, x9, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 512 * VG ++; CHECK-NEXT: .LBB11_1: // %entry ++; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: cmp sp, x9 ++; CHECK-NEXT: b.le .LBB11_3 ++; CHECK-NEXT: // %bb.2: // %entry ++; CHECK-NEXT: // in Loop: Header=BB11_1 Depth=1 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: b .LBB11_1 ++; CHECK-NEXT: .LBB11_3: // %entry ++; CHECK-NEXT: mov sp, x9 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: .cfi_def_cfa_register wsp ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x88, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 264 * VG ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 16 * VG ++; CHECK-NEXT: addvl sp, sp, #2 ++; CHECK-NEXT: .cfi_def_cfa wsp, 12304 ++; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %vec = alloca , align 16 ++ %arr = alloca i8, i64 12288, align 1 ++ ret void ++} ++ ++; Not tested: SVE stack objects with alignment >16 bytes, which isn't currently ++; supported even without stack-probing. ++ ++; An SVE vector, and a 16-byte fixed size object, which ++; has a large alignment requirement. ++define void @sve_1_vector_16_arr_align_8192(ptr %out) #0 { ++; CHECK-LABEL: sve_1_vector_16_arr_align_8192: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 16 ++; CHECK-NEXT: .cfi_offset w30, -8 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: sub x9, x9, #4080 ++; CHECK-NEXT: addvl x9, x9, #-1 ++; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 ++; CHECK-NEXT: .LBB12_1: // %entry ++; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: cmp sp, x9 ++; CHECK-NEXT: b.le .LBB12_3 ++; CHECK-NEXT: // %bb.2: // %entry ++; CHECK-NEXT: // in Loop: Header=BB12_1 Depth=1 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: b .LBB12_1 ++; CHECK-NEXT: .LBB12_3: // %entry ++; CHECK-NEXT: mov sp, x9 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: mov sp, x29 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w30 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %vec = alloca , align 16 ++ %arr = alloca i8, i64 16, align 8192 ++ ret void ++} ++ ++; With 64k guard pages, we can allocate bigger SVE space without a probing loop. 
++define void @sve_1024_64k_guard(ptr %out) #0 "stack-probe-size"="65536" { ++; CHECK-LABEL: sve_1024_64k_guard: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: addvl sp, sp, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 256 * VG ++; CHECK-NEXT: addvl sp, sp, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 512 * VG ++; CHECK-NEXT: addvl sp, sp, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 768 * VG ++; CHECK-NEXT: addvl sp, sp, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1024 * VG ++; CHECK-NEXT: addvl sp, sp, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1280 * VG ++; CHECK-NEXT: addvl sp, sp, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1536 * VG ++; CHECK-NEXT: addvl sp, sp, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1792 * VG ++; CHECK-NEXT: addvl sp, sp, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 2048 * VG ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1800 * VG ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1552 * VG ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1304 * VG ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1056 * VG ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 808 * VG ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 560 * VG ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 312 * VG ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG ++; CHECK-NEXT: addvl sp, sp, #8 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %vec = alloca , align 16 ++ ret void ++} ++ ++define void @sve_1028_64k_guard(ptr %out) #0 "stack-probe-size"="65536" { ++; CHECK-LABEL: sve_1028_64k_guard: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: addvl x9, sp, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 256 * VG ++; CHECK-NEXT: addvl x9, x9, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 512 * VG ++; CHECK-NEXT: addvl x9, x9, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 768 * VG ++; CHECK-NEXT: addvl x9, x9, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1024 * VG ++; CHECK-NEXT: addvl x9, x9, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1280 * VG ++; CHECK-NEXT: addvl x9, x9, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1536 * VG ++; CHECK-NEXT: addvl x9, x9, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1792 * VG ++; CHECK-NEXT: addvl x9, x9, #-32 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2048 * VG ++; CHECK-NEXT: addvl x9, x9, #-1 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2056 * VG ++; CHECK-NEXT: .LBB14_1: // %entry ++; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: cmp sp, x9 ++; CHECK-NEXT: b.le .LBB14_3 ++; CHECK-NEXT: // %bb.2: // %entry ++; CHECK-NEXT: // in Loop: Header=BB14_1 Depth=1 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: b .LBB14_1 ++; CHECK-NEXT: .LBB14_3: // %entry ++; CHECK-NEXT: mov sp, x9 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: .cfi_def_cfa_register wsp ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1808 * VG ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1560 * VG ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1312 * VG ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1064 * VG ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 816 * VG ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 568 * VG ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 320 * VG ++; CHECK-NEXT: addvl sp, sp, #31 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG ++; 
CHECK-NEXT: addvl sp, sp, #9 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %vec = alloca , align 16 ++ %vec1 = alloca , align 16 ++ ret void ++} ++ ++; With 5 SVE vectors of stack space the unprobed area ++; at the top of the stack can exceed 1024 bytes (5 x 256 == 1280), ++; hence we need to issue a probe. ++define void @sve_5_vector(ptr %out) #0 { ++; CHECK-LABEL: sve_5_vector: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: addvl sp, sp, #-5 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 40 * VG ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: addvl sp, sp, #5 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %vec1 = alloca , align 16 ++ %vec2 = alloca , align 16 ++ %vec3 = alloca , align 16 ++ %vec4 = alloca , align 16 ++ %vec5 = alloca , align 16 ++ ret void ++} ++ ++; Test with a 14 scalable bytes (so up to 14 * 16 = 224) of unprobed ++; are bellow the save location of `p9`. ++define void @sve_unprobed_area( %a, i32 %n) #0 { ++; CHECK-LABEL: sve_unprobed_area: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: addvl sp, sp, #-4 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill ++; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill ++; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG ++; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG ++; CHECK-NEXT: addvl sp, sp, #-4 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG ++; CHECK-NEXT: //APP ++; CHECK-NEXT: //NO_APP ++; CHECK-NEXT: addvl sp, sp, #4 ++; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ++; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload ++; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload ++; CHECK-NEXT: addvl sp, sp, #4 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: .cfi_restore z8 ++; CHECK-NEXT: .cfi_restore z9 ++; CHECK-NEXT: .cfi_restore z10 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ call void asm sideeffect "", "~{z8},~{z9},~{z10},~{p9}" () ++ 
++ %v0 = alloca , align 16 ++ %v1 = alloca , align 16 ++ %v2 = alloca , align 16 ++ %v3 = alloca , align 16 ++ ++ ret void ++} ++ ++attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" } +\ No newline at end of file +diff --git a/llvm/test/CodeGen/AArch64/stack-probing.ll b/llvm/test/CodeGen/AArch64/stack-probing.ll +new file mode 100644 +index 000000000000..95001450622f +--- /dev/null ++++ b/llvm/test/CodeGen/AArch64/stack-probing.ll +@@ -0,0 +1,539 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s ++; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s ++ ++; Tests for prolog sequences for stack probing, when using a 4KiB stack guard. ++ ++; The stack probing parameters in function attributes take precedence over ++; ones in the module flags. ++ ++; Small stack frame, no probing required. ++define void @static_64(ptr %out) #0 { ++; CHECK-LABEL: static_64: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: sub sp, sp, #64 ++; CHECK-NEXT: .cfi_def_cfa_offset 64 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #64 ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 64, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; At 256 bytes we start to always create a frame pointer. No frame smaller then ++; this needs a probe, so we can use the saving of at least one CSR as a probe ++; at the top of our frame. ++define void @static_256(ptr %out) #0 { ++; CHECK-LABEL: static_256: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: sub sp, sp, #272 ++; CHECK-NEXT: .cfi_def_cfa_offset 272 ++; CHECK-NEXT: str x29, [sp, #256] // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #272 ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 256, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; At 1024 bytes, this is the largest frame which doesn't need probing. ++define void @static_1024(ptr %out) #0 { ++; CHECK-LABEL: static_1024: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub sp, sp, #1024 ++; CHECK-NEXT: .cfi_def_cfa_offset 1040 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #1024 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 1024, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; At 1024+16 bytes, this is the smallest frame which needs probing. ++define void @static_1040(ptr %out) #0 { ++; CHECK-LABEL: static_1040: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub sp, sp, #1040 ++; CHECK-NEXT: .cfi_def_cfa_offset 1056 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #1040 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 1040, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; 4k bytes is the largest frame we can probe in one go. ++define void @static_4096(ptr %out) #0 { ++; CHECK-LABEL: static_4096: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: .cfi_def_cfa_offset 4112 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 4096, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; 4k+16 bytes, still needs just one probe. ++define void @static_4112(ptr %out) #0 { ++; CHECK-LABEL: static_4112: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: .cfi_def_cfa_offset 4112 ++; CHECK-NEXT: str xzr, [sp], #-16 ++; CHECK-NEXT: .cfi_def_cfa_offset 4128 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: .cfi_def_cfa_offset 32 ++; CHECK-NEXT: add sp, sp, #16 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 4112, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; 4k+1024 bytes, the largest frame which needs just one probe. ++define void @static_5120(ptr %out) #0 { ++; CHECK-LABEL: static_5120: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: .cfi_def_cfa_offset 4112 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #1024 ++; CHECK-NEXT: .cfi_def_cfa_offset 5136 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: .cfi_def_cfa_offset 1040 ++; CHECK-NEXT: add sp, sp, #1024 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 5120, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; 4k+1024+16, the smallest frame which needs two probes. ++define void @static_5136(ptr %out) #0 { ++; CHECK-LABEL: static_5136: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: .cfi_def_cfa_offset 4112 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #1040 ++; CHECK-NEXT: .cfi_def_cfa_offset 5152 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: .cfi_def_cfa_offset 1056 ++; CHECK-NEXT: add sp, sp, #1040 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 5136, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; 2*4k+1024, the largest frame needing two probes ++define void @static_9216(ptr %out) #0 { ++; CHECK-LABEL: static_9216: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: .cfi_def_cfa_offset 4112 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: .cfi_def_cfa_offset 8208 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #1024 ++; CHECK-NEXT: .cfi_def_cfa_offset 9232 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192 ++; CHECK-NEXT: .cfi_def_cfa_offset 1040 ++; CHECK-NEXT: add sp, sp, #1024 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 9216, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; 5*4k-16, the largest frame probed without a loop ++define void @static_20464(ptr %out) #0 { ++; CHECK-LABEL: static_20464: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: .cfi_def_cfa_offset 4112 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: .cfi_def_cfa_offset 8208 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: .cfi_def_cfa_offset 12304 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: .cfi_def_cfa_offset 16400 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #4080 ++; CHECK-NEXT: .cfi_def_cfa_offset 20480 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #4, lsl #12 // =16384 ++; CHECK-NEXT: .cfi_def_cfa_offset 4096 ++; CHECK-NEXT: add sp, sp, #4080 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 20464, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; 5*4k, the smallest frame probed with a loop ++define void @static_20480(ptr %out) #0 { ++; CHECK-LABEL: static_20480: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480 ++; CHECK-NEXT: .cfi_def_cfa w9, 20496 ++; CHECK-NEXT: .LBB10_1: // %entry ++; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: cmp sp, x9 ++; CHECK-NEXT: b.ne .LBB10_1 ++; CHECK-NEXT: // %bb.2: // %entry ++; CHECK-NEXT: .cfi_def_cfa_register wsp ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 20480, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; 5*4k + 1024, large enough to use a loop, but not a multiple of 4KiB ++; so has a reminder, but no extra probe. ++define void @static_21504(ptr %out) #0 { ++; CHECK-LABEL: static_21504: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480 ++; CHECK-NEXT: .cfi_def_cfa w9, 20496 ++; CHECK-NEXT: .LBB11_1: // %entry ++; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: cmp sp, x9 ++; CHECK-NEXT: b.ne .LBB11_1 ++; CHECK-NEXT: // %bb.2: // %entry ++; CHECK-NEXT: .cfi_def_cfa_register wsp ++; CHECK-NEXT: sub sp, sp, #1024 ++; CHECK-NEXT: .cfi_def_cfa_offset 21520 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480 ++; CHECK-NEXT: .cfi_def_cfa_offset 1040 ++; CHECK-NEXT: add sp, sp, #1024 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 21504, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; 5*4k+1040, large enough to use a loop, has a reminder and ++; an extra probe. ++define void @static_21520(ptr %out) #0 { ++; CHECK-LABEL: static_21520: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480 ++; CHECK-NEXT: .cfi_def_cfa w9, 20496 ++; CHECK-NEXT: .LBB12_1: // %entry ++; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: cmp sp, x9 ++; CHECK-NEXT: b.ne .LBB12_1 ++; CHECK-NEXT: // %bb.2: // %entry ++; CHECK-NEXT: .cfi_def_cfa_register wsp ++; CHECK-NEXT: sub sp, sp, #1040 ++; CHECK-NEXT: .cfi_def_cfa_offset 21536 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480 ++; CHECK-NEXT: .cfi_def_cfa_offset 1056 ++; CHECK-NEXT: add sp, sp, #1040 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 21520, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; A small allocation, but with a very large alignment requirement. 
We do this ++; by moving SP far enough that a sufficiently-aligned block will exist ++; somewhere in the stack frame, so must probe the whole of that larger SP move. ++define void @static_16_align_8192(ptr %out) #0 { ++; CHECK-LABEL: static_16_align_8192: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 16 ++; CHECK-NEXT: .cfi_offset w30, -8 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: sub x9, x9, #4080 ++; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 ++; CHECK-NEXT: .LBB13_1: // %entry ++; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: cmp sp, x9 ++; CHECK-NEXT: b.le .LBB13_3 ++; CHECK-NEXT: // %bb.2: // %entry ++; CHECK-NEXT: // in Loop: Header=BB13_1 Depth=1 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: b .LBB13_1 ++; CHECK-NEXT: .LBB13_3: // %entry ++; CHECK-NEXT: mov sp, x9 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: mov sp, x29 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w30 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 16, align 8192 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; A small allocation with a very large alignment requirement, but ++; nevertheless small enough as to not need a loop. ++define void @static_16_align_2048(ptr %out) #0 { ++; CHECK-LABEL: static_16_align_2048: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 16 ++; CHECK-NEXT: .cfi_offset w30, -8 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub x9, sp, #2032 ++; CHECK-NEXT: and sp, x9, #0xfffffffffffff800 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: mov sp, x29 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w30 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 16, align 2048 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; A large(-ish) allocation with a very large alignment requirement, but ++; nevertheless small enough as to not need a loop. ++define void @static_2032_align_2048(ptr %out) #0 { ++; CHECK-LABEL: static_2032_align_2048: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 16 ++; CHECK-NEXT: .cfi_offset w30, -8 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub x9, sp, #2032 ++; CHECK-NEXT: and sp, x9, #0xfffffffffffff800 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: mov sp, x29 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w30 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 2032, align 2048 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; Test stack probing is enabled by module flags ++define void @static_9232(ptr %out) uwtable(async) { ++; CHECK-LABEL: static_9232: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub sp, sp, #2, lsl #12 // =8192 ++; CHECK-NEXT: .cfi_def_cfa_offset 8208 ++; CHECK-NEXT: sub sp, sp, #800 ++; CHECK-NEXT: .cfi_def_cfa_offset 9008 ++; CHECK-NEXT: str xzr, [sp], #-240 ++; CHECK-NEXT: .cfi_def_cfa_offset 9248 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192 ++; CHECK-NEXT: .cfi_def_cfa_offset 1056 ++; CHECK-NEXT: add sp, sp, #1040 ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i64 9232, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; Test for a tight upper bound on the amount of stack adjustment ++; due to stack realignment. No probes should appear. ++define void @static_1008(ptr %out) #0 { ++; CHECK-LABEL: static_1008: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 16 ++; CHECK-NEXT: .cfi_offset w30, -8 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: sub x9, sp, #1008 ++; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str x8, [x0] ++; CHECK-NEXT: mov sp, x29 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w30 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i8, i32 1008, align 32 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" } ++ ++!llvm.module.flags = !{!0, !1} ++ ++!0 = !{i32 4, !"probe-stack", !"inline-asm"} ++!1 = !{i32 8, !"stack-probe-size", i32 9000} +\ No newline at end of file +-- +2.42.0.windows.2 + diff --git a/0024-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-SelectionDAG.patch b/0024-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-SelectionDAG.patch new file mode 100644 index 0000000..2cd9d3e --- /dev/null +++ b/0024-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-SelectionDAG.patch @@ -0,0 +1,744 @@ +From e433199a7dbe87324a671299f6509f19d295382f Mon Sep 17 00:00:00 2001 +From: rickyleung +Date: Fri, 26 Apr 2024 16:59:48 +0800 +Subject: [PATCH 5/7] [backport][AArch64] Stack probing for dynamic allocas in + SelectionDAG + +Reference: https://github.com/llvm/llvm-project/commit/b1806e6a1f0589acc88499419531c4eb82488f1a + +Add support for probing for dynamic allocas (variable-size objects and +outgoing stack arguments). + +Co-authored-by: Oliver Stannard +--- + .../Target/AArch64/AArch64FrameLowering.cpp | 26 ++ + .../Target/AArch64/AArch64ISelLowering.cpp | 152 +++++--- + llvm/lib/Target/AArch64/AArch64ISelLowering.h | 13 +- + llvm/lib/Target/AArch64/AArch64InstrInfo.td | 14 + + .../stack-probing-dynamic-no-frame-setup.ll | 14 + + .../CodeGen/AArch64/stack-probing-dynamic.ll | 362 ++++++++++++++++++ + 6 files changed, 526 insertions(+), 55 deletions(-) + create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll + create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll + +diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +index af019ab23770..fe21173f531f 100644 +--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp ++++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +@@ -462,6 +462,11 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { + /// included as part of the stack frame. + bool + AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { ++ // The stack probing code for the dynamically allocated outgoing arguments ++ // area assumes that the stack is probed at the top - either by the prologue ++ // code, which issues a probe if `hasVarSizedObjects` return true, or by the ++ // most recent variable-sized object allocation. Changing the condition here ++ // may need to be followed up by changes to the probe issuing logic. 
+ return !MF.getFrameInfo().hasVarSizedObjects(); + } + +@@ -470,6 +475,9 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( + MachineBasicBlock::iterator I) const { + const AArch64InstrInfo *TII = + static_cast(MF.getSubtarget().getInstrInfo()); ++ const AArch64TargetLowering *TLI = ++ MF.getSubtarget().getTargetLowering(); ++ MachineFrameInfo &MFI = MF.getFrameInfo(); + DebugLoc DL = I->getDebugLoc(); + unsigned Opc = I->getOpcode(); + bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); +@@ -496,6 +504,24 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( + // Most call frames will be allocated at the start of a function so + // this is OK, but it is a limitation that needs dealing with. + assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); ++ ++ if (TLI->hasInlineStackProbe(MF) && ++ -Amount >= AArch64::StackProbeMaxUnprobedStack) { ++ // When stack probing is enabled, the decrement of SP may need to be ++ // probed. We only need to do this if the call site needs 1024 bytes of ++ // space or more, because a region smaller than that is allowed to be ++ // unprobed at an ABI boundary. We rely on the fact that SP has been ++ // probed exactly at this point, either by the prologue or most recent ++ // dynamic allocation. ++ assert(MFI.hasVarSizedObjects() && ++ "non-reserved call frame without var sized objects?"); ++ Register ScratchReg = ++ MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); ++ inlineStackProbeFixed(I, ScratchReg, -Amount, StackOffset::get(0, 0)); ++ } else { ++ emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, ++ StackOffset::getFixed(Amount), TII); ++ } + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(Amount), TII); + } +diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +index 082043420fb9..eff0722e1c77 100644 +--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp ++++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +@@ -556,10 +556,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + +- if (Subtarget->isTargetWindows()) +- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); +- else +- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); ++ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); + + // Constant pool entries + setOperationAction(ISD::ConstantPool, MVT::i64, Custom); +@@ -2288,6 +2285,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { + MAKE_CASE(AArch64ISD::CSINC) + MAKE_CASE(AArch64ISD::THREAD_POINTER) + MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) ++ MAKE_CASE(AArch64ISD::PROBED_ALLOCA) + MAKE_CASE(AArch64ISD::ABDS_PRED) + MAKE_CASE(AArch64ISD::ABDU_PRED) + MAKE_CASE(AArch64ISD::HADDS_PRED) +@@ -2646,6 +2644,22 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( + return BB; + } + ++MachineBasicBlock * ++AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI, ++ MachineBasicBlock *MBB) const { ++ MachineFunction &MF = *MBB->getParent(); ++ MachineBasicBlock::iterator MBBI = MI.getIterator(); ++ DebugLoc DL = MBB->findDebugLoc(MBBI); ++ const AArch64InstrInfo &TII = ++ *MF.getSubtarget().getInstrInfo(); ++ Register TargetReg = MI.getOperand(0).getReg(); ++ MachineBasicBlock::iterator NextInst = ++ TII.probedStackAlloc(MBBI, TargetReg, 
false); ++ ++ MI.eraseFromParent(); ++ return NextInst->getParent(); ++} ++ + MachineBasicBlock * + AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, +@@ -2774,6 +2788,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( + + case AArch64::CATCHRET: + return EmitLoweredCatchRet(MI, BB); ++ case AArch64::PROBED_STACKALLOC_DYN: ++ return EmitDynamicProbedAlloc(MI, BB); + case AArch64::LD1_MXIPXX_H_PSEUDO_B: + return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB); + case AArch64::LD1_MXIPXX_H_PSEUDO_H: +@@ -13666,9 +13682,34 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, + AN->getMemOperand()); + } + +-SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( +- SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const { ++SDValue ++AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, ++ SelectionDAG &DAG) const { ++ + SDLoc dl(Op); ++ // Get the inputs. ++ SDNode *Node = Op.getNode(); ++ SDValue Chain = Op.getOperand(0); ++ SDValue Size = Op.getOperand(1); ++ MaybeAlign Align = ++ cast(Op.getOperand(2))->getMaybeAlignValue(); ++ EVT VT = Node->getValueType(0); ++ ++ if (DAG.getMachineFunction().getFunction().hasFnAttribute( ++ "no-stack-arg-probe")) { ++ SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); ++ Chain = SP.getValue(1); ++ SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); ++ if (Align) ++ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), ++ DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); ++ Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); ++ SDValue Ops[2] = {SP, Chain}; ++ return DAG.getMergeValues(Ops, dl); ++ } ++ ++ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); ++ + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(), + PtrVT, 0); +@@ -13692,7 +13733,59 @@ SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( + + Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size, + DAG.getConstant(4, dl, MVT::i64)); +- return Chain; ++ ++ SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); ++ Chain = SP.getValue(1); ++ SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); ++ if (Align) ++ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), ++ DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); ++ Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); ++ ++ Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); ++ ++ SDValue Ops[2] = {SP, Chain}; ++ return DAG.getMergeValues(Ops, dl); ++} ++ ++SDValue ++AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op, ++ SelectionDAG &DAG) const { ++ // Get the inputs. ++ SDNode *Node = Op.getNode(); ++ SDValue Chain = Op.getOperand(0); ++ SDValue Size = Op.getOperand(1); ++ ++ MaybeAlign Align = ++ cast(Op.getOperand(2))->getMaybeAlignValue(); ++ SDLoc dl(Op); ++ EVT VT = Node->getValueType(0); ++ ++ // Construct the new SP value in a GPR. ++ SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); ++ Chain = SP.getValue(1); ++ SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); ++ if (Align) ++ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), ++ DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); ++ ++ // Set the real SP to the new value with a probing loop. 
++ Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP); ++ SDValue Ops[2] = {SP, Chain}; ++ return DAG.getMergeValues(Ops, dl); ++} ++ ++SDValue ++AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, ++ SelectionDAG &DAG) const { ++ MachineFunction &MF = DAG.getMachineFunction(); ++ ++ if (Subtarget->isTargetWindows()) ++ return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG); ++ else if (hasInlineStackProbe(MF)) ++ return LowerInlineDYNAMIC_STACKALLOC(Op, DAG); ++ else ++ return SDValue(); + } + + // When x and y are extended, lower: +@@ -13746,51 +13839,6 @@ SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG, + return DAG.getNode(ISD::ADD, dl, VT, Add, tmp); + } + +-SDValue +-AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, +- SelectionDAG &DAG) const { +- assert(Subtarget->isTargetWindows() && +- "Only Windows alloca probing supported"); +- SDLoc dl(Op); +- // Get the inputs. +- SDNode *Node = Op.getNode(); +- SDValue Chain = Op.getOperand(0); +- SDValue Size = Op.getOperand(1); +- MaybeAlign Align = +- cast(Op.getOperand(2))->getMaybeAlignValue(); +- EVT VT = Node->getValueType(0); +- +- if (DAG.getMachineFunction().getFunction().hasFnAttribute( +- "no-stack-arg-probe")) { +- SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); +- Chain = SP.getValue(1); +- SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); +- if (Align) +- SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), +- DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); +- Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); +- SDValue Ops[2] = {SP, Chain}; +- return DAG.getMergeValues(Ops, dl); +- } +- +- Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); +- +- Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG); +- +- SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); +- Chain = SP.getValue(1); +- SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); +- if (Align) +- SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), +- DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); +- Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); +- +- Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); +- +- SDValue Ops[2] = {SP, Chain}; +- return DAG.getMergeValues(Ops, dl); +-} +- + SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); +diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h +index 643d363e234a..9b388c7f8668 100644 +--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h ++++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h +@@ -90,6 +90,10 @@ enum NodeType : unsigned { + ADC, + SBC, // adc, sbc instructions + ++ // To avoid stack clash, allocation is performed by block and each block is ++ // probed. ++ PROBED_ALLOCA, ++ + // Predicated instructions where inactive lanes produce undefined results. 
+ ABDS_PRED, + ABDU_PRED, +@@ -610,6 +614,9 @@ public: + MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI, + MachineBasicBlock *BB) const; + ++ MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI, ++ MachineBasicBlock *MBB) const; ++ + MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg, + MachineInstr &MI, + MachineBasicBlock *BB) const; +@@ -1113,10 +1120,10 @@ private: + SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; +- SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain, +- SDValue &Size, +- SelectionDAG &DAG) const; ++ + SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const; + + SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op, +diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td +index 09980c2f45e6..9b9103e01d67 100644 +--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td ++++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td +@@ -818,6 +818,12 @@ def AArch64stilp : SDNode<"AArch64ISD::STILP", SDT_AArch64stilp, [SDNPHasChain, + def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + + def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>; ++ ++def AArch64probedalloca ++ : SDNode<"AArch64ISD::PROBED_ALLOCA", ++ SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, ++ [SDNPHasChain, SDNPMayStore]>; ++ + def AArch64mrs : SDNode<"AArch64ISD::MRS", + SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>, + [SDNPHasChain, SDNPOutGlue]>; +@@ -908,6 +914,14 @@ def PROBED_STACKALLOC_VAR : Pseudo<(outs), + []>, + Sched<[]>; + ++// Probed stack allocations of a variable size, used for allocas of unknown size ++// when stack-clash protection is enabled. ++let usesCustomInserter = 1 in ++def PROBED_STACKALLOC_DYN : Pseudo<(outs), ++ (ins GPR64common:$target), ++ [(AArch64probedalloca GPR64common:$target)]>, ++ Sched<[]>; ++ + } // Defs = [SP, NZCV], Uses = [SP] in + } // hasSideEffects = 1, isCodeGenOnly = 1 + +diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll +new file mode 100644 +index 000000000000..673f9038a35f +--- /dev/null ++++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll +@@ -0,0 +1,14 @@ ++; RUN: llc --stop-after=finalize-isel -o - | FileCheck %s ++target triple = "aarch64-linux" ++ ++; Check dynamic stack allocation and probing instructions do not have ++; the FrameSetup flag. 
++ ++; CHECK-NOT: frame-setup ++define void @no_frame_setup(i64 %size, ptr %out) #0 { ++ %v = alloca i8, i64 %size, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" } +\ No newline at end of file +diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll +new file mode 100644 +index 000000000000..4d9ef77f7a0d +--- /dev/null ++++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll +@@ -0,0 +1,362 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s ++ ++; Dynamically-sized allocation, needs a loop which can handle any size at ++; runtime. The final iteration of the loop will temporarily put SP below the ++; target address, but this doesn't break any of the ABI constraints on the ++; stack, and also doesn't probe below the target SP value. ++define void @dynamic(i64 %size, ptr %out) #0 { ++; CHECK-LABEL: dynamic: ++; CHECK: // %bb.0: ++; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 16 ++; CHECK-NEXT: .cfi_offset w30, -8 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: add x9, x0, #15 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 ++; CHECK-NEXT: sub x8, x8, x9 ++; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: cmp sp, x8 ++; CHECK-NEXT: b.le .LBB0_3 ++; CHECK-NEXT: // %bb.2: // in Loop: Header=BB0_1 Depth=1 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: b .LBB0_1 ++; CHECK-NEXT: .LBB0_3: ++; CHECK-NEXT: mov sp, x8 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: str x8, [x1] ++; CHECK-NEXT: mov sp, x29 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w30 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++ %v = alloca i8, i64 %size, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; This function has a fixed-size stack slot and a dynamic one. The fixed size ++; slot isn't large enough that we would normally probe it, but we need to do so ++; here otherwise the gap between the CSR save and the first probe of the ++; dynamic allocation could be too far apart when the size of the dynamic ++; allocation is close to the guard size. ++define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 { ++; CHECK-LABEL: dynamic_fixed: ++; CHECK: // %bb.0: ++; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 16 ++; CHECK-NEXT: .cfi_offset w30, -8 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: str xzr, [sp, #-64]! 
++; CHECK-NEXT: add x9, x0, #15 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: sub x10, x29, #64 ++; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 ++; CHECK-NEXT: str x10, [x1] ++; CHECK-NEXT: sub x8, x8, x9 ++; CHECK-NEXT: .LBB1_1: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: cmp sp, x8 ++; CHECK-NEXT: b.le .LBB1_3 ++; CHECK-NEXT: // %bb.2: // in Loop: Header=BB1_1 Depth=1 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: b .LBB1_1 ++; CHECK-NEXT: .LBB1_3: ++; CHECK-NEXT: mov sp, x8 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: str x8, [x2] ++; CHECK-NEXT: mov sp, x29 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w30 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++ %v1 = alloca i8, i64 64, align 1 ++ store ptr %v1, ptr %out1, align 8 ++ %v2 = alloca i8, i64 %size, align 1 ++ store ptr %v2, ptr %out2, align 8 ++ ret void ++} ++ ++; Dynamic allocation, with an alignment requirement greater than the alignment ++; of SP. Done by ANDing the target SP with a constant to align it down, then ++; doing the loop as normal. Note that we also re-align the stack in the prolog, ++; which isn't actually needed because the only aligned allocations are dynamic, ++; this is done even without stack probing. ++define void @dynamic_align_64(i64 %size, ptr %out) #0 { ++; CHECK-LABEL: dynamic_align_64: ++; CHECK: // %bb.0: ++; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 32 ++; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 32 ++; CHECK-NEXT: .cfi_offset w19, -16 ++; CHECK-NEXT: .cfi_offset w30, -24 ++; CHECK-NEXT: .cfi_offset w29, -32 ++; CHECK-NEXT: sub x9, sp, #32 ++; CHECK-NEXT: and sp, x9, #0xffffffffffffffc0 ++; CHECK-NEXT: add x9, x0, #15 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 ++; CHECK-NEXT: mov x19, sp ++; CHECK-NEXT: sub x8, x8, x9 ++; CHECK-NEXT: and x8, x8, #0xffffffffffffffc0 ++; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: cmp sp, x8 ++; CHECK-NEXT: b.le .LBB2_3 ++; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: b .LBB2_1 ++; CHECK-NEXT: .LBB2_3: ++; CHECK-NEXT: mov sp, x8 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: str x8, [x1] ++; CHECK-NEXT: mov sp, x29 ++; CHECK-NEXT: .cfi_def_cfa wsp, 32 ++; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload ++; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w19 ++; CHECK-NEXT: .cfi_restore w30 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++ %v = alloca i8, i64 %size, align 64 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; Dynamic allocation, with an alignment greater than the stack guard size. The ++; only difference to the dynamic allocation is the constant used for aligning ++; the target SP, the loop will probe the whole allocation without needing to ++; know about the alignment padding. ++define void @dynamic_align_8192(i64 %size, ptr %out) #0 { ++; CHECK-LABEL: dynamic_align_8192: ++; CHECK: // %bb.0: ++; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 32 ++; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 32 ++; CHECK-NEXT: .cfi_offset w19, -16 ++; CHECK-NEXT: .cfi_offset w30, -24 ++; CHECK-NEXT: .cfi_offset w29, -32 ++; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: sub x9, x9, #4064 ++; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 ++; CHECK-NEXT: .LBB3_1: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: cmp sp, x9 ++; CHECK-NEXT: b.le .LBB3_3 ++; CHECK-NEXT: // %bb.2: // in Loop: Header=BB3_1 Depth=1 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: b .LBB3_1 ++; CHECK-NEXT: .LBB3_3: ++; CHECK-NEXT: mov sp, x9 ++; CHECK-NEXT: add x9, x0, #15 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 ++; CHECK-NEXT: mov x19, sp ++; CHECK-NEXT: sub x8, x8, x9 ++; CHECK-NEXT: and x8, x8, #0xffffffffffffe000 ++; CHECK-NEXT: .LBB3_4: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: cmp sp, x8 ++; CHECK-NEXT: b.le .LBB3_6 ++; CHECK-NEXT: // %bb.5: // in Loop: Header=BB3_4 Depth=1 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: b .LBB3_4 ++; CHECK-NEXT: .LBB3_6: ++; CHECK-NEXT: mov sp, x8 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: str x8, [x1] ++; CHECK-NEXT: mov sp, x29 ++; CHECK-NEXT: .cfi_def_cfa wsp, 32 ++; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload ++; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w19 ++; CHECK-NEXT: .cfi_restore w30 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++ %v = alloca i8, i64 %size, align 8192 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; For 64k guard pages, the only difference is the constant subtracted from SP ++; in the loop. ++define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536" { ++; CHECK-LABEL: dynamic_64k_guard: ++; CHECK: // %bb.0: ++; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 16 ++; CHECK-NEXT: .cfi_offset w30, -8 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: add x9, x0, #15 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 ++; CHECK-NEXT: sub x8, x8, x9 ++; CHECK-NEXT: .LBB4_1: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 ++; CHECK-NEXT: cmp sp, x8 ++; CHECK-NEXT: b.le .LBB4_3 ++; CHECK-NEXT: // %bb.2: // in Loop: Header=BB4_1 Depth=1 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: b .LBB4_1 ++; CHECK-NEXT: .LBB4_3: ++; CHECK-NEXT: mov sp, x8 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: str x8, [x1] ++; CHECK-NEXT: mov sp, x29 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w30 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++ %v = alloca i8, i64 %size, align 1 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++; If a function has variable-sized stack objects, then any function calls which ++; need to pass arguments on the stack must allocate the stack space for them ++; dynamically, to ensure they are at the bottom of the frame. 
We need to probe ++; that space when it is larger than the unprobed space allowed by the ABI (1024 ++; bytes), so this needs a very large number of arguments. ++define void @no_reserved_call_frame(i64 %n) #0 { ++; CHECK-LABEL: no_reserved_call_frame: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 16 ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 16 ++; CHECK-NEXT: .cfi_offset w30, -8 ++; CHECK-NEXT: .cfi_offset w29, -16 ++; CHECK-NEXT: lsl x9, x0, #2 ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: add x9, x9, #15 ++; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 ++; CHECK-NEXT: sub x0, x8, x9 ++; CHECK-NEXT: .LBB5_1: // %entry ++; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: cmp sp, x0 ++; CHECK-NEXT: b.le .LBB5_3 ++; CHECK-NEXT: // %bb.2: // %entry ++; CHECK-NEXT: // in Loop: Header=BB5_1 Depth=1 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: b .LBB5_1 ++; CHECK-NEXT: .LBB5_3: // %entry ++; CHECK-NEXT: mov sp, x0 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #1104 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: bl callee_stack_args ++; CHECK-NEXT: add sp, sp, #1104 ++; CHECK-NEXT: mov sp, x29 ++; CHECK-NEXT: .cfi_def_cfa wsp, 16 ++; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w30 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i32, i64 %n ++ call void @callee_stack_args(ptr %v, [138 x i64] undef) ++ ret void ++} ++ ++; Same as above but without a variable-sized allocation, so the reserved call ++; frame can be folded into the fixed-size allocation in the prologue. ++define void @reserved_call_frame(i64 %n) #0 { ++; CHECK-LABEL: reserved_call_frame: ++; CHECK: // %bb.0: // %entry ++; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 32 ++; CHECK-NEXT: str x28, [sp, #16] // 8-byte Folded Spill ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 32 ++; CHECK-NEXT: .cfi_offset w28, -16 ++; CHECK-NEXT: .cfi_offset w30, -24 ++; CHECK-NEXT: .cfi_offset w29, -32 ++; CHECK-NEXT: sub sp, sp, #1504 ++; CHECK-NEXT: add x0, sp, #1104 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: bl callee_stack_args ++; CHECK-NEXT: add sp, sp, #1504 ++; CHECK-NEXT: .cfi_def_cfa wsp, 32 ++; CHECK-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload ++; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w28 ++; CHECK-NEXT: .cfi_restore w30 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++entry: ++ %v = alloca i32, i64 100 ++ call void @callee_stack_args(ptr %v, [138 x i64] undef) ++ ret void ++} ++ ++declare void @callee_stack_args(ptr, [138 x i64]) ++ ++; Dynamic allocation of SVE vectors ++define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" { ++; CHECK-LABEL: dynamic_sve: ++; CHECK: // %bb.0: ++; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill ++; CHECK-NEXT: .cfi_def_cfa_offset 32 ++; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill ++; CHECK-NEXT: mov x29, sp ++; CHECK-NEXT: .cfi_def_cfa w29, 32 ++; CHECK-NEXT: .cfi_offset w19, -16 ++; CHECK-NEXT: .cfi_offset w30, -24 ++; CHECK-NEXT: .cfi_offset w29, -32 ++; CHECK-NEXT: rdvl x9, #1 ++; CHECK-NEXT: mov x10, #15 // =0xf ++; CHECK-NEXT: mov x8, sp ++; CHECK-NEXT: madd x9, x0, x9, x10 ++; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 ++; CHECK-NEXT: sub x8, x8, x9 ++; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 ++; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 ++; CHECK-NEXT: cmp sp, x8 ++; CHECK-NEXT: b.le .LBB7_3 ++; CHECK-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: b .LBB7_1 ++; CHECK-NEXT: .LBB7_3: ++; CHECK-NEXT: mov sp, x8 ++; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: str x8, [x1] ++; CHECK-NEXT: mov sp, x29 ++; CHECK-NEXT: .cfi_def_cfa wsp, 32 ++; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload ++; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ++; CHECK-NEXT: .cfi_def_cfa_offset 0 ++; CHECK-NEXT: .cfi_restore w19 ++; CHECK-NEXT: .cfi_restore w30 ++; CHECK-NEXT: .cfi_restore w29 ++; CHECK-NEXT: ret ++ %v = alloca , i64 %size, align 16 ++ store ptr %v, ptr %out, align 8 ++ ret void ++} ++ ++attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" } +\ No newline at end of file +-- +2.42.0.windows.2 + diff --git a/0025-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-GlobalISel.patch b/0025-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-GlobalISel.patch new file mode 100644 index 0000000..42d7350 --- /dev/null +++ b/0025-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-GlobalISel.patch @@ -0,0 +1,496 @@ +From dbca022577e0da1f411ee84143d59c6c9d941969 Mon Sep 17 00:00:00 2001 +From: rickyleung +Date: Fri, 26 Apr 2024 17:29:18 +0800 +Subject: [PATCH 6/7] [backport][AArch64] Stack probing for dynamic allocas in + GlobalISel + +Reference: https://github.com/llvm/llvm-project/commit/c1140d49ec3363bf903e4c1dbf7a3f5e8c1b6523 + +Co-authored-by: Oliver Stannard +--- + .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 2 + + .../CodeGen/GlobalISel/LegalizerHelper.cpp | 37 ++- + .../AArch64/GISel/AArch64LegalizerInfo.cpp | 47 +++- + .../AArch64/GISel/AArch64LegalizerInfo.h | 1 + + .../GlobalISel/legalize-dyn-alloca.mir | 255 ++++++++++++++---- + .../GlobalISel/legalizer-info-validation.mir | 7 + + .../CodeGen/AArch64/stack-probing-dynamic.ll | 3 +- + 7 files changed, 284 insertions(+), 68 deletions(-) + +diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +index 9288091874cf..7abbd1f03f16 100644 +--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h ++++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +@@ -400,6 +400,8 @@ public: + LegalizeResult lowerUnmergeValues(MachineInstr &MI); + LegalizeResult lowerExtractInsertVectorElt(MachineInstr &MI); + LegalizeResult lowerShuffleVector(MachineInstr &MI); ++ Register getDynStackAllocTargetPtr(Register SPReg, Register AllocSize, ++ Align Alignment, LLT PtrTy); + LegalizeResult lowerDynStackAlloc(MachineInstr &MI); + LegalizeResult lowerStackSave(MachineInstr &MI); + LegalizeResult lowerStackRestore(MachineInstr &MI); +diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +index 75d9789be4d0..5557456e706d 100644 +--- 
a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp ++++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +@@ -6777,21 +6777,12 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { + return Legalized; + } + +-LegalizerHelper::LegalizeResult +-LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) { +- const auto &MF = *MI.getMF(); +- const auto &TFI = *MF.getSubtarget().getFrameLowering(); +- if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp) +- return UnableToLegalize; +- +- Register Dst = MI.getOperand(0).getReg(); +- Register AllocSize = MI.getOperand(1).getReg(); +- Align Alignment = assumeAligned(MI.getOperand(2).getImm()); +- +- LLT PtrTy = MRI.getType(Dst); ++Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg, ++ Register AllocSize, ++ Align Alignment, ++ LLT PtrTy) { + LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); + +- Register SPReg = TLI.getStackPointerRegisterToSaveRestore(); + auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg); + SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp); + +@@ -6806,7 +6797,25 @@ LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) { + Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst); + } + +- SPTmp = MIRBuilder.buildCast(PtrTy, Alloc); ++ return MIRBuilder.buildCast(PtrTy, Alloc).getReg(0); ++} ++ ++LegalizerHelper::LegalizeResult ++LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) { ++ const auto &MF = *MI.getMF(); ++ const auto &TFI = *MF.getSubtarget().getFrameLowering(); ++ if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp) ++ return UnableToLegalize; ++ ++ Register Dst = MI.getOperand(0).getReg(); ++ Register AllocSize = MI.getOperand(1).getReg(); ++ Align Alignment = assumeAligned(MI.getOperand(2).getImm()); ++ ++ LLT PtrTy = MRI.getType(Dst); ++ Register SPReg = TLI.getStackPointerRegisterToSaveRestore(); ++ Register SPTmp = ++ getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy); ++ + MIRBuilder.buildCopy(SPReg, SPTmp); + MIRBuilder.buildCopy(Dst, SPTmp); + +diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +index f0130a0be29d..0dd2b4d48dd6 100644 +--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp ++++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +@@ -797,9 +797,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) + return Query.Types[0] == p0 && Query.Types[1] == s64; + }); + +- getActionDefinitionsBuilder({G_DYN_STACKALLOC, +- G_STACKSAVE, +- G_STACKRESTORE}).lower(); ++ getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom(); ++ ++ getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower(); + + if (ST.hasMOPS()) { + // G_BZERO is not supported. 
Currently it is only emitted by +@@ -993,6 +993,8 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, + return legalizeMemOps(MI, Helper); + case TargetOpcode::G_FCOPYSIGN: + return legalizeFCopySign(MI, Helper); ++ case TargetOpcode::G_DYN_STACKALLOC: ++ return legalizeDynStackAlloc(MI, Helper); + } + + llvm_unreachable("expected switch to return"); +@@ -1689,3 +1691,42 @@ bool AArch64LegalizerInfo::legalizeFCopySign(MachineInstr &MI, + MI.eraseFromParent(); + return true; + } ++ ++bool AArch64LegalizerInfo::legalizeDynStackAlloc( ++ MachineInstr &MI, LegalizerHelper &Helper) const { ++ MachineFunction &MF = *MI.getParent()->getParent(); ++ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; ++ MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); ++ ++ // If stack probing is not enabled for this function, use the default ++ // lowering. ++ if (!MF.getFunction().hasFnAttribute("probe-stack") || ++ MF.getFunction().getFnAttribute("probe-stack").getValueAsString() != ++ "inline-asm") { ++ Helper.lowerDynStackAlloc(MI); ++ return true; ++ } ++ ++ Register Dst = MI.getOperand(0).getReg(); ++ Register AllocSize = MI.getOperand(1).getReg(); ++ Align Alignment = assumeAligned(MI.getOperand(2).getImm()); ++ ++ assert(MRI.getType(Dst) == LLT::pointer(0, 64) && ++ "Unexpected type for dynamic alloca"); ++ assert(MRI.getType(AllocSize) == LLT::scalar(64) && ++ "Unexpected type for dynamic alloca"); ++ ++ LLT PtrTy = MRI.getType(Dst); ++ Register SPReg = ++ Helper.getTargetLowering().getStackPointerRegisterToSaveRestore(); ++ Register SPTmp = ++ Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy); ++ auto NewMI = ++ MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp}); ++ MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass); ++ MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI); ++ MIRBuilder.buildCopy(Dst, SPTmp); ++ ++ MI.eraseFromParent(); ++ return true; ++} +\ No newline at end of file +diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h +index c10f6e071ed4..94484ea59d15 100644 +--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h ++++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h +@@ -58,6 +58,7 @@ private: + bool legalizeCTTZ(MachineInstr &MI, LegalizerHelper &Helper) const; + bool legalizeMemOps(MachineInstr &MI, LegalizerHelper &Helper) const; + bool legalizeFCopySign(MachineInstr &MI, LegalizerHelper &Helper) const; ++ bool legalizeDynStackAlloc(MachineInstr &MI, LegalizerHelper &Helper) const; + const AArch64Subtarget *ST; + }; + } // End llvm namespace. +diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir +index e9188fb89f69..882c7468e70f 100644 +--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir ++++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir +@@ -19,6 +19,21 @@ + ret i128* %addr + } + ++ define i8* @test_simple_alloca_stack_probing(i32 %numelts) "probe-stack"="inline-asm" { ++ %addr = alloca i8, i32 %numelts ++ ret i8* %addr ++ } ++ ++ define i8* @test_aligned_alloca_stack_probing(i32 %numelts) "probe-stack"="inline-asm" { ++ %addr = alloca i8, i32 %numelts, align 32 ++ ret i8* %addr ++ } ++ ++ define i128* @test_natural_alloca_stack_probing(i32 %numelts) "probe-stack"="inline-asm" { ++ %addr = alloca i128, i32 %numelts ++ ret i128* %addr ++ } ++ + ... 
+ --- + name: test_simple_alloca +@@ -37,22 +52,23 @@ body: | + + ; CHECK-LABEL: name: test_simple_alloca + ; CHECK: liveins: $w0 +- ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 +- ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 +- ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) +- ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]] +- ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 +- ; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]] +- ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 +- ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] +- ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp +- ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) +- ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] +- ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64) +- ; CHECK: $sp = COPY [[INTTOPTR]](p0) +- ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) +- ; CHECK: $x0 = COPY [[COPY2]](p0) +- ; CHECK: RET_ReallyLR implicit $x0 ++ ; CHECK-NEXT: {{ $}} ++ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 ++ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ++ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) ++ ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]] ++ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 ++ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]] ++ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 ++ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] ++ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp ++ ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) ++ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] ++ ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64) ++ ; CHECK-NEXT: $sp = COPY [[INTTOPTR]](p0) ++ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) ++ ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0) ++ ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s32) = COPY $w0 + %3:_(s64) = G_CONSTANT i64 1 + %1:_(s64) = G_ZEXT %0(s32) +@@ -83,24 +99,25 @@ body: | + + ; CHECK-LABEL: name: test_aligned_alloca + ; CHECK: liveins: $w0 +- ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 +- ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 +- ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) +- ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]] +- ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 +- ; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]] +- ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 +- ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] +- ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp +- ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) +- ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] +- ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -32 +- ; CHECK: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]] +- ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[AND1]](s64) +- ; CHECK: $sp = COPY [[INTTOPTR]](p0) +- ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) +- ; CHECK: $x0 = COPY [[COPY2]](p0) +- ; CHECK: RET_ReallyLR implicit $x0 ++ ; CHECK-NEXT: {{ $}} ++ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 ++ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ++ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) ++ ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]] ++ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 ++ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]] ++ ; CHECK-NEXT: 
[[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 ++ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] ++ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp ++ ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) ++ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] ++ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -32 ++ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]] ++ ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[AND1]](s64) ++ ; CHECK-NEXT: $sp = COPY [[INTTOPTR]](p0) ++ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) ++ ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0) ++ ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s32) = COPY $w0 + %3:_(s64) = G_CONSTANT i64 1 + %1:_(s64) = G_ZEXT %0(s32) +@@ -131,22 +148,23 @@ body: | + + ; CHECK-LABEL: name: test_natural_alloca + ; CHECK: liveins: $w0 +- ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 +- ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 +- ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) +- ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]] +- ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 +- ; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]] +- ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 +- ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] +- ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp +- ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) +- ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] +- ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64) +- ; CHECK: $sp = COPY [[INTTOPTR]](p0) +- ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) +- ; CHECK: $x0 = COPY [[COPY2]](p0) +- ; CHECK: RET_ReallyLR implicit $x0 ++ ; CHECK-NEXT: {{ $}} ++ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 ++ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ++ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) ++ ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]] ++ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 ++ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]] ++ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 ++ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] ++ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp ++ ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) ++ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] ++ ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64) ++ ; CHECK-NEXT: $sp = COPY [[INTTOPTR]](p0) ++ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) ++ ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0) ++ ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s32) = COPY $w0 + %3:_(s64) = G_CONSTANT i64 16 + %1:_(s64) = G_ZEXT %0(s32) +@@ -160,3 +178,140 @@ body: | + RET_ReallyLR implicit $x0 + + ... 
++--- ++name: test_simple_alloca_stack_probing ++alignment: 4 ++tracksRegLiveness: true ++liveins: ++ - { reg: '$w0' } ++frameInfo: ++ maxAlignment: 1 ++stack: ++ - { id: 0, name: addr, type: variable-sized, alignment: 1 } ++machineFunctionInfo: {} ++body: | ++ bb.1 (%ir-block.0): ++ liveins: $w0 ++ ; CHECK-LABEL: name: test_simple_alloca_stack_probing ++ ; CHECK: liveins: $w0 ++ ; CHECK-NEXT: {{ $}} ++ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 ++ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) ++ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ++ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s64) ++ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 ++ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[SHL]], [[C1]] ++ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 ++ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] ++ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp ++ ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) ++ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] ++ ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:gpr64common(p0) = G_INTTOPTR [[SUB]](s64) ++ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) ++ ; CHECK-NEXT: PROBED_STACKALLOC_DYN [[INTTOPTR]](p0), implicit-def $sp, implicit-def $nzcv, implicit $sp ++ ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0) ++ ; CHECK-NEXT: RET_ReallyLR implicit $x0 ++ %0:_(s32) = COPY $w0 ++ %1:_(s64) = G_ZEXT %0(s32) ++ %9:_(s64) = G_CONSTANT i64 0 ++ %2:_(s64) = G_SHL %1, %9(s64) ++ %4:_(s64) = G_CONSTANT i64 15 ++ %5:_(s64) = nuw G_ADD %2, %4 ++ %6:_(s64) = G_CONSTANT i64 -16 ++ %7:_(s64) = G_AND %5, %6 ++ %8:_(p0) = G_DYN_STACKALLOC %7(s64), 1 ++ $x0 = COPY %8(p0) ++ RET_ReallyLR implicit $x0 ++... ++--- ++name: test_aligned_alloca_stack_probing ++alignment: 4 ++tracksRegLiveness: true ++liveins: ++ - { reg: '$w0' } ++frameInfo: ++ maxAlignment: 32 ++stack: ++ - { id: 0, name: addr, type: variable-sized, alignment: 32 } ++machineFunctionInfo: {} ++body: | ++ bb.1 (%ir-block.0): ++ liveins: $w0 ++ ; CHECK-LABEL: name: test_aligned_alloca_stack_probing ++ ; CHECK: liveins: $w0 ++ ; CHECK-NEXT: {{ $}} ++ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 ++ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) ++ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ++ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s64) ++ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 ++ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[SHL]], [[C1]] ++ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 ++ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] ++ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp ++ ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) ++ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] ++ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -32 ++ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]] ++ ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:gpr64common(p0) = G_INTTOPTR [[AND1]](s64) ++ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) ++ ; CHECK-NEXT: PROBED_STACKALLOC_DYN [[INTTOPTR]](p0), implicit-def $sp, implicit-def $nzcv, implicit $sp ++ ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0) ++ ; CHECK-NEXT: RET_ReallyLR implicit $x0 ++ %0:_(s32) = COPY $w0 ++ %1:_(s64) = G_ZEXT %0(s32) ++ %9:_(s64) = G_CONSTANT i64 0 ++ %2:_(s64) = G_SHL %1, %9(s64) ++ %4:_(s64) = G_CONSTANT i64 15 ++ %5:_(s64) = nuw G_ADD %2, %4 ++ %6:_(s64) = G_CONSTANT i64 -16 ++ 
%7:_(s64) = G_AND %5, %6 ++ %8:_(p0) = G_DYN_STACKALLOC %7(s64), 32 ++ $x0 = COPY %8(p0) ++ RET_ReallyLR implicit $x0 ++... ++--- ++name: test_natural_alloca_stack_probing ++alignment: 4 ++tracksRegLiveness: true ++liveins: ++ - { reg: '$w0' } ++frameInfo: ++ maxAlignment: 1 ++stack: ++ - { id: 0, name: addr, type: variable-sized, alignment: 1 } ++machineFunctionInfo: {} ++body: | ++ bb.1 (%ir-block.0): ++ liveins: $w0 ++ ; CHECK-LABEL: name: test_natural_alloca_stack_probing ++ ; CHECK: liveins: $w0 ++ ; CHECK-NEXT: {{ $}} ++ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 ++ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) ++ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ++ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s64) ++ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 ++ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[SHL]], [[C1]] ++ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 ++ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] ++ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp ++ ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) ++ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] ++ ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:gpr64common(p0) = G_INTTOPTR [[SUB]](s64) ++ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) ++ ; CHECK-NEXT: PROBED_STACKALLOC_DYN [[INTTOPTR]](p0), implicit-def $sp, implicit-def $nzcv, implicit $sp ++ ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0) ++ ; CHECK-NEXT: RET_ReallyLR implicit $x0 ++ %0:_(s32) = COPY $w0 ++ %1:_(s64) = G_ZEXT %0(s32) ++ %9:_(s64) = G_CONSTANT i64 4 ++ %2:_(s64) = G_SHL %1, %9(s64) ++ %4:_(s64) = G_CONSTANT i64 15 ++ %5:_(s64) = nuw G_ADD %2, %4 ++ %6:_(s64) = G_CONSTANT i64 -16 ++ %7:_(s64) = G_AND %5, %6 ++ %8:_(p0) = G_DYN_STACKALLOC %7(s64), 1 ++ $x0 = COPY %8(p0) ++ RET_ReallyLR implicit $x0 ++... +\ No newline at end of file +diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +index 461161f5b338..efae9b66b53d 100644 +--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir ++++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +@@ -652,6 +652,13 @@ + # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[DYN_STACKALLOC]] + # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected + # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected ++# DEBUG-NEXT: G_STACKSAVE (opcode [[STACKSAVE:[0-9]+]]): 1 type index, 0 imm indices ++# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected ++# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected ++# DEBUG-NEXT: G_STACKRESTORE (opcode {{[0-9]+}}): 1 type index, 0 imm indices ++# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[STACKSAVE]] ++# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected ++# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected + # DEBUG-NEXT: G_STRICT_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices + # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined + # DEBUG-NEXT: .. 
imm index coverage check SKIPPED: no rules defined +diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll +index 4d9ef77f7a0d..ad9cdbe92b23 100644 +--- a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll ++++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll +@@ -1,5 +1,6 @@ + ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +-; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s ++; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s ++; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s + + ; Dynamically-sized allocation, needs a loop which can handle any size at + ; runtime. The final iteration of the loop will temporarily put SP below the +-- +2.42.0.windows.2 + diff --git a/0026-Update-testcase-for-stack-clash-protection-backport.patch b/0026-Update-testcase-for-stack-clash-protection-backport.patch new file mode 100644 index 0000000..4b36a24 --- /dev/null +++ b/0026-Update-testcase-for-stack-clash-protection-backport.patch @@ -0,0 +1,177 @@ +From 9425ee5f8608ff8611628d83386f61950d7fff85 Mon Sep 17 00:00:00 2001 +From: rickyleung +Date: Tue, 7 May 2024 21:37:03 +0800 +Subject: [PATCH 7/7] Update testcase for stack clash protection backport + +--- + .../GlobalISel/legalize-dyn-alloca.mir | 3 +- + .../GlobalISel/stacksave-stackrestore.ll | 14 ++++++---- + .../CodeGen/AArch64/stack-probing-dynamic.ll | 16 ++++++----- + .../AArch64/stack-probing-last-in-block.mir | 4 +-- + .../X86/GlobalISel/stacksave-stackrestore.ll | 28 +++++++++++-------- + 5 files changed, 36 insertions(+), 29 deletions(-) + +diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir +index 882c7468e70f..82781cebc55a 100644 +--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir ++++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir +@@ -313,5 +313,4 @@ body: | + %7:_(s64) = G_AND %5, %6 + %8:_(p0) = G_DYN_STACKALLOC %7(s64), 1 + $x0 = COPY %8(p0) +- RET_ReallyLR implicit $x0 +-... 
+\ No newline at end of file ++ RET_ReallyLR implicit $x0 +\ No newline at end of file +diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll +index 16bf85af9c17..97ecca0bd77b 100644 +--- a/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll ++++ b/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll +@@ -15,14 +15,18 @@ define void @test_scoped_alloca(i64 %n) { + ; CHECK-NEXT: .cfi_offset w19, -16 + ; CHECK-NEXT: .cfi_offset w30, -24 + ; CHECK-NEXT: .cfi_offset w29, -32 +-; CHECK-NEXT: add x9, x0, #15 ++; CHECK-NEXT: mov x19, x0 ++; CHECK-NEXT: bl llvm.stacksave.p0 ++; CHECK-NEXT: add x9, x19, #15 + ; CHECK-NEXT: mov x8, sp + ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +-; CHECK-NEXT: mov x19, sp +-; CHECK-NEXT: sub x0, x8, x9 +-; CHECK-NEXT: mov sp, x0 ++; CHECK-NEXT: mov x19, x0 ++; CHECK-NEXT: sub x8, x8, x9 ++; CHECK-NEXT: mov sp, x8 ++; CHECK-NEXT: mov x0, x8 + ; CHECK-NEXT: bl use_addr +-; CHECK-NEXT: mov sp, x19 ++; CHECK-NEXT: mov x0, x19 ++; CHECK-NEXT: bl llvm.stackrestore.p0 + ; CHECK-NEXT: mov sp, x29 + ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload + ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll +index ad9cdbe92b23..3cbcf7749b2a 100644 +--- a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll ++++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll +@@ -59,10 +59,10 @@ define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 { + ; CHECK-NEXT: str xzr, [sp, #-64]! + ; CHECK-NEXT: add x9, x0, #15 + ; CHECK-NEXT: mov x8, sp +-; CHECK-NEXT: sub x10, x29, #64 + ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +-; CHECK-NEXT: str x10, [x1] ++; CHECK-NEXT: sub x10, x29, #64 + ; CHECK-NEXT: sub x8, x8, x9 ++; CHECK-NEXT: str x10, [x1] + ; CHECK-NEXT: .LBB1_1: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 + ; CHECK-NEXT: cmp sp, x8 +@@ -108,10 +108,10 @@ define void @dynamic_align_64(i64 %size, ptr %out) #0 { + ; CHECK-NEXT: and sp, x9, #0xffffffffffffffc0 + ; CHECK-NEXT: add x9, x0, #15 + ; CHECK-NEXT: mov x8, sp +-; CHECK-NEXT: str xzr, [sp] + ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +-; CHECK-NEXT: mov x19, sp ++; CHECK-NEXT: str xzr, [sp] + ; CHECK-NEXT: sub x8, x8, x9 ++; CHECK-NEXT: mov x19, sp + ; CHECK-NEXT: and x8, x8, #0xffffffffffffffc0 + ; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +@@ -167,10 +167,10 @@ define void @dynamic_align_8192(i64 %size, ptr %out) #0 { + ; CHECK-NEXT: mov sp, x9 + ; CHECK-NEXT: add x9, x0, #15 + ; CHECK-NEXT: mov x8, sp +-; CHECK-NEXT: str xzr, [sp] + ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +-; CHECK-NEXT: mov x19, sp ++; CHECK-NEXT: str xzr, [sp] + ; CHECK-NEXT: sub x8, x8, x9 ++; CHECK-NEXT: mov x19, sp + ; CHECK-NEXT: and x8, x8, #0xffffffffffffe000 + ; CHECK-NEXT: .LBB3_4: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +@@ -268,8 +268,10 @@ define void @no_reserved_call_frame(i64 %n) #0 { + ; CHECK-NEXT: str xzr, [sp] + ; CHECK-NEXT: sub sp, sp, #1104 + ; CHECK-NEXT: str xzr, [sp] ++; CHECK-NEXT: sub sp, sp, #1104 + ; CHECK-NEXT: bl callee_stack_args + ; CHECK-NEXT: add sp, sp, #1104 ++; CHECK-NEXT: add sp, sp, #1104 + ; CHECK-NEXT: mov sp, x29 + ; CHECK-NEXT: .cfi_def_cfa wsp, 16 + ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded 
Reload +@@ -331,8 +333,8 @@ define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" { + ; CHECK-NEXT: .cfi_offset w29, -32 + ; CHECK-NEXT: rdvl x9, #1 + ; CHECK-NEXT: mov x10, #15 // =0xf +-; CHECK-NEXT: mov x8, sp + ; CHECK-NEXT: madd x9, x0, x9, x10 ++; CHECK-NEXT: mov x8, sp + ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 + ; CHECK-NEXT: sub x8, x8, x9 + ; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 +diff --git a/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir +index a8a21ab330ba..9a173be5857e 100644 +--- a/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir ++++ b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir +@@ -141,6 +141,4 @@ body: | + B %bb.2 + + bb.2.exit: +- RET_ReallyLR +- +-... +\ No newline at end of file ++ RET_ReallyLR +\ No newline at end of file +diff --git a/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll +index e86c04ee22db..8f665924577f 100644 +--- a/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll ++++ b/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll +@@ -13,21 +13,25 @@ define void @test_scoped_alloca(i64 %n) { + ; CHECK-NEXT: .cfi_offset %rbp, -16 + ; CHECK-NEXT: movq %rsp, %rbp + ; CHECK-NEXT: .cfi_def_cfa_register %rbp ++; CHECK-NEXT: pushq %r14 + ; CHECK-NEXT: pushq %rbx +-; CHECK-NEXT: pushq %rax +-; CHECK-NEXT: .cfi_offset %rbx, -24 +-; CHECK-NEXT: movq %rsp, %rbx +-; CHECK-NEXT: movq %rsp, %rax +-; CHECK-NEXT: imulq $1, %rdi, %rcx +-; CHECK-NEXT: addq $15, %rcx +-; CHECK-NEXT: andq $-16, %rcx +-; CHECK-NEXT: subq %rcx, %rax +-; CHECK-NEXT: movq %rax, %rsp +-; CHECK-NEXT: movq %rax, %rdi ++; CHECK-NEXT: .cfi_offset %rbx, -32 ++; CHECK-NEXT: .cfi_offset %r14, -24 ++; CHECK-NEXT: movq %rdi, %rbx ++; CHECK-NEXT: callq llvm.stacksave.p0 ++; CHECK-NEXT: movq %rax, %r14 ++; CHECK-NEXT: movq %rsp, %rdi ++; CHECK-NEXT: imulq $1, %rbx, %rax ++; CHECK-NEXT: addq $15, %rax ++; CHECK-NEXT: andq $-16, %rax ++; CHECK-NEXT: subq %rax, %rdi ++; CHECK-NEXT: movq %rdi, %rsp + ; CHECK-NEXT: callq use_addr +-; CHECK-NEXT: movq %rbx, %rsp +-; CHECK-NEXT: leaq -8(%rbp), %rsp ++; CHECK-NEXT: movq %r14, %rdi ++; CHECK-NEXT: callq llvm.stackrestore.p0 ++; CHECK-NEXT: leaq -16(%rbp), %rsp + ; CHECK-NEXT: popq %rbx ++; CHECK-NEXT: popq %r14 + ; CHECK-NEXT: popq %rbp + ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 + ; CHECK-NEXT: retq +-- +2.42.0.windows.2 + diff --git a/llvm.spec b/llvm.spec index 73dfde2..6a215a4 100644 --- a/llvm.spec +++ b/llvm.spec @@ -38,7 +38,7 @@ Name: %{pkg_name} Version: %{maj_ver}.%{min_ver}.%{patch_ver} -Release: 10 +Release: 11 Summary: The Low Level Virtual Machine License: NCSA @@ -70,6 +70,13 @@ Patch18: 0018-Fix-declaration-definition-mismatch-for-classic-flang.patch Patch19: 0019-Backport-LoongArch-Improve-the-support-for-atomic-and-clear_cache.patch Patch20: 0020-Update-llvm-lit-config-to-support-build_for_openeule.patch +Patch21: 0021-Backport-GlobalISel-Don-t-expand-stacksave-stackrestore-in-IRTranslator.patch +Patch22: 0022-Backport-AArch64-Refactor-allocation-of-locals-and-stack-realignment.patch +Patch23: 0023-Backport-AArch64-Stack-probing-for-function-prologues.patch +Patch24: 0024-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-SelectionDAG.patch +Patch25: 0025-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-GlobalISel.patch +Patch26: 0026-Update-testcase-for-stack-clash-protection-backport.patch + BuildRequires: binutils-devel 
BuildRequires: cmake BuildRequires: gcc @@ -353,6 +360,9 @@ LD_LIBRARY_PATH=%{buildroot}/%{install_libdir} %{__ninja} check-all -C ./_build %{install_includedir}/llvm-gmock %changelog +* Fri May 10 2024 rickyleung - 17.0.6-11 +- Backport the patches to support stack clash protection + * Mon Apr 29 2024 wangqiang - 17.0.6-10 - Update llvm-lit config to support macro `build_for_openeuler` -- Gitee