svn commit: r309427 - in vendor/llvm/dist: include/llvm/Support lib/CodeGen/SelectionDAG lib/Target/AMDGPU lib/Transforms/InstCombine lib/Transforms/Utils test/CodeGen/AMDGPU test/CodeGen/X86 test/...

Fri Dec 2 19:20:12 UTC 2016

Author: dim
Date: Fri Dec  2 19:20:10 2016
New Revision: 309427
URL: https://svnweb.freebsd.org/changeset/base/309427

Log:
  Vendor import of llvm release_39 branch r288513:
  https://llvm.org/svn/llvm-project/llvm/branches/release_39@288513

Added:
  vendor/llvm/dist/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll
  vendor/llvm/dist/test/CodeGen/X86/mul-i1024.ll
  vendor/llvm/dist/test/CodeGen/X86/mul-i512.ll
  vendor/llvm/dist/test/Transforms/SimplifyCFG/PR29163.ll
Modified:
  vendor/llvm/dist/include/llvm/Support/Threading.h
  vendor/llvm/dist/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
  vendor/llvm/dist/lib/Target/AMDGPU/SIInstrInfo.cpp
  vendor/llvm/dist/lib/Target/AMDGPU/SIInstructions.td
  vendor/llvm/dist/lib/Target/AMDGPU/SIWholeQuadMode.cpp
  vendor/llvm/dist/lib/Transforms/InstCombine/InstCombineCompares.cpp
  vendor/llvm/dist/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
  vendor/llvm/dist/lib/Transforms/Utils/SimplifyCFG.cpp
  vendor/llvm/dist/test/CodeGen/AMDGPU/wqm.ll
  vendor/llvm/dist/test/CodeGen/X86/mul-i256.ll
  vendor/llvm/dist/test/LTO/X86/type-mapping-bug.ll
  vendor/llvm/dist/test/Transforms/InstCombine/indexed-gep-compares.ll
  vendor/llvm/dist/test/Transforms/InstCombine/unpack-fca.ll

Modified: vendor/llvm/dist/include/llvm/Support/Threading.h
==============================================================================

--- vendor/llvm/dist/include/llvm/Support/Threading.h	Fri Dec  2 19:02:12 2016	(r309426)
+++ vendor/llvm/dist/include/llvm/Support/Threading.h	Fri Dec  2 19:20:10 2016	(r309427)
@@ -20,11 +20,11 @@
 #include <ciso646> // So we can check the C++ standard lib macros.
 #include <functional>
 
-// We use std::call_once on all Unix platforms except for NetBSD with
-// libstdc++. That platform has a bug they are working to fix, and they'll
-// remove the NetBSD checks once fixed.
-#if defined(LLVM_ON_UNIX) &&                                                   \
-    !(defined(__NetBSD__) && !defined(_LIBCPP_VERSION)) && !defined(__ppc__)
+// std::call_once from libc++ is used on all Unix platforms. Other
+// implementations like libstdc++ are known to have problems on NetBSD,
+// OpenBSD and PowerPC.
+#if defined(LLVM_ON_UNIX) && (defined(_LIBCPP_VERSION) ||                      \
+    !(defined(__NetBSD__) || defined(__OpenBSD__) || defined(__ppc__)))
 #define LLVM_THREADING_USE_STD_CALL_ONCE 1
 #else
 #define LLVM_THREADING_USE_STD_CALL_ONCE 0

Modified: vendor/llvm/dist/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
==============================================================================
--- vendor/llvm/dist/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp	Fri Dec  2 19:02:12 2016	(r309426)
+++ vendor/llvm/dist/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp	Fri Dec  2 19:20:10 2016	(r309427)
@@ -2185,24 +2185,29 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(
     // options. This is a trivially-generalized version of the code from
     // Hacker's Delight (itself derived from Knuth's Algorithm M from section
     // 4.3.1).
-    SDValue Mask =
-      DAG.getConstant(APInt::getLowBitsSet(NVT.getSizeInBits(),
-                                           NVT.getSizeInBits() >> 1), dl, NVT);
+    unsigned Bits = NVT.getSizeInBits();
+    unsigned HalfBits = Bits >> 1;
+    SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl,
+                                   NVT);
     SDValue LLL = DAG.getNode(ISD::AND, dl, NVT, LL, Mask);
     SDValue RLL = DAG.getNode(ISD::AND, dl, NVT, RL, Mask);
 
     SDValue T = DAG.getNode(ISD::MUL, dl, NVT, LLL, RLL);
     SDValue TL = DAG.getNode(ISD::AND, dl, NVT, T, Mask);
 
-    SDValue Shift =
-      DAG.getConstant(NVT.getSizeInBits() >> 1, dl,
-                      TLI.getShiftAmountTy(NVT, DAG.getDataLayout()));
+    EVT ShiftAmtTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
+    if (APInt::getMaxValue(ShiftAmtTy.getSizeInBits()).ult(HalfBits)) {
+      // The type from TLI is too small to fit the shift amount we want.
+      // Override it with i32. The shift will have to be legalized.
+      ShiftAmtTy = MVT::i32;
+    }
+    SDValue Shift = DAG.getConstant(HalfBits, dl, ShiftAmtTy);
     SDValue TH = DAG.getNode(ISD::SRL, dl, NVT, T, Shift);
     SDValue LLH = DAG.getNode(ISD::SRL, dl, NVT, LL, Shift);
     SDValue RLH = DAG.getNode(ISD::SRL, dl, NVT, RL, Shift);
 
     SDValue U = DAG.getNode(ISD::ADD, dl, NVT,
-                            DAG.getNode(ISD::MUL, dl, NVT, LLH, RLL), TL);
+                            DAG.getNode(ISD::MUL, dl, NVT, LLH, RLL), TH);
     SDValue UL = DAG.getNode(ISD::AND, dl, NVT, U, Mask);
     SDValue UH = DAG.getNode(ISD::SRL, dl, NVT, U, Shift);
 
@@ -2211,14 +2216,14 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(
     SDValue VH = DAG.getNode(ISD::SRL, dl, NVT, V, Shift);
 
     SDValue W = DAG.getNode(ISD::ADD, dl, NVT,
-                            DAG.getNode(ISD::MUL, dl, NVT, LL, RL),
+                            DAG.getNode(ISD::MUL, dl, NVT, LLH, RLH),
                             DAG.getNode(ISD::ADD, dl, NVT, UH, VH));
-    Lo = DAG.getNode(ISD::ADD, dl, NVT, TH,
+    Lo = DAG.getNode(ISD::ADD, dl, NVT, TL,
                      DAG.getNode(ISD::SHL, dl, NVT, V, Shift));
 
     Hi = DAG.getNode(ISD::ADD, dl, NVT, W,
                      DAG.getNode(ISD::ADD, dl, NVT,
-                                 DAG.getNode(ISD::MUL, dl, NVT, RH, LL), 
+                                 DAG.getNode(ISD::MUL, dl, NVT, RH, LL),
                                  DAG.getNode(ISD::MUL, dl, NVT, RL, LH)));
     return;
   }

Modified: vendor/llvm/dist/lib/Target/AMDGPU/SIInstrInfo.cpp
==============================================================================
--- vendor/llvm/dist/lib/Target/AMDGPU/SIInstrInfo.cpp	Fri Dec  2 19:02:12 2016	(r309426)
+++ vendor/llvm/dist/lib/Target/AMDGPU/SIInstrInfo.cpp	Fri Dec  2 19:20:10 2016	(r309427)
@@ -2203,7 +2203,8 @@ void SIInstrInfo::legalizeOperandsSMRD(M
 }
 
 void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
-  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  MachineFunction &MF = *MI.getParent()->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
 
   // Legalize VOP2
   if (isVOP2(MI) || isVOPC(MI)) {
@@ -2321,8 +2322,14 @@ void SIInstrInfo::legalizeOperands(Machi
     return;
   }
 
-  // Legalize MIMG
-  if (isMIMG(MI)) {
+  // Legalize MIMG and MUBUF/MTBUF for shaders.
+  //
+  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
+  // scratch memory access. In both cases, the legalization never involves
+  // conversion to the addr64 form.
+  if (isMIMG(MI) ||
+      (AMDGPU::isShader(MF.getFunction()->getCallingConv()) &&
+       (isMUBUF(MI) || isMTBUF(MI)))) {
     MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
     if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
       unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
@@ -2337,9 +2344,10 @@ void SIInstrInfo::legalizeOperands(Machi
     return;
   }
 
-  // Legalize MUBUF* instructions
+  // Legalize MUBUF* instructions by converting to addr64 form.
   // FIXME: If we start using the non-addr64 instructions for compute, we
-  // may need to legalize them here.
+  // may need to legalize them as above. This especially applies to the
+  // buffer_load_format_* variants and variants with idxen (or bothen).
   int SRsrcIdx =
       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
   if (SRsrcIdx != -1) {

Modified: vendor/llvm/dist/lib/Target/AMDGPU/SIInstructions.td
==============================================================================
--- vendor/llvm/dist/lib/Target/AMDGPU/SIInstructions.td	Fri Dec  2 19:02:12 2016	(r309426)
+++ vendor/llvm/dist/lib/Target/AMDGPU/SIInstructions.td	Fri Dec  2 19:20:10 2016	(r309427)
@@ -2029,6 +2029,7 @@ def SI_RETURN : PseudoInstSI <
   let hasSideEffects = 1;
   let SALU = 1;
   let hasNoSchedulingInfo = 1;
+  let DisableWQM = 1;
 }
 
 let Uses = [EXEC], Defs = [EXEC, VCC, M0],

Modified: vendor/llvm/dist/lib/Target/AMDGPU/SIWholeQuadMode.cpp
==============================================================================
--- vendor/llvm/dist/lib/Target/AMDGPU/SIWholeQuadMode.cpp	Fri Dec  2 19:02:12 2016	(r309426)
+++ vendor/llvm/dist/lib/Target/AMDGPU/SIWholeQuadMode.cpp	Fri Dec  2 19:20:10 2016	(r309427)
@@ -219,13 +219,6 @@ char SIWholeQuadMode::scanInstructions(M
       markInstruction(MI, Flags, Worklist);
       GlobalFlags |= Flags;
     }
-
-    if (WQMOutputs && MBB.succ_empty()) {
-      // This is a prolog shader. Make sure we go back to exact mode at the end.
-      Blocks[&MBB].OutNeeds = StateExact;
-      Worklist.push_back(&MBB);
-      GlobalFlags |= StateExact;
-    }
   }
 
   return GlobalFlags;

Modified: vendor/llvm/dist/lib/Transforms/InstCombine/InstCombineCompares.cpp
==============================================================================
--- vendor/llvm/dist/lib/Transforms/InstCombine/InstCombineCompares.cpp	Fri Dec  2 19:02:12 2016	(r309426)
+++ vendor/llvm/dist/lib/Transforms/InstCombine/InstCombineCompares.cpp	Fri Dec  2 19:20:10 2016	(r309427)
@@ -634,7 +634,7 @@ static bool canRewriteGEPAsOffset(Value 
       }
 
       if (!isa<IntToPtrInst>(V) && !isa<PtrToIntInst>(V) &&
-          !isa<GEPOperator>(V) && !isa<PHINode>(V))
+          !isa<GetElementPtrInst>(V) && !isa<PHINode>(V))
         // We've found some value that we can't explore which is different from
         // the base. Therefore we can't do this transformation.
         return false;

Modified: vendor/llvm/dist/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
==============================================================================
--- vendor/llvm/dist/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp	Fri Dec  2 19:02:12 2016	(r309426)
+++ vendor/llvm/dist/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp	Fri Dec  2 19:20:10 2016	(r309427)
@@ -579,6 +579,13 @@ static Instruction *unpackLoadToAggregat
         UndefValue::get(T), NewLoad, 0, Name));
     }
 
+    // Bail out if the array is too large. Ideally we would like to optimize
+    // arrays of arbitrary size but this has a terrible impact on compile time.
+    // The threshold here is chosen arbitrarily, maybe needs a little bit of
+    // tuning.
+    if (NumElements > 1024)
+      return nullptr;
+
     const DataLayout &DL = IC.getDataLayout();
     auto EltSize = DL.getTypeAllocSize(ET);
     auto Align = LI.getAlignment();
@@ -1081,6 +1088,13 @@ static bool unpackStoreToAggregate(InstC
       return true;
     }
 
+    // Bail out if the array is too large. Ideally we would like to optimize
+    // arrays of arbitrary size but this has a terrible impact on compile time.
+    // The threshold here is chosen arbitrarily, maybe needs a little bit of
+    // tuning.
+    if (NumElements > 1024)
+      return false;
+
     const DataLayout &DL = IC.getDataLayout();
     auto EltSize = DL.getTypeAllocSize(AT->getElementType());
     auto Align = SI.getAlignment();

Modified: vendor/llvm/dist/lib/Transforms/Utils/SimplifyCFG.cpp
==============================================================================
--- vendor/llvm/dist/lib/Transforms/Utils/SimplifyCFG.cpp	Fri Dec  2 19:02:12 2016	(r309426)
+++ vendor/llvm/dist/lib/Transforms/Utils/SimplifyCFG.cpp	Fri Dec  2 19:20:10 2016	(r309427)
@@ -2024,14 +2024,20 @@ static bool FoldTwoEntryPHINode(PHINode 
 
   // Move all 'aggressive' instructions, which are defined in the
   // conditional parts of the if's up to the dominating block.
-  if (IfBlock1)
+  if (IfBlock1) {
+    for (auto &I : *IfBlock1)
+      I.dropUnknownNonDebugMetadata();
     DomBlock->getInstList().splice(InsertPt->getIterator(),
                                    IfBlock1->getInstList(), IfBlock1->begin(),
                                    IfBlock1->getTerminator()->getIterator());
-  if (IfBlock2)
+  }
+  if (IfBlock2) {
+    for (auto &I : *IfBlock2)
+      I.dropUnknownNonDebugMetadata();
     DomBlock->getInstList().splice(InsertPt->getIterator(),
                                    IfBlock2->getInstList(), IfBlock2->begin(),
                                    IfBlock2->getTerminator()->getIterator());
+  }
 
   while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
     // Change the PHI node into a select instruction.

Added: vendor/llvm/dist/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ vendor/llvm/dist/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll	Fri Dec  2 19:20:10 2016	(r309427)
@@ -0,0 +1,49 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+
+; Test that buffer_load_format with VGPR resource descriptor is properly
+; legalized.
+
+; CHECK-LABEL: {{^}}test_none:
+; CHECK: buffer_load_format_x v0, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define amdgpu_vs float @test_none(<4 x i32> addrspace(2)* inreg %base, i32 %i) {
+main_body:
+  %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i
+  %tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32
+  %tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 0, i32 0, i1 0, i1 0)
+  ret float %tmp7
+}
+
+; CHECK-LABEL: {{^}}test_idxen:
+; CHECK: buffer_load_format_x v0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen{{$}}
+define amdgpu_vs float @test_idxen(<4 x i32> addrspace(2)* inreg %base, i32 %i) {
+main_body:
+  %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i
+  %tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32
+  %tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 0, i1 0, i1 0)
+  ret float %tmp7
+}
+
+; CHECK-LABEL: {{^}}test_offen:
+; CHECK: buffer_load_format_x v0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
+define amdgpu_vs float @test_offen(<4 x i32> addrspace(2)* inreg %base, i32 %i) {
+main_body:
+  %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i
+  %tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32
+  %tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 0, i32 undef, i1 0, i1 0)
+  ret float %tmp7
+}
+
+; CHECK-LABEL: {{^}}test_both:
+; CHECK: buffer_load_format_x v0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen{{$}}
+define amdgpu_vs float @test_both(<4 x i32> addrspace(2)* inreg %base, i32 %i) {
+main_body:
+  %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i
+  %tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32
+  %tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 undef, i1 0, i1 0)
+  ret float %tmp7
+}
+
+declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) nounwind readonly
+
+attributes #0 = { nounwind readnone }

Modified: vendor/llvm/dist/test/CodeGen/AMDGPU/wqm.ll
==============================================================================
--- vendor/llvm/dist/test/CodeGen/AMDGPU/wqm.ll	Fri Dec  2 19:02:12 2016	(r309426)
+++ vendor/llvm/dist/test/CodeGen/AMDGPU/wqm.ll	Fri Dec  2 19:20:10 2016	(r309427)
@@ -17,17 +17,18 @@ main_body:
 ;CHECK-LABEL: {{^}}test2:
 ;CHECK-NEXT: ; %main_body
 ;CHECK-NEXT: s_wqm_b64 exec, exec
-;CHECK: image_sample
 ;CHECK-NOT: exec
-;CHECK: _load_dword v0,
-define amdgpu_ps float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
+define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
 main_body:
   %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   %c.2 = bitcast <4 x float> %c.1 to <4 x i32>
   %c.3 = extractelement <4 x i32> %c.2, i32 0
   %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
   %data = load float, float addrspace(1)* %gep
-  ret float %data
+
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %data, float undef, float undef, float undef)
+
+  ret void
 }
 
 ; ... but disabled for stores (and, in this simple case, not re-enabled).
@@ -414,6 +415,46 @@ entry:
   ret void
 }
 
+; Must return to exact at the end of a non-void returning shader,
+; otherwise the EXEC mask exported by the epilog will be wrong. This is true
+; even if the shader has no kills, because a kill could have happened in a
+; previous shader fragment.
+;
+; CHECK-LABEL: {{^}}test_nonvoid_return:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+;
+; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK-NOT: exec
+define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
+  %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tex.i = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %dtex
+}
+
+; CHECK-LABEL: {{^}}test_nonvoid_return_unreachable:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+;
+; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK-NOT: exec
+define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
+entry:
+  %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tex.i = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+  %cc = icmp sgt i32 %c, 0
+  br i1 %cc, label %if, label %else
+
+if:
+  store volatile <4 x float> %dtex, <4 x float>* undef
+  unreachable
+
+else:
+  ret <4 x float> %dtex
+}
 
 declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1

Added: vendor/llvm/dist/test/CodeGen/X86/mul-i1024.ll
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ vendor/llvm/dist/test/CodeGen/X86/mul-i1024.ll	Fri Dec  2 19:20:10 2016	(r309427)
@@ -0,0 +1,5938 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
+
+define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
+; X32-LABEL: test_1024:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    andl $-8, %esp
+; X32-NEXT:    subl $2640, %esp # imm = 0xA50
+; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    movl 64(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 68(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 72(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 76(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 80(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 84(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 88(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 92(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 96(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 100(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 104(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 108(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 112(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 116(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 120(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 124(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl (%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 4(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 8(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 12(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 16(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 20(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 24(%eax), %ecx
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl 28(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 32(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 36(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 40(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 44(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 48(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 52(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 56(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 60(%eax), %ebx
+; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 12(%ebp), %eax
+; X32-NEXT:    movl 48(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 52(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 56(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 60(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 32(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 36(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 40(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 44(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 16(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 20(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 24(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 28(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 8(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 12(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 112(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 116(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 120(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 124(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 96(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 100(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 104(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 108(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 80(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 84(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 88(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 92(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 64(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 68(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 72(%eax), %edi
+; X32-NEXT:    movl 76(%eax), %esi
+; X32-NEXT:    movl (%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 4(%eax), %edx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %edx
+; X32-NEXT:    pushl %ecx
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl %esi, %ebx
+; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    pushl %ecx
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl $0

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***