svn commit: r295600 - in projects/clang380-import: contrib/llvm/include/llvm/IR contrib/llvm/lib/Analysis contrib/llvm/lib/CodeGen/AsmPrinter contrib/llvm/lib/IR contrib/llvm/lib/Target/AArch64 con...

Sat Feb 13 15:58:55 UTC 2016

Author: dim
Date: Sat Feb 13 15:58:51 2016
New Revision: 295600
URL: https://svnweb.freebsd.org/changeset/base/295600

Log:
  Update llvm, clang and lldb to release_38 branch r260756.

Added:
  projects/clang380-import/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td
     - copied unchanged from r295599, vendor/llvm/dist/lib/Target/AArch64/AArch64SchedM1.td
Modified:
  projects/clang380-import/contrib/llvm/include/llvm/IR/IntrinsicsPowerPC.td
  projects/clang380-import/contrib/llvm/include/llvm/IR/Value.h
  projects/clang380-import/contrib/llvm/lib/Analysis/DemandedBits.cpp
  projects/clang380-import/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
  projects/clang380-import/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
  projects/clang380-import/contrib/llvm/lib/IR/Value.cpp
  projects/clang380-import/contrib/llvm/lib/Target/AArch64/AArch64.td
  projects/clang380-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
  projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
  projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
  projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/Processors.td
  projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
  projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
  projects/clang380-import/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
  projects/clang380-import/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp
  projects/clang380-import/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
  projects/clang380-import/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
  projects/clang380-import/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
  projects/clang380-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
  projects/clang380-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
  projects/clang380-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
  projects/clang380-import/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
  projects/clang380-import/contrib/llvm/tools/clang/include/clang/Sema/Sema.h
  projects/clang380-import/contrib/llvm/tools/clang/lib/AST/ASTDiagnostic.cpp
  projects/clang380-import/contrib/llvm/tools/clang/lib/Basic/Targets.cpp
  projects/clang380-import/contrib/llvm/tools/clang/lib/CodeGen/Address.h
  projects/clang380-import/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp
  projects/clang380-import/contrib/llvm/tools/clang/lib/CodeGen/CGStmtOpenMP.cpp
  projects/clang380-import/contrib/llvm/tools/clang/lib/Driver/ToolChains.cpp
  projects/clang380-import/contrib/llvm/tools/clang/lib/Driver/Tools.cpp
  projects/clang380-import/contrib/llvm/tools/clang/lib/Sema/SemaDeclCXX.cpp
  projects/clang380-import/contrib/llvm/tools/clang/lib/Sema/SemaExpr.cpp
  projects/clang380-import/contrib/llvm/tools/clang/lib/Sema/SemaExprObjC.cpp
  projects/clang380-import/contrib/llvm/tools/clang/lib/Sema/SemaOverload.cpp
  projects/clang380-import/contrib/llvm/tools/lldb/include/lldb/API/SBInstruction.h
  projects/clang380-import/contrib/llvm/tools/lldb/include/lldb/Core/RangeMap.h
  projects/clang380-import/contrib/llvm/tools/lldb/include/lldb/Symbol/Symtab.h
  projects/clang380-import/contrib/llvm/tools/lldb/source/API/SBInstruction.cpp
  projects/clang380-import/contrib/llvm/tools/lldb/source/Core/Module.cpp
  projects/clang380-import/contrib/llvm/tools/lldb/source/Expression/IRDynamicChecks.cpp
  projects/clang380-import/contrib/llvm/tools/lldb/source/Plugins/ABI/SysV-mips/ABISysV_mips.cpp
  projects/clang380-import/contrib/llvm/tools/lldb/source/Plugins/ABI/SysV-mips64/ABISysV_mips64.cpp
  projects/clang380-import/contrib/llvm/tools/lldb/source/Symbol/GoASTContext.cpp
  projects/clang380-import/contrib/llvm/tools/lldb/source/Symbol/Symtab.cpp
  projects/clang380-import/contrib/llvm/tools/lldb/source/Target/Target.cpp
  projects/clang380-import/lib/clang/include/clang/Basic/Version.inc
Directory Properties:
  projects/clang380-import/contrib/llvm/   (props changed)
  projects/clang380-import/contrib/llvm/tools/clang/   (props changed)
  projects/clang380-import/contrib/llvm/tools/lldb/   (props changed)

Modified: projects/clang380-import/contrib/llvm/include/llvm/IR/IntrinsicsPowerPC.td
==============================================================================

--- projects/clang380-import/contrib/llvm/include/llvm/IR/IntrinsicsPowerPC.td	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/include/llvm/IR/IntrinsicsPowerPC.td	Sat Feb 13 15:58:51 2016	(r295600)
@@ -484,7 +484,7 @@ let TargetPrefix = "ppc" in {  // All PP
             Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
                       [IntrNoMem]>;
   def int_ppc_altivec_vpkswss : GCCBuiltin<"__builtin_altivec_vpkswss">,
-            Intrinsic<[llvm_v16i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
+            Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
                       [IntrNoMem]>;
   def int_ppc_altivec_vpkswus : GCCBuiltin<"__builtin_altivec_vpkswus">,
             Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty],

Modified: projects/clang380-import/contrib/llvm/include/llvm/IR/Value.h
==============================================================================
--- projects/clang380-import/contrib/llvm/include/llvm/IR/Value.h	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/include/llvm/IR/Value.h	Sat Feb 13 15:58:51 2016	(r295600)
@@ -280,11 +280,7 @@ public:
   // when using them since you might not get all uses.
   // The methods that don't start with materialized_ assert that modules is
   // fully materialized.
-#ifdef NDEBUG
-  void assertModuleIsMaterialized() const {}
-#else
   void assertModuleIsMaterialized() const;
-#endif
 
   bool use_empty() const {
     assertModuleIsMaterialized();

Modified: projects/clang380-import/contrib/llvm/lib/Analysis/DemandedBits.cpp
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Analysis/DemandedBits.cpp	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Analysis/DemandedBits.cpp	Sat Feb 13 15:58:51 2016	(r295600)
@@ -242,13 +242,6 @@ void DemandedBits::determineLiveOperandB
     if (OperandNo != 0)
       AB = AOut;
     break;
-  case Instruction::ICmp:
-    // Count the number of leading zeroes in each operand.
-    ComputeKnownBits(BitWidth, UserI->getOperand(0), UserI->getOperand(1));
-    auto NumLeadingZeroes = std::min(KnownZero.countLeadingOnes(),
-                                     KnownZero2.countLeadingOnes());
-    AB = ~APInt::getHighBitsSet(BitWidth, NumLeadingZeroes);
-    break;
   }
 }
 

Modified: projects/clang380-import/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp	Sat Feb 13 15:58:51 2016	(r295600)
@@ -555,6 +555,11 @@ bool AsmPrinter::PrintAsmOperand(const M
         return true;
       O << -MO.getImm();
       return false;
+    case 's':  // The GCC deprecated s modifier
+      if (MO.getType() != MachineOperand::MO_Immediate)
+        return true;
+      O << ((32 - MO.getImm()) & 31);
+      return false;
     }
   }
   return true;

Modified: projects/clang380-import/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp	Sat Feb 13 15:58:51 2016	(r295600)
@@ -793,16 +793,27 @@ static DebugLocEntry::Value getDebugLocV
   llvm_unreachable("Unexpected 4-operand DBG_VALUE instruction!");
 }
 
-/// Determine whether two variable pieces overlap.
-static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
-  if (!P1->isBitPiece() || !P2->isBitPiece())
-    return true;
+// Determine the relative position of the pieces described by P1 and P2.
+// Returns  -1 if P1 is entirely before P2, 0 if P1 and P2 overlap,
+// 1 if P1 is entirely after P2.
+static int pieceCmp(const DIExpression *P1, const DIExpression *P2) {
   unsigned l1 = P1->getBitPieceOffset();
   unsigned l2 = P2->getBitPieceOffset();
   unsigned r1 = l1 + P1->getBitPieceSize();
   unsigned r2 = l2 + P2->getBitPieceSize();
-  // True where [l1,r1[ and [r1,r2[ overlap.
-  return (l1 < r2) && (l2 < r1);
+  if (r1 <= l2)
+    return -1;
+  else if (r2 <= l1)
+    return 1;
+  else
+    return 0;
+}
+
+/// Determine whether two variable pieces overlap.
+static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
+  if (!P1->isBitPiece() || !P2->isBitPiece())
+    return true;
+  return pieceCmp(P1, P2) == 0;
 }
 
 /// \brief If this and Next are describing different pieces of the same
@@ -811,14 +822,32 @@ static bool piecesOverlap(const DIExpres
 /// Return true if the merge was successful.
 bool DebugLocEntry::MergeValues(const DebugLocEntry &Next) {
   if (Begin == Next.Begin) {
-    auto *Expr = cast_or_null<DIExpression>(Values[0].Expression);
-    auto *NextExpr = cast_or_null<DIExpression>(Next.Values[0].Expression);
-    if (Expr->isBitPiece() && NextExpr->isBitPiece() &&
-        !piecesOverlap(Expr, NextExpr)) {
-      addValues(Next.Values);
-      End = Next.End;
-      return true;
+    auto *FirstExpr = cast<DIExpression>(Values[0].Expression);
+    auto *FirstNextExpr = cast<DIExpression>(Next.Values[0].Expression);
+    if (!FirstExpr->isBitPiece() || !FirstNextExpr->isBitPiece())
+      return false;
+
+    // We can only merge entries if none of the pieces overlap any others.
+    // In doing so, we can take advantage of the fact that both lists are
+    // sorted.
+    for (unsigned i = 0, j = 0; i < Values.size(); ++i) {
+      for (; j < Next.Values.size(); ++j) {
+        int res = pieceCmp(cast<DIExpression>(Values[i].Expression),
+                           cast<DIExpression>(Next.Values[j].Expression));
+        if (res == 0) // The two expressions overlap, we can't merge.
+          return false;
+        // Values[i] is entirely before Next.Values[j],
+        // so go back to the next entry of Values.
+        else if (res == -1)
+          break;
+        // Next.Values[j] is entirely before Values[i], so go on to the
+        // next entry of Next.Values.
+      }
     }
+
+    addValues(Next.Values);
+    End = Next.End;
+    return true;
   }
   return false;
 }

Modified: projects/clang380-import/contrib/llvm/lib/IR/Value.cpp
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/IR/Value.cpp	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/IR/Value.cpp	Sat Feb 13 15:58:51 2016	(r295600)
@@ -313,8 +313,8 @@ void Value::takeName(Value *V) {
     ST->reinsertValue(this);
 }
 
-#ifndef NDEBUG
 void Value::assertModuleIsMaterialized() const {
+#ifndef NDEBUG
   const GlobalValue *GV = dyn_cast<GlobalValue>(this);
   if (!GV)
     return;
@@ -322,8 +322,10 @@ void Value::assertModuleIsMaterialized()
   if (!M)
     return;
   assert(M->isMaterialized());
+#endif
 }
 
+#ifndef NDEBUG
 static bool contains(SmallPtrSetImpl<ConstantExpr *> &Cache, ConstantExpr *Expr,
                      Constant *C) {
   if (!Cache.insert(Expr).second)

Modified: projects/clang380-import/contrib/llvm/lib/Target/AArch64/AArch64.td
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Target/AArch64/AArch64.td	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Target/AArch64/AArch64.td	Sat Feb 13 15:58:51 2016	(r295600)
@@ -90,6 +90,7 @@ def AArch64InstrInfo : InstrInfo;
 include "AArch64SchedA53.td"
 include "AArch64SchedA57.td"
 include "AArch64SchedCyclone.td"
+include "AArch64SchedM1.td"
 
 def ProcA35     : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
                                    "Cortex-A35 ARM processors",
@@ -144,8 +145,7 @@ def : ProcessorModel<"cortex-a57", Corte
 // FIXME: Cortex-A72 is currently modelled as an Cortex-A57.
 def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>;
 def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
-// FIXME: Exynos-M1 is currently modelled without a specific SchedModel.
-def : ProcessorModel<"exynos-m1", NoSchedModel, [ProcExynosM1]>;
+def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
 
 //===----------------------------------------------------------------------===//
 // Assembly parser

Modified: projects/clang380-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp	Sat Feb 13 15:58:51 2016	(r295600)
@@ -6689,6 +6689,9 @@ SDValue AArch64TargetLowering::LowerVSET
     return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
   }
 
+  if (LHS.getValueType().getVectorElementType() == MVT::f16)
+    return SDValue();
+
   assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
          LHS.getValueType().getVectorElementType() == MVT::f64);
 

Copied: projects/clang380-import/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td (from r295599, vendor/llvm/dist/lib/Target/AArch64/AArch64SchedM1.td)
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ projects/clang380-import/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td	Sat Feb 13 15:58:51 2016	(r295600, copy of r295599, vendor/llvm/dist/lib/Target/AArch64/AArch64SchedM1.td)
@@ -0,0 +1,359 @@
+//=- AArch64SchedM1.td - Samsung Exynos-M1 Scheduling Defs ---*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Samsung Exynos-M1 to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// The Exynos-M1 is a traditional superscalar microprocessor with a
+// 4-wide in-order stage for decode and dispatch and a wider issue stage.
+// The execution units and loads and stores are out-of-order.
+
+def ExynosM1Model : SchedMachineModel {
+  let IssueWidth            =  4; // Up to 4 uops per cycle.
+  let MinLatency            =  0; // OoO.
+  let MicroOpBufferSize     = 96; // ROB size.
+  let LoopMicroOpBufferSize = 32; // Instruction queue size.
+  let LoadLatency           =  4; // Optimistic load cases.
+  let MispredictPenalty     = 14; // Minimum branch misprediction penalty.
+  let CompleteModel         =  0; // Use the default model otherwise.
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on the Exynos-M1,
+// which has 9 pipelines, each with its own queue with out-of-order dispatch.
+
+def M1UnitA  : ProcResource<2>; // Simple integer
+def M1UnitC  : ProcResource<1>; // Simple and complex integer
+def M1UnitB  : ProcResource<2>; // Branch
+def M1UnitL  : ProcResource<1>; // Load
+def M1UnitS  : ProcResource<1>; // Store
+def M1PipeF0 : ProcResource<1>; // FP #0
+def M1PipeF1 : ProcResource<1>; // FP #1
+
+let Super = M1PipeF0 in {
+  def M1UnitFMAC   : ProcResource<1>; // FP multiplication
+  def M1UnitFCVT   : ProcResource<1>; // FP conversion
+  def M1UnitNAL0   : ProcResource<1>; // Simple vector.
+  def M1UnitNMISC  : ProcResource<1>; // Miscellanea
+  def M1UnitNCRYPT : ProcResource<1>; // Cryptographic
+}
+
+let Super = M1PipeF1 in {
+  def M1UnitFADD : ProcResource<1>; // Simple FP
+  let BufferSize = 1 in
+  def M1UnitFVAR : ProcResource<1>; // FP division & square root (serialized)
+  def M1UnitNAL1 : ProcResource<1>; // Simple vector.
+  def M1UnitFST  : ProcResource<1>; // FP store
+}
+
+let SchedModel = ExynosM1Model in {
+  def M1UnitALU  : ProcResGroup<[M1UnitA,
+                                 M1UnitC]>;    // All simple integer.
+  def M1UnitNALU : ProcResGroup<[M1UnitNAL0,
+                                 M1UnitNAL1]>; // All simple vector.
+}
+
+let SchedModel = ExynosM1Model in {
+
+//===----------------------------------------------------------------------===//
+// Coarse scheduling model for the Exynos-M1.
+
+// Branch instructions.
+// TODO: Non-conditional direct branches take zero cycles and units.
+def : WriteRes<WriteBr,    [M1UnitB]> { let Latency = 1; }
+def : WriteRes<WriteBrReg, [M1UnitC]> { let Latency = 1; }
+// TODO: Branch and link is much different.
+
+// Arithmetic and logical integer instructions.
+def : WriteRes<WriteI,     [M1UnitALU]> { let Latency = 1; }
+// TODO: Shift over 3 and some extensions take 2 cycles.
+def : WriteRes<WriteISReg, [M1UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteIEReg, [M1UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteIS,    [M1UnitALU]> { let Latency = 1; }
+
+// Move instructions.
+def : WriteRes<WriteImm, [M1UnitALU]> { let Latency = 1; }
+
+// Divide and multiply instructions.
+// TODO: Division blocks the divider inside C.
+def : WriteRes<WriteID32, [M1UnitC]> { let Latency = 13; }
+def : WriteRes<WriteID64, [M1UnitC]> { let Latency = 21; }
+// TODO: Long multiplication take 5 cycles and also the ALU.
+// TODO: Multiplication with accumulation can be advanced.
+def : WriteRes<WriteIM32, [M1UnitC]> { let Latency = 3; }
+// TODO: 64-bit multiplication has a throughput of 1/2.
+def : WriteRes<WriteIM64, [M1UnitC]> { let Latency = 4; }
+
+// Miscellaneous instructions.
+def : WriteRes<WriteExtr, [M1UnitALU,
+                           M1UnitALU]> { let Latency = 2; }
+
+// TODO: The latency for the post or pre register is 1 cycle.
+def : WriteRes<WriteAdr, []> { let Latency = 0; }
+
+// Load instructions.
+def : WriteRes<WriteLD,    [M1UnitL]>   { let Latency = 4; }
+// TODO: Extended address requires also the ALU.
+def : WriteRes<WriteLDIdx, [M1UnitL]>   { let Latency = 5; }
+def : WriteRes<WriteLDHi,  [M1UnitALU]> { let Latency = 4; }
+
+// Store instructions.
+def : WriteRes<WriteST,    [M1UnitS]> { let Latency = 1; }
+// TODO: Extended address requires also the ALU.
+def : WriteRes<WriteSTIdx, [M1UnitS]> { let Latency = 1; }
+def : WriteRes<WriteSTP,   [M1UnitS]> { let Latency = 1; }
+def : WriteRes<WriteSTX,   [M1UnitS]> { let Latency = 1; }
+
+// FP data instructions.
+def : WriteRes<WriteF,    [M1UnitFADD]>  { let Latency = 3; }
+// TODO: FCCMP is much different.
+def : WriteRes<WriteFCmp, [M1UnitNMISC]> { let Latency = 4; }
+// TODO: DP takes longer.
+def : WriteRes<WriteFDiv, [M1UnitFVAR]>  { let Latency = 15; }
+// TODO: MACC takes longer.
+def : WriteRes<WriteFMul, [M1UnitFMAC]>  { let Latency = 4; }
+
+// FP miscellaneous instructions.
+// TODO: Conversion between register files is much different.
+def : WriteRes<WriteFCvt,  [M1UnitFCVT]> { let Latency = 3; }
+def : WriteRes<WriteFImm,  [M1UnitNALU]> { let Latency = 1; }
+// TODO: Copy from FPR to GPR is much different.
+def : WriteRes<WriteFCopy, [M1UnitS]>    { let Latency = 4; }
+
+// FP load instructions.
+// TODO: ASIMD loads are much different.
+def : WriteRes<WriteVLD, [M1UnitL]> { let Latency = 5; }
+
+// FP store instructions.
+// TODO: ASIMD stores are much different.
+def : WriteRes<WriteVST, [M1UnitS, M1UnitFST]> { let Latency = 1; }
+
+// ASIMD FP instructions.
+// TODO: Other operations are much different.
+def : WriteRes<WriteV, [M1UnitFADD]> { let Latency = 3; }
+
+// Other miscellaneous instructions.
+def : WriteRes<WriteSys,     []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint,    []> { let Latency = 1; }
+
+//===----------------------------------------------------------------------===//
+// Fast forwarding.
+
+// TODO: Add FP register forwarding rules.
+
+def : ReadAdvance<ReadI,       0>;
+def : ReadAdvance<ReadISReg,   0>;
+def : ReadAdvance<ReadIEReg,   0>;
+def : ReadAdvance<ReadIM,      0>;
+// Integer multiply-accumulate.
+// TODO: The forwarding for WriteIM64 saves actually 3 cycles.
+def : ReadAdvance<ReadIMA,     2, [WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadID,      0>;
+def : ReadAdvance<ReadExtrHi,  0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD,     0>;
+
+//===----------------------------------------------------------------------===//
+// Finer scheduling model for the Exynos-M1.
+
+def M1WriteNEONA   : SchedWriteRes<[M1UnitNALU,
+                                    M1UnitNALU,
+                                    M1UnitFADD]>   { let Latency = 9; }
+def M1WriteNEONB   : SchedWriteRes<[M1UnitNALU,
+                                    M1UnitFST]>    { let Latency = 5; }
+def M1WriteNEONC   : SchedWriteRes<[M1UnitNALU,
+                                    M1UnitFST]>    { let Latency = 6; }
+def M1WriteNEOND   : SchedWriteRes<[M1UnitNALU,
+                                    M1UnitFST,
+                                    M1UnitL]>      { let Latency = 10; }
+def M1WriteNEONE   : SchedWriteRes<[M1UnitFCVT,
+                                    M1UnitFST]>    { let Latency = 8; }
+def M1WriteNEONF   : SchedWriteRes<[M1UnitFCVT,
+                                    M1UnitFST,
+                                    M1UnitL]>      { let Latency = 13; }
+def M1WriteNEONG   : SchedWriteRes<[M1UnitNMISC,
+                                    M1UnitFST]>    { let Latency = 6; }
+def M1WriteNEONH   : SchedWriteRes<[M1UnitNALU,
+                                    M1UnitFST]>    { let Latency = 3; }
+def M1WriteNEONI   : SchedWriteRes<[M1UnitFST,
+                                    M1UnitL]>      { let Latency = 9; }
+def M1WriteALU1    : SchedWriteRes<[M1UnitALU]>    { let Latency = 1; }
+def M1WriteB       : SchedWriteRes<[M1UnitB]>      { let Latency = 1; }
+// FIXME: This is the worst case, conditional branch and link.
+def M1WriteBL      : SchedWriteRes<[M1UnitB,
+                                    M1UnitALU]>    { let Latency = 1; }
+// FIXME: This is the worst case, when using LR.
+def M1WriteBLR     : SchedWriteRes<[M1UnitB,
+                                    M1UnitALU,
+                                    M1UnitALU]>    { let Latency = 2; }
+def M1WriteC1      : SchedWriteRes<[M1UnitC]>      { let Latency = 1; }
+def M1WriteC2      : SchedWriteRes<[M1UnitC]>      { let Latency = 2; }
+def M1WriteFADD3   : SchedWriteRes<[M1UnitFADD]>   { let Latency = 3; }
+def M1WriteFCVT3   : SchedWriteRes<[M1UnitFCVT]>   { let Latency = 3; }
+def M1WriteFCVT4   : SchedWriteRes<[M1UnitFCVT]>   { let Latency = 4; }
+def M1WriteFMAC4   : SchedWriteRes<[M1UnitFMAC]>   { let Latency = 4; }
+def M1WriteFMAC5   : SchedWriteRes<[M1UnitFMAC]>   { let Latency = 5; }
+def M1WriteFVAR15  : SchedWriteRes<[M1UnitFVAR]>   { let Latency = 15; }
+def M1WriteFVAR23  : SchedWriteRes<[M1UnitFVAR]>   { let Latency = 23; }
+def M1WriteNALU1   : SchedWriteRes<[M1UnitNALU]>   { let Latency = 1; }
+def M1WriteNALU2   : SchedWriteRes<[M1UnitNALU]>   { let Latency = 2; }
+def M1WriteNAL11   : SchedWriteRes<[M1UnitNAL1]>   { let Latency = 1; }
+def M1WriteNAL12   : SchedWriteRes<[M1UnitNAL1]>   { let Latency = 2; }
+def M1WriteNAL13   : SchedWriteRes<[M1UnitNAL1]>   { let Latency = 3; }
+def M1WriteNCRYPT1 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
+def M1WriteNCRYPT5 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 5; }
+def M1WriteNMISC1  : SchedWriteRes<[M1UnitNMISC]>  { let Latency = 1; }
+def M1WriteNMISC2  : SchedWriteRes<[M1UnitNMISC]>  { let Latency = 2; }
+def M1WriteNMISC3  : SchedWriteRes<[M1UnitNMISC]>  { let Latency = 3; }
+def M1WriteNMISC4  : SchedWriteRes<[M1UnitNMISC]>  { let Latency = 4; }
+def M1WriteS4      : SchedWriteRes<[M1UnitS]>      { let Latency = 4; }
+def M1WriteTB      : SchedWriteRes<[M1UnitC,
+                                    M1UnitALU]>    { let Latency = 2; }
+
+// Branch instructions
+def : InstRW<[M1WriteB ],  (instrs Bcc)>;
+def : InstRW<[M1WriteBL],  (instrs BL)>;
+def : InstRW<[M1WriteBLR], (instrs BLR)>;
+def : InstRW<[M1WriteC1],  (instregex "^CBN?Z[WX]")>;
+def : InstRW<[M1WriteTB],  (instregex "^TBN?Z[WX]")>;
+
+// Arithmetic and logical integer instructions.
+def : InstRW<[M1WriteALU1], (instrs COPY)>;
+
+// Divide and multiply instructions.
+
+// Miscellaneous instructions.
+
+// Load instructions.
+
+// Store instructions.
+
+// FP data instructions.
+def : InstRW<[M1WriteNALU1],  (instregex "^F(ABS|NEG)[DS]r")>;
+def : InstRW<[M1WriteFADD3],  (instregex "^F(ADD|SUB)[DS]rr")>;
+def : InstRW<[M1WriteNEONG],  (instregex "^FCCMPE?[DS]rr")>;
+def : InstRW<[M1WriteNMISC4], (instregex "^FCMPE?[DS]r")>;
+def : InstRW<[M1WriteFVAR15], (instrs FDIVSrr)>;
+def : InstRW<[M1WriteFVAR23], (instrs FDIVDrr)>;
+def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN).+rr")>;
+def : InstRW<[M1WriteFMAC4],  (instregex "^FN?MUL[DS]rr")>;
+def : InstRW<[M1WriteFMAC5],  (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
+def : InstRW<[M1WriteFCVT3],  (instregex "^FRINT.+r")>;
+def : InstRW<[M1WriteNEONH],  (instregex "^FCSEL[DS]rrr")>;
+def : InstRW<[M1WriteFVAR15], (instrs FSQRTSr)>;
+def : InstRW<[M1WriteFVAR23], (instrs FSQRTDr)>;
+
+// FP miscellaneous instructions.
+def : InstRW<[M1WriteFCVT3], (instregex "^FCVT[DS][DS]r")>;
+def : InstRW<[M1WriteNEONF], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>;
+def : InstRW<[M1WriteNEONE], (instregex "^[SU]CVTF[SU]")>;
+def : InstRW<[M1WriteNALU1], (instregex "^FMOV[DS][ir]")>;
+def : InstRW<[M1WriteS4],    (instregex "^FMOV[WX][DS](High)?r")>;
+def : InstRW<[M1WriteNEONI], (instregex "^FMOV[DS][WX](High)?r")>;
+
+// FP load instructions.
+
+// FP store instructions.
+
+// ASIMD instructions.
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]ABAL?v")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^[SU]ABDL?v")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^(SQ)?ABSv")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^SQNEGv")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^(ADD|NEG|SUB)v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?H(ADD|SUB)v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?AD[AD](L|LP|P|W)V?2?v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?SUB[LW]2?v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^R?(ADD|SUB)HN?2?v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]+Q(ADD|SUB)v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]RHADDv")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^CMTSTv")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^[SU](MIN|MAX)v")>;
+def : InstRW<[M1WriteNMISC2], (instregex "^[SU](MIN|MAX)Pv")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU](MIN|MAX)Vv")>;
+def : InstRW<[M1WriteNMISC4], (instregex "^(MUL|SQR?DMULH)v")>;
+def : InstRW<[M1WriteNMISC4], (instregex "^ML[AS]v")>;
+def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD|SQRD)ML[AS][HL]v")>;
+def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD)MULLv")>;
+def : InstRW<[M1WriteNAL13],  (instregex "^(S|SR|U|UR)SRAv")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^[SU]?SH(L|LL|R)2?v")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^S[LR]Iv")>;
+def : InstRW<[M1WriteNAL13],  (instregex "^[SU]?(Q|QR|R)?SHR(N|U|UN)?2?v")>;
+def : InstRW<[M1WriteNAL13],  (instregex "^[SU](Q|QR|R)SHLU?v")>;
+
+// ASIMD FP instructions.
+def : InstRW<[M1WriteNALU1],  (instregex "^F(ABS|NEG)v")>;
+def : InstRW<[M1WriteNMISC3], (instregex "^F(ABD|ADD|SUB)v")>;
+def : InstRW<[M1WriteNEONA],  (instregex "^FADDP")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>;
+def : InstRW<[M1WriteFCVT3],  (instregex "^[FVSU]CVTX?[AFLMNPZ][SU]?(_Int)?v")>;
+def : InstRW<[M1WriteFVAR15], (instregex "FDIVv.f32")>;
+def : InstRW<[M1WriteFVAR23], (instregex "FDIVv2f64")>;
+def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>;
+def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>;
+def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>;
+def : InstRW<[M1WriteFMAC4],  (instregex "^FMULX?v")>;
+def : InstRW<[M1WriteFMAC5],  (instregex "^FML[AS]v")>;
+def : InstRW<[M1WriteFCVT3],  (instregex "^FRINT[AIMNPXZ]v")>;
+
+// ASIMD miscellaneous instructions.
+def : InstRW<[M1WriteNALU1],  (instregex "^RBITv")>;
+def : InstRW<[M1WriteNAL11],  (instregex "^(BIF|BIT|BSL)v")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^CPY")>;
+def : InstRW<[M1WriteNEONB],  (instregex "^DUPv.+gpr")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^DUPv.+lane")>;
+def : InstRW<[M1WriteNAL13],  (instregex "^[SU]?Q?XTU?Nv")>;
+def : InstRW<[M1WriteNEONC],  (instregex "^INSv.+gpr")>;
+def : InstRW<[M1WriteFCVT4],  (instregex "^[FU](RECP|RSQRT)Ev")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^[FU](RECP|RSQRT)Xv")>;
+def : InstRW<[M1WriteFMAC5],  (instregex "^F(RECP|RSQRT)Sv")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^REV(16|32|64)v")>;
+def : InstRW<[M1WriteNAL11],  (instregex "^TB[LX]v8i8One")>;
+def : InstRW<[WriteSequence<[M1WriteNAL11], 2>],
+                              (instregex "^TB[LX]v8i8Two")>;
+def : InstRW<[WriteSequence<[M1WriteNAL11], 3>],
+                              (instregex "^TB[LX]v8i8Three")>;
+def : InstRW<[WriteSequence<[M1WriteNAL11], 4>],
+                              (instregex "^TB[LX]v8i8Four")>;
+def : InstRW<[M1WriteNAL12],  (instregex "^TB[LX]v16i8One")>;
+def : InstRW<[WriteSequence<[M1WriteNAL12], 2>],
+                              (instregex "^TB[LX]v16i8Two")>;
+def : InstRW<[WriteSequence<[M1WriteNAL12], 3>],
+                              (instregex "^TB[LX]v16i8Three")>;
+def : InstRW<[WriteSequence<[M1WriteNAL12], 4>],
+                              (instregex "^TB[LX]v16i8Four")>;
+def : InstRW<[M1WriteNEOND],  (instregex "^[SU]MOVv")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^INSv.+lane")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^(TRN|UZP)(1|2)(v8i8|v4i16|v2i32)")>;
+def : InstRW<[M1WriteNALU2],  (instregex "^(TRN|UZP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[M1WriteNALU1],  (instregex "^ZIP(1|2)v")>;
+
+// ASIMD load instructions.
+
+// ASIMD store instructions.
+
+// Cryptography instructions.
+def : InstRW<[M1WriteNCRYPT1], (instregex "^AES")>;
+def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>;
+def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>;
+def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA1[CMP]")>;
+def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA256SU0")>;
+def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA256(H|SU1)")>;
+
+// CRC instructions.
+def : InstRW<[M1WriteC2], (instregex "^CRC32")>;
+
+} // SchedModel = ExynosM1Model

Modified: projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td	Sat Feb 13 15:58:51 2016	(r295600)
@@ -183,6 +183,7 @@ def FeatureISAVersion7_0_0 : SubtargetFe
 def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1>;
 def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0>;
 def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1>;
+def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3>;
 
 class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
         "localmemorysize"#Value,
@@ -252,7 +253,7 @@ def FeatureSeaIslands : SubtargetFeature
 def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
         [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
          FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
-         FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>;
+         FeatureGCN3Encoding, FeatureCIInsts]>;
 
 //===----------------------------------------------------------------------===//
 

Modified: projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h	Sat Feb 13 15:58:51 2016	(r295600)
@@ -53,7 +53,8 @@ public:
     ISAVersion7_0_0,
     ISAVersion7_0_1,
     ISAVersion8_0_0,
-    ISAVersion8_0_1
+    ISAVersion8_0_1,
+    ISAVersion8_0_3
   };
 
 private:

Modified: projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/Processors.td
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/Processors.td	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/Processors.td	Sat Feb 13 15:58:51 2016	(r295600)
@@ -128,21 +128,23 @@ def : ProcessorModel<"mullins",    SIQua
 //===----------------------------------------------------------------------===//
 
 def : ProcessorModel<"tonga",   SIQuarterSpeedModel,
-  [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0]
+  [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0,
+   FeatureLDSBankCount32]
 >;
 
 def : ProcessorModel<"iceland", SIQuarterSpeedModel,
-  [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0]
+  [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0,
+   FeatureLDSBankCount32]
 >;
 
 def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
-  [FeatureVolcanicIslands, FeatureISAVersion8_0_1]
+  [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32]
 >;
 
 def : ProcessorModel<"fiji", SIQuarterSpeedModel,
-  [FeatureVolcanicIslands, FeatureISAVersion8_0_1]
+  [FeatureVolcanicIslands, FeatureISAVersion8_0_3, FeatureLDSBankCount32]
 >;
 
 def : ProcessorModel<"stoney", SIQuarterSpeedModel,
-  [FeatureVolcanicIslands, FeatureISAVersion8_0_1]
+  [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount16]
 >;

Modified: projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp	Sat Feb 13 15:58:51 2016	(r295600)
@@ -234,6 +234,7 @@ void SIRegisterInfo::buildScratchLoadSto
   bool IsLoad = TII->get(LoadStoreOp).mayLoad();
 
   bool RanOutOfSGPRs = false;
+  bool Scavenged = false;
   unsigned SOffset = ScratchOffset;
 
   unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
@@ -244,6 +245,8 @@ void SIRegisterInfo::buildScratchLoadSto
     if (SOffset == AMDGPU::NoRegister) {
       RanOutOfSGPRs = true;
       SOffset = AMDGPU::SGPR0;
+    } else {
+      Scavenged = true;
     }
     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
             .addReg(ScratchOffset)
@@ -259,10 +262,14 @@ void SIRegisterInfo::buildScratchLoadSto
         getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
         Value;
 
+    unsigned SOffsetRegState = 0;
+    if (i + 1 == e && Scavenged)
+      SOffsetRegState |= RegState::Kill;
+
     BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
       .addReg(SubReg, getDefRegState(IsLoad))
       .addReg(ScratchRsrcReg)
-      .addReg(SOffset)
+      .addReg(SOffset, SOffsetRegState)
       .addImm(Offset)
       .addImm(0) // glc
       .addImm(0) // slc

Modified: projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp	Sat Feb 13 15:58:51 2016	(r295600)
@@ -41,6 +41,9 @@ IsaVersion getIsaVersion(const FeatureBi
   if (Features.test(FeatureISAVersion8_0_1))
     return {8, 0, 1};
 
+  if (Features.test(FeatureISAVersion8_0_3))
+    return {8, 0, 3};
+
   return {0, 0, 0};
 }
 

Modified: projects/clang380-import/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp	Sat Feb 13 15:58:51 2016	(r295600)
@@ -747,7 +747,7 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SD
 
   // If Offset is a multiply-by-constant and it's profitable to extract a shift
   // and use it in a shifted operand do so.
-  if (Offset.getOpcode() == ISD::MUL) {
+  if (Offset.getOpcode() == ISD::MUL && N.hasOneUse()) {
     unsigned PowerOfTwo = 0;
     SDValue NewMulConst;
     if (canExtractShiftFromMul(Offset, 31, PowerOfTwo, NewMulConst)) {
@@ -1422,7 +1422,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSo
 
   // If OffReg is a multiply-by-constant and it's profitable to extract a shift
   // and use it in a shifted operand do so.
-  if (OffReg.getOpcode() == ISD::MUL) {
+  if (OffReg.getOpcode() == ISD::MUL && N.hasOneUse()) {
     unsigned PowerOfTwo = 0;
     SDValue NewMulConst;
     if (canExtractShiftFromMul(OffReg, 3, PowerOfTwo, NewMulConst)) {

Modified: projects/clang380-import/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp	Sat Feb 13 15:58:51 2016	(r295600)
@@ -1615,7 +1615,7 @@ bool PPCFastISel::SelectRet(const Instru
       // extension rather than sign extension. Make sure we pass the return
       // value extension property to integer materialization.
       unsigned SrcReg =
-          PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() == CCValAssign::SExt);
+          PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() != CCValAssign::ZExt);
 
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg);
@@ -2091,25 +2091,21 @@ unsigned PPCFastISel::PPCMaterializeInt(
 
   const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass :
                                    &PPC::GPRCRegClass);
+  int64_t Imm = UseSExt ? CI->getSExtValue() : CI->getZExtValue();
 
   // If the constant is in range, use a load-immediate.
-  if (UseSExt && isInt<16>(CI->getSExtValue())) {
+  // Since LI will sign extend the constant we need to make sure that for
+  // our zeroext constants that the sign extended constant fits into 16-bits -
+  // a range of 0..0x7fff.
+  if (isInt<16>(Imm)) {
     unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI;
     unsigned ImmReg = createResultReg(RC);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg)
-        .addImm(CI->getSExtValue());
-    return ImmReg;
-  } else if (!UseSExt && isUInt<16>(CI->getZExtValue())) {
-    unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI;
-    unsigned ImmReg = createResultReg(RC);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg)
-        .addImm(CI->getZExtValue());
+        .addImm(Imm);
     return ImmReg;
   }
 
   // Construct the constant piecewise.
-  int64_t Imm = CI->getZExtValue();
-
   if (VT == MVT::i64)
     return PPCMaterialize64BitInt(Imm, RC);
   else if (VT == MVT::i32)

Modified: projects/clang380-import/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td	Sat Feb 13 15:58:51 2016	(r295600)
@@ -736,7 +736,7 @@ def VPKSHSS : VX1_Int_Ty2<398, "vpkshss"
 def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus,
                           v16i8, v8i16>;
 def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss,
-                          v16i8, v4i32>;
+                          v8i16, v4i32>;
 def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus,
                           v8i16, v4i32>;
 def VPKUHUM : VXForm_1<14, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),

Modified: projects/clang380-import/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp	Sat Feb 13 15:58:51 2016	(r295600)
@@ -1849,7 +1849,7 @@ static unsigned getTestUnderMaskCond(uns
     if (CCMask == SystemZ::CCMASK_CMP_NE)
       return SystemZ::CCMASK_TM_SOME_1;
   }
-  if (EffectivelyUnsigned && CmpVal <= Low) {
+  if (EffectivelyUnsigned && CmpVal > 0 && CmpVal <= Low) {
     if (CCMask == SystemZ::CCMASK_CMP_LT)
       return SystemZ::CCMASK_TM_ALL_0;
     if (CCMask == SystemZ::CCMASK_CMP_GE)

Modified: projects/clang380-import/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp	Sat Feb 13 15:58:51 2016	(r295600)
@@ -1335,6 +1335,7 @@ X86TargetLowering::X86TargetLowering(con
 
     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
+    setOperationAction(ISD::SETCCE,             MVT::i1,    Custom);
     setOperationAction(ISD::SELECT_CC,          MVT::i1,    Expand);
     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
@@ -14975,8 +14976,11 @@ SDValue X86TargetLowering::LowerSETCCE(S
   assert(Carry.getOpcode() != ISD::CARRY_FALSE);
   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
-  return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(),
-                     DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
+  SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+                              DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
+  if (Op.getSimpleValueType() == MVT::i1)
+      return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
+  return SetCC;
 }
 
 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
@@ -16315,6 +16319,11 @@ static SDValue getMaskNode(SDValue Mask,
                            const X86Subtarget *Subtarget,
                            SelectionDAG &DAG, SDLoc dl) {
 
+  if (isAllOnesConstant(Mask))
+    return DAG.getTargetConstant(1, dl, MaskVT);
+  if (X86::isZeroNode(Mask))
+    return DAG.getTargetConstant(0, dl, MaskVT);
+
   if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
     // Mask should be extended
     Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
@@ -17203,26 +17212,14 @@ static SDValue getGatherNode(unsigned Op
   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
   MVT MaskVT = MVT::getVectorVT(MVT::i1,
                              Index.getSimpleValueType().getVectorNumElements());
-  SDValue MaskInReg;
-  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
-  if (MaskC)
-    MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
-  else {
-    MVT BitcastVT = MVT::getVectorVT(MVT::i1,
-                                     Mask.getSimpleValueType().getSizeInBits());
 
-    // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
-    // are extracted by EXTRACT_SUBVECTOR.
-    MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
-                            DAG.getBitcast(BitcastVT, Mask),
-                            DAG.getIntPtrConstant(0, dl));
-  }
+  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   SDValue Segment = DAG.getRegister(0, MVT::i32);
   if (Src.getOpcode() == ISD::UNDEF)
     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
-  SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+  SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
   return DAG.getMergeValues(RetOps, dl);
@@ -17230,7 +17227,8 @@ static SDValue getGatherNode(unsigned Op
 
 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                SDValue Src, SDValue Mask, SDValue Base,
-                               SDValue Index, SDValue ScaleOp, SDValue Chain) {
+                               SDValue Index, SDValue ScaleOp, SDValue Chain,
+                               const X86Subtarget &Subtarget) {
   SDLoc dl(Op);
   auto *C = cast<ConstantSDNode>(ScaleOp);
   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
@@ -17238,29 +17236,18 @@ static SDValue getScatterNode(unsigned O
   SDValue Segment = DAG.getRegister(0, MVT::i32);
   MVT MaskVT = MVT::getVectorVT(MVT::i1,
                              Index.getSimpleValueType().getVectorNumElements());
-  SDValue MaskInReg;
-  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
-  if (MaskC)
-    MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
-  else {
-    MVT BitcastVT = MVT::getVectorVT(MVT::i1,
-                                     Mask.getSimpleValueType().getSizeInBits());
 
-    // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
-    // are extracted by EXTRACT_SUBVECTOR.
-    MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
-                            DAG.getBitcast(BitcastVT, Mask),
-                            DAG.getIntPtrConstant(0, dl));
-  }
+  SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl);
   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
-  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
+  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   return SDValue(Res, 1);
 }
 
 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                SDValue Mask, SDValue Base, SDValue Index,
-                               SDValue ScaleOp, SDValue Chain) {
+                               SDValue ScaleOp, SDValue Chain,
+                               const X86Subtarget &Subtarget) {
   SDLoc dl(Op);
   auto *C = cast<ConstantSDNode>(ScaleOp);
   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
@@ -17268,14 +17255,9 @@ static SDValue getPrefetchNode(unsigned 
   SDValue Segment = DAG.getRegister(0, MVT::i32);
   MVT MaskVT =
     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
-  SDValue MaskInReg;
-  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
-  if (MaskC)
-    MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
-  else
-    MaskInReg = DAG.getBitcast(MaskVT, Mask);
+  SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl);
   //SDVTList VTs = DAG.getVTList(MVT::Other);
-  SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+  SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
   return SDValue(Res, 0);
 }
@@ -17509,7 +17491,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SD
     SDValue Src   = Op.getOperand(5);
     SDValue Scale = Op.getOperand(6);
     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
-                          Scale, Chain);
+                          Scale, Chain, *Subtarget);
   }
   case PREFETCH: {
     SDValue Hint = Op.getOperand(6);
@@ -17521,7 +17503,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SD
     SDValue Index = Op.getOperand(3);
     SDValue Base  = Op.getOperand(4);
     SDValue Scale = Op.getOperand(5);
-    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
+    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
+                           *Subtarget);
   }
   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
   case RDTSC: {

Modified: projects/clang380-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp	Sat Feb 13 15:58:51 2016	(r295600)
@@ -3560,7 +3560,7 @@ Instruction *InstCombiner::visitICmpInst
                                 BO1->getOperand(0));
           }
 
-          if (CI->isMaxValue(true)) {
+          if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) {
             ICmpInst::Predicate Pred = I.isSigned()
                                            ? I.getUnsignedPredicate()
                                            : I.getSignedPredicate();

Modified: projects/clang380-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp	Sat Feb 13 15:58:51 2016	(r295600)
@@ -557,7 +557,8 @@ static Instruction *unpackLoadToAggregat
         ConstantInt::get(IdxType, i),
       };
       auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), EltName);
-      auto *L = IC.Builder->CreateLoad(ST->getTypeAtIndex(i), Ptr, LoadName);
+      auto *L = IC.Builder->CreateAlignedLoad(Ptr, LI.getAlignment(),
+                                              LoadName);
       V = IC.Builder->CreateInsertValue(V, L, i);
     }
 

Modified: projects/clang380-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp	Sat Feb 13 15:58:51 2016	(r295600)
@@ -380,6 +380,23 @@ static void replaceExtractElements(Inser
     ExtendMask.push_back(UndefValue::get(IntType));
 
   Value *ExtVecOp = ExtElt->getVectorOperand();
+  auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
+  BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
+                                   ? ExtVecOpInst->getParent()
+                                   : ExtElt->getParent();
+
+  // TODO: This restriction matches the basic block check below when creating
+  // new extractelement instructions. If that limitation is removed, this one
+  // could also be removed. But for now, we just bail out to ensure that we
+  // will replace the extractelement instruction that is feeding our
+  // insertelement instruction. This allows the insertelement to then be
+  // replaced by a shufflevector. If the insertelement is not replaced, we can
+  // induce infinite looping because there's an optimization for extractelement
+  // that will delete our widening shuffle. This would trigger another attempt
+  // here to create that shuffle, and we spin forever.
+  if (InsertionBlock != InsElt->getParent())
+    return;
+
   auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType),
                                         ConstantVector::get(ExtendMask));
 
@@ -387,7 +404,6 @@ static void replaceExtractElements(Inser
   // (as long as it's not a PHI) or at the start of the basic block of the
   // extract, so any subsequent extracts in the same basic block can use it.
   // TODO: Insert before the earliest ExtractElementInst that is replaced.
-  auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
   if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
     WideVec->insertAfter(ExtVecOpInst);
   else

Modified: projects/clang380-import/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
==============================================================================
--- projects/clang380-import/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp	Sat Feb 13 15:58:51 2016	(r295600)
@@ -90,6 +90,11 @@ static cl::opt<bool> SpeculateOneExpensi
     cl::desc("Allow exactly one expensive instruction to be speculatively "
              "executed"));
 
+static cl::opt<unsigned> MaxSpeculationDepth(
+    "max-speculation-depth", cl::Hidden, cl::init(10),
+    cl::desc("Limit maximum recursion depth when calculating costs of "
+             "speculatively executed instructions"));
+
 STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
 STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping");
 STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables");
@@ -269,6 +274,13 @@ static bool DominatesMergePoint(Value *V
                                 unsigned &CostRemaining,
                                 const TargetTransformInfo &TTI,
                                 unsigned Depth = 0) {
+  // It is possible to hit a zero-cost cycle (phi/gep instructions for example),
+  // so limit the recursion depth.
+  // TODO: While this recursion limit does prevent pathological behavior, it
+  // would be better to track visited instructions to avoid cycles.
+  if (Depth == MaxSpeculationDepth)
+    return false;
+
   Instruction *I = dyn_cast<Instruction>(V);
   if (!I) {
     // Non-instructions all dominate instructions, but not all constantexprs

Modified: projects/clang380-import/contrib/llvm/tools/clang/include/clang/Sema/Sema.h
==============================================================================
--- projects/clang380-import/contrib/llvm/tools/clang/include/clang/Sema/Sema.h	Sat Feb 13 15:01:54 2016	(r295599)
+++ projects/clang380-import/contrib/llvm/tools/clang/include/clang/Sema/Sema.h	Sat Feb 13 15:58:51 2016	(r295600)
@@ -2229,7 +2229,8 @@ public:
   bool CheckPointerConversion(Expr *From, QualType ToType,
                               CastKind &Kind,
                               CXXCastPath& BasePath,
-                              bool IgnoreBaseAccess);
+                              bool IgnoreBaseAccess,
+                              bool Diagnose = true);
   bool IsMemberPointerConversion(Expr *From, QualType FromType, QualType ToType,
                                  bool InOverloadResolution,
                                  QualType &ConvertedType);
@@ -5388,7 +5389,8 @@ public:
                                     unsigned AmbigiousBaseConvID,
                                     SourceLocation Loc, SourceRange Range,
                                     DeclarationName Name,
-                                    CXXCastPath *BasePath);
+                                    CXXCastPath *BasePath,
+                                    bool IgnoreAccess = false);
 
   std::string getAmbiguousPathsDisplayString(CXXBasePaths &Paths);
 
@@ -7514,14 +7516,15 @@ public:
                                         ObjCMethodDecl *&ClassMethod,
                                         ObjCMethodDecl *&InstanceMethod,
                                         TypedefNameDecl *&TDNDecl,
-                                        bool CfToNs);
-  

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***