llvm-r600.patch

   1 diff -Nur -x .git llvm-3.2.src/autoconf/configure.ac llvm-r600/autoconf/configure.ac
   2 --- llvm-3.2.src/autoconf/configure.ac  2012-11-21 17:13:35.000000000 +0100
   3 +++ llvm-r600/autoconf/configure.ac     2013-01-25 19:43:56.096716416 +0100
   4 @@ -751,6 +751,11 @@
   5
   6  if test ${enableval} != "disable"
   7  then
   8 +  if test ${enableval} = "AMDGPU"
   9 +  then
  10 +    AC_MSG_ERROR([The AMDGPU target has been renamed to R600, please reconfigure with --enable-experimental-targets=R600])
  11 +    enableval="R600"
  12 +  fi
  13    TARGETS_TO_BUILD="$enableval $TARGETS_TO_BUILD"
  14  fi
  15
  16 diff -Nur -x .git llvm-3.2.src/configure llvm-r600/configure
  17 --- llvm-3.2.src/configure      2012-11-21 17:13:35.000000000 +0100
  18 +++ llvm-r600/configure 2013-01-25 19:43:56.173383081 +0100
  19 @@ -5473,6 +5473,13 @@
  20
  21  if test ${enableval} != "disable"
  22  then
  23 +  if test ${enableval} = "AMDGPU"
  24 +  then
  25 +    { { echo "$as_me:$LINENO: error: The AMDGPU target has been renamed to R600, please reconfigure with --enable-experimental-targets=R600" >&5
  26 +echo "$as_me: error: The AMDGPU target has been renamed to R600, please reconfigure with --enable-experimental-targets=R600" >&2;}
  27 +   { (exit 1); exit 1; }; }
  28 +    enableval="R600"
  29 +  fi
  30    TARGETS_TO_BUILD="$enableval $TARGETS_TO_BUILD"
  31  fi
  32
  33 @@ -10316,7 +10323,7 @@
  34    lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
  35    lt_status=$lt_dlunknown
  36    cat > conftest.$ac_ext <<EOF
  37 -#line 10317 "configure"
  38 +#line 10326 "configure"
  39  #include "confdefs.h"
  40
  41  #if HAVE_DLFCN_H
  42 diff -Nur -x .git llvm-3.2.src/include/llvm/IntrinsicsR600.td llvm-r600/include/llvm/IntrinsicsR600.td
  43 --- llvm-3.2.src/include/llvm/IntrinsicsR600.td 1970-01-01 01:00:00.000000000 +0100
  44 +++ llvm-r600/include/llvm/IntrinsicsR600.td    2013-01-25 19:43:56.433383075 +0100
  45 @@ -0,0 +1,36 @@
  46 +//===- IntrinsicsR600.td - Defines R600 intrinsics ---------*- tablegen -*-===//
  47 +//
  48 +//                     The LLVM Compiler Infrastructure
  49 +//
  50 +// This file is distributed under the University of Illinois Open Source
  51 +// License. See LICENSE.TXT for details.
  52 +//
  53 +//===----------------------------------------------------------------------===//
  54 +//
  55 +// This file defines all of the R600-specific intrinsics.
  56 +//
  57 +//===----------------------------------------------------------------------===//
  58 +
  59 +let TargetPrefix = "r600" in {
  60 +
  61 +class R600ReadPreloadRegisterIntrinsic<string name>
  62 +  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
  63 +    GCCBuiltin<name>;
  64 +
  65 +multiclass R600ReadPreloadRegisterIntrinsic_xyz<string prefix> {
  66 +  def _x : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_x")>;
  67 +  def _y : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_y")>;
  68 +  def _z : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_z")>;
  69 +}
  70 +
  71 +defm int_r600_read_global_size : R600ReadPreloadRegisterIntrinsic_xyz <
  72 +                                       "__builtin_r600_read_global_size">;
  73 +defm int_r600_read_local_size : R600ReadPreloadRegisterIntrinsic_xyz <
  74 +                                       "__builtin_r600_read_local_size">;
  75 +defm int_r600_read_ngroups : R600ReadPreloadRegisterIntrinsic_xyz <
  76 +                                       "__builtin_r600_read_ngroups">;
  77 +defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz <
  78 +                                       "__builtin_r600_read_tgid">;
  79 +defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz <
  80 +                                       "__builtin_r600_read_tidig">;
  81 +} // End TargetPrefix = "r600"
  82 diff -Nur -x .git llvm-3.2.src/include/llvm/Intrinsics.td llvm-r600/include/llvm/Intrinsics.td
  83 --- llvm-3.2.src/include/llvm/Intrinsics.td     2012-10-20 01:00:20.000000000 +0200
  84 +++ llvm-r600/include/llvm/Intrinsics.td        2013-01-25 19:43:56.426716409 +0100
  85 @@ -469,3 +469,4 @@
  86  include "llvm/IntrinsicsHexagon.td"
  87  include "llvm/IntrinsicsNVVM.td"
  88  include "llvm/IntrinsicsMips.td"
  89 +include "llvm/IntrinsicsR600.td"
  90 diff -Nur -x .git llvm-3.2.src/lib/CodeGen/SelectionDAG/DAGCombiner.cpp llvm-r600/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
  91 --- llvm-3.2.src/lib/CodeGen/SelectionDAG/DAGCombiner.cpp       2012-11-26 18:01:12.000000000 +0100
  92 +++ llvm-r600/lib/CodeGen/SelectionDAG/DAGCombiner.cpp  2013-01-25 19:43:56.720049736 +0100
  93 @@ -8514,11 +8514,8 @@
  94      if (Opcode == ISD::DELETED_NODE &&
  95          (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) {
  96        Opcode = Opc;
  97 -      // If not supported by target, bail out.
  98 -      if (TLI.getOperationAction(Opcode, VT) != TargetLowering::Legal &&
  99 -          TLI.getOperationAction(Opcode, VT) != TargetLowering::Custom)
 100 -        return SDValue();
 101      }
 102 +
 103      if (Opc != Opcode)
 104        return SDValue();
 105
 106 @@ -8543,6 +8540,10 @@
 107    assert(SrcVT != MVT::Other && "Cannot determine source type!");
 108
 109    EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars);
 110 +
 111 +  if (!TLI.isOperationLegalOrCustom(Opcode, NVT))
 112 +    return SDValue();
 113 +
 114    SmallVector<SDValue, 8> Opnds;
 115    for (unsigned i = 0; i != NumInScalars; ++i) {
 116      SDValue In = N->getOperand(i);
 117 diff -Nur -x .git llvm-3.2.src/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp llvm-r600/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
 118 --- llvm-3.2.src/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp       2012-10-24 19:25:11.000000000 +0200
 119 +++ llvm-r600/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp  2013-01-25 19:43:56.733383069 +0100
 120 @@ -731,9 +731,10 @@
 121            return;
 122          }
 123          case TargetLowering::Promote: {
 124 -          assert(VT.isVector() && "Unknown legal promote case!");
 125 -          Value = DAG.getNode(ISD::BITCAST, dl,
 126 -                             TLI.getTypeToPromoteTo(ISD::STORE, VT), Value);
 127 +          EVT NVT = TLI.getTypeToPromoteTo(ISD::STORE, VT);
 128 +          assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
 129 +                 "Can only promote stores to same size type");
 130 +          Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value);
 131            SDValue Result =
 132              DAG.getStore(Chain, dl, Value, Ptr,
 133                           ST->getPointerInfo(), isVolatile,
 134 @@ -889,10 +890,9 @@
 135        break;
 136      }
 137      case TargetLowering::Promote: {
 138 -      // Only promote a load of vector type to another.
 139 -      assert(VT.isVector() && "Cannot promote this load!");
 140 -      // Change base type to a different vector type.
 141        EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT);
 142 +      assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
 143 +             "Can only promote loads to same size type");
 144
 145        SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
 146                           LD->isVolatile(), LD->isNonTemporal(),
 147 diff -Nur -x .git llvm-3.2.src/lib/Target/LLVMBuild.txt llvm-r600/lib/Target/LLVMBuild.txt
 148 --- llvm-3.2.src/lib/Target/LLVMBuild.txt       2012-07-16 20:19:46.000000000 +0200
 149 +++ llvm-r600/lib/Target/LLVMBuild.txt  2013-01-25 19:43:57.173383060 +0100
 150 @@ -16,7 +16,7 @@
 151  ;===------------------------------------------------------------------------===;
 152
 153  [common]
 154 -subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC Sparc X86 XCore
 155 +subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC R600 Sparc X86 XCore
 156
 157  ; This is a special group whose required libraries are extended (by llvm-build)
 158  ; with the best execution engine (the native JIT, if available, or the
 159 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUAsmPrinter.cpp llvm-r600/lib/Target/R600/AMDGPUAsmPrinter.cpp
 160 --- llvm-3.2.src/lib/Target/R600/AMDGPUAsmPrinter.cpp   1970-01-01 01:00:00.000000000 +0100
 161 +++ llvm-r600/lib/Target/R600/AMDGPUAsmPrinter.cpp      2013-01-25 19:43:57.423383055 +0100
 162 @@ -0,0 +1,138 @@
  163 +//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer --------------------===//
 164 +//
 165 +//                     The LLVM Compiler Infrastructure
 166 +//
 167 +// This file is distributed under the University of Illinois Open Source
 168 +// License. See LICENSE.TXT for details.
 169 +//
 170 +//===----------------------------------------------------------------------===//
 171 +//
 172 +/// \file
 173 +///
  174 +/// The AMDGPUAsmPrinter is used to print both assembly strings and binary
 175 +/// code.  When passed an MCAsmStreamer it prints assembly and when passed
 176 +/// an MCObjectStreamer it outputs binary code.
 177 +//
 178 +//===----------------------------------------------------------------------===//
 179 +//
 180 +
 181 +
 182 +#include "AMDGPUAsmPrinter.h"
 183 +#include "AMDGPU.h"
 184 +#include "SIMachineFunctionInfo.h"
 185 +#include "SIRegisterInfo.h"
 186 +#include "llvm/MC/MCStreamer.h"
 187 +#include "llvm/Target/TargetLoweringObjectFile.h"
 188 +#include "llvm/Support/TargetRegistry.h"
 189 +
 190 +using namespace llvm;
 191 +
 192 +
 193 +static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
 194 +                                              MCStreamer &Streamer) {
 195 +  return new AMDGPUAsmPrinter(tm, Streamer);
 196 +}
 197 +
 198 +extern "C" void LLVMInitializeR600AsmPrinter() {
 199 +  TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
 200 +}
 201 +
  202 +/// Overridden to avoid the call to EmitFunctionHeader(), which the
  203 +/// MCPureStreamer can't handle; generations above HD6XXX get program info.
  204 +bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  205 +  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
  206 +  if (STM.dumpCode()) {
  207 +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  208 +    MF.dump();
  209 +#endif
  210 +  }
  211 +  SetupMachineFunction(MF);
  212 +  OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
  213 +  if (STM.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
  214 +    EmitProgramInfo(MF);
  215 +  }
  216 +  EmitFunctionBody();
  217 +  return false;
  218 +}
 219 +
 220 +void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) {
 221 +  unsigned MaxSGPR = 0;
 222 +  unsigned MaxVGPR = 0;
 223 +  bool VCCUsed = false;
 224 +  const SIRegisterInfo * RI =
 225 +                static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
 226 +
 227 +  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
 228 +                                                  BB != BB_E; ++BB) {
 229 +    MachineBasicBlock &MBB = *BB;
 230 +    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
 231 +                                                    I != E; ++I) {
 232 +      MachineInstr &MI = *I;
 233 +
 234 +      unsigned numOperands = MI.getNumOperands();
 235 +      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
 236 +        MachineOperand & MO = MI.getOperand(op_idx);
 237 +        unsigned maxUsed;
 238 +        unsigned width = 0;
 239 +        bool isSGPR = false;
 240 +        unsigned reg;
 241 +        unsigned hwReg;
 242 +        if (!MO.isReg()) {
 243 +          continue;
 244 +        }
 245 +        reg = MO.getReg();
 246 +        if (reg == AMDGPU::VCC) {
 247 +          VCCUsed = true;
 248 +          continue;
 249 +        }
 250 +        switch (reg) {
 251 +        default: break;
 252 +        case AMDGPU::EXEC:
 253 +        case AMDGPU::SI_LITERAL_CONSTANT:
 254 +        case AMDGPU::SREG_LIT_0:
 255 +        case AMDGPU::M0:
 256 +          continue;
 257 +        }
 258 +
 259 +        if (AMDGPU::SReg_32RegClass.contains(reg)) {
 260 +          isSGPR = true;
 261 +          width = 1;
 262 +        } else if (AMDGPU::VReg_32RegClass.contains(reg)) {
 263 +          isSGPR = false;
 264 +          width = 1;
 265 +        } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
 266 +          isSGPR = true;
 267 +          width = 2;
 268 +        } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
 269 +          isSGPR = false;
 270 +          width = 2;
 271 +        } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
 272 +          isSGPR = true;
 273 +          width = 4;
 274 +        } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
 275 +          isSGPR = false;
 276 +          width = 4;
 277 +        } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
 278 +          isSGPR = true;
 279 +          width = 8;
 280 +        } else {
 281 +          assert(!"Unknown register class");
 282 +        }
 283 +        hwReg = RI->getEncodingValue(reg);
 284 +        maxUsed = hwReg + width - 1;
 285 +        if (isSGPR) {
 286 +          MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
 287 +        } else {
 288 +          MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
 289 +        }
 290 +      }
 291 +    }
 292 +  }
 293 +  if (VCCUsed) {
 294 +    MaxSGPR += 2;
 295 +  }
 296 +  SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
 297 +  OutStreamer.EmitIntValue(MaxSGPR + 1, 4);
 298 +  OutStreamer.EmitIntValue(MaxVGPR + 1, 4);
 299 +  OutStreamer.EmitIntValue(MFI->SPIPSInputAddr, 4);
 300 +}
 301 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUAsmPrinter.h llvm-r600/lib/Target/R600/AMDGPUAsmPrinter.h
 302 --- llvm-3.2.src/lib/Target/R600/AMDGPUAsmPrinter.h     1970-01-01 01:00:00.000000000 +0100
 303 +++ llvm-r600/lib/Target/R600/AMDGPUAsmPrinter.h        2013-01-25 19:43:57.426716388 +0100
 304 @@ -0,0 +1,44 @@
 305 +//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code -------------------===//
 306 +//
 307 +//                     The LLVM Compiler Infrastructure
 308 +//
 309 +// This file is distributed under the University of Illinois Open Source
 310 +// License. See LICENSE.TXT for details.
 311 +//
 312 +//===----------------------------------------------------------------------===//
 313 +//
 314 +/// \file
 315 +/// \brief AMDGPU Assembly printer class.
 316 +//
 317 +//===----------------------------------------------------------------------===//
 318 +
 319 +#ifndef AMDGPU_ASMPRINTER_H
 320 +#define AMDGPU_ASMPRINTER_H
 321 +
 322 +#include "llvm/CodeGen/AsmPrinter.h"
 323 +
 324 +namespace llvm {
 325 +
 326 +class AMDGPUAsmPrinter : public AsmPrinter {
 327 +
 328 +public:
 329 +  explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
 330 +    : AsmPrinter(TM, Streamer) { }
 331 +
 332 +  virtual bool runOnMachineFunction(MachineFunction &MF);
 333 +
 334 +  virtual const char *getPassName() const {
 335 +    return "AMDGPU Assembly Printer";
 336 +  }
 337 +
 338 +  /// \brief Emit register usage information so that the GPU driver
  339 +  /// can correctly set up the GPU state.
 340 +  void EmitProgramInfo(MachineFunction &MF);
 341 +
 342 +  /// Implemented in AMDGPUMCInstLower.cpp
 343 +  virtual void EmitInstruction(const MachineInstr *MI);
 344 +};
 345 +
  346 +} // End namespace llvm
 347 +
 348 +#endif //AMDGPU_ASMPRINTER_H
 349 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUCodeEmitter.h llvm-r600/lib/Target/R600/AMDGPUCodeEmitter.h
 350 --- llvm-3.2.src/lib/Target/R600/AMDGPUCodeEmitter.h    1970-01-01 01:00:00.000000000 +0100
 351 +++ llvm-r600/lib/Target/R600/AMDGPUCodeEmitter.h       2013-01-25 19:43:57.426716388 +0100
 352 @@ -0,0 +1,49 @@
 353 +//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===//
 354 +//
 355 +//                     The LLVM Compiler Infrastructure
 356 +//
 357 +// This file is distributed under the University of Illinois Open Source
 358 +// License. See LICENSE.TXT for details.
 359 +//
 360 +//===----------------------------------------------------------------------===//
 361 +//
 362 +/// \file
 363 +/// \brief CodeEmitter interface for R600 and SI codegen.
 364 +//
 365 +//===----------------------------------------------------------------------===//
 366 +
 367 +#ifndef AMDGPUCODEEMITTER_H
 368 +#define AMDGPUCODEEMITTER_H
 369 +
 370 +namespace llvm {
 371 +
 372 +class AMDGPUCodeEmitter {
 373 +public:
 374 +  uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
 375 +  virtual uint64_t getMachineOpValue(const MachineInstr &MI,
 376 +                                   const MachineOperand &MO) const { return 0; }
 377 +  virtual unsigned GPR4AlignEncode(const MachineInstr  &MI,
 378 +                                     unsigned OpNo) const {
 379 +    return 0;
 380 +  }
 381 +  virtual unsigned GPR2AlignEncode(const MachineInstr &MI,
 382 +                                   unsigned OpNo) const {
 383 +    return 0;
 384 +  }
 385 +  virtual uint64_t VOPPostEncode(const MachineInstr &MI,
 386 +                                 uint64_t Value) const {
 387 +    return Value;
 388 +  }
 389 +  virtual uint64_t i32LiteralEncode(const MachineInstr &MI,
 390 +                                    unsigned OpNo) const {
 391 +    return 0;
 392 +  }
 393 +  virtual uint32_t SMRDmemriEncode(const MachineInstr &MI, unsigned OpNo)
 394 +                                                                   const {
 395 +    return 0;
 396 +  }
 397 +};
 398 +
 399 +} // End namespace llvm
 400 +
 401 +#endif // AMDGPUCODEEMITTER_H
 402 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUConvertToISA.cpp llvm-r600/lib/Target/R600/AMDGPUConvertToISA.cpp
 403 --- llvm-3.2.src/lib/Target/R600/AMDGPUConvertToISA.cpp 1970-01-01 01:00:00.000000000 +0100
 404 +++ llvm-r600/lib/Target/R600/AMDGPUConvertToISA.cpp    2013-01-25 19:43:57.426716388 +0100
 405 @@ -0,0 +1,62 @@
 406 +//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===//
 407 +//
 408 +//                     The LLVM Compiler Infrastructure
 409 +//
 410 +// This file is distributed under the University of Illinois Open Source
 411 +// License. See LICENSE.TXT for details.
 412 +//
 413 +//===----------------------------------------------------------------------===//
 414 +//
 415 +/// \file
 416 +/// \brief This pass lowers AMDIL machine instructions to the appropriate
 417 +/// hardware instructions.
 418 +//
 419 +//===----------------------------------------------------------------------===//
 420 +
 421 +#include "AMDGPU.h"
 422 +#include "AMDGPUInstrInfo.h"
 423 +#include "llvm/CodeGen/MachineFunctionPass.h"
 424 +
 425 +using namespace llvm;
 426 +
 427 +namespace {
 428 +
 429 +class AMDGPUConvertToISAPass : public MachineFunctionPass {
 430 +
 431 +private:
 432 +  static char ID;
 433 +  TargetMachine &TM;
 434 +
 435 +public:
 436 +  AMDGPUConvertToISAPass(TargetMachine &tm) :
 437 +    MachineFunctionPass(ID), TM(tm) { }
 438 +
 439 +  virtual bool runOnMachineFunction(MachineFunction &MF);
 440 +
 441 +  virtual const char *getPassName() const {return "AMDGPU Convert to ISA";}
 442 +
 443 +};
 444 +
 445 +} // End anonymous namespace
 446 +
 447 +char AMDGPUConvertToISAPass::ID = 0;
 448 +
 449 +FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) {
 450 +  return new AMDGPUConvertToISAPass(tm);
 451 +}
 452 +
 453 +bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF) {
 454 +  const AMDGPUInstrInfo * TII =
 455 +                      static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo());
 456 +
 457 +  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
 458 +                                                  BB != BB_E; ++BB) {
 459 +    MachineBasicBlock &MBB = *BB;
 460 +    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
 461 +                                                      I != E; ++I) {
 462 +      MachineInstr &MI = *I;
 463 +      TII->convertToISA(MI, MF, MBB.findDebugLoc(I));
 464 +    }
 465 +  }
 466 +  return false;
 467 +}
 468 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPU.h llvm-r600/lib/Target/R600/AMDGPU.h
 469 --- llvm-3.2.src/lib/Target/R600/AMDGPU.h       1970-01-01 01:00:00.000000000 +0100
 470 +++ llvm-r600/lib/Target/R600/AMDGPU.h  2013-01-25 19:43:57.423383055 +0100
 471 @@ -0,0 +1,51 @@
 472 +//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=//
 473 +//
 474 +//                     The LLVM Compiler Infrastructure
 475 +//
 476 +// This file is distributed under the University of Illinois Open Source
 477 +// License. See LICENSE.TXT for details.
 478 +//
 479 +/// \file
 480 +//===----------------------------------------------------------------------===//
 481 +
 482 +#ifndef AMDGPU_H
 483 +#define AMDGPU_H
 484 +
 485 +#include "AMDGPUTargetMachine.h"
 486 +#include "llvm/Support/TargetRegistry.h"
 487 +#include "llvm/Target/TargetMachine.h"
 488 +
 489 +namespace llvm {
 490 +
 491 +class FunctionPass;
 492 +class AMDGPUTargetMachine;
 493 +
 494 +// R600 Passes
 495 +FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
 496 +FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
 497 +FunctionPass *createR600LowerConstCopy(TargetMachine &tm);
 498 +
 499 +// SI Passes
 500 +FunctionPass *createSIAnnotateControlFlowPass();
 501 +FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
 502 +FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
 503 +FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 504 +FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm);
 505 +FunctionPass *createSIInsertWaits(TargetMachine &tm);
 506 +
 507 +// Passes common to R600 and SI
 508 +Pass *createAMDGPUStructurizeCFGPass();
 509 +FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
 510 +
 511 +} // End namespace llvm
 512 +
 513 +namespace ShaderType {
 514 +  enum Type {
 515 +    PIXEL = 0,
 516 +    VERTEX = 1,
 517 +    GEOMETRY = 2,
 518 +    COMPUTE = 3
 519 +  };
 520 +}
 521 +
 522 +#endif // AMDGPU_H
 523 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.cpp llvm-r600/lib/Target/R600/AMDGPUInstrInfo.cpp
 524 --- llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.cpp    1970-01-01 01:00:00.000000000 +0100
 525 +++ llvm-r600/lib/Target/R600/AMDGPUInstrInfo.cpp       2013-01-25 19:43:57.426716388 +0100
 526 @@ -0,0 +1,257 @@
 527 +//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===//
 528 +//
 529 +//                     The LLVM Compiler Infrastructure
 530 +//
 531 +// This file is distributed under the University of Illinois Open Source
 532 +// License. See LICENSE.TXT for details.
 533 +//
 534 +//===----------------------------------------------------------------------===//
 535 +//
 536 +/// \file
 537 +/// \brief Implementation of the TargetInstrInfo class that is common to all
 538 +/// AMD GPUs.
 539 +//
 540 +//===----------------------------------------------------------------------===//
 541 +
 542 +#include "AMDGPUInstrInfo.h"
 543 +#include "AMDGPURegisterInfo.h"
 544 +#include "AMDGPUTargetMachine.h"
 545 +#include "AMDIL.h"
 546 +#include "llvm/CodeGen/MachineFrameInfo.h"
 547 +#include "llvm/CodeGen/MachineInstrBuilder.h"
 548 +#include "llvm/CodeGen/MachineRegisterInfo.h"
 549 +
 550 +#define GET_INSTRINFO_CTOR
 551 +#include "AMDGPUGenInstrInfo.inc"
 552 +
 553 +using namespace llvm;
 554 +
 555 +AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm)
 556 +  : AMDGPUGenInstrInfo(0,0), RI(tm, *this), TM(tm) { }
 557 +
 558 +const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const {
 559 +  return RI;
 560 +}
 561 +
 562 +bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
 563 +                                           unsigned &SrcReg, unsigned &DstReg,
 564 +                                           unsigned &SubIdx) const {
 565 +// TODO: Implement this function
 566 +  return false;
 567 +}
 568 +
 569 +unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
 570 +                                             int &FrameIndex) const {
 571 +// TODO: Implement this function
 572 +  return 0;
 573 +}
 574 +
 575 +unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
 576 +                                                   int &FrameIndex) const {
 577 +// TODO: Implement this function
 578 +  return 0;
 579 +}
 580 +
 581 +bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
 582 +                                          const MachineMemOperand *&MMO,
 583 +                                          int &FrameIndex) const {
 584 +// TODO: Implement this function
 585 +  return false;
 586 +}
 587 +unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI,
 588 +                                              int &FrameIndex) const {
 589 +// TODO: Implement this function
 590 +  return 0;
 591 +}
 592 +unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI,
 593 +                                                    int &FrameIndex) const {
 594 +// TODO: Implement this function
 595 +  return 0;
 596 +}
 597 +bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI,
 598 +                                           const MachineMemOperand *&MMO,
 599 +                                           int &FrameIndex) const {
 600 +// TODO: Implement this function
 601 +  return false;
 602 +}
 603 +
 604 +MachineInstr *
 605 +AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
 606 +                                      MachineBasicBlock::iterator &MBBI,
 607 +                                      LiveVariables *LV) const {
 608 +// TODO: Implement this function
 609 +  return NULL;
 610 +}
 611 +bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter,
 612 +                                        MachineBasicBlock &MBB) const {
 613 +  while (iter != MBB.end()) {
 614 +    switch (iter->getOpcode()) {
 615 +    default:
 616 +      break;
 617 +    case AMDGPU::BRANCH_COND_i32:
 618 +    case AMDGPU::BRANCH_COND_f32:
 619 +    case AMDGPU::BRANCH:
 620 +      return true;
 621 +    };
 622 +    ++iter;
 623 +  }
 624 +  return false;
 625 +}
 626 +
 627 +MachineBasicBlock::iterator skipFlowControl(MachineBasicBlock *MBB) {
 628 +  MachineBasicBlock::iterator tmp = MBB->end();
 629 +  if (!MBB->size()) {
 630 +    return MBB->end();
 631 +  }
 632 +  while (--tmp) {
 633 +    if (tmp->getOpcode() == AMDGPU::ENDLOOP
 634 +        || tmp->getOpcode() == AMDGPU::ENDIF
 635 +        || tmp->getOpcode() == AMDGPU::ELSE) {
 636 +      if (tmp == MBB->begin()) {
 637 +        return tmp;
 638 +      } else {
 639 +        continue;
 640 +      }
 641 +    }  else {
 642 +      return ++tmp;
 643 +    }
 644 +  }
 645 +  return MBB->end();
 646 +}
 647 +
 648 +void
 649 +AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
 650 +                                    MachineBasicBlock::iterator MI,
 651 +                                    unsigned SrcReg, bool isKill,
 652 +                                    int FrameIndex,
 653 +                                    const TargetRegisterClass *RC,
 654 +                                    const TargetRegisterInfo *TRI) const {
 655 +  assert(!"Not Implemented");
 656 +}
 657 +
 658 +void
 659 +AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
 660 +                                     MachineBasicBlock::iterator MI,
 661 +                                     unsigned DestReg, int FrameIndex,
 662 +                                     const TargetRegisterClass *RC,
 663 +                                     const TargetRegisterInfo *TRI) const {
 664 +  assert(!"Not Implemented");
 665 +}
 666 +
 667 +MachineInstr *
 668 +AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
 669 +                                      MachineInstr *MI,
 670 +                                      const SmallVectorImpl<unsigned> &Ops,
 671 +                                      int FrameIndex) const {
 672 +// TODO: Implement this function
 673 +  return 0;
 674 +}
 675 +MachineInstr*
 676 +AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
 677 +                                      MachineInstr *MI,
 678 +                                      const SmallVectorImpl<unsigned> &Ops,
 679 +                                      MachineInstr *LoadMI) const {
 680 +  // TODO: Implement this function
 681 +  return 0;
 682 +}
 683 +bool
 684 +AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
 685 +                                     const SmallVectorImpl<unsigned> &Ops) const {
 686 +  // TODO: Implement this function
 687 +  return false;
 688 +}
 689 +bool
 690 +AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
 691 +                                 unsigned Reg, bool UnfoldLoad,
 692 +                                 bool UnfoldStore,
 693 +                                 SmallVectorImpl<MachineInstr*> &NewMIs) const {
 694 +  // TODO: Implement this function
 695 +  return false;
 696 +}
 697 +
 698 +bool
 699 +AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
 700 +                                    SmallVectorImpl<SDNode*> &NewNodes) const {
 701 +  // TODO: Implement this function
 702 +  return false;
 703 +}
 704 +
 705 +unsigned
 706 +AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
 707 +                                           bool UnfoldLoad, bool UnfoldStore,
 708 +                                           unsigned *LoadRegIndex) const {
 709 +  // TODO: Implement this function
 710 +  return 0;
 711 +}
 712 +
  713 +bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
  714 +                                             int64_t Offset1, int64_t Offset2,
  715 +                                             unsigned NumLoads) const {
  716 +  assert(Offset2 > Offset1
  717 +         && "Second offset should be larger than first offset!");
  718 +  // If we have fewer than 16 loads in a row and their offsets differ by less
  719 +  // than 16, then schedule them together.
  720 +  // TODO: Make the loads schedule near if they fit in a cacheline.
  721 +  return (NumLoads < 16 && (Offset2 - Offset1) < 16);
  722 +}
 723 +
 724 +bool
 725 +AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
 726 +  const {
 727 +  // TODO: Implement this function
 728 +  return true;
 729 +}
 730 +void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB,
 731 +                                MachineBasicBlock::iterator MI) const {
 732 +  // TODO: Implement this function
 733 +}
 734 +
 735 +bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const {
 736 +  // TODO: Implement this function
 737 +  return false;
 738 +}
 739 +bool
 740 +AMDGPUInstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
 741 +                                  const SmallVectorImpl<MachineOperand> &Pred2)
 742 +  const {
 743 +  // TODO: Implement this function
 744 +  return false;
 745 +}
 746 +
 747 +bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI,
 748 +                                      std::vector<MachineOperand> &Pred) const {
 749 +  // TODO: Implement this function
 750 +  return false;
 751 +}
 752 +
  753 +bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const {
  754 +  // TODO: Implement this function; for now defer to the instr descriptor.
  755 +  return MI->getDesc().isPredicable();
  756 +}
 757 +
 758 +bool
 759 +AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
 760 +  // TODO: Implement this function
 761 +  return true;
 762 +}
 763 +
 764 +void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
 765 +    DebugLoc DL) const { // DL is currently unused.
 766 +  MachineRegisterInfo &MRI = MF.getRegInfo();
 767 +  const AMDGPURegisterInfo & RI = getRegisterInfo();
 768 +
 769 +  for (unsigned i = 0; i < MI.getNumOperands(); i++) {
 770 +    MachineOperand &MO = MI.getOperand(i);
 771 +    // Re-class every virtual-register def to a class the ISA supports.
 772 +    if (MO.isReg() && MO.isDef()) {
 773 +      if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
 774 +        const TargetRegisterClass * oldRegClass = MRI.getRegClass(MO.getReg());
 775 +        const TargetRegisterClass * newRegClass = RI.getISARegClass(oldRegClass);
 776 +
 777 +        assert(newRegClass); // A null ISA class is treated as a bug.
 778 +
 779 +        MRI.setRegClass(MO.getReg(), newRegClass);
 780 +      }
 781 +    }
 782 +  }
 783 +}
 784 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.h llvm-r600/lib/Target/R600/AMDGPUInstrInfo.h
 785 --- llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.h      1970-01-01 01:00:00.000000000 +0100
 786 +++ llvm-r600/lib/Target/R600/AMDGPUInstrInfo.h 2013-01-25 19:43:57.430049721 +0100
 787 @@ -0,0 +1,149 @@
 788 +//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===//
 789 +//
 790 +//                     The LLVM Compiler Infrastructure
 791 +//
 792 +// This file is distributed under the University of Illinois Open Source
 793 +// License. See LICENSE.TXT for details.
 794 +//
 795 +//===----------------------------------------------------------------------===//
 796 +//
 797 +/// \file
 798 +/// \brief Contains the definition of a TargetInstrInfo class that is common
 799 +/// to all AMD GPUs.
 800 +//
 801 +//===----------------------------------------------------------------------===//
 802 +
 803 +#ifndef AMDGPUINSTRUCTIONINFO_H
 804 +#define AMDGPUINSTRUCTIONINFO_H
 805 +
 806 +#include "AMDGPURegisterInfo.h"
 807 +#include "AMDGPUInstrInfo.h"
 808 +#include "llvm/Target/TargetInstrInfo.h"
 809 +
 810 +#include <map>
 811 +
 812 +#define GET_INSTRINFO_HEADER
 813 +#define GET_INSTRINFO_ENUM
 814 +#include "AMDGPUGenInstrInfo.inc"
 815 +
 816 +#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT
 817 +#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT
 818 +#define OPCODE_IS_ZERO AMDGPU::PRED_SETE
 819 +#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE
 820 +
 821 +namespace llvm {
 822 +
 823 +class AMDGPUTargetMachine;
 824 +class MachineFunction;
 825 +class MachineInstr;
 826 +class MachineInstrBuilder;
 827 +
 828 +class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
 829 +private:
 830 +  const AMDGPURegisterInfo RI;
 831 +  TargetMachine &TM;
 832 +  bool getNextBranchInstr(MachineBasicBlock::iterator &iter,
 833 +                          MachineBasicBlock &MBB) const;
 834 +public:
 835 +  explicit AMDGPUInstrInfo(TargetMachine &tm);
 836 +  /// Pure virtual: derived classes return their own register info.
 837 +  virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0;
 838 +
 839 +  bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
 840 +                             unsigned &DstReg, unsigned &SubIdx) const;
 841 +
 842 +  unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
 843 +  unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
 844 +                                     int &FrameIndex) const;
 845 +  bool hasLoadFromStackSlot(const MachineInstr *MI,
 846 +                            const MachineMemOperand *&MMO,
 847 +                            int &FrameIndex) const;
 848 +  unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
 849 +  unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI,
 850 +                                      int &FrameIndex) const;
 851 +  bool hasStoreFromStackSlot(const MachineInstr *MI,
 852 +                             const MachineMemOperand *&MMO,
 853 +                             int &FrameIndex) const;
 854 +
 855 +  MachineInstr *
 856 +  convertToThreeAddress(MachineFunction::iterator &MFI,
 857 +                        MachineBasicBlock::iterator &MBBI,
 858 +                        LiveVariables *LV) const;
 859 +
 860 +  /// Pure virtual: must be implemented by derived classes.
 861 +  virtual void copyPhysReg(MachineBasicBlock &MBB,
 862 +                           MachineBasicBlock::iterator MI, DebugLoc DL,
 863 +                           unsigned DestReg, unsigned SrcReg,
 864 +                           bool KillSrc) const = 0;
 865 +
 866 +  void storeRegToStackSlot(MachineBasicBlock &MBB,
 867 +                           MachineBasicBlock::iterator MI,
 868 +                           unsigned SrcReg, bool isKill, int FrameIndex,
 869 +                           const TargetRegisterClass *RC,
 870 +                           const TargetRegisterInfo *TRI) const;
 871 +  void loadRegFromStackSlot(MachineBasicBlock &MBB,
 872 +                            MachineBasicBlock::iterator MI,
 873 +                            unsigned DestReg, int FrameIndex,
 874 +                            const TargetRegisterClass *RC,
 875 +                            const TargetRegisterInfo *TRI) const;
 876 +
 877 +protected:
 878 +  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
 879 +                                      MachineInstr *MI,
 880 +                                      const SmallVectorImpl<unsigned> &Ops,
 881 +                                      int FrameIndex) const;
 882 +  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
 883 +                                      MachineInstr *MI,
 884 +                                      const SmallVectorImpl<unsigned> &Ops,
 885 +                                      MachineInstr *LoadMI) const;
 886 +public:
 887 +  bool canFoldMemoryOperand(const MachineInstr *MI,
 888 +                            const SmallVectorImpl<unsigned> &Ops) const;
 889 +  bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
 890 +                           unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
 891 +                           SmallVectorImpl<MachineInstr *> &NewMIs) const;
 892 +  bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
 893 +                           SmallVectorImpl<SDNode *> &NewNodes) const;
 894 +  unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
 895 +                                      bool UnfoldLoad, bool UnfoldStore,
 896 +                                      unsigned *LoadRegIndex = 0) const;
 897 +  bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
 898 +                               int64_t Offset1, int64_t Offset2,
 899 +                               unsigned NumLoads) const;
 900 +
 901 +  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
 902 +  void insertNoop(MachineBasicBlock &MBB,
 903 +                  MachineBasicBlock::iterator MI) const;
 904 +  bool isPredicated(const MachineInstr *MI) const;
 905 +  bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
 906 +                         const SmallVectorImpl<MachineOperand> &Pred2) const;
 907 +  bool DefinesPredicate(MachineInstr *MI,
 908 +                        std::vector<MachineOperand> &Pred) const;
 909 +  bool isPredicable(MachineInstr *MI) const;
 910 +  bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
 911 +
 912 +  // Helper functions that check the opcode for status information
 913 +  bool isLoadInst(llvm::MachineInstr *MI) const;
 914 +  bool isExtLoadInst(llvm::MachineInstr *MI) const;
 915 +  bool isSWSExtLoadInst(llvm::MachineInstr *MI) const;
 916 +  bool isSExtLoadInst(llvm::MachineInstr *MI) const;
 917 +  bool isZExtLoadInst(llvm::MachineInstr *MI) const;
 918 +  bool isAExtLoadInst(llvm::MachineInstr *MI) const;
 919 +  bool isStoreInst(llvm::MachineInstr *MI) const;
 920 +  bool isTruncStoreInst(llvm::MachineInstr *MI) const;
 921 +  // Hooks that derived classes must implement (pure virtual).
 922 +  virtual MachineInstr* getMovImmInstr(MachineFunction *MF, unsigned DstReg,
 923 +                                       int64_t Imm) const = 0;
 924 +  virtual unsigned getIEQOpcode() const = 0;
 925 +  virtual bool isMov(unsigned opcode) const = 0;
 926 +
 927 +  /// \brief Convert the AMDIL MachineInstr to a supported ISA
 928 +  /// MachineInstr
 929 +  virtual void convertToISA(MachineInstr & MI, MachineFunction &MF,
 930 +    DebugLoc DL) const;
 931 +
 932 +};
 933 +
 934 +} // End llvm namespace
 935 +
 936 +#endif // AMDGPUINSTRUCTIONINFO_H
 937 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.td llvm-r600/lib/Target/R600/AMDGPUInstrInfo.td
 938 --- llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.td     1970-01-01 01:00:00.000000000 +0100
 939 +++ llvm-r600/lib/Target/R600/AMDGPUInstrInfo.td        2013-01-25 19:43:57.430049721 +0100
 940 @@ -0,0 +1,74 @@
 941 +//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===//
 942 +//
 943 +//                     The LLVM Compiler Infrastructure
 944 +//
 945 +// This file is distributed under the University of Illinois Open Source
 946 +// License. See LICENSE.TXT for details.
 947 +//
 948 +//===----------------------------------------------------------------------===//
 949 +//
 950 +// This file contains DAG node definitions for the AMDGPU target.
 951 +//
 952 +//===----------------------------------------------------------------------===//
 953 +
 954 +//===----------------------------------------------------------------------===//
 955 +// AMDGPU DAG Profiles
 956 +//===----------------------------------------------------------------------===//
 957 +
 958 +def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ // 1 result, 3 operands
 959 +  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
 960 +]>; // Values 0, 1 and 2 share one integer type; value 3 is any integer.
 961 +
 962 +//===----------------------------------------------------------------------===//
 963 +// AMDGPU DAG Nodes
 964 +//
 965 +
 966 +// out = (((a << 32) | b) >> c)
 967 +//
 968 +// Can be used to optimize rotl:
 969 +// rotl(a, b) = bitalign(a, a, 32 - b)
 970 +def AMDGPUbitalign : SDNode<"AMDGPUISD::BITALIGN", AMDGPUDTIntTernaryOp>;
 971 +
 972 +// The argument to this node is a dword address.
 973 +def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
 974 +
 975 +// out = a - floor(a), i.e. the fractional part of a
 976 +def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
 977 +
 978 +// out = max(a, b), where a and b are floats
 979 +def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
 980 +  [SDNPCommutative, SDNPAssociative]
 981 +>;
 982 +
 983 +// out = max(a, b), where a and b are signed ints
 984 +def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
 985 +  [SDNPCommutative, SDNPAssociative]
 986 +>;
 987 +
 988 +// out = max(a, b), where a and b are unsigned ints
 989 +def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp,
 990 +  [SDNPCommutative, SDNPAssociative]
 991 +>;
 992 +
 993 +// out = min(a, b), where a and b are floats
 994 +def AMDGPUfmin : SDNode<"AMDGPUISD::FMIN", SDTFPBinOp,
 995 +  [SDNPCommutative, SDNPAssociative]
 996 +>;
 997 +
 998 +// out = min(a, b), where a and b are signed ints
 999 +def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp,
1000 +  [SDNPCommutative, SDNPAssociative]
1001 +>;
1002 +
1003 +// out = min(a, b), where a and b are unsigned ints
1004 +def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp,
1005 +  [SDNPCommutative, SDNPAssociative]
1006 +>;
1007 +
1008 +// urecip - This operation is a helper for integer division. It returns the
1009 +// result of 1 / a as a fractional unsigned integer.
1010 +// out = (2^32 / a) + e
1011 +// e is rounding error
1012 +def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
1013 +
1014 +def fpow : SDNode<"ISD::FPOW", SDTFPBinOp>; // generic ISD node, not AMDGPUISD
1015 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUInstructions.td llvm-r600/lib/Target/R600/AMDGPUInstructions.td
1016 --- llvm-3.2.src/lib/Target/R600/AMDGPUInstructions.td  1970-01-01 01:00:00.000000000 +0100
1017 +++ llvm-r600/lib/Target/R600/AMDGPUInstructions.td     2013-01-25 19:43:57.430049721 +0100
1018 @@ -0,0 +1,190 @@
1019 +//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===//
1020 +//
1021 +//                     The LLVM Compiler Infrastructure
1022 +//
1023 +// This file is distributed under the University of Illinois Open Source
1024 +// License. See LICENSE.TXT for details.
1025 +//
1026 +//===----------------------------------------------------------------------===//
1027 +//
1028 +// This file contains instruction defs that are common to all hw codegen
1029 +// targets.
1030 +//
1031 +//===----------------------------------------------------------------------===//
1032 +
1033 +class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction {
1034 +  field bits<16> AMDILOp = 0;
1035 +  field bits<3> Gen = 0;
1036 +  // Forward the class parameters to the inherited Instruction fields.
1037 +  let Namespace = "AMDGPU";
1038 +  let OutOperandList = outs;
1039 +  let InOperandList = ins;
1040 +  let AsmString = asm;
1041 +  let Pattern = pattern;
1042 +  let Itinerary = NullALU;
1043 +  let TSFlags{42-40} = Gen; // Gen occupies TSFlags bits 40-42.
1044 +  let TSFlags{63-48} = AMDILOp; // AMDILOp occupies TSFlags bits 48-63.
1045 +}
1046 +
1047 +class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
1048 +    : AMDGPUInst<outs, ins, asm, pattern> {
1049 +  // 32-bit field that defaults to all ones.
1050 +  field bits<32> Inst = 0xffffffff;
1051 +
1052 +}
1053 +
1054 +def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; // i32, default 0
1055 +
1056 +def COND_EQ : PatLeaf < // Matches SETOEQ, SETUEQ and SETEQ.
1057 +  (cond),
1058 +  [{switch(N->get()){{default: return false;
1059 +                     case ISD::SETOEQ: case ISD::SETUEQ:
1060 +                     case ISD::SETEQ: return true;}}}]
1061 +>;
1062 +
1063 +def COND_NE : PatLeaf < // Matches SETONE, SETUNE and SETNE.
1064 +  (cond),
1065 +  [{switch(N->get()){{default: return false;
1066 +                     case ISD::SETONE: case ISD::SETUNE:
1067 +                     case ISD::SETNE: return true;}}}]
1068 +>;
1069 +def COND_GT : PatLeaf < // Matches SETOGT, SETUGT and SETGT.
1070 +  (cond),
1071 +  [{switch(N->get()){{default: return false;
1072 +                     case ISD::SETOGT: case ISD::SETUGT:
1073 +                     case ISD::SETGT: return true;}}}]
1074 +>;
1075 +
1076 +def COND_GE : PatLeaf < // Matches SETOGE, SETUGE and SETGE.
1077 +  (cond),
1078 +  [{switch(N->get()){{default: return false;
1079 +                     case ISD::SETOGE: case ISD::SETUGE:
1080 +                     case ISD::SETGE: return true;}}}]
1081 +>;
1082 +
1083 +def COND_LT : PatLeaf < // Matches SETOLT, SETULT and SETLT.
1084 +  (cond),
1085 +  [{switch(N->get()){{default: return false;
1086 +                     case ISD::SETOLT: case ISD::SETULT:
1087 +                     case ISD::SETLT: return true;}}}]
1088 +>;
1089 +
1090 +def COND_LE : PatLeaf < // Matches SETOLE, SETULE and SETLE.
1091 +  (cond),
1092 +  [{switch(N->get()){{default: return false;
1093 +                     case ISD::SETOLE: case ISD::SETULE:
1094 +                     case ISD::SETLE: return true;}}}]
1095 +>;
1096 +
1097 +//===----------------------------------------------------------------------===//
1098 +// Load/Store Pattern Fragments
1099 +//===----------------------------------------------------------------------===//
1100 +
1101 +def zextloadi8_global : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{
1102 +    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
1103 +}]>; // zextloadi8 restricted to loads that isGlobalLoad() accepts.
1104 +
1105 +class Constants { // Values are IEEE-754 single-precision bit patterns.
1106 +int TWO_PI = 0x40c90fdb; // 2 * pi (~6.2831855)
1107 +int PI = 0x40490fdb; // pi (~3.1415927)
1108 +int TWO_PI_INV = 0x3e22f983; // 1 / (2 * pi) (~0.15915494)
1109 +}
1110 +def CONST : Constants;
1111 +
1112 +def FP_ZERO : PatLeaf < // Matches a floating-point immediate that is zero.
1113 +  (fpimm),
1114 +  [{return N->getValueAPF().isZero();}]
1115 +>;
1116 +
1117 +def FP_ONE : PatLeaf < // Matches a floating-point immediate of exactly 1.0.
1118 +  (fpimm),
1119 +  [{return N->isExactlyValue(1.0);}]
1120 +>;
1121 +
1122 +let isCodeGenOnly = 1, isPseudo = 1, usesCustomInserter = 1  in {
1123 +
1124 +class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
1125 +  (outs rc:$dst),
1126 +  (ins rc:$src0),
1127 +  "CLAMP $dst, $src0",
1128 +  [(set rc:$dst, (int_AMDIL_clamp rc:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
1129 +>;
1130 +
1131 +class FABS <RegisterClass rc> : AMDGPUShaderInst <
1132 +  (outs rc:$dst),
1133 +  (ins rc:$src0),
1134 +  "FABS $dst, $src0",
1135 +  [(set rc:$dst, (fabs rc:$src0))]
1136 +>;
1137 +
1138 +class FNEG <RegisterClass rc> : AMDGPUShaderInst <
1139 +  (outs rc:$dst),
1140 +  (ins rc:$src0),
1141 +  "FNEG $dst, $src0",
1142 +  [(set rc:$dst, (fneg rc:$src0))]
1143 +>;
1144 +
1145 +def SHADER_TYPE : AMDGPUShaderInst <
1146 +  (outs),
1147 +  (ins i32imm:$type),
1148 +  "SHADER_TYPE $type",
1149 +  [(int_AMDGPU_shader_type imm:$type)]
1150 +>;
1151 +
1152 +} // End isCodeGenOnly = 1, isPseudo = 1, usesCustomInserter = 1
1153 +
1154 +/* Generic helper patterns for intrinsics */
1155 +/* -------------------------------------- */
1156 +
1157 +class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul,
1158 +                  RegisterClass rc> : Pat <
1159 +  (fpow rc:$src0, rc:$src1),
1160 +  (exp_ieee (mul rc:$src1, (log_ieee rc:$src0)))
1161 +>; // fpow(src0, src1) is selected as exp_ieee(mul(src1, log_ieee(src0))).
1162 +
1163 +/* Other helper patterns */
1164 +/* --------------------- */
1165 +
1166 +/* Extract element pattern: vector_extract at sub_idx -> EXTRACT_SUBREG sub_reg */
1167 +class Extract_Element <ValueType sub_type, ValueType vec_type,
1168 +                     RegisterClass vec_class, int sub_idx,
1169 +                     SubRegIndex sub_reg>: Pat<
1170 +  (sub_type (vector_extract (vec_type vec_class:$src), sub_idx)),
1171 +  (EXTRACT_SUBREG vec_class:$src, sub_reg)
1172 +>;
1173 +
1174 +/* Insert element pattern: vector_insert at sub_idx -> INSERT_SUBREG sub_reg */
1175 +class Insert_Element <ValueType elem_type, ValueType vec_type,
1176 +                      RegisterClass elem_class, RegisterClass vec_class,
1177 +                      int sub_idx, SubRegIndex sub_reg> : Pat <
1178 +
1179 +  (vec_type (vector_insert (vec_type vec_class:$vec),
1180 +                           (elem_type elem_class:$elem), sub_idx)),
1181 +  (INSERT_SUBREG vec_class:$vec, elem_class:$elem, sub_reg)
1182 +>;
1183 +
1184 +// Vector build: four INSERT_SUBREGs (sel_x..sel_w) into an IMPLICIT_DEF
1185 +class Vector_Build <ValueType vecType, RegisterClass vectorClass,
1186 +                    ValueType elemType, RegisterClass elemClass> : Pat <
1187 +  (vecType (build_vector (elemType elemClass:$x), (elemType elemClass:$y),
1188 +                         (elemType elemClass:$z), (elemType elemClass:$w))),
1189 +  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
1190 +  (vecType (IMPLICIT_DEF)), elemClass:$x, sel_x), elemClass:$y, sel_y),
1191 +                            elemClass:$z, sel_z), elemClass:$w, sel_w)
1192 +>;
1193 +
1194 +// Bitconvert pattern: the source value is reused as-is under the new type
1195 +class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat <
1196 +  (dt (bitconvert (st rc:$src0))),
1197 +  (dt rc:$src0)
1198 +>;
1199 +
1200 +class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat <
1201 +  (vt (AMDGPUdwordaddr (vt rc:$addr))),
1202 +  (vt rc:$addr)
1203 +>; // AMDGPUdwordaddr is selected to its own operand.
1204 +
1205 +include "R600Instructions.td"
1206 +
1207 +include "SIInstrInfo.td"
1208 +
1209 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUIntrinsics.td llvm-r600/lib/Target/R600/AMDGPUIntrinsics.td
1210 --- llvm-3.2.src/lib/Target/R600/AMDGPUIntrinsics.td    1970-01-01 01:00:00.000000000 +0100
1211 +++ llvm-r600/lib/Target/R600/AMDGPUIntrinsics.td       2013-01-25 19:43:57.430049721 +0100
1212 @@ -0,0 +1,62 @@
1213 +//===-- AMDGPUIntrinsics.td - Common intrinsics  -*- tablegen -*-----------===//
1214 +//
1215 +//                     The LLVM Compiler Infrastructure
1216 +//
1217 +// This file is distributed under the University of Illinois Open Source
1218 +// License. See LICENSE.TXT for details.
1219 +//
1220 +//===----------------------------------------------------------------------===//
1221 +//
1222 +// This file defines intrinsics that are used by all hw codegen targets.
1223 +//
1224 +//===----------------------------------------------------------------------===//
1225 +
1226 +let TargetPrefix = "AMDGPU", isTarget = 1 in {
1227 +
1228 +  def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
1229 +  def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>;
1230 +  def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
1231 +  def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
1232 +  def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
1233 +
1234 +  def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
1235 +  def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1236 +  def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1237 +  def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
1238 +  def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
1239 +  def int_AMDGPU_kilp : Intrinsic<[], [], []>;
1240 +  def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1241 +  def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1242 +  def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1243 +  def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
1244 +  def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
1245 +  def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1246 +  def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1247 +  def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1248 +  def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1249 +  def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1250 +  def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1251 +  def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1252 +  def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1253 +  def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1254 +  def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1255 +  def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1256 +  def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1257 +  def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
1258 +  def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1259 +  def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1260 +  def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1261 +  def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1262 +  def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1263 +  def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1264 +  def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
1265 +
1266 +  def int_AMDGPU_shader_type : Intrinsic<[], [llvm_i32_ty], []>;
1267 +} // End TargetPrefix = "AMDGPU", isTarget = 1
1268 +
1269 +let TargetPrefix = "TGSI", isTarget = 1 in {
1270 +
1271 +  def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>;
1272 +} // End TargetPrefix = "TGSI", isTarget = 1
1273 +
1274 +include "SIIntrinsics.td"
1275 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUISelLowering.cpp llvm-r600/lib/Target/R600/AMDGPUISelLowering.cpp
1276 --- llvm-3.2.src/lib/Target/R600/AMDGPUISelLowering.cpp 1970-01-01 01:00:00.000000000 +0100
1277 +++ llvm-r600/lib/Target/R600/AMDGPUISelLowering.cpp    2013-01-25 19:43:57.426716388 +0100
1278 @@ -0,0 +1,418 @@
1279 +//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
1280 +//
1281 +//                     The LLVM Compiler Infrastructure
1282 +//
1283 +// This file is distributed under the University of Illinois Open Source
1284 +// License. See LICENSE.TXT for details.
1285 +//
1286 +//===----------------------------------------------------------------------===//
1287 +//
1288 +/// \file
1289 +/// \brief This is the parent TargetLowering class for hardware code gen
1290 +/// targets.
1291 +//
1292 +//===----------------------------------------------------------------------===//
1293 +
1294 +#include "AMDGPUISelLowering.h"
1295 +#include "AMDILIntrinsicInfo.h"
1296 +#include "llvm/CodeGen/MachineFunction.h"
1297 +#include "llvm/CodeGen/MachineRegisterInfo.h"
1298 +#include "llvm/CodeGen/SelectionDAG.h"
1299 +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
1300 +
1301 +using namespace llvm;
1302 +
1303 +AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
1304 +  TargetLowering(TM, new TargetLoweringObjectFileELF()) {
1305 +
1306 +  // Initialize target lowering borrowed from AMDIL
1307 +  InitAMDILLowering();
1308 +
1309 +  // We need to custom lower some of the intrinsics
1310 +  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1311 +
1312 +  // Library functions.  These default to Expand, but we have instructions
1313 +  // for them.
1314 +  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
1315 +  setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
1316 +  setOperationAction(ISD::FPOW,   MVT::f32, Legal);
1317 +  setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
1318 +  setOperationAction(ISD::FABS,   MVT::f32, Legal);
1319 +  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1320 +  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
1321 +
1322 +  // Lower floating point store/load to integer store/load to reduce the number
1323 +  // of patterns in tablegen.
1324 +  setOperationAction(ISD::STORE, MVT::f32, Promote);
1325 +  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
1326 +
1327 +  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
1328 +  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
1329 +
1330 +  setOperationAction(ISD::LOAD, MVT::f32, Promote);
1331 +  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
1332 +
1333 +  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
1334 +  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
1335 +  // Only i32 UDIVREM is custom lowered; plain UDIV and UREM are expanded.
1336 +  setOperationAction(ISD::UDIV, MVT::i32, Expand);
1337 +  setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
1338 +  setOperationAction(ISD::UREM, MVT::i32, Expand);
1339 +}
1340 +
1341 +//===---------------------------------------------------------------------===//
1342 +// TargetLowering Callbacks
1343 +//===---------------------------------------------------------------------===//
1344 +
1345 +SDValue AMDGPUTargetLowering::LowerFormalArguments(
1346 +                                      SDValue Chain,
1347 +                                      CallingConv::ID CallConv,
1348 +                                      bool isVarArg,
1349 +                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1350 +                                      DebugLoc DL, SelectionDAG &DAG,
1351 +                                      SmallVectorImpl<SDValue> &InVals) const {
1352 +  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1353 +    InVals.push_back(SDValue()); // One empty placeholder per incoming argument.
1354 +  }
1355 +  return Chain; // The chain is passed through unchanged.
1356 +}
1357 +
1358 +SDValue AMDGPUTargetLowering::LowerReturn(
1359 +                                     SDValue Chain,
1360 +                                     CallingConv::ID CallConv,
1361 +                                     bool isVarArg,
1362 +                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
1363 +                                     const SmallVectorImpl<SDValue> &OutVals,
1364 +                                     DebugLoc DL, SelectionDAG &DAG) const {
1365 +  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain); // Outs unused.
1366 +}
1367 +
1368 +//===---------------------------------------------------------------------===//
1369 +// Target specific lowering
1370 +//===---------------------------------------------------------------------===//
1371 +
1372 +SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
1373 +    const {
1374 +  switch (Op.getOpcode()) {
1375 +  default:
1376 +    Op.getNode()->dump();
1377 +    assert(0 && "Custom lowering code for this"
1378 +        "instruction is not implemented yet!");
1379 +    break;
1380 +  // AMDIL DAG lowering
1381 +  case ISD::SDIV: return LowerSDIV(Op, DAG);
1382 +  case ISD::SREM: return LowerSREM(Op, DAG);
1383 +  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1384 +  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
1385 +  // AMDGPU DAG lowering
1386 +  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
1387 +  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1388 +  }
1389 +  return Op;
1390 +}
1391 +
1392 +SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
1393 +    SelectionDAG &DAG) const {
1394 +  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1395 +  DebugLoc DL = Op.getDebugLoc();
1396 +  EVT VT = Op.getValueType();
1397 +  // Operand 0 holds the intrinsic ID; the arguments start at operand 1.
1398 +  switch (IntrinsicID) {
1399 +    default: return Op; // Unhandled intrinsics are returned unchanged.
1400 +    case AMDGPUIntrinsic::AMDIL_abs:
1401 +      return LowerIntrinsicIABS(Op, DAG);
1402 +    case AMDGPUIntrinsic::AMDIL_exp:
1403 +      return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
1404 +    case AMDGPUIntrinsic::AMDGPU_lrp:
1405 +      return LowerIntrinsicLRP(Op, DAG);
1406 +    case AMDGPUIntrinsic::AMDIL_fraction:
1407 +      return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
1408 +    case AMDGPUIntrinsic::AMDIL_mad:
1409 +      return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1),
1410 +                              Op.getOperand(2), Op.getOperand(3));
1411 +    case AMDGPUIntrinsic::AMDIL_max:
1412 +      return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1),
1413 +                                                  Op.getOperand(2));
1414 +    case AMDGPUIntrinsic::AMDGPU_imax:
1415 +      return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
1416 +                                                  Op.getOperand(2));
1417 +    case AMDGPUIntrinsic::AMDGPU_umax:
1418 +      return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1),
1419 +                                                  Op.getOperand(2));
1420 +    case AMDGPUIntrinsic::AMDIL_min:
1421 +      return DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1),
1422 +                                                  Op.getOperand(2));
1423 +    case AMDGPUIntrinsic::AMDGPU_imin:
1424 +      return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1),
1425 +                                                  Op.getOperand(2));
1426 +    case AMDGPUIntrinsic::AMDGPU_umin:
1427 +      return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
1428 +                                                  Op.getOperand(2));
1429 +    case AMDGPUIntrinsic::AMDIL_round_nearest:
1430 +      return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
1431 +  }
1432 +}
1433 +
1434 +/// IABS(a) = SMAX(sub(0, a), a)
1435 +SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
1436 +    SelectionDAG &DAG) const {
1437 +
1438 +  DebugLoc DL = Op.getDebugLoc();
1439 +  EVT VT = Op.getValueType();
1440 +  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
1441 +                                              Op.getOperand(1));
1442 +  // Operand 1 is the intrinsic's argument a; Neg holds 0 - a.
1443 +  return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1));
1444 +}
1445 +
1446 +/// Linear Interpolation
1447 +/// LRP(a, b, c) = muladd(a,  b, (1 - a) * c)
1448 +SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
1449 +    SelectionDAG &DAG) const {
1450 +  DebugLoc DL = Op.getDebugLoc();
1451 +  EVT VT = Op.getValueType();
1452 +  SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
1453 +                                DAG.getConstantFP(1.0f, MVT::f32), // f32, not VT
1454 +                                Op.getOperand(1));
1455 +  SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA,
1456 +                                                    Op.getOperand(3));
1457 +  return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1),
1458 +                                               Op.getOperand(2),
1459 +                                               OneSubAC);
1460 +}
1461 +
1462 +/// \brief Generate Min/Max node
1463 +SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
1464 +    SelectionDAG &DAG) const {
1465 +  DebugLoc DL = Op.getDebugLoc();
1466 +  EVT VT = Op.getValueType();
1467 +
1468 +  SDValue LHS = Op.getOperand(0);
1469 +  SDValue RHS = Op.getOperand(1);
1470 +  SDValue True = Op.getOperand(2);
1471 +  SDValue False = Op.getOperand(3);
1472 +  SDValue CC = Op.getOperand(4);
1473 +
1474 +  if (VT != MVT::f32 ||
1475 +      !((LHS == True && RHS == False) || (LHS == False && RHS == True))) {
1476 +    return SDValue();
1477 +  }
1478 +
1479 +  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1480 +  switch (CCOpcode) {
1481 +  case ISD::SETOEQ:
1482 +  case ISD::SETONE:
1483 +  case ISD::SETUNE:
1484 +  case ISD::SETNE:
1485 +  case ISD::SETUEQ:
1486 +  case ISD::SETEQ:
1487 +  case ISD::SETFALSE:
1488 +  case ISD::SETFALSE2:
1489 +  case ISD::SETTRUE:
1490 +  case ISD::SETTRUE2:
1491 +  case ISD::SETUO:
1492 +  case ISD::SETO:
1493 +    assert(0 && "Operation should already be optimised !");
1494 +  case ISD::SETULE:
1495 +  case ISD::SETULT:
1496 +  case ISD::SETOLE:
1497 +  case ISD::SETOLT:
1498 +  case ISD::SETLE:
1499 +  case ISD::SETLT: {
1500 +    if (LHS == True)
1501 +      return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
1502 +    else
1503 +      return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS);
1504 +  }
1505 +  case ISD::SETGT:
1506 +  case ISD::SETGE:
1507 +  case ISD::SETUGE:
1508 +  case ISD::SETOGE:
1509 +  case ISD::SETUGT:
1510 +  case ISD::SETOGT: {
1511 +    if (LHS == True)
1512 +      return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS);
1513 +    else
1514 +      return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
1515 +  }
1516 +  case ISD::SETCC_INVALID:
1517 +    assert(0 && "Invalid setcc condcode !");
1518 +  }
1519 +  return Op;
1520 +}
1521 +
1522 +/// \brief Expand the unsigned division/remainder of Op's two operands into a
1523 +/// URECIP-based sequence. \returns the merged pair {quotient, remainder}.
1524 +SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1525 +    SelectionDAG &DAG) const {
1526 +  DebugLoc DL = Op.getDebugLoc();
1527 +  EVT VT = Op.getValueType();
1528 +
1529 +  SDValue Num = Op.getOperand(0);
1530 +  SDValue Den = Op.getOperand(1);
1531 +
1532 +  // Num, Den and Remainder are unsigned, so they are compared with SETUGE.
1533 +
1534 +  // RCP =  URECIP(Den) = 2^32 / Den + e
1535 +  // e is rounding error.
1536 +  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1537 +
1538 +  // RCP_LO = umulo(RCP, Den)
1539 +  SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);
1540 +
1541 +  // RCP_HI = mulhu(RCP, Den)
1542 +  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1543 +
1544 +  // NEG_RCP_LO = -RCP_LO
1545 +  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
1546 +                                                     RCP_LO);
1547 +
1548 +  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1549 +  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
1550 +                                           NEG_RCP_LO, RCP_LO,
1551 +                                           ISD::SETEQ);
1552 +  // Calculate the rounding error from the URECIP instruction
1553 +  // E = mulhu(ABS_RCP_LO, RCP)
1554 +  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1555 +
1556 +  // RCP_A_E = RCP + E
1557 +  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1558 +
1559 +  // RCP_S_E = RCP - E
1560 +  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1561 +
1562 +  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
1563 +  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
1564 +                                     RCP_A_E, RCP_S_E,
1565 +                                     ISD::SETEQ);
1566 +  // Quotient = mulhu(Tmp0, Num)
1567 +  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1568 +
1569 +  // Num_S_Remainder = Quotient * Den
1570 +  SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den);
1571 +
1572 +  // Remainder = Num - Num_S_Remainder
1573 +  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1574 +
1575 +  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0), unsigned compare
1576 +  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1577 +                                                 DAG.getConstant(-1, VT),
1578 +                                                 DAG.getConstant(0, VT),
1579 +                                                 ISD::SETUGE);
1580 +  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0), i.e. no wrap-around
1581 +  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
1582 +                                                  Num_S_Remainder,
1583 +                                                  DAG.getConstant(-1, VT),
1584 +                                                  DAG.getConstant(0, VT),
1585 +                                                  ISD::SETUGE);
1586 +  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1587 +  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1588 +                                               Remainder_GE_Zero);
1589 +
1590 +  // Calculate Division result:
1591 +
1592 +  // Quotient_A_One = Quotient + 1
1593 +  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1594 +                                                         DAG.getConstant(1, VT));
1595 +
1596 +  // Quotient_S_One = Quotient - 1
1597 +  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1598 +                                                         DAG.getConstant(1, VT));
1599 +
1600 +  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1601 +  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
1602 +                                     Quotient, Quotient_A_One, ISD::SETEQ);
1603 +
1604 +  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1605 +  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
1606 +                            Quotient_S_One, Div, ISD::SETEQ);
1607 +
1608 +  // Calculate Rem result:
1609 +
1610 +  // Remainder_S_Den = Remainder - Den
1611 +  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1612 +
1613 +  // Remainder_A_Den = Remainder + Den
1614 +  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1615 +
1616 +  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1617 +  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
1618 +                                    Remainder, Remainder_S_Den, ISD::SETEQ);
1619 +
1620 +  // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1621 +  Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
1622 +                            Remainder_A_Den, Rem, ISD::SETEQ);
1623 +  SDValue Ops[2];
1624 +  Ops[0] = Div;
1625 +  Ops[1] = Rem;
1626 +  return DAG.getMergeValues(Ops, 2, DL);
1627 +}
1628 +
1629 +//===----------------------------------------------------------------------===//
1630 +// Helper functions
1631 +//===----------------------------------------------------------------------===//
1632 +
1633 +bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
1634 +  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
1635 +    return CFP->isExactlyValue(1.0);
1636 +  }
1637 +  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
1638 +    return C->isAllOnesValue();
1639 +  }
1640 +  return false;
1641 +}
1642 +
1643 +bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
1644 +  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
1645 +    return CFP->getValueAPF().isZero();
1646 +  }
1647 +  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
1648 +    return C->isNullValue();
1649 +  }
1650 +  return false;
1651 +}
1652 +
1653 +SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
1654 +                                                  const TargetRegisterClass *RC,
1655 +                                                   unsigned Reg, EVT VT) const {
1656 +  MachineFunction &MF = DAG.getMachineFunction();
1657 +  MachineRegisterInfo &MRI = MF.getRegInfo();
1658 +  unsigned VirtualRegister;
1659 +  if (!MRI.isLiveIn(Reg)) {
1660 +    VirtualRegister = MRI.createVirtualRegister(RC);
1661 +    MRI.addLiveIn(Reg, VirtualRegister);
1662 +  } else {
1663 +    VirtualRegister = MRI.getLiveInVirtReg(Reg);
1664 +  }
1665 +  return DAG.getRegister(VirtualRegister, VT);
1666 +}
1667 +
1668 +#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
1669 +/// \returns the name of the AMDGPUISD node \p Opcode, or 0 if it is unknown.
1670 +const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
1671 +  switch (Opcode) {
1672 +  default: return 0;
1673 +  // AMDIL DAG nodes
1674 +  NODE_NAME_CASE(MAD)
1675 +  NODE_NAME_CASE(CALL)
1676 +  NODE_NAME_CASE(UMUL)
1677 +  NODE_NAME_CASE(DIV_INF)
1678 +  NODE_NAME_CASE(RET_FLAG)
1679 +  NODE_NAME_CASE(BRANCH_COND)
1680 +  // AMDGPU DAG nodes
1681 +  NODE_NAME_CASE(BITALIGN)
1682 +  NODE_NAME_CASE(DWORDADDR)
1683 +  NODE_NAME_CASE(FRACT)
1684 +  NODE_NAME_CASE(FMAX)
1685 +  NODE_NAME_CASE(SMAX)
1686 +  NODE_NAME_CASE(UMAX)
1687 +  NODE_NAME_CASE(FMIN)
1688 +  NODE_NAME_CASE(SMIN)
1689 +  NODE_NAME_CASE(UMIN)
1690 +  NODE_NAME_CASE(URECIP)
1691 +  NODE_NAME_CASE(INTERP)
1692 +  NODE_NAME_CASE(INTERP_P0)
1693 +  NODE_NAME_CASE(EXPORT)
1694 +  NODE_NAME_CASE(CONST_ADDRESS)
1695 +  }
1696 +}
1697 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUISelLowering.h llvm-r600/lib/Target/R600/AMDGPUISelLowering.h
1698 --- llvm-3.2.src/lib/Target/R600/AMDGPUISelLowering.h   1970-01-01 01:00:00.000000000 +0100
1699 +++ llvm-r600/lib/Target/R600/AMDGPUISelLowering.h      2013-01-25 19:43:57.426716388 +0100
1700 @@ -0,0 +1,145 @@
1701 +//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
1702 +//
1703 +//                     The LLVM Compiler Infrastructure
1704 +//
1705 +// This file is distributed under the University of Illinois Open Source
1706 +// License. See LICENSE.TXT for details.
1707 +//
1708 +//===----------------------------------------------------------------------===//
1709 +//
1710 +/// \file
1711 +/// \brief Interface definition of the TargetLowering class that is common
1712 +/// to all AMD GPUs.
1713 +//
1714 +//===----------------------------------------------------------------------===//
1715 +
1716 +#ifndef AMDGPUISELLOWERING_H
1717 +#define AMDGPUISELLOWERING_H
1718 +
1719 +#include "llvm/Target/TargetLowering.h"
1720 +
1721 +namespace llvm {
1722 +
1723 +class MachineRegisterInfo;
1724 +
1725 +class AMDGPUTargetLowering : public TargetLowering {
1726 +private:
1727 +  SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
1728 +  SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
1729 +
1730 +protected:
1731 +
1732 +  /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
1733 +  /// MachineFunction.
1734 +  ///
1735 +  /// \returns a RegisterSDNode representing Reg.
1736 +  SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
1737 +                                                  unsigned Reg, EVT VT) const;
1738 +
1739 +  bool isHWTrueValue(SDValue Op) const;
1740 +  bool isHWFalseValue(SDValue Op) const;
1741 +
1742 +public:
1743 +  AMDGPUTargetLowering(TargetMachine &TM);
1744 +
1745 +  virtual SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
1746 +                             bool isVarArg,
1747 +                             const SmallVectorImpl<ISD::InputArg> &Ins,
1748 +                             DebugLoc DL, SelectionDAG &DAG,
1749 +                             SmallVectorImpl<SDValue> &InVals) const;
1750 +
1751 +  virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
1752 +                              bool isVarArg,
1753 +                              const SmallVectorImpl<ISD::OutputArg> &Outs,
1754 +                              const SmallVectorImpl<SDValue> &OutVals,
1755 +                              DebugLoc DL, SelectionDAG &DAG) const;
1756 +
1757 +  virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
1758 +  SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
1759 +  SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
1760 +  SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const;
1761 +  virtual const char* getTargetNodeName(unsigned Opcode) const;
1762 +
1763 +// Functions defined in AMDILISelLowering.cpp
1764 +public:
1765 +
1766 +  /// \brief Determine which of the bits specified in \p Mask are known to be
1767 +  /// either zero or one and return them in the \p KnownZero and \p KnownOne
1768 +  /// bitsets.
1769 +  virtual void computeMaskedBitsForTargetNode(const SDValue Op,
1770 +                                              APInt &KnownZero,
1771 +                                              APInt &KnownOne,
1772 +                                              const SelectionDAG &DAG,
1773 +                                              unsigned Depth = 0) const;
1774 +
1775 +  virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info,
1776 +                                  const CallInst &I, unsigned Intrinsic) const;
1777 +
1778 +  /// We want to mark f32/f64 floating point values as legal.
1779 +  bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
1780 +
1781 +  /// We don't want to shrink f64/f32 constants.
1782 +  bool ShouldShrinkFPConstant(EVT VT) const;
1783 +
1784 +private:
1785 +  void InitAMDILLowering();
1786 +  SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const;
1787 +  SDValue LowerSREM8(SDValue Op, SelectionDAG &DAG) const;
1788 +  SDValue LowerSREM16(SDValue Op, SelectionDAG &DAG) const;
1789 +  SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const;
1790 +  SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const;
1791 +  SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
1792 +  SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const;
1793 +  SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const;
1794 +  SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const;
1795 +  SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
1796 +  EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const;
1797 +  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
1798 +  SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
1799 +};
1800 +
1801 +namespace AMDGPUISD {
1802 +
1803 +enum {
1804 +  // AMDIL ISD Opcodes
1805 +  FIRST_NUMBER = ISD::BUILTIN_OP_END,
1806 +  MAD,         // 32bit Fused Multiply Add instruction
1807 +  CALL,        // Function call based on a single integer
1808 +  UMUL,        // 32bit unsigned multiplication
1809 +  DIV_INF,      // Divide with infinity returned on zero divisor
1810 +  RET_FLAG,
1811 +  BRANCH_COND,
1812 +  // End AMDIL ISD Opcodes
1813 +  BITALIGN,
1814 +  DWORDADDR,
1815 +  FRACT,
1816 +  FMAX,
1817 +  SMAX,
1818 +  UMAX,
1819 +  FMIN,
1820 +  SMIN,
1821 +  UMIN,
1822 +  URECIP,
1823 +  INTERP,
1824 +  INTERP_P0,
1825 +  EXPORT,
1826 +  CONST_ADDRESS,
1827 +  LAST_AMDGPU_ISD_NUMBER
1828 +};
1829 +
1830 +
1831 +} // End namespace AMDGPUISD
1832 +
1833 +namespace SIISD {
1834 +
1835 +enum {
1836 +  SI_FIRST = AMDGPUISD::LAST_AMDGPU_ISD_NUMBER,
1837 +  VCC_AND,
1838 +  VCC_BITCAST
1839 +};
1840 +
1841 +} // End namespace SIISD
1842 +
1843 +} // End namespace llvm
1844 +
1845 +#endif // AMDGPUISELLOWERING_H
1846 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUMCInstLower.cpp llvm-r600/lib/Target/R600/AMDGPUMCInstLower.cpp
1847 --- llvm-3.2.src/lib/Target/R600/AMDGPUMCInstLower.cpp  1970-01-01 01:00:00.000000000 +0100
1848 +++ llvm-r600/lib/Target/R600/AMDGPUMCInstLower.cpp     2013-01-25 19:43:57.430049721 +0100
1849 @@ -0,0 +1,83 @@
1850 +//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===//
1851 +//
1852 +//                     The LLVM Compiler Infrastructure
1853 +//
1854 +// This file is distributed under the University of Illinois Open Source
1855 +// License. See LICENSE.TXT for details.
1856 +//
1857 +//===----------------------------------------------------------------------===//
1858 +//
1859 +/// \file
1860 +/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst.
1861 +//
1862 +//===----------------------------------------------------------------------===//
1863 +//
1864 +
1865 +#include "AMDGPUMCInstLower.h"
1866 +#include "AMDGPUAsmPrinter.h"
1867 +#include "R600InstrInfo.h"
1868 +#include "llvm/CodeGen/MachineBasicBlock.h"
1869 +#include "llvm/CodeGen/MachineInstr.h"
1870 +#include "llvm/Constants.h"
1871 +#include "llvm/MC/MCInst.h"
1872 +#include "llvm/MC/MCStreamer.h"
1873 +#include "llvm/MC/MCExpr.h"
1874 +#include "llvm/Support/ErrorHandling.h"
1875 +
1876 +using namespace llvm;
1877 +
1878 +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx):
1879 +  Ctx(ctx)
1880 +{ }
1881 +
1882 +void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
1883 +  OutMI.setOpcode(MI->getOpcode());
1884 +
1885 +  for (unsigned i = 0, e = MI->getNumExplicitOperands(); i != e; ++i) {
1886 +    const MachineOperand &MO = MI->getOperand(i);
1887 +
1888 +    MCOperand MCOp;
1889 +    switch (MO.getType()) {
1890 +    default:
1891 +      llvm_unreachable("unknown operand type");
1892 +    case MachineOperand::MO_FPImmediate: {
1893 +      const APFloat &FloatValue = MO.getFPImm()->getValueAPF();
1894 +      assert(&FloatValue.getSemantics() == &APFloat::IEEEsingle &&
1895 +             "Only floating point immediates are supported at the moment.");
1896 +      MCOp = MCOperand::CreateFPImm(FloatValue.convertToFloat());
1897 +      break;
1898 +    }
1899 +    case MachineOperand::MO_Immediate:
1900 +      MCOp = MCOperand::CreateImm(MO.getImm());
1901 +      break;
1902 +    case MachineOperand::MO_Register:
1903 +      MCOp = MCOperand::CreateReg(MO.getReg());
1904 +      break;
1905 +    case MachineOperand::MO_MachineBasicBlock:
1906 +      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
1907 +                                   MO.getMBB()->getSymbol(), Ctx));
1908 +    }
1909 +    OutMI.addOperand(MCOp);
1910 +  }
1911 +}
1912 +
1913 +void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
1914 +  AMDGPUMCInstLower MCInstLowering(OutContext);
1915 +
1916 +  if (MI->isBundle()) {
1917 +    const MachineBasicBlock *MBB = MI->getParent();
1918 +    MachineBasicBlock::const_instr_iterator I = MI;
1919 +    ++I;
1920 +    while (I != MBB->end() && I->isInsideBundle()) {
1921 +      MCInst MCBundleInst;
1922 +      const MachineInstr *BundledInst = I;
1923 +      MCInstLowering.lower(BundledInst, MCBundleInst);
1924 +      OutStreamer.EmitInstruction(MCBundleInst);
1925 +      ++I;
1926 +    }
1927 +  } else {
1928 +    MCInst TmpInst;
1929 +    MCInstLowering.lower(MI, TmpInst);
1930 +    OutStreamer.EmitInstruction(TmpInst);
1931 +  }
1932 +}
1933 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUMCInstLower.h llvm-r600/lib/Target/R600/AMDGPUMCInstLower.h
1934 --- llvm-3.2.src/lib/Target/R600/AMDGPUMCInstLower.h    1970-01-01 01:00:00.000000000 +0100
1935 +++ llvm-r600/lib/Target/R600/AMDGPUMCInstLower.h       2013-01-25 19:43:57.430049721 +0100
1936 @@ -0,0 +1,34 @@
1937 +//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===//
1938 +//
1939 +//                     The LLVM Compiler Infrastructure
1940 +//
1941 +// This file is distributed under the University of Illinois Open Source
1942 +// License. See LICENSE.TXT for details.
1943 +//
1944 +/// \file
1945 +//===----------------------------------------------------------------------===//
1946 +
1947 +#ifndef AMDGPU_MCINSTLOWER_H
1948 +#define AMDGPU_MCINSTLOWER_H
1949 +
1950 +namespace llvm {
1951 +
1952 +class MCInst;
1953 +class MCContext;
1954 +class MachineInstr;
1955 +
1956 +class AMDGPUMCInstLower {
1957 +
1958 +  MCContext &Ctx;
1959 +
1960 +public:
1961 +  AMDGPUMCInstLower(MCContext &ctx);
1962 +
1963 +  /// \brief Lower a MachineInstr to an MCInst
1964 +  void lower(const MachineInstr *MI, MCInst &OutMI) const;
1965 +
1966 +};
1967 +
1968 +} // End namespace llvm
1969 +
1970 +#endif //AMDGPU_MCINSTLOWER_H
1971 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.cpp llvm-r600/lib/Target/R600/AMDGPURegisterInfo.cpp
1972 --- llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.cpp 1970-01-01 01:00:00.000000000 +0100
1973 +++ llvm-r600/lib/Target/R600/AMDGPURegisterInfo.cpp    2013-01-25 19:43:57.430049721 +0100
1974 @@ -0,0 +1,51 @@
1975 +//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===//
1976 +//
1977 +//                     The LLVM Compiler Infrastructure
1978 +//
1979 +// This file is distributed under the University of Illinois Open Source
1980 +// License. See LICENSE.TXT for details.
1981 +//
1982 +//===----------------------------------------------------------------------===//
1983 +//
1984 +/// \file
1985 +/// \brief Parent TargetRegisterInfo class common to all hw codegen targets.
1986 +//
1987 +//===----------------------------------------------------------------------===//
1988 +
1989 +#include "AMDGPURegisterInfo.h"
1990 +#include "AMDGPUTargetMachine.h"
1991 +
1992 +using namespace llvm;
1993 +
1994 +AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm,
1995 +    const TargetInstrInfo &tii)
1996 +: AMDGPUGenRegisterInfo(0),
1997 +  TM(tm),
1998 +  TII(tii)
1999 +  { }
2000 +
2001 +//===----------------------------------------------------------------------===//
2002 +// Function handling callbacks - Functions are a seldom used feature of GPUs, so
2003 +// they are not supported at this time.
2004 +//===----------------------------------------------------------------------===//
2005 +
2006 +const uint16_t AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister;
2007 +
2008 +const uint16_t* AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
2009 +                                                                         const {
2010 +  return &CalleeSavedReg;
2011 +}
2012 +
2013 +void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
2014 +                                             int SPAdj,
2015 +                                             RegScavenger *RS) const {
2016 +  assert(!"Subroutines not supported yet");
2017 +}
2018 +
2019 +unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
2020 +  assert(!"Subroutines not supported yet");
2021 +  return 0;
2022 +}
2023 +
2024 +#define GET_REGINFO_TARGET_DESC
2025 +#include "AMDGPUGenRegisterInfo.inc"
2026 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.h llvm-r600/lib/Target/R600/AMDGPURegisterInfo.h
2027 --- llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.h   1970-01-01 01:00:00.000000000 +0100
2028 +++ llvm-r600/lib/Target/R600/AMDGPURegisterInfo.h      2013-01-25 19:43:57.430049721 +0100
2029 @@ -0,0 +1,63 @@
2030 +//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===//
2031 +//
2032 +//                     The LLVM Compiler Infrastructure
2033 +//
2034 +// This file is distributed under the University of Illinois Open Source
2035 +// License. See LICENSE.TXT for details.
2036 +//
2037 +//===----------------------------------------------------------------------===//
2038 +//
2039 +/// \file
2040 +/// \brief TargetRegisterInfo interface that is implemented by all hw codegen
2041 +/// targets.
2042 +//
2043 +//===----------------------------------------------------------------------===//
2044 +
2045 +#ifndef AMDGPUREGISTERINFO_H
2046 +#define AMDGPUREGISTERINFO_H
2047 +
2048 +#include "llvm/ADT/BitVector.h"
2049 +#include "llvm/Target/TargetRegisterInfo.h"
2050 +
2051 +#define GET_REGINFO_HEADER
2052 +#define GET_REGINFO_ENUM
2053 +#include "AMDGPUGenRegisterInfo.inc"
2054 +
2055 +namespace llvm {
2056 +
2057 +class AMDGPUTargetMachine;
2058 +class TargetInstrInfo;
2059 +
2060 +struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
2061 +  TargetMachine &TM;
2062 +  const TargetInstrInfo &TII;
2063 +  static const uint16_t CalleeSavedReg;
2064 +
2065 +  AMDGPURegisterInfo(TargetMachine &tm, const TargetInstrInfo &tii);
2066 +
2067 +  virtual BitVector getReservedRegs(const MachineFunction &MF) const {
2068 +    assert(!"Unimplemented");  return BitVector();
2069 +  }
2070 +
2071 +  /// \param RC is an AMDIL reg class.
2072 +  ///
2073 +  /// \returns The ISA reg class that is equivalent to \p RC.
2074 +  virtual const TargetRegisterClass * getISARegClass(
2075 +                                         const TargetRegisterClass * RC) const {
2076 +    assert(!"Unimplemented"); return NULL;
2077 +  }
2078 +
2079 +  virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const {
2080 +    assert(!"Unimplemented"); return NULL;
2081 +  }
2082 +
2083 +  const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const;
2084 +  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
2085 +                           RegScavenger *RS) const;
2086 +  unsigned getFrameRegister(const MachineFunction &MF) const;
2087 +
2088 +};
2089 +
2090 +} // End namespace llvm
2091 +
2092 +#endif // AMDGPUREGISTERINFO_H
2093 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.td llvm-r600/lib/Target/R600/AMDGPURegisterInfo.td
2094 --- llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.td  1970-01-01 01:00:00.000000000 +0100
2095 +++ llvm-r600/lib/Target/R600/AMDGPURegisterInfo.td     2013-01-25 19:43:57.433383055 +0100
2096 @@ -0,0 +1,22 @@
2097 +//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===//
2098 +//
2099 +//                     The LLVM Compiler Infrastructure
2100 +//
2101 +// This file is distributed under the University of Illinois Open Source
2102 +// License. See LICENSE.TXT for details.
2103 +//
2104 +//===----------------------------------------------------------------------===//
2105 +//
2106 +// Tablegen register definitions common to all hw codegen targets.
2107 +//
2108 +//===----------------------------------------------------------------------===//
2109 +
2110 +let Namespace = "AMDGPU" in {
2111 +  def sel_x : SubRegIndex;
2112 +  def sel_y : SubRegIndex;
2113 +  def sel_z : SubRegIndex;
2114 +  def sel_w : SubRegIndex;
2115 +}
2116 +
2117 +include "R600RegisterInfo.td"
2118 +include "SIRegisterInfo.td"
2119 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUStructurizeCFG.cpp llvm-r600/lib/Target/R600/AMDGPUStructurizeCFG.cpp
2120 --- llvm-3.2.src/lib/Target/R600/AMDGPUStructurizeCFG.cpp       1970-01-01 01:00:00.000000000 +0100
2121 +++ llvm-r600/lib/Target/R600/AMDGPUStructurizeCFG.cpp  2013-01-25 19:43:57.433383055 +0100
2122 @@ -0,0 +1,714 @@
2123 +//===-- AMDGPUStructurizeCFG.cpp -  ------------------===//
2124 +//
2125 +//                     The LLVM Compiler Infrastructure
2126 +//
2127 +// This file is distributed under the University of Illinois Open Source
2128 +// License. See LICENSE.TXT for details.
2129 +//
2130 +//===----------------------------------------------------------------------===//
2131 +//
2132 +/// \file
2133 +/// The pass implemented in this file transforms the programs control flow
2134 +/// graph into a form that's suitable for code generation on hardware that
2135 +/// implements control flow by execution masking. This currently includes all
2136 +/// AMD GPUs but may as well be useful for other types of hardware.
2137 +//
2138 +//===----------------------------------------------------------------------===//
2139 +
2140 +#include "AMDGPU.h"
2141 +#include "llvm/Module.h"
2142 +#include "llvm/ADT/SCCIterator.h"
2143 +#include "llvm/Analysis/RegionIterator.h"
2144 +#include "llvm/Analysis/RegionInfo.h"
2145 +#include "llvm/Analysis/RegionPass.h"
2146 +#include "llvm/Transforms/Utils/SSAUpdater.h"
2147 +
2148 +using namespace llvm;
2149 +
2150 +namespace {
2151 +
2152 +// Definition of the complex types used in this pass.
2153 +
2154 +typedef std::pair<BasicBlock *, Value *> BBValuePair;
2155 +typedef ArrayRef<BasicBlock*> BBVecRef;
2156 +
2157 +typedef SmallVector<RegionNode*, 8> RNVector;
2158 +typedef SmallVector<BasicBlock*, 8> BBVector;
2159 +typedef SmallVector<BBValuePair, 2> BBValueVector;
2160 +
2161 +typedef DenseMap<PHINode *, BBValueVector> PhiMap;
2162 +typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap;
2163 +typedef DenseMap<BasicBlock *, Value *> BBPredicates;
2164 +typedef DenseMap<BasicBlock *, BBPredicates> PredMap;
2165 +typedef DenseMap<BasicBlock *, unsigned> VisitedMap;
2166 +
2167 +// The name for newly created blocks.
2168 +
2169 +static const char *FlowBlockName = "Flow";
2170 +
2171 +/// @brief Transforms the control flow graph on one single entry/exit region
2172 +/// at a time.
2173 +///
2174 +/// After the transform all "If"/"Then"/"Else" style control flow looks like
2175 +/// this:
2176 +///
2177 +/// \verbatim
2178 +/// 1
2179 +/// ||
2180 +/// | |
2181 +/// 2 |
2182 +/// | /
2183 +/// |/
2184 +/// 3
2185 +/// ||   Where:
2186 +/// | |  1 = "If" block, calculates the condition
2187 +/// 4 |  2 = "Then" subregion, runs if the condition is true
2188 +/// | /  3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow
2189 +/// |/   4 = "Else" optional subregion, runs if the condition is false
2190 +/// 5    5 = "End" block, also rejoins the control flow
2191 +/// \endverbatim
2192 +///
2193 +/// Control flow is expressed as a branch where the true exit goes into the
2194 +/// "Then"/"Else" region, while the false exit skips the region.
2195 +/// The condition for the optional "Else" region is expressed as a PHI node.
2196 +/// The incoming values of the PHI node are true for the "If" edge and false
2197 +/// for the "Then" edge.
2198 +///
2199 +/// In addition, even complicated loops look like this:
2200 +///
2201 +/// \verbatim
2202 +/// 1
2203 +/// ||
2204 +/// | |
2205 +/// 2 ^  Where:
2206 +/// | /  1 = "Entry" block
2207 +/// |/   2 = "Loop" optional subregion, with all exits at "Flow" block
2208 +/// 3    3 = "Flow" block, with back edge to entry block
2209 +/// |
2210 +/// \endverbatim
2211 +///
2212 +/// The back edge of the "Flow" block is always on the false side of the branch
2213 +/// while the true side continues the general flow. So the loop condition
2214 +/// consists of a network of PHI nodes where the true incoming values express
2215 +/// breaks and the false values express continue states.
2216 +class AMDGPUStructurizeCFG : public RegionPass {
2217 +
2218 +  static char ID;
2219 +  // Cached i1 type and constants, set up in doInitialization()
2220 +  Type *Boolean;
2221 +  ConstantInt *BoolTrue;
2222 +  ConstantInt *BoolFalse;
2223 +  UndefValue *BoolUndef;
2224 +  // Function and region currently processed, assigned in runOnRegion()
2225 +  Function *Func;
2226 +  Region *ParentRegion;
2227 +  // Dominator tree analysis, fetched in runOnRegion()
2228 +  DominatorTree *DT;
2229 +  // Per-region working state, cleared at the end of runOnRegion()
2230 +  RNVector Order;
2231 +  VisitedMap Visited;
2232 +  PredMap Predicates;
2233 +  BBPhiMap DeletedPhis;
2234 +  BBVector FlowsInserted;
2235 +  // Loop bookkeeping, reset at the start of collectInfos()
2236 +  BasicBlock *LoopStart;
2237 +  BasicBlock *LoopEnd;
2238 +  BBPredicates LoopPred;
2239 +  // Analysis: node ordering (orderNodes) and predicate collection (collectInfos)
2240 +  void orderNodes();
2241 +
2242 +  void buildPredicate(BranchInst *Term, unsigned Idx,
2243 +                      BBPredicates &Pred, bool Invert);
2244 +
2245 +  void analyzeBlock(BasicBlock *BB);
2246 +
2247 +  void analyzeLoop(BasicBlock *BB, unsigned &LoopIdx);
2248 +
2249 +  void collectInfos();
2250 +
2251 +  bool dominatesPredicates(BasicBlock *A, BasicBlock *B);
2252 +
2253 +  void killTerminator(BasicBlock *BB);
2254 +
2255 +  RegionNode *skipChained(RegionNode *Node);
2256 +
2257 +  void delPhiValues(BasicBlock *From, BasicBlock *To);
2258 +
2259 +  void addPhiValues(BasicBlock *From, BasicBlock *To);
2260 +
2261 +  BasicBlock *getNextFlow(BasicBlock *Prev);
2262 +
2263 +  bool isPredictableTrue(BasicBlock *Prev, BasicBlock *Node);
2264 +
2265 +  BasicBlock *wireFlowBlock(BasicBlock *Prev, RegionNode *Node);
2266 +  // Transformation phases, called in this order by runOnRegion()
2267 +  void createFlow();
2268 +
2269 +  void insertConditions();
2270 +
2271 +  void rebuildSSA();
2272 +
2273 +public:
2274 +  AMDGPUStructurizeCFG():
2275 +    RegionPass(ID) {
2276 +    // Make sure the RegionInfo pass is initialized in the pass registry
2277 +    initializeRegionInfoPass(*PassRegistry::getPassRegistry());
2278 +  }
2279 +
2280 +  virtual bool doInitialization(Region *R, RGPassManager &RGM);
2281 +
2282 +  virtual bool runOnRegion(Region *R, RGPassManager &RGM);
2283 +
2284 +  virtual const char *getPassName() const {
2285 +    return "AMDGPU simplify control flow";
2286 +  }
2287 +
2288 +  void getAnalysisUsage(AnalysisUsage &AU) const {
2289 +    // Requires the dominator tree and declares it as preserved
2290 +    AU.addRequired<DominatorTree>();
2291 +    AU.addPreserved<DominatorTree>();
2292 +    RegionPass::getAnalysisUsage(AU);
2293 +  }
2294 +
2295 +};
2296 +
2297 +} // end anonymous namespace
2298 +
2299 +char AMDGPUStructurizeCFG::ID = 0; // handed to RegionPass(ID) as the pass ID
2300 +
2301 +/// \brief Cache the i1 type and the boolean constants the pass works with
2302 +bool AMDGPUStructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
2303 +  LLVMContext &Ctx = R->getEntry()->getContext();
2304 +
2305 +  BoolTrue = ConstantInt::getTrue(Ctx);
2306 +  BoolFalse = ConstantInt::getFalse(Ctx);
2307 +  Boolean = Type::getInt1Ty(Ctx);
2308 +  BoolUndef = UndefValue::get(Boolean);
2309 +  // Only members are set up here, the IR itself is left untouched
2310 +  return false;
2311 +}
2312 +
2313 +/// \brief Rebuild Order from the SCC traversal of the parent region
2314 +void AMDGPUStructurizeCFG::orderNodes() {
2315 +  Order.clear();
2316 +  scc_iterator<Region *> SCC = scc_begin(ParentRegion);
2317 +  for (scc_iterator<Region *> End = scc_end(ParentRegion); SCC != End; ++SCC) {
2318 +    std::vector<RegionNode *> &Members = *SCC;
2319 +    Order.append(Members.begin(), Members.end());
2320 +  }
2321 +}
2322 +
2323 +/// \brief Record in Pred the predicate for edge Idx of Term (inverted if Invert)
2324 +void AMDGPUStructurizeCFG::buildPredicate(BranchInst *Term, unsigned Idx,
2325 +                                          BBPredicates &Pred, bool Invert) {
2326 +  Value *True = Invert ? BoolFalse : BoolTrue;
2327 +  Value *False = Invert ? BoolTrue : BoolFalse;
2328 +
2329 +  RegionInfo *RI = ParentRegion->getRegionInfo();
2330 +  BasicBlock *BB = Term->getParent();
2331 +
2332 +  // Handle the case where multiple regions start at the same block
2333 +  Region *R = BB != ParentRegion->getEntry() ?
2334 +              RI->getRegionFor(BB) : ParentRegion;
2335 +
2336 +  if (R == ParentRegion) {
2337 +    // It's a top level block in our region
2338 +    Value *Cond = True;
2339 +    if (Term->isConditional()) {
2340 +      BasicBlock *Other = Term->getSuccessor(!Idx);
2341 +      // If the other target was already visited only constants are recorded
2342 +      if (Visited.count(Other)) {
2343 +        if (!Pred.count(Other))
2344 +          Pred[Other] = False;
2345 +
2346 +        if (!Pred.count(BB))
2347 +          Pred[BB] = True;
2348 +        return;
2349 +      }
2350 +      Cond = Term->getCondition();
2351 +      // Negate the condition when the edge index and Invert disagree
2352 +      if (Idx != Invert)
2353 +        Cond = BinaryOperator::CreateNot(Cond, "", Term);
2354 +    }
2355 +
2356 +    Pred[BB] = Cond;
2357 +
2358 +  } else if (ParentRegion->contains(R)) {
2359 +    // It's a block in a sub region
2360 +    while(R->getParent() != ParentRegion)
2361 +      R = R->getParent();
2362 +
2363 +    Pred[R->getEntry()] = True;
2364 +
2365 +  } else {
2366 +    // It's a branch from outside into our parent region
2367 +    Pred[BB] = True;
2368 +  }
2369 +}
2370 +
2371 +/// \brief Build the predicates of BB from the branches of all its predecessors
2372 +void AMDGPUStructurizeCFG::analyzeBlock(BasicBlock *BB) {
2373 +  BBPredicates &Pred = Predicates[BB];
2374 +
2375 +  for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
2376 +    // Every predecessor is expected to end in a branch instruction
2377 +    BranchInst *Branch = cast<BranchInst>((*PI)->getTerminator());
2378 +    unsigned NumSucc = Branch->getNumSuccessors();
2379 +
2380 +    for (unsigned Idx = 0; Idx != NumSucc; ++Idx) {
2381 +      // Only the edges that actually lead to BB contribute a predicate
2382 +      if (Branch->getSuccessor(Idx) == BB)
2383 +        buildPredicate(Branch, Idx, Pred, false);
2384 +    }
2385 +  }
2386 +}
2387 +
2388 +/// \brief Analyze the conditions leading to loop to a previous block
2389 +void AMDGPUStructurizeCFG::analyzeLoop(BasicBlock *BB, unsigned &LoopIdx) {
2390 +  BranchInst *Term = cast<BranchInst>(BB->getTerminator());
2391 +
2392 +  for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
2393 +    BasicBlock *Succ = Term->getSuccessor(i);
2394 +
2395 +    // Ignore it if it's not a back edge
2396 +    if (!Visited.count(Succ))
2397 +      continue;
2398 +    // Loop predicates are built inverted (last argument)
2399 +    buildPredicate(Term, i, LoopPred, true);
2400 +    // The last block with a back edge wins; the lowest visit number starts the loop
2401 +    LoopEnd = BB;
2402 +    if (Visited[Succ] < LoopIdx) {
2403 +      LoopIdx = Visited[Succ];
2404 +      LoopStart = Succ;
2405 +    }
2406 +  }
2407 +}
2408 +
2409 +/// \brief Collect various loop and predicate infos
2410 +void AMDGPUStructurizeCFG::collectInfos() {
2411 +  unsigned Number = 0, LoopIdx = ~0;
2412 +
2413 +  // Reset predicate
2414 +  Predicates.clear();
2415 +
2416 +  // and loop infos
2417 +  LoopStart = LoopEnd = 0;
2418 +  LoopPred.clear();
2419 +
2420 +  RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend();
2421 +  for (Visited.clear(); OI != OE; Visited[(*OI++)->getEntry()] = ++Number) {
2422 +    // The increment expression numbers the node only after this body ran
2423 +    // Analyze all the conditions leading to a node
2424 +    analyzeBlock((*OI)->getEntry());
2425 +
2426 +    if ((*OI)->isSubRegion())
2427 +      continue;
2428 +
2429 +    // Find the first/last loop nodes and loop predicates
2430 +    analyzeLoop((*OI)->getNodeAs<BasicBlock>(), LoopIdx);
2431 +  }
2432 +}
2433 +
2434 +/// \brief Check whether A dominates every block that holds a predicate of B
2435 +bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *A, BasicBlock *B) {
2436 +  BBPredicates &BlockPreds = Predicates[B];
2437 +  BBPredicates::iterator It = BlockPreds.begin(), End = BlockPreds.end();
2438 +
2439 +  // One predicate block that is not dominated by A settles the answer
2440 +  for (; It != End; ++It)
2441 +    if (!DT->dominates(A, It->first))
2442 +      return false;
2443 +  return true;
2444 +}
2445 +
2446 +/// \brief Detach BB from its successors: drop (and remember) the PHI values
2447 +/// coming from BB, then erase the terminator. Does nothing without terminator.
2448 +void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) {
2449 +  if (TerminatorInst *Term = BB->getTerminator()) {
2450 +    succ_iterator Succ = succ_begin(BB), End = succ_end(BB);
2451 +    // delPhiValues stores the removed PHI entries in DeletedPhis
2452 +    while (Succ != End) {
2453 +      delPhiValues(BB, *Succ);
2454 +      ++Succ;
2455 +    }
2456 +
2457 +    Term->eraseFromParent();
2458 +  }
2459 +}
2460 +
2461 +/// First: Skip forward to the first region node that either isn't a subregion or not
2462 +/// dominating its exit, remove all the skipped nodes from the node order.
2463 +///
2464 +/// Second: Handle the first successor directly if the resulting nodes successor
2465 +/// predicates are still dominated by the original entry
2466 +RegionNode *AMDGPUStructurizeCFG::skipChained(RegionNode *Node) {
2467 +  BasicBlock *Entry = Node->getEntry();
2468 +
2469 +  // Skip forward as long as it is just a linear flow
2470 +  while (true) {
2471 +    BasicBlock *Entry = Node->getEntry(); // NOTE: shadows the outer Entry
2472 +    BasicBlock *Exit;
2473 +
2474 +    if (Node->isSubRegion()) {
2475 +      Exit = Node->getNodeAs<Region>()->getExit();
2476 +    } else {
2477 +      TerminatorInst *Term = Entry->getTerminator();
2478 +      if (Term->getNumSuccessors() != 1)
2479 +        break;
2480 +      Exit = Term->getSuccessor(0);
2481 +    }
2482 +
2483 +    // It's a back edge, break here so we can insert a loop node
2484 +    if (!Visited.count(Exit))
2485 +      return Node;
2486 +
2487 +    // More than just this node's edges are pointing to exit
2488 +    if (!DT->dominates(Entry, Exit))
2489 +      return Node;
2490 +
2491 +    RegionNode *Next = ParentRegion->getNode(Exit);
2492 +    RNVector::iterator I = std::find(Order.begin(), Order.end(), Next);
2493 +    assert(I != Order.end());
2494 +
2495 +    Visited.erase(Next->getEntry());
2496 +    Order.erase(I);
2497 +    Node = Next;
2498 +  }
2499 +
2500 +  BasicBlock *BB = Node->getEntry();
2501 +  TerminatorInst *Term = BB->getTerminator();
2502 +  if (Term->getNumSuccessors() != 2)
2503 +    return Node;
2504 +
2505 +  // Our node has exactly two successors, check if we can handle
2506 +  // any of them directly
2507 +  BasicBlock *Succ = Term->getSuccessor(0);
2508 +  if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ)) {
2509 +    Succ = Term->getSuccessor(1);
2510 +    if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ))
2511 +      return Node;
2512 +  } else {
2513 +    BasicBlock *Succ2 = Term->getSuccessor(1);
2514 +    if (Visited.count(Succ2) && Visited[Succ] > Visited[Succ2] &&
2515 +        dominatesPredicates(Entry, Succ2))
2516 +      Succ = Succ2;
2517 +  }
2518 +
2519 +  RegionNode *Next = ParentRegion->getNode(Succ);
2520 +  RNVector::iterator E = Order.end();
2521 +  RNVector::iterator I = std::find(Order.begin(), E, Next);
2522 +  assert(I != E);
2523 +  // BB becomes a flow block itself and the chosen successor is wired up now
2524 +  killTerminator(BB);
2525 +  FlowsInserted.push_back(BB);
2526 +  Visited.erase(Succ);
2527 +  Order.erase(I);
2528 +  return ParentRegion->getNode(wireFlowBlock(BB, Next));
2529 +}
2530 +
2531 +/// \brief Remove all PHI values coming from "From" into "To" and remember
2532 +/// them in DeletedPhis
2533 +void AMDGPUStructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
2534 +  PhiMap &Map = DeletedPhis[To];
2535 +  for (BasicBlock::iterator I = To->begin(), E = To->end();
2536 +       I != E && isa<PHINode>(*I);) {
2537 +    // The iterator is advanced here; only the leading PHI nodes are visited
2538 +    PHINode &Phi = cast<PHINode>(*I++);
2539 +    while (Phi.getBasicBlockIndex(From) != -1) {
2540 +      Value *Deleted = Phi.removeIncomingValue(From, false);
2541 +      Map[&Phi].push_back(std::make_pair(From, Deleted));
2542 +    }
2543 +  }
2544 +}
2545 +
2546 +/// \brief Add the PHI values back once we know the new predecessor
2547 +void AMDGPUStructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
2548 +  if (!DeletedPhis.count(To))
2549 +    return;
2550 +
2551 +  PhiMap &Map = DeletedPhis[To];
2552 +  SSAUpdater Updater;
2553 +
2554 +  for (PhiMap::iterator I = Map.begin(), E = Map.end(); I != E; ++I) {
2555 +
2556 +    PHINode *Phi = I->first;
2557 +    Updater.Initialize(Phi->getType(), "");
2558 +    BasicBlock *Fallback = To;
2559 +    bool HaveFallback = false;
2560 +
2561 +    for (BBValueVector::iterator VI = I->second.begin(), VE = I->second.end();
2562 +         VI != VE; ++VI) {
2563 +      // Fallback follows the nearest common dominator of To and the sources so far
2564 +      Updater.AddAvailableValue(VI->first, VI->second);
2565 +      BasicBlock *Dom = DT->findNearestCommonDominator(Fallback, VI->first);
2566 +      if (Dom == VI->first)
2567 +        HaveFallback = true;
2568 +      else if (Dom != Fallback)
2569 +        HaveFallback = false;
2570 +      Fallback = Dom;
2571 +    }
2572 +    if (!HaveFallback) {
2573 +      Value *Undef = UndefValue::get(Phi->getType());
2574 +      Updater.AddAvailableValue(Fallback, Undef);
2575 +    }
2576 +    // The PHI gets a single incoming value for the new predecessor From
2577 +    Phi->addIncoming(Updater.GetValueAtEndOfBlock(From), From);
2578 +  }
2579 +  DeletedPhis.erase(To);
2580 +}
2581 +
2582 +/// \brief Create a new flow block with Prev as immediate dominator, assign it
2583 +/// to the parent region and record it in FlowsInserted
2584 +BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Prev) {
2585 +  LLVMContext &Context = Func->getContext(); // insert position: next node or exit
2586 +  BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
2587 +                       Order.back()->getEntry();
2588 +  BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
2589 +                                        Func, Insert);
2590 +  DT->addNewBlock(Flow, Prev);
2591 +  ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion);
2592 +  FlowsInserted.push_back(Flow);
2593 +  return Flow;
2594 +}
2594 +
2595 +/// \brief Is Node executed unconditionally? Requires all of its predicates to
2596 +/// be BoolTrue and at least one predicate block to dominate Prev.
2597 +bool AMDGPUStructurizeCFG::isPredictableTrue(BasicBlock *Prev,
2598 +                                             BasicBlock *Node) {
2599 +  bool Dominated = false;
2600 +  BBPredicates &NodePreds = Predicates[Node];
2601 +  BBPredicates::iterator It = NodePreds.begin(), End = NodePreds.end();
2602 +
2603 +  while (It != End) {
2604 +    // Any predicate other than the constant true makes it unpredictable
2605 +    if (It->second != BoolTrue)
2606 +      return false;
2607 +    Dominated = Dominated || DT->dominates(It->first, Prev);
2608 +    ++It;
2609 +  }
2610 +  return Dominated;
2611 +}
2612 +
2613 +/// \brief Wire up the new control flow by inserting or updating the branch
2614 +/// instructions at node exits
2615 +BasicBlock *AMDGPUStructurizeCFG::wireFlowBlock(BasicBlock *Prev,
2616 +                                                RegionNode *Node) {
2617 +  BasicBlock *Entry = Node->getEntry();
2618 +  // A loop starting at this node now starts at Prev instead
2619 +  if (LoopStart == Entry) {
2620 +    LoopStart = Prev;
2621 +    LoopPred[Prev] = BoolTrue;
2622 +  }
2623 +
2624 +  // Wire it up temporarily, skipChained may recurse into us
2625 +  BranchInst::Create(Entry, Prev);
2626 +  DT->changeImmediateDominator(Entry, Prev);
2627 +  addPhiValues(Prev, Entry);
2628 +
2629 +  Node = skipChained(Node);
2630 +  // Next is the flow block that the exits of Node are redirected to below
2631 +  BasicBlock *Next = getNextFlow(Prev);
2632 +  if (!isPredictableTrue(Prev, Entry)) {
2633 +    // Let Prev point to entry and next block
2634 +    Prev->getTerminator()->eraseFromParent();
2635 +    BranchInst::Create(Entry, Next, BoolUndef, Prev);
2636 +  } else {
2637 +    DT->changeImmediateDominator(Next, Entry);
2638 +  }
2639 +
2640 +  // Let node exit(s) point to next block
2641 +  if (Node->isSubRegion()) {
2642 +    Region *SubRegion = Node->getNodeAs<Region>();
2643 +    BasicBlock *Exit = SubRegion->getExit();
2644 +
2645 +    // Find all the edges from the sub region to the exit
2646 +    BBVector ToDo;
2647 +    for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) {
2648 +      if (SubRegion->contains(*I))
2649 +        ToDo.push_back(*I);
2650 +    }
2651 +
2652 +    // Modify the edges to point to the new flow block
2653 +    for (BBVector::iterator I = ToDo.begin(), E = ToDo.end(); I != E; ++I) {
2654 +      delPhiValues(*I, Exit);
2655 +      TerminatorInst *Term = (*I)->getTerminator();
2656 +      Term->replaceUsesOfWith(Exit, Next);
2657 +    }
2658 +
2659 +    // Update the region info
2660 +    SubRegion->replaceExit(Next);
2661 +
2662 +  } else {
2663 +    BasicBlock *BB = Node->getNodeAs<BasicBlock>();
2664 +    killTerminator(BB);
2665 +    BranchInst::Create(Next, BB);
2666 +    // Clearing LoopEnd makes createFlow() insert an extra loop end node
2667 +    if (BB == LoopEnd)
2668 +      LoopEnd = 0;
2669 +  }
2670 +
2671 +  return Next;
2672 +}
2673 +
2674 +/// Destroy node order and visited map, build up flow order instead.
2675 +/// After this function control flow looks like it should be, but
2676 +/// branches only have undefined conditions.
2677 +void AMDGPUStructurizeCFG::createFlow() {
2678 +  DeletedPhis.clear();
2679 +
2680 +  BasicBlock *Prev = Order.pop_back_val()->getEntry();
2681 +  assert(Prev == ParentRegion->getEntry() && "Incorrect node order!");
2682 +  Visited.erase(Prev);
2683 +
2684 +  if (LoopStart == Prev) {
2685 +    // Loop starts at entry, split entry so that we can predicate it
2686 +    BasicBlock::iterator Insert = Prev->getFirstInsertionPt();
2687 +    BasicBlock *Split = Prev->splitBasicBlock(Insert, FlowBlockName);
2688 +    DT->addNewBlock(Split, Prev);
2689 +    ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion);
2690 +    Predicates[Split] = Predicates[Prev];
2691 +    Order.push_back(ParentRegion->getBBNode(Split));
2692 +    LoopPred[Prev] = BoolTrue;
2693 +
2694 +  } else if (LoopStart == Order.back()->getEntry()) {
2695 +    // Loop starts behind entry, split entry so that we can jump to it
2696 +    Instruction *Term = Prev->getTerminator();
2697 +    BasicBlock *Split = Prev->splitBasicBlock(Term, FlowBlockName);
2698 +    DT->addNewBlock(Split, Prev);
2699 +    ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion);
2700 +    Prev = Split;
2701 +  }
2702 +  // Prev is the first flow block; FlowsInserted is rebuilt starting with it
2703 +  killTerminator(Prev);
2704 +  FlowsInserted.clear();
2705 +  FlowsInserted.push_back(Prev);
2706 +
2707 +  while (!Order.empty()) {
2708 +    RegionNode *Node = Order.pop_back_val();
2709 +    Visited.erase(Node->getEntry());
2710 +    Prev = wireFlowBlock(Prev, Node);
2711 +    if (LoopStart && !LoopEnd) {
2712 +      // Create an extra loop end node
2713 +      LoopEnd = Prev;
2714 +      Prev = getNextFlow(LoopEnd);
2715 +      BranchInst::Create(Prev, LoopStart, BoolUndef, LoopEnd);
2716 +      addPhiValues(LoopEnd, LoopStart);
2717 +    }
2718 +  }
2719 +  // Finally connect the last flow block with the region exit
2720 +  BasicBlock *Exit = ParentRegion->getExit();
2721 +  BranchInst::Create(Exit, Prev);
2722 +  addPhiValues(Prev, Exit);
2723 +  if (DT->dominates(ParentRegion->getEntry(), Exit))
2724 +    DT->changeImmediateDominator(Exit, Prev);
2725 +
2726 +  if (LoopStart && LoopEnd) {
2727 +    BBVector::iterator FI = std::find(FlowsInserted.begin(),
2728 +                                      FlowsInserted.end(),
2729 +                                      LoopStart);
2730 +    for (; *FI != LoopEnd; ++FI) { // assumes LoopEnd is found in FlowsInserted
2731 +      addPhiValues(*FI, (*FI)->getTerminator()->getSuccessor(0));
2732 +    }
2733 +  }
2734 +  // Everything recorded by the earlier steps must have been consumed by now
2735 +  assert(Order.empty());
2736 +  assert(Visited.empty());
2737 +  assert(DeletedPhis.empty());
2738 +}
2739 +
2740 +/// \brief Insert the missing branch conditions
2741 +void AMDGPUStructurizeCFG::insertConditions() {
2742 +  SSAUpdater PhiInserter;
2743 +
2744 +  for (BBVector::iterator FI = FlowsInserted.begin(), FE = FlowsInserted.end();
2745 +       FI != FE; ++FI) {
2746 +    // Only conditional branches of the flow blocks need a condition
2747 +    BranchInst *Term = cast<BranchInst>((*FI)->getTerminator());
2748 +    if (Term->isUnconditional())
2749 +      continue;
2750 +    // Start from "false" at the function entry, then add the known predicates
2751 +    PhiInserter.Initialize(Boolean, "");
2752 +    PhiInserter.AddAvailableValue(&Func->getEntryBlock(), BoolFalse);
2753 +
2754 +    BasicBlock *Succ = Term->getSuccessor(0);
2755 +    BBPredicates &Preds = (*FI == LoopEnd) ? LoopPred : Predicates[Succ];
2756 +    for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end();
2757 +         PI != PE; ++PI) {
2758 +
2759 +      PhiInserter.AddAvailableValue(PI->first, PI->second);
2760 +    }
2761 +    // The SSAUpdater provides the value that is live at the end of the block
2762 +    Term->setCondition(PhiInserter.GetValueAtEndOfBlock(*FI));
2763 +  }
2764 +}
2765 +
2766 +/// Handle a rare case where the disintegrated nodes' instructions
2767 +/// no longer dominate all their uses. Not sure if this is really necessary
2768 +void AMDGPUStructurizeCFG::rebuildSSA() {
2769 +  SSAUpdater Updater;
2770 +  for (Region::block_iterator I = ParentRegion->block_begin(),
2771 +                              E = ParentRegion->block_end();
2772 +       I != E; ++I) {
2773 +
2774 +    BasicBlock *BB = *I;
2775 +    for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
2776 +         II != IE; ++II) {
2777 +      // NOTE: the Use pointer I below shadows the block iterator I above
2778 +      bool Initialized = false;
2779 +      for (Use *I = &II->use_begin().getUse(), *Next; I; I = Next) {
2780 +        // Fetch the next use up front, *I may get rewritten further down
2781 +        Next = I->getNext();
2782 +
2783 +        Instruction *User = cast<Instruction>(I->getUser());
2784 +        if (User->getParent() == BB) {
2785 +          continue;
2786 +
2787 +        } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
2788 +          if (UserPN->getIncomingBlock(*I) == BB)
2789 +            continue;
2790 +        }
2791 +
2792 +        if (DT->dominates(II, User))
2793 +          continue;
2794 +        // Set up the updater lazily, only once per instruction
2795 +        if (!Initialized) {
2796 +          Value *Undef = UndefValue::get(II->getType());
2797 +          Updater.Initialize(II->getType(), "");
2798 +          Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
2799 +          Updater.AddAvailableValue(BB, II);
2800 +          Initialized = true;
2801 +        }
2802 +        Updater.RewriteUseAfterInsertions(*I);
2803 +      }
2804 +    }
2805 +  }
2806 +}
2807 +
2808 +/// \brief Run the transformation for each region found
2809 +bool AMDGPUStructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
2810 +  if (R->isTopLevelRegion())
2811 +    return false;
2812 +
2813 +  Func = R->getEntry()->getParent();
2814 +  ParentRegion = R;
2815 +
2816 +  DT = &getAnalysis<DominatorTree>();
2817 +  // Analysis first (order, predicates), then the actual CFG rewrite
2818 +  orderNodes();
2819 +  collectInfos();
2820 +  createFlow();
2821 +  insertConditions();
2822 +  rebuildSSA();
2823 +  // Drop the per-region state again
2824 +  Order.clear();
2825 +  Visited.clear();
2826 +  Predicates.clear();
2827 +  DeletedPhis.clear();
2828 +  FlowsInserted.clear();
2829 +  // A change is reported for every region that is not the top level region
2830 +  return true;
2831 +}
2832 +
2833 +/// \brief Create a new instance of the AMDGPUStructurizeCFG region pass
2834 +Pass *llvm::createAMDGPUStructurizeCFGPass() {
2835 +  return new AMDGPUStructurizeCFG();
2836 +}
2837 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUSubtarget.cpp llvm-r600/lib/Target/R600/AMDGPUSubtarget.cpp
2838 --- llvm-3.2.src/lib/Target/R600/AMDGPUSubtarget.cpp    1970-01-01 01:00:00.000000000 +0100
2839 +++ llvm-r600/lib/Target/R600/AMDGPUSubtarget.cpp       2013-01-25 19:43:57.433383055 +0100
2840 @@ -0,0 +1,87 @@
2841 +//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2842 +//
2843 +//                     The LLVM Compiler Infrastructure
2844 +//
2845 +// This file is distributed under the University of Illinois Open Source
2846 +// License. See LICENSE.TXT for details.
2847 +//
2848 +//===----------------------------------------------------------------------===//
2849 +//
2850 +/// \file
2851 +/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
2852 +//
2853 +//===----------------------------------------------------------------------===//
2854 +
2855 +#include "AMDGPUSubtarget.h"
2856 +
2857 +using namespace llvm;
2858 +
2859 +#define GET_SUBTARGETINFO_ENUM
2860 +#define GET_SUBTARGETINFO_TARGET_DESC
2861 +#define GET_SUBTARGETINFO_CTOR
2862 +#include "AMDGPUGenSubtargetInfo.inc"
2863 +
2864 +AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
2865 +  AMDGPUGenSubtargetInfo(TT, CPU, FS), DumpCode(false) {
2866 +    InstrItins = getInstrItineraryForCPU(CPU);
2867 +  // Start with all capability overrides cleared
2868 +  memset(CapsOverride, 0, sizeof(*CapsOverride)
2869 +      * AMDGPUDeviceInfo::MaxNumberCapabilities);
2870 +  // Default card. NOTE(review): Is32on64bit/R600ALUInst are not assigned here
2871 +  StringRef GPU = CPU;
2872 +  Is64bit = false;
2873 +  DefaultSize[0] = 64;
2874 +  DefaultSize[1] = 1;
2875 +  DefaultSize[2] = 1;
2876 +  ParseSubtargetFeatures(GPU, FS);
2877 +  DevName = GPU;
2878 +  Device = AMDGPUDeviceInfo::getDeviceFromName(DevName, this, Is64bit);
2879 +}
2880 +
2881 +AMDGPUSubtarget::~AMDGPUSubtarget() {
2882 +  delete Device; // obtained from getDeviceFromName() in the constructor
2883 +}
2884 +
2885 +/// \brief Read the override flag stored for capability \p caps.
2886 +bool AMDGPUSubtarget::isOverride(AMDGPUDeviceInfo::Caps caps) const {
2887 +  assert(caps < AMDGPUDeviceInfo::MaxNumberCapabilities &&
2888 +         "Caps index is out of bounds!");
2889 +  return CapsOverride[caps];
2890 +}
2891 +/// \brief Value of the Is64bit flag.
2892 +bool AMDGPUSubtarget::is64bit() const {
2893 +  return Is64bit;
2894 +}
2895 +/// \brief Always false in this implementation.
2896 +bool AMDGPUSubtarget::isTargetELF() const {
2897 +  return false;
2898 +}
2899 +/// \brief Default size for dimension \p dim; 1 for any dimension that has no
2900 +/// entry in DefaultSize (valid indices are 0..2).
2901 +size_t AMDGPUSubtarget::getDefaultSize(uint32_t dim) const {
2902 +  // DefaultSize has exactly three elements, so dim == 3 must not index it
2903 +  if (dim >= 3)
2904 +    return 1;
2905 +  return DefaultSize[dim];
2906 +}
2907 +
2908 +/// \brief Data layout of the device, or a built-in default if there is none.
2909 +std::string AMDGPUSubtarget::getDataLayout() const {
2910 +  if (Device)
2911 +    return Device->getDataLayout();
2912 +  // No device object: fall back to the hard-coded layout string
2913 +  return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16"
2914 +                     "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
2915 +                     "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
2916 +                     "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
2917 +                     "-v512:512:512-v1024:1024:1024-v2048:2048:2048-a0:0:64");
2918 +}
2919 +
2920 +/// \brief Copy of the device name stored by the constructor.
2921 +std::string AMDGPUSubtarget::getDeviceName() const {
2922 +  return DevName;
2923 +}
2924 +/// \brief The device object created by the constructor.
2925 +const AMDGPUDevice *AMDGPUSubtarget::device() const {
2926 +  return Device;
2927 +}
2928 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUSubtarget.h llvm-r600/lib/Target/R600/AMDGPUSubtarget.h
2929 --- llvm-3.2.src/lib/Target/R600/AMDGPUSubtarget.h      1970-01-01 01:00:00.000000000 +0100
2930 +++ llvm-r600/lib/Target/R600/AMDGPUSubtarget.h 2013-01-25 19:43:57.433383055 +0100
2931 @@ -0,0 +1,65 @@
2932 +//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDGPU --*- C++ -*-====//
2933 +//
2934 +//                     The LLVM Compiler Infrastructure
2935 +//
2936 +// This file is distributed under the University of Illinois Open Source
2937 +// License. See LICENSE.TXT for details.
2938 +//
2939 +//==-----------------------------------------------------------------------===//
2940 +//
2941 +/// \file
2942 +/// \brief AMDGPU specific subclass of TargetSubtarget.
2943 +//
2944 +//===----------------------------------------------------------------------===//
2945 +
2946 +#ifndef AMDGPUSUBTARGET_H
2947 +#define AMDGPUSUBTARGET_H
2948 +#include "AMDILDevice.h"
2949 +#include "llvm/ADT/StringExtras.h"
2950 +#include "llvm/ADT/StringRef.h"
2951 +#include "llvm/Target/TargetSubtargetInfo.h"
2952 +
2953 +#define GET_SUBTARGETINFO_HEADER
2954 +#include "AMDGPUGenSubtargetInfo.inc"
2955 +
2956 +#define MAX_CB_SIZE (1 << 16)
2957 +
2958 +namespace llvm {
2959 +
2960 +class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
2961 +private:
2962 +  bool CapsOverride[AMDGPUDeviceInfo::MaxNumberCapabilities];
2963 +  const AMDGPUDevice *Device; // deleted in the destructor
2964 +  size_t DefaultSize[3]; // indexed by dimension, see getDefaultSize()
2965 +  std::string DevName;
2966 +  bool Is64bit;
2967 +  bool Is32on64bit;
2968 +  bool DumpCode;
2969 +  bool R600ALUInst;
2970 +  // Itinerary data, looked up for the CPU in the constructor
2971 +  InstrItineraryData InstrItins;
2972 +
2973 +public:
2974 +  AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS);
2975 +  virtual ~AMDGPUSubtarget();
2976 +
2977 +  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
2978 +  virtual void ParseSubtargetFeatures(llvm::StringRef CPU, llvm::StringRef FS);
2979 +  // Simple queries on the state set up by the constructor
2980 +  bool isOverride(AMDGPUDeviceInfo::Caps) const;
2981 +  bool is64bit() const;
2982 +
2983 +  // Helper functions to simplify if statements
2984 +  bool isTargetELF() const;
2985 +  const AMDGPUDevice* device() const;
2986 +  std::string getDataLayout() const;
2987 +  std::string getDeviceName() const;
2988 +  virtual size_t getDefaultSize(uint32_t dim) const;
2989 +  bool dumpCode() const { return DumpCode; }
2990 +  bool r600ALUEncoding() const { return R600ALUInst; }
2991 +
2992 +};
2993 +
2994 +} // End namespace llvm
2995 +
2996 +#endif // AMDGPUSUBTARGET_H
2997 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUTargetMachine.cpp llvm-r600/lib/Target/R600/AMDGPUTargetMachine.cpp
2998 --- llvm-3.2.src/lib/Target/R600/AMDGPUTargetMachine.cpp        1970-01-01 01:00:00.000000000 +0100
2999 +++ llvm-r600/lib/Target/R600/AMDGPUTargetMachine.cpp   2013-01-25 19:43:57.433383055 +0100
3000 @@ -0,0 +1,148 @@
3001 +//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
3002 +//
3003 +//                     The LLVM Compiler Infrastructure
3004 +//
3005 +// This file is distributed under the University of Illinois Open Source
3006 +// License. See LICENSE.TXT for details.
3007 +//
3008 +//===----------------------------------------------------------------------===//
3009 +//
3010 +/// \file
3011 +/// \brief The AMDGPU target machine contains all of the hardware specific
3012 +/// information needed to emit code for R600 and SI GPUs.
3013 +//
3014 +//===----------------------------------------------------------------------===//
3015 +
3016 +#include "AMDGPUTargetMachine.h"
3017 +#include "AMDGPU.h"
3018 +#include "R600ISelLowering.h"
3019 +#include "R600InstrInfo.h"
3020 +#include "SIISelLowering.h"
3021 +#include "SIInstrInfo.h"
3022 +#include "llvm/Analysis/Passes.h"
3023 +#include "llvm/Analysis/Verifier.h"
3024 +#include "llvm/CodeGen/MachineFunctionAnalysis.h"
3025 +#include "llvm/CodeGen/MachineModuleInfo.h"
3026 +#include "llvm/CodeGen/Passes.h"
3027 +#include "llvm/MC/MCAsmInfo.h"
3028 +#include "llvm/PassManager.h"
3029 +#include "llvm/Support/TargetRegistry.h"
3030 +#include "llvm/Support/raw_os_ostream.h"
3031 +#include "llvm/Transforms/IPO.h"
3032 +#include "llvm/Transforms/Scalar.h"
3033 +#include <llvm/CodeGen/Passes.h>
3034 +
3035 +using namespace llvm;
3036 +
3037 +extern "C" void LLVMInitializeR600Target() {
3038 +  // Register AMDGPUTargetMachine as the target machine of TheAMDGPUTarget
3039 +  RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget);
3040 +}
3041 +
3042 +/// Build the target machine; InstrInfo/TLInfo are chosen by device generation.
3043 +AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
3044 +    StringRef CPU, StringRef FS, TargetOptions Options, Reloc::Model RM,
3045 +    CodeModel::Model CM, CodeGenOpt::Level OptLevel)
3046 +  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
3047 +    Subtarget(TT, CPU, FS),
3048 +    Layout(Subtarget.getDataLayout()),
3049 +    FrameLowering(TargetFrameLowering::StackGrowsUp,
3050 +                  Subtarget.device()->getStackAlignment(), 0),
3051 +    IntrinsicInfo(this),
3052 +    InstrItins(&Subtarget.getInstrItineraryData()) {
3053 +  // TLInfo uses InstrInfo so it must be initialized after.
3054 +  if (Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
3055 +    InstrInfo = new R600InstrInfo(*this);
3056 +    TLInfo = new R600TargetLowering(*this);
3057 +  } else {
3058 +    InstrInfo = new SIInstrInfo(*this);
3059 +    TLInfo = new SITargetLowering(*this);
3060 +  }
3061 +}
3062 +
3063 +/// Free the objects allocated by the constructor, TLInfo before InstrInfo.
3064 +AMDGPUTargetMachine::~AMDGPUTargetMachine() {
3065 +  delete TLInfo;
3066 +  delete InstrInfo;
3067 +}
3068 +
3069 +namespace {
3070 +class AMDGPUPassConfig : public TargetPassConfig {
3071 +public:
3072 +  AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM)
3073 +    : TargetPassConfig(TM, PM) {}
3074 +  // Typed access to the target machine this configuration was created for
3075 +  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
3076 +    return getTM<AMDGPUTargetMachine>();
3077 +  }
3078 +  // Hooks that add the AMDGPU specific passes, defined below in this file
3079 +  virtual bool addPreISel();
3080 +  virtual bool addInstSelector();
3081 +  virtual bool addPreRegAlloc();
3082 +  virtual bool addPostRegAlloc();
3083 +  virtual bool addPreSched2();
3084 +  virtual bool addPreEmitPass();
3085 +};
3086 +} // End of anonymous namespace
3087 +
3088 +TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) {
3089 +  return new AMDGPUPassConfig(this, PM); // bound to this target machine and PM
3090 +}
3091 +
3092 +/// Adds the IR level control flow passes for generations after HD6XXX only.
3093 +bool AMDGPUPassConfig::addPreISel() {
3094 +  const AMDGPUSubtarget &Subtarget = TM->getSubtarget<AMDGPUSubtarget>();
3095 +  if (Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX)
3096 +    return false;
3097 +  addPass(createAMDGPUStructurizeCFGPass());
3098 +  addPass(createSIAnnotateControlFlowPass());
3099 +  return false;
3100 +}
3101 +
3102 +bool AMDGPUPassConfig::addInstSelector() {
3103 +  addPass(createAMDGPUPeepholeOpt(*TM)); // added ahead of the ISel DAG pass
3104 +  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
3105 +  return false;
3106 +}
3107 +
3108 +bool AMDGPUPassConfig::addPreRegAlloc() {
3109 +  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
3110 +  // The interp register pass is only added for generations after HD6XXX
3111 +  if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
3112 +    addPass(createSIAssignInterpRegsPass(*TM));
3113 +  }
3114 +  addPass(createAMDGPUConvertToISAPass(*TM));
3115 +  return false;
3116 +}
3117 +
3118 +bool AMDGPUPassConfig::addPostRegAlloc() {
3119 +  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
3120 +  // The wait insertion pass is only added for generations after HD6XXX
3121 +  if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
3122 +    addPass(createSIInsertWaits(*TM));
3123 +  }
3124 +  return false;
3125 +}
3126 +
3127 +bool AMDGPUPassConfig::addPreSched2() {
3128 +  // Adds the pass identified by IfConverterID for every generation
3129 +  addPass(&IfConverterID);
3130 +  return false;
3131 +}
3132 +
3133 +bool AMDGPUPassConfig::addPreEmitPass() {
3134 +  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
3135 +  if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
3136 +    addPass(createAMDGPUCFGPreparationPass(*TM));
3137 +    addPass(createAMDGPUCFGStructurizerPass(*TM));
3138 +    addPass(createR600ExpandSpecialInstrsPass(*TM));
3139 +    addPass(createR600LowerConstCopy(*TM));
3140 +    addPass(&FinalizeMachineBundlesID);
3141 +  } else { // generations after HD6XXX
3142 +    addPass(createSILowerLiteralConstantsPass(*TM));
3143 +    addPass(createSILowerControlFlowPass(*TM));
3144 +  }
3145 +  // The pass lists of the two branches do not overlap
3146 +  return false;
3147 +}
3148 +
3149 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUTargetMachine.h llvm-r600/lib/Target/R600/AMDGPUTargetMachine.h
3150 --- llvm-3.2.src/lib/Target/R600/AMDGPUTargetMachine.h  1970-01-01 01:00:00.000000000 +0100
3151 +++ llvm-r600/lib/Target/R600/AMDGPUTargetMachine.h     2013-01-25 19:43:57.433383055 +0100
3152 @@ -0,0 +1,70 @@
3153 +//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===//
3154 +//
3155 +//                     The LLVM Compiler Infrastructure
3156 +//
3157 +// This file is distributed under the University of Illinois Open Source
3158 +// License. See LICENSE.TXT for details.
3159 +//
3160 +//===----------------------------------------------------------------------===//
3161 +//
3162 +/// \file
3163 +/// \brief AMDGPU TargetMachine interface definition for hw codegen targets.
3164 +//
3165 +//===----------------------------------------------------------------------===//
3166 +
3167 +#ifndef AMDGPU_TARGET_MACHINE_H
3168 +#define AMDGPU_TARGET_MACHINE_H
3169 +
3170 +#include "AMDGPUInstrInfo.h"
3171 +#include "AMDGPUSubtarget.h"
3172 +#include "AMDILFrameLowering.h"
3173 +#include "AMDILIntrinsicInfo.h"
3174 +#include "R600ISelLowering.h"
3175 +#include "llvm/ADT/OwningPtr.h"
3176 +#include "llvm/DataLayout.h"
3177 +
3178 +namespace llvm {
3179 +// Declaration only; the definition is not part of this header.
3180 +MCAsmInfo* createMCAsmInfo(const Target &T, StringRef TT);
3181 +
3182 +class AMDGPUTargetMachine : public LLVMTargetMachine {
3183 +  // NOTE(review): InstrInfo/TLInfo/InstrItins are raw pointers -- confirm ownership.
3184 +  AMDGPUSubtarget Subtarget;
3185 +  const DataLayout Layout;
3186 +  AMDGPUFrameLowering FrameLowering;
3187 +  AMDGPUIntrinsicInfo IntrinsicInfo;
3188 +  const AMDGPUInstrInfo * InstrInfo;
3189 +  AMDGPUTargetLowering * TLInfo;
3190 +  const InstrItineraryData* InstrItins;
3191 +
3192 +public:
3193 +   AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS,
3194 +                       StringRef CPU,
3195 +                       TargetOptions Options,
3196 +                       Reloc::Model RM, CodeModel::Model CM,
3197 +                       CodeGenOpt::Level OL);
3198 +   ~AMDGPUTargetMachine();
3199 +   virtual const AMDGPUFrameLowering* getFrameLowering() const {
3200 +     return &FrameLowering;
3201 +   }
3202 +   virtual const AMDGPUIntrinsicInfo* getIntrinsicInfo() const {
3203 +     return &IntrinsicInfo;
3204 +   }
3205 +   virtual const AMDGPUInstrInfo *getInstrInfo() const {return InstrInfo;}
3206 +   virtual const AMDGPUSubtarget *getSubtargetImpl() const {return &Subtarget; }
3207 +   virtual const AMDGPURegisterInfo *getRegisterInfo() const {
3208 +      return &InstrInfo->getRegisterInfo();
3209 +   }
3210 +   virtual AMDGPUTargetLowering * getTargetLowering() const {
3211 +      return TLInfo;
3212 +   }
3213 +   virtual const InstrItineraryData* getInstrItineraryData() const {
3214 +      return InstrItins;
3215 +   }
3216 +   virtual const DataLayout* getDataLayout() const { return &Layout; }
3217 +   virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
3218 +};
3219 +
3220 +} // End namespace llvm
3221 +
3222 +#endif // AMDGPU_TARGET_MACHINE_H
3223 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPU.td llvm-r600/lib/Target/R600/AMDGPU.td
3224 --- llvm-3.2.src/lib/Target/R600/AMDGPU.td      1970-01-01 01:00:00.000000000 +0100
3225 +++ llvm-r600/lib/Target/R600/AMDGPU.td 2013-01-25 19:43:57.423383055 +0100
3226 @@ -0,0 +1,40 @@
3227 +//===-- AMDGPU.td - AMDGPU Tablegen files --*- tablegen -*-----------------===//
3228 +//
3229 +//                     The LLVM Compiler Infrastructure
3230 +//
3231 +// This file is distributed under the University of Illinois Open Source
3232 +// License. See LICENSE.TXT for details.
3233 +//
3234 +//==-----------------------------------------------------------------------===//
3235 +
3236 +// Include AMDIL TD files
3237 +include "AMDILBase.td"
3238 +
3239 +
3240 +def AMDGPUInstrInfo : InstrInfo {
3241 +  let guessInstructionProperties = 1;
3242 +}
3243 +
3244 +//===----------------------------------------------------------------------===//
3245 +// Declare the target which we are implementing
3246 +//===----------------------------------------------------------------------===//
3247 +def AMDGPUAsmWriter : AsmWriter {
3248 +    string AsmWriterClassName = "InstPrinter";
3249 +    int Variant = 0;
3250 +    bit isMCAsmWriter = 1;
3251 +}
3252 +
3253 +def AMDGPU : Target {
3254 +  // Wire up the instruction set and assembly writer declared above:
3255 +  let InstructionSet = AMDGPUInstrInfo;
3256 +  let AssemblyWriters = [AMDGPUAsmWriter];
3257 +}
3258 +
3259 +// Include AMDGPU TD files
3260 +include "R600Schedule.td"
3261 +include "SISchedule.td"
3262 +include "Processors.td"
3263 +include "AMDGPUInstrInfo.td"
3264 +include "AMDGPUIntrinsics.td"
3265 +include "AMDGPURegisterInfo.td"
3266 +include "AMDGPUInstructions.td"
3267 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDIL7XXDevice.cpp llvm-r600/lib/Target/R600/AMDIL7XXDevice.cpp
3268 --- llvm-3.2.src/lib/Target/R600/AMDIL7XXDevice.cpp     1970-01-01 01:00:00.000000000 +0100
3269 +++ llvm-r600/lib/Target/R600/AMDIL7XXDevice.cpp        2013-01-25 19:43:57.433383055 +0100
3270 @@ -0,0 +1,115 @@
3271 +//===-- AMDIL7XXDevice.cpp - Device Info for 7XX GPUs ---------------------===//
3272 +//
3273 +//                     The LLVM Compiler Infrastructure
3274 +//
3275 +// This file is distributed under the University of Illinois Open Source
3276 +// License. See LICENSE.TXT for details.
3277 +//
3278 +// \file
3279 +//==-----------------------------------------------------------------------===//
3280 +#include "AMDIL7XXDevice.h"
3281 +#include "AMDGPUSubtarget.h"
3282 +#include "AMDILDevice.h"
3283 +
3284 +using namespace llvm;
3285 +
3286 +AMDGPU7XXDevice::AMDGPU7XXDevice(AMDGPUSubtarget *ST) : AMDGPUDevice(ST) {
3287 +  setCaps(); // Virtual, but inside a constructor this binds to AMDGPU7XXDevice::setCaps().
3288 +  std::string name = mSTM->getDeviceName();
3289 +  if (name == "rv710") {
3290 +    DeviceFlag = OCL_DEVICE_RV710;
3291 +  } else if (name == "rv730") {
3292 +    DeviceFlag = OCL_DEVICE_RV730;
3293 +  } else { // Every other device name falls back to the RV770 flag.
3294 +    DeviceFlag = OCL_DEVICE_RV770;
3295 +  }
3296 +}
3297 +
3298 +AMDGPU7XXDevice::~AMDGPU7XXDevice() {
3299 +}
3300 +
3301 +void AMDGPU7XXDevice::setCaps() {
3302 +  mSWBits.set(AMDGPUDeviceInfo::LocalMem); // LocalMem goes in mSWBits only; mHWBits is untouched here.
3303 +}
3304 +
3305 +size_t AMDGPU7XXDevice::getMaxLDSSize() const {
3306 +  if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
3307 +    return MAX_LDS_SIZE_700;
3308 +  }
3309 +  return 0; // usesHardware(LocalMem) was false: report no LDS.
3310 +}
3311 +
3312 +size_t AMDGPU7XXDevice::getWavefrontSize() const {
3313 +  return AMDGPUDevice::HalfWavefrontSize;
3314 +}
3315 +
3316 +uint32_t AMDGPU7XXDevice::getGeneration() const {
3317 +  return AMDGPUDeviceInfo::HD4XXX;
3318 +}
3319 +
3320 +uint32_t AMDGPU7XXDevice::getResourceID(uint32_t DeviceID) const {
3321 +  switch (DeviceID) {
3322 +  default:
3323 +    assert(0 && "ID type passed in is unknown!");
3324 +    break;
3325 +  case GLOBAL_ID:
3326 +  case CONSTANT_ID:
3327 +  case RAW_UAV_ID:
3328 +  case ARENA_UAV_ID:
3329 +    break; // These four IDs all end at the final `return 0`.
3330 +  case LDS_ID:
3331 +    if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
3332 +      return DEFAULT_LDS_ID;
3333 +    }
3334 +    break;
3335 +  case SCRATCH_ID:
3336 +    if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) {
3337 +      return DEFAULT_SCRATCH_ID;
3338 +    }
3339 +    break;
3340 +  case GDS_ID:
3341 +    assert(0 && "GDS UAV ID is not supported on this chip"); // With NDEBUG the code below still runs.
3342 +    if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
3343 +      return DEFAULT_GDS_ID;
3344 +    }
3345 +    break;
3346 +  };
3347 +  // Any path that did not return inside the switch yields resource ID 0.
3348 +  return 0;
3349 +}
3350 +
3351 +uint32_t AMDGPU7XXDevice::getMaxNumUAVs() const {
3352 +  return 1;
3353 +}
3354 +
3355 +AMDGPU770Device::AMDGPU770Device(AMDGPUSubtarget *ST): AMDGPU7XXDevice(ST) {
3356 +  setCaps(); // Resolves to AMDGPU770Device::setCaps(); the base constructor already ran its own.
3357 +}
3358 +
3359 +AMDGPU770Device::~AMDGPU770Device() {
3360 +}
3361 +
3362 +void AMDGPU770Device::setCaps() {
3363 +  if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) { // Only when DoubleOps is overridden on the subtarget.
3364 +    mSWBits.set(AMDGPUDeviceInfo::FMA);
3365 +    mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
3366 +  }
3367 +  mSWBits.set(AMDGPUDeviceInfo::BarrierDetect);
3368 +  mHWBits.reset(AMDGPUDeviceInfo::LongOps); // LongOps is cleared in mHWBits, then set in mSWBits.
3369 +  mSWBits.set(AMDGPUDeviceInfo::LongOps);
3370 +  mSWBits.set(AMDGPUDeviceInfo::LocalMem); // Also set earlier by AMDGPU7XXDevice's constructor.
3371 +}
3372 +
3373 +size_t AMDGPU770Device::getWavefrontSize() const {
3374 +  return AMDGPUDevice::WavefrontSize;
3375 +}
3376 +
3377 +AMDGPU710Device::AMDGPU710Device(AMDGPUSubtarget *ST) : AMDGPU7XXDevice(ST) {
3378 +}
3379 +
3380 +AMDGPU710Device::~AMDGPU710Device() {
3381 +}
3382 +
3383 +size_t AMDGPU710Device::getWavefrontSize() const {
3384 +  return AMDGPUDevice::QuarterWavefrontSize;
3385 +}
3386 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDIL7XXDevice.h llvm-r600/lib/Target/R600/AMDIL7XXDevice.h
3387 --- llvm-3.2.src/lib/Target/R600/AMDIL7XXDevice.h       1970-01-01 01:00:00.000000000 +0100
3388 +++ llvm-r600/lib/Target/R600/AMDIL7XXDevice.h  2013-01-25 19:43:57.436716388 +0100
3389 @@ -0,0 +1,72 @@
3390 +//==-- AMDIL7XXDevice.h - Define 7XX Device for AMDIL ----------*- C++ -*--===//
3391 +//
3392 +//                     The LLVM Compiler Infrastructure
3393 +//
3394 +// This file is distributed under the University of Illinois Open Source
3395 +// License. See LICENSE.TXT for details.
3396 +//
3397 +//==-----------------------------------------------------------------------===//
3398 +/// \file
3399 +/// \brief Interface for the subtarget data classes.
3400 +///
3401 +/// This file will define the interface that each generation needs to
3402 +/// implement in order to correctly answer queries on the capabilities of the
3403 +/// specific hardware.
3404 +//===----------------------------------------------------------------------===//
3405 +#ifndef AMDIL7XXDEVICEIMPL_H
3406 +#define AMDIL7XXDEVICEIMPL_H
3407 +#include "AMDILDevice.h"
3408 +
3409 +namespace llvm {
3410 +class AMDGPUSubtarget;
3411 +
3412 +//===----------------------------------------------------------------------===//
3413 +// 7XX generation of devices and their respective sub classes
3414 +//===----------------------------------------------------------------------===//
3415 +
3416 +/// \brief The AMDGPU7XXDevice class represents the generic 7XX device.
3417 +///
3418 +/// All 7XX devices are derived from this class. The AMDGPU7XX device will only
3419 +/// support the minimal features that are required to be considered OpenCL 1.0
3420 +/// compliant and nothing more.
3421 +class AMDGPU7XXDevice : public AMDGPUDevice {
3422 +public:
3423 +  AMDGPU7XXDevice(AMDGPUSubtarget *ST);
3424 +  virtual ~AMDGPU7XXDevice();
3425 +  virtual size_t getMaxLDSSize() const;
3426 +  virtual size_t getWavefrontSize() const;
3427 +  virtual uint32_t getGeneration() const;
3428 +  virtual uint32_t getResourceID(uint32_t DeviceID) const;
3429 +  virtual uint32_t getMaxNumUAVs() const;
3430 +
3431 +protected:
3432 +  virtual void setCaps(); // Called from the constructor to fill in capability bits.
3433 +};
3434 +
3435 +/// \brief The AMDGPU770Device class represents the RV770 chip and its
3436 +/// derivative cards.
3437 +///
3438 +/// The difference between this device and the base class is that this device
3439 +/// adds support for double precision and has a larger wavefront size.
3440 +class AMDGPU770Device : public AMDGPU7XXDevice {
3441 +public:
3442 +  AMDGPU770Device(AMDGPUSubtarget *ST);
3443 +  virtual ~AMDGPU770Device();
3444 +  virtual size_t getWavefrontSize() const;
3445 +private:
3446 +  virtual void setCaps();
3447 +};
3448 +
3449 +/// \brief The AMDGPU710Device class derives from the 7XX base class.
3450 +///
3451 +/// This class is a smaller derivative, so we need to override some of the
3452 +/// functions in order to correctly specify this information.
3453 +class AMDGPU710Device : public AMDGPU7XXDevice {
3454 +public:
3455 +  AMDGPU710Device(AMDGPUSubtarget *ST);
3456 +  virtual ~AMDGPU710Device();
3457 +  virtual size_t getWavefrontSize() const;
3458 +};
3459 +
3460 +} // namespace llvm
3461 +#endif // AMDIL7XXDEVICEIMPL_H
3462 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILBase.td llvm-r600/lib/Target/R600/AMDILBase.td
3463 --- llvm-3.2.src/lib/Target/R600/AMDILBase.td   1970-01-01 01:00:00.000000000 +0100
3464 +++ llvm-r600/lib/Target/R600/AMDILBase.td      2013-01-25 19:43:57.436716388 +0100
3465 @@ -0,0 +1,85 @@
3466 +//===- AMDILBase.td - AMDIL Target Machine ---------*- tablegen -*-===//
3467 +//
3468 +//                     The LLVM Compiler Infrastructure
3469 +//
3470 +// This file is distributed under the University of Illinois Open Source
3471 +// License. See LICENSE.TXT for details.
3472 +//
3473 +//===----------------------------------------------------------------------===//
3474 +// Target-independent interfaces which we are implementing
3475 +//===----------------------------------------------------------------------===//
3476 +
3477 +include "llvm/Target/Target.td"
3478 +
3479 +// Dummy Instruction itineraries for pseudo instructions
3480 +def ALU_NULL : FuncUnit;
3481 +def NullALU : InstrItinClass;
3482 +
3483 +//===----------------------------------------------------------------------===//
3484 +// AMDIL Subtarget features.
3485 +//===----------------------------------------------------------------------===//
3486 +def FeatureFP64     : SubtargetFeature<"fp64",
3487 +        "CapsOverride[AMDGPUDeviceInfo::DoubleOps]",
3488 +        "true",
3489 +        "Enable 64bit double precision operations">;
3490 +def FeatureByteAddress    : SubtargetFeature<"byte_addressable_store",
3491 +        "CapsOverride[AMDGPUDeviceInfo::ByteStores]",
3492 +        "true",
3493 +        "Enable byte addressable stores">;
3494 +def FeatureBarrierDetect : SubtargetFeature<"barrier_detect",
3495 +        "CapsOverride[AMDGPUDeviceInfo::BarrierDetect]",
3496 +        "true",
3497 +        "Enable duplicate barrier detection(HD5XXX or later).">;
3498 +def FeatureImages : SubtargetFeature<"images",
3499 +        "CapsOverride[AMDGPUDeviceInfo::Images]",
3500 +        "true",
3501 +        "Enable image functions">;
3502 +def FeatureMultiUAV : SubtargetFeature<"multi_uav",
3503 +        "CapsOverride[AMDGPUDeviceInfo::MultiUAV]",
3504 +        "true",
3505 +        "Generate multiple UAV code(HD5XXX family or later)">;
3506 +def FeatureMacroDB : SubtargetFeature<"macrodb",
3507 +        "CapsOverride[AMDGPUDeviceInfo::MacroDB]",
3508 +        "true",
3509 +        "Use internal macrodb, instead of macrodb in driver">;
3510 +def FeatureNoAlias : SubtargetFeature<"noalias",
3511 +        "CapsOverride[AMDGPUDeviceInfo::NoAlias]",
3512 +        "true",
3513 +        "assert that all kernel argument pointers are not aliased">;
3514 +def FeatureNoInline : SubtargetFeature<"no-inline",
3515 +        "CapsOverride[AMDGPUDeviceInfo::NoInline]",
3516 +        "true",
3517 +        "specify whether to not inline functions">;
3518 +
3519 +def Feature64BitPtr : SubtargetFeature<"64BitPtr",
3520 +        "Is64bit",
3521 +        "false", // NOTE(review): turning the feature on stores false in Is64bit -- confirm intent.
3522 +        "Specify if 64bit addressing should be used.">;
3523 +
3524 +def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr", // NOTE(review): def says 32on64, feature string says 64on32 -- confirm.
3525 +        "Is32on64bit",
3526 +        "false",
3527 +        "Specify if 64bit sized pointers with 32bit addressing should be used.">;
3528 +def FeatureDebug : SubtargetFeature<"debug",
3529 +        "CapsOverride[AMDGPUDeviceInfo::Debug]",
3530 +        "true",
3531 +        "Debug mode is enabled, so disable hardware accelerated address spaces.">;
3532 +def FeatureDumpCode : SubtargetFeature <"DumpCode",
3533 +        "DumpCode",
3534 +        "true",
3535 +        "Dump MachineInstrs in the CodeEmitter">;
3536 +
3537 +def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
3538 +        "R600ALUInst",
3539 +        "false",
3540 +        "Older version of ALU instructions encoding.">;
3541 +
3542 +
3543 +//===----------------------------------------------------------------------===//
3544 +// Register File, Calling Conv, Instruction Descriptions
3545 +//===----------------------------------------------------------------------===//
3546 +
3547 +
3548 +include "AMDILRegisterInfo.td"
3549 +include "AMDILInstrInfo.td"
3550 +
3551 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILCFGStructurizer.cpp llvm-r600/lib/Target/R600/AMDILCFGStructurizer.cpp
3552 --- llvm-3.2.src/lib/Target/R600/AMDILCFGStructurizer.cpp       1970-01-01 01:00:00.000000000 +0100
3553 +++ llvm-r600/lib/Target/R600/AMDILCFGStructurizer.cpp  2013-01-25 19:43:57.436716388 +0100
3554 @@ -0,0 +1,3045 @@
3555 +//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===//
3556 +//
3557 +//                     The LLVM Compiler Infrastructure
3558 +//
3559 +// This file is distributed under the University of Illinois Open Source
3560 +// License. See LICENSE.TXT for details.
3561 +//
3562 +/// \file
3563 +//==-----------------------------------------------------------------------===//
3564 +
3565 +#define DEBUGME 0
3566 +#define DEBUG_TYPE "structcfg"
3567 +
3568 +#include "AMDGPUInstrInfo.h"
3569 +#include "AMDIL.h"
3570 +#include "llvm/ADT/SCCIterator.h"
3571 +#include "llvm/ADT/SmallVector.h"
3572 +#include "llvm/ADT/Statistic.h"
3573 +#include "llvm/Analysis/DominatorInternals.h"
3574 +#include "llvm/Analysis/Dominators.h"
3575 +#include "llvm/CodeGen/MachinePostDominators.h"
3576 +#include "llvm/CodeGen/MachineDominators.h"
3577 +#include "llvm/CodeGen/MachineFunction.h"
3578 +#include "llvm/CodeGen/MachineFunctionAnalysis.h"
3579 +#include "llvm/CodeGen/MachineFunctionPass.h"
3580 +#include "llvm/CodeGen/MachineInstrBuilder.h"
3581 +#include "llvm/CodeGen/MachineJumpTableInfo.h"
3582 +#include "llvm/CodeGen/MachineLoopInfo.h"
3583 +#include "llvm/CodeGen/MachineRegisterInfo.h"
3584 +#include "llvm/Target/TargetInstrInfo.h"
3585 +
3586 +using namespace llvm;
3587 +
3588 +// TODO: move-begin.
3589 +
3590 +//===----------------------------------------------------------------------===//
3591 +//
3592 +// Statistics for CFGStructurizer.
3593 +//
3594 +//===----------------------------------------------------------------------===//
3595 +
3596 +STATISTIC(numSerialPatternMatch,    "CFGStructurizer number of serial pattern "
3597 +    "matched");
3598 +STATISTIC(numIfPatternMatch,        "CFGStructurizer number of if pattern "
3599 +    "matched");
3600 +STATISTIC(numLoopbreakPatternMatch, "CFGStructurizer number of loop-break "
3601 +    "pattern matched");
3602 +STATISTIC(numLoopcontPatternMatch,  "CFGStructurizer number of loop-continue "
3603 +    "pattern matched");
3604 +STATISTIC(numLoopPatternMatch,      "CFGStructurizer number of loop pattern "
3605 +    "matched");
3606 +STATISTIC(numClonedBlock,           "CFGStructurizer cloned blocks");
3607 +STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
3608 +
3609 +//===----------------------------------------------------------------------===//
3610 +//
3611 +// Miscellaneous utility for CFGStructurizer.
3612 +//
3613 +//===----------------------------------------------------------------------===//
3614 +namespace llvmCFGStruct {
3615 +#define SHOWNEWINSTR(i) \
3616 +  if (DEBUGME) errs() << "New instr: " << *i << "\n"
3617 +
3618 +#define SHOWNEWBLK(b, msg) \
3619 +if (DEBUGME) { \
3620 +  errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
3621 +  errs() << "\n"; \
3622 +}
3623 +
3624 +#define SHOWBLK_DETAIL(b, msg) \
3625 +if (DEBUGME) { \
3626 +  if (b) { \
3627 +  errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
3628 +  b->print(errs()); \
3629 +  errs() << "\n"; \
3630 +  } \
3631 +}
3632 +
3633 +#define INVALIDSCCNUM -1
3634 +#define INVALIDREGNUM 0
3635 +
3636 +template<class LoopinfoT>
3637 +void PrintLoopinfo(const LoopinfoT &LoopInfo, llvm::raw_ostream &OS) {
3638 +  // Dump every entry of LoopInfo to OS, passing 0 as print()'s second argument.
3639 +  typename LoopinfoT::iterator It = LoopInfo.begin();
3640 +  typename LoopinfoT::iterator End = LoopInfo.end();
3641 +  for (; It != End; ++It)
3642 +    (*It)->print(OS, 0);
3643 +}
3644 +
3645 +template<class NodeT>
3646 +void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
3647 +  // Swap mirrored slots in place, closing in from both ends.
3648 +  for (size_t lo = 0, hi = Src.size(); lo + 1 < hi; ++lo) {
3649 +    NodeT *held = Src[lo];
3650 +    Src[lo] = Src[--hi];
3651 +    Src[hi] = held;
3652 +  }
3653 +}
3654 +
3655 +} //end namespace llvmCFGStruct
3656 +
3657 +//===----------------------------------------------------------------------===//
3658 +//
3659 +// supporting data structure for CFGStructurizer
3660 +//
3661 +//===----------------------------------------------------------------------===//
3662 +
3663 +namespace llvmCFGStruct {
3664 +template<class PassT>
3665 +struct CFGStructTraits {
3666 +};
3667 +
3668 +template <class InstrT>
3669 +class BlockInformation {
3670 +public:
3671 +  bool isRetired;
3672 +  int  sccNum;
3673 +  //SmallVector<InstrT*, DEFAULT_VEC_SLOTS> succInstr;
3674 +  //Instructions defining the corresponding successor.
3675 +  BlockInformation() : isRetired(false), sccNum(INVALIDSCCNUM) {}
3676 +};
3677 +
3678 +template <class BlockT, class InstrT, class RegiT>
3679 +class LandInformation {
3680 +public:
3681 +  BlockT *landBlk;
3682 +  std::set<RegiT> breakInitRegs;  //Registers that need to "reg = 0", before
3683 +                                  //WHILELOOP(thisloop) init before entering
3684 +                                  //thisloop.
3685 +  std::set<RegiT> contInitRegs;   //Registers that need to "reg = 0", after
3686 +                                  //WHILELOOP(thisloop) init after entering
3687 +                                  //thisloop.
3688 +  std::set<RegiT> endbranchInitRegs; //Init before entering this loop, at loop
3689 +                                     //land block, branch cond on this reg.
3690 +  std::set<RegiT> breakOnRegs;       //registers that need to "if (reg) break
3691 +                                     //endif" after ENDLOOP(thisloop) break
3692 +                                     //outerLoopOf(thisLoop).
3693 +  std::set<RegiT> contOnRegs;       //registers that need to "if (reg) continue
3694 +                                    //endif" after ENDLOOP(thisloop) continue on
3695 +                                    //outerLoopOf(thisLoop).
3696 +  LandInformation() : landBlk(NULL) {}
3697 +};
3698 +
3699 +} //end of namespace llvmCFGStruct
3700 +
3701 +//===----------------------------------------------------------------------===//
3702 +//
3703 +// CFGStructurizer
3704 +//
3705 +//===----------------------------------------------------------------------===//
3706 +
3707 +namespace llvmCFGStruct {
3708 +// bixia TODO: port it to BasicBlock, not just MachineBasicBlock.
3709 +template<class PassT>
3710 +class  CFGStructurizer {
3711 +public:
3712 +  typedef enum {
3713 +    Not_SinglePath = 0,
3714 +    SinglePath_InPath = 1,
3715 +    SinglePath_NotInPath = 2
3716 +  } PathToKind;
3717 +
3718 +public:
3719 +  typedef typename PassT::InstructionType         InstrT;
3720 +  typedef typename PassT::FunctionType            FuncT;
3721 +  typedef typename PassT::DominatortreeType       DomTreeT;
3722 +  typedef typename PassT::PostDominatortreeType   PostDomTreeT;
3723 +  typedef typename PassT::DomTreeNodeType         DomTreeNodeT;
3724 +  typedef typename PassT::LoopinfoType            LoopInfoT;
3725 +
3726 +  typedef GraphTraits<FuncT *>                    FuncGTraits;
3727 +  //typedef FuncGTraits::nodes_iterator BlockIterator;
3728 +  typedef typename FuncT::iterator                BlockIterator;
3729 +
3730 +  typedef typename FuncGTraits::NodeType          BlockT;
3731 +  typedef GraphTraits<BlockT *>                   BlockGTraits;
3732 +  typedef GraphTraits<Inverse<BlockT *> >         InvBlockGTraits;
3733 +  //typedef BlockGTraits::succ_iterator InstructionIterator;
3734 +  typedef typename BlockT::iterator               InstrIterator;
3735 +
3736 +  typedef CFGStructTraits<PassT>                  CFGTraits;
3737 +  typedef BlockInformation<InstrT>                BlockInfo;
3738 +  typedef std::map<BlockT *, BlockInfo *>         BlockInfoMap;
3739 +
3740 +  typedef int                                     RegiT;
3741 +  typedef typename PassT::LoopType                LoopT;
3742 +  typedef LandInformation<BlockT, InstrT, RegiT>  LoopLandInfo;
3743 +        typedef std::map<LoopT *, LoopLandInfo *> LoopLandInfoMap;
3744 +        //landing info for loop break
3745 +  typedef SmallVector<BlockT *, 32>               BlockTSmallerVector;
3746 +
3747 +public:
3748 +  CFGStructurizer();
3749 +  ~CFGStructurizer();
3750 +
3751 +  /// Perform the CFG structurization
3752 +  bool run(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri);
3753 +
3754 +  /// Perform the CFG preparation
3755 +  bool prepare(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri);
3756 +
3757 +private:
3758 +  void reversePredicateSetter(typename BlockT::iterator);
3759 +  void   orderBlocks();
3760 +  void   printOrderedBlocks(llvm::raw_ostream &OS);
3761 +  int patternMatch(BlockT *CurBlock);
3762 +  int patternMatchGroup(BlockT *CurBlock);
3763 +
3764 +  int serialPatternMatch(BlockT *CurBlock);
3765 +  int ifPatternMatch(BlockT *CurBlock);
3766 +  int switchPatternMatch(BlockT *CurBlock);
3767 +  int loopendPatternMatch(BlockT *CurBlock);
3768 +  int loopPatternMatch(BlockT *CurBlock);
3769 +
3770 +  int loopbreakPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
3771 +  int loopcontPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
3772 +  //int loopWithoutBreak(BlockT *);
3773 +
3774 +  void handleLoopbreak (BlockT *ExitingBlock, LoopT *ExitingLoop,
3775 +                        BlockT *ExitBlock, LoopT *exitLoop, BlockT *landBlock);
3776 +  void handleLoopcontBlock(BlockT *ContingBlock, LoopT *contingLoop,
3777 +                           BlockT *ContBlock, LoopT *contLoop);
3778 +  bool isSameloopDetachedContbreak(BlockT *Src1Block, BlockT *Src2Block);
3779 +  int handleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
3780 +                       BlockT *FalseBlock);
3781 +  int handleJumpintoIfImp(BlockT *HeadBlock, BlockT *TrueBlock,
3782 +                          BlockT *FalseBlock);
3783 +  int improveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
3784 +                              BlockT *FalseBlock, BlockT **LandBlockPtr);
3785 +  void showImproveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
3786 +                                   BlockT *FalseBlock, BlockT *LandBlock,
3787 +                                   bool Detail = false);
3788 +  PathToKind singlePathTo(BlockT *SrcBlock, BlockT *DstBlock,
3789 +                          bool AllowSideEntry = true);
3790 +  BlockT *singlePathEnd(BlockT *srcBlock, BlockT *DstBlock,
3791 +                        bool AllowSideEntry = true);
3792 +  int cloneOnSideEntryTo(BlockT *PreBlock, BlockT *SrcBlock, BlockT *DstBlock);
3793 +  void mergeSerialBlock(BlockT *DstBlock, BlockT *srcBlock);
3794 +
3795 +  void mergeIfthenelseBlock(InstrT *BranchInstr, BlockT *CurBlock,
3796 +                            BlockT *TrueBlock, BlockT *FalseBlock,
3797 +                            BlockT *LandBlock);
3798 +  void mergeLooplandBlock(BlockT *DstBlock, LoopLandInfo *LoopLand);
3799 +  void mergeLoopbreakBlock(BlockT *ExitingBlock, BlockT *ExitBlock,
3800 +                           BlockT *ExitLandBlock, RegiT SetReg);
3801 +  void settleLoopcontBlock(BlockT *ContingBlock, BlockT *ContBlock,
3802 +                           RegiT SetReg);
3803 +  BlockT *relocateLoopcontBlock(LoopT *ParentLoopRep, LoopT *LoopRep,
3804 +                                std::set<BlockT*> &ExitBlockSet,
3805 +                                BlockT *ExitLandBlk);
3806 +  BlockT *addLoopEndbranchBlock(LoopT *LoopRep,
3807 +                                BlockTSmallerVector &ExitingBlocks,
3808 +                                BlockTSmallerVector &ExitBlocks);
3809 +  BlockT *normalizeInfiniteLoopExit(LoopT *LoopRep);
3810 +  void removeUnconditionalBranch(BlockT *SrcBlock);
3811 +  void removeRedundantConditionalBranch(BlockT *SrcBlock);
3812 +  void addDummyExitBlock(SmallVector<BlockT *, DEFAULT_VEC_SLOTS> &RetBlocks);
3813 +
3814 +  void removeSuccessor(BlockT *SrcBlock);
3815 +  BlockT *cloneBlockForPredecessor(BlockT *CurBlock, BlockT *PredBlock);
3816 +  BlockT *exitingBlock2ExitBlock (LoopT *LoopRep, BlockT *exitingBlock);
3817 +
3818 +  void migrateInstruction(BlockT *SrcBlock, BlockT *DstBlock,
3819 +                          InstrIterator InsertPos);
3820 +
3821 +  void recordSccnum(BlockT *SrcBlock, int SCCNum);
3822 +  int getSCCNum(BlockT *srcBlk);
3823 +
3824 +  void retireBlock(BlockT *DstBlock, BlockT *SrcBlock);
3825 +  bool isRetiredBlock(BlockT *SrcBlock);
3826 +  bool isActiveLoophead(BlockT *CurBlock);
3827 +  bool needMigrateBlock(BlockT *Block);
3828 +
3829 +  BlockT *recordLoopLandBlock(LoopT *LoopRep, BlockT *LandBlock,
3830 +                              BlockTSmallerVector &exitBlocks,
3831 +                              std::set<BlockT*> &ExitBlockSet);
3832 +  void setLoopLandBlock(LoopT *LoopRep, BlockT *Block = NULL);
3833 +  BlockT *getLoopLandBlock(LoopT *LoopRep);
3834 +  LoopLandInfo *getLoopLandInfo(LoopT *LoopRep);
3835 +
3836 +  void addLoopBreakOnReg(LoopT *LoopRep, RegiT RegNum);
3837 +  void addLoopContOnReg(LoopT *LoopRep, RegiT RegNum);
3838 +  void addLoopBreakInitReg(LoopT *LoopRep, RegiT RegNum);
3839 +  void addLoopContInitReg(LoopT *LoopRep, RegiT RegNum);
3840 +  void addLoopEndbranchInitReg(LoopT *LoopRep, RegiT RegNum);
3841 +
3842 +  bool hasBackEdge(BlockT *curBlock);
3843 +  unsigned getLoopDepth  (LoopT *LoopRep);
3844 +  int countActiveBlock(
3845 +    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterStart,
3846 +    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterEnd);
3847 +    BlockT *findNearestCommonPostDom(std::set<BlockT *>&);
3848 +  BlockT *findNearestCommonPostDom(BlockT *Block1, BlockT *Block2);
3849 +
3850 +private:
3851 +  DomTreeT *domTree;
3852 +  PostDomTreeT *postDomTree;
3853 +  LoopInfoT *loopInfo;
3854 +  PassT *passRep;
3855 +  FuncT *funcRep;
3856 +
3857 +  BlockInfoMap blockInfoMap;
3858 +  LoopLandInfoMap loopLandInfoMap;
3859 +  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> orderedBlks;
3860 +  const AMDGPURegisterInfo *TRI;
3861 +
3862 +};  //template class CFGStructurizer
3863 +
3864 +template<class PassT> CFGStructurizer<PassT>::CFGStructurizer()
3865 +  : domTree(NULL), postDomTree(NULL), loopInfo(NULL), passRep(NULL),
3866 +    funcRep(NULL), TRI(NULL) {} // Every pointer member starts null; prepare()/run() assign passRep, funcRep and TRI.
3867 +
3868 +template<class PassT> CFGStructurizer<PassT>::~CFGStructurizer() {
3869 +  // Free each BlockInfo record that blockInfoMap still points to.
3870 +  for (typename BlockInfoMap::iterator It = blockInfoMap.begin();
3871 +       It != blockInfoMap.end(); ++It)
3872 +    delete It->second;
3873 +}
3874 +
3875 +template<class PassT>
3876 +bool CFGStructurizer<PassT>::prepare(FuncT &func, PassT &pass,
3877 +                                     const AMDGPURegisterInfo * tri) {
3878 +  passRep = &pass;
3879 +  funcRep = &func;
3880 +  TRI = tri;
3881 +  // CFG preparation: handle exit-less loops, strip branches, then unify returns.
3882 +  bool changed = false;
3883 +
3884 +  //FIXME: if not reducible flow graph, make it so ???
3885 +
3886 +  if (DEBUGME) {
3887 +        errs() << "AMDGPUCFGStructurizer::prepare\n";
3888 +  }
3889 +
3890 +  loopInfo = CFGTraits::getLoopInfo(pass);
3891 +  if (DEBUGME) {
3892 +    errs() << "LoopInfo:\n";
3893 +    PrintLoopinfo(*loopInfo, errs());
3894 +  }
3895 +
3896 +  orderBlocks();
3897 +  if (DEBUGME) {
3898 +    errs() << "Ordered blocks:\n";
3899 +    printOrderedBlocks(errs());
3900 +  }
3901 +
3902 +  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> retBlks;
3903 +  // Loops with no exiting block go to normalizeInfiniteLoopExit(); a non-null result is queued in retBlks.
3904 +  for (typename LoopInfoT::iterator iter = loopInfo->begin(),
3905 +       iterEnd = loopInfo->end();
3906 +       iter != iterEnd; ++iter) {
3907 +    LoopT* loopRep = (*iter);
3908 +    BlockTSmallerVector exitingBlks;
3909 +    loopRep->getExitingBlocks(exitingBlks);
3910 +
3911 +    if (exitingBlks.size() == 0) {
3912 +      BlockT* dummyExitBlk = normalizeInfiniteLoopExit(loopRep);
3913 +      if (dummyExitBlk != NULL)
3914 +        retBlks.push_back(dummyExitBlk);
3915 +    }
3916 +  }
3917 +
3918 +  // For each ordered block: drop unconditional and redundant conditional
3919 +  // branches, and collect return blocks (dummy exit added below if >= 2).
3920 +
3921 +  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
3922 +       iterBlk = orderedBlks.begin(), iterEndBlk = orderedBlks.end();
3923 +       iterBlk != iterEndBlk;
3924 +       ++iterBlk) {
3925 +    BlockT *curBlk = *iterBlk;
3926 +    removeUnconditionalBranch(curBlk);
3927 +    removeRedundantConditionalBranch(curBlk);
3928 +    if (CFGTraits::isReturnBlock(curBlk)) {
3929 +      retBlks.push_back(curBlk);
3930 +    }
3931 +    assert(curBlk->succ_size() <= 2);
3932 +  } //for
3933 +
3934 +  if (retBlks.size() >= 2) {
3935 +    addDummyExitBlock(retBlks);
3936 +    changed = true;
3937 +  }
3938 +  // NOTE(review): `changed` only reflects addDummyExitBlock(); the helpers above may also edit the CFG -- confirm.
3939 +  return changed;
3940 +} //CFGStructurizer::prepare
3941 +/// Reduces the CFG by repeated pattern matching, SCC by SCC; returns true.
3942 +template<class PassT>
3943 +bool CFGStructurizer<PassT>::run(FuncT &func, PassT &pass,
3944 +    const AMDGPURegisterInfo * tri) {
3945 +  passRep = &pass;
3946 +  funcRep = &func;
3947 +  TRI = tri;
3948 +
3949 +  // The CFG is assumed to be reducible.
3950 +  if (DEBUGME) {
3951 +    errs() << "AMDGPUCFGStructurizer::run\n";
3952 +    func.viewCFG();
3953 +  }
3954 +
3955 +  domTree = CFGTraits::getDominatorTree(pass);
3956 +  if (DEBUGME) {
3957 +    domTree->print(errs(), (const llvm::Module*)0);
3958 +  }
3959 +
3960 +  postDomTree = CFGTraits::getPostDominatorTree(pass);
3961 +  if (DEBUGME) {
3962 +    postDomTree->print(errs());
3963 +  }
3964 +
3965 +  loopInfo = CFGTraits::getLoopInfo(pass);
3966 +  if (DEBUGME) {
3967 +    errs() << "LoopInfo:\n";
3968 +    PrintLoopinfo(*loopInfo, errs());
3969 +  }
3970 +
3971 +  orderBlocks();
3972 +#ifdef STRESSTEST
3973 +  // Reverse the block ordering to stress-test the algorithm.
3974 +  ReverseVector(orderedBlks);
3975 +#endif
3976 +
3977 +  if (DEBUGME) {
3978 +    errs() << "Ordered blocks:\n";
3979 +    printOrderedBlocks(errs());
3980 +  }
3981 +  int numIter = 0;
3982 +  bool finish = false;
3983 +  BlockT *curBlk;
3984 +  bool makeProgress = false;
3985 +  int numRemainedBlk = countActiveBlock(orderedBlks.begin(),
3986 +                                        orderedBlks.end());
3987 +
3988 +  do {
3989 +    ++numIter;
3990 +    if (DEBUGME) {
3991 +      errs() << "numIter = " << numIter
3992 +             << ", numRemaintedBlk = " << numRemainedBlk << "\n";
3993 +    }
3994 +
3995 +    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
3996 +      iterBlk = orderedBlks.begin();
3997 +    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
3998 +      iterBlkEnd = orderedBlks.end();
3999 +
4000 +    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
4001 +      sccBeginIter = iterBlk;
4002 +    BlockT *sccBeginBlk = NULL;
4003 +    int sccNumBlk = 0;  // Active-block bound for the current SCC; reset to
4004 +                        // numRemainedBlk whenever a new SCC starts.
4005 +    int sccNumIter;     // Number of passes made over the current SCC.
4006 +
4007 +    while (iterBlk != iterBlkEnd) {
4008 +      curBlk = *iterBlk;
4009 +
4010 +      if (sccBeginBlk == NULL) {
4011 +        sccBeginIter = iterBlk;
4012 +        sccBeginBlk = curBlk;
4013 +        sccNumIter = 0;
4014 +        sccNumBlk = numRemainedBlk; // Init to maximum possible number.
4015 +        if (DEBUGME) {
4016 +              errs() << "start processing SCC" << getSCCNum(sccBeginBlk);
4017 +              errs() << "\n";
4018 +        }
4019 +      }
4020 +
4021 +      if (!isRetiredBlock(curBlk)) {
4022 +        patternMatch(curBlk);
4023 +      }
4024 +
4025 +      ++iterBlk;
4026 +
4027 +      bool contNextScc = true;
4028 +      if (iterBlk == iterBlkEnd
4029 +          || getSCCNum(sccBeginBlk) != getSCCNum(*iterBlk)) {
4030 +        // Reached the end of the current SCC.
4031 +        ++sccNumIter;
4032 +        int sccRemainedNumBlk = countActiveBlock(sccBeginIter, iterBlk);
4033 +        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= sccNumBlk) {
4034 +          if (DEBUGME) {
4035 +            errs() << "Can't reduce SCC " << getSCCNum(curBlk)
4036 +                   << ", sccNumIter = " << sccNumIter;
4037 +            errs() << "doesn't make any progress\n";
4038 +          }
4039 +          contNextScc = true;
4040 +        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < sccNumBlk) {
4041 +          sccNumBlk = sccRemainedNumBlk;
4042 +          iterBlk = sccBeginIter; // progress was made: rescan this SCC
4043 +          contNextScc = false;
4044 +          if (DEBUGME) {
4045 +            errs() << "repeat processing SCC" << getSCCNum(curBlk)
4046 +                   << "sccNumIter = " << sccNumIter << "\n";
4047 +            func.viewCFG();
4048 +          }
4049 +        } else {
4050 +          // Exactly one active block is left: this SCC is done.
4051 +          contNextScc = true;
4052 +        }
4053 +      } else {
4054 +        // The next block is still in the current SCC; keep going.
4055 +        contNextScc = false;
4056 +      }
4057 +
4058 +      if (contNextScc) {
4059 +        sccBeginBlk = NULL;
4060 +      }
4061 +    } // while: one full pass over the ordered blocks
4062 +
4063 +    BlockT *entryBlk = FuncGTraits::nodes_begin(&func);
4064 +    if (entryBlk->succ_size() == 0) {
4065 +      finish = true;
4066 +      if (DEBUGME) {
4067 +        errs() << "Reduce to one block\n";
4068 +      }
4069 +    } else {
4070 +      int newnumRemainedBlk
4071 +        = countActiveBlock(orderedBlks.begin(), orderedBlks.end());
4072 +      // TODO: Should cloned blocks be taken into account here?
4073 +      if (newnumRemainedBlk == 1 || newnumRemainedBlk < numRemainedBlk) {
4074 +        makeProgress = true;
4075 +        numRemainedBlk = newnumRemainedBlk;
4076 +      } else {
4077 +        makeProgress = false;
4078 +        if (DEBUGME) {
4079 +          errs() << "No progress\n";
4080 +        }
4081 +      }
4082 +    }
4083 +  } while (!finish && makeProgress);
4084 +
4085 +  // Misc wrap up to maintain the consistency of the Function representation.
4086 +  CFGTraits::wrapup(FuncGTraits::nodes_begin(&func));
4087 +
4088 +  // Erase retired blocks from the function and free every blockInfoMap value.
4089 +  for (typename BlockInfoMap::iterator iterMap = blockInfoMap.begin(),
4090 +       iterEndMap = blockInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
4091 +    if ((*iterMap).second && (*iterMap).second->isRetired) {
4092 +      assert(((*iterMap).first)->getNumber() != -1);
4093 +      if (DEBUGME) {
4094 +        errs() << "Erase BB" << ((*iterMap).first)->getNumber() << "\n";
4095 +      }
4096 +      (*iterMap).first->eraseFromParent();  // Remove from the parent function.
4097 +    }
4098 +    delete (*iterMap).second;
4099 +  }
4100 +  blockInfoMap.clear();
4101 +
4102 +  // Free the values held in loopLandInfoMap, then clear it.
4103 +  for (typename LoopLandInfoMap::iterator iterMap = loopLandInfoMap.begin(),
4104 +       iterEndMap = loopLandInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
4105 +    delete (*iterMap).second;
4106 +  }
4107 +  loopLandInfoMap.clear();
4108 +
4109 +  if (DEBUGME) {
4110 +    func.viewCFG();
4111 +  }
4112 +
4113 +  if (!finish) { // the entry block still has successors
4114 +    assert(!"IRREDUCIBL_CF");
4115 +  }
4116 +
4117 +  return true;
4118 +} //CFGStructurizer::run
4119 +
4120 +/// Prints each ordered block as "BB<number>(<scc>,<size>)"; a newline follows
4121 +/// every block whose index is a nonzero multiple of 10, a space otherwise.
4122 +template<class PassT>
4123 +void CFGStructurizer<PassT>::printOrderedBlocks(llvm::raw_ostream &os) {
4124 +  size_t i = 0;
4125 +  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
4126 +      iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end();
4127 +       iterBlk != iterBlkEnd;
4128 +       ++iterBlk, ++i) {
4129 +    os << "BB" << (*iterBlk)->getNumber();
4130 +    os << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
4131 +    if (i != 0 && i % 10 == 0) {
4132 +      os << "\n";
4133 +    } else {
4134 +      os << " ";
4135 +    }
4136 +  }
4137 +} //printOrderedBlocks
4138 +
4139 +/// Computes the block order (reversed DFS post order, per the original note)
4140 +/// by walking scc_iterator; records each block's SCC number in the same pass.
4141 +template<class PassT> void CFGStructurizer<PassT>::orderBlocks() {
4142 +  int sccNum = 0;
4143 +  BlockT *bb;
4144 +  for (scc_iterator<FuncT *> sccIter = scc_begin(funcRep),
4145 +       sccEnd = scc_end(funcRep); sccIter != sccEnd; ++sccIter, ++sccNum) {
4146 +    std::vector<BlockT *> &sccNext = *sccIter;
4147 +    for (typename std::vector<BlockT *>::const_iterator
4148 +         blockIter = sccNext.begin(), blockEnd = sccNext.end();
4149 +         blockIter != blockEnd; ++blockIter) {
4150 +      bb = *blockIter;
4151 +      orderedBlks.push_back(bb);
4152 +      recordSccnum(bb, sccNum);
4153 +    }
4154 +  }
4155 +
4156 +  // Walk every block in the function and report those with no SCC number.
4157 +  for (BlockIterator blockIter1 = FuncGTraits::nodes_begin(funcRep),
4158 +       blockEnd1 = FuncGTraits::nodes_end(funcRep);
4159 +       blockIter1 != blockEnd1; ++blockIter1) {
4160 +    BlockT *bb = &(*blockIter1);
4161 +    sccNum = getSCCNum(bb);
4162 +    if (sccNum == INVALIDSCCNUM) {
4163 +      errs() << "unreachable block BB" << bb->getNumber() << "\n";
4164 +    }
4165 +  }
4166 +} //orderBlocks
4167 +/// Applies patternMatchGroup to curBlk until it matches nothing; returns sum.
4168 +template<class PassT> int CFGStructurizer<PassT>::patternMatch(BlockT *curBlk) {
4169 +  int numMatch = 0;
4170 +  int curMatch;
4171 +
4172 +  if (DEBUGME) {
4173 +        errs() << "Begin patternMatch BB" << curBlk->getNumber() << "\n";
4174 +  }
4175 +
4176 +  while ((curMatch = patternMatchGroup(curBlk)) > 0) {
4177 +    numMatch += curMatch;
4178 +  }
4179 +
4180 +  if (DEBUGME) {
4181 +        errs() << "End patternMatch BB" << curBlk->getNumber()
4182 +      << ", numMatch = " << numMatch << "\n";
4183 +  }
4184 +
4185 +  return numMatch;
4186 +} //patternMatch
4187 +/// Runs the serial, if, loop-end and loop matchers once each; sums results.
4188 +template<class PassT>
4189 +int CFGStructurizer<PassT>::patternMatchGroup(BlockT *curBlk) {
4190 +  int numMatch = 0;
4191 +  numMatch += serialPatternMatch(curBlk);
4192 +  numMatch += ifPatternMatch(curBlk);
4193 +  numMatch += loopendPatternMatch(curBlk);
4194 +  numMatch += loopPatternMatch(curBlk);
4195 +  return numMatch;
4196 +}//patternMatchGroup
4197 +/// Serial pattern: curBlk has one successor with one predecessor; 1 if merged.
4198 +template<class PassT>
4199 +int CFGStructurizer<PassT>::serialPatternMatch(BlockT *curBlk) {
4200 +  if (curBlk->succ_size() != 1) {
4201 +    return 0;
4202 +  }
4203 +
4204 +  BlockT *childBlk = *curBlk->succ_begin();
4205 +  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) {
4206 +    return 0; // child has other predecessors or is an active loop head
4207 +  }
4208 +
4209 +  mergeSerialBlock(curBlk, childBlk);
4210 +  ++numSerialPatternMatch;
4211 +  return 1;
4212 +} //serialPatternMatch
4213 +/// If pattern headed by curBlk; returns 1 + blocks cloned, or 0 if no match.
4214 +template<class PassT>
4215 +int CFGStructurizer<PassT>::ifPatternMatch(BlockT *curBlk) {
4216 +  // Needs exactly two successors.
4217 +  if (curBlk->succ_size() != 2) {
4218 +    return 0;
4219 +  }
4220 +
4221 +  if (hasBackEdge(curBlk)) {
4222 +    return 0;
4223 +  }
4224 +
4225 +  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(curBlk);
4226 +  if (branchInstr == NULL) {
4227 +    return 0;
4228 +  }
4229 +
4230 +  assert(CFGTraits::isCondBranch(branchInstr));
4231 +
4232 +  BlockT *trueBlk = CFGTraits::getTrueBranch(branchInstr);
4233 +  BlockT *falseBlk = CFGTraits::getFalseBranch(curBlk, branchInstr);
4234 +  BlockT *landBlk;
4235 +  int cloned = 0;
4236 +
4237 +  // TODO: Simplify
4238 +  if (trueBlk->succ_size() == 1 && falseBlk->succ_size() == 1
4239 +    && *trueBlk->succ_begin() == *falseBlk->succ_begin()) {
4240 +    landBlk = *trueBlk->succ_begin(); // both arms share one successor
4241 +  } else if (trueBlk->succ_size() == 0 && falseBlk->succ_size() == 0) {
4242 +    landBlk = NULL; // neither arm has a successor
4243 +  } else if (trueBlk->succ_size() == 1 && *trueBlk->succ_begin() == falseBlk) {
4244 +    landBlk = falseBlk; // true arm flows straight into falseBlk
4245 +    falseBlk = NULL;
4246 +  } else if (falseBlk->succ_size() == 1
4247 +             && *falseBlk->succ_begin() == trueBlk) {
4248 +    landBlk = trueBlk; // false arm flows straight into trueBlk
4249 +    trueBlk = NULL;
4250 +  } else if (falseBlk->succ_size() == 1
4251 +             && isSameloopDetachedContbreak(trueBlk, falseBlk)) {
4252 +    landBlk = *falseBlk->succ_begin();
4253 +  } else if (trueBlk->succ_size() == 1
4254 +    && isSameloopDetachedContbreak(falseBlk, trueBlk)) {
4255 +    landBlk = *trueBlk->succ_begin();
4256 +  } else {
4257 +    return handleJumpintoIf(curBlk, trueBlk, falseBlk);
4258 +  }
4259 +
4260 +  // improveSimpleJumpintoIf can handle landBlk == NULL, but the new block it
4261 +  // would create for that case may make the later reduction harder, so it is
4262 +  // only called when a landing block already exists.
4263 +  if (landBlk != NULL &&
4264 +      ((trueBlk && trueBlk->pred_size() > 1)
4265 +      || (falseBlk && falseBlk->pred_size() > 1))) {
4266 +     cloned += improveSimpleJumpintoIf(curBlk, trueBlk, falseBlk, &landBlk);
4267 +  }
4268 +
4269 +  if (trueBlk && trueBlk->pred_size() > 1) {
4270 +    trueBlk = cloneBlockForPredecessor(trueBlk, curBlk);
4271 +    ++cloned;
4272 +  }
4273 +
4274 +  if (falseBlk && falseBlk->pred_size() > 1) {
4275 +    falseBlk = cloneBlockForPredecessor(falseBlk, curBlk);
4276 +    ++cloned;
4277 +  }
4278 +
4279 +  mergeIfthenelseBlock(branchInstr, curBlk, trueBlk, falseBlk, landBlk);
4280 +
4281 +  ++numIfPatternMatch;
4282 +
4283 +  numClonedBlock += cloned;
4284 +
4285 +  return 1 + cloned;
4286 +} //ifPatternMatch
4287 +/// No switch matching is implemented; always returns 0.
4288 +template<class PassT>
4289 +int CFGStructurizer<PassT>::switchPatternMatch(BlockT *curBlk) {
4290 +  return 0;
4291 +} //switchPatternMatch
4292 +/// Matches breaks and continues for the loops enclosing curBlk; returns count.
4293 +template<class PassT>
4294 +int CFGStructurizer<PassT>::loopendPatternMatch(BlockT *curBlk) {
4295 +  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
4296 +  typename std::vector<LoopT *> nestedLoops;
4297 +  while (loopRep) {
4298 +    nestedLoops.push_back(loopRep);
4299 +    loopRep = loopRep->getParentLoop();
4300 +  }
4301 +
4302 +  if (nestedLoops.size() == 0) {
4303 +    return 0;
4304 +  }
4305 +
4306 +  // Process nested loops from outermost to innermost so that a "continue" to
4307 +  // an outer loop is not mistaken for a "break" of the current loop.
4308 +  int num = 0;
4309 +  for (typename std::vector<LoopT *>::reverse_iterator
4310 +       iter = nestedLoops.rbegin(), iterEnd = nestedLoops.rend();
4311 +       iter != iterEnd; ++iter) {
4312 +    loopRep = *iter;
4313 +
4314 +    if (getLoopLandBlock(loopRep) != NULL) { // land block already recorded
4315 +      continue;
4316 +    }
4317 +
4318 +    BlockT *loopHeader = loopRep->getHeader();
4319 +
4320 +    int numBreak = loopbreakPatternMatch(loopRep, loopHeader);
4321 +
4322 +    if (numBreak == -1) { // break matching failed; skip the inner loops too
4323 +      break;
4324 +    }
4325 +
4326 +    int numCont = loopcontPatternMatch(loopRep, loopHeader);
4327 +    num += numBreak + numCont;
4328 +  }
4329 +
4330 +  return num;
4331 +} //loopendPatternMatch
4332 +/// Merges unretired loop land blocks of loops headed by curBlk; returns count.
4333 +template<class PassT>
4334 +int CFGStructurizer<PassT>::loopPatternMatch(BlockT *curBlk) {
4335 +  if (curBlk->succ_size() != 0) {
4336 +    return 0;
4337 +  }
4338 +
4339 +  int numLoop = 0;
4340 +  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
4341 +  while (loopRep && loopRep->getHeader() == curBlk) {
4342 +    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
4343 +    if (loopLand) {
4344 +      BlockT *landBlk = loopLand->landBlk;
4345 +      assert(landBlk);
4346 +      if (!isRetiredBlock(landBlk)) {
4347 +        mergeLooplandBlock(curBlk, loopLand);
4348 +        ++numLoop;
4349 +      }
4350 +    }
4351 +    loopRep = loopRep->getParentLoop(); // parent loops may share this header
4352 +  }
4353 +
4354 +  numLoopPatternMatch += numLoop;
4355 +
4356 +  return numLoop;
4357 +} //loopPatternMatch
4358 +/// Picks a land block for loopRep's exits and folds its breaks; -1 on failure.
4359 +template<class PassT>
4360 +int CFGStructurizer<PassT>::loopbreakPatternMatch(LoopT *loopRep,
4361 +                                                  BlockT *loopHeader) {
4362 +  BlockTSmallerVector exitingBlks;
4363 +  loopRep->getExitingBlocks(exitingBlks);
4364 +
4365 +  if (DEBUGME) {
4366 +    errs() << "Loop has " << exitingBlks.size() << " exiting blocks\n";
4367 +  }
4368 +
4369 +  if (exitingBlks.size() == 0) {
4370 +    setLoopLandBlock(loopRep);
4371 +    return 0;
4372 +  }
4373 +
4374 +  // Map every exiting block to its exit block; also keep the distinct exits.
4375 +  BlockTSmallerVector exitBlks;
4376 +  std::set<BlockT *> exitBlkSet;
4377 +  for (typename BlockTSmallerVector::const_iterator iter = exitingBlks.begin(),
4378 +       iterEnd = exitingBlks.end(); iter != iterEnd; ++iter) {
4379 +    BlockT *exitingBlk = *iter;
4380 +    BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
4381 +    exitBlks.push_back(exitBlk);
4382 +    exitBlkSet.insert(exitBlk);  // std::set drops duplicates
4383 +  }
4384 +
4385 +  assert(exitBlkSet.size() > 0);
4386 +  assert(exitBlks.size() == exitingBlks.size());
4387 +
4388 +  if (DEBUGME) {
4389 +    errs() << "Loop has " << exitBlkSet.size() << " exit blocks\n";
4390 +  }
4391 +
4392 +  // Find exitLandBlk.
4393 +  BlockT *exitLandBlk = NULL;
4394 +  int numCloned = 0;
4395 +  int numSerial = 0;
4396 +
4397 +  if (exitBlkSet.size() == 1) {
4398 +    exitLandBlk = *exitBlkSet.begin();
4399 +  } else {
4400 +    exitLandBlk = findNearestCommonPostDom(exitBlkSet);
4401 +
4402 +    if (exitLandBlk == NULL) {
4403 +      return -1;
4404 +    }
4405 +
4406 +    bool allInPath = true;
4407 +    bool allNotInPath = true;
4408 +    for (typename std::set<BlockT*>::const_iterator
4409 +         iter = exitBlkSet.begin(),
4410 +         iterEnd = exitBlkSet.end();
4411 +         iter != iterEnd; ++iter) {
4412 +      BlockT *exitBlk = *iter;
4413 +
4414 +      PathToKind pathKind = singlePathTo(exitBlk, exitLandBlk, true);
4415 +      if (DEBUGME) {
4416 +        errs() << "BB" << exitBlk->getNumber()
4417 +               << " to BB" << exitLandBlk->getNumber() << " PathToKind="
4418 +               << pathKind << "\n";
4419 +      }
4420 +
4421 +      allInPath = allInPath && (pathKind == SinglePath_InPath);
4422 +      allNotInPath = allNotInPath && (pathKind == SinglePath_NotInPath);
4423 +
4424 +      if (!allInPath && !allNotInPath) { // exits disagree on the path kind
4425 +        if (DEBUGME) {
4426 +              errs() << "singlePath check fail\n";
4427 +        }
4428 +        return -1;
4429 +      }
4430 +    } // check all exit blocks
4431 +
4432 +    if (allNotInPath) {
4433 +
4434 +      // TODO: Simplify, maybe separate function?
4435 +      LoopT *parentLoopRep = loopRep->getParentLoop();
4436 +      BlockT *parentLoopHeader = NULL;
4437 +      if (parentLoopRep)
4438 +        parentLoopHeader = parentLoopRep->getHeader();
4439 +
4440 +      if (exitLandBlk == parentLoopHeader &&
4441 +          (exitLandBlk = relocateLoopcontBlock(parentLoopRep,
4442 +                                               loopRep,
4443 +                                               exitBlkSet,
4444 +                                               exitLandBlk)) != NULL) {
4445 +        if (DEBUGME) {
4446 +          errs() << "relocateLoopcontBlock success\n";
4447 +        }
4448 +      } else if ((exitLandBlk = addLoopEndbranchBlock(loopRep,
4449 +                                                      exitingBlks,
4450 +                                                      exitBlks)) != NULL) {
4451 +        if (DEBUGME) {
4452 +          errs() << "insertEndbranchBlock success\n";
4453 +        }
4454 +      } else {
4455 +        if (DEBUGME) {
4456 +          errs() << "loop exit fail\n";
4457 +        }
4458 +        return -1;
4459 +      }
4460 +    }
4461 +
4462 +    // Handle side entries into the exit paths; exitBlks/exitBlkSet are rebuilt.
4463 +    exitBlks.clear();
4464 +    exitBlkSet.clear();
4465 +    for (typename BlockTSmallerVector::iterator iterExiting =
4466 +           exitingBlks.begin(),
4467 +         iterExitingEnd = exitingBlks.end();
4468 +         iterExiting != iterExitingEnd; ++iterExiting) {
4469 +      BlockT *exitingBlk = *iterExiting;
4470 +      BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
4471 +      BlockT *newExitBlk = exitBlk;
4472 +
4473 +      if (exitBlk != exitLandBlk && exitBlk->pred_size() > 1) {
4474 +        newExitBlk = cloneBlockForPredecessor(exitBlk, exitingBlk);
4475 +        ++numCloned;
4476 +      }
4477 +
4478 +      numCloned += cloneOnSideEntryTo(exitingBlk, newExitBlk, exitLandBlk);
4479 +
4480 +      exitBlks.push_back(newExitBlk);
4481 +      exitBlkSet.insert(newExitBlk);
4482 +    }
4483 +
4484 +    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
4485 +         iterExitEnd = exitBlks.end();
4486 +         iterExit != iterExitEnd; ++iterExit) {
4487 +      BlockT *exitBlk = *iterExit;
4488 +      numSerial += serialPatternMatch(exitBlk);
4489 +    }
4490 +
4491 +    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
4492 +         iterExitEnd = exitBlks.end();
4493 +         iterExit != iterExitEnd; ++iterExit) {
4494 +      BlockT *exitBlk = *iterExit;
4495 +      if (exitBlk->pred_size() > 1) {
4496 +        if (exitBlk != exitLandBlk) {
4497 +          return -1;
4498 +        }
4499 +      } else {
4500 +        if (exitBlk != exitLandBlk &&
4501 +            (exitBlk->succ_size() != 1 ||
4502 +            *exitBlk->succ_begin() != exitLandBlk)) {
4503 +          return -1;
4504 +        }
4505 +      }
4506 +    }
4507 +  } // else: more than one distinct exit block
4508 +
4509 +  exitLandBlk = recordLoopLandBlock(loopRep, exitLandBlk, exitBlks, exitBlkSet);
4510 +
4511 +  // Fold each break into its exiting block, including cross-level breaks.
4512 +  assert(exitingBlks.size() == exitBlks.size());
4513 +  for (typename BlockTSmallerVector::const_iterator iterExit = exitBlks.begin(),
4514 +       iterExiting = exitingBlks.begin(), iterExitEnd = exitBlks.end();
4515 +       iterExit != iterExitEnd; ++iterExit, ++iterExiting) {
4516 +    BlockT *exitBlk = *iterExit;
4517 +    BlockT *exitingBlk = *iterExiting;
4518 +    assert(exitBlk->pred_size() == 1 || exitBlk == exitLandBlk);
4519 +    LoopT *exitingLoop = loopInfo->getLoopFor(exitingBlk);
4520 +    handleLoopbreak(exitingBlk, exitingLoop, exitBlk, loopRep, exitLandBlk);
4521 +  }
4522 +
4523 +  int numBreak = static_cast<int>(exitingBlks.size());
4524 +  numLoopbreakPatternMatch += numBreak;
4525 +  numClonedBlock += numCloned;
4526 +  return numBreak + numSerial + numCloned;
4527 +} //loopbreakPatternMatch
4528 +/// Handles each in-loop predecessor of loopHeader as a continue; returns count.
4529 +template<class PassT>
4530 +int CFGStructurizer<PassT>::loopcontPatternMatch(LoopT *loopRep,
4531 +                                                 BlockT *loopHeader) {
4532 +  int numCont = 0;
4533 +  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> contBlk;
4534 +  for (typename InvBlockGTraits::ChildIteratorType iter =
4535 +       InvBlockGTraits::child_begin(loopHeader),
4536 +       iterEnd = InvBlockGTraits::child_end(loopHeader);
4537 +       iter != iterEnd; ++iter) {
4538 +    BlockT *curBlk = *iter;
4539 +    if (loopRep->contains(curBlk)) {
4540 +      handleLoopcontBlock(curBlk, loopInfo->getLoopFor(curBlk),
4541 +                          loopHeader, loopRep);
4542 +      contBlk.push_back(curBlk);
4543 +      ++numCont;
4544 +    }
4545 +  }
4546 +  // The edges to loopHeader are removed in a second pass, after the walk above.
4547 +  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator
4548 +       iter = contBlk.begin(), iterEnd = contBlk.end();
4549 +       iter != iterEnd; ++iter) {
4550 +    (*iter)->removeSuccessor(loopHeader);
4551 +  }
4552 +
4553 +  numLoopcontPatternMatch += numCont;
4554 +
4555 +  return numCont;
4556 +} //loopcontPatternMatch
4557 +
4558 +/// True iff successor-less src1Blk shares a loop with land info with src2Blk.
4559 +template<class PassT>
4560 +bool CFGStructurizer<PassT>::isSameloopDetachedContbreak(BlockT *src1Blk,
4561 +                                                         BlockT *src2Blk) {
4562 +  // Returns true iff src1Blk has no successors, src1Blk and src2Blk belong to
4563 +  // the same loop, and that loop already has a LoopLandInfo entry. This
4564 +  // recovers the information without tracking loopContBlks/loopBreakBlks.
4565 +  //
4566 +  if (src1Blk->succ_size() == 0) {
4567 +    LoopT *loopRep = loopInfo->getLoopFor(src1Blk);
4568 +    if (loopRep != NULL && loopRep == loopInfo->getLoopFor(src2Blk)) {
4569 +      LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
4570 +      if (theEntry != NULL) {
4571 +        if (DEBUGME) {
4572 +          errs() << "isLoopContBreakBlock yes src1 = BB"
4573 +                 << src1Blk->getNumber()
4574 +                 << " src2 = BB" << src2Blk->getNumber() << "\n";
4575 +        }
4576 +        return true;
4577 +      }
4578 +    }
4579 +  }
4580 +  return false;
4581 +}  //isSameloopDetachedContbreak
4582 +/// Runs handleJumpintoIfImp; if it returns 0, retries with the arms swapped.
4583 +template<class PassT>
4584 +int CFGStructurizer<PassT>::handleJumpintoIf(BlockT *headBlk,
4585 +                                             BlockT *trueBlk,
4586 +                                             BlockT *falseBlk) {
4587 +  int num = handleJumpintoIfImp(headBlk, trueBlk, falseBlk);
4588 +  if (num == 0) {
4589 +    if (DEBUGME) {
4590 +      errs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
4591 +    }
4592 +    num = handleJumpintoIfImp(headBlk, falseBlk, trueBlk);
4593 +  }
4594 +  return num;
4595 +}
4596 +/// Walks down from trueBlk for a block falseBlk reaches by a single path.
4597 +template<class PassT>
4598 +int CFGStructurizer<PassT>::handleJumpintoIfImp(BlockT *headBlk,
4599 +                                                BlockT *trueBlk,
4600 +                                                BlockT *falseBlk) {
4601 +  int num = 0;
4602 +  BlockT *downBlk;
4603 +
4604 +  // trueBlk itself could be the common post dominator, so start there.
4605 +  downBlk = trueBlk;
4606 +
4607 +  if (DEBUGME) {
4608 +    errs() << "handleJumpintoIfImp head = BB" << headBlk->getNumber()
4609 +           << " true = BB" << trueBlk->getNumber()
4610 +           << ", numSucc=" << trueBlk->succ_size()
4611 +           << " false = BB" << falseBlk->getNumber() << "\n";
4612 +  }
4613 +
4614 +  while (downBlk) {
4615 +    if (DEBUGME) {
4616 +      errs() << "check down = BB" << downBlk->getNumber();
4617 +    }
4618 +
4619 +    if (singlePathTo(falseBlk, downBlk) == SinglePath_InPath) {
4620 +      if (DEBUGME) {
4621 +        errs() << " working\n";
4622 +      }
4623 +
4624 +      num += cloneOnSideEntryTo(headBlk, trueBlk, downBlk);
4625 +      num += cloneOnSideEntryTo(headBlk, falseBlk, downBlk);
4626 +
4627 +      numClonedBlock += num; // num holds only the clone counts at this point
4628 +      num += serialPatternMatch(*headBlk->succ_begin());
4629 +      num += serialPatternMatch(*(++headBlk->succ_begin()));
4630 +      num += ifPatternMatch(headBlk);
4631 +      assert(num > 0);
4632 +
4633 +      break;
4634 +    }
4635 +    if (DEBUGME) {
4636 +      errs() << " not working\n";
4637 +    }
4638 +    downBlk = (downBlk->succ_size() == 1) ? (*downBlk->succ_begin()) : NULL;
4639 +  } // walk down the chain of single successors
4640 +
4641 +  return num;
4642 +} //handleJumpintoIfImp
4643 +/// Debug dump of the given blocks to errs(); detail also print()s each block.
4644 +template<class PassT>
4645 +void CFGStructurizer<PassT>::showImproveSimpleJumpintoIf(BlockT *headBlk,
4646 +                                                         BlockT *trueBlk,
4647 +                                                         BlockT *falseBlk,
4648 +                                                         BlockT *landBlk,
4649 +                                                         bool detail) {
4650 +  errs() << "head = BB" << headBlk->getNumber()
4651 +         << " size = " << headBlk->size();
4652 +  if (detail) {
4653 +    errs() << "\n";
4654 +    headBlk->print(errs());
4655 +    errs() << "\n";
4656 +  }
4657 +
4658 +  if (trueBlk) { // trueBlk, falseBlk and landBlk may each be NULL
4659 +    errs() << ", true = BB" << trueBlk->getNumber() << " size = "
4660 +           << trueBlk->size() << " numPred = " << trueBlk->pred_size();
4661 +    if (detail) {
4662 +      errs() << "\n";
4663 +      trueBlk->print(errs());
4664 +      errs() << "\n";
4665 +    }
4666 +  }
4667 +  if (falseBlk) {
4668 +    errs() << ", false = BB" << falseBlk->getNumber() << " size = "
4669 +           << falseBlk->size() << " numPred = " << falseBlk->pred_size();
4670 +    if (detail) {
4671 +      errs() << "\n";
4672 +      falseBlk->print(errs());
4673 +      errs() << "\n";
4674 +    }
4675 +  }
4676 +  if (landBlk) {
4677 +    errs() << ", land = BB" << landBlk->getNumber() << " size = "
4678 +           << landBlk->size() << " numPred = " << landBlk->pred_size();
4679 +    if (detail) {
4680 +      errs() << "\n";
4681 +      landBlk->print(errs());
4682 +      errs() << "\n";
4683 +    }
4684 +  }
4685 +
4686 +    errs() << "\n";
4687 +} //showImproveSimpleJumpintoIf
4688 +/// Moves if-arm code into *plandBlk guarded by initReg; returns blocks created.
4689 +template<class PassT>
4690 +int CFGStructurizer<PassT>::improveSimpleJumpintoIf(BlockT *headBlk,
4691 +                                                    BlockT *trueBlk,
4692 +                                                    BlockT *falseBlk,
4693 +                                                    BlockT **plandBlk) {
4694 +  bool migrateTrue = false;
4695 +  bool migrateFalse = false;
4696 +
4697 +  BlockT *landBlk = *plandBlk;
4698 +
4699 +  assert((trueBlk == NULL || trueBlk->succ_size() <= 1)
4700 +         && (falseBlk == NULL || falseBlk->succ_size() <= 1));
4701 +
4702 +  if (trueBlk == falseBlk) {
4703 +    return 0;
4704 +  }
4705 +
4706 +  migrateTrue = needMigrateBlock(trueBlk);
4707 +  migrateFalse = needMigrateBlock(falseBlk);
4708 +
4709 +  if (!migrateTrue && !migrateFalse) {
4710 +    return 0;
4711 +  }
4712 +
4713 +  // If either trueBlk or falseBlk needs migrating, also migrate the other one
4714 +  // when it has more than one predecessor. Otherwise its predecessors other
4715 +  // than headBlk would leave initReg with an undefined value.
4716 +  if (!migrateTrue && trueBlk && trueBlk->pred_size() > 1) {
4717 +    migrateTrue = true;
4718 +  }
4719 +  if (!migrateFalse && falseBlk && falseBlk->pred_size() > 1) {
4720 +    migrateFalse = true;
4721 +  }
4722 +
4723 +  if (DEBUGME) {
4724 +    errs() << "before improveSimpleJumpintoIf: ";
4725 +    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
4726 +  }
4727 +
4728 +  // Original: headBlk => if () {trueBlk} else {falseBlk} => landBlk
4729 +  //
4730 +  // New: headBlk => if () {initReg = 1; org trueBlk branch} else
4731 +  //      {initReg = 0; org falseBlk branch }
4732 +  //      => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
4733 +  //      => org landBlk
4734 +  //      If landBlk->pred_size() > 2, put the above if-else inside
4735 +  //      if (initReg != 2) {...}
4736 +  //
4737 +  // Add initReg = initVal to headBlk (only when just one arm is migrated).
4738 +
4739 +  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
4740 +  unsigned initReg =
4741 +    funcRep->getRegInfo().createVirtualRegister(I32RC);
4742 +  if (!migrateTrue || !migrateFalse) {
4743 +    int initVal = migrateTrue ? 0 : 1;
4744 +    CFGTraits::insertAssignInstrBefore(headBlk, passRep, initReg, initVal);
4745 +  }
4746 +
4747 +  int numNewBlk = 0;
4748 +
4749 +  if (landBlk == NULL) {
4750 +    landBlk = funcRep->CreateMachineBasicBlock();
4751 +    funcRep->push_back(landBlk);  // append the new block to the function
4752 +
4753 +    if (trueBlk) {
4754 +      trueBlk->addSuccessor(landBlk);
4755 +    } else {
4756 +      headBlk->addSuccessor(landBlk);
4757 +    }
4758 +
4759 +    if (falseBlk) {
4760 +      falseBlk->addSuccessor(landBlk);
4761 +    } else {
4762 +      headBlk->addSuccessor(landBlk);
4763 +    }
4764 +
4765 +    numNewBlk ++;
4766 +  }
4767 +
4768 +  bool landBlkHasOtherPred = (landBlk->pred_size() > 2);
4769 +
4770 +  // Insert AMDGPU::ENDIF first to avoid special-casing "input landBlk == NULL".
4771 +  typename BlockT::iterator insertPos =
4772 +    CFGTraits::getInstrPos
4773 +    (landBlk, CFGTraits::insertInstrBefore(landBlk, AMDGPU::ENDIF, passRep));
4774 +
4775 +  if (landBlkHasOtherPred) {
4776 +    unsigned immReg =
4777 +      funcRep->getRegInfo().createVirtualRegister(I32RC);
4778 +    CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 2);
4779 +    unsigned cmpResReg =
4780 +      funcRep->getRegInfo().createVirtualRegister(I32RC);
4781 +
4782 +    CFGTraits::insertCompareInstrBefore(landBlk, insertPos, passRep, cmpResReg,
4783 +                                        initReg, immReg);
4784 +    CFGTraits::insertCondBranchBefore(landBlk, insertPos,
4785 +                                      AMDGPU::IF_PREDICATE_SET, passRep,
4786 +                                      cmpResReg, DebugLoc());
4787 +  }
4788 +
4789 +  CFGTraits::insertCondBranchBefore(landBlk, insertPos, AMDGPU::IF_PREDICATE_SET,
4790 +                                    passRep, initReg, DebugLoc());
4791 +
4792 +  if (migrateTrue) {
4793 +    migrateInstruction(trueBlk, landBlk, insertPos);
4794 +    // The assignment must be inserted unconditionally so that a path arriving
4795 +    // from a predecessor other than headBlk still has a valid value in initReg
4796 +    // (relevant when initVal != 1).
4797 +    CFGTraits::insertAssignInstrBefore(trueBlk, passRep, initReg, 1);
4798 +  }
4799 +  CFGTraits::insertInstrBefore(insertPos, AMDGPU::ELSE, passRep);
4800 +
4801 +  if (migrateFalse) {
4802 +    migrateInstruction(falseBlk, landBlk, insertPos);
4803 +    // The assignment must be inserted unconditionally so that a path arriving
4804 +    // from a predecessor other than headBlk still has a valid value in initReg
4805 +    // (relevant when initVal != 0).
4806 +    CFGTraits::insertAssignInstrBefore(falseBlk, passRep, initReg, 0);
4807 +  }
4808 +
4809 +  if (landBlkHasOtherPred) {
4810 +    // Close the outer guard opened above with a second ENDIF.
4811 +    CFGTraits::insertInstrBefore(insertPos, AMDGPU::ENDIF, passRep);
4812 +
4813 +    // Set initReg = 2 in the other predecessors of landBlk.
4814 +    for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
4815 +         predIterEnd = landBlk->pred_end(); predIter != predIterEnd;
4816 +         ++predIter) {
4817 +      BlockT *curBlk = *predIter;
4818 +      if (curBlk != trueBlk && curBlk != falseBlk) {
4819 +        CFGTraits::insertAssignInstrBefore(curBlk, passRep, initReg, 2);
4820 +      }
4821 +    } // for each predecessor of landBlk
4822 +  }
4823 +  if (DEBUGME) {
4824 +    errs() << "result from improveSimpleJumpintoIf: ";
4825 +    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
4826 +  }
4827 +
4828 +  // Report the (possibly newly created) landing block back to the caller.
4829 +  *plandBlk = landBlk;
4830 +
4831 +  return numNewBlk;
4832 +} //improveSimpleJumpintoIf
4833 +// Handle a break out of a loop: rewrite the edge exitingBlk -> exitBlk via mergeLoopbreakBlock.
4834 +template<class PassT>
4835 +void CFGStructurizer<PassT>::handleLoopbreak(BlockT *exitingBlk,
4836 +                                              LoopT *exitingLoop,
4837 +                                             BlockT *exitBlk,
4838 +                                              LoopT *exitLoop,
4839 +                                             BlockT *landBlk) {
4840 +  if (DEBUGME) {
4841 +    errs() << "Trying to break loop-depth = " << getLoopDepth(exitLoop)
4842 +           << " from loop-depth = " << getLoopDepth(exitingLoop) << "\n";
4843 +  }
4844 +  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
4845 +  // initReg stays INVALIDREGNUM unless exitingLoop and exitLoop differ.
4846 +  RegiT initReg = INVALIDREGNUM;
4847 +  if (exitingLoop != exitLoop) {
4848 +    initReg = static_cast<int>
4849 +      (funcRep->getRegInfo().createVirtualRegister(I32RC));
4850 +    assert(initReg != INVALIDREGNUM);
4851 +    addLoopBreakInitReg(exitLoop, initReg);
4852 +    while (exitingLoop != exitLoop && exitingLoop) {  // walk up the parent chain
4853 +      addLoopBreakOnReg(exitingLoop, initReg);
4854 +      exitingLoop = exitingLoop->getParentLoop();
4855 +    }
4856 +    assert(exitingLoop == exitLoop);  // the walk must end exactly at exitLoop
4857 +  }
4858 +
4859 +  mergeLoopbreakBlock(exitingBlk, exitBlk, landBlk, initReg);
4860 +
4861 +} //handleLoopbreak
4862 +// Handle a continue edge contingBlk -> contBlk; the rewrite itself is done by settleLoopcontBlock.
4863 +template<class PassT>
4864 +void CFGStructurizer<PassT>::handleLoopcontBlock(BlockT *contingBlk,
4865 +                                                  LoopT *contingLoop,
4866 +                                                 BlockT *contBlk,
4867 +                                                  LoopT *contLoop) {
4868 +  if (DEBUGME) {
4869 +    errs() << "loopcontPattern cont = BB" << contingBlk->getNumber()
4870 +           << " header = BB" << contBlk->getNumber() << "\n";
4871 +
4872 +    errs() << "Trying to continue loop-depth = "
4873 +           << getLoopDepth(contLoop)
4874 +           << " from loop-depth = " << getLoopDepth(contingLoop) << "\n";
4875 +  }
4876 +  // initReg stays INVALIDREGNUM unless contingLoop and contLoop differ.
4877 +  RegiT initReg = INVALIDREGNUM;
4878 +  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
4879 +  if (contingLoop != contLoop) {
4880 +    initReg = static_cast<int>
4881 +      (funcRep->getRegInfo().createVirtualRegister(I32RC));
4882 +    assert(initReg != INVALIDREGNUM);
4883 +    addLoopContInitReg(contLoop, initReg);
4884 +    while (contingLoop && contingLoop->getParentLoop() != contLoop) {
4885 +      addLoopBreakOnReg(contingLoop, initReg);  //not addLoopContOnReg
4886 +      contingLoop = contingLoop->getParentLoop();
4887 +    }
4888 +    assert(contingLoop && contingLoop->getParentLoop() == contLoop);
4889 +    addLoopContOnReg(contingLoop, initReg);  // the loop directly inside contLoop
4890 +  }
4891 +
4892 +  settleLoopcontBlock(contingBlk, contBlk, initReg);
4893 +} //handleLoopcontBlock
4894 +// Merge srcBlk into dstBlk: append srcBlk's instructions to dstBlk, then retire srcBlk.
4895 +template<class PassT>
4896 +void CFGStructurizer<PassT>::mergeSerialBlock(BlockT *dstBlk, BlockT *srcBlk) {
4897 +  if (DEBUGME) {
4898 +    errs() << "serialPattern BB" << dstBlk->getNumber()
4899 +           << " <= BB" << srcBlk->getNumber() << "\n";
4900 +  }
4901 +  dstBlk->splice(dstBlk->end(), srcBlk, srcBlk->begin(), srcBlk->end());
4902 +  // Drop the dstBlk -> srcBlk edge before touching srcBlk's own successor list.
4903 +  dstBlk->removeSuccessor(srcBlk);
4904 +  CFGTraits::cloneSuccessorList(dstBlk, srcBlk);  // presumably copies srcBlk's successors to dstBlk - confirm
4905 +
4906 +  removeSuccessor(srcBlk);
4907 +  retireBlock(dstBlk, srcBlk);
4908 +} //mergeSerialBlock
4909 +// Fold branchInstr plus trueBlk/falseBlk (either may be NULL) into curBlk as if/else/endif.
4910 +template<class PassT>
4911 +void CFGStructurizer<PassT>::mergeIfthenelseBlock(InstrT *branchInstr,
4912 +                                                  BlockT *curBlk,
4913 +                                                  BlockT *trueBlk,
4914 +                                                  BlockT *falseBlk,
4915 +                                                  BlockT *landBlk) {
4916 +  if (DEBUGME) {
4917 +    errs() << "ifPattern BB" << curBlk->getNumber();
4918 +    errs() << "{  ";
4919 +    if (trueBlk) {
4920 +      errs() << "BB" << trueBlk->getNumber();
4921 +    }
4922 +    errs() << "  } else ";
4923 +    errs() << "{  ";
4924 +    if (falseBlk) {
4925 +      errs() << "BB" << falseBlk->getNumber();
4926 +    }
4927 +    errs() << "  }\n ";
4928 +    errs() << "landBlock: ";
4929 +    if (landBlk == NULL) {
4930 +      errs() << "NULL";
4931 +    } else {
4932 +      errs() << "BB" << landBlk->getNumber();
4933 +    }
4934 +    errs() << "\n";
4935 +  }
4936 +  // Read opcode and debug location now; branchInstr is erased further down.
4937 +  int oldOpcode = branchInstr->getOpcode();
4938 +  DebugLoc branchDL = branchInstr->getDebugLoc();
4939 +
4940 +//    transform to
4941 +//    if cond
4942 +//       trueBlk
4943 +//    else
4944 +//       falseBlk
4945 +//    endif
4946 +//    landBlk
4947 +
4948 +  typename BlockT::iterator branchInstrPos =
4949 +    CFGTraits::getInstrPos(curBlk, branchInstr);
4950 +  CFGTraits::insertCondBranchBefore(branchInstrPos,
4951 +                                    CFGTraits::getBranchNzeroOpcode(oldOpcode),
4952 +                                    passRep,
4953 +                                    branchDL);
4954 +  // All following pieces are placed before branchInstrPos, in the order they are emitted here.
4955 +  if (trueBlk) {
4956 +    curBlk->splice(branchInstrPos, trueBlk, trueBlk->begin(), trueBlk->end());
4957 +    curBlk->removeSuccessor(trueBlk);
4958 +    if (landBlk && trueBlk->succ_size()!=0) {
4959 +      trueBlk->removeSuccessor(landBlk);
4960 +    }
4961 +    retireBlock(curBlk, trueBlk);
4962 +  }
4963 +  CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ELSE, passRep);
4964 +
4965 +  if (falseBlk) {
4966 +    curBlk->splice(branchInstrPos, falseBlk, falseBlk->begin(),
4967 +                   falseBlk->end());
4968 +    curBlk->removeSuccessor(falseBlk);
4969 +    if (landBlk && falseBlk->succ_size() != 0) {
4970 +      falseBlk->removeSuccessor(landBlk);
4971 +    }
4972 +    retireBlock(curBlk, falseBlk);
4973 +  }
4974 +  CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ENDIF, passRep);
4975 +
4976 +  branchInstr->eraseFromParent();
4977 +  // NOTE(review): edge added only when both arms existed; presumably curBlk -> landBlk already exists otherwise - confirm
4978 +  if (landBlk && trueBlk && falseBlk) {
4979 +    curBlk->addSuccessor(landBlk);
4980 +  }
4981 +
4982 +} //mergeIfthenelseBlock
4983 +// Wrap dstBlk in WHILELOOP/ENDLOOP, emit the loop's flag-register code, then append the land block.
4984 +template<class PassT>
4985 +void CFGStructurizer<PassT>::mergeLooplandBlock(BlockT *dstBlk,
4986 +                                                LoopLandInfo *loopLand) {
4987 +  BlockT *landBlk = loopLand->landBlk;
4988 +
4989 +  if (DEBUGME) {
4990 +    errs() << "loopPattern header = BB" << dstBlk->getNumber()
4991 +           << " land = BB" << landBlk->getNumber() << "\n";
4992 +  }
4993 +
4994 +  // Loop contInitRegs are initialized at the beginning of the loop.
4995 +  for (typename std::set<RegiT>::const_iterator iter =
4996 +         loopLand->contInitRegs.begin(),
4997 +       iterEnd = loopLand->contInitRegs.end(); iter != iterEnd; ++iter) {
4998 +    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
4999 +  }
5000 +
5001 +  /* We last inserted the DebugLoc in the
5002 +   * BREAK_LOGICALZ_i32 or AMDGPU::BREAK_LOGICALNZ statement in the current dstBlk.
5003 +   * Search for the DebugLoc in that statement;
5004 +   * if not found, we have to insert the empty/default DebugLoc. */
5005 +  InstrT *loopBreakInstr = CFGTraits::getLoopBreakInstr(dstBlk);
5006 +  DebugLoc DLBreak = (loopBreakInstr) ? loopBreakInstr->getDebugLoc() : DebugLoc();
5007 +  // NOTE(review): the block-level insert*Before overloads presumably prepend at dstBlk's start - confirm
5008 +  CFGTraits::insertInstrBefore(dstBlk, AMDGPU::WHILELOOP, passRep, DLBreak);
5009 +  // Loop breakInitRegs are initialized before entering the loop.
5010 +  for (typename std::set<RegiT>::const_iterator iter =
5011 +         loopLand->breakInitRegs.begin(),
5012 +       iterEnd = loopLand->breakInitRegs.end(); iter != iterEnd; ++iter) {
5013 +    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
5014 +  }
5015 +  // Loop endbranchInitRegs are initialized before entering the loop.
5016 +  for (typename std::set<RegiT>::const_iterator iter =
5017 +         loopLand->endbranchInitRegs.begin(),
5018 +       iterEnd = loopLand->endbranchInitRegs.end(); iter != iterEnd; ++iter) {
5019 +    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
5020 +  }
5021 +
5022 +  /* We last inserted the DebugLoc in the continue statement in the current dstBlk.
5023 +   * Search for the DebugLoc in the continue statement;
5024 +   * if not found, we have to insert the empty/default DebugLoc. */
5025 +  InstrT *continueInstr = CFGTraits::getContinueInstr(dstBlk);
5026 +  DebugLoc DLContinue = (continueInstr) ? continueInstr->getDebugLoc() : DebugLoc();
5027 +
5028 +  CFGTraits::insertInstrEnd(dstBlk, AMDGPU::ENDLOOP, passRep, DLContinue);
5029 +  // Loop breakOnRegs are checked after the ENDLOOP: break the loop outside this
5030 +  // loop.
5031 +  for (typename std::set<RegiT>::const_iterator iter =
5032 +         loopLand->breakOnRegs.begin(),
5033 +       iterEnd = loopLand->breakOnRegs.end(); iter != iterEnd; ++iter) {
5034 +    CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::PREDICATED_BREAK, passRep,
5035 +                                   *iter);
5036 +  }
5037 +
5038 +  // Loop contOnRegs are checked after the ENDLOOP: continue the loop outside this
5039 +  // loop.
5040 +  for (std::set<RegiT>::const_iterator iter = loopLand->contOnRegs.begin(),
5041 +       iterEnd = loopLand->contOnRegs.end(); iter != iterEnd; ++iter) {
5042 +    CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::CONTINUE_LOGICALNZ_i32,
5043 +                                   passRep, *iter);
5044 +  }
5045 +  // Append the land block's instructions after the loop code and take over its successors.
5046 +  dstBlk->splice(dstBlk->end(), landBlk, landBlk->begin(), landBlk->end());
5047 +
5048 +  for (typename BlockT::succ_iterator iter = landBlk->succ_begin(),
5049 +       iterEnd = landBlk->succ_end(); iter != iterEnd; ++iter) {
5050 +    dstBlk->addSuccessor(*iter);  // *iter's predecessor is also taken care of.
5051 +  }
5052 +
5053 +  removeSuccessor(landBlk);
5054 +  retireBlock(dstBlk, landBlk);
5055 +} //mergeLooplandBlock
5056 +// Starting just before I, walk backwards to the first PRED_X and invert the opcode in its operand 2.
5057 +template<class PassT>
5058 +void CFGStructurizer<PassT>::reversePredicateSetter(typename BlockT::iterator I) {
5059 +  while (I--) {  // NOTE(review): no bound at the block's begin(); relies on a PRED_X preceding I - confirm
5060 +    if (I->getOpcode() == AMDGPU::PRED_X) {
5061 +      switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
5062 +      case OPCODE_IS_ZERO_INT:
5063 +        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO_INT);
5064 +        return;
5065 +      case OPCODE_IS_NOT_ZERO_INT:
5066 +        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO_INT);
5067 +        return;
5068 +      case OPCODE_IS_ZERO:
5069 +        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO);
5070 +        return;
5071 +      case OPCODE_IS_NOT_ZERO:
5072 +        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO);
5073 +        return;
5074 +      default:
5075 +        assert(0 && "PRED_X Opcode invalid!");  // with asserts off, the walk simply continues
5076 +      }
5077 +    }
5078 +  }
5079 +}
5080 +// Replace exitingBlk's conditional exit branch with break code; absorb exitBlk unless it is exitLandBlk.
5081 +template<class PassT>
5082 +void CFGStructurizer<PassT>::mergeLoopbreakBlock(BlockT *exitingBlk,
5083 +                                                 BlockT *exitBlk,
5084 +                                                 BlockT *exitLandBlk,
5085 +                                                 RegiT  setReg) {
5086 +  if (DEBUGME) {
5087 +    errs() << "loopbreakPattern exiting = BB" << exitingBlk->getNumber()
5088 +           << " exit = BB" << exitBlk->getNumber()
5089 +           << " land = BB" << exitLandBlk->getNumber() << "\n";
5090 +  }
5091 +
5092 +  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(exitingBlk);
5093 +  assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
5094 +
5095 +  DebugLoc DL = branchInstr->getDebugLoc();
5096 +
5097 +  BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
5098 +
5099 +  //    transform exitingBlk to
5100 +  //    if ( ) {
5101 +  //       exitBlk (if exitBlk != exitLandBlk)
5102 +  //       setReg = 1
5103 +  //       break
5104 +  //    }endif
5105 +  //    successor = {orgSuccessor(exitingBlk) - exitBlk}
5106 +
5107 +  typename BlockT::iterator branchInstrPos =
5108 +    CFGTraits::getInstrPos(exitingBlk, branchInstr);
5109 +
5110 +  if (exitBlk == exitLandBlk && setReg == INVALIDREGNUM) {
5111 +    //break_logical
5112 +    // The predicate is inverted when the exit is not on the branch's true side.
5113 +    if (trueBranch != exitBlk) {
5114 +      reversePredicateSetter(branchInstrPos);
5115 +    }
5116 +    CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL);
5117 +  } else {
5118 +    if (trueBranch != exitBlk) {
5119 +      reversePredicateSetter(branchInstr);  // InstrT* is converted to a block iterator here
5120 +    }
5121 +    CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL);
5122 +    if (exitBlk != exitLandBlk) {
5123 +      //splice is insert-before ...
5124 +      exitingBlk->splice(branchInstrPos, exitBlk, exitBlk->begin(),
5125 +                         exitBlk->end());
5126 +    }
5127 +    if (setReg != INVALIDREGNUM) {
5128 +      CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
5129 +    }
5130 +    CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::BREAK, passRep);
5131 +  } //if_logical
5132 +
5133 +  //now branchInstr can be erased safely
5134 +  branchInstr->eraseFromParent();
5135 +
5136 +  //now take care of successors, retire blocks
5137 +  exitingBlk->removeSuccessor(exitBlk);
5138 +  if (exitBlk != exitLandBlk) {
5139 +    //exitBlk's instructions were spliced into exitingBlk above; drop its edge and retire it
5140 +    exitBlk->removeSuccessor(exitLandBlk);
5141 +    retireBlock(exitingBlk, exitBlk);
5142 +  }
5143 +
5144 +} //mergeLoopbreakBlock
5145 +// Rewrite the end of contingBlk as a continue, or as "setReg = 1; break" when setReg is valid.
5146 +template<class PassT>
5147 +void CFGStructurizer<PassT>::settleLoopcontBlock(BlockT *contingBlk,
5148 +                                                 BlockT *contBlk,
5149 +                                                 RegiT   setReg) {
5150 +  if (DEBUGME) {
5151 +    errs() << "settleLoopcontBlock conting = BB"
5152 +           << contingBlk->getNumber()
5153 +           << ", cont = BB" << contBlk->getNumber() << "\n";
5154 +  }
5155 +
5156 +  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(contingBlk);
5157 +  if (branchInstr) {
5158 +    assert(CFGTraits::isCondBranch(branchInstr));
5159 +    typename BlockT::iterator branchInstrPos =
5160 +      CFGTraits::getInstrPos(contingBlk, branchInstr);
5161 +    BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
5162 +    int oldOpcode = branchInstr->getOpcode();
5163 +    DebugLoc DL = branchInstr->getDebugLoc();
5164 +
5165 +    //    transform contingBlk to
5166 +    //     if () {
5167 +    //          move instr after branchInstr
5168 +    //          continue
5169 +    //        or
5170 +    //          setReg = 1
5171 +    //          break
5172 +    //     }endif
5173 +    //     successor = {orgSuccessor(contingBlk) - loopHeader}
5174 +    // The single-instruction form needs no setReg and the branch as the block's last instruction.
5175 +    bool useContinueLogical =
5176 +      (setReg == INVALIDREGNUM && (&*contingBlk->rbegin()) == branchInstr);
5177 +
5178 +    if (useContinueLogical == false) {
5179 +      int branchOpcode =
5180 +        trueBranch == contBlk ? CFGTraits::getBranchNzeroOpcode(oldOpcode)
5181 +                              : CFGTraits::getBranchZeroOpcode(oldOpcode);
5182 +
5183 +      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
5184 +
5185 +      if (setReg != INVALIDREGNUM) {
5186 +        CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
5187 +        // insertEnd to ensure phi-moves, if exist, go before the break-instr.
5188 +        CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, DL);
5189 +      } else {
5190 +        // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
5191 +        CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, DL);
5192 +      }
5193 +
5194 +      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::ENDIF, passRep, DL);
5195 +    } else {
5196 +      int branchOpcode =
5197 +        trueBranch == contBlk ? CFGTraits::getContinueNzeroOpcode(oldOpcode)
5198 +                              : CFGTraits::getContinueZeroOpcode(oldOpcode);
5199 +
5200 +      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
5201 +    }
5202 +
5203 +    branchInstr->eraseFromParent();
5204 +  } else {
5205 +    // If we've arrived here then we've already erased the branch instruction.
5206 +    // Travel back up the basic block to see the last reference of our debug location;
5207 +    // we've just inserted that reference here so it should be representative.
5208 +    if (setReg != INVALIDREGNUM) {
5209 +      CFGTraits::insertAssignInstrBefore(contingBlk, passRep, setReg, 1);
5210 +      // insertEnd to ensure phi-moves, if exist, go before the break-instr.
5211 +      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
5212 +    } else {
5213 +      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
5214 +      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
5215 +    }
5216 +  } //else
5217 +
5218 +} //settleLoopcontBlock
5219 +
5220 +// BBs in exitBlkSet are determined as in break-path for loopRep.
5221 +// Before we can put code for BBs as inside loop-body for loopRep,
5222 +// check whether those BBs are determined as cont-BB for parentLoopRep
5223 +// earlier.
5224 +// If so, generate a new BB newBlk
5225 +//    (1) set newBlk as common successor of the end block of each exit path
5226 +//    (2) erase the continue-instr at the end of each of those paths
5227 +//    (3) generate continue-instr in newBlk
5228 +// Returns newBlk, or NULL when some exit path does not end in a continue-instr.
5229 +template<class PassT>
5230 +typename CFGStructurizer<PassT>::BlockT *
5231 +CFGStructurizer<PassT>::relocateLoopcontBlock(LoopT *parentLoopRep,
5232 +                                              LoopT *loopRep,
5233 +                                              std::set<BlockT *> &exitBlkSet,
5234 +                                              BlockT *exitLandBlk) {
5235 +  std::set<BlockT *> endBlkSet;
5236 +
5237 +  // First pass only collects the path ends; nothing is modified if any path
5238 +  // fails the check. parentLoopRep and loopRep are not referenced in this body.
5239 +  for (typename std::set<BlockT *>::const_iterator iter = exitBlkSet.begin(),
5240 +       iterEnd = exitBlkSet.end();
5241 +       iter != iterEnd; ++iter) {
5242 +    BlockT *exitBlk = *iter;
5243 +    BlockT *endBlk = singlePathEnd(exitBlk, exitLandBlk);
5244 +
5245 +    if (endBlk == NULL || CFGTraits::getContinueInstr(endBlk) == NULL)
5246 +      return NULL;
5247 +
5248 +    endBlkSet.insert(endBlk);
5249 +  }
5250 +
5251 +  BlockT *newBlk = funcRep->CreateMachineBasicBlock();
5252 +  funcRep->push_back(newBlk);  //insert to function
5253 +  CFGTraits::insertInstrEnd(newBlk, AMDGPU::CONTINUE, passRep);
5254 +  SHOWNEWBLK(newBlk, "New continue block: ");
5255 +
5256 +  for (typename std::set<BlockT*>::const_iterator iter = endBlkSet.begin(),
5257 +       iterEnd = endBlkSet.end();
5258 +       iter != iterEnd; ++iter) {
5259 +      BlockT *endBlk = *iter;
5260 +      InstrT *contInstr = CFGTraits::getContinueInstr(endBlk);
5261 +      if (contInstr) {
5262 +        contInstr->eraseFromParent();
5263 +      }
5264 +      endBlk->addSuccessor(newBlk);
5265 +      if (DEBUGME) {
5266 +        errs() << "Add new continue Block to BB"
5267 +               << endBlk->getNumber() << " successors\n";
5268 +      }
5269 +  }
5270 +
5271 +  return newBlk;
5272 +} //relocateLoopcontBlock
5273 +
5274 +
5275 +// LoopEndbranchBlock is a BB created by the CFGStructurizer to use as
5276 +// LoopLandBlock. This BB branch on the loop endBranchInit register to the
5277 +// pathes corresponding to the loop exiting branches.
5278 +
5279 +template<class PassT>
5280 +typename CFGStructurizer<PassT>::BlockT *
5281 +CFGStructurizer<PassT>::addLoopEndbranchBlock(LoopT *loopRep,
5282 +                                              BlockTSmallerVector &exitingBlks,
5283 +                                              BlockTSmallerVector &exitBlks) {
5284 +  const AMDGPUInstrInfo *tii =
5285 +             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
5286 +  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
5287 +
5288 +  RegiT endBranchReg = static_cast<int>
5289 +    (funcRep->getRegInfo().createVirtualRegister(I32RC));
5290 +  assert(endBranchReg >= 0);
5291 +
5292 +  // reg = 0 before entering the loop
5293 +  addLoopEndbranchInitReg(loopRep, endBranchReg);
5294 +
5295 +  uint32_t numBlks = static_cast<uint32_t>(exitingBlks.size());
5296 +  assert(numBlks >=2 && numBlks == exitBlks.size());
5297 +
5298 +  BlockT *preExitingBlk = exitingBlks[0];
5299 +  BlockT *preExitBlk = exitBlks[0];
5300 +  BlockT *preBranchBlk = funcRep->CreateMachineBasicBlock();
5301 +  funcRep->push_back(preBranchBlk);  //insert to function
5302 +  SHOWNEWBLK(preBranchBlk, "New loopEndbranch block: ");
5303 +
5304 +  BlockT *newLandBlk = preBranchBlk;
5305 +
5306 +      CFGTraits::replaceInstrUseOfBlockWith(preExitingBlk, preExitBlk,
5307 +        newLandBlk);
5308 +  preExitingBlk->removeSuccessor(preExitBlk);
5309 +  preExitingBlk->addSuccessor(newLandBlk);
5310 +
5311 +  //it is redundant to add reg = 0 to exitingBlks[0]
5312 +
5313 +  // For 1..n th exiting path (the last iteration handles two pathes) create the
5314 +  // branch to the previous path and the current path.
5315 +  for (uint32_t i = 1; i < numBlks; ++i) {
5316 +    BlockT *curExitingBlk = exitingBlks[i];
5317 +    BlockT *curExitBlk = exitBlks[i];
5318 +    BlockT *curBranchBlk;
5319 +
5320 +    if (i == numBlks - 1) {
5321 +      curBranchBlk = curExitBlk;
5322 +    } else {
5323 +      curBranchBlk = funcRep->CreateMachineBasicBlock();
5324 +      funcRep->push_back(curBranchBlk);  //insert to function
5325 +      SHOWNEWBLK(curBranchBlk, "New loopEndbranch block: ");
5326 +    }
5327 +
5328 +    // Add reg = i to exitingBlks[i].
5329 +    CFGTraits::insertAssignInstrBefore(curExitingBlk, passRep,
5330 +                                       endBranchReg, i);
5331 +
5332 +    // Remove the edge (exitingBlks[i] exitBlks[i]) add new edge
5333 +    // (exitingBlks[i], newLandBlk).
5334 +    CFGTraits::replaceInstrUseOfBlockWith(curExitingBlk, curExitBlk,
5335 +                                          newLandBlk);
5336 +    curExitingBlk->removeSuccessor(curExitBlk);
5337 +    curExitingBlk->addSuccessor(newLandBlk);
5338 +
5339 +    // add to preBranchBlk the branch instruction:
5340 +    // if (endBranchReg == preVal)
5341 +    //    preExitBlk
5342 +    // else
5343 +    //    curBranchBlk
5344 +    //
5345 +    // preValReg = i - 1
5346 +
5347 +  DebugLoc DL;
5348 +  RegiT preValReg = static_cast<int>
5349 +    (funcRep->getRegInfo().createVirtualRegister(I32RC));
5350 +
5351 +  preBranchBlk->insert(preBranchBlk->begin(),
5352 +                       tii->getMovImmInstr(preBranchBlk->getParent(), preValReg,
5353 +                       i - 1));
5354 +
5355 +  // condResReg = (endBranchReg == preValReg)
5356 +    RegiT condResReg = static_cast<int>
5357 +      (funcRep->getRegInfo().createVirtualRegister(I32RC));
5358 +    BuildMI(preBranchBlk, DL, tii->get(tii->getIEQOpcode()), condResReg)
5359 +      .addReg(endBranchReg).addReg(preValReg);
5360 +
5361 +    BuildMI(preBranchBlk, DL, tii->get(AMDGPU::BRANCH_COND_i32))
5362 +      .addMBB(preExitBlk).addReg(condResReg);
5363 +
5364 +    preBranchBlk->addSuccessor(preExitBlk);
5365 +    preBranchBlk->addSuccessor(curBranchBlk);
5366 +
5367 +    // Update preExitingBlk, preExitBlk, preBranchBlk.
5368 +    preExitingBlk = curExitingBlk;
5369 +    preExitBlk = curExitBlk;
5370 +    preBranchBlk = curBranchBlk;
5371 +
5372 +  }  //end for 1 .. n blocks
5373 +
5374 +  return newLandBlk;
5375 +} //addLoopEndbranchBlock
5376 +// Classify the single-successor chain starting at srcBlk with respect to dstBlk.
5377 +template<class PassT>
5378 +typename CFGStructurizer<PassT>::PathToKind
5379 +CFGStructurizer<PassT>::singlePathTo(BlockT *srcBlk, BlockT *dstBlk,
5380 +                                     bool allowSideEntry) {
5381 +  assert(dstBlk);
5382 +
5383 +  if (srcBlk == dstBlk) {
5384 +    return SinglePath_InPath;
5385 +  }
5386 +  // Follow the chain while each block has exactly one successor.
5387 +  while (srcBlk && srcBlk->succ_size() == 1) {
5388 +    srcBlk = *srcBlk->succ_begin();
5389 +    if (srcBlk == dstBlk) {
5390 +      return SinglePath_InPath;
5391 +    }
5392 +    // A block with several predecessors is a side entry into the chain.
5393 +    if (!allowSideEntry && srcBlk->pred_size() > 1) {
5394 +      return Not_SinglePath;
5395 +    }
5396 +  }
5397 +  // The chain ended in a block without successors and never met dstBlk.
5398 +  if (srcBlk && srcBlk->succ_size()==0) {
5399 +    return SinglePath_NotInPath;
5400 +  }
5401 +
5402 +  return Not_SinglePath;
5403 +} //singlePathTo
5404 +
5405 +// If there is a single path from srcBlk to dstBlk, return the last block before
5406 +// dstBlk. If there is a single path from srcBlk->end without dstBlk, return the
5407 +// last block in the path. Otherwise, return NULL.
5408 +template<class PassT>
5409 +typename CFGStructurizer<PassT>::BlockT *
5410 +CFGStructurizer<PassT>::singlePathEnd(BlockT *srcBlk, BlockT *dstBlk,
5411 +                                      bool allowSideEntry) {
5412 +  assert(dstBlk);
5413 +
5414 +  if (srcBlk == dstBlk) {
5415 +    return srcBlk;
5416 +  }
5417 +
5418 +  if (srcBlk->succ_size() == 0) {
5419 +    return srcBlk;
5420 +  }
5421 +  // NOTE(review): unlike singlePathTo, the loop never compares against dstBlk - confirm intent
5422 +  while (srcBlk && srcBlk->succ_size() == 1) {
5423 +    BlockT *preBlk = srcBlk;
5424 +
5425 +    srcBlk = *srcBlk->succ_begin();
5426 +    if (srcBlk == NULL) {
5427 +      return preBlk;
5428 +    }
5429 +
5430 +    if (!allowSideEntry && srcBlk->pred_size() > 1) {
5431 +      return NULL;
5432 +    }
5433 +  }
5434 +  // The chain ended in a block without successors: that block is the path end.
5435 +  if (srcBlk && srcBlk->succ_size()==0) {
5436 +    return srcBlk;
5437 +  }
5438 +
5439 +  return NULL;
5440 +
5441 +} //singlePathEnd
5442 +// Walk srcBlk's chain up to dstBlk, cloning each multi-predecessor block; returns the clone count.
5443 +template<class PassT>
5444 +int CFGStructurizer<PassT>::cloneOnSideEntryTo(BlockT *preBlk, BlockT *srcBlk,
5445 +                                               BlockT *dstBlk) {
5446 +  int cloned = 0;
5447 +  assert(preBlk->isSuccessor(srcBlk));
5448 +  while (srcBlk && srcBlk != dstBlk) {
5449 +    assert(srcBlk->succ_size() == 1);
5450 +    if (srcBlk->pred_size() > 1) {
5451 +      srcBlk = cloneBlockForPredecessor(srcBlk, preBlk);  // continue the walk from the clone
5452 +      ++cloned;
5453 +    }
5454 +
5455 +    preBlk = srcBlk;
5456 +    srcBlk = *srcBlk->succ_begin();
5457 +  }
5458 +
5459 +  return cloned;
5460 +} //cloneOnSideEntryTo
5461 +
5462 +template<class PassT>
5463 +typename CFGStructurizer<PassT>::BlockT *
5464 +CFGStructurizer<PassT>::cloneBlockForPredecessor(BlockT *curBlk,
5465 +                                                 BlockT *predBlk) {
5466 +  assert(predBlk->isSuccessor(curBlk) &&
5467 +         "succBlk is not a prececessor of curBlk");
5468 +
5469 +  BlockT *cloneBlk = CFGTraits::clone(curBlk);  //clone instructions
5470 +  CFGTraits::replaceInstrUseOfBlockWith(predBlk, curBlk, cloneBlk);
5471 +  //srcBlk, oldBlk, newBlk
5472 +
5473 +  predBlk->removeSuccessor(curBlk);
5474 +  predBlk->addSuccessor(cloneBlk);
5475 +
5476 +  // add all successor to cloneBlk
5477 +  CFGTraits::cloneSuccessorList(cloneBlk, curBlk);
5478 +
5479 +  numClonedInstr += curBlk->size();
5480 +
5481 +  if (DEBUGME) {
5482 +    errs() << "Cloned block: " << "BB"
5483 +           << curBlk->getNumber() << "size " << curBlk->size() << "\n";
5484 +  }
5485 +
5486 +  SHOWNEWBLK(cloneBlk, "result of Cloned block: ");
5487 +
5488 +  return cloneBlk;
5489 +} //cloneBlockForPredecessor
5490 +// Return the one successor of exitingBlk that loopRep does not contain (asserted to be unique).
5491 +template<class PassT>
5492 +typename CFGStructurizer<PassT>::BlockT *
5493 +CFGStructurizer<PassT>::exitingBlock2ExitBlock(LoopT *loopRep,
5494 +                                               BlockT *exitingBlk) {
5495 +  BlockT *exitBlk = NULL;
5496 +
5497 +  for (typename BlockT::succ_iterator iterSucc = exitingBlk->succ_begin(),
5498 +       iterSuccEnd = exitingBlk->succ_end();
5499 +       iterSucc != iterSuccEnd; ++iterSucc) {
5500 +    BlockT *curBlk = *iterSucc;
5501 +    if (!loopRep->contains(curBlk)) {
5502 +      assert(exitBlk == NULL);  // a second out-of-loop successor is not expected
5503 +      exitBlk = curBlk;
5504 +    }
5505 +  }
5506 +
5507 +  assert(exitBlk != NULL);
5508 +
5509 +  return exitBlk;
5510 +} //exitingBlock2ExitBlock
5511 +// Move srcBlk's instructions, up to but excluding its branch instruction, into dstBlk before insertPos.
5512 +template<class PassT>
5513 +void CFGStructurizer<PassT>::migrateInstruction(BlockT *srcBlk,
5514 +                                                BlockT *dstBlk,
5515 +                                                InstrIterator insertPos) {
5516 +  InstrIterator spliceEnd;
5517 +  //look for the input branchinstr, not the AMDGPU branchinstr
5518 +  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
5519 +  if (branchInstr == NULL) {
5520 +    if (DEBUGME) {
5521 +      errs() << "migrateInstruction don't see branch instr\n" ;
5522 +    }
5523 +    spliceEnd = srcBlk->end();  // no branch: move the whole block
5524 +  } else {
5525 +    if (DEBUGME) {
5526 +      errs() << "migrateInstruction see branch instr\n" ;
5527 +      branchInstr->dump();
5528 +    }
5529 +    spliceEnd = CFGTraits::getInstrPos(srcBlk, branchInstr);  // stop just before the branch
5530 +  }
5531 +  if (DEBUGME) {
5532 +    errs() << "migrateInstruction before splice dstSize = " << dstBlk->size()
5533 +      << "srcSize = " << srcBlk->size() << "\n";
5534 +  }
5535 +
5536 +  //splice insert before insertPos
5537 +  dstBlk->splice(insertPos, srcBlk, srcBlk->begin(), spliceEnd);
5538 +
5539 +  if (DEBUGME) {
5540 +    errs() << "migrateInstruction after splice dstSize = " << dstBlk->size()
5541 +      << "srcSize = " << srcBlk->size() << "\n";
5542 +  }
5543 +} //migrateInstruction
5544 +
5545 +// normalizeInfiniteLoopExit changes
5546 +//   B1:
5547 +//        uncond_br LoopHeader
5548 +//
5549 +// to
5550 +//   B1:
5551 +//        cond_br 1 LoopHeader dummyExit
5552 +// and returns the newly added dummy exit block (NULL if nothing was changed).
5553 +//
5554 +template<class PassT>
5555 +typename CFGStructurizer<PassT>::BlockT *
5556 +CFGStructurizer<PassT>::normalizeInfiniteLoopExit(LoopT* LoopRep) {
5557 +  BlockT *loopHeader;
5558 +  BlockT *loopLatch;
5559 +  loopHeader = LoopRep->getHeader();
5560 +  loopLatch = LoopRep->getLoopLatch();
5561 +  BlockT *dummyExitBlk = NULL;
5562 +  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
5563 +  if (loopHeader!=NULL && loopLatch!=NULL) {
5564 +    InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(loopLatch);
5565 +    if (branchInstr!=NULL && CFGTraits::isUncondBranch(branchInstr)) {
5566 +      dummyExitBlk = funcRep->CreateMachineBasicBlock();
5567 +      funcRep->push_back(dummyExitBlk);  //insert to function
5568 +      SHOWNEWBLK(dummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
5569 +
5570 +      if (DEBUGME) errs() << "Old branch instr: " << *branchInstr << "\n";
5571 +      // Materialize the constant 1 and branch on it, right before the old branch.
5572 +      typename BlockT::iterator insertPos =
5573 +        CFGTraits::getInstrPos(loopLatch, branchInstr);
5574 +      unsigned immReg =
5575 +        funcRep->getRegInfo().createVirtualRegister(I32RC);
5576 +      CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 1);
5577 +      InstrT *newInstr =
5578 +        CFGTraits::insertInstrBefore(insertPos, AMDGPU::BRANCH_COND_i32, passRep);
5579 +      MachineInstrBuilder(newInstr).addMBB(loopHeader).addReg(immReg, false);
5580 +
5581 +      SHOWNEWINSTR(newInstr);
5582 +
5583 +      branchInstr->eraseFromParent();
5584 +      loopLatch->addSuccessor(dummyExitBlk);  // dummyExitBlk appears only as a CFG successor
5585 +    }
5586 +  }
5587 +
5588 +  return dummyExitBlk;
5589 +} //normalizeInfiniteLoopExit
5590 +
5591 +template<class PassT>  // Erases the unconditional branch(es) ending srcBlk.
5592 +void CFGStructurizer<PassT>::removeUnconditionalBranch(BlockT *srcBlk) {
5593 +  InstrT *branchInstr;
5594 +
5595 +  // Loop because one basic block may hold two unconditional branches (seen in
5596 +  // test_fc_do_while_or.c); once that is fixed upstream the loop can go away.
5597 +  while ((branchInstr = CFGTraits::getLoopendBlockBranchInstr(srcBlk))
5598 +          && CFGTraits::isUncondBranch(branchInstr)) {
5599 +    if (DEBUGME) {
5600 +          errs() << "Removing unconditional branch instruction" ;
5601 +      branchInstr->dump();
5602 +    }
5603 +    branchInstr->eraseFromParent();
5604 +  }
5605 +} //removeUnconditionalBranch
5606 +
5607 +template<class PassT>
5608 +void CFGStructurizer<PassT>::removeRedundantConditionalBranch(BlockT *srcBlk) {
5609 +  if (srcBlk->succ_size() == 2) {
5610 +    BlockT *blk1 = *srcBlk->succ_begin();
5611 +    BlockT *blk2 = *(++srcBlk->succ_begin());
5612 +
5613 +    if (blk1 == blk2) {
5614 +      InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
5615 +      assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
5616 +      if (DEBUGME) {
5617 +        errs() << "Removing unneeded conditional branch instruction" ;
5618 +        branchInstr->dump();
5619 +      }
5620 +      branchInstr->eraseFromParent();
5621 +      SHOWNEWBLK(blk1, "Removing redundant successor");
5622 +      srcBlk->removeSuccessor(blk1);
5623 +    }
5624 +  }
5625 +} //removeRedundantConditionalBranch
5626 +
5627 +template<class PassT>
5628 +void CFGStructurizer<PassT>::addDummyExitBlock(SmallVector<BlockT*,
5629 +                                               DEFAULT_VEC_SLOTS> &retBlks) {
5630 +  BlockT *dummyExitBlk = funcRep->CreateMachineBasicBlock();
5631 +  funcRep->push_back(dummyExitBlk);  //insert to function
5632 +  CFGTraits::insertInstrEnd(dummyExitBlk, AMDGPU::RETURN, passRep);
5633 +
5634 +  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator iter =
5635 +         retBlks.begin(),
5636 +       iterEnd = retBlks.end(); iter != iterEnd; ++iter) {
5637 +    BlockT *curBlk = *iter;
5638 +    InstrT *curInstr = CFGTraits::getReturnInstr(curBlk);
5639 +    if (curInstr) {
5640 +      curInstr->eraseFromParent();
5641 +    }
5642 +    curBlk->addSuccessor(dummyExitBlk);
5643 +    if (DEBUGME) {
5644 +      errs() << "Add dummyExitBlock to BB" << curBlk->getNumber()
5645 +             << " successors\n";
5646 +    }
5647 +  } //for
5648 +
5649 +  SHOWNEWBLK(dummyExitBlk, "DummyExitBlock: ");
5650 +} //addDummyExitBlock
5651 +
5652 +template<class PassT>
5653 +void CFGStructurizer<PassT>::removeSuccessor(BlockT *srcBlk) {
5654 +  while (srcBlk->succ_size()) {
5655 +    srcBlk->removeSuccessor(*srcBlk->succ_begin());
5656 +  }
5657 +}
5658 +
5659 +template<class PassT>
5660 +void CFGStructurizer<PassT>::recordSccnum(BlockT *srcBlk, int sccNum) {
5661 +  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
5662 +
5663 +  if (srcBlkInfo == NULL) {
5664 +    srcBlkInfo = new BlockInfo();
5665 +  }
5666 +
5667 +  srcBlkInfo->sccNum = sccNum;
5668 +}
5669 +
5670 +template<class PassT>
5671 +int CFGStructurizer<PassT>::getSCCNum(BlockT *srcBlk) {
5672 +  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
5673 +  return srcBlkInfo ? srcBlkInfo->sccNum : INVALIDSCCNUM;
5674 +}
5675 +
5676 +template<class PassT>  // Marks srcBlk as retired; dstBlk is not used here.
5677 +void CFGStructurizer<PassT>::retireBlock(BlockT *dstBlk, BlockT *srcBlk) {
5678 +  if (DEBUGME) {
5679 +        errs() << "Retiring BB" << srcBlk->getNumber() << "\n";
5680 +  }
5681 +
5682 +  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
5683 +
5684 +  if (srcBlkInfo == NULL) {
5685 +    srcBlkInfo = new BlockInfo();  // first record kept for this block
5686 +  }
5687 +  // The block must already have no successors and no predecessors.
5688 +  srcBlkInfo->isRetired = true;
5689 +  assert(srcBlk->succ_size() == 0 && srcBlk->pred_size() == 0
5690 +         && "can't retire block yet");
5691 +}
5692 +
5693 +template<class PassT>
5694 +bool CFGStructurizer<PassT>::isRetiredBlock(BlockT *srcBlk) {
5695 +  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
5696 +  return (srcBlkInfo && srcBlkInfo->isRetired);
5697 +}
5698 +
5699 +template<class PassT>
5700 +bool CFGStructurizer<PassT>::isActiveLoophead(BlockT *curBlk) {
5701 +  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
5702 +  while (loopRep && loopRep->getHeader() == curBlk) {
5703 +    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
5704 +
5705 +    if(loopLand == NULL)
5706 +      return true;
5707 +
5708 +    BlockT *landBlk = loopLand->landBlk;
5709 +    assert(landBlk);
5710 +    if (!isRetiredBlock(landBlk)) {
5711 +      return true;
5712 +    }
5713 +
5714 +    loopRep = loopRep->getParentLoop();
5715 +  }
5716 +
5717 +  return false;
5718 +} //isActiveLoophead
5719 +
5720 +template<class PassT>  // True for a large block with several predecessors.
5721 +bool CFGStructurizer<PassT>::needMigrateBlock(BlockT *blk) {
5722 +  const unsigned blockSizeThreshold = 30;    // blk->size() must exceed this
5723 +  const unsigned cloneInstrThreshold = 100;  // size * (preds - 1) must exceed
5724 +
5725 +  bool multiplePreds = blk && (blk->pred_size() > 1);
5726 +
5727 +  if(!multiplePreds)
5728 +    return false;  // also taken when blk is NULL
5729 +
5730 +  unsigned blkSize = blk->size();
5731 +  return ((blkSize > blockSizeThreshold)
5732 +          && (blkSize * (blk->pred_size() - 1) > cloneInstrThreshold));
5733 +} //needMigrateBlock
5734 +
5735 +template<class PassT>  // Records (creating if needed) loopRep's landing block.
5736 +typename CFGStructurizer<PassT>::BlockT *
5737 +CFGStructurizer<PassT>::recordLoopLandBlock(LoopT *loopRep, BlockT *landBlk,
5738 +                                            BlockTSmallerVector &exitBlks,
5739 +                                            std::set<BlockT *> &exitBlkSet) {
5740 +  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> inpathBlks;  // preds on exit paths
5741 +  // Collect the predecessors of landBlk that are in the loop or in exitBlkSet.
5742 +  for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
5743 +       predIterEnd = landBlk->pred_end();
5744 +       predIter != predIterEnd; ++predIter) {
5745 +    BlockT *curBlk = *predIter;
5746 +    if (loopRep->contains(curBlk) || exitBlkSet.count(curBlk)) {
5747 +      inpathBlks.push_back(curBlk);
5748 +    }
5749 +  } //for
5750 +
5751 +  // If landBlk has predecessors that were not collected above, create a new
5752 +  // block in front of it and redirect only the collected predecessors to it.
5753 +  BlockT *newLandBlk = landBlk;
5754 +  if (inpathBlks.size() != landBlk->pred_size()) {
5755 +    newLandBlk = funcRep->CreateMachineBasicBlock();
5756 +    funcRep->push_back(newLandBlk);  // append the new block to the function
5757 +    newLandBlk->addSuccessor(landBlk);
5758 +    for (typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::iterator iter =
5759 +         inpathBlks.begin(),
5760 +         iterEnd = inpathBlks.end(); iter != iterEnd; ++iter) {
5761 +      BlockT *curBlk = *iter;
5762 +      CFGTraits::replaceInstrUseOfBlockWith(curBlk, landBlk, newLandBlk);
5763 +      // argument order above: srcBlk, oldBlk, newBlk
5764 +      curBlk->removeSuccessor(landBlk);
5765 +      curBlk->addSuccessor(newLandBlk);
5766 +    }
5767 +    for (size_t i = 0, tot = exitBlks.size(); i < tot; ++i) {
5768 +      if (exitBlks[i] == landBlk) {
5769 +        exitBlks[i] = newLandBlk;  // keep the caller's exit list in sync
5770 +      }
5771 +    }
5772 +    SHOWNEWBLK(newLandBlk, "NewLandingBlock: ");
5773 +  }
5774 +
5775 +  setLoopLandBlock(loopRep, newLandBlk);
5776 +
5777 +  return newLandBlk;
5778 +} // recordLoopLandBlock
5779 +
5780 +template<class PassT>
5781 +void CFGStructurizer<PassT>::setLoopLandBlock(LoopT *loopRep, BlockT *blk) {
5782 +  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
5783 +
5784 +  if (theEntry == NULL) {
5785 +    theEntry = new LoopLandInfo();
5786 +  }
5787 +  assert(theEntry->landBlk == NULL);
5788 +
5789 +  if (blk == NULL) {
5790 +    blk = funcRep->CreateMachineBasicBlock();
5791 +    funcRep->push_back(blk);  //insert to function
5792 +    SHOWNEWBLK(blk, "DummyLandingBlock for loop without break: ");
5793 +  }
5794 +
5795 +  theEntry->landBlk = blk;
5796 +
5797 +  if (DEBUGME) {
5798 +    errs() << "setLoopLandBlock loop-header = BB"
5799 +           << loopRep->getHeader()->getNumber()
5800 +           << "  landing-block = BB" << blk->getNumber() << "\n";
5801 +  }
5802 +} // setLoopLandBlock
5803 +
5804 +template<class PassT>
5805 +void CFGStructurizer<PassT>::addLoopBreakOnReg(LoopT *loopRep, RegiT regNum) {
5806 +  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
5807 +
5808 +  if (theEntry == NULL) {
5809 +    theEntry = new LoopLandInfo();
5810 +  }
5811 +
5812 +  theEntry->breakOnRegs.insert(regNum);
5813 +
5814 +  if (DEBUGME) {
5815 +    errs() << "addLoopBreakOnReg loop-header = BB"
5816 +           << loopRep->getHeader()->getNumber()
5817 +           << "  regNum = " << regNum << "\n";
5818 +  }
5819 +} // addLoopBreakOnReg
5820 +
5821 +template<class PassT>
5822 +void CFGStructurizer<PassT>::addLoopContOnReg(LoopT *loopRep, RegiT regNum) {
5823 +  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
5824 +
5825 +  if (theEntry == NULL) {
5826 +    theEntry = new LoopLandInfo();
5827 +  }
5828 +  theEntry->contOnRegs.insert(regNum);
5829 +
5830 +  if (DEBUGME) {
5831 +    errs() << "addLoopContOnReg loop-header = BB"
5832 +           << loopRep->getHeader()->getNumber()
5833 +           << "  regNum = " << regNum << "\n";
5834 +  }
5835 +} // addLoopContOnReg
5836 +
5837 +template<class PassT>
5838 +void CFGStructurizer<PassT>::addLoopBreakInitReg(LoopT *loopRep, RegiT regNum) {
5839 +  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
5840 +
5841 +  if (theEntry == NULL) {
5842 +    theEntry = new LoopLandInfo();
5843 +  }
5844 +  theEntry->breakInitRegs.insert(regNum);
5845 +
5846 +  if (DEBUGME) {
5847 +    errs() << "addLoopBreakInitReg loop-header = BB"
5848 +           << loopRep->getHeader()->getNumber()
5849 +           << "  regNum = " << regNum << "\n";
5850 +  }
5851 +} // addLoopBreakInitReg
5852 +
5853 +template<class PassT>
5854 +void CFGStructurizer<PassT>::addLoopContInitReg(LoopT *loopRep, RegiT regNum) {
5855 +  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
5856 +
5857 +  if (theEntry == NULL) {
5858 +    theEntry = new LoopLandInfo();
5859 +  }
5860 +  theEntry->contInitRegs.insert(regNum);
5861 +
5862 +  if (DEBUGME) {
5863 +    errs() << "addLoopContInitReg loop-header = BB"
5864 +           << loopRep->getHeader()->getNumber()
5865 +           << "  regNum = " << regNum << "\n";
5866 +  }
5867 +} // addLoopContInitReg
5868 +
5869 +template<class PassT>
5870 +void CFGStructurizer<PassT>::addLoopEndbranchInitReg(LoopT *loopRep,
5871 +                                                     RegiT regNum) {
5872 +  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
5873 +
5874 +  if (theEntry == NULL) {
5875 +    theEntry = new LoopLandInfo();
5876 +  }
5877 +  theEntry->endbranchInitRegs.insert(regNum);
5878 +
5879 +  if (DEBUGME) {
5880 +        errs() << "addLoopEndbranchInitReg loop-header = BB"
5881 +      << loopRep->getHeader()->getNumber()
5882 +      << "  regNum = " << regNum << "\n";
5883 +  }
5884 +} // addLoopEndbranchInitReg
5885 +
5886 +template<class PassT>
5887 +typename CFGStructurizer<PassT>::LoopLandInfo *
5888 +CFGStructurizer<PassT>::getLoopLandInfo(LoopT *loopRep) {
5889 +  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
5890 +
5891 +  return theEntry;
5892 +} // getLoopLandInfo
5893 +
5894 +template<class PassT>
5895 +typename CFGStructurizer<PassT>::BlockT *
5896 +CFGStructurizer<PassT>::getLoopLandBlock(LoopT *loopRep) {
5897 +  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
5898 +
5899 +  return theEntry ? theEntry->landBlk : NULL;
5900 +} // getLoopLandBlock
5901 +
5902 +
5903 +template<class PassT>
5904 +bool CFGStructurizer<PassT>::hasBackEdge(BlockT *curBlk) {
5905 +  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
5906 +  if (loopRep == NULL)
5907 +    return false;
5908 +
5909 +  BlockT *loopHeader = loopRep->getHeader();
5910 +
5911 +  return curBlk->isSuccessor(loopHeader);
5912 +
5913 +} //hasBackEdge
5914 +
5915 +template<class PassT>
5916 +unsigned CFGStructurizer<PassT>::getLoopDepth(LoopT *loopRep) {
5917 +  return loopRep ? loopRep->getLoopDepth() : 0;
5918 +} //getLoopDepth
5919 +
5920 +template<class PassT>
5921 +int CFGStructurizer<PassT>::countActiveBlock
5922 +(typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterStart,
5923 + typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterEnd) {
5924 +  int count = 0;
5925 +  while (iterStart != iterEnd) {
5926 +    if (!isRetiredBlock(*iterStart)) {
5927 +      ++count;
5928 +    }
5929 +    ++iterStart;
5930 +  }
5931 +
5932 +  return count;
5933 +} //countActiveBlock
5934 +
5935 +// This is a workaround for findNearestCommonDominator not being available
5936 +// for post-dominators; a proper fix should go into Dominators.h.
5937 +
5938 +template<class PassT>
5939 +typename CFGStructurizer<PassT>::BlockT*
5940 +CFGStructurizer<PassT>::findNearestCommonPostDom(BlockT *blk1, BlockT *blk2) {
5941 +
5942 +  if (postDomTree->dominates(blk1, blk2)) {
5943 +    return blk1;
5944 +  }
5945 +  if (postDomTree->dominates(blk2, blk1)) {
5946 +    return blk2;
5947 +  }
5948 +
5949 +  DomTreeNodeT *node1 = postDomTree->getNode(blk1);
5950 +  DomTreeNodeT *node2 = postDomTree->getNode(blk2);
5951 +
5952 +  // Handle newly cloned node.
5953 +  if (node1 == NULL && blk1->succ_size() == 1) {
5954 +    return findNearestCommonPostDom(*blk1->succ_begin(), blk2);
5955 +  }
5956 +  if (node2 == NULL && blk2->succ_size() == 1) {
5957 +    return findNearestCommonPostDom(blk1, *blk2->succ_begin());
5958 +  }
5959 +
5960 +  if (node1 == NULL || node2 == NULL) {
5961 +    return NULL;
5962 +  }
5963 +
5964 +  node1 = node1->getIDom();
5965 +  while (node1) {
5966 +    if (postDomTree->dominates(node1, node2)) {
5967 +      return node1->getBlock();
5968 +    }
5969 +    node1 = node1->getIDom();
5970 +  }
5971 +
5972 +  return NULL;
5973 +}
5974 +
5975 +template<class PassT>
5976 +typename CFGStructurizer<PassT>::BlockT *
5977 +CFGStructurizer<PassT>::findNearestCommonPostDom
5978 +(typename std::set<BlockT *> &blks) {
5979 +  BlockT *commonDom;
5980 +  typename std::set<BlockT *>::const_iterator iter = blks.begin();
5981 +  typename std::set<BlockT *>::const_iterator iterEnd = blks.end();
5982 +  for (commonDom = *iter; iter != iterEnd && commonDom != NULL; ++iter) {
5983 +    BlockT *curBlk = *iter;
5984 +    if (curBlk != commonDom) {
5985 +      commonDom = findNearestCommonPostDom(curBlk, commonDom);
5986 +    }
5987 +  }
5988 +
5989 +  if (DEBUGME) {
5990 +    errs() << "Common post dominator for exit blocks is ";
5991 +    if (commonDom) {
5992 +          errs() << "BB" << commonDom->getNumber() << "\n";
5993 +    } else {
5994 +      errs() << "NULL\n";
5995 +    }
5996 +  }
5997 +
5998 +  return commonDom;
5999 +} //findNearestCommonPostDom
6000 +
6001 +} //end namespace llvm
6002 +
6003 +//todo: move-end
6004 +
6005 +
6006 +//===----------------------------------------------------------------------===//
6007 +//
6008 +// CFGStructurizer for AMDGPU
6009 +//
6010 +//===----------------------------------------------------------------------===//
6011 +
6012 +
6013 +using namespace llvmCFGStruct;
6014 +
6015 +namespace llvm {
6016 +class AMDGPUCFGStructurizer : public MachineFunctionPass {
6017 +public:
6018 +  typedef MachineInstr              InstructionType;
6019 +  typedef MachineFunction           FunctionType;
6020 +  typedef MachineBasicBlock         BlockType;
6021 +  typedef MachineLoopInfo           LoopinfoType;
6022 +  typedef MachineDominatorTree      DominatortreeType;
6023 +  typedef MachinePostDominatorTree  PostDominatortreeType;
6024 +  typedef MachineDomTreeNode        DomTreeNodeType;
6025 +  typedef MachineLoop               LoopType;
6026 +
6027 +protected:
6028 +  TargetMachine &TM;
6029 +  const TargetInstrInfo *TII;
6030 +  const AMDGPURegisterInfo *TRI;
6031 +
6032 +public:
6033 +  AMDGPUCFGStructurizer(char &pid, TargetMachine &tm);
6034 +  const TargetInstrInfo *getTargetInstrInfo() const;
6035 +
6036 +private:
6037 +
6038 +};
6039 +
6040 +} //end of namespace llvm
6041 +AMDGPUCFGStructurizer::AMDGPUCFGStructurizer(char &pid, TargetMachine &tm)
6042 +: MachineFunctionPass(pid), TM(tm), TII(tm.getInstrInfo()),
6043 +  TRI(static_cast<const AMDGPURegisterInfo *>(tm.getRegisterInfo())) {
6044 +}
6045 +
6046 +const TargetInstrInfo *AMDGPUCFGStructurizer::getTargetInstrInfo() const {
6047 +  return TII;
6048 +}
6049 +//===----------------------------------------------------------------------===//
6050 +//
6051 +// CFGPrepare
6052 +//
6053 +//===----------------------------------------------------------------------===//
6054 +
6055 +
6056 +using namespace llvmCFGStruct;
6057 +
6058 +namespace llvm {
6059 +class AMDGPUCFGPrepare : public AMDGPUCFGStructurizer {
6060 +public:
6061 +  static char ID;
6062 +
6063 +public:
6064 +  AMDGPUCFGPrepare(TargetMachine &tm);
6065 +
6066 +  virtual const char *getPassName() const;
6067 +  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
6068 +
6069 +  bool runOnMachineFunction(MachineFunction &F);
6070 +
6071 +private:
6072 +
6073 +};
6074 +
6075 +char AMDGPUCFGPrepare::ID = 0;
6076 +} //end of namespace llvm
6077 +
6078 +AMDGPUCFGPrepare::AMDGPUCFGPrepare(TargetMachine &tm)
6079 +  : AMDGPUCFGStructurizer(ID, tm )  {
6080 +}
6081 +const char *AMDGPUCFGPrepare::getPassName() const {
6082 +  return "AMD IL Control Flow Graph Preparation Pass";
6083 +}
6084 +
6085 +void AMDGPUCFGPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
6086 +  AU.addPreserved<MachineFunctionAnalysis>();
6087 +  AU.addRequired<MachineFunctionAnalysis>();
6088 +  AU.addRequired<MachineDominatorTree>();
6089 +  AU.addRequired<MachinePostDominatorTree>();
6090 +  AU.addRequired<MachineLoopInfo>();
6091 +}
6092 +
6093 +//===----------------------------------------------------------------------===//
6094 +//
6095 +// CFGPerform
6096 +//
6097 +//===----------------------------------------------------------------------===//
6098 +
6099 +
6100 +using namespace llvmCFGStruct;
6101 +
6102 +namespace llvm {
6103 +class AMDGPUCFGPerform : public AMDGPUCFGStructurizer {
6104 +public:
6105 +  static char ID;
6106 +
6107 +public:
6108 +  AMDGPUCFGPerform(TargetMachine &tm);
6109 +  virtual const char *getPassName() const;
6110 +  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
6111 +  bool runOnMachineFunction(MachineFunction &F);
6112 +
6113 +private:
6114 +
6115 +};
6116 +
6117 +char AMDGPUCFGPerform::ID = 0;
6118 +} //end of namespace llvm
6119 +
6120 +  AMDGPUCFGPerform::AMDGPUCFGPerform(TargetMachine &tm)
6121 +: AMDGPUCFGStructurizer(ID, tm) {
6122 +}
6123 +
6124 +const char *AMDGPUCFGPerform::getPassName() const {
6125 +  return "AMD IL Control Flow Graph structurizer Pass";
6126 +}
6127 +
6128 +void AMDGPUCFGPerform::getAnalysisUsage(AnalysisUsage &AU) const {
6129 +  AU.addPreserved<MachineFunctionAnalysis>();
6130 +  AU.addRequired<MachineFunctionAnalysis>();
6131 +  AU.addRequired<MachineDominatorTree>();
6132 +  AU.addRequired<MachinePostDominatorTree>();
6133 +  AU.addRequired<MachineLoopInfo>();
6134 +}
6135 +
6136 +//===----------------------------------------------------------------------===//
6137 +//
6138 +// CFGStructTraits<AMDGPUCFGStructurizer>
6139 +//
6140 +//===----------------------------------------------------------------------===//
6141 +
6142 +namespace llvmCFGStruct {
6143 +// This class is tailored to the AMDGPU backend.
6144 +template<>
6145 +struct CFGStructTraits<AMDGPUCFGStructurizer> {
6146 +  typedef int RegiT;
6147 +
6148 +  static int getBranchNzeroOpcode(int oldOpcode) {
6149 +    switch(oldOpcode) {
6150 +    case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
6151 +    case AMDGPU::BRANCH_COND_i32:
6152 +    case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
6153 +    default:
6154 +      assert(0 && "internal error");
6155 +    }
6156 +    return -1;
6157 +  }
6158 +
6159 +  static int getBranchZeroOpcode(int oldOpcode) {
6160 +    switch(oldOpcode) {
6161 +    case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
6162 +    case AMDGPU::BRANCH_COND_i32:
6163 +    case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
6164 +    default:
6165 +      assert(0 && "internal error");
6166 +    }
6167 +    return -1;
6168 +  }
6169 +
6170 +  static int getContinueNzeroOpcode(int oldOpcode) {
6171 +    switch(oldOpcode) {
6172 +    case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
6173 +    default:
6174 +      assert(0 && "internal error");
6175 +    };
6176 +    return -1;
6177 +  }
6178 +
6179 +  static int getContinueZeroOpcode(int oldOpcode) {
6180 +    switch(oldOpcode) {
6181 +    case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
6182 +    default:
6183 +      assert(0 && "internal error");
6184 +    }
6185 +    return -1;
6186 +  }
6187 +
6188 +  static MachineBasicBlock *getTrueBranch(MachineInstr *instr) {
6189 +    return instr->getOperand(0).getMBB();
6190 +  }
6191 +
6192 +  static void setTrueBranch(MachineInstr *instr, MachineBasicBlock *blk) {
6193 +    instr->getOperand(0).setMBB(blk);
6194 +  }
6195 +
6196 +  static MachineBasicBlock *
6197 +  getFalseBranch(MachineBasicBlock *blk, MachineInstr *instr) {
6198 +    assert(blk->succ_size() == 2);
6199 +    MachineBasicBlock *trueBranch = getTrueBranch(instr);
6200 +    MachineBasicBlock::succ_iterator iter = blk->succ_begin();
6201 +    MachineBasicBlock::succ_iterator iterNext = iter;
6202 +    ++iterNext;
6203 +
6204 +    return (*iter == trueBranch) ? *iterNext : *iter;
6205 +  }
6206 +
6207 +  static bool isCondBranch(MachineInstr *instr) {
6208 +    switch (instr->getOpcode()) {
6209 +      case AMDGPU::JUMP:
6210 +        return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() != 0;
6211 +      case AMDGPU::BRANCH_COND_i32:
6212 +      case AMDGPU::BRANCH_COND_f32:
6213 +      break;
6214 +    default:
6215 +      return false;
6216 +    }
6217 +    return true;
6218 +  }
6219 +
6220 +  static bool isUncondBranch(MachineInstr *instr) {
6221 +    switch (instr->getOpcode()) {
6222 +    case AMDGPU::JUMP:  // unconditional only if its predicate register is 0
6223 +      return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() == 0;
6224 +    case AMDGPU::BRANCH:
6225 +      return true;
6226 +    default:
6227 +      return false;
6228 +    }
6229 +    return true;  // not reached: every path through the switch returns
6230 +  }
6231 +
6232 +  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *blk) {
6233 +    // Get the DebugLoc of the last instruction in blk that has a known one.
6234 +    DebugLoc DL;  // stays default-constructed if no instruction has one
6235 +    for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end(); ++iter) {
6236 +      MachineInstr *instr = &(*iter);
6237 +      if (instr->getDebugLoc().isUnknown() == false) {
6238 +        DL = instr->getDebugLoc();  // later instructions overwrite earlier ones
6239 +      }
6240 +    }
6241 +    return DL;
6242 +  }
6243 +
6244 +  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *blk) {
6245 +    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
6246 +    MachineInstr *instr = (iter != blk->rend()) ? &*iter : NULL;  // empty blk
6247 +    if (instr && (isCondBranch(instr) || isUncondBranch(instr))) {
6248 +      return instr;  // the last instruction of blk is a branch
6249 +    }
6250 +    return NULL;  // empty block, or last instruction is not a branch
6251 +  }
6252 +
6253 +  // A more accurate name would be getPossibleLoopendBlockBranchInstr.
6254 +  //
6255 +  // A block with a backward edge may have move instructions after its branch
6256 +  // instruction; such moves "belong to" the loop backward edge, so skip them.
6257 +  // Returns the branch found while scanning from the end of blk, else NULL.
6258 +  static MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *blk) {
6259 +    const AMDGPUInstrInfo * TII = static_cast<const AMDGPUInstrInfo *>(
6260 +                                  blk->getParent()->getTarget().getInstrInfo());
6261 +
6262 +    for (MachineBasicBlock::reverse_iterator iter = blk->rbegin(),
6263 +         iterEnd = blk->rend(); iter != iterEnd; ++iter) {
6264 +      // FIXME: Simplify -- &*iter is never NULL, so the check is redundant.
6265 +      MachineInstr *instr = &*iter;
6266 +      if (instr) {
6267 +        if (isCondBranch(instr) || isUncondBranch(instr)) {
6268 +          return instr;
6269 +        } else if (!TII->isMov(instr->getOpcode())) {
6270 +          break;  // a non-mov, non-branch instruction ends the scan
6271 +        }
6272 +      }
6273 +    }
6274 +    return NULL;
6275 +  }
6276 +
6277 +  static MachineInstr *getReturnInstr(MachineBasicBlock *blk) {
6278 +    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
6279 +    if (iter != blk->rend()) {
6280 +      MachineInstr *instr = &(*iter);
6281 +      if (instr->getOpcode() == AMDGPU::RETURN) {
6282 +        return instr;
6283 +      }
6284 +    }
6285 +    return NULL;
6286 +  }
6287 +
6288 +  static MachineInstr *getContinueInstr(MachineBasicBlock *blk) {
6289 +    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
6290 +    if (iter != blk->rend()) {
6291 +      MachineInstr *instr = &(*iter);
6292 +      if (instr->getOpcode() == AMDGPU::CONTINUE) {
6293 +        return instr;
6294 +      }
6295 +    }
6296 +    return NULL;
6297 +  }
6298 +
6299 +  static MachineInstr *getLoopBreakInstr(MachineBasicBlock *blk) {
6300 +    for (MachineBasicBlock::iterator iter = blk->begin(); (iter != blk->end()); ++iter) {
6301 +      MachineInstr *instr = &(*iter);
6302 +      if (instr->getOpcode() == AMDGPU::PREDICATED_BREAK) {
6303 +        return instr;
6304 +      }
6305 +    }
6306 +    return NULL;
6307 +  }
6308 +
6309 +  static bool isReturnBlock(MachineBasicBlock *blk) {
6310 +    MachineInstr *instr = getReturnInstr(blk);
6311 +    bool isReturn = (blk->succ_size() == 0);
6312 +    if (instr) {
6313 +      assert(isReturn);
6314 +    } else if (isReturn) {
6315 +      if (DEBUGME) {
6316 +        errs() << "BB" << blk->getNumber()
6317 +               <<" is return block without RETURN instr\n";
6318 +      }
6319 +    }
6320 +
6321 +    return  isReturn;
6322 +  }
6323 +
6324 +  static MachineBasicBlock::iterator  // position of instr inside blk
6325 +  getInstrPos(MachineBasicBlock *blk, MachineInstr *instr) {
6326 +    assert(instr->getParent() == blk && "instruction doesn't belong to block");
6327 +    MachineBasicBlock::iterator iter = blk->begin();
6328 +    MachineBasicBlock::iterator iterEnd = blk->end();
6329 +    while (iter != iterEnd && &(*iter) != instr) {  // test end before deref
6330 +      ++iter;
6331 +    }
6332 +
6333 +    assert(iter != iterEnd);
6334 +    return iter;
6335 +  }//getInstrPos
6336 +
6337 +  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
6338 +                                         AMDGPUCFGStructurizer *passRep) {
6339 +    return insertInstrBefore(blk,newOpcode,passRep,DebugLoc());
6340 +  } //insertInstrBefore
6341 +
6342 +  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
6343 +                                         AMDGPUCFGStructurizer *passRep, DebugLoc DL) {
6344 +    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
6345 +    MachineInstr *newInstr =
6346 +      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
6347 +
6348 +    MachineBasicBlock::iterator res;
6349 +    if (blk->begin() != blk->end()) {
6350 +      blk->insert(blk->begin(), newInstr);
6351 +    } else {
6352 +      blk->push_back(newInstr);
6353 +    }
6354 +
6355 +    SHOWNEWINSTR(newInstr);
6356 +
6357 +    return newInstr;
6358 +  } //insertInstrBefore
6359 +
6360 +  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
6361 +                             AMDGPUCFGStructurizer *passRep) {
6362 +    insertInstrEnd(blk,newOpcode,passRep,DebugLoc());
6363 +  } //insertInstrEnd
6364 +
6365 +  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
6366 +                             AMDGPUCFGStructurizer *passRep, DebugLoc DL) {
6367 +    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
6368 +   MachineInstr *newInstr = blk->getParent()
6369 +      ->CreateMachineInstr(tii->get(newOpcode), DL);
6370 +
6371 +    blk->push_back(newInstr);
6372 +    //assume the instruction doesn't take any reg operand ...
6373 +
6374 +    SHOWNEWINSTR(newInstr);
6375 +  } //insertInstrEnd
6376 +
6377 +  static MachineInstr *insertInstrBefore(MachineBasicBlock::iterator instrPos,
6378 +                                         int newOpcode,
6379 +                                         AMDGPUCFGStructurizer *passRep) {
6380 +    MachineInstr *oldInstr = &(*instrPos);
6381 +    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
6382 +    MachineBasicBlock *blk = oldInstr->getParent();
6383 +    MachineInstr *newInstr =
6384 +      blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
6385 +                                           DebugLoc());
6386 +
6387 +    blk->insert(instrPos, newInstr);
6388 +    //assume the instruction doesn't take any reg operand ...
6389 +
6390 +    SHOWNEWINSTR(newInstr);
6391 +    return newInstr;
6392 +  } //insertInstrBefore
6393 +
6394 +  static void insertCondBranchBefore(MachineBasicBlock::iterator instrPos,
6395 +                                     int newOpcode,
6396 +                                     AMDGPUCFGStructurizer *passRep,
6397 +                                     DebugLoc DL) {
6398 +    MachineInstr *oldInstr = &(*instrPos);
6399 +    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
6400 +    MachineBasicBlock *blk = oldInstr->getParent();
6401 +    MachineInstr *newInstr =
6402 +      blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
6403 +                                           DL);
6404 +
6405 +    blk->insert(instrPos, newInstr);
6406 +    MachineInstrBuilder(newInstr).addReg(oldInstr->getOperand(1).getReg(),
6407 +                                         false);
6408 +
6409 +    SHOWNEWINSTR(newInstr);
6410 +    //erase later oldInstr->eraseFromParent();
6411 +  } //insertCondBranchBefore
6412 +
6413 +  static void insertCondBranchBefore(MachineBasicBlock *blk,
6414 +                                     MachineBasicBlock::iterator insertPos,
6415 +                                     int newOpcode,
6416 +                                     AMDGPUCFGStructurizer *passRep,
6417 +                                     RegiT regNum,
6418 +                                     DebugLoc DL) {
6419 +    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
6420 +
6421 +    MachineInstr *newInstr =
6422 +      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
6423 +
6424 +    //insert before
6425 +    blk->insert(insertPos, newInstr);
6426 +    MachineInstrBuilder(newInstr).addReg(regNum, false);
6427 +
6428 +    SHOWNEWINSTR(newInstr);
6429 +  } //insertCondBranchBefore
6430 +
6431 +  static void insertCondBranchEnd(MachineBasicBlock *blk,
6432 +                                  int newOpcode,
6433 +                                  AMDGPUCFGStructurizer *passRep,
6434 +                                  RegiT regNum) {
6435 +    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
6436 +    MachineInstr *newInstr =
6437 +      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DebugLoc());
6438 +
6439 +    blk->push_back(newInstr);
6440 +    MachineInstrBuilder(newInstr).addReg(regNum, false);
6441 +
6442 +    SHOWNEWINSTR(newInstr);
6443 +  } //insertCondBranchEnd
6444 +
6445 +  // Inserts the instruction built by getMovImmInstr(regNum, regVal) before instrPos.
6446 +  static void insertAssignInstrBefore(MachineBasicBlock::iterator instrPos,
6447 +                                      AMDGPUCFGStructurizer *passRep,
6448 +                                      RegiT regNum, int regVal) {
6449 +    MachineInstr *oldInstr = &(*instrPos);  // instrPos must be dereferenceable
6450 +    const AMDGPUInstrInfo *tii =
6451 +             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
6452 +    MachineBasicBlock *blk = oldInstr->getParent();  // block is taken from instrPos
6453 +    MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
6454 +                                                 regVal);
6455 +    blk->insert(instrPos, newInstr);
6456 +
6457 +    SHOWNEWINSTR(newInstr);
6458 +  } //insertAssignInstrBefore
6459 +
6460 +  static void insertAssignInstrBefore(MachineBasicBlock *blk,
6461 +                                      AMDGPUCFGStructurizer *passRep,
6462 +                                      RegiT regNum, int regVal) {
6463 +    const AMDGPUInstrInfo *tii =
6464 +             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
6465 +    // Block overload: the new instruction becomes the first instruction of blk.
6466 +    MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
6467 +                                                 regVal);
6468 +    if (blk->begin() != blk->end()) {
6469 +      blk->insert(blk->begin(), newInstr);
6470 +    } else {  // empty block: append instead of inserting at begin()
6471 +      blk->push_back(newInstr);
6472 +    }
6473 +
6474 +    SHOWNEWINSTR(newInstr);
6475 +
6476 +  } //insertAssignInstrBefore
6477 +
6478 +  static void insertCompareInstrBefore(MachineBasicBlock *blk,
6479 +                                       MachineBasicBlock::iterator instrPos,
6480 +                                       AMDGPUCFGStructurizer *passRep,
6481 +                                       RegiT dstReg, RegiT src1Reg,
6482 +                                       RegiT src2Reg) {
6483 +    const AMDGPUInstrInfo *tii =
6484 +             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
6485 +    MachineInstr *newInstr =
6486 +      blk->getParent()->CreateMachineInstr(tii->get(tii->getIEQOpcode()), DebugLoc());
6487 +    // Opcode comes from getIEQOpcode(); operands: dstReg (def), src1Reg, src2Reg.
6488 +    MachineInstrBuilder(newInstr).addReg(dstReg, RegState::Define); // destination (def)
6489 +    MachineInstrBuilder(newInstr).addReg(src1Reg); // first source operand
6490 +    MachineInstrBuilder(newInstr).addReg(src2Reg); // second source operand
6491 +
6492 +    blk->insert(instrPos, newInstr);
6493 +    SHOWNEWINSTR(newInstr);
6494 +
6495 +  } //insertCompareInstrBefore
6496 +
6497 +  static void cloneSuccessorList(MachineBasicBlock *dstBlk,
6498 +                                 MachineBasicBlock *srcBlk) {
6499 +    for (MachineBasicBlock::succ_iterator iter = srcBlk->succ_begin(),
6500 +         iterEnd = srcBlk->succ_end(); iter != iterEnd; ++iter) {
6501 +      dstBlk->addSuccessor(*iter);  // also updates the predecessor list of *iter
6502 +    }
6503 +  } //cloneSuccessorList: adds every successor of srcBlk to dstBlk
6504 +
6505 +  static MachineBasicBlock *clone(MachineBasicBlock *srcBlk) {
6506 +    MachineFunction *func = srcBlk->getParent();
6507 +    MachineBasicBlock *newBlk = func->CreateMachineBasicBlock();
6508 +    func->push_back(newBlk);  // the copy is appended at the end of the function
6509 +    for (MachineBasicBlock::iterator iter = srcBlk->begin(),
6510 +         iterEnd = srcBlk->end();
6511 +         iter != iterEnd; ++iter) {
6512 +      MachineInstr *instr = func->CloneMachineInstr(iter);
6513 +      newBlk->push_back(instr);
6514 +    }
6515 +    return newBlk;  // only instructions are copied; successor edges are not
6516 +  } //clone
6517 +
6518 +  // Retargets srcBlk's conditional branch from oldBlk to newBlk. Exists because
6519 +  // MachineBasicBlock::ReplaceUsesOfBlockWith does not recognize the AMDGPU
6520 +  // branch instruction as a terminator. TODO: fix that and retire this routine.
6521 +  static void replaceInstrUseOfBlockWith(MachineBasicBlock *srcBlk,
6522 +                                         MachineBasicBlock *oldBlk,
6523 +                                         MachineBasicBlock *newBlk) {
6524 +    MachineInstr *branchInstr = getLoopendBlockBranchInstr(srcBlk);
6525 +    if (branchInstr && isCondBranch(branchInstr) &&
6526 +        getTrueBranch(branchInstr) == oldBlk) {
6527 +      setTrueBranch(branchInstr, newBlk);
6528 +    }
6529 +  }
6530 +
6531 +  static void wrapup(MachineBasicBlock *entryBlk) {
6532 +    assert((!entryBlk->getParent()->getJumpTableInfo()
6533 +            || entryBlk->getParent()->getJumpTableInfo()->isEmpty())
6534 +           && "found a jump table");
6535 +    // Cleanup on entryBlk: drop every CONTINUE that directly precedes an ENDLOOP.
6536 +     // Collect the matches first; they are erased only after the scan finishes.
6537 +     SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> contInstr;
6538 +     MachineBasicBlock::iterator pre = entryBlk->begin();
6539 +     MachineBasicBlock::iterator iterEnd = entryBlk->end();
6540 +     MachineBasicBlock::iterator iter = pre;  // pre trails iter by one step
6541 +     while (iter != iterEnd) {
6542 +       if (pre->getOpcode() == AMDGPU::CONTINUE
6543 +           && iter->getOpcode() == AMDGPU::ENDLOOP) {
6544 +         contInstr.push_back(pre);
6545 +       }
6546 +       pre = iter;
6547 +       ++iter;
6548 +     } //end while
6549 +
6550 +     // Erase the collected CONTINUE instructions.
6551 +     for (unsigned i = 0; i < contInstr.size(); ++i) {
6552 +        contInstr[i]->eraseFromParent();
6553 +     }
6554 +
6555 +     // TODO: fix up the jump table so later phases are not confused. If
6556 +     // jumpTableInfo->isEmpty() == false, the jump table needs cleaning, but
6557 +     // there is no such interface yet. Alternatively, replace all the other
6558 +     // blocks in the jump table with entryBlk.
6559 +
6560 +  } //wrapup
6561 +
6562 +  static MachineDominatorTree *getDominatorTree(AMDGPUCFGStructurizer &pass) {
6563 +    return &pass.getAnalysis<MachineDominatorTree>();
6564 +  } //getDominatorTree: address of pass's MachineDominatorTree analysis
6565 +
6566 +  static MachinePostDominatorTree*
6567 +  getPostDominatorTree(AMDGPUCFGStructurizer &pass) {
6568 +    return &pass.getAnalysis<MachinePostDominatorTree>();
6569 +  } //getPostDominatorTree: address of pass's MachinePostDominatorTree analysis
6570 +
6571 +  static MachineLoopInfo *getLoopInfo(AMDGPUCFGStructurizer &pass) {
6572 +    return &pass.getAnalysis<MachineLoopInfo>();
6573 +  } //getLoopInfo: address of pass's MachineLoopInfo analysis
6574 +}; // template class CFGStructTraits
6575 +} //end of namespace llvm
6576 +
6577 +// createAMDGPUCFGPreparationPass - Returns a newly allocated AMDGPUCFGPrepare pass.
6578 +FunctionPass *llvm::createAMDGPUCFGPreparationPass(TargetMachine &tm
6579 +                                                 ) {
6580 +  return new AMDGPUCFGPrepare(tm );
6581 +}
6582 +
6583 +bool AMDGPUCFGPrepare::runOnMachineFunction(MachineFunction &func) {
6584 +  return llvmCFGStruct::CFGStructurizer<AMDGPUCFGStructurizer>().prepare(func,
6585 +                                                                        *this,
6586 +                                                                        TRI);
6587 +} // returns whatever CFGStructurizer<AMDGPUCFGStructurizer>::prepare() reports
6588 +
6589 +// createAMDGPUCFGStructurizerPass - Returns a newly allocated AMDGPUCFGPerform pass.
6590 +FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm
6591 +                                                  ) {
6592 +  return new AMDGPUCFGPerform(tm );
6593 +}
6594 +
6595 +bool AMDGPUCFGPerform::runOnMachineFunction(MachineFunction &func) {
6596 +  return llvmCFGStruct::CFGStructurizer<AMDGPUCFGStructurizer>().run(func,
6597 +                                                                    *this,
6598 +                                                                    TRI);
6599 +} // returns whatever CFGStructurizer<AMDGPUCFGStructurizer>::run() reports
6600 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDevice.cpp llvm-r600/lib/Target/R600/AMDILDevice.cpp
6601 --- llvm-3.2.src/lib/Target/R600/AMDILDevice.cpp        1970-01-01 01:00:00.000000000 +0100
6602 +++ llvm-r600/lib/Target/R600/AMDILDevice.cpp   2013-01-25 19:43:57.440049721 +0100
6603 @@ -0,0 +1,124 @@
6604 +//===-- AMDILDevice.cpp - Base class for AMDIL Devices --------------------===//
6605 +//
6606 +//                     The LLVM Compiler Infrastructure
6607 +//
6608 +// This file is distributed under the University of Illinois Open Source
6609 +// License. See LICENSE.TXT for details.
6610 +//
6611 +/// \file
6612 +//==-----------------------------------------------------------------------===//
6613 +#include "AMDILDevice.h"
6614 +#include "AMDGPUSubtarget.h"
6615 +
6616 +using namespace llvm;
6617 +// Base constructor: sizes both capability bit vectors and applies the base caps.
6618 +AMDGPUDevice::AMDGPUDevice(AMDGPUSubtarget *ST) : mSTM(ST) {
6619 +  mHWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities);
6620 +  mSWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities);
6621 +  setCaps();  // inside a constructor this always runs AMDGPUDevice::setCaps()
6622 +  DeviceFlag = OCL_DEVICE_ALL;  // default value of the device flag
6623 +}
6624 +
6625 +AMDGPUDevice::~AMDGPUDevice() {
6626 +    mHWBits.clear();
6627 +    mSWBits.clear();
6628 +}
6629 +
6630 +size_t AMDGPUDevice::getMaxGDSSize() const {
6631 +  return 0;
6632 +}
6633 +
6634 +uint32_t
6635 +AMDGPUDevice::getDeviceFlag() const {
6636 +  return DeviceFlag;
6637 +}
6638 +
6639 +size_t AMDGPUDevice::getMaxNumCBs() const {
6640 +  if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) {
6641 +    return HW_MAX_NUM_CB;
6642 +  }
6643 +
6644 +  return 0;
6645 +}
6646 +
6647 +size_t AMDGPUDevice::getMaxCBSize() const {
6648 +  if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) {
6649 +    return MAX_CB_SIZE;
6650 +  }
6651 +
6652 +  return 0;
6653 +}
6654 +
6655 +size_t AMDGPUDevice::getMaxScratchSize() const {
6656 +  return 65536;
6657 +}
6658 +
6659 +uint32_t AMDGPUDevice::getStackAlignment() const {
6660 +  return 16;
6661 +}
6662 +
6663 +void AMDGPUDevice::setCaps() {  // base capability set shared by all devices
6664 +  mSWBits.set(AMDGPUDeviceInfo::HalfOps);
6665 +  mSWBits.set(AMDGPUDeviceInfo::ByteOps);
6666 +  mSWBits.set(AMDGPUDeviceInfo::ShortOps);
6667 +  mSWBits.set(AMDGPUDeviceInfo::HW64BitDivMod);
6668 +  if (mSTM->isOverride(AMDGPUDeviceInfo::NoInline)) {  // only when overridden
6669 +    mSWBits.set(AMDGPUDeviceInfo::NoInline);
6670 +  }
6671 +  if (mSTM->isOverride(AMDGPUDeviceInfo::MacroDB)) {
6672 +    mSWBits.set(AMDGPUDeviceInfo::MacroDB);
6673 +  }
6674 +  if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {  // Debug override: software
6675 +    mSWBits.set(AMDGPUDeviceInfo::ConstantMem);
6676 +  } else {  // otherwise constant memory is a hardware capability
6677 +    mHWBits.set(AMDGPUDeviceInfo::ConstantMem);
6678 +  }
6679 +  if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {  // same rule for PrivateMem
6680 +    mSWBits.set(AMDGPUDeviceInfo::PrivateMem);
6681 +  } else {
6682 +    mHWBits.set(AMDGPUDeviceInfo::PrivateMem);
6683 +  }
6684 +  if (mSTM->isOverride(AMDGPUDeviceInfo::BarrierDetect)) {
6685 +    mSWBits.set(AMDGPUDeviceInfo::BarrierDetect);
6686 +  }
6687 +  mSWBits.set(AMDGPUDeviceInfo::ByteLDSOps);
6688 +  mSWBits.set(AMDGPUDeviceInfo::LongOps);
6689 +}
6690 +
6691 +AMDGPUDeviceInfo::ExecutionMode
6692 +AMDGPUDevice::getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const {
6693 +  if (mHWBits[Caps]) {  // the hardware bit is checked first
6694 +    assert(!mSWBits[Caps] && "Cannot set both SW and HW caps");
6695 +    return AMDGPUDeviceInfo::Hardware;
6696 +  }
6697 +  // The HW bit is known to be clear here, so the assert below cannot fire.
6698 +  if (mSWBits[Caps]) {
6699 +    assert(!mHWBits[Caps] && "Cannot set both SW and HW caps");
6700 +    return AMDGPUDeviceInfo::Software;
6701 +  }
6702 +
6703 +  return AMDGPUDeviceInfo::Unsupported;  // neither bit is set
6704 +
6705 +}
6706 +
6707 +bool AMDGPUDevice::isSupported(AMDGPUDeviceInfo::Caps Mode) const {
6708 +  return getExecutionMode(Mode) != AMDGPUDeviceInfo::Unsupported;
6709 +}
6710 +
6711 +bool AMDGPUDevice::usesHardware(AMDGPUDeviceInfo::Caps Mode) const {
6712 +  return getExecutionMode(Mode) == AMDGPUDeviceInfo::Hardware;
6713 +}
6714 +
6715 +bool AMDGPUDevice::usesSoftware(AMDGPUDeviceInfo::Caps Mode) const {
6716 +  return getExecutionMode(Mode) == AMDGPUDeviceInfo::Software;
6717 +}
6718 +
6719 +std::string
6720 +AMDGPUDevice::getDataLayout() const {
6721 +    return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16"
6722 +      "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
6723 +      "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
6724 +      "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
6725 +      "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
6726 +      "-n8:16:32:64");
6727 +}
6728 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDevice.h llvm-r600/lib/Target/R600/AMDILDevice.h
6729 --- llvm-3.2.src/lib/Target/R600/AMDILDevice.h  1970-01-01 01:00:00.000000000 +0100
6730 +++ llvm-r600/lib/Target/R600/AMDILDevice.h     2013-01-25 19:43:57.440049721 +0100
6731 @@ -0,0 +1,117 @@
6732 +//===---- AMDILDevice.h - Define Device Data for AMDGPU -----*- C++ -*------===//
6733 +//
6734 +//                     The LLVM Compiler Infrastructure
6735 +//
6736 +// This file is distributed under the University of Illinois Open Source
6737 +// License. See LICENSE.TXT for details.
6738 +//
6739 +//==-----------------------------------------------------------------------===//
6740 +//
6741 +/// \file
6742 +/// \brief Interface for the subtarget data classes.
6743 +//
6744 +/// This file will define the interface that each generation needs to
6745 +/// implement in order to correctly answer queries on the capabilities of the
6746 +/// specific hardware.
6747 +//===----------------------------------------------------------------------===//
6748 +#ifndef AMDILDEVICEIMPL_H
6749 +#define AMDILDEVICEIMPL_H
6750 +#include "AMDIL.h"
6751 +#include "llvm/ADT/BitVector.h"
6752 +
6753 +namespace llvm {
6754 +  class AMDGPUSubtarget;
6755 +  class MCStreamer;
6756 +//===----------------------------------------------------------------------===//
6757 +// Interface for data that is specific to a single device
6758 +//===----------------------------------------------------------------------===//
6759 +class AMDGPUDevice {
6760 +public:
6761 +  AMDGPUDevice(AMDGPUSubtarget *ST);
6762 +  virtual ~AMDGPUDevice();
6763 +
6764 +  // Enum values for the various memory types.
6765 +  enum {
6766 +    RAW_UAV_ID   = 0,
6767 +    ARENA_UAV_ID = 1,
6768 +    LDS_ID       = 2,
6769 +    GDS_ID       = 3,
6770 +    SCRATCH_ID   = 4,
6771 +    CONSTANT_ID  = 5,
6772 +    GLOBAL_ID    = 6,
6773 +    MAX_IDS      = 7
6774 +  } IO_TYPE_IDS;
6775 +
6776 +  /// \returns The max LDS size that the hardware supports.  Size is in
6777 +  /// bytes.
6778 +  virtual size_t getMaxLDSSize() const = 0;
6779 +
6780 +  /// \returns The max GDS size that the hardware supports if the GDS is
6781 +  /// supported by the hardware.  Size is in bytes.
6782 +  virtual size_t getMaxGDSSize() const;
6783 +
6784 +  /// \returns The max number of hardware constant address spaces that
6785 +  /// are supported by this device.
6786 +  virtual size_t getMaxNumCBs() const;
6787 +
6788 +  /// \returns The max number of bytes a single hardware constant buffer
6789 +  /// can support.  Size is in bytes.
6790 +  virtual size_t getMaxCBSize() const;
6791 +
6792 +  /// \returns The max number of bytes allowed by the hardware scratch
6793 +  /// buffer.  Size is in bytes.
6794 +  virtual size_t getMaxScratchSize() const;
6795 +
6796 +  /// \brief Get the flag that corresponds to the device.
6797 +  virtual uint32_t getDeviceFlag() const;
6798 +
6799 +  /// \returns The number of work-items that exist in a single hardware
6800 +  /// wavefront.
6801 +  virtual size_t getWavefrontSize() const = 0;
6802 +
6803 +  /// \brief Get the generational name of this specific device.
6804 +  virtual uint32_t getGeneration() const = 0;
6805 +
6806 +  /// \brief Get the stack alignment of this specific device.
6807 +  virtual uint32_t getStackAlignment() const;
6808 +
6809 +  /// \brief Get the resource ID for this specific device.
6810 +  virtual uint32_t getResourceID(uint32_t DeviceID) const = 0;
6811 +
6812 +  /// \brief Get the max number of UAV's for this device.
6813 +  virtual uint32_t getMaxNumUAVs() const = 0;
6814 +
6815 +
6816 +  // API utilizing more detailed capabilities of each family of
6817 +  // cards. If a capability is supported, then either usesHardware or
6818 +  // usesSoftware returns true.  If usesHardware returns true, then
6819 +  // usesSoftware must return false for the same capability.  Hardware
6820 +  // execution means that the feature is done natively by the hardware
6821 +  // and is not emulated by the software.  Software execution means
6822 +  // that the feature could be done in the hardware, but there is
6823 +  // software that emulates it, possibly using the hardware for
6824 +  // support, since the hardware does not fully comply with the OpenCL
6825 +  // specs.
6826 +
6827 +  bool isSupported(AMDGPUDeviceInfo::Caps Mode) const;
6828 +  bool usesHardware(AMDGPUDeviceInfo::Caps Mode) const;
6829 +  bool usesSoftware(AMDGPUDeviceInfo::Caps Mode) const;
6830 +  virtual std::string getDataLayout() const;
6831 +  static const unsigned int MAX_LDS_SIZE_700 = 16384;
6832 +  static const unsigned int MAX_LDS_SIZE_800 = 32768;
6833 +  static const unsigned int WavefrontSize = 64;
6834 +  static const unsigned int HalfWavefrontSize = 32;
6835 +  static const unsigned int QuarterWavefrontSize = 16;
6836 +protected:
6837 +  virtual void setCaps();
6838 +  llvm::BitVector mHWBits;
6839 +  llvm::BitVector mSWBits;
6840 +  AMDGPUSubtarget *mSTM;
6841 +  uint32_t DeviceFlag;
6842 +private:
6843 +  AMDGPUDeviceInfo::ExecutionMode
6844 +  getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const;
6845 +};
6846 +
6847 +} // namespace llvm
6848 +#endif // AMDILDEVICEIMPL_H
6849 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDeviceInfo.cpp llvm-r600/lib/Target/R600/AMDILDeviceInfo.cpp
6850 --- llvm-3.2.src/lib/Target/R600/AMDILDeviceInfo.cpp    1970-01-01 01:00:00.000000000 +0100
6851 +++ llvm-r600/lib/Target/R600/AMDILDeviceInfo.cpp       2013-01-25 19:43:57.440049721 +0100
6852 @@ -0,0 +1,94 @@
6853 +//===-- AMDILDeviceInfo.cpp - AMDILDeviceInfo class -----------------------===//
6854 +//
6855 +//                     The LLVM Compiler Infrastructure
6856 +//
6857 +// This file is distributed under the University of Illinois Open Source
6858 +// License. See LICENSE.TXT for details.
6859 +//
6860 +//==-----------------------------------------------------------------------===//
6861 +//
6862 +/// \file
6863 +/// \brief Function that creates DeviceInfo from a device name and other information.
6864 +//
6865 +//==-----------------------------------------------------------------------===//
6866 +#include "AMDILDevices.h"
6867 +#include "AMDGPUSubtarget.h"
6868 +
6869 +using namespace llvm;
6870 +namespace llvm {
6871 +namespace AMDGPUDeviceInfo {
6872 +/// Returns a newly allocated device for \p deviceName; unknown names get AMDGPU7XXDevice.
6873 +AMDGPUDevice* getDeviceFromName(const std::string &deviceName,
6874 +                                AMDGPUSubtarget *ptr,
6875 +                                bool is64bit, bool is64on32bit) {
6876 +  if (deviceName.size() > 2 && deviceName[2] == '7') {  // guard: no read past the end
6877 +    switch (deviceName.c_str()[3]) {  // index 3 is at worst the terminating NUL
6878 +    case '1':
6879 +      return new AMDGPU710Device(ptr);
6880 +    case '7':
6881 +      return new AMDGPU770Device(ptr);
6882 +    default:
6883 +      return new AMDGPU7XXDevice(ptr);
6884 +    }
6885 +  } else if (deviceName == "cypress") {
6886 +#if DEBUG
6887 +    assert(!is64bit && "This device does not support 64bit pointers!");
6888 +    assert(!is64on32bit && "This device does not support 64bit"
6889 +          " on 32bit pointers!");
6890 +#endif
6891 +    return new AMDGPUCypressDevice(ptr);
6892 +  } else if (deviceName == "juniper") {
6893 +#if DEBUG
6894 +    assert(!is64bit && "This device does not support 64bit pointers!");
6895 +    assert(!is64on32bit && "This device does not support 64bit"
6896 +          " on 32bit pointers!");
6897 +#endif
6898 +    return new AMDGPUEvergreenDevice(ptr);
6899 +  } else if (deviceName == "redwood") {
6900 +#if DEBUG
6901 +    assert(!is64bit && "This device does not support 64bit pointers!");
6902 +    assert(!is64on32bit && "This device does not support 64bit"
6903 +          " on 32bit pointers!");
6904 +#endif
6905 +    return new AMDGPURedwoodDevice(ptr);
6906 +  } else if (deviceName == "cedar") {
6907 +#if DEBUG
6908 +    assert(!is64bit && "This device does not support 64bit pointers!");
6909 +    assert(!is64on32bit && "This device does not support 64bit"
6910 +          " on 32bit pointers!");
6911 +#endif
6912 +    return new AMDGPUCedarDevice(ptr);
6913 +  } else if (deviceName == "barts" || deviceName == "turks") {
6914 +#if DEBUG
6915 +    assert(!is64bit && "This device does not support 64bit pointers!");
6916 +    assert(!is64on32bit && "This device does not support 64bit"
6917 +          " on 32bit pointers!");
6918 +#endif
6919 +    return new AMDGPUNIDevice(ptr);
6920 +  } else if (deviceName == "cayman") {
6921 +#if DEBUG
6922 +    assert(!is64bit && "This device does not support 64bit pointers!");
6923 +    assert(!is64on32bit && "This device does not support 64bit"
6924 +          " on 32bit pointers!");
6925 +#endif
6926 +    return new AMDGPUCaymanDevice(ptr);
6927 +  } else if (deviceName == "caicos") {
6928 +#if DEBUG
6929 +    assert(!is64bit && "This device does not support 64bit pointers!");
6930 +    assert(!is64on32bit && "This device does not support 64bit"
6931 +          " on 32bit pointers!");
6932 +#endif
6933 +    return new AMDGPUNIDevice(ptr);
6934 +  } else if (deviceName == "SI") {
6935 +    return new AMDGPUSIDevice(ptr);
6936 +  } else {  // unrecognized (including empty) names fall back to the 7XX device
6937 +#if DEBUG
6938 +    assert(!is64bit && "This device does not support 64bit pointers!");
6939 +    assert(!is64on32bit && "This device does not support 64bit"
6940 +          " on 32bit pointers!");
6941 +#endif
6942 +    return new AMDGPU7XXDevice(ptr);
6943 +  }
6944 +}
6945 +} // End namespace AMDGPUDeviceInfo
6946 +} // End namespace llvm
6947 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDeviceInfo.h llvm-r600/lib/Target/R600/AMDILDeviceInfo.h
6948 --- llvm-3.2.src/lib/Target/R600/AMDILDeviceInfo.h      1970-01-01 01:00:00.000000000 +0100
6949 +++ llvm-r600/lib/Target/R600/AMDILDeviceInfo.h 2013-01-25 19:43:57.440049721 +0100
6950 @@ -0,0 +1,88 @@
6951 +//===-- AMDILDeviceInfo.h - Constants for describing devices --------------===//
6952 +//
6953 +//                     The LLVM Compiler Infrastructure
6954 +//
6955 +// This file is distributed under the University of Illinois Open Source
6956 +// License. See LICENSE.TXT for details.
6957 +//
6958 +/// \file
6959 +//==-----------------------------------------------------------------------===//
6960 +#ifndef AMDILDEVICEINFO_H
6961 +#define AMDILDEVICEINFO_H
6962 +
6963 +
6964 +#include <string>
6965 +
6966 +namespace llvm {
6967 +  class AMDGPUDevice;
6968 +  class AMDGPUSubtarget;
6969 +  namespace AMDGPUDeviceInfo {
6970 +    /// Each Capabilities can be executed using a hardware instruction,
6971 +    /// emulated with a sequence of software instructions, or not
6972 +    /// supported at all.
6973 +    enum ExecutionMode {
6974 +      Unsupported = 0, ///< Unsupported feature on the card(Default value)
6975 +       /// This is the execution mode that is set if the feature is emulated in
6976 +       /// software.
6977 +      Software,
6978 +      /// This execution mode is set if the feature exists natively in hardware
6979 +      Hardware
6980 +    };
6981 +
6982 +    enum Caps {
6983 +      HalfOps          = 0x1,  ///< Half float is supported or not.
6984 +      DoubleOps        = 0x2,  ///< Double is supported or not.
6985 +      ByteOps          = 0x3,  ///< Byte(char) is supported or not.
6986 +      ShortOps         = 0x4,  ///< Short is supported or not.
6987 +      LongOps          = 0x5,  ///< Long is supported or not.
6988 +      Images           = 0x6,  ///< Images are supported or not.
6989 +      ByteStores       = 0x7,  ///< ByteStores available(!HD4XXX).
6990 +      ConstantMem      = 0x8,  ///< Constant/CB memory.
6991 +      LocalMem         = 0x9,  ///< Local/LDS memory.
6992 +      PrivateMem       = 0xA,  ///< Scratch/Private/Stack memory.
6993 +      RegionMem        = 0xB,  ///< OCL GDS Memory Extension.
6994 +      FMA              = 0xC,  ///< Use HW FMA or SW FMA.
6995 +      ArenaSegment     = 0xD,  ///< Use for Arena UAV per pointer 12-1023.
6996 +      MultiUAV         = 0xE,  ///< Use for UAV per Pointer 0-7.
6997 +      Reserved0        = 0xF,  ///< ReservedFlag
6998 +      NoAlias          = 0x10, ///< Cached loads.
6999 +      Signed24BitOps   = 0x11, ///< Peephole Optimization.
7000 +      /// Debug mode implies that no hardware features or optimizations
7001 +      /// are performed and that all memory accesses go through a single
7002 +      /// uav(Arena on HD5XXX/HD6XXX and Raw on HD4XXX).
7003 +      Debug            = 0x12,
7004 +      CachedMem        = 0x13, ///< Cached mem is available or not.
7005 +      BarrierDetect    = 0x14, ///< Detect duplicate barriers.
7006 +      Reserved1        = 0x15, ///< Reserved flag
7007 +      ByteLDSOps       = 0x16, ///< Flag to specify if byte LDS ops are available.
7008 +      ArenaVectors     = 0x17, ///< Flag to specify if vector loads from arena work.
7009 +      TmrReg           = 0x18, ///< Flag to specify if Tmr register is supported.
7010 +      NoInline         = 0x19, ///< Flag to specify that no inlining should occur.
7011 +      MacroDB          = 0x1A, ///< Flag to specify that backend handles macrodb.
7012 +      HW64BitDivMod    = 0x1B, ///< Flag for backend to generate 64bit div/mod.
7013 +      ArenaUAV         = 0x1C, ///< Flag to specify that arena uav is supported.
7014 +      PrivateUAV       = 0x1D, ///< Flag to specify that private memory uses uav's.
7015 +      /// If more capabilities are required, then
7016 +      /// this number needs to be increased.
7017 +      /// All capabilities must come before this
7018 +      /// number.
7019 +      MaxNumberCapabilities = 0x20
7020 +    };
7021 +    /// These have to be in order with the older generations
7022 +    /// having the lower number enumerations.
7023 +    enum Generation {
7024 +      HD4XXX = 0, ///< 7XX based devices.
7025 +      HD5XXX, ///< Evergreen based devices.
7026 +      HD6XXX, ///< NI/Evergreen+ based devices.
7027 +      HD7XXX, ///< Southern Islands based devices.
7028 +      HDTEST, ///< Experimental feature testing device.
7029 +      HDNUMGEN
7030 +    };
7031 +
7032 +
7033 +  AMDGPUDevice*
7034 +    getDeviceFromName(const std::string &name, AMDGPUSubtarget *ptr,
7035 +                      bool is64bit = false, bool is64on32bit = false);
7036 +  } // namespace AMDILDeviceInfo
7037 +} // namespace llvm
7038 +#endif // AMDILDEVICEINFO_H
7039 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDevices.h llvm-r600/lib/Target/R600/AMDILDevices.h
7040 --- llvm-3.2.src/lib/Target/R600/AMDILDevices.h 1970-01-01 01:00:00.000000000 +0100
7041 +++ llvm-r600/lib/Target/R600/AMDILDevices.h    2013-01-25 19:43:57.440049721 +0100
7042 @@ -0,0 +1,19 @@
7043 +//===-- AMDILDevices.h - Consolidate AMDIL Device headers -----------------===//
7044 +//
7045 +//                     The LLVM Compiler Infrastructure
7046 +//
7047 +// This file is distributed under the University of Illinois Open Source
7048 +// License. See LICENSE.TXT for details.
7049 +//
7050 +/// \file
7051 +//==-----------------------------------------------------------------------===//
7052 +#ifndef AMDIL_DEVICES_H
7053 +#define AMDIL_DEVICES_H
7054 +// Include all of the device specific header files
7055 +#include "AMDIL7XXDevice.h"
7056 +#include "AMDILDevice.h"
7057 +#include "AMDILEvergreenDevice.h"
7058 +#include "AMDILNIDevice.h"
7059 +#include "AMDILSIDevice.h"
7060 +
7061 +#endif // AMDIL_DEVICES_H
7062 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILEvergreenDevice.cpp llvm-r600/lib/Target/R600/AMDILEvergreenDevice.cpp
7063 --- llvm-3.2.src/lib/Target/R600/AMDILEvergreenDevice.cpp       1970-01-01 01:00:00.000000000 +0100
7064 +++ llvm-r600/lib/Target/R600/AMDILEvergreenDevice.cpp  2013-01-25 19:43:57.440049721 +0100
7065 @@ -0,0 +1,169 @@
7066 +//===-- AMDILEvergreenDevice.cpp - Device Info for Evergreen --------------===//
7067 +//
7068 +//                     The LLVM Compiler Infrastructure
7069 +//
7070 +// This file is distributed under the University of Illinois Open Source
7071 +// License. See LICENSE.TXT for details.
7072 +//
7073 +/// \file
7074 +//==-----------------------------------------------------------------------===//
7075 +#include "AMDILEvergreenDevice.h"
7076 +
7077 +using namespace llvm;
7078 +
7079 +AMDGPUEvergreenDevice::AMDGPUEvergreenDevice(AMDGPUSubtarget *ST)
7080 +: AMDGPUDevice(ST) {
7081 +  setCaps();  // Evergreen caps, applied on top of those set by AMDGPUDevice()
7082 +  std::string name = ST->getDeviceName();
7083 +  if (name == "cedar") {  // pick the device flag from the subtarget's name
7084 +    DeviceFlag = OCL_DEVICE_CEDAR;
7085 +  } else if (name == "redwood") {
7086 +    DeviceFlag = OCL_DEVICE_REDWOOD;
7087 +  } else if (name == "cypress") {
7088 +    DeviceFlag = OCL_DEVICE_CYPRESS;
7089 +  } else {  // every other name gets the juniper flag
7090 +    DeviceFlag = OCL_DEVICE_JUNIPER;
7091 +  }
7092 +}
7093 +
7094 +AMDGPUEvergreenDevice::~AMDGPUEvergreenDevice() {
7095 +}
7096 +
7097 +size_t AMDGPUEvergreenDevice::getMaxLDSSize() const {
7098 +  if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
7099 +    return MAX_LDS_SIZE_800;
7100 +  } else {
7101 +    return 0;
7102 +  }
7103 +}
7104 +size_t AMDGPUEvergreenDevice::getMaxGDSSize() const {
7105 +  if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
7106 +    return MAX_LDS_SIZE_800;  // NOTE(review): GDS reuses the LDS constant - confirm
7107 +  } else {  // RegionMem is not a hardware capability: report no GDS
7108 +    return 0;
7109 +  }
7110 +}
7111 +uint32_t AMDGPUEvergreenDevice::getMaxNumUAVs() const {
7112 +  return 12;
7113 +}
7114 +
7115 +uint32_t AMDGPUEvergreenDevice::getResourceID(uint32_t id) const {
7116 +  switch(id) {  // cases are the *_ID enumerators declared in AMDGPUDevice
7117 +  default:
7118 +    assert(0 && "ID type passed in is unknown!");
7119 +    break;  // with asserts compiled out, unknown ids reach the final return 0
7120 +  case CONSTANT_ID:
7121 +  case RAW_UAV_ID:
7122 +    return GLOBAL_RETURN_RAW_UAV_ID;
7123 +  case GLOBAL_ID:
7124 +  case ARENA_UAV_ID:
7125 +    return DEFAULT_ARENA_UAV_ID;
7126 +  case LDS_ID:
7127 +    if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
7128 +      return DEFAULT_LDS_ID;
7129 +    } else {  // not a hardware capability: use the default arena UAV id
7130 +      return DEFAULT_ARENA_UAV_ID;
7131 +    }
7132 +  case GDS_ID:
7133 +    if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
7134 +      return DEFAULT_GDS_ID;
7135 +    } else {  // same fallback as LDS_ID
7136 +      return DEFAULT_ARENA_UAV_ID;
7137 +    }
7138 +  case SCRATCH_ID:
7139 +    if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) {
7140 +      return DEFAULT_SCRATCH_ID;
7141 +    } else {  // same fallback as LDS_ID
7142 +      return DEFAULT_ARENA_UAV_ID;
7143 +    }
7144 +  };
7145 +  return 0;
7146 +}
7147 +
7148 +size_t AMDGPUEvergreenDevice::getWavefrontSize() const {
7149 +  return AMDGPUDevice::WavefrontSize;
7150 +}
7151 +
7152 +uint32_t AMDGPUEvergreenDevice::getGeneration() const {
7153 +  return AMDGPUDeviceInfo::HD5XXX;
7154 +}
7155 +
7156 +void AMDGPUEvergreenDevice::setCaps() {  // adjusts the bits left by the base class
7157 +  mSWBits.set(AMDGPUDeviceInfo::ArenaSegment);
7158 +  mHWBits.set(AMDGPUDeviceInfo::ArenaUAV);
7159 +  mHWBits.set(AMDGPUDeviceInfo::HW64BitDivMod);
7160 +  mSWBits.reset(AMDGPUDeviceInfo::HW64BitDivMod);  // clear SW bit set by AMDGPUDevice
7161 +  mSWBits.set(AMDGPUDeviceInfo::Signed24BitOps);
7162 +  if (mSTM->isOverride(AMDGPUDeviceInfo::ByteStores)) {
7163 +    mHWBits.set(AMDGPUDeviceInfo::ByteStores);
7164 +  }
7165 +  if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {  // Debug override: software
7166 +    mSWBits.set(AMDGPUDeviceInfo::LocalMem);
7167 +    mSWBits.set(AMDGPUDeviceInfo::RegionMem);
7168 +  } else {  // otherwise LocalMem/RegionMem are hardware capabilities
7169 +    mHWBits.set(AMDGPUDeviceInfo::LocalMem);
7170 +    mHWBits.set(AMDGPUDeviceInfo::RegionMem);
7171 +  }
7172 +  mHWBits.set(AMDGPUDeviceInfo::Images);
7173 +  if (mSTM->isOverride(AMDGPUDeviceInfo::NoAlias)) {
7174 +    mHWBits.set(AMDGPUDeviceInfo::NoAlias);
7175 +  }
7176 +  mHWBits.set(AMDGPUDeviceInfo::CachedMem);
7177 +  if (mSTM->isOverride(AMDGPUDeviceInfo::MultiUAV)) {
7178 +    mHWBits.set(AMDGPUDeviceInfo::MultiUAV);
7179 +  }
7180 +  mHWBits.set(AMDGPUDeviceInfo::ByteLDSOps);
7181 +  mSWBits.reset(AMDGPUDeviceInfo::ByteLDSOps);  // clear SW bit set by AMDGPUDevice
7182 +  mHWBits.set(AMDGPUDeviceInfo::ArenaVectors);
7183 +  mHWBits.set(AMDGPUDeviceInfo::LongOps);
7184 +  mSWBits.reset(AMDGPUDeviceInfo::LongOps);  // clear SW bit set by AMDGPUDevice
7185 +  mHWBits.set(AMDGPUDeviceInfo::TmrReg);
7186 +}
7187 +
7188 +AMDGPUCypressDevice::AMDGPUCypressDevice(AMDGPUSubtarget *ST)
7189 +  : AMDGPUEvergreenDevice(ST) {
7190 +  setCaps();
7191 +}
7192 +
7193 +AMDGPUCypressDevice::~AMDGPUCypressDevice() {
7194 +}
7195 +
7196 +void AMDGPUCypressDevice::setCaps() {
7197 +  if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
7198 +    mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
7199 +    mHWBits.set(AMDGPUDeviceInfo::FMA);
7200 +  }
7201 +}
7202 +
7203 +
7204 +AMDGPUCedarDevice::AMDGPUCedarDevice(AMDGPUSubtarget *ST)
7205 +  : AMDGPUEvergreenDevice(ST) {
7206 +  setCaps();
7207 +}
7208 +
7209 +AMDGPUCedarDevice::~AMDGPUCedarDevice() {
7210 +}
7211 +
7212 +void AMDGPUCedarDevice::setCaps() {
7213 +  mSWBits.set(AMDGPUDeviceInfo::FMA);
7214 +}
7215 +
7216 +size_t AMDGPUCedarDevice::getWavefrontSize() const {
7217 +  return AMDGPUDevice::QuarterWavefrontSize;
7218 +}
7219 +
7220 +AMDGPURedwoodDevice::AMDGPURedwoodDevice(AMDGPUSubtarget *ST)
7221 +  : AMDGPUEvergreenDevice(ST) {
7222 +  setCaps(); // Inside this ctor the call resolves to AMDGPURedwoodDevice::setCaps().
7223 +}
7224 +
7225 +AMDGPURedwoodDevice::~AMDGPURedwoodDevice() {
7226 +}
7227 +
7228 +void AMDGPURedwoodDevice::setCaps() {
7229 +  mSWBits.set(AMDGPUDeviceInfo::FMA); // FMA goes into the software bits (mSWBits).
7230 +}
7231 +
7232 +size_t AMDGPURedwoodDevice::getWavefrontSize() const {
7233 +  return AMDGPUDevice::HalfWavefrontSize; // Redwood reports the half-size constant.
7234 +}
7235 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILEvergreenDevice.h llvm-r600/lib/Target/R600/AMDILEvergreenDevice.h
7236 --- llvm-3.2.src/lib/Target/R600/AMDILEvergreenDevice.h 1970-01-01 01:00:00.000000000 +0100
7237 +++ llvm-r600/lib/Target/R600/AMDILEvergreenDevice.h    2013-01-25 19:43:57.440049721 +0100
7238 @@ -0,0 +1,93 @@
7239 +//==- AMDILEvergreenDevice.h - Define Evergreen Device for AMDIL -*- C++ -*--=//
7240 +//
7241 +//                     The LLVM Compiler Infrastructure
7242 +//
7243 +// This file is distributed under the University of Illinois Open Source
7244 +// License. See LICENSE.TXT for details.
7245 +//
7246 +//==-----------------------------------------------------------------------===//
7247 +//
7248 +/// \file
7249 +/// \brief Interface for the subtarget data classes.
7250 +///
7251 +/// This file will define the interface that each generation needs to
7252 +/// implement in order to correctly answer queries on the capabilities of the
7253 +/// specific hardware.
7254 +//===----------------------------------------------------------------------===//
7255 +#ifndef AMDILEVERGREENDEVICE_H
7256 +#define AMDILEVERGREENDEVICE_H
7257 +#include "AMDILDevice.h"
7258 +#include "AMDGPUSubtarget.h"
7259 +
7260 +namespace llvm {
7261 +  class AMDGPUSubtarget;
7262 +//===----------------------------------------------------------------------===//
7263 +// Evergreen generation of devices and their respective sub classes
7264 +//===----------------------------------------------------------------------===//
7265 +
7266 +
7267 +/// \brief The AMDGPUEvergreenDevice is the base device class for all of the Evergreen
7268 +/// series of cards.
7269 +///
7270 +/// This class contains information required to differentiate
7271 +/// the Evergreen device from the generic AMDGPUDevice. This device represents
7272 +/// the capabilities of the 'Juniper' cards, also known as the HD57XX.
7273 +class AMDGPUEvergreenDevice : public AMDGPUDevice {
7274 +public:
7275 +  AMDGPUEvergreenDevice(AMDGPUSubtarget *ST);
7276 +  virtual ~AMDGPUEvergreenDevice();
7277 +  virtual size_t getMaxLDSSize() const;
7278 +  virtual size_t getMaxGDSSize() const;
7279 +  virtual size_t getWavefrontSize() const; // Overridden by the Cedar and Redwood classes.
7280 +  virtual uint32_t getGeneration() const;
7281 +  virtual uint32_t getMaxNumUAVs() const;
7282 +  virtual uint32_t getResourceID(uint32_t) const;
7283 +protected:
7284 +  virtual void setCaps(); // Each subclass in this header declares its own override.
7285 +};
7286 +
7287 +/// \brief The AMDGPUCypressDevice is similar to the AMDGPUEvergreenDevice, except it
7288 +/// has support for double precision operations. This device is used to represent
7289 +/// both the Cypress and Hemlock cards, which are commercially known as HD58XX
7290 +/// and HD59XX cards.
7291 +class AMDGPUCypressDevice : public AMDGPUEvergreenDevice {
7292 +public:
7293 +  AMDGPUCypressDevice(AMDGPUSubtarget *ST);
7294 +  virtual ~AMDGPUCypressDevice();
7295 +private:
7296 +  virtual void setCaps(); // Flags DoubleOps and FMA in hardware on DoubleOps override.
7297 +};
7298 +
7299 +
7300 +/// \brief The AMDGPUCedarDevice is the class that represents all of the 'Cedar' based
7301 +/// devices.
7302 +///
7303 +/// This class differs from the base AMDGPUEvergreenDevice in that the
7304 +/// device is about a quarter of a 'Juniper'. These are commercially known as the
7305 +/// HD54XX and HD53XX series of cards.
7306 +class AMDGPUCedarDevice : public AMDGPUEvergreenDevice {
7307 +public:
7308 +  AMDGPUCedarDevice(AMDGPUSubtarget *ST);
7309 +  virtual ~AMDGPUCedarDevice();
7310 +  virtual size_t getWavefrontSize() const; // Returns AMDGPUDevice::QuarterWavefrontSize.
7311 +private:
7312 +  virtual void setCaps(); // Sets FMA in the software bits (mSWBits).
7313 +};
7314 +
7315 +/// \brief The AMDGPURedwoodDevice is the class that represents all of the 'Redwood'
7316 +/// based devices.
7317 +///
7318 +/// This class differs from the base class, in that these devices are
7319 +/// considered about half of a 'Juniper' device. These are commercially known as
7320 +/// the HD55XX and HD56XX series of cards.
7321 +class AMDGPURedwoodDevice : public AMDGPUEvergreenDevice {
7322 +public:
7323 +  AMDGPURedwoodDevice(AMDGPUSubtarget *ST);
7324 +  virtual ~AMDGPURedwoodDevice();
7325 +  virtual size_t getWavefrontSize() const; // Returns AMDGPUDevice::HalfWavefrontSize.
7326 +private:
7327 +  virtual void setCaps(); // Sets FMA in the software bits (mSWBits).
7328 +};
7329 +
7330 +} // namespace llvm
7331 +#endif // AMDILEVERGREENDEVICE_H
7332 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILFrameLowering.cpp llvm-r600/lib/Target/R600/AMDILFrameLowering.cpp
7333 --- llvm-3.2.src/lib/Target/R600/AMDILFrameLowering.cpp 1970-01-01 01:00:00.000000000 +0100
7334 +++ llvm-r600/lib/Target/R600/AMDILFrameLowering.cpp    2013-01-25 19:43:57.440049721 +0100
7335 @@ -0,0 +1,47 @@
7336 +//===----------------------- AMDILFrameLowering.cpp -----------------*- C++ -*-===//
7337 +//
7338 +//                     The LLVM Compiler Infrastructure
7339 +//
7340 +// This file is distributed under the University of Illinois Open Source
7341 +// License. See LICENSE.TXT for details.
7342 +//
7343 +//==-----------------------------------------------------------------------===//
7344 +//
7345 +/// \file
7346 +/// \brief Interface to describe a layout of a stack frame on an AMDGPU target
7347 +/// machine.
7348 +//
7349 +//===----------------------------------------------------------------------===//
7350 +#include "AMDILFrameLowering.h"
7351 +#include "llvm/CodeGen/MachineFrameInfo.h"
7352 +
7353 +using namespace llvm;
7354 +AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
7355 +    int LAO, unsigned TransAl)
7356 +  : TargetFrameLowering(D, StackAl, LAO, TransAl) { // Forward every argument as-is.
7357 +}
7358 +
7359 +AMDGPUFrameLowering::~AMDGPUFrameLowering() {
7360 +}
7361 +
7362 +/// Look up the offset stored for frame index \p FI in \p MF's frame info.
7363 +int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
7364 +                                             int FI) const {
7365 +  return MF.getFrameInfo()->getObjectOffset(FI);
7366 +}
7367 +
7368 +const TargetFrameLowering::SpillSlot *
7369 +AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
7370 +  NumEntries = 0; // Report zero spill-slot entries...
7371 +  return 0;       // ...and hand back a null table pointer.
7372 +}
7373 +void
7374 +AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const { // Empty body:
7375 +} // no prologue code is inserted into MF.
7376 +void
7377 +AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {
7378 +} // Empty body: no epilogue code is inserted into MBB.
7379 +bool
7380 +AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const {
7381 +  return false; // Unconditional: MF is not inspected.
7382 +}
7383 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILFrameLowering.h llvm-r600/lib/Target/R600/AMDILFrameLowering.h
7384 --- llvm-3.2.src/lib/Target/R600/AMDILFrameLowering.h   1970-01-01 01:00:00.000000000 +0100
7385 +++ llvm-r600/lib/Target/R600/AMDILFrameLowering.h      2013-01-25 19:43:57.443383054 +0100
7386 @@ -0,0 +1,40 @@
7387 +//===--------------------- AMDILFrameLowering.h -----------------*- C++ -*-===//
7388 +//
7389 +//                     The LLVM Compiler Infrastructure
7390 +//
7391 +// This file is distributed under the University of Illinois Open Source
7392 +// License. See LICENSE.TXT for details.
7393 +//
7394 +//===----------------------------------------------------------------------===//
7395 +//
7396 +/// \file
7397 +/// \brief Interface to describe a layout of a stack frame on an AMDIL target
7398 +/// machine.
7399 +//
7400 +//===----------------------------------------------------------------------===//
7401 +#ifndef AMDILFRAME_LOWERING_H
7402 +#define AMDILFRAME_LOWERING_H
7403 +
7404 +#include "llvm/CodeGen/MachineFunction.h"
7405 +#include "llvm/Target/TargetFrameLowering.h"
7406 +
7407 +namespace llvm {
7408 +
7409 +/// \brief Information about the stack frame layout on the AMDGPU targets.
7410 +///
7411 +/// It holds the direction of the stack growth, the known stack alignment on
7412 +/// entry to each function, and the offset to the locals area.
7413 +/// See TargetFrameLowering for more comments.
7414 +class AMDGPUFrameLowering : public TargetFrameLowering {
7415 +public:
7416 +  AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO,
7417 +                      unsigned TransAl = 1); // All four values go to the base class.
7418 +  virtual ~AMDGPUFrameLowering();
7419 +  virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
7420 +  virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const;
7421 +  virtual void emitPrologue(MachineFunction &MF) const; // Defined with an empty body.
7422 +  virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
7423 +  virtual bool hasFP(const MachineFunction &MF) const; // Defined to return false.
7424 +};
7425 +} // namespace llvm
7426 +#endif // AMDILFRAME_LOWERING_H
7427 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDIL.h llvm-r600/lib/Target/R600/AMDIL.h
7428 --- llvm-3.2.src/lib/Target/R600/AMDIL.h        1970-01-01 01:00:00.000000000 +0100
7429 +++ llvm-r600/lib/Target/R600/AMDIL.h   2013-01-25 19:43:57.433383055 +0100
7430 @@ -0,0 +1,122 @@
7431 +//===-- AMDIL.h - Top-level interface for AMDIL representation --*- C++ -*-===//
7432 +//
7433 +//                     The LLVM Compiler Infrastructure
7434 +//
7435 +// This file is distributed under the University of Illinois Open Source
7436 +// License. See LICENSE.TXT for details.
7437 +//
7438 +//==-----------------------------------------------------------------------===//
7439 +//
7440 +/// This file contains the entry points for global functions defined in the LLVM
7441 +/// AMDGPU back-end.
7442 +//
7443 +//===----------------------------------------------------------------------===//
7444 +
7445 +#ifndef AMDIL_H
7446 +#define AMDIL_H
7447 +
7448 +#include "llvm/CodeGen/MachineFunction.h"
7449 +#include "llvm/Target/TargetMachine.h"
7450 +
7451 +#define ARENA_SEGMENT_RESERVED_UAVS 12
7452 +#define DEFAULT_ARENA_UAV_ID 8
7453 +#define DEFAULT_RAW_UAV_ID 7
7454 +#define GLOBAL_RETURN_RAW_UAV_ID 11
7455 +#define HW_MAX_NUM_CB 8
7456 +#define MAX_NUM_UNIQUE_UAVS 8
7457 +#define OPENCL_MAX_NUM_ATOMIC_COUNTERS 8
7458 +#define OPENCL_MAX_READ_IMAGES 128
7459 +#define OPENCL_MAX_WRITE_IMAGES 8
7460 +#define OPENCL_MAX_SAMPLERS 16
7461 +
7462 +// The next two values can never be zero, as zero is the ID that is
7463 +// used to assert against.
7464 +#define DEFAULT_LDS_ID     1
7465 +#define DEFAULT_GDS_ID     1
7466 +#define DEFAULT_SCRATCH_ID 1
7467 +#define DEFAULT_VEC_SLOTS  8
7468 +
7469 +#define OCL_DEVICE_RV710        0x0001
7470 +#define OCL_DEVICE_RV730        0x0002
7471 +#define OCL_DEVICE_RV770        0x0004
7472 +#define OCL_DEVICE_CEDAR        0x0008
7473 +#define OCL_DEVICE_REDWOOD      0x0010
7474 +#define OCL_DEVICE_JUNIPER      0x0020
7475 +#define OCL_DEVICE_CYPRESS      0x0040
7476 +#define OCL_DEVICE_CAICOS       0x0080
7477 +#define OCL_DEVICE_TURKS        0x0100
7478 +#define OCL_DEVICE_BARTS        0x0200
7479 +#define OCL_DEVICE_CAYMAN       0x0400
7480 +#define OCL_DEVICE_ALL          0x3FFF
7481 +
7482 +/// The number of function ID's that are reserved for
7483 +/// internal compiler usage.
7484 +const unsigned int RESERVED_FUNCS = 1024;
7485 +
7486 +namespace llvm {
7487 +class AMDGPUInstrPrinter;
7488 +class FunctionPass;
7489 +class MCAsmInfo;
7490 +class raw_ostream;
7491 +class Target;
7492 +class TargetMachine;
7493 +
7494 +// Instruction selection passes.
7495 +FunctionPass*
7496 +  createAMDGPUISelDag(TargetMachine &TM);
7497 +FunctionPass*
7498 +  createAMDGPUPeepholeOpt(TargetMachine &TM);
7499 +
7500 +// Pre emit passes.
7501 +FunctionPass*
7502 +  createAMDGPUCFGPreparationPass(TargetMachine &TM);
7503 +FunctionPass*
7504 +  createAMDGPUCFGStructurizerPass(TargetMachine &TM);
7505 +
7506 +extern Target TheAMDGPUTarget;
7507 +} // end namespace llvm;
7508 +
7509 +// Include device information enumerations
7510 +#include "AMDILDeviceInfo.h"
7511 +
7512 +namespace llvm {
7513 +/// OpenCL uses address spaces to differentiate between
7514 +/// various memory regions on the hardware. On the CPU
7515 +/// all of the address spaces point to the same memory,
7516 +/// however on the GPU, each address space points to
7517 +/// a separate piece of memory that is unique from other
7518 +/// memory locations.
7519 +namespace AMDGPUAS {
7520 +enum AddressSpaces {
7521 +  PRIVATE_ADDRESS  = 0, ///< Address space for private memory.
7522 +  GLOBAL_ADDRESS   = 1, ///< Address space for global memory (RAT0, VTX0).
7523 +  CONSTANT_ADDRESS = 2, ///< Address space for constant memory
7524 +  LOCAL_ADDRESS    = 3, ///< Address space for local memory.
7525 +  REGION_ADDRESS   = 4, ///< Address space for region memory.
7526 +  ADDRESS_NONE     = 5, ///< Address space for unknown memory.
7527 +  PARAM_D_ADDRESS  = 6, ///< Address space for direct addressable parameter memory (CONST0)
7528 +  PARAM_I_ADDRESS  = 7, ///< Address space for indirect addressable parameter memory (VTX1)
7529 +  USER_SGPR_ADDRESS = 8, ///< Address space for USER_SGPRS on SI
7530 +  CONSTANT_BUFFER_0 = 9, ///< First of 16 consecutive CONSTANT_BUFFER_n values (9..24).
7531 +  CONSTANT_BUFFER_1 = 10,
7532 +  CONSTANT_BUFFER_2 = 11,
7533 +  CONSTANT_BUFFER_3 = 12,
7534 +  CONSTANT_BUFFER_4 = 13,
7535 +  CONSTANT_BUFFER_5 = 14,
7536 +  CONSTANT_BUFFER_6 = 15,
7537 +  CONSTANT_BUFFER_7 = 16,
7538 +  CONSTANT_BUFFER_8 = 17,
7539 +  CONSTANT_BUFFER_9 = 18,
7540 +  CONSTANT_BUFFER_10 = 19,
7541 +  CONSTANT_BUFFER_11 = 20,
7542 +  CONSTANT_BUFFER_12 = 21,
7543 +  CONSTANT_BUFFER_13 = 22,
7544 +  CONSTANT_BUFFER_14 = 23,
7545 +  CONSTANT_BUFFER_15 = 24,
7546 +  LAST_ADDRESS     = 25 ///< One past CONSTANT_BUFFER_15, the highest value above.
7547 +};
7548 +
7549 +} // namespace AMDGPUAS
7550 +
7551 +} // end namespace llvm
7552 +#endif // AMDIL_H
7553 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILInstrInfo.td llvm-r600/lib/Target/R600/AMDILInstrInfo.td
7554 --- llvm-3.2.src/lib/Target/R600/AMDILInstrInfo.td      1970-01-01 01:00:00.000000000 +0100
7555 +++ llvm-r600/lib/Target/R600/AMDILInstrInfo.td 2013-01-25 19:43:57.443383054 +0100
7556 @@ -0,0 +1,208 @@
7557 +//===------------ AMDILInstrInfo.td - AMDIL Target ------*-tablegen-*------===//
7558 +//
7559 +//                     The LLVM Compiler Infrastructure
7560 +//
7561 +// This file is distributed under the University of Illinois Open Source
7562 +// License. See LICENSE.TXT for details.
7563 +//
7564 +//==-----------------------------------------------------------------------===//
7565 +//
7566 +// This file describes the AMDIL instructions in TableGen format.
7567 +//
7568 +//===----------------------------------------------------------------------===//
7569 +// AMDIL Instruction Predicate Definitions
7570 +// Predicate that is set to true if the hardware supports double precision
7571 +// divide
7572 +def HasHWDDiv                 : Predicate<"Subtarget.device()"
7573 +                           "->getGeneration() > AMDGPUDeviceInfo::HD4XXX && "
7574 +              "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">;
7575 +
7576 +// Predicate that is set to true if the hardware supports double, but not double
7577 +// precision divide in hardware
7578 +def HasSWDDiv             : Predicate<"Subtarget.device()"
7579 +                           "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&"
7580 +              "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">;
7581 +
7582 +// Predicate that is set to true if the hardware supports 24bit signed
7583 +// math ops. Otherwise a software expansion to 32bit math ops is used instead.
7584 +def HasHWSign24Bit          : Predicate<"Subtarget.device()"
7585 +                            "->getGeneration() > AMDGPUDeviceInfo::HD5XXX">;
7586 +
7587 +// Predicate that is set to true if 64bit operations are supported or not
7588 +def HasHW64Bit              : Predicate<"Subtarget.device()"
7589 +                            "->usesHardware(AMDGPUDeviceInfo::LongOps)">;
7590 +def HasSW64Bit              : Predicate<"Subtarget.device()"
7591 +                            "->usesSoftware(AMDGPUDeviceInfo::LongOps)">;
7592 +
7593 +// Predicate that is set to true if the timer register is supported
7594 +def HasTmrRegister          : Predicate<"Subtarget.device()"
7595 +                            "->isSupported(AMDGPUDeviceInfo::TmrReg)">;
7596 +// Predicate that is true if we are at least evergreen series
7597 +def HasDeviceIDInst         : Predicate<"Subtarget.device()"
7598 +                            "->getGeneration() >= AMDGPUDeviceInfo::HD5XXX">;
7599 +
7600 +// Predicate that is true if we have region address space.
7601 +def hasRegionAS             : Predicate<"Subtarget.device()"
7602 +                            "->usesHardware(AMDGPUDeviceInfo::RegionMem)">;
7603 +
7604 +// Predicate that is true if we don't have region address space.
7605 +def noRegionAS             : Predicate<"!Subtarget.device()"
7606 +                            "->isSupported(AMDGPUDeviceInfo::RegionMem)">;
7607 +
7608 +
7609 +// Predicate that is set to true if 64bit Mul is supported in the IL or not
7610 +def HasHW64Mul              : Predicate<"Subtarget.calVersion()"
7611 +                                          ">= CAL_VERSION_SC_139"
7612 +                                          "&& Subtarget.device()"
7613 +                                          "->getGeneration() >="
7614 +                                          "AMDGPUDeviceInfo::HD5XXX">;
7615 +def HasSW64Mul              : Predicate<"Subtarget.calVersion()"
7616 +                                          "< CAL_VERSION_SC_139">;
7617 +// Predicate that is set to true if 64bit Div/Mod is supported in the IL or not
7618 +def HasHW64DivMod           : Predicate<"Subtarget.device()"
7619 +                            "->usesHardware(AMDGPUDeviceInfo::HW64BitDivMod)">;
7620 +def HasSW64DivMod           : Predicate<"Subtarget.device()"
7621 +                            "->usesSoftware(AMDGPUDeviceInfo::HW64BitDivMod)">;
7622 +
7623 +// Predicate that is set to true if 64bit pointer are used.
7624 +def Has64BitPtr             : Predicate<"Subtarget.is64bit()">;
7625 +def Has32BitPtr             : Predicate<"!Subtarget.is64bit()">;
7626 +//===--------------------------------------------------------------------===//
7627 +// Custom Operands
7628 +//===--------------------------------------------------------------------===//
7629 +def brtarget   : Operand<OtherVT>;
7630 +
7631 +//===--------------------------------------------------------------------===//
7632 +// Custom Selection DAG Type Profiles
7633 +//===--------------------------------------------------------------------===//
7634 +//===----------------------------------------------------------------------===//
7635 +// Generic Profile Types
7636 +//===----------------------------------------------------------------------===//
7637 +
7638 +def SDTIL_GenBinaryOp : SDTypeProfile<1, 2, [
7639 +    SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
7640 +    ]>;
7641 +def SDTIL_GenTernaryOp : SDTypeProfile<1, 3, [
7642 +    SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3>
7643 +    ]>;
7644 +def SDTIL_GenVecBuild : SDTypeProfile<1, 1, [
7645 +    SDTCisEltOfVec<1, 0>
7646 +    ]>;
7647 +
7648 +//===----------------------------------------------------------------------===//
7649 +// Flow Control Profile Types
7650 +//===----------------------------------------------------------------------===//
7651 +// Conditional branch: no results, two operands; operand 0 is of type OtherVT.
7652 +def SDTIL_BRCond : SDTypeProfile<0, 2, [
7653 +    SDTCisVT<0, OtherVT>
7654 +    ]>;
7655 +
7656 +//===--------------------------------------------------------------------===//
7657 +// Custom Selection DAG Nodes
7658 +//===--------------------------------------------------------------------===//
7659 +//===----------------------------------------------------------------------===//
7660 +// Flow Control DAG Nodes
7661 +//===----------------------------------------------------------------------===//
7662 +def IL_brcond      : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>;
7663 +
7664 +//===----------------------------------------------------------------------===//
7665 +// Call/Return DAG Nodes
7666 +//===----------------------------------------------------------------------===//
7667 +def IL_retflag       : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
7668 +    [SDNPHasChain, SDNPOptInGlue]>;
7669 +
7670 +//===--------------------------------------------------------------------===//
7671 +// Instructions
7672 +//===--------------------------------------------------------------------===//
7673 +// Floating point math functions
7674 +def IL_div_inf      : SDNode<"AMDGPUISD::DIV_INF", SDTIL_GenBinaryOp>;
7675 +def IL_mad          : SDNode<"AMDGPUISD::MAD", SDTIL_GenTernaryOp>;
7676 +
7677 +//===----------------------------------------------------------------------===//
7678 +// Integer functions
7679 +//===----------------------------------------------------------------------===//
7680 +def IL_umul        : SDNode<"AMDGPUISD::UMUL"    , SDTIntBinOp,
7681 +    [SDNPCommutative, SDNPAssociative]>;
7682 +
7683 +//===--------------------------------------------------------------------===//
7684 +// Custom Pattern DAG Nodes
7685 +//===--------------------------------------------------------------------===//
7686 +def global_store : PatFrag<(ops node:$val, node:$ptr),
7687 +    (store node:$val, node:$ptr), [{
7688 +        return isGlobalStore(dyn_cast<StoreSDNode>(N));
7689 +}]>;
7690 +
7691 +//===----------------------------------------------------------------------===//
7692 +// Load pattern fragments
7693 +//===----------------------------------------------------------------------===//
7694 +// Global address space loads
7695 +def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
7696 +    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
7697 +}]>;
7698 +// Constant address space loads
7699 +def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
7700 +    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
7701 +}]>;
7702 +
7703 +//===----------------------------------------------------------------------===//
7704 +// Complex addressing mode patterns
7705 +//===----------------------------------------------------------------------===//
7706 +def ADDR : ComplexPattern<i32, 2, "SelectADDR", [], []>;
7707 +def ADDRF : ComplexPattern<i32, 2, "SelectADDR", [frameindex], []>;
7708 +def ADDR64 : ComplexPattern<i64, 2, "SelectADDR64", [], []>;
7709 +def ADDR64F : ComplexPattern<i64, 2, "SelectADDR64", [frameindex], []>;
7710 +
7711 +//===----------------------------------------------------------------------===//
7712 +// Instruction format classes
7713 +//===----------------------------------------------------------------------===//
7714 +class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
7715 +: Instruction {
7716 +     // Common base for the AMDIL instruction definitions in this file.
7717 +     let Namespace = "AMDGPU";
7718 +     dag OutOperandList = outs;
7719 +     dag InOperandList = ins;
7720 +     let Pattern = pattern;
7721 +     let AsmString = !strconcat(asmstr, "\n"); // Every asm string gets a trailing newline.
7722 +     let isPseudo = 1; // Everything derived from ILFormat is marked as a pseudo.
7723 +     let Itinerary = NullALU;
7724 +     bit hasIEEEFlag = 0;
7725 +     bit hasZeroOpFlag = 0;
7726 +     let mayLoad = 0; // Defaults: no load, no store, no side effects.
7727 +     let mayStore = 0;
7728 +     let hasSideEffects = 0;
7729 +}
7730 +
7731 +//===--------------------------------------------------------------------===//
7732 +// Multiclass Instruction formats
7733 +//===--------------------------------------------------------------------===//
7734 +// Multiclass defining _i32 and _f32 conditional-branch pseudos matched on Op.
7735 +multiclass BranchConditional<SDNode Op> {
7736 +    def _i32 : ILFormat<(outs), // No outputs.
7737 +  (ins brtarget:$target, GPRI32:$src0), // Target block, then an i32 register.
7738 +        "; i32 Pseudo branch instruction",
7739 +  [(Op bb:$target, GPRI32:$src0)]>;
7740 +    def _f32 : ILFormat<(outs), // No outputs.
7741 +  (ins brtarget:$target, GPRF32:$src0), // Target block, then an f32 register.
7742 +        "; f32 Pseudo branch instruction",
7743 +  [(Op bb:$target, GPRF32:$src0)]>;
7744 +}
7745 +
7746 +// One-operand flow-control pseudos; only scalar (i32/f32) variants, no patterns.
7747 +multiclass BranchInstr<string name> {
7748 +  def _i32 : ILFormat<(outs), (ins GPRI32:$src),
7749 +      !strconcat(name, " $src"), []>; // Asm text is "<name> $src".
7750 +  def _f32 : ILFormat<(outs), (ins GPRF32:$src),
7751 +      !strconcat(name, " $src"), []>;
7752 +}
7753 +// Two-operand flow-control pseudos; only scalar (i32/f32) variants, no patterns.
7754 +multiclass BranchInstr2<string name> {
7755 +  def _i32 : ILFormat<(outs), (ins GPRI32:$src0, GPRI32:$src1),
7756 +      !strconcat(name, " $src0, $src1"), []>; // Asm text is "<name> $src0, $src1".
7757 +  def _f32 : ILFormat<(outs), (ins GPRF32:$src0, GPRF32:$src1),
7758 +      !strconcat(name, " $src0, $src1"), []>;
7759 +}
7760 +
7761 +//===--------------------------------------------------------------------===//
7762 +// Intrinsics support
7763 +//===--------------------------------------------------------------------===//
7764 +include "AMDILIntrinsics.td"
7765 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILIntrinsicInfo.cpp llvm-r600/lib/Target/R600/AMDILIntrinsicInfo.cpp
7766 --- llvm-3.2.src/lib/Target/R600/AMDILIntrinsicInfo.cpp 1970-01-01 01:00:00.000000000 +0100
7767 +++ llvm-r600/lib/Target/R600/AMDILIntrinsicInfo.cpp    2013-01-25 19:43:57.446716388 +0100
7768 @@ -0,0 +1,79 @@
7769 +//===- AMDILIntrinsicInfo.cpp - AMDGPU Intrinsic Information ------*- C++ -*-===//
7770 +//
7771 +//                     The LLVM Compiler Infrastructure
7772 +//
7773 +// This file is distributed under the University of Illinois Open Source
7774 +// License. See LICENSE.TXT for details.
7775 +//
7776 +//==-----------------------------------------------------------------------===//
7777 +//
7778 +/// \file
7779 +/// \brief AMDGPU Implementation of the IntrinsicInfo class.
7780 +//
7781 +//===-----------------------------------------------------------------------===//
7782 +
7783 +#include "AMDILIntrinsicInfo.h"
7784 +#include "AMDIL.h"
7785 +#include "AMDGPUSubtarget.h"
7786 +#include "llvm/DerivedTypes.h"
7787 +#include "llvm/Intrinsics.h"
7788 +#include "llvm/Module.h"
7789 +
7790 +using namespace llvm;
7791 +
7792 +#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
7793 +#include "AMDGPUGenIntrinsics.inc"
7794 +#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
7795 +
7796 +AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm)
7797 +  : TargetIntrinsicInfo() { // tm is accepted but not used or stored.
7798 +}
7799 +
7800 +/// Return the name of the AMDGPU intrinsic \p IntrID, or an empty string when
7801 +/// \p IntrID is below Intrinsic::num_intrinsics. \p Tys/\p numTys are unused.
7802 +std::string AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys,
7803 +                                         unsigned int numTys) const {
7804 +  static const char* const names[] = {
7805 +#define GET_INTRINSIC_NAME_TABLE
7806 +#include "AMDGPUGenIntrinsics.inc"
7807 +#undef GET_INTRINSIC_NAME_TABLE
7808 +  };
7809 +
7810 +  // std::string must never be built from a null pointer, so use an empty name.
7811 +  if (IntrID < Intrinsic::num_intrinsics) {
7812 +    return std::string();
7813 +  }
7814 +  assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics
7815 +      && "Invalid intrinsic ID");
7816 +  return std::string(names[IntrID - Intrinsic::num_intrinsics]);
7817 +}
7818 +
7819 +unsigned int
7820 +AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const  {
7821 +#define GET_FUNCTION_RECOGNIZER
7822 +#include "AMDGPUGenIntrinsics.inc"
7823 +#undef GET_FUNCTION_RECOGNIZER
7824 +  AMDGPUIntrinsic::ID IntrinsicID // Generated recognizer code is pasted in above.
7825 +    = (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic;
7826 +  IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name); // Builtin-name lookup.
7827 +  // NOTE(review): AMDILIntrinsics.td sets TargetPrefix = "AMDIL"; confirm "AMDGPU".
7828 +  if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) {
7829 +    return IntrinsicID;
7830 +  }
7831 +  return 0; // No match found by the builtin lookup.
7832 +}
7833 +
7834 +bool
7835 +AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const  {
7836 +  // Overload table: the whole body, including the return, comes from the include.
7837 +#define GET_INTRINSIC_OVERLOAD_TABLE
7838 +#include "AMDGPUGenIntrinsics.inc"
7839 +#undef GET_INTRINSIC_OVERLOAD_TABLE
7840 +} // NOTE(review): presumably the generated code refers to the parameter as `id`.
7841 +
7842 +/// Not implemented yet; asserts in debug builds and returns null otherwise.
7843 +Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
7844 +    Type **Tys, unsigned numTys) const {
7845 +  assert(!"Not implemented");
7846 +  return 0; // Without this, NDEBUG builds fall off the end of a non-void function.
7847 +}
7848 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILIntrinsicInfo.h llvm-r600/lib/Target/R600/AMDILIntrinsicInfo.h
7849 --- llvm-3.2.src/lib/Target/R600/AMDILIntrinsicInfo.h   1970-01-01 01:00:00.000000000 +0100
7850 +++ llvm-r600/lib/Target/R600/AMDILIntrinsicInfo.h      2013-01-25 19:43:57.446716388 +0100
7851 @@ -0,0 +1,49 @@
7852 +//===- AMDILIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===//
7853 +//
7854 +//                     The LLVM Compiler Infrastructure
7855 +//
7856 +// This file is distributed under the University of Illinois Open Source
7857 +// License. See LICENSE.TXT for details.
7858 +//
7859 +//==-----------------------------------------------------------------------===//
7860 +//
7861 +/// \file
7862 +/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
7863 +//
7864 +//===-----------------------------------------------------------------------===//
7865 +#ifndef AMDIL_INTRINSICS_H
7866 +#define AMDIL_INTRINSICS_H
7867 +
7868 +#include "llvm/Intrinsics.h"
7869 +#include "llvm/Target/TargetIntrinsicInfo.h"
7870 +
7871 +namespace llvm {
7872 +class TargetMachine;
7873 +
7874 +namespace AMDGPUIntrinsic {
7875 +enum ID {
7876 +  last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, // Anchor for the IDs below.
7877 +#define GET_INTRINSIC_ENUM_VALUES
7878 +#include "AMDGPUGenIntrinsics.inc"
7879 +#undef GET_INTRINSIC_ENUM_VALUES
7880 +      , num_AMDGPU_intrinsics // Follows the last generated enumerator.
7881 +};
7882 +
7883 +} // end namespace AMDGPUIntrinsic
7884 +
7885 +class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo {
7886 +public:
7887 +  AMDGPUIntrinsicInfo(TargetMachine *tm); // The definition does not use tm.
7888 +  std::string getName(unsigned int IntrId, Type **Tys = 0, // Name-table lookup for
7889 +                      unsigned int numTys = 0) const; // IDs >= Intrinsic::num_intrinsics.
7890 +  unsigned int lookupName(const char *Name, unsigned int Len) const; // 0 if not found.
7891 +  bool isOverloaded(unsigned int IID) const; // Body comes from the generated table.
7892 +  Function *getDeclaration(Module *M, unsigned int ID, // Not implemented: the
7893 +                           Type **Tys = 0,             // definition asserts.
7894 +                           unsigned int numTys = 0) const;
7895 +};
7896 +
7897 +} // end namespace llvm
7898 +
7899 +#endif // AMDIL_INTRINSICS_H
7900 +
7901 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILIntrinsics.td llvm-r600/lib/Target/R600/AMDILIntrinsics.td
7902 --- llvm-3.2.src/lib/Target/R600/AMDILIntrinsics.td     1970-01-01 01:00:00.000000000 +0100
7903 +++ llvm-r600/lib/Target/R600/AMDILIntrinsics.td        2013-01-25 19:43:57.446716388 +0100
7904 @@ -0,0 +1,242 @@
7905 +//===- AMDILIntrinsics.td - Defines AMDIL Intrinsics -*- tablegen -*-===//
7906 +//
7907 +//                     The LLVM Compiler Infrastructure
7908 +//
7909 +// This file is distributed under the University of Illinois Open Source
7910 +// License. See LICENSE.TXT for details.
7911 +//
7912 +//==-----------------------------------------------------------------------===//
7913 +//
7914 +// This file defines all of the amdil-specific intrinsics
7915 +//
7916 +//===---------------------------------------------------------------===//
7917 +//===--------------------------------------------------------------------===//
7918 +// Intrinsic classes
7919 +// Generic versions of the above classes but for Target specific intrinsics
7920 +// instead of SDNode patterns.
7921 +//===--------------------------------------------------------------------===//
7922 +let TargetPrefix = "AMDIL", isTarget = 1 in {
7923 +     class VoidIntLong :
7924 +          Intrinsic<[llvm_i64_ty], [], []>;
7925 +     class VoidIntInt :
7926 +          Intrinsic<[llvm_i32_ty], [], []>;
7927 +     class VoidIntBool :
7928 +          Intrinsic<[llvm_i32_ty], [], []>;
7929 +     class UnaryIntInt :
7930 +          Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
7931 +     class UnaryIntFloat :
7932 +          Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
7933 +     class ConvertIntFTOI :
7934 +          Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>;
7935 +     class ConvertIntITOF :
7936 +          Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty], [IntrNoMem]>;
7937 +     class UnaryIntNoRetInt :
7938 +          Intrinsic<[], [llvm_anyint_ty], []>;
7939 +     class UnaryIntNoRetFloat :
7940 +          Intrinsic<[], [llvm_anyfloat_ty], []>;
7941 +     class BinaryIntInt :
7942 +          Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
7943 +     class BinaryIntFloat :
7944 +          Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
7945 +     class BinaryIntNoRetInt :
7946 +          Intrinsic<[], [llvm_anyint_ty, LLVMMatchType<0>], []>;
7947 +     class BinaryIntNoRetFloat :
7948 +          Intrinsic<[], [llvm_anyfloat_ty, LLVMMatchType<0>], []>;
7949 +     class TernaryIntInt :
7950 +          Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
7951 +          LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
7952 +     class TernaryIntFloat :
7953 +          Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>,
7954 +          LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
7955 +     class QuaternaryIntInt :
7956 +          Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
7957 +          LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
7958 +     class UnaryAtomicInt :
7959 +          Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
7960 +     class BinaryAtomicInt :
7961 +          Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
7962 +     class TernaryAtomicInt : // i32 result; pointer plus three i32 operands
7963 +          Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
7964 +     class UnaryAtomicIntNoRet :
7965 +          Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
7966 +     class BinaryAtomicIntNoRet :
7967 +          Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
7968 +     class TernaryAtomicIntNoRet :
7969 +          Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
7970 +}
7971 +
7972 +let TargetPrefix = "AMDIL", isTarget = 1 in {
7973 +  def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt;
7974 +
7975 +  def int_AMDIL_bit_extract_i32 : GCCBuiltin<"__amdil_ibit_extract">,
7976 +          TernaryIntInt;
7977 +  def int_AMDIL_bit_extract_u32 : GCCBuiltin<"__amdil_ubit_extract">,
7978 +          TernaryIntInt;
7979 +  def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">,
7980 +          UnaryIntInt;
7981 +  def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">,
7982 +          UnaryIntInt;
7983 +  def int_AMDIL_bit_find_first_lo : GCCBuiltin<"__amdil_ffb_lo">,
7984 +          UnaryIntInt;
7985 +  def int_AMDIL_bit_find_first_hi : GCCBuiltin<"__amdil_ffb_hi">,
7986 +          UnaryIntInt;
7987 +  def int_AMDIL_bit_find_first_sgn : GCCBuiltin<"__amdil_ffb_signed">,
7988 +          UnaryIntInt;
7989 +  def int_AMDIL_media_bitalign : GCCBuiltin<"__amdil_bitalign">,
7990 +                    TernaryIntInt;
7991 +  def int_AMDIL_media_bytealign : GCCBuiltin<"__amdil_bytealign">,
7992 +                    TernaryIntInt;
7993 +  def int_AMDIL_bit_insert_u32 : GCCBuiltin<"__amdil_ubit_insert">,
7994 +                    QuaternaryIntInt;
7995 +  def int_AMDIL_bfi : GCCBuiltin<"__amdil_bfi">,
7996 +      TernaryIntInt;
7997 +  def int_AMDIL_bfm : GCCBuiltin<"__amdil_bfm">,
7998 +      BinaryIntInt;
7999 +  def int_AMDIL_mad_i32 : GCCBuiltin<"__amdil_imad">,
8000 +          TernaryIntInt;
8001 +  def int_AMDIL_mad_u32 : GCCBuiltin<"__amdil_umad">,
8002 +          TernaryIntInt;
8003 +  def int_AMDIL_mad     : GCCBuiltin<"__amdil_mad">,
8004 +          TernaryIntFloat;
8005 +  def int_AMDIL_mulhi_i32 : GCCBuiltin<"__amdil_imul_high">,
8006 +          BinaryIntInt;
8007 +  def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">,
8008 +          BinaryIntInt;
8009 +  def int_AMDIL_mul24_i32 : GCCBuiltin<"__amdil_imul24">,
8010 +          BinaryIntInt;
8011 +  def int_AMDIL_mul24_u32 : GCCBuiltin<"__amdil_umul24">,
8012 +          BinaryIntInt;
8013 +  def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">,
8014 +          BinaryIntInt;
8015 +  def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">,
8016 +          BinaryIntInt;
8017 +  def int_AMDIL_mad24_i32 : GCCBuiltin<"__amdil_imad24">,
8018 +          TernaryIntInt;
8019 +  def int_AMDIL_mad24_u32 : GCCBuiltin<"__amdil_umad24">,
8020 +          TernaryIntInt;
8021 +  def int_AMDIL_carry_i32 : GCCBuiltin<"__amdil_carry">,
8022 +          BinaryIntInt;
8023 +  def int_AMDIL_borrow_i32 : GCCBuiltin<"__amdil_borrow">,
8024 +          BinaryIntInt;
8025 +  def int_AMDIL_min_i32 : GCCBuiltin<"__amdil_imin">,
8026 +          BinaryIntInt;
8027 +  def int_AMDIL_min_u32 : GCCBuiltin<"__amdil_umin">,
8028 +          BinaryIntInt;
8029 +  def int_AMDIL_min     : GCCBuiltin<"__amdil_min">,
8030 +          BinaryIntFloat;
8031 +  def int_AMDIL_max_i32 : GCCBuiltin<"__amdil_imax">,
8032 +          BinaryIntInt;
8033 +  def int_AMDIL_max_u32 : GCCBuiltin<"__amdil_umax">,
8034 +          BinaryIntInt;
8035 +  def int_AMDIL_max     : GCCBuiltin<"__amdil_max">,
8036 +          BinaryIntFloat;
8037 +  def int_AMDIL_media_lerp_u4 : GCCBuiltin<"__amdil_u4lerp">,
8038 +          TernaryIntInt;
8039 +  def int_AMDIL_media_sad : GCCBuiltin<"__amdil_sad">,
8040 +          TernaryIntInt;
8041 +  def int_AMDIL_media_sad_hi : GCCBuiltin<"__amdil_sadhi">,
8042 +          TernaryIntInt;
8043 +  def int_AMDIL_fraction : GCCBuiltin<"__amdil_fraction">,
8044 +          UnaryIntFloat;
8045 +  def int_AMDIL_clamp : GCCBuiltin<"__amdil_clamp">,
8046 +          TernaryIntFloat;
8047 +  def int_AMDIL_pireduce : GCCBuiltin<"__amdil_pireduce">,
8048 +          UnaryIntFloat;
8049 +  def int_AMDIL_round_nearest : GCCBuiltin<"__amdil_round_nearest">,
8050 +          UnaryIntFloat;
8051 +  def int_AMDIL_round_neginf : GCCBuiltin<"__amdil_round_neginf">,
8052 +          UnaryIntFloat;
8053 +  def int_AMDIL_round_zero : GCCBuiltin<"__amdil_round_zero">,
8054 +          UnaryIntFloat;
8055 +  def int_AMDIL_acos : GCCBuiltin<"__amdil_acos">,
8056 +          UnaryIntFloat;
8057 +  def int_AMDIL_atan : GCCBuiltin<"__amdil_atan">,
8058 +          UnaryIntFloat;
8059 +  def int_AMDIL_asin : GCCBuiltin<"__amdil_asin">,
8060 +          UnaryIntFloat;
8061 +  def int_AMDIL_cos : GCCBuiltin<"__amdil_cos">,
8062 +          UnaryIntFloat;
8063 +  def int_AMDIL_cos_vec : GCCBuiltin<"__amdil_cos_vec">,
8064 +          UnaryIntFloat;
8065 +  def int_AMDIL_tan : GCCBuiltin<"__amdil_tan">,
8066 +          UnaryIntFloat;
8067 +  def int_AMDIL_sin : GCCBuiltin<"__amdil_sin">,
8068 +          UnaryIntFloat;
8069 +  def int_AMDIL_sin_vec : GCCBuiltin<"__amdil_sin_vec">,
8070 +          UnaryIntFloat;
8071 +  def int_AMDIL_pow : GCCBuiltin<"__amdil_pow">, BinaryIntFloat;
8072 +  def int_AMDIL_div : GCCBuiltin<"__amdil_div">, BinaryIntFloat;
8073 +  def int_AMDIL_udiv : GCCBuiltin<"__amdil_udiv">, BinaryIntInt;
8074 +  def int_AMDIL_sqrt: GCCBuiltin<"__amdil_sqrt">,
8075 +          UnaryIntFloat;
8076 +  def int_AMDIL_sqrt_vec: GCCBuiltin<"__amdil_sqrt_vec">,
8077 +          UnaryIntFloat;
8078 +  def int_AMDIL_exp : GCCBuiltin<"__amdil_exp">,
8079 +          UnaryIntFloat;
8080 +  def int_AMDIL_exp_vec : GCCBuiltin<"__amdil_exp_vec">,
8081 +          UnaryIntFloat;
8082 +  def int_AMDIL_exn : GCCBuiltin<"__amdil_exn">,
8083 +          UnaryIntFloat;
8084 +  def int_AMDIL_log_vec : GCCBuiltin<"__amdil_log_vec">,
8085 +          UnaryIntFloat;
8086 +  def int_AMDIL_ln : GCCBuiltin<"__amdil_ln">,
8087 +          UnaryIntFloat;
8088 +  def int_AMDIL_sign: GCCBuiltin<"__amdil_sign">,
8089 +          UnaryIntFloat;
8090 +  def int_AMDIL_fma: GCCBuiltin<"__amdil_fma">,
8091 +          TernaryIntFloat;
8092 +  def int_AMDIL_rsq : GCCBuiltin<"__amdil_rsq">,
8093 +          UnaryIntFloat;
8094 +  def int_AMDIL_rsq_vec : GCCBuiltin<"__amdil_rsq_vec">,
8095 +          UnaryIntFloat;
8096 +  def int_AMDIL_length : GCCBuiltin<"__amdil_length">,
8097 +          UnaryIntFloat;
8098 +  def int_AMDIL_lerp : GCCBuiltin<"__amdil_lerp">,
8099 +          TernaryIntFloat;
8100 +  def int_AMDIL_media_sad4 : GCCBuiltin<"__amdil_sad4">,
8101 +      Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty,
8102 +           llvm_v4i32_ty, llvm_i32_ty], []>;
8103 +
8104 +  def int_AMDIL_frexp_f64 : GCCBuiltin<"__amdil_frexp">,
8105 +        Intrinsic<[llvm_v2i64_ty], [llvm_double_ty], []>;
8106 + def int_AMDIL_ldexp : GCCBuiltin<"__amdil_ldexp">,
8107 +    Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyint_ty], []>;
8108 +  def int_AMDIL_drcp : GCCBuiltin<"__amdil_rcp">,
8109 +      Intrinsic<[llvm_double_ty], [llvm_double_ty], []>;
8110 +  def int_AMDIL_convert_f16_f32 : GCCBuiltin<"__amdil_half_to_float">,
8111 +      ConvertIntITOF;
8112 +  def int_AMDIL_convert_f32_f16 : GCCBuiltin<"__amdil_float_to_half">,
8113 +      ConvertIntFTOI;
8114 +  def int_AMDIL_convert_f32_i32_rpi : GCCBuiltin<"__amdil_float_to_int_rpi">,
8115 +      ConvertIntFTOI;
8116 +  def int_AMDIL_convert_f32_i32_flr : GCCBuiltin<"__amdil_float_to_int_flr">,
8117 +      ConvertIntFTOI;
8118 +  def int_AMDIL_convert_f32_f16_near : GCCBuiltin<"__amdil_float_to_half_near">,
8119 +      ConvertIntFTOI;
8120 +  def int_AMDIL_convert_f32_f16_neg_inf : GCCBuiltin<"__amdil_float_to_half_neg_inf">,
8121 +      ConvertIntFTOI;
8122 +  def int_AMDIL_convert_f32_f16_plus_inf : GCCBuiltin<"__amdil_float_to_half_plus_inf">,
8123 +      ConvertIntFTOI;
8124 + def int_AMDIL_media_convert_f2v4u8 : GCCBuiltin<"__amdil_f_2_u4">,
8125 +      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], []>;
8126 +  def int_AMDIL_media_unpack_byte_0 : GCCBuiltin<"__amdil_unpack_0">,
8127 +      ConvertIntITOF;
8128 +  def int_AMDIL_media_unpack_byte_1 : GCCBuiltin<"__amdil_unpack_1">,
8129 +      ConvertIntITOF;
8130 +  def int_AMDIL_media_unpack_byte_2 : GCCBuiltin<"__amdil_unpack_2">,
8131 +      ConvertIntITOF;
8132 +  def int_AMDIL_media_unpack_byte_3 : GCCBuiltin<"__amdil_unpack_3">,
8133 +      ConvertIntITOF;
8134 +  def int_AMDIL_dp2_add : GCCBuiltin<"__amdil_dp2_add">,
8135 +        Intrinsic<[llvm_float_ty], [llvm_v2f32_ty,
8136 +          llvm_v2f32_ty, llvm_float_ty], []>;
8137 +  def int_AMDIL_dp2 : GCCBuiltin<"__amdil_dp2">,
8138 +        Intrinsic<[llvm_float_ty], [llvm_v2f32_ty,
8139 +          llvm_v2f32_ty], []>;
8140 +  def int_AMDIL_dp3 : GCCBuiltin<"__amdil_dp3">,
8141 +        Intrinsic<[llvm_float_ty], [llvm_v4f32_ty,
8142 +          llvm_v4f32_ty], []>;
8143 +  def int_AMDIL_dp4 : GCCBuiltin<"__amdil_dp4">,
8144 +        Intrinsic<[llvm_float_ty], [llvm_v4f32_ty,
8145 +          llvm_v4f32_ty], []>;
8146 +}
8147 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILISelDAGToDAG.cpp llvm-r600/lib/Target/R600/AMDILISelDAGToDAG.cpp
8148 --- llvm-3.2.src/lib/Target/R600/AMDILISelDAGToDAG.cpp  1970-01-01 01:00:00.000000000 +0100
8149 +++ llvm-r600/lib/Target/R600/AMDILISelDAGToDAG.cpp     2013-01-25 19:43:57.443383054 +0100
8150 @@ -0,0 +1,567 @@
8151 +//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===//
8152 +//
8153 +//                     The LLVM Compiler Infrastructure
8154 +//
8155 +// This file is distributed under the University of Illinois Open Source
8156 +// License. See LICENSE.TXT for details.
8157 +//
8158 +//==-----------------------------------------------------------------------===//
8159 +//
8160 +/// \file
8161 +/// \brief Defines an instruction selector for the AMDGPU target.
8162 +//
8163 +//===----------------------------------------------------------------------===//
8164 +#include "AMDGPUInstrInfo.h"
8165 +#include "AMDGPUISelLowering.h" // For AMDGPUISD
8166 +#include "AMDGPURegisterInfo.h"
8167 +#include "AMDILDevices.h"
8168 +#include "R600InstrInfo.h"
8169 +#include "llvm/ADT/ValueMap.h"
8170 +#include "llvm/CodeGen/PseudoSourceValue.h"
8171 +#include "llvm/CodeGen/SelectionDAGISel.h"
8172 +#include "llvm/Support/Compiler.h"
8173 +#include "llvm/CodeGen/SelectionDAG.h"
8174 +#include <list>
8175 +#include <queue>
8176 +
8177 +using namespace llvm;
8178 +
8179 +//===----------------------------------------------------------------------===//
8180 +// Instruction Selector Implementation
8181 +//===----------------------------------------------------------------------===//
8182 +
8183 +namespace {
8184 +/// AMDGPU specific code to select AMDGPU machine instructions for
8185 +/// SelectionDAG operations.
8186 +class AMDGPUDAGToDAGISel : public SelectionDAGISel {
8187 +  // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
8188 +  // make the right decision when generating code for different targets.
8189 +  const AMDGPUSubtarget &Subtarget;
8190 +public:
8191 +  AMDGPUDAGToDAGISel(TargetMachine &TM);
8192 +  virtual ~AMDGPUDAGToDAGISel();
8193 +
8194 +  SDNode *Select(SDNode *N);
8195 +  virtual const char *getPassName() const;
8196 +
8197 +private:
8198 +  inline SDValue getSmallIPtrImm(unsigned Imm);
8199 +  bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
8200 +
8201 +  // Complex pattern selectors
8202 +  bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
8203 +  bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
8204 +  bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
8205 +
8206 +  static bool checkType(const Value *ptr, unsigned int addrspace);
8207 +  static const Value *getBasePointerValue(const Value *V);
8208 +
8209 +  static bool isGlobalStore(const StoreSDNode *N);
8210 +  static bool isPrivateStore(const StoreSDNode *N);
8211 +  static bool isLocalStore(const StoreSDNode *N);
8212 +  static bool isRegionStore(const StoreSDNode *N);
8213 +
8214 +  static bool isCPLoad(const LoadSDNode *N);
8215 +  static bool isConstantLoad(const LoadSDNode *N, int cbID);
8216 +  static bool isGlobalLoad(const LoadSDNode *N);
8217 +  static bool isParamLoad(const LoadSDNode *N);
8218 +  static bool isPrivateLoad(const LoadSDNode *N);
8219 +  static bool isLocalLoad(const LoadSDNode *N);
8220 +  static bool isRegionLoad(const LoadSDNode *N);
8221 +
8222 +  bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
8223 +  bool SelectGlobalValueVariableOffset(SDValue Addr,
8224 +      SDValue &BaseReg, SDValue& Offset);
8225 +  bool SelectADDR8BitOffset(SDValue Addr, SDValue& Base, SDValue& Offset);
8226 +  bool SelectADDRReg(SDValue Addr, SDValue& Base, SDValue& Offset);
8227 +  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
8228 +
8229 +  // Include the pieces autogenerated from the target description.
8230 +#include "AMDGPUGenDAGISel.inc"
8231 +};
8232 +}  // end anonymous namespace
8233 +
8234 +/// \brief This pass converts a legalized DAG into an AMDGPU-specific
8235 +/// DAG, ready for instruction scheduling.
8236 +FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM
8237 +                                       ) {
8238 +  return new AMDGPUDAGToDAGISel(TM);
8239 +}
8240 +
8241 +AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM
8242 +                                     )
8243 +  : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<AMDGPUSubtarget>()) {
8244 +}
8245 +
8246 +AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
8247 +}
8248 +
8249 +SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) {
8250 +  return CurDAG->getTargetConstant(Imm, MVT::i32);
8251 +}
8252 +
8253 +bool AMDGPUDAGToDAGISel::SelectADDRParam(
8254 +    SDValue Addr, SDValue& R1, SDValue& R2) {
8255 +
8256 +  if (Addr.getOpcode() == ISD::FrameIndex) {
8257 +    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
8258 +      R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
8259 +      R2 = CurDAG->getTargetConstant(0, MVT::i32);
8260 +    } else {
8261 +      R1 = Addr;
8262 +      R2 = CurDAG->getTargetConstant(0, MVT::i32);
8263 +    }
8264 +  } else if (Addr.getOpcode() == ISD::ADD) {
8265 +    R1 = Addr.getOperand(0);
8266 +    R2 = Addr.getOperand(1);
8267 +  } else {
8268 +    R1 = Addr;
8269 +    R2 = CurDAG->getTargetConstant(0, MVT::i32);
8270 +  }
8271 +  return true;
8272 +}
8273 +
8274 +bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) {
8275 +  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
8276 +      Addr.getOpcode() == ISD::TargetGlobalAddress) {
8277 +    return false;
8278 +  }
8279 +  return SelectADDRParam(Addr, R1, R2);
8280 +}
8281 +
8282 +
8283 +bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
8284 +  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
8285 +      Addr.getOpcode() == ISD::TargetGlobalAddress) {
8286 +    return false;
8287 +  }
8288 +
8289 +  if (Addr.getOpcode() == ISD::FrameIndex) {
8290 +    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
8291 +      R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
8292 +      R2 = CurDAG->getTargetConstant(0, MVT::i64);
8293 +    } else {
8294 +      R1 = Addr;
8295 +      R2 = CurDAG->getTargetConstant(0, MVT::i64);
8296 +    }
8297 +  } else if (Addr.getOpcode() == ISD::ADD) {
8298 +    R1 = Addr.getOperand(0);
8299 +    R2 = Addr.getOperand(1);
8300 +  } else {
8301 +    R1 = Addr;
8302 +    R2 = CurDAG->getTargetConstant(0, MVT::i64);
8303 +  }
8304 +  return true;
8305 +}
8306 +
8307 +SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
8308 +  unsigned int Opc = N->getOpcode();
8309 +  if (N->isMachineOpcode()) {
8310 +    return NULL;   // Already selected.
8311 +  }
8312 +  switch (Opc) {
8313 +  default: break;
8314 +  case ISD::FrameIndex: {
8315 +    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(N)) {
8316 +      unsigned int FI = FIN->getIndex();
8317 +      EVT OpVT = N->getValueType(0);
8318 +      unsigned int NewOpc = AMDGPU::COPY;
8319 +      SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i32);
8320 +      return CurDAG->SelectNodeTo(N, NewOpc, OpVT, TFI);
8321 +    }
8322 +    break;
8323 +  }
8324 +  case ISD::ConstantFP:
8325 +  case ISD::Constant: {
8326 +    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
8327 +    // XXX: Custom immediate lowering not implemented yet.  Instead we use
8328 +    // pseudo instructions defined in SIInstructions.td
8329 +    if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
8330 +      break;
8331 +    }
8332 +    const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo());
8333 +
8334 +    uint64_t ImmValue = 0;
8335 +    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
8336 +
8337 +    if (N->getOpcode() == ISD::ConstantFP) {
8338 +      // XXX: 64-bit Immediates not supported yet
8339 +      assert(N->getValueType(0) != MVT::f64);
8340 +
8341 +      ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N);
8342 +      APFloat Value = C->getValueAPF();
8343 +      float FloatValue = Value.convertToFloat();
8344 +      if (FloatValue == 0.0) {
8345 +        ImmReg = AMDGPU::ZERO;
8346 +      } else if (FloatValue == 0.5) {
8347 +        ImmReg = AMDGPU::HALF;
8348 +      } else if (FloatValue == 1.0) {
8349 +        ImmReg = AMDGPU::ONE;
8350 +      } else {
8351 +        ImmValue = Value.bitcastToAPInt().getZExtValue();
8352 +      }
8353 +    } else {
8354 +      // XXX: 64-bit Immediates not supported yet
8355 +      assert(N->getValueType(0) != MVT::i64);
8356 +
8357 +      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
8358 +      if (C->getZExtValue() == 0) {
8359 +        ImmReg = AMDGPU::ZERO;
8360 +      } else if (C->getZExtValue() == 1) {
8361 +        ImmReg = AMDGPU::ONE_INT;
8362 +      } else {
8363 +        ImmValue = C->getZExtValue();
8364 +      }
8365 +    }
8366 +
8367 +    for (SDNode::use_iterator Use = N->use_begin(), Next = llvm::next(Use);
8368 +                              Use != SDNode::use_end(); Use = Next) {
8369 +      Next = llvm::next(Use);
8370 +      std::vector<SDValue> Ops;
8371 +      for (unsigned i = 0; i < Use->getNumOperands(); ++i) {
8372 +        Ops.push_back(Use->getOperand(i));
8373 +      }
8374 +
8375 +      if (!Use->isMachineOpcode()) {
8376 +          if (ImmReg == AMDGPU::ALU_LITERAL_X) {
8377 +            // We can only use literal constants (e.g. AMDGPU::ZERO,
8378 +            // AMDGPU::ONE, etc) in machine opcodes.
8379 +            continue;
8380 +          }
8381 +      } else {
8382 +        if (!TII->isALUInstr(Use->getMachineOpcode())) {
8383 +          continue;
8384 +        }
8385 +
8386 +        int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(), R600Operands::IMM);
8387 +        assert(ImmIdx != -1);
8388 +
8389 +        // subtract one from ImmIdx, because the DST operand is usually index
8390 +        // 0 for MachineInstrs, but we have no DST in the Ops vector.
8391 +        ImmIdx--;
8392 +
8393 +        // Check that we aren't already using an immediate.
8394 +        // XXX: It's possible for an instruction to have more than one
8395 +        // immediate operand, but this is not supported yet.
8396 +        if (ImmReg == AMDGPU::ALU_LITERAL_X) {
8397 +          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx));
8398 +          assert(C);
8399 +
8400 +          if (C->getZExtValue() != 0) {
8401 +            // This instruction is already using an immediate.
8402 +            continue;
8403 +          }
8404 +
8405 +          // Set the immediate value
8406 +          Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32);
8407 +        }
8408 +      }
8409 +      // Set the immediate register
8410 +      Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32);
8411 +
8412 +      CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands());
8413 +    }
8414 +    break;
8415 +  }
8416 +  }
8417 +  SDNode *Result = SelectCode(N);
8418 +
8419 +  // Fold operands of selected node
8420 +
8421 +  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
8422 +  if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
8423 +    const R600InstrInfo *TII =
8424 +        static_cast<const R600InstrInfo*>(TM.getInstrInfo());
8425 +    if (Result && TII->isALUInstr(Result->getMachineOpcode())) {
8426 +      bool IsModified = false;
8427 +      do {
8428 +        std::vector<SDValue> Ops;
8429 +        for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
8430 +            I != E; ++I)
8431 +          Ops.push_back(*I);
8432 +        IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops);
8433 +        if (IsModified) {
8434 +          Result = CurDAG->MorphNodeTo(Result, Result->getOpcode(),
8435 +              Result->getVTList(), Ops.data(), Ops.size());
8436 +        }
8437 +      } while (IsModified);
8438 +    }
8439 +  }
8440 +
8441 +  return Result;
8442 +}
8443 +
8444 +bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
8445 +    const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
8446 +  int OperandIdx[] = {
8447 +    TII->getOperandIdx(Opcode, R600Operands::SRC0),
8448 +    TII->getOperandIdx(Opcode, R600Operands::SRC1),
8449 +    TII->getOperandIdx(Opcode, R600Operands::SRC2)
8450 +  };
8451 +  int SelIdx[] = {
8452 +    TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL),
8453 +    TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL),
8454 +    TII->getOperandIdx(Opcode, R600Operands::SRC2_SEL)
8455 +  };
8456 +  for (unsigned i = 0; i < 3; i++) {
8457 +    if (OperandIdx[i] < 0)
8458 +      return false;
8459 +    SDValue Operand = Ops[OperandIdx[i] - 1];
8460 +    switch (Operand.getOpcode()) {
8461 +    case AMDGPUISD::CONST_ADDRESS: {
8462 +      SDValue CstOffset;
8463 +      if (!Operand.getValueType().isVector() &&
8464 +          SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) {
8465 +        Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
8466 +        Ops[SelIdx[i] - 1] = CstOffset;
8467 +        return true;
8468 +      }
8469 +      }
8470 +      break;
8471 +    default:
8472 +      break;
8473 +    }
8474 +  }
8475 +  return false;
8476 +}
8477 +
8478 +bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) {
8479 +  // A null value, or one that is not pointer-typed, matches no address space.
8480 +  if (!ptr)
8481 +    return false;
8482 +  const PointerType *PT = dyn_cast<PointerType>(ptr->getType());
8483 +  return PT && PT->getAddressSpace() == addrspace;
8484 +}
8485 +
8486 +const Value * AMDGPUDAGToDAGISel::getBasePointerValue(const Value *V) {
8487 +  if (!V) {
8488 +    return NULL;
8489 +  }
8490 +  const Value *ret = NULL;
8491 +  ValueMap<const Value *, bool> ValueBitMap;
8492 +  std::queue<const Value *, std::list<const Value *> > ValueQueue;
8493 +  ValueQueue.push(V);
8494 +  while (!ValueQueue.empty()) {
8495 +    V = ValueQueue.front();
8496 +    if (ValueBitMap.find(V) == ValueBitMap.end()) {
8497 +      ValueBitMap[V] = true;
8498 +      if (dyn_cast<Argument>(V) && dyn_cast<PointerType>(V->getType())) {
8499 +        ret = V;
8500 +        break;
8501 +      } else if (dyn_cast<GlobalVariable>(V)) {
8502 +        ret = V;
8503 +        break;
8504 +      } else if (dyn_cast<Constant>(V)) {
8505 +        const ConstantExpr *CE = dyn_cast<ConstantExpr>(V);
8506 +        if (CE) {
8507 +          ValueQueue.push(CE->getOperand(0));
8508 +        }
8509 +      } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
8510 +        ret = AI;
8511 +        break;
8512 +      } else if (const Instruction *I = dyn_cast<Instruction>(V)) {
8513 +        uint32_t numOps = I->getNumOperands();
8514 +        for (uint32_t x = 0; x < numOps; ++x) {
8515 +          ValueQueue.push(I->getOperand(x));
8516 +        }
8517 +      } else {
8518 +        assert(!"Found a Value that we didn't know how to handle!");
8519 +      }
8520 +    }
8521 +    ValueQueue.pop();
8522 +  }
8523 +  return ret;
8524 +}
8525 +
8526 +bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
8527 +  return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
8528 +}
8529 +
8530 +bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
8531 +  return (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
8532 +          && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
8533 +          && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS));
8534 +}
8535 +
8536 +bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
8537 +  return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
8538 +}
8539 +
8540 +bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
8541 +  return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
8542 +}
8543 +
8544 +bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) {
8545 +  if (checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)) {
8546 +    return true;
8547 +  }
8548 +  MachineMemOperand *MMO = N->getMemOperand();
8549 +  const Value *V = MMO->getValue();
8550 +  const Value *BV = getBasePointerValue(V);
8551 +  if (MMO
8552 +      && MMO->getValue()
8553 +      && ((V && dyn_cast<GlobalValue>(V))
8554 +          || (BV && dyn_cast<GlobalValue>(
8555 +                        getBasePointerValue(MMO->getValue()))))) {
8556 +    return checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS);
8557 +  } else {
8558 +    return false;
8559 +  }
8560 +}
8561 +
8562 +bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) {
8563 +  return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
8564 +}
8565 +
8566 +bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) {
8567 +  return checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS);
8568 +}
8569 +
8570 +bool AMDGPUDAGToDAGISel::isLocalLoad(const  LoadSDNode *N) {
8571 +  return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
8572 +}
8573 +
8574 +bool AMDGPUDAGToDAGISel::isRegionLoad(const  LoadSDNode *N) {
8575 +  return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
8576 +}
8577 +
8578 +bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) {
8579 +  MachineMemOperand *MMO = N->getMemOperand();
8580 +  if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
8581 +    if (MMO) {
8582 +      const Value *V = MMO->getValue();
8583 +      const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V);
8584 +      if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
8585 +        return true;
8586 +      }
8587 +    }
8588 +  }
8589 +  return false;
8590 +}
8591 +
8592 +bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) {
8593 +  if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
8594 +    // Check to make sure we are not a constant pool load or a constant load
8595 +    // that is marked as a private load
8596 +    if (isCPLoad(N) || isConstantLoad(N, -1)) {
8597 +      return false;
8598 +    }
8599 +  }
8600 +  if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
8601 +      && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
8602 +      && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS)
8603 +      && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)
8604 +      && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS)
8605 +      && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS)) {
8606 +    return true;
8607 +  }
8608 +  return false;
8609 +}
8610 +
8611 +const char *AMDGPUDAGToDAGISel::getPassName() const {
8612 +  return "AMDGPU DAG->DAG Pattern Instruction Selection";
8613 +}
8614 +
8615 +#ifdef DEBUGTMP
8616 +#undef INT64_C
8617 +#endif
8618 +#undef DEBUGTMP
8619 +
8620 +///==== AMDGPU Functions ====///
8621 +
8622 +bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
8623 +    SDValue& IntPtr) {
8624 +  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
8625 +    IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true);
8626 +    return true;
8627 +  }
8628 +  return false;
8629 +}
8630 +
8631 +bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
8632 +    SDValue& BaseReg, SDValue &Offset) {
8633 +  if (!dyn_cast<ConstantSDNode>(Addr)) {
8634 +    BaseReg = Addr;
8635 +    Offset = CurDAG->getIntPtrConstant(0, true);
8636 +    return true;
8637 +  }
8638 +  return false;
8639 +}
8640 +
8641 +bool AMDGPUDAGToDAGISel::SelectADDR8BitOffset(SDValue Addr, SDValue& Base,
8642 +                                             SDValue& Offset) {
8643 +  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
8644 +      Addr.getOpcode() == ISD::TargetGlobalAddress) {
8645 +    return false;
8646 +  }
8647 +
8648 +
8649 +  if (Addr.getOpcode() == ISD::ADD) {
8650 +    bool Match = false;
8651 +
8652 +    // Find the base ptr and the offset
8653 +    for (unsigned i = 0; i < Addr.getNumOperands(); i++) {
8654 +      SDValue Arg = Addr.getOperand(i);
8655 +      ConstantSDNode * OffsetNode = dyn_cast<ConstantSDNode>(Arg);
8656 +      // This arg isn't a constant so it must be the base PTR.
8657 +      if (!OffsetNode) {
8658 +        Base = Addr.getOperand(i);
8659 +        continue;
8660 +      }
8661 +      // Check if the constant argument fits in 8-bits.  The offset is in bytes
8662 +      // so we need to convert it to dwords.
8663 +      if (isUInt<8>(OffsetNode->getZExtValue() >> 2)) {
8664 +        Match = true;
8665 +        Offset = CurDAG->getTargetConstant(OffsetNode->getZExtValue() >> 2,
8666 +                                           MVT::i32);
8667 +      }
8668 +    }
8669 +    return Match;
8670 +  }
8671 +
8672 +  // Default case, no offset
8673 +  Base = Addr;
8674 +  Offset = CurDAG->getTargetConstant(0, MVT::i32);
8675 +  return true;
8676 +}
8677 +
8678 +bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
8679 +                                           SDValue &Offset) {
8680 +  // Form 1: (add base, imm) whose immediate passes the isInt<16> check.
8681 +  if (Addr.getOpcode() == ISD::ADD) {
8682 +    ConstantSDNode *AddImm = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
8683 +    if (AddImm && isInt<16>(AddImm->getZExtValue())) {
8684 +      Base = Addr.getOperand(0);
8685 +      Offset = CurDAG->getTargetConstant(AddImm->getZExtValue(), MVT::i32);
8686 +      return true;
8687 +    }
8688 +  }
8689 +  // Form 2: the address itself is a constant that passes the same check, so
8690 +  // it goes into the offset field and the ZERO register becomes the base.
8691 +  ConstantSDNode *AbsImm = dyn_cast<ConstantSDNode>(Addr);
8692 +  if (AbsImm && isInt<16>(AbsImm->getZExtValue())) {
8693 +    SDValue Entry = CurDAG->getEntryNode();
8694 +    Base = CurDAG->getCopyFromReg(Entry, Entry.getDebugLoc(), AMDGPU::ZERO,
8695 +                                  MVT::i32);
8696 +    Offset = CurDAG->getTargetConstant(AbsImm->getZExtValue(), MVT::i32);
8697 +    return true;
8698 +  }
8699 +  // Form 3: everything else is used as the base with a zero offset.
8700 +  Base = Addr;
8701 +  Offset = CurDAG->getTargetConstant(0, MVT::i32);
8702 +  return true;
8703 +}
8704 +
8705 +bool AMDGPUDAGToDAGISel::SelectADDRReg(SDValue Addr, SDValue& Base,
8706 +                                      SDValue& Offset) {
8707 +  // Only an ADD can be split into a base and an offset.  TargetExternalSymbol
8708 +  // and TargetGlobalAddress are distinct opcodes from ADD, so testing for
8709 +  // ADD alone rejects them together with every other node kind.
8710 +  const bool IsAdd = Addr.getOpcode() == ISD::ADD;
8711 +  if (!IsAdd)
8712 +    return false;
8713 +
8714 +  Offset = Addr.getOperand(1);
8715 +  Base = Addr.getOperand(0);
8716 +  return true;
8717 +}
8718 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILISelLowering.cpp llvm-r600/lib/Target/R600/AMDILISelLowering.cpp
8719 --- llvm-3.2.src/lib/Target/R600/AMDILISelLowering.cpp  1970-01-01 01:00:00.000000000 +0100
8720 +++ llvm-r600/lib/Target/R600/AMDILISelLowering.cpp     2013-01-25 19:43:57.443383054 +0100
8721 @@ -0,0 +1,651 @@
8722 +//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===//
8723 +//
8724 +//                     The LLVM Compiler Infrastructure
8725 +//
8726 +// This file is distributed under the University of Illinois Open Source
8727 +// License. See LICENSE.TXT for details.
8728 +//
8729 +//==-----------------------------------------------------------------------===//
8730 +//
8731 +/// \file
8732 +/// \brief TargetLowering functions borrowed from AMDIL.
8733 +//
8734 +//===----------------------------------------------------------------------===//
8735 +
8736 +#include "AMDGPUISelLowering.h"
8737 +#include "AMDGPURegisterInfo.h"
8738 +#include "AMDILDevices.h"
8739 +#include "AMDILIntrinsicInfo.h"
8740 +#include "AMDGPUSubtarget.h"
8741 +#include "llvm/CallingConv.h"
8742 +#include "llvm/CodeGen/MachineFrameInfo.h"
8743 +#include "llvm/CodeGen/MachineRegisterInfo.h"
8744 +#include "llvm/CodeGen/PseudoSourceValue.h"
8745 +#include "llvm/CodeGen/SelectionDAG.h"
8746 +#include "llvm/CodeGen/SelectionDAGNodes.h"
8747 +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
8748 +#include "llvm/DerivedTypes.h"
8749 +#include "llvm/Instructions.h"
8750 +#include "llvm/Intrinsics.h"
8751 +#include "llvm/Support/raw_ostream.h"
8752 +#include "llvm/Target/TargetInstrInfo.h"
8753 +#include "llvm/Target/TargetOptions.h"
8754 +
8755 +using namespace llvm;
8756 +//===----------------------------------------------------------------------===//
8757 +// Calling Convention Implementation
8758 +//===----------------------------------------------------------------------===//
8759 +#include "AMDGPUGenCallingConv.inc"
8760 +
8761 +//===----------------------------------------------------------------------===//
8762 +// TargetLowering Implementation Help Functions End
8763 +//===----------------------------------------------------------------------===//
8764 +
8765 +//===----------------------------------------------------------------------===//
8766 +// TargetLowering Class Implementation Begins
8767 +//===----------------------------------------------------------------------===//
8768 +void AMDGPUTargetLowering::InitAMDILLowering() { // Operation-action setup.
8769 +  int types[] = { // Every value type visited by the first loop below.
8770 +    (int)MVT::i8,
8771 +    (int)MVT::i16,
8772 +    (int)MVT::i32,
8773 +    (int)MVT::f32,
8774 +    (int)MVT::f64,
8775 +    (int)MVT::i64,
8776 +    (int)MVT::v2i8,
8777 +    (int)MVT::v4i8,
8778 +    (int)MVT::v2i16,
8779 +    (int)MVT::v4i16,
8780 +    (int)MVT::v4f32,
8781 +    (int)MVT::v4i32,
8782 +    (int)MVT::v2f32,
8783 +    (int)MVT::v2i32,
8784 +    (int)MVT::v2f64,
8785 +    (int)MVT::v2i64
8786 +  };
8787 +
8788 +  int IntTypes[] = { // Scalar integer types.
8789 +    (int)MVT::i8,
8790 +    (int)MVT::i16,
8791 +    (int)MVT::i32,
8792 +    (int)MVT::i64
8793 +  };
8794 +
8795 +  int FloatTypes[] = { // Scalar floating point types.
8796 +    (int)MVT::f32,
8797 +    (int)MVT::f64
8798 +  };
8799 +
8800 +  int VectorTypes[] = { // Vector types.
8801 +    (int)MVT::v2i8,
8802 +    (int)MVT::v4i8,
8803 +    (int)MVT::v2i16,
8804 +    (int)MVT::v4i16,
8805 +    (int)MVT::v4f32,
8806 +    (int)MVT::v4i32,
8807 +    (int)MVT::v2f32,
8808 +    (int)MVT::v2i32,
8809 +    (int)MVT::v2f64,
8810 +    (int)MVT::v2i64
8811 +  };
8812 +  size_t NumTypes = sizeof(types) / sizeof(*types);
8813 +  size_t NumFloatTypes = sizeof(FloatTypes) / sizeof(*FloatTypes);
8814 +  size_t NumIntTypes = sizeof(IntTypes) / sizeof(*IntTypes);
8815 +  size_t NumVectorTypes = sizeof(VectorTypes) / sizeof(*VectorTypes);
8816 +
8817 +  const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>();
8818 +  // Each call below records an action (Expand, Custom or Legal) for one
8819 +  // (operation, value type) pair.  STM gates the LongOps/DoubleOps sections.
8820 +
8821 +  for (unsigned int x  = 0; x < NumTypes; ++x) {
8822 +    MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
8823 +
8824 +    // FIXME: SIGN_EXTEND_INREG is not meaningful for floating point types.
8825 +    // There is no native sextinreg, so it is marked Custom for every type.
8826 +    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
8827 +    setOperationAction(ISD::SUBE, VT, Expand);
8828 +    setOperationAction(ISD::SUBC, VT, Expand);
8829 +    setOperationAction(ISD::ADDE, VT, Expand);
8830 +    setOperationAction(ISD::ADDC, VT, Expand);
8831 +    setOperationAction(ISD::BRCOND, VT, Custom);
8832 +    setOperationAction(ISD::BR_JT, VT, Expand);
8833 +    setOperationAction(ISD::BRIND, VT, Expand);
8834 +    // TODO: Implement custom UREM/SREM routines
8835 +    setOperationAction(ISD::SREM, VT, Expand);
8836 +    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
8837 +    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
8838 +    if (VT != MVT::i64 && VT != MVT::v2i64) { // 64-bit SDIV is left alone
8839 +      setOperationAction(ISD::SDIV, VT, Custom);
8840 +    }
8841 +  }
8842 +  for (unsigned int x = 0; x < NumFloatTypes; ++x) {
8843 +    MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x];
8844 +    // IL does not have these operations for floating point types.
8845 +    // NOTE(review): SETxx are condition codes; setCondCodeAction intended?
8846 +    setOperationAction(ISD::FP_ROUND_INREG, VT, Expand);
8847 +    setOperationAction(ISD::SETOLT, VT, Expand);
8848 +    setOperationAction(ISD::SETOGE, VT, Expand);
8849 +    setOperationAction(ISD::SETOGT, VT, Expand);
8850 +    setOperationAction(ISD::SETOLE, VT, Expand);
8851 +    setOperationAction(ISD::SETULT, VT, Expand);
8852 +    setOperationAction(ISD::SETUGE, VT, Expand);
8853 +    setOperationAction(ISD::SETUGT, VT, Expand);
8854 +    setOperationAction(ISD::SETULE, VT, Expand);
8855 +  }
8856 +
8857 +  for (unsigned int x = 0; x < NumIntTypes; ++x) {
8858 +    MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x];
8859 +
8860 +    // GPU also does not have divrem function for signed or unsigned
8861 +    setOperationAction(ISD::SDIVREM, VT, Expand); // NOTE(review): no UDIVREM
8862 +
8863 +    // GPU does not have [S|U]MUL_LOHI functions as a single instruction
8864 +    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
8865 +    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
8866 +
8867 +    // GPU doesn't have a rotl, rotr, or byteswap instruction
8868 +    setOperationAction(ISD::ROTR, VT, Expand); // NOTE(review): ROTL not set
8869 +    setOperationAction(ISD::BSWAP, VT, Expand);
8870 +
8871 +    // GPU doesn't have any counting operators
8872 +    setOperationAction(ISD::CTPOP, VT, Expand);
8873 +    setOperationAction(ISD::CTTZ, VT, Expand);
8874 +    setOperationAction(ISD::CTLZ, VT, Expand);
8875 +  }
8876 +
8877 +  for (unsigned int ii = 0; ii < NumVectorTypes; ++ii) {
8878 +    MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii];
8879 +    // Vector forms of these operations are always expanded.
8880 +    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
8881 +    setOperationAction(ISD::SDIVREM, VT, Expand);
8882 +    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
8883 +    // setOperationAction(ISD::VSETCC, VT, Expand);
8884 +    setOperationAction(ISD::SELECT_CC, VT, Expand);
8885 +
8886 +  }
8887 +  if (STM.device()->isSupported(AMDGPUDeviceInfo::LongOps)) {
8888 +    setOperationAction(ISD::MULHU, MVT::i64, Expand);
8889 +    setOperationAction(ISD::MULHU, MVT::v2i64, Expand);
8890 +    setOperationAction(ISD::MULHS, MVT::i64, Expand);
8891 +    setOperationAction(ISD::MULHS, MVT::v2i64, Expand);
8892 +    setOperationAction(ISD::ADD, MVT::v2i64, Expand);
8893 +    setOperationAction(ISD::SREM, MVT::v2i64, Expand);
8894 +    setOperationAction(ISD::Constant          , MVT::i64  , Legal);
8895 +    setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
8896 +    setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand);
8897 +    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand);
8898 +    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand);
8899 +    setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand);
8900 +  }
8901 +  if (STM.device()->isSupported(AMDGPUDeviceInfo::DoubleOps)) {
8902 +    // we support loading/storing v2f64 but not operations on the type
8903 +    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
8904 +    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
8905 +    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
8906 +    setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand);
8907 +    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
8908 +    setOperationAction(ISD::ConstantFP        , MVT::f64  , Legal);
8909 +    // We want to expand vector conversions into their scalar
8910 +    // counterparts.
8911 +    setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand);
8912 +    setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand);
8913 +    setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand);
8914 +    setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand);
8915 +    setOperationAction(ISD::FABS, MVT::f64, Expand);
8916 +    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
8917 +  }
8918 +  // TODO: Fix the UDIV24 algorithm so it works for these
8919 +  // types correctly. This needs vector comparisons
8920 +  // for this to work correctly.
8921 +  setOperationAction(ISD::UDIV, MVT::v2i8, Expand);
8922 +  setOperationAction(ISD::UDIV, MVT::v4i8, Expand);
8923 +  setOperationAction(ISD::UDIV, MVT::v2i16, Expand);
8924 +  setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
8925 +  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
8926 +  setOperationAction(ISD::SUBC, MVT::Other, Expand);
8927 +  setOperationAction(ISD::ADDE, MVT::Other, Expand);
8928 +  setOperationAction(ISD::ADDC, MVT::Other, Expand);
8929 +  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
8930 +  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
8931 +  setOperationAction(ISD::BRIND, MVT::Other, Expand);
8932 +  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
8933 +
8934 +
8935 +  // 32-bit integer and floating point constants are Legal as they are.
8936 +  setOperationAction(ISD::ConstantFP        , MVT::f32    , Legal);
8937 +  setOperationAction(ISD::Constant          , MVT::i32    , Legal);
8938 +
8939 +  setSchedulingPreference(Sched::RegPressure);
8940 +  setPow2DivIsCheap(false);
8941 +  setSelectIsExpensive(true);
8942 +  setJumpIsExpensive(true);
8943 +  // The same store-count limit is used for memcpy, memmove and memset.
8944 +  maxStoresPerMemcpy  = 4096;
8945 +  maxStoresPerMemmove = 4096;
8946 +  maxStoresPerMemset  = 4096;
8947 +
8948 +}
8949 +
8950 +bool
8951 +AMDGPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
8952 +    const CallInst &I, unsigned Intrinsic) const {
8953 +  return false; // Always false; Info, I and Intrinsic are not examined.
8954 +}
8955 +
8956 +/// \brief Decide whether a floating point immediate of type \p VT is legal.
8957 +///
8958 +/// Only the scalar element type of \p VT is examined (f32 and f64 are
8959 +/// accepted); the value of \p Imm plays no part in the decision.
8960 +bool
8961 +AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
8962 +  const MVT::SimpleValueType EltTy =
8963 +      VT.getScalarType().getSimpleVT().SimpleTy;
8964 +  return EltTy == MVT::f32 || EltTy == MVT::f64;
8965 +}
8966 +
8967 +bool
8968 +AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
8969 +  // f32 and f64 constants are never shrunk; every other element type may be.
8970 +  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
8971 +  case MVT::f32:
8972 +  case MVT::f64: return false;
8973 +  default:       return true;
8974 +  }
8975 +}
8976 +
8977 +
8978 +/// \brief Report the bits of \p Op that are known to be zero or one.
8979 +///
8980 +/// Only SELECT_CC is handled: its result is one of the two selected values,
8981 +/// so a bit is reported only when it is known in both of them.
8982 +void
8983 +AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
8984 +    const SDValue Op,
8985 +    APInt &KnownZero,
8986 +    APInt &KnownOne,
8987 +    const SelectionDAG &DAG,
8988 +    unsigned Depth) const {
8989 +  APInt KnownZero2;
8990 +  APInt KnownOne2;
8991 +  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything
8992 +  switch (Op.getOpcode()) {
8993 +    default: break;
8994 +    case ISD::SELECT_CC:
8995 +             // Operands 0 and 1 of SELECT_CC are the values being compared;
8996 +             // the true and false results are operands 2 and 3, and those
8997 +             // are the ones the result bits come from.
8998 +             DAG.ComputeMaskedBits(Op.getOperand(2), KnownZero, KnownOne,
8999 +                                   Depth + 1);
9000 +
9001 +             // Depth + 1 is passed for both operands so that neither
9002 +             // query restarts from the default depth.
9003 +             DAG.ComputeMaskedBits(Op.getOperand(3), KnownZero2, KnownOne2,
9004 +                                   Depth + 1);
9005 +
9006 +             assert((KnownZero & KnownOne) == 0
9007 +                 && "Bits known to be one AND zero?");
9008 +             assert((KnownZero2 & KnownOne2) == 0
9009 +                 && "Bits known to be one AND zero?");
9010 +             // Only known if known in both the true and the false value.
9011 +             KnownOne &= KnownOne2;
9012 +             KnownZero &= KnownZero2;
9013 +             break;
9014 +  }
9015 +}
9016 +
9017 +//===----------------------------------------------------------------------===//
9018 +//                           Other Lowering Hooks
9019 +//===----------------------------------------------------------------------===//
9020 +
9021 +SDValue
9022 +AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
9023 +  // Dispatch on the scalar element type so vectors share the scalar paths.
9024 +  const EVT ScalarVT = Op.getValueType().getScalarType();
9025 +
9026 +  if (ScalarVT == MVT::i64)
9027 +    return LowerSDIV64(Op, DAG);
9028 +  if (ScalarVT == MVT::i32)
9029 +    return LowerSDIV32(Op, DAG);
9030 +  // Both 8- and 16-bit divides are handled by LowerSDIV24.
9031 +  if (ScalarVT == MVT::i16 || ScalarVT == MVT::i8)
9032 +    return LowerSDIV24(Op, DAG);
9033 +
9034 +  // Any other element type: return result 0 of the original node.
9035 +  return SDValue(Op.getNode(), 0);
9036 +}
9037 +
9038 +SDValue
9039 +AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
9040 +  // Pick the lowering routine from the scalar element width.
9041 +  const EVT ScalarVT = Op.getValueType().getScalarType();
9042 +
9043 +  if (ScalarVT == MVT::i64)
9044 +    return LowerSREM64(Op, DAG);
9045 +  if (ScalarVT == MVT::i32)
9046 +    return LowerSREM32(Op, DAG);
9047 +  if (ScalarVT == MVT::i16)
9048 +    return LowerSREM16(Op, DAG);
9049 +  if (ScalarVT == MVT::i8)
9050 +    return LowerSREM8(Op, DAG);
9051 +
9052 +  // Any other element type: return result 0 of the original node.
9053 +  return SDValue(Op.getNode(), 0);
9054 +}
9055 +
9056 +SDValue
9057 +AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const {
9058 +  // Operand 0 is the value; operand 1 carries the type to sign extend from.
9059 +  const EVT OrigVT = Op.getOperand(0).getValueType();
9060 +  const EVT FromVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
9061 +  DebugLoc DL = Op.getDebugLoc();
9062 +  const unsigned FromBits = FromVT.getScalarType().getSizeInBits();
9063 +  const unsigned ValBits =
9064 +      OrigVT.isSimple() ? OrigVT.getScalarType().getSizeInBits() : 1;
9065 +  const bool Widen = ValBits < 32;
9066 +
9067 +  SDValue Val = Op.getOperand(0);
9068 +  EVT WorkVT = OrigVT;
9069 +  unsigned ShAmt = ValBits - FromBits;
9070 +  if (Widen) {
9071 +    // Values narrower than 32 bits are zero extended to 32-bit elements
9072 +    // first, and the shift amount is recomputed for that width.
9073 +    WorkVT = genIntType(32,
9074 +        OrigVT.isVector() ? OrigVT.getVectorNumElements() : 1);
9075 +    Val = DAG.getNode(ISD::ZERO_EXTEND, DL, WorkVT, Val);
9076 +    ShAmt = 32 - FromBits;
9077 +  }
9078 +  // A left shift followed by an arithmetic right shift by the same amount.
9079 +  SDValue Amt = DAG.getConstant(ShAmt, WorkVT);
9080 +  Val = DAG.getNode(ISD::SHL, DL, WorkVT, Val, Amt);
9081 +  Val = DAG.getNode(ISD::SRA, DL, WorkVT, Val, Amt);
9082 +
9083 +  // Convert back to the original type if the value was widened above.
9084 +  return Widen ? DAG.getSExtOrTrunc(Val, DL, OrigVT) : Val;
9085 +}
9086 +/// \brief Build a 32- or 64-bit integer type covering size * numEle bits.
9087 +///
9088 +/// \param size    Element width in bits; 64 selects i64 lanes, any other
9089 +///                value selects i32 lanes.
9090 +/// \param numEle  Number of elements of that width.
9091 +/// \returns The scalar lane type when one lane is enough, otherwise a vector
9092 +///          of lanes.
9093 +EVT
9094 +AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const {
9095 +  const bool Is64 = (size == 64);
9096 +  const int TotalBits = (size * numEle);
9097 +  // Divide by the lane width (64 or 32) and never go below a single lane.
9098 +  int Lanes = TotalBits >> (Is64 ? 6 : 5);
9099 +  if (Lanes == 0)
9100 +    Lanes = 1;
9101 +
9102 +  const MVT LaneVT = Is64 ? MVT::i64 : MVT::i32;
9103 +  if (Lanes == 1)
9104 +    return EVT(LaneVT);
9105 +  return EVT(MVT::getVectorVT(LaneVT, Lanes));
9106 +}
9107 +
9108 +SDValue
9109 +AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
9110 +  // Rebuild the branch as a BRANCH_COND node.  Operand 0 (the chain) stays
9111 +  // first; operands 1 and 2 are passed in swapped order, i.e. the original
9112 +  // operand 2 comes before the original operand 1 (the condition).
9113 +  SDValue InChain = Op.getOperand(0);
9114 +  SDValue Test    = Op.getOperand(1);
9115 +  SDValue Target  = Op.getOperand(2);
9116 +
9117 +  return DAG.getNode(AMDGPUISD::BRANCH_COND,
9118 +                     Op.getDebugLoc(), Op.getValueType(),
9119 +                     InChain, Target, Test);
9120 +}
9121 +
9122 +/// \brief Lower an 8- or 16-bit signed divide through a float divide.
9123 +///
9124 +/// The operands are sign extended to 32 bits, divided as floats, and the
9125 +/// truncated quotient is adjusted by jq (+1 or -1) when |fr| >= |fb|.
9126 +SDValue
9127 +AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
9128 +  DebugLoc DL = Op.getDebugLoc();
9129 +  EVT OVT = Op.getValueType();
9130 +  SDValue LHS = Op.getOperand(0);
9131 +  SDValue RHS = Op.getOperand(1);
9132 +  MVT INTTY;
9133 +  MVT FLTTY;
9134 +  if (!OVT.isVector()) {
9135 +    INTTY = MVT::i32;
9136 +    FLTTY = MVT::f32;
9137 +  } else if (OVT.getVectorNumElements() == 2) {
9138 +    INTTY = MVT::v2i32;
9139 +    FLTTY = MVT::v2f32;
9140 +  } else if (OVT.getVectorNumElements() == 4) {
9141 +    INTTY = MVT::v4i32;
9142 +    FLTTY = MVT::v4f32;
9143 +  }
9144 +  unsigned bitsize = OVT.getScalarType().getSizeInBits();
9145 +  // char|short jq = ia ^ ib;
9146 +  SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
9147 +
9148 +  // jq = jq >> (bitsize - 2)
9149 +  jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));
9150 +
9151 +  // jq = jq | 0x1
9152 +  jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
9153 +
9154 +  // jq = (int)jq
9155 +  jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
9156 +
9157 +  // int ia = (int)LHS;
9158 +  SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
9159 +
9160 +  // int ib = (int)RHS;
9161 +  SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
9162 +
9163 +  // float fa = (float)ia;
9164 +  SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
9165 +
9166 +  // float fb = (float)ib;
9167 +  SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
9168 +
9169 +  // float fq = native_divide(fa, fb);
9170 +  SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb);
9171 +
9172 +  // fq = trunc(fq);
9173 +  fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
9174 +
9175 +  // float fqneg = -fq;
9176 +  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
9177 +
9178 +  // float fr = mad(fqneg, fb, fa);
9179 +  SDValue fr = DAG.getNode(AMDGPUISD::MAD, DL, FLTTY, fqneg, fb, fa);
9180 +
9181 +  // int iq = (int)fq;
9182 +  SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
9183 +
9184 +  // fr = fabs(fr);
9185 +  fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
9186 +
9187 +  // fb = fabs(fb);
9188 +  fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
9189 +
9190 +  // int cv = fr >= fb;
9191 +  SDValue cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
9192 +  // jq and iq are INTTY values here, so the select and the add stay in
9193 +  // INTTY; the result is converted to OVT once at the end.
9194 +  // jq = (cv ? jq : 0);
9195 +  jq = DAG.getNode(ISD::SELECT, DL, INTTY, cv, jq,
9196 +      DAG.getConstant(0, INTTY));
9197 +  // dst = iq + jq;
9198 +  iq = DAG.getNode(ISD::ADD, DL, INTTY, iq, jq);
9199 +  return DAG.getSExtOrTrunc(iq, DL, OVT);
9200 +}
9201 +
9202 +SDValue
9203 +AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
9204 +  DebugLoc DL = Op.getDebugLoc();
9205 +  EVT OVT = Op.getValueType();
9206 +  SDValue LHS = Op.getOperand(0);
9207 +  SDValue RHS = Op.getOperand(1);
9208 +  // Signed division built on top of an unsigned divide:
9209 +  //   r10 = (LHS < 0) ? -1 : 0            sign mask of the dividend
9210 +  //   r11 = (RHS < 0) ? -1 : 0            sign mask of the divisor
9211 +  //   r0  = (LHS + r10) ^ r10             |LHS|
9212 +  //   r1  = (RHS + r11) ^ r11             |RHS|
9213 +  //   r0  = r0 udiv r1                    unsigned quotient
9214 +  //   r10 = r10 ^ r11                     sign mask of the result
9215 +  //   DST = (r0 + r10) ^ r10              quotient with the sign applied
9216 +  //
9217 +  // The sign masks are created with the operation's own type (OVT) so that
9218 +  // every node below sees operands whose type matches its result type.
9219 +  // The per-step comments keep the original IL mnemonics (ilt, iadd, ixor,
9220 +  // udiv) for reference.
9221 +
9222 +  // mov r0, LHS
9223 +  SDValue r0 = LHS;
9224 +
9225 +  // mov r1, RHS
9226 +  SDValue r1 = RHS;
9227 +
9228 +  // ilt r10, r0, 0
9229 +  SDValue r10 = DAG.getSelectCC(DL,
9230 +      r0, DAG.getConstant(0, OVT),
9231 +      DAG.getConstant(-1, OVT),
9232 +      DAG.getConstant(0, OVT),
9233 +      ISD::SETLT);
9234 +
9235 +  // ilt r11, r1, 0
9236 +  SDValue r11 = DAG.getSelectCC(DL,
9237 +      r1, DAG.getConstant(0, OVT),
9238 +      DAG.getConstant(-1, OVT),
9239 +      DAG.getConstant(0, OVT),
9240 +      ISD::SETLT);
9241 +
9242 +  // iadd r0, r0, r10
9243 +  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
9244 +
9245 +  // iadd r1, r1, r11
9246 +  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
9247 +
9248 +  // ixor r0, r0, r10
9249 +  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
9250 +
9251 +  // ixor r1, r1, r11
9252 +  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
9253 +
9254 +  // udiv r0, r0, r1
9255 +  r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
9256 +
9257 +  // ixor r10, r10, r11
9258 +  r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
9259 +
9260 +  // iadd r0, r0, r10
9261 +  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
9262 +
9263 +  // ixor DST, r0, r10
9264 +  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
9265 +  return DST;
9266 +}
9267 +
9268 +SDValue
9269 +AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
9270 +  return SDValue(Op.getNode(), 0); // No lowering: result 0 of the same node.
9271 +}
9272 +
9273 +SDValue
9274 +AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const {
9275 +  // Widen both operands to 32-bit elements, take the remainder there and
9276 +  // convert the result back to the 8-bit type.
9277 +  const EVT NarrowVT = Op.getValueType();
9278 +  const MVT WideVT = (NarrowVT == MVT::v2i8) ? MVT::v2i32
9279 +                   : (NarrowVT == MVT::v4i8) ? MVT::v4i32
9280 +                                             : MVT::i32;
9281 +  DebugLoc DL = Op.getDebugLoc();
9282 +
9283 +  SDValue Num = DAG.getSExtOrTrunc(Op.getOperand(0), DL, WideVT);
9284 +  SDValue Den = DAG.getSExtOrTrunc(Op.getOperand(1), DL, WideVT);
9285 +  SDValue Rem = DAG.getNode(ISD::SREM, DL, WideVT, Num, Den);
9286 +
9287 +  return DAG.getSExtOrTrunc(Rem, DL, NarrowVT);
9288 +}
9289 +
9290 +SDValue
9291 +AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const {
9292 +  // Widen both operands to 32-bit elements, take the remainder there and
9293 +  // convert the result back to the 16-bit type.
9294 +  const EVT NarrowVT = Op.getValueType();
9295 +  const MVT WideVT = (NarrowVT == MVT::v2i16) ? MVT::v2i32
9296 +                   : (NarrowVT == MVT::v4i16) ? MVT::v4i32
9297 +                                              : MVT::i32;
9298 +  DebugLoc DL = Op.getDebugLoc();
9299 +
9300 +  SDValue Num = DAG.getSExtOrTrunc(Op.getOperand(0), DL, WideVT);
9301 +  SDValue Den = DAG.getSExtOrTrunc(Op.getOperand(1), DL, WideVT);
9302 +  SDValue Rem = DAG.getNode(ISD::SREM, DL, WideVT, Num, Den);
9303 +
9304 +  return DAG.getSExtOrTrunc(Rem, DL, NarrowVT);
9305 +}
9306 +
9307 +SDValue
9308 +AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
9309 +  DebugLoc DL = Op.getDebugLoc();
9310 +  EVT OVT = Op.getValueType();
9311 +  SDValue LHS = Op.getOperand(0);
9312 +  SDValue RHS = Op.getOperand(1);
9313 +  // Signed remainder built on top of an unsigned divide:
9314 +  //   r10 = (LHS < 0)                     sign mask of the dividend
9315 +  //   r11 = (RHS < 0)                     sign mask of the divisor
9316 +  //   r0  = (LHS + r10) ^ r10             |LHS|
9317 +  //   r1  = (RHS + r11) ^ r11             |RHS|
9318 +  //   r20 = (r0 udiv r1) * r1             largest multiple of |RHS| <= |LHS|
9319 +  //   r0  = r0 - r20                      unsigned remainder
9320 +  //   DST = (r0 + r10) ^ r10              remainder with the dividend's sign
9321 +  //
9322 +  // r20 must come from UDIV: taking UREM at that step would subtract
9323 +  // (r0 urem r1) * r1 instead of the quotient multiple (7 rem 3 would give
9324 +  // 7 - 1 * 3 = 4 rather than 7 - 2 * 3 = 1).
9325 +  // NOTE(review): the add/xor trick needs r10/r11 to be all-ones when true;
9326 +  // confirm the target's setcc result is 0/-1 for OVT.
9327 +
9328 +  // mov r0, LHS
9329 +  SDValue r0 = LHS;
9330 +
9331 +  // mov r1, RHS
9332 +  SDValue r1 = RHS;
9333 +
9334 +  // ilt r10, r0, 0
9335 +  SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);
9336 +
9337 +  // ilt r11, r1, 0
9338 +  SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);
9339 +
9340 +  // iadd r0, r0, r10
9341 +  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
9342 +
9343 +  // iadd r1, r1, r11
9344 +  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
9345 +
9346 +  // ixor r0, r0, r10
9347 +  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
9348 +
9349 +  // ixor r1, r1, r11
9350 +  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
9351 +
9352 +  // udiv r20, r0, r1
9353 +  SDValue r20 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
9354 +
9355 +  // umul r20, r20, r1
9356 +  r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);
9357 +
9358 +  // sub r0, r0, r20
9359 +  r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
9360 +
9361 +  // iadd r0, r0, r10
9362 +  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
9363 +
9364 +  // ixor DST, r0, r10
9365 +  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
9366 +  return DST;
9367 +}
9368 +
9369 +SDValue
9370 +AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
9371 +  return SDValue(Op.getNode(), 0); // No lowering: result 0 of the same node.
9372 +}
9373 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILNIDevice.cpp llvm-r600/lib/Target/R600/AMDILNIDevice.cpp
9374 --- llvm-3.2.src/lib/Target/R600/AMDILNIDevice.cpp      1970-01-01 01:00:00.000000000 +0100
9375 +++ llvm-r600/lib/Target/R600/AMDILNIDevice.cpp 2013-01-25 19:43:57.446716388 +0100
9376 @@ -0,0 +1,65 @@
9377 +//===-- AMDILNIDevice.cpp - Device Info for Northern Islands devices ------===//
9378 +//
9379 +//                     The LLVM Compiler Infrastructure
9380 +//
9381 +// This file is distributed under the University of Illinois Open Source
9382 +// License. See LICENSE.TXT for details.
9383 +//
9384 +/// \file
9385 +//==-----------------------------------------------------------------------===//
9386 +#include "AMDILNIDevice.h"
9387 +#include "AMDILEvergreenDevice.h"
9388 +#include "AMDGPUSubtarget.h"
9389 +
9390 +using namespace llvm;
9391 +
9392 +AMDGPUNIDevice::AMDGPUNIDevice(AMDGPUSubtarget *ST)
9393 +  : AMDGPUEvergreenDevice(ST) {
9394 +  // Translate the device name into its flag; unlisted names become Barts.
9395 +  const std::string DevName = ST->getDeviceName();
9396 +  if (DevName == "cayman")
9397 +    DeviceFlag = OCL_DEVICE_CAYMAN;
9398 +  else if (DevName == "turks")
9399 +    DeviceFlag = OCL_DEVICE_TURKS;
9400 +  else if (DevName == "caicos")
9401 +    DeviceFlag = OCL_DEVICE_CAICOS;
9402 +  else
9403 +    DeviceFlag = OCL_DEVICE_BARTS;
9404 +}
9405 +AMDGPUNIDevice::~AMDGPUNIDevice() { // Empty body: nothing is done here.
9406 +}
9407 +
9408 +size_t
9409 +AMDGPUNIDevice::getMaxLDSSize() const {
9410 +  // A non-zero size is reported only when LocalMem is used in hardware.
9411 +  const bool HasHWLocalMem = usesHardware(AMDGPUDeviceInfo::LocalMem);
9412 +  if (!HasHWLocalMem)
9413 +    return 0;
9414 +  return MAX_LDS_SIZE_900;
9415 +}
9416 +
9417 +uint32_t
9418 +AMDGPUNIDevice::getGeneration() const {
9419 +  return AMDGPUDeviceInfo::HD6XXX; // Fixed generation value for this class.
9420 +}
9421 +
9422 +
9423 +AMDGPUCaymanDevice::AMDGPUCaymanDevice(AMDGPUSubtarget *ST)
9424 +  : AMDGPUNIDevice(ST) {
9425 +  setCaps(); // Adjust the capability bits after the NI base is constructed.
9426 +}
9427 +
9428 +AMDGPUCaymanDevice::~AMDGPUCaymanDevice() { // Empty body: nothing is done here.
9429 +}
9430 +
9431 +void
9432 +AMDGPUCaymanDevice::setCaps() {
9433 +  if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) { // only when overridden
9434 +    mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
9435 +    mHWBits.set(AMDGPUDeviceInfo::FMA);
9436 +  }
9437 +  mHWBits.set(AMDGPUDeviceInfo::Signed24BitOps);   // set in mHWBits ...
9438 +  mSWBits.reset(AMDGPUDeviceInfo::Signed24BitOps); // ... cleared in mSWBits
9439 +  mSWBits.set(AMDGPUDeviceInfo::ArenaSegment);
9440 +}
9441 +
9442 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILNIDevice.h llvm-r600/lib/Target/R600/AMDILNIDevice.h
9443 --- llvm-3.2.src/lib/Target/R600/AMDILNIDevice.h        1970-01-01 01:00:00.000000000 +0100
9444 +++ llvm-r600/lib/Target/R600/AMDILNIDevice.h   2013-01-25 19:43:57.446716388 +0100
9445 @@ -0,0 +1,57 @@
9446 +//===------- AMDILNIDevice.h - Define NI Device for AMDIL -*- C++ -*------===//
9447 +//
9448 +//                     The LLVM Compiler Infrastructure
9449 +//
9450 +// This file is distributed under the University of Illinois Open Source
9451 +// License. See LICENSE.TXT for details.
9452 +//
9453 +//==-----------------------------------------------------------------------===//
9454 +/// \file
9455 +/// \brief Interface for the subtarget data classes.
9456 +///
9457 +/// This file will define the interface that each generation needs to
9458 +/// implement in order to correctly answer queries on the capabilities of the
9459 +/// specific hardware.
9460 +//===---------------------------------------------------------------------===//
9461 +#ifndef AMDILNIDEVICE_H
9462 +#define AMDILNIDEVICE_H
9463 +#include "AMDILEvergreenDevice.h"
9464 +#include "AMDGPUSubtarget.h"
9465 +
9466 +namespace llvm {
9467 +
9468 +class AMDGPUSubtarget;
9469 +//===---------------------------------------------------------------------===//
9470 +// NI generation of devices and their respective sub classes
9471 +//===---------------------------------------------------------------------===//
9472 +
9473 +/// \brief The AMDGPUNIDevice is the base class for all Northern Islands series
9474 +/// cards.
9475 +///
9476 +/// It is very similar to the AMDGPUEvergreenDevice, with the major
9477 +/// exception being differences in wavefront size and hardware capabilities.  The
9478 +/// NI devices are all 64 wide wavefronts and also add support for signed 24 bit
9479 +/// integer operations.
9480 +class AMDGPUNIDevice : public AMDGPUEvergreenDevice {
9481 +public:
9482 +  AMDGPUNIDevice(AMDGPUSubtarget*);        // picks DeviceFlag from the name
9483 +  virtual ~AMDGPUNIDevice();
9484 +  virtual size_t getMaxLDSSize() const;    // MAX_LDS_SIZE_900 or 0
9485 +  virtual uint32_t getGeneration() const;  // AMDGPUDeviceInfo::HD6XXX
9486 +};
9487 +
9488 +/// Just as the AMDGPUCypressDevice is the double capable version of the
9489 +/// AMDGPUEvergreenDevice, the AMDGPUCaymanDevice is the double capable version
9490 +/// of the AMDGPUNIDevice.  The other major difference is that the Cayman device
9491 +/// has 4 wide ALUs, whereas the rest of the NI family is 5 wide.
9492 +class AMDGPUCaymanDevice: public AMDGPUNIDevice {
9493 +public:
9494 +  AMDGPUCaymanDevice(AMDGPUSubtarget*); // calls setCaps()
9495 +  virtual ~AMDGPUCaymanDevice();
9496 +private:
9497 +  virtual void setCaps();               // updates mHWBits / mSWBits
9498 +};
9499 +
9500 +static const unsigned int MAX_LDS_SIZE_900 = AMDGPUDevice::MAX_LDS_SIZE_800; // same value as MAX_LDS_SIZE_800
9501 +} // namespace llvm
9502 +#endif // AMDILNIDEVICE_H
9503 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILPeepholeOptimizer.cpp llvm-r600/lib/Target/R600/AMDILPeepholeOptimizer.cpp
9504 --- llvm-3.2.src/lib/Target/R600/AMDILPeepholeOptimizer.cpp     1970-01-01 01:00:00.000000000 +0100
9505 +++ llvm-r600/lib/Target/R600/AMDILPeepholeOptimizer.cpp        2013-01-25 19:43:57.450049721 +0100
9506 @@ -0,0 +1,1256 @@
9507 +//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
9508 +//
9509 +//                     The LLVM Compiler Infrastructure
9510 +//
9511 +// This file is distributed under the University of Illinois Open Source
9512 +// License. See LICENSE.TXT for details.
9513 +//
9514 +/// \file
9515 +//==-----------------------------------------------------------------------===//
9516 +
9517 +#define DEBUG_TYPE "PeepholeOpt"
9518 +#ifdef DEBUG
9519 +#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
9520 +#else
9521 +#define DEBUGME 0
9522 +#endif
9523 +
9524 +#include "AMDILDevices.h"
9525 +#include "AMDGPUInstrInfo.h"
9526 +#include "llvm/ADT/Statistic.h"
9527 +#include "llvm/ADT/StringExtras.h"
9528 +#include "llvm/ADT/StringRef.h"
9529 +#include "llvm/ADT/Twine.h"
9530 +#include "llvm/Constants.h"
9531 +#include "llvm/CodeGen/MachineFunction.h"
9532 +#include "llvm/CodeGen/MachineFunctionAnalysis.h"
9533 +#include "llvm/Function.h"
9534 +#include "llvm/Instructions.h"
9535 +#include "llvm/Module.h"
9536 +#include "llvm/Support/Debug.h"
9537 +#include "llvm/Support/MathExtras.h"
9538 +
9539 +#include <sstream>
9540 +
9541 +#if 0
9542 +STATISTIC(PointerAssignments, "Number of dynamic pointer "
9543 +    "assigments discovered");
9544 +STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
9545 +#endif
9546 +
9547 +using namespace llvm;
9548 +// The Peephole optimization pass is used to do simple last minute optimizations
9549 +// that are required for correct code or to remove redundant functions
9550 +namespace {
9551 +
9552 +class OpaqueType;
9553 +
9554 +class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
9555 +public:
9556 +  TargetMachine &TM; // Target machine supplied to the constructor.
9557 +  static char ID; // Passed to the FunctionPass base-class constructor.
9558 +  AMDGPUPeepholeOpt(TargetMachine &tm);
9559 +  ~AMDGPUPeepholeOpt();
9560 +  const char *getPassName() const;
9561 +  bool runOnFunction(Function &F);
9562 +  bool doInitialization(Module &M);
9563 +  bool doFinalization(Module &M);
9564 +  void getAnalysisUsage(AnalysisUsage &AU) const;
9565 +protected:
9566 +private:
9567 +  // Function to initiate all of the instruction level optimizations.
9568 +  bool instLevelOptimizations(BasicBlock::iterator *inst);
9569 +  // Quick check to see if we need to dump all of the pointers into the
9570 +  // arena. If this is correct, then we set all pointers to exist in arena. This
9571 +  // is a workaround for aliasing of pointers in a struct/union.
9572 +  bool dumpAllIntoArena(Function &F);
9573 +  // To avoid invalidating anything while safeNestedForEach is iterating,
9574 +  // atomic conversions are pushed to a vector (atomicFuncs) and handled
9575 +  // later. This function does the queued conversions if required.
9576 +  void doAtomicConversionIfNeeded(Function &F);
9577 +  // Because __amdil_is_constant cannot be properly evaluated if
9578 +  // optimizations are disabled, the calls are placed in a vector
9579 +  // and evaluated after the __amdil_image* functions are evaluated
9580 +  // which should allow the __amdil_is_constant function to be
9581 +  // evaluated correctly.
9582 +  void doIsConstCallConversionIfNeeded();
9583 +  bool mChanged; // Value returned by runOnFunction(); reset to false at its start.
9584 +  bool mDebug; // Set from DEBUGME in the constructor; gates the debug dumps.
9585 +  bool mConvertAtomics; // Set to true per function; cleared by optimizeCallInst().
9586 +  CodeGenOpt::Level optLevel; // Copy of TM.getOptLevel() taken in the constructor.
9587 +  // Run a series of tests to see if we can optimize a CALL instruction.
9588 +  bool optimizeCallInst(BasicBlock::iterator *bbb);
9589 +  // A peephole optimization to optimize bit extract sequences.
9590 +  bool optimizeBitExtract(Instruction *inst);
9591 +  // A peephole optimization to optimize bit insert sequences.
9592 +  bool optimizeBitInsert(Instruction *inst);
9593 +  bool setupBitInsert(Instruction *base,
9594 +                      Instruction *&src,
9595 +                      Constant *&mask,
9596 +                      Constant *&shift);
9597 +  // Expand the bit field insert instruction on versions of OpenCL that
9598 +  // don't support it.
9599 +  bool expandBFI(CallInst *CI);
9600 +  // Expand the bit field mask instruction on versions of OpenCL that
9601 +  // don't support it.
9602 +  bool expandBFM(CallInst *CI);
9603 +  // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
9604 +  // this case we need to expand them. These functions check for 24bit functions
9605 +  // and then expand.
9606 +  bool isSigned24BitOps(CallInst *CI);
9607 +  void expandSigned24BitOps(CallInst *CI);
9608 +  // One optimization that can occur is that if the required workgroup size is
9609 +  // specified then the result of get_local_size is known at compile time and
9610 +  // can be returned accordingly.
9611 +  bool isRWGLocalOpt(CallInst *CI);
9612 +  // On Northern Islands cards, the division is slightly less accurate than on
9613 +  // previous generations, so we need to utilize a more accurate division. So we
9614 +  // can translate the accurate divide to a normal divide on all other cards.
9615 +  bool convertAccurateDivide(CallInst *CI);
9616 +  void expandAccurateDivide(CallInst *CI);
9617 +  // If the alignment is set incorrectly, it can produce really inefficient
9618 +  // code. This checks for this scenario and fixes it if possible.
9619 +  bool correctMisalignedMemOp(Instruction *inst);
9620 +
9621 +  // If we are in no opt mode, then we need to make sure that
9622 +  // local samplers are properly propagated as constant propagation
9623 +  // doesn't occur and we need to know the value of kernel defined
9624 +  // samplers at compile time.
9625 +  bool propagateSamplerInst(CallInst *CI);
9626 +
9627 +  // Helper functions
9628 +
9629 +  // Group of functions that recursively calculate the size of a structure based
9630 +  // on its sub-types.
9631 +  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
9632 +  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
9633 +  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
9634 +  size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
9635 +  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
9636 +  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
9637 +  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
9638 +  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
9639 +
9640 +  LLVMContext *mCTX; // Context of the function being processed; set in runOnFunction().
9641 +  Function *mF; // Function being processed; set in runOnFunction().
9642 +  const AMDGPUSubtarget *mSTM; // Subtarget fetched from TM in runOnFunction().
9643 +  SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs; // Atomic calls paired with their "_noret" replacement callee.
9644 +  SmallVector<CallInst *, 16> isConstVec; // __amdil_is_constant calls deferred when optLevel is CodeGenOpt::None.
9645 +}; // class AMDGPUPeepholeOpt
9646 +  char AMDGPUPeepholeOpt::ID = 0;
9647 +
9648 +// A template function that has two levels of looping before calling the
9649 +// function with a pointer to the current iterator.
9650 +template<class InputIterator, class SecondIterator, class Function>
9651 +Function safeNestedForEach(InputIterator First, InputIterator Last,
9652 +                              SecondIterator S, Function F) {
9653 +  for ( ; First != Last; ++First) {
9654 +    SecondIterator sf, sl;
9655 +    for (sf = First->begin(), sl = First->end();
9656 +         sf != sl; )  {
9657 +      if (!F(&sf)) {
9658 +        ++sf;
9659 +      }
9660 +    }
9661 +  }
9662 +  return F;
9663 +}
9664 +
9665 +} // anonymous namespace
9666 +
9667 +namespace llvm {
9668 +  // Factory: returns a newly allocated AMDGPUPeepholeOpt constructed from tm.
9669 +  FunctionPass *createAMDGPUPeepholeOpt(TargetMachine &tm) {
9670 +    return new AMDGPUPeepholeOpt(tm);
9671 +  }
9672 +} // llvm namespace
9673 +
9674 +// Binds the pass to tm and gives every data member a defined value; the
9675 +// per-function members are assigned again at the top of runOnFunction().
9676 +AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
9677 +  : FunctionPass(ID), TM(tm), mChanged(false), mDebug(DEBUGME),
9678 +    mConvertAtomics(true), optLevel(tm.getOptLevel()),
9679 +    mCTX(NULL), mF(NULL), mSTM(NULL) {}
9680 +
9681 +AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt()  { // Empty: no explicit cleanup is performed here.
9682 +}
9683 +
9684 +// Returns the fixed, human-readable name of this pass.
9685 +const char *AMDGPUPeepholeOpt::getPassName() const  {
9686 +  return "AMDGPU PeepHole Optimization Pass";
9687 +}
9688 +
9689 +bool
9690 +containsPointerType(Type *Ty)  {
9691 +  if (!Ty) {
9692 +    return false;
9693 +  }
9694 +  switch(Ty->getTypeID()) {
9695 +  default:
9696 +    return false;
9697 +  case Type::StructTyID: {
9698 +    const StructType *ST = dyn_cast<StructType>(Ty);
9699 +    for (StructType::element_iterator stb = ST->element_begin(),
9700 +           ste = ST->element_end(); stb != ste; ++stb) {
9701 +      if (!containsPointerType(*stb)) {
9702 +        continue;
9703 +      }
9704 +      return true;
9705 +    }
9706 +    break;
9707 +  }
9708 +  case Type::VectorTyID:
9709 +  case Type::ArrayTyID:
9710 +    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
9711 +  case Type::PointerTyID:
9712 +    return true;
9713 +  };
9714 +  return false;
9715 +}
9716 +
9717 +bool
9718 +AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F)  {
9719 +  bool dumpAll = false;
9720 +  for (Function::const_arg_iterator cab = F.arg_begin(),
9721 +       cae = F.arg_end(); cab != cae; ++cab) {
9722 +    const Argument *arg = cab;
9723 +    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
9724 +    if (!PT) {
9725 +      continue;
9726 +    }
9727 +    Type *DereferencedType = PT->getElementType();
9728 +    if (!dyn_cast<StructType>(DereferencedType)
9729 +        ) {
9730 +      continue;
9731 +    }
9732 +    if (!containsPointerType(DereferencedType)) {
9733 +      continue;
9734 +    }
9735 +    // FIXME: Because a pointer inside of a struct/union may be aliased to
9736 +    // another pointer we need to take the conservative approach and place all
9737 +    // pointers into the arena until more advanced detection is implemented.
9738 +    dumpAll = true;
9739 +  }
9740 +  return dumpAll;
9741 +}
9742 +void
9743 +AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
9744 +  if (isConstVec.empty()) {
9745 +    return;
9746 +  }
9747 +  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
9748 +    CallInst *CI = isConstVec[x];
9749 +    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
9750 +    Type *aType = Type::getInt32Ty(*mCTX);
9751 +    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
9752 +      : ConstantInt::get(aType, 0);
9753 +    CI->replaceAllUsesWith(Val);
9754 +    CI->eraseFromParent();
9755 +  }
9756 +  isConstVec.clear();
9757 +}
9758 +// Points every queued atomic call at the "_noret" function recorded for it by
9759 +// optimizeCallInst(), then empties the queue. F is unused.
9760 +void
9761 +AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F)  {
9762 +  // Don't do anything if we don't have any atomic operations.
9763 +  if (atomicFuncs.empty()) {
9764 +    return;
9765 +  }
9766 +  // The callee is stored as the last operand of the call; replace it.
9767 +  for (uint32_t x = 0, size = atomicFuncs.size(); x < size; ++x) {
9768 +    CallInst *call = atomicFuncs[x].first;
9769 +    call->setOperand(call->getNumOperands() - 1, atomicFuncs[x].second);
9770 +  }
9771 +  mChanged = true;
9772 +  // atomicFuncs is a member and outlives this function. Without clearing it,
9773 +  // every later runOnFunction() would revisit these calls, which may have been
9774 +  // erased by then.
9775 +  atomicFuncs.clear();
9776 +}
9777 +
9778 +// Pass entry point. Resets the per-function state, applies
9779 +// instLevelOptimizations() to every instruction of MF, then performs the
9780 +// deferred atomic and __amdil_is_constant rewrites. Returns mChanged.
9781 +bool
9782 +AMDGPUPeepholeOpt::runOnFunction(Function &MF)  {
9783 +  mF = &MF;
9784 +  mCTX = &MF.getType()->getContext();
9785 +  mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
9786 +  mChanged = false;
9787 +  mConvertAtomics = true;
9788 +  if (mDebug) { MF.dump(); }
9789 +  // The functor reports whether it already advanced the instruction iterator.
9790 +  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
9791 +     std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
9792 +                  this));
9793 +
9794 +  doAtomicConversionIfNeeded(MF);
9795 +  doIsConstCallConversionIfNeeded();
9796 +
9797 +  if (mDebug) { MF.dump(); }
9798 +  return mChanged;
9799 +}
9800 +
9801 +bool
9802 +AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)  {
9803 +  Instruction *inst = (*bbb);
9804 +  CallInst *CI = dyn_cast<CallInst>(inst);
9805 +  if (!CI) {
9806 +    return false;
9807 +  }
9808 +  if (isSigned24BitOps(CI)) {
9809 +    expandSigned24BitOps(CI);
9810 +    ++(*bbb);
9811 +    CI->eraseFromParent();
9812 +    return true;
9813 +  }
9814 +  if (propagateSamplerInst(CI)) {
9815 +    return false;
9816 +  }
9817 +  if (expandBFI(CI) || expandBFM(CI)) {
9818 +    ++(*bbb);
9819 +    CI->eraseFromParent();
9820 +    return true;
9821 +  }
9822 +  if (convertAccurateDivide(CI)) {
9823 +    expandAccurateDivide(CI);
9824 +    ++(*bbb);
9825 +    CI->eraseFromParent();
9826 +    return true;
9827 +  }
9828 +
9829 +  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
9830 +  if (calleeName.startswith("__amdil_is_constant")) {
9831 +    // If we do not have optimizations, then this
9832 +    // cannot be properly evaluated, so we add the
9833 +    // call instruction to a vector and process
9834 +    // them at the end of processing after the
9835 +    // samplers have been correctly handled.
9836 +    if (optLevel == CodeGenOpt::None) {
9837 +      isConstVec.push_back(CI);
9838 +      return false;
9839 +    } else {
9840 +      Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
9841 +      Type *aType = Type::getInt32Ty(*mCTX);
9842 +      Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
9843 +        : ConstantInt::get(aType, 0);
9844 +      CI->replaceAllUsesWith(Val);
9845 +      ++(*bbb);
9846 +      CI->eraseFromParent();
9847 +      return true;
9848 +    }
9849 +  }
9850 +
9851 +  if (calleeName.equals("__amdil_is_asic_id_i32")) {
9852 +    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
9853 +    Type *aType = Type::getInt32Ty(*mCTX);
9854 +    Value *Val = CV;
9855 +    if (Val) {
9856 +      Val = ConstantInt::get(aType,
9857 +          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
9858 +    } else {
9859 +      Val = ConstantInt::get(aType, 0);
9860 +    }
9861 +    CI->replaceAllUsesWith(Val);
9862 +    ++(*bbb);
9863 +    CI->eraseFromParent();
9864 +    return true;
9865 +  }
9866 +  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
9867 +  if (!F) {
9868 +    return false;
9869 +  }
9870 +  if (F->getName().startswith("__atom") && !CI->getNumUses()
9871 +      && F->getName().find("_xchg") == StringRef::npos) {
9872 +    std::string buffer(F->getName().str() + "_noret");
9873 +    F = dyn_cast<Function>(
9874 +          F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
9875 +    atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F));
9876 +  }
9877 +
9878 +  if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
9879 +      && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
9880 +    return false;
9881 +  }
9882 +  if (!mConvertAtomics) {
9883 +    return false;
9884 +  }
9885 +  StringRef name = F->getName();
9886 +  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
9887 +    mConvertAtomics = false;
9888 +  }
9889 +  return false;
9890 +}
9891 +
9892 +bool
9893 +AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
9894 +    Instruction *&src,
9895 +    Constant *&mask,
9896 +    Constant *&shift) {
9897 +  if (!base) {
9898 +    if (mDebug) {
9899 +      dbgs() << "Null pointer passed into function.\n";
9900 +    }
9901 +    return false;
9902 +  }
9903 +  bool andOp = false;
9904 +  if (base->getOpcode() == Instruction::Shl) {
9905 +    shift = dyn_cast<Constant>(base->getOperand(1));
9906 +  } else if (base->getOpcode() == Instruction::And) {
9907 +    mask = dyn_cast<Constant>(base->getOperand(1));
9908 +    andOp = true;
9909 +  } else {
9910 +    if (mDebug) {
9911 +      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
9912 +    }
9913 +    // If the base is neither a Shl or a And, we don't fit any of the patterns above.
9914 +    return false;
9915 +  }
9916 +  src = dyn_cast<Instruction>(base->getOperand(0));
9917 +  if (!src) {
9918 +    if (mDebug) {
9919 +      dbgs() << "Failed setup since the base operand is not an instruction!\n";
9920 +    }
9921 +    return false;
9922 +  }
9923 +  // If we find an 'and' operation, then we don't need to
9924 +  // find the next operation as we already know the
9925 +  // bits that are valid at this point.
9926 +  if (andOp) {
9927 +    return true;
9928 +  }
9929 +  if (src->getOpcode() == Instruction::Shl && !shift) {
9930 +    shift = dyn_cast<Constant>(src->getOperand(1));
9931 +    src = dyn_cast<Instruction>(src->getOperand(0));
9932 +  } else if (src->getOpcode() == Instruction::And && !mask) {
9933 +    mask = dyn_cast<Constant>(src->getOperand(1));
9934 +  }
9935 +  if (!mask && !shift) {
9936 +    if (mDebug) {
9937 +      dbgs() << "Failed setup since both mask and shift are NULL!\n";
9938 +    }
9939 +    // Did not find a constant mask or a shift.
9940 +    return false;
9941 +  }
9942 +  return true;
9943 +}
9944 +bool
9945 +AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst)  {
9946 +  if (!inst) {
9947 +    return false;
9948 +  }
9949 +  if (!inst->isBinaryOp()) {
9950 +    return false;
9951 +  }
9952 +  if (inst->getOpcode() != Instruction::Or) {
9953 +    return false;
9954 +  }
9955 +  if (optLevel == CodeGenOpt::None) {
9956 +    return false;
9957 +  }
9958 +  // We want to do an optimization on a sequence of ops that in the end equals a
9959 +  // single ISA instruction.
9960 +  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
9961 +  // Some simplified versions of this pattern are as follows:
9962 +  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
9963 +  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
9964 +  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
9965 +  // (A & B) | (D << F) when (1 << F) >= B
9966 +  // (A << C) | (D & E) when (1 << C) >= E
9967 +  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
9968 +    // The HD4XXX hardware doesn't support the ubit_insert instruction.
9969 +    return false;
9970 +  }
9971 +  Type *aType = inst->getType();
9972 +  bool isVector = aType->isVectorTy();
9973 +  int numEle = 1;
9974 +  // This optimization only works on 32bit integers.
9975 +  if (aType->getScalarType()
9976 +      != Type::getInt32Ty(inst->getContext())) {
9977 +    return false;
9978 +  }
9979 +  if (isVector) {
9980 +    const VectorType *VT = dyn_cast<VectorType>(aType);
9981 +    numEle = VT->getNumElements();
9982 +    // We currently cannot support more than 4 elements in a intrinsic and we
9983 +    // cannot support Vec3 types.
9984 +    if (numEle > 4 || numEle == 3) {
9985 +      return false;
9986 +    }
9987 +  }
9988 +  // TODO: Handle vectors.
9989 +  if (isVector) {
9990 +    if (mDebug) {
9991 +      dbgs() << "!!! Vectors are not supported yet!\n";
9992 +    }
9993 +    return false;
9994 +  }
9995 +  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
9996 +  Constant *LHSMask = NULL, *RHSMask = NULL;
9997 +  Constant *LHSShift = NULL, *RHSShift = NULL;
9998 +  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
9999 +  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
10000 +  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
10001 +    if (mDebug) {
10002 +      dbgs() << "Found an OR Operation that failed setup!\n";
10003 +      inst->dump();
10004 +      if (LHS) { LHS->dump(); }
10005 +      if (LHSSrc) { LHSSrc->dump(); }
10006 +      if (LHSMask) { LHSMask->dump(); }
10007 +      if (LHSShift) { LHSShift->dump(); }
10008 +    }
10009 +    // There was an issue with the setup for BitInsert.
10010 +    return false;
10011 +  }
10012 +  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
10013 +    if (mDebug) {
10014 +      dbgs() << "Found an OR Operation that failed setup!\n";
10015 +      inst->dump();
10016 +      if (RHS) { RHS->dump(); }
10017 +      if (RHSSrc) { RHSSrc->dump(); }
10018 +      if (RHSMask) { RHSMask->dump(); }
10019 +      if (RHSShift) { RHSShift->dump(); }
10020 +    }
10021 +    // There was an issue with the setup for BitInsert.
10022 +    return false;
10023 +  }
10024 +  if (mDebug) {
10025 +    dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
10026 +    dbgs() << "Op:        "; inst->dump();
10027 +    dbgs() << "LHS:       "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
10028 +    dbgs() << "LHS Src:   "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
10029 +    dbgs() << "LHS Mask:  "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
10030 +    dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
10031 +    dbgs() << "RHS:       "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
10032 +    dbgs() << "RHS Src:   "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
10033 +    dbgs() << "RHS Mask:  "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
10034 +    dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
10035 +  }
10036 +  Constant *offset = NULL;
10037 +  Constant *width = NULL;
10038 +  uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
10039 +  uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
10040 +  uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
10041 +  uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
10042 +  lhsMaskVal = (LHSMask
10043 +      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
10044 +  rhsMaskVal = (RHSMask
10045 +      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
10046 +  lhsShiftVal = (LHSShift
10047 +      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
10048 +  rhsShiftVal = (RHSShift
10049 +      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
10050 +  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
10051 +  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
10052 +  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
10053 +  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
10054 +  // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks).
10055 +  if (mDebug) {
10056 +      dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
10057 +      dbgs() << (LHSShift ? " << C)" : ")") << " | ((D" ;
10058 +      dbgs() << (RHSMask ? " & E)" : ")");
10059 +      dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
10060 +      dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
10061 +      dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
10062 +      dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
10063 +      dbgs() << "width(B) = " << lhsMaskWidth;
10064 +      dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
10065 +      dbgs() << "offset(B) = " << lhsMaskOffset;
10066 +      dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
10067 +      dbgs() << "Constraints: \n";
10068 +      dbgs() << "\t(1) B ^ E == 0\n";
10069 +      dbgs() << "\t(2-LHS) B is a mask\n";
10070 +      dbgs() << "\t(2-LHS) E is a mask\n";
10071 +      dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
10072 +      dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
10073 +  }
10074 +  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
10075 +    if (mDebug) {
10076 +      dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
10077 +      dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
10078 +      dbgs() << "Failed constraint 1!\n";
10079 +    }
10080 +    return false;
10081 +  }
10082 +  if (mDebug) {
10083 +    dbgs() << "LHS = " << lhsMaskOffset << "";
10084 +    dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
10085 +    dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
10086 +    dbgs() << "\nRHS = " << rhsMaskOffset << "";
10087 +    dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
10088 +    dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
10089 +    dbgs() << "\n";
10090 +  }
10091 +  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
10092 +    offset = ConstantInt::get(aType, lhsMaskOffset, false);
10093 +    width = ConstantInt::get(aType, lhsMaskWidth, false);
10094 +    RHSSrc = RHS;
10095 +    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
10096 +      if (mDebug) {
10097 +        dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
10098 +        dbgs() << "Failed constraint 2!\n";
10099 +      }
10100 +      return false;
10101 +    }
10102 +    if (!LHSShift) {
10103 +      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
10104 +          "MaskShr", LHS);
10105 +    } else if (lhsShiftVal != lhsMaskOffset) {
10106 +      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
10107 +          "MaskShr", LHS);
10108 +    }
10109 +    if (mDebug) {
10110 +      dbgs() << "Optimizing LHS!\n";
10111 +    }
10112 +  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
10113 +    offset = ConstantInt::get(aType, rhsMaskOffset, false);
10114 +    width = ConstantInt::get(aType, rhsMaskWidth, false);
10115 +    LHSSrc = RHSSrc;
10116 +    RHSSrc = LHS;
10117 +    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
10118 +      if (mDebug) {
10119 +        dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
10120 +        dbgs() << "Failed constraint 2!\n";
10121 +      }
10122 +      return false;
10123 +    }
10124 +    if (!RHSShift) {
10125 +      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
10126 +          "MaskShr", RHS);
10127 +    } else if (rhsShiftVal != rhsMaskOffset) {
10128 +      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
10129 +          "MaskShr", RHS);
10130 +    }
10131 +    if (mDebug) {
10132 +      dbgs() << "Optimizing RHS!\n";
10133 +    }
10134 +  } else {
10135 +    if (mDebug) {
10136 +      dbgs() << "Failed constraint 3!\n";
10137 +    }
10138 +    return false;
10139 +  }
10140 +  if (mDebug) {
10141 +    dbgs() << "Width:  "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
10142 +    dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
10143 +    dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
10144 +    dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
10145 +  }
10146 +  if (!offset || !width) {
10147 +    if (mDebug) {
10148 +      dbgs() << "Either width or offset are NULL, failed detection!\n";
10149 +    }
10150 +    return false;
10151 +  }
10152 +  // Lets create the function signature.
10153 +  std::vector<Type *> callTypes;
10154 +  callTypes.push_back(aType);
10155 +  callTypes.push_back(aType);
10156 +  callTypes.push_back(aType);
10157 +  callTypes.push_back(aType);
10158 +  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
10159 +  std::string name = "__amdil_ubit_insert";
10160 +  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
10161 +  Function *Func =
10162 +    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
10163 +        getOrInsertFunction(llvm::StringRef(name), funcType));
10164 +  Value *Operands[4] = {
10165 +    width,
10166 +    offset,
10167 +    LHSSrc,
10168 +    RHSSrc
10169 +  };
10170 +  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
10171 +  if (mDebug) {
10172 +    dbgs() << "Old Inst: ";
10173 +    inst->dump();
10174 +    dbgs() << "New Inst: ";
10175 +    CI->dump();
10176 +    dbgs() << "\n\n";
10177 +  }
10178 +  CI->insertBefore(inst);
10179 +  inst->replaceAllUsesWith(CI);
10180 +  return true;
10181 +}
10182 +
10183 +bool
10184 +AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst)  {
10185 +  if (!inst) {
10186 +    return false;
10187 +  }
10188 +  if (!inst->isBinaryOp()) {
10189 +    return false;
10190 +  }
10191 +  if (inst->getOpcode() != Instruction::And) {
10192 +    return false;
10193 +  }
10194 +  if (optLevel == CodeGenOpt::None) {
10195 +    return false;
10196 +  }
10197 +  // We want to do some simple optimizations on Shift right/And patterns. The
10198 +  // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
10199 +  // value smaller than 32 and C is a mask. If C is a constant value, then the
10200 +  // following transformation can occur. For signed integers, it turns into the
10201 +  // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
10202 +  // integers, it turns into the function call dst =
10203 +  // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
10204 +  // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
10205 +  // Evergreen hardware.
10206 +  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
10207 +    // This does not work on HD4XXX hardware.
10208 +    return false;
10209 +  }
10210 +  Type *aType = inst->getType();
10211 +  bool isVector = aType->isVectorTy();
10212 +
10213 +  // XXX Support vector types
10214 +  if (isVector) {
10215 +    return false;
10216 +  }
10217 +  int numEle = 1;
10218 +  // This only works on 32bit integers
10219 +  if (aType->getScalarType()
10220 +      != Type::getInt32Ty(inst->getContext())) {
10221 +    return false;
10222 +  }
10223 +  if (isVector) {
10224 +    const VectorType *VT = dyn_cast<VectorType>(aType);
10225 +    numEle = VT->getNumElements();
10226 +    // We currently cannot support more than 4 elements in a intrinsic and we
10227 +    // cannot support Vec3 types.
10228 +    if (numEle > 4 || numEle == 3) {
10229 +      return false;
10230 +    }
10231 +  }
10232 +  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
10233 +  // If the first operand is not a shift instruction, then we can return as it
10234 +  // doesn't match this pattern.
10235 +  if (!ShiftInst || !ShiftInst->isShift()) {
10236 +    return false;
10237 +  }
10238 +  // If we are a shift left, then we need don't match this pattern.
10239 +  if (ShiftInst->getOpcode() == Instruction::Shl) {
10240 +    return false;
10241 +  }
10242 +  bool isSigned = ShiftInst->isArithmeticShift();
10243 +  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
10244 +  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
10245 +  // Lets make sure that the shift value and the and mask are constant integers.
10246 +  if (!AndMask || !ShrVal) {
10247 +    return false;
10248 +  }
10249 +  Constant *newMaskConst;
10250 +  Constant *shiftValConst;
10251 +  if (isVector) {
10252 +    // Handle the vector case
10253 +    std::vector<Constant *> maskVals;
10254 +    std::vector<Constant *> shiftVals;
10255 +    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
10256 +    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
10257 +    Type *scalarType = AndMaskVec->getType()->getScalarType();
10258 +    assert(AndMaskVec->getNumOperands() ==
10259 +           ShrValVec->getNumOperands() && "cannot have a "
10260 +           "combination where the number of elements to a "
10261 +           "shift and an and are different!");
10262 +    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
10263 +      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
10264 +      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
10265 +      if (!AndCI || !ShiftIC) {
10266 +        return false;
10267 +      }
10268 +      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
10269 +      if (!isMask_32(maskVal)) {
10270 +        return false;
10271 +      }
10272 +      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
10273 +      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
10274 +      // If the mask or shiftval is greater than the bitcount, then break out.
10275 +      if (maskVal >= 32 || shiftVal >= 32) {
10276 +        return false;
10277 +      }
10278 +      // If the mask val is greater than the the number of original bits left
10279 +      // then this optimization is invalid.
10280 +      if (maskVal > (32 - shiftVal)) {
10281 +        return false;
10282 +      }
10283 +      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
10284 +      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
10285 +    }
10286 +    newMaskConst = ConstantVector::get(maskVals);
10287 +    shiftValConst = ConstantVector::get(shiftVals);
10288 +  } else {
10289 +    // Handle the scalar case
10290 +    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
10291 +    // This must be a mask value where all lower bits are set to 1 and then any
10292 +    // bit higher is set to 0.
10293 +    if (!isMask_32(maskVal)) {
10294 +      return false;
10295 +    }
10296 +    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
10297 +    // Count the number of bits set in the mask, this is the width of the
10298 +    // resulting bit set that is extracted from the source value.
10299 +    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
10300 +    // If the mask or shift val is greater than the bitcount, then break out.
10301 +    if (maskVal >= 32 || shiftVal >= 32) {
10302 +      return false;
10303 +    }
10304 +    // If the mask width is greater than the number of original bits left after
10305 +    // the shift, then this optimization is invalid.
10306 +    if (maskVal > (32 - shiftVal)) {
10307 +      return false;
10308 +    }
10309 +    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
10310 +    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
10311 +  }
10312 +  // Lets create the function signature.
10313 +  std::vector<Type *> callTypes;
10314 +  callTypes.push_back(aType);
10315 +  callTypes.push_back(aType);
10316 +  callTypes.push_back(aType);
10317 +  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
10318 +  std::string name = "llvm.AMDGPU.bit.extract.u32";
10319 +  if (isVector) {
10320 +    name += ".v" + itostr(numEle) + "i32";
10321 +  } else {
10322 +    name += ".";
10323 +  }
10324 +  // Lets create the function.
10325 +  Function *Func =
10326 +    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
10327 +                       getOrInsertFunction(llvm::StringRef(name), funcType));
10328 +  Value *Operands[3] = {
10329 +    ShiftInst->getOperand(0),
10330 +    shiftValConst,
10331 +    newMaskConst
10332 +  };
10333 +  // Lets create the Call with the operands
10334 +  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
10335 +  CI->setDoesNotAccessMemory();
10336 +  CI->insertBefore(inst);
10337 +  inst->replaceAllUsesWith(CI);
10338 +  return true;
10339 +}
10340 +
10341 +bool
10342 +AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
10343 +  if (!CI) {
10344 +    return false;
10345 +  }
10346 +  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
10347 +  if (!LHS->getName().startswith("__amdil_bfi")) {
10348 +    return false;
10349 +  }
10350 +  Type* type = CI->getOperand(0)->getType();
10351 +  Constant *negOneConst = NULL;
10352 +  if (type->isVectorTy()) {
10353 +    std::vector<Constant *> negOneVals;
10354 +    negOneConst = ConstantInt::get(CI->getContext(),
10355 +        APInt(32, StringRef("-1"), 10));
10356 +    for (size_t x = 0,
10357 +        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
10358 +      negOneVals.push_back(negOneConst);
10359 +    }
10360 +    negOneConst = ConstantVector::get(negOneVals);
10361 +  } else {
10362 +    negOneConst = ConstantInt::get(CI->getContext(),
10363 +        APInt(32, StringRef("-1"), 10));
10364 +  }
10365 +  // __amdil_bfi => (A & B) | (~A & C)
10366 +  BinaryOperator *lhs =
10367 +    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
10368 +        CI->getOperand(1), "bfi_and", CI);
10369 +  BinaryOperator *rhs =
10370 +    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
10371 +        "bfi_not", CI);
10372 +  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
10373 +      "bfi_and", CI);
10374 +  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
10375 +  CI->replaceAllUsesWith(lhs);
10376 +  return true;
10377 +}
10378 +
10379 +bool
10380 +AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
10381 +  if (!CI) {
10382 +    return false;
10383 +  }
10384 +  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
10385 +  if (!LHS->getName().startswith("__amdil_bfm")) {
10386 +    return false;
10387 +  }
10388 +  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
10389 +  Constant *newMaskConst = NULL;
10390 +  Constant *newShiftConst = NULL;
10391 +  Type* type = CI->getOperand(0)->getType();
10392 +  if (type->isVectorTy()) {
10393 +    std::vector<Constant*> newMaskVals, newShiftVals;
10394 +    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
10395 +    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
10396 +    for (size_t x = 0,
10397 +        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
10398 +      newMaskVals.push_back(newMaskConst);
10399 +      newShiftVals.push_back(newShiftConst);
10400 +    }
10401 +    newMaskConst = ConstantVector::get(newMaskVals);
10402 +    newShiftConst = ConstantVector::get(newShiftVals);
10403 +  } else {
10404 +    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
10405 +    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
10406 +  }
10407 +  BinaryOperator *lhs =
10408 +    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
10409 +        newMaskConst, "bfm_mask", CI);
10410 +  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
10411 +      lhs, "bfm_shl", CI);
10412 +  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
10413 +      newShiftConst, "bfm_sub", CI);
10414 +  BinaryOperator *rhs =
10415 +    BinaryOperator::Create(Instruction::And, CI->getOperand(1),
10416 +        newMaskConst, "bfm_mask", CI);
10417 +  lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
10418 +  CI->replaceAllUsesWith(lhs);
10419 +  return true;
10420 +}
10421 +
10422 +bool
10423 +AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)  {
10424 +  // Runs the per-instruction peepholes on the instruction *bbb refers to.
10425 +  // The instruction pointer is captured before optimizeCallInst() runs,
10426 +  // because that call receives the iterator itself.
10427 +  Instruction *inst = (*bbb);
10428 +  // A successful call optimization is the only case reported as true.
10429 +  if (optimizeCallInst(bbb)) {
10430 +    return true;
10431 +  }
10432 +  // Try the remaining peepholes in order, stopping at the first one that
10433 +  // fires; whichever fires (or none), the result reported is false.
10434 +  if (!optimizeBitExtract(inst) && !optimizeBitInsert(inst)) {
10435 +    correctMisalignedMemOp(inst);
10436 +  }
10437 +  return false;
10438 +}
10439 +bool
10440 +AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
10441 +  LoadInst *linst = dyn_cast<LoadInst>(inst);
10442 +  StoreInst *sinst = dyn_cast<StoreInst>(inst);
10443 +  unsigned alignment;
10444 +  Type* Ty = inst->getType();
10445 +  if (linst) {
10446 +    alignment = linst->getAlignment();
10447 +    Ty = inst->getType();
10448 +  } else if (sinst) {
10449 +    alignment = sinst->getAlignment();
10450 +    Ty = sinst->getValueOperand()->getType();
10451 +  } else {
10452 +    return false;
10453 +  }
10454 +  unsigned size = getTypeSize(Ty);
10455 +  if (size == alignment || size < alignment) {
10456 +    return false;
10457 +  }
10458 +  if (!Ty->isStructTy()) {
10459 +    return false;
10460 +  }
10461 +  if (alignment < 4) {
10462 +    if (linst) {
10463 +      linst->setAlignment(0);
10464 +      return true;
10465 +    } else if (sinst) {
10466 +      sinst->setAlignment(0);
10467 +      return true;
10468 +    }
10469 +  }
10470 +  return false;
10471 +}
10472 +bool
10473 +AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI)  {
10474 +  if (!CI) {
10475 +    return false;
10476 +  }
10477 +  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
10478 +  std::string namePrefix = LHS->getName().substr(0, 14);
10479 +  if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
10480 +      && namePrefix != "__amdil__imul24_high") {
10481 +    return false;
10482 +  }
10483 +  if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
10484 +    return false;
10485 +  }
10486 +  return true;
10487 +}
10488 +
10489 +void
10490 +AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI)  {
10491 +  assert(isSigned24BitOps(CI) && "Must be a "
10492 +      "signed 24 bit operation to call this function!");
10493 +  Value *LHS = CI->getOperand(CI->getNumOperands()-1);
10494 +  // On 7XX and 8XX we do not have signed 24bit, so we need to
10495 +  // expand it to the following:
10496 +  // imul24 turns into 32bit imul
10497 +  // imad24 turns into 32bit imad
10498 +  // imul24_high turns into 32bit imulhigh
10499 +  if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
10500 +    Type *aType = CI->getOperand(0)->getType();
10501 +    bool isVector = aType->isVectorTy();
10502 +    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
10503 +    std::vector<Type*> callTypes;
10504 +    callTypes.push_back(CI->getOperand(0)->getType());
10505 +    callTypes.push_back(CI->getOperand(1)->getType());
10506 +    callTypes.push_back(CI->getOperand(2)->getType());
10507 +    FunctionType *funcType =
10508 +      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
10509 +    std::string name = "__amdil_imad";
10510 +    if (isVector) {
10511 +      name += "_v" + itostr(numEle) + "i32";
10512 +    } else {
10513 +      name += "_i32";
10514 +    }
10515 +    Function *Func = dyn_cast<Function>(
10516 +                       CI->getParent()->getParent()->getParent()->
10517 +                       getOrInsertFunction(llvm::StringRef(name), funcType));
10518 +    Value *Operands[3] = {
10519 +      CI->getOperand(0),
10520 +      CI->getOperand(1),
10521 +      CI->getOperand(2)
10522 +    };
10523 +    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
10524 +    nCI->insertBefore(CI);
10525 +    CI->replaceAllUsesWith(nCI);
10526 +  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
10527 +    BinaryOperator *mulOp =
10528 +      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
10529 +          CI->getOperand(1), "imul24", CI);
10530 +    CI->replaceAllUsesWith(mulOp);
10531 +  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
10532 +    Type *aType = CI->getOperand(0)->getType();
10533 +
10534 +    bool isVector = aType->isVectorTy();
10535 +    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
10536 +    std::vector<Type*> callTypes;
10537 +    callTypes.push_back(CI->getOperand(0)->getType());
10538 +    callTypes.push_back(CI->getOperand(1)->getType());
10539 +    FunctionType *funcType =
10540 +      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
10541 +    std::string name = "__amdil_imul_high";
10542 +    if (isVector) {
10543 +      name += "_v" + itostr(numEle) + "i32";
10544 +    } else {
10545 +      name += "_i32";
10546 +    }
10547 +    Function *Func = dyn_cast<Function>(
10548 +                       CI->getParent()->getParent()->getParent()->
10549 +                       getOrInsertFunction(llvm::StringRef(name), funcType));
10550 +    Value *Operands[2] = {
10551 +      CI->getOperand(0),
10552 +      CI->getOperand(1)
10553 +    };
10554 +    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
10555 +    nCI->insertBefore(CI);
10556 +    CI->replaceAllUsesWith(nCI);
10557 +  }
10558 +}
10559 +
10560 +bool
10561 +AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI)  {
10562 +  // True only when the last operand of a non-null CI has exactly this name.
10563 +  StringRef wanted("__amdil_get_local_size_int");
10564 +  return CI && CI->getOperand(CI->getNumOperands() - 1)->getName() == wanted;
10565 +}
10566 +
10567 +bool
10568 +AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI)  {
10569 +  // True when CI's last operand is named "__amdil_improved_div*", except on
10570 +  // an HD6XXX device named "cayman"; a null CI is rejected.
10571 +  if (!CI) {
10572 +    return false;
10573 +  }
10574 +  bool isCayman = mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
10575 +      && (mSTM->getDeviceName() == "cayman");
10576 +  StringRef callee = CI->getOperand(CI->getNumOperands() - 1)->getName();
10577 +  return !isCayman && callee.startswith("__amdil_improved_div");
10578 +}
10579 +
10580 +void
10581 +AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI)  {
10582 +  assert(convertAccurateDivide(CI)
10583 +         && "expanding accurate divide can only happen if it is expandable!");
10584 +  // Emit an FDiv of operands 0 and 1 before the call and give it the uses.
10585 +  Value *num = CI->getOperand(0), *den = CI->getOperand(1);
10586 +  CI->replaceAllUsesWith(
10587 +      BinaryOperator::Create(Instruction::FDiv, num, den, "fdiv32", CI));
10588 +}
10589 +
10590 +bool
10591 +AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
10592 +  if (optLevel != CodeGenOpt::None) {
10593 +    return false;
10594 +  }
10595 +
10596 +  if (!CI) {
10597 +    return false;
10598 +  }
10599 +
10600 +  unsigned funcNameIdx = 0;
10601 +  funcNameIdx = CI->getNumOperands() - 1;
10602 +  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
10603 +  if (calleeName != "__amdil_image2d_read_norm"
10604 +   && calleeName != "__amdil_image2d_read_unnorm"
10605 +   && calleeName != "__amdil_image3d_read_norm"
10606 +   && calleeName != "__amdil_image3d_read_unnorm") {
10607 +    return false;
10608 +  }
10609 +
10610 +  unsigned samplerIdx = 2;
10611 +  samplerIdx = 1;
10612 +  Value *sampler = CI->getOperand(samplerIdx);
10613 +  LoadInst *lInst = dyn_cast<LoadInst>(sampler);
10614 +  if (!lInst) {
10615 +    return false;
10616 +  }
10617 +
10618 +  if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
10619 +    return false;
10620 +  }
10621 +
10622 +  GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
10623 +  // If we are loading from what is not a global value, then we
10624 +  // fail and return.
10625 +  if (!gv) {
10626 +    return false;
10627 +  }
10628 +
10629 +  // If we don't have an initializer or we have an initializer and
10630 +  // the initializer is not a 32bit integer, we fail.
10631 +  if (!gv->hasInitializer()
10632 +      || !gv->getInitializer()->getType()->isIntegerTy(32)) {
10633 +      return false;
10634 +  }
10635 +
10636 +  // Now that we have the global variable initializer, lets replace
10637 +  // all uses of the load instruction with the samplerVal and
10638 +  // reparse the __amdil_is_constant() function.
10639 +  Constant *samplerVal = gv->getInitializer();
10640 +  lInst->replaceAllUsesWith(samplerVal);
10641 +  return true;
10642 +}
10643 +
10643 +bool
10644 +AMDGPUPeepholeOpt::doInitialization(Module &M)  {
10645 +  return false; // No per-module setup is done; always reports false.
10646 +}
10648 +
10648 +bool
10649 +AMDGPUPeepholeOpt::doFinalization(Module &M)  {
10650 +  return false; // No per-module teardown is done; always reports false.
10651 +}
10653 +
10654 +void
10655 +AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const  {
10656 +  AU.addRequired<MachineFunctionAnalysis>(); // This pass requires MachineFunctionAnalysis.
10657 +  FunctionPass::getAnalysisUsage(AU); // Let the base class add its own entries.
10658 +  AU.setPreservesAll(); // NOTE(review): this pass rewrites IR elsewhere -- confirm preserving all is intended.
10659 +}
10660 +
10661 +size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
10662 +  size_t size = 0; // Result in bytes; stays 0 when T is null.
10663 +  if (!T) {
10664 +    return size;
10665 +  }
10666 +  switch (T->getTypeID()) { // Dispatch on the concrete kind of T.
10667 +  case Type::X86_FP80TyID:
10668 +  case Type::FP128TyID:
10669 +  case Type::PPC_FP128TyID:
10670 +  case Type::LabelTyID:
10671 +    assert(0 && "These types are not supported by this backend"); // No break: with asserts compiled out these cases fall into default.
10672 +  default:
10673 +  case Type::FloatTyID:
10674 +  case Type::DoubleTyID:
10675 +    size = T->getPrimitiveSizeInBits() >> 3; // Bits to bytes, rounding down.
10676 +    break;
10677 +  case Type::PointerTyID:
10678 +    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr); // Each case below forwards to the overload for that type.
10679 +    break;
10680 +  case Type::IntegerTyID:
10681 +    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
10682 +    break;
10683 +  case Type::StructTyID:
10684 +    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
10685 +    break;
10686 +  case Type::ArrayTyID:
10687 +    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
10688 +    break;
10689 +  case Type::FunctionTyID:
10690 +    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
10691 +    break;
10692 +  case Type::VectorTyID:
10693 +    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
10694 +    break;
10695 +  };
10696 +  return size;
10697 +}
10698 +
10699 +size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
10700 +    bool dereferencePtr) {
10701 +  size_t size = 0;
10702 +  if (!ST) {
10703 +    return size;
10704 +  }
10705 +  Type *curType;
10706 +  StructType::element_iterator eib;
10707 +  StructType::element_iterator eie;
10708 +  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
10709 +    curType = *eib;
10710 +    size += getTypeSize(curType, dereferencePtr);
10711 +  }
10712 +  return size;
10713 +}
10714 +
10715 +size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
10716 +    bool dereferencePtr) {
10717 +  return !IT ? 0 : (IT->getBitWidth() >> 3); // Whole bytes; null gives 0.
10718 +}
10719 +
10720 +size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
10721 +    bool dereferencePtr) {
10722 +    assert(0 && "Should not be able to calculate the size of an function type"); // Function types have no size here; always asserts.
10723 +    return 0; // Reached only when the assert is compiled out.
10724 +}
10725 +
10726 +size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
10727 +    bool dereferencePtr) {
10728 +  if (!AT) return 0; // A null array type has size 0.
10729 +  size_t elemBytes = getTypeSize(AT->getElementType(), dereferencePtr);
10730 +  return (size_t)(elemBytes * AT->getNumElements());
10731 +}
10732 +
10733 +size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
10734 +    bool dereferencePtr) {
10735 +  return !VT ? 0 : (VT->getBitWidth() >> 3); // Whole bytes; null gives 0.
10736 +}
10737 +
10738 +size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
10739 +    bool dereferencePtr) {
10740 +  if (!PT) {
10741 +    return 0;
10742 +  }
10743 +  Type *CT = PT->getElementType();
10744 +  if (CT->getTypeID() == Type::StructTyID &&
10745 +      PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
10746 +    return getTypeSize(dyn_cast<StructType>(CT));
10747 +  } else if (dereferencePtr) {
10748 +    size_t size = 0;
10749 +    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
10750 +      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
10751 +    }
10752 +    return size;
10753 +  } else {
10754 +    return 4;
10755 +  }
10756 +}
10757 +
10758 +size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
10759 +    bool dereferencePtr) {
10760 +  //assert(0 && "Should not be able to calculate the size of an opaque type");
10761 +  return 4; // Opaque types are reported as 4 bytes; the assert above is disabled.
10762 +}
10763 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILRegisterInfo.td llvm-r600/lib/Target/R600/AMDILRegisterInfo.td
10764 --- llvm-3.2.src/lib/Target/R600/AMDILRegisterInfo.td   1970-01-01 01:00:00.000000000 +0100
10765 +++ llvm-r600/lib/Target/R600/AMDILRegisterInfo.td      2013-01-25 19:43:57.450049721 +0100
10766 @@ -0,0 +1,107 @@
10767 +//===- AMDILRegisterInfo.td - AMDIL Register defs ----------*- tablegen -*-===//
10768 +//
10769 +//                     The LLVM Compiler Infrastructure
10770 +//
10771 +// This file is distributed under the University of Illinois Open Source
10772 +// License. See LICENSE.TXT for details.
10773 +//
10774 +//==-----------------------------------------------------------------------===//
10775 +//
10776 +//  Declarations that describe the AMDIL register file
10777 +//
10778 +//===----------------------------------------------------------------------===//
10779 +
10780 +class AMDILReg<bits<16> num, string n> : Register<n> { // Register named n; num is recorded in Value.
10781 +  field bits<16> Value;
10782 +  let Value = num;
10783 +  let Namespace = "AMDGPU"; // Every AMDILReg is placed in the AMDGPU namespace.
10784 +}
10785 +
10786 +// We start with 20 general registers (R1-R20) before expanding to more.
10787 +// Since the swizzle is added based on the register class, we can leave it
10788 +// off here and just specify different registers for different register classes.
10789 +def R1 : AMDILReg<1, "r1">, DwarfRegNum<[1]>; // R1-R20: register number, name suffix and DWARF number all agree.
10790 +def R2 : AMDILReg<2, "r2">, DwarfRegNum<[2]>;
10791 +def R3 : AMDILReg<3, "r3">, DwarfRegNum<[3]>;
10792 +def R4 : AMDILReg<4, "r4">, DwarfRegNum<[4]>;
10793 +def R5 : AMDILReg<5, "r5">, DwarfRegNum<[5]>;
10794 +def R6 : AMDILReg<6, "r6">, DwarfRegNum<[6]>;
10795 +def R7 : AMDILReg<7, "r7">, DwarfRegNum<[7]>;
10796 +def R8 : AMDILReg<8, "r8">, DwarfRegNum<[8]>;
10797 +def R9 : AMDILReg<9, "r9">, DwarfRegNum<[9]>;
10798 +def R10 : AMDILReg<10, "r10">, DwarfRegNum<[10]>;
10799 +def R11 : AMDILReg<11, "r11">, DwarfRegNum<[11]>;
10800 +def R12 : AMDILReg<12, "r12">, DwarfRegNum<[12]>;
10801 +def R13 : AMDILReg<13, "r13">, DwarfRegNum<[13]>;
10802 +def R14 : AMDILReg<14, "r14">, DwarfRegNum<[14]>;
10803 +def R15 : AMDILReg<15, "r15">, DwarfRegNum<[15]>;
10804 +def R16 : AMDILReg<16, "r16">, DwarfRegNum<[16]>;
10805 +def R17 : AMDILReg<17, "r17">, DwarfRegNum<[17]>;
10806 +def R18 : AMDILReg<18, "r18">, DwarfRegNum<[18]>;
10807 +def R19 : AMDILReg<19, "r19">, DwarfRegNum<[19]>;
10808 +def R20 : AMDILReg<20, "r20">, DwarfRegNum<[20]>;
10809 +
10810 +// All registers between 1000 and 1024 are reserved and cannot be used
10811 +// unless commented in this section
10812 +// r1021-r1025 are used to dynamically calculate the local/group/thread/region/region_local ID's
10813 +// r1020 is used to hold the frame index for local arrays
10814 +// r1019 is used to hold the dynamic stack allocation pointer
10815 +// r1018 is used as a temporary register for handwritten code
10816 +// r1017 is used as a temporary register for handwritten code
10817 +// r1016 is used as a temporary register for load/store code
10818 +// r1015 is used as a temporary register for data segment offset
10819 +// r1014 is used as a temporary register for store code
10820 +// r1013 is used as the section data pointer register
10821 +// r1012-r1010 and r1001-r1008 are used for temporary I/O registers
10822 +// r1009 is used as the frame pointer register
10823 +// r999 is used as the mem register.
10824 +// r998 is used as the return address register.
10825 +//def R1025 : AMDILReg<1025, "r1025">, DwarfRegNum<[1025]>;
10826 +//def R1024 : AMDILReg<1024, "r1024">, DwarfRegNum<[1024]>;
10827 +//def R1023 : AMDILReg<1023, "r1023">, DwarfRegNum<[1023]>;
10828 +//def R1022 : AMDILReg<1022, "r1022">, DwarfRegNum<[1022]>;
10829 +//def R1021 : AMDILReg<1021, "r1021">, DwarfRegNum<[1021]>;
10830 +//def R1020 : AMDILReg<1020, "r1020">, DwarfRegNum<[1020]>;
10831 +def SP : AMDILReg<1019, "r1019">, DwarfRegNum<[1019]>; // Dynamic stack allocation pointer.
10832 +def T1 : AMDILReg<1018, "r1018">, DwarfRegNum<[1018]>; // Temporary for handwritten code.
10833 +def T2 : AMDILReg<1017, "r1017">, DwarfRegNum<[1017]>; // Temporary for handwritten code.
10834 +def T3 : AMDILReg<1016, "r1016">, DwarfRegNum<[1016]>; // Temporary for load/store code.
10835 +def T4 : AMDILReg<1015, "r1015">, DwarfRegNum<[1015]>; // Temporary for data segment offset.
10836 +def T5 : AMDILReg<1014, "r1014">, DwarfRegNum<[1014]>; // Temporary for store code.
10837 +def SDP : AMDILReg<1013, "r1013">, DwarfRegNum<[1013]>; // Section data pointer.
10838 +def R1012: AMDILReg<1012, "r1012">, DwarfRegNum<[1012]>; // r1010-r1012 and r1001-r1008: temporary I/O registers.
10839 +def R1011: AMDILReg<1011, "r1011">, DwarfRegNum<[1011]>;
10840 +def R1010: AMDILReg<1010, "r1010">, DwarfRegNum<[1010]>;
10841 +def DFP : AMDILReg<1009, "r1009">, DwarfRegNum<[1009]>; // Frame pointer register; not a member of the register classes below.
10842 +def R1008: AMDILReg<1008, "r1008">, DwarfRegNum<[1008]>;
10843 +def R1007: AMDILReg<1007, "r1007">, DwarfRegNum<[1007]>;
10844 +def R1006: AMDILReg<1006, "r1006">, DwarfRegNum<[1006]>;
10845 +def R1005: AMDILReg<1005, "r1005">, DwarfRegNum<[1005]>;
10846 +def R1004: AMDILReg<1004, "r1004">, DwarfRegNum<[1004]>;
10847 +def R1003: AMDILReg<1003, "r1003">, DwarfRegNum<[1003]>;
10848 +def R1002: AMDILReg<1002, "r1002">, DwarfRegNum<[1002]>;
10849 +def R1001: AMDILReg<1001, "r1001">, DwarfRegNum<[1001]>;
10850 +def MEM : AMDILReg<999, "mem">, DwarfRegNum<[999]>; // The mem register.
10851 +def RA : AMDILReg<998, "r998">, DwarfRegNum<[998]>; // Return address register.
10852 +def FP : AMDILReg<997, "r997">, DwarfRegNum<[997]>; // NOTE(review): r997 is not described in the list above and is in no register class below.
10853 +def GPRI16 : RegisterClass<"AMDGPU", [i16], 16, // Class for i16 values.
10854 +  (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
10855 +        let AltOrders = [(add (sequence "R%u", 1, 20))]; // Alternative order holding only R1-R20.
10856 +        let AltOrderSelect = [{
10857 +          return 1; // presumably selects the AltOrders entry above -- confirm against Target.td
10858 +        }];
10859 +    }
10860 +def GPRI32 : RegisterClass<"AMDGPU", [i32], 32, // Class for i32 values.
10861 +  (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
10862 +        let AltOrders = [(add (sequence "R%u", 1, 20))]; // Alternative order holding only R1-R20.
10863 +        let AltOrderSelect = [{
10864 +          return 1; // presumably selects the AltOrders entry above -- confirm against Target.td
10865 +        }];
10866 +    }
10867 +def GPRF32 : RegisterClass<"AMDGPU", [f32], 32, // Class for f32 values.
10868 +  (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
10869 +        let AltOrders = [(add (sequence "R%u", 1, 20))]; // Alternative order holding only R1-R20.
10870 +        let AltOrderSelect = [{
10871 +          return 1; // presumably selects the AltOrders entry above -- confirm against Target.td
10872 +        }];
10873 +    }
10874 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILSIDevice.cpp llvm-r600/lib/Target/R600/AMDILSIDevice.cpp
10875 --- llvm-3.2.src/lib/Target/R600/AMDILSIDevice.cpp      1970-01-01 01:00:00.000000000 +0100
10876 +++ llvm-r600/lib/Target/R600/AMDILSIDevice.cpp 2013-01-25 19:43:57.450049721 +0100
10877 @@ -0,0 +1,45 @@
10878 +//===-- AMDILSIDevice.cpp - Device Info for Southern Islands GPUs ---------===//
10879 +//
10880 +//                     The LLVM Compiler Infrastructure
10881 +//
10882 +// This file is distributed under the University of Illinois Open Source
10883 +// License. See LICENSE.TXT for details.
10884 +//
10885 +/// \file
10886 +//==-----------------------------------------------------------------------===//
10887 +#include "AMDILSIDevice.h"
10888 +#include "AMDILEvergreenDevice.h"
10889 +#include "AMDILNIDevice.h"
10890 +#include "AMDGPUSubtarget.h"
10891 +
10892 +using namespace llvm;
10893 +
10894 +AMDGPUSIDevice::AMDGPUSIDevice(AMDGPUSubtarget *ST)
10895 +  : AMDGPUEvergreenDevice(ST) { // Construction is delegated entirely to the Evergreen base class.
10896 +}
10897 +AMDGPUSIDevice::~AMDGPUSIDevice() { // Nothing to release here.
10898 +}
10899 +
10900 +size_t
10901 +AMDGPUSIDevice::getMaxLDSSize() const {
10902 +  // MAX_LDS_SIZE_900 when usesHardware(LocalMem) holds, otherwise 0.
10903 +  if (!usesHardware(AMDGPUDeviceInfo::LocalMem)) {
10904 +    return 0;
10905 +  }
10906 +  return MAX_LDS_SIZE_900;
10907 +}
10908 +
10909 +uint32_t
10910 +AMDGPUSIDevice::getGeneration() const {
10911 +  return AMDGPUDeviceInfo::HD7XXX; // SI devices always report the HD7XXX generation.
10912 +}
10913 +
10914 +std::string
10915 +AMDGPUSIDevice::getDataLayout() const {
10916 +  return std::string("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16" // Fixed layout string; starts with "e" and 64-bit pointer fields.
10917 +      "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
10918 +      "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64" // Vector entries from 16 up to 2048 bits.
10919 +      "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
10920 +      "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
10921 +      "-n8:16:32:64");
10922 +}
10923 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILSIDevice.h llvm-r600/lib/Target/R600/AMDILSIDevice.h
10924 --- llvm-3.2.src/lib/Target/R600/AMDILSIDevice.h        1970-01-01 01:00:00.000000000 +0100
10925 +++ llvm-r600/lib/Target/R600/AMDILSIDevice.h   2013-01-25 19:43:57.450049721 +0100
10926 @@ -0,0 +1,39 @@
10927 +//===------- AMDILSIDevice.h - Define SI Device for AMDIL -*- C++ -*------===//
10928 +//
10929 +//                     The LLVM Compiler Infrastructure
10930 +//
10931 +// This file is distributed under the University of Illinois Open Source
10932 +// License. See LICENSE.TXT for details.
10933 +//
10934 +//==-----------------------------------------------------------------------===//
10935 +//
10936 +/// \file
10937 +/// \brief Interface for the subtarget data classes.
10938 +///
10939 +/// This file will define the interface that each generation needs to
10940 +/// implement in order to correctly answer queries on the capabilities of the
10941 +/// specific hardware.
10942 +//===---------------------------------------------------------------------===//
10943 +#ifndef AMDILSIDEVICE_H
10944 +#define AMDILSIDEVICE_H
10945 +#include "AMDILEvergreenDevice.h"
10946 +
10947 +namespace llvm {
10948 +class AMDGPUSubtarget;
10949 +//===---------------------------------------------------------------------===//
10950 +// SI generation of devices and their respective sub classes
10951 +//===---------------------------------------------------------------------===//
10952 +
10953 +/// \brief The AMDGPUSIDevice is the base class for the Southern Islands (SI)
10954 +/// series of cards; it derives from AMDGPUEvergreenDevice.
10955 +class AMDGPUSIDevice : public AMDGPUEvergreenDevice {
10956 +public:
10957 +  AMDGPUSIDevice(AMDGPUSubtarget*);
10958 +  virtual ~AMDGPUSIDevice();
10959 +  virtual size_t getMaxLDSSize() const; ///< Maximum LDS size for this device.
10960 +  virtual uint32_t getGeneration() const; ///< Generation identifier of this device.
10961 +  virtual std::string getDataLayout() const; ///< Data layout string for this device.
10962 +};
10963 +
10964 +} // namespace llvm
10965 +#endif // AMDILSIDEVICE_H
10966 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/CMakeLists.txt llvm-r600/lib/Target/R600/CMakeLists.txt
10967 --- llvm-3.2.src/lib/Target/R600/CMakeLists.txt 1970-01-01 01:00:00.000000000 +0100
10968 +++ llvm-r600/lib/Target/R600/CMakeLists.txt    2013-01-25 19:43:57.453383054 +0100
10969 @@ -0,0 +1,55 @@
10970 +set(LLVM_TARGET_DEFINITIONS AMDGPU.td) # TableGen input for the .inc files generated below.
10971 +
10972 +tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info)
10973 +tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info)
10974 +tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel)
10975 +tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv)
10976 +tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget)
10977 +tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic)
10978 +tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
10979 +tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer)
10980 +tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
10981 +add_public_tablegen_target(AMDGPUCommonTableGen)
10982 +
10983 +add_llvm_target(R600CodeGen # Creates LLVMR600CodeGen, the library named by add_dependencies below.
10984 +  AMDIL7XXDevice.cpp
10985 +  AMDILCFGStructurizer.cpp
10986 +  AMDILDevice.cpp
10987 +  AMDILDeviceInfo.cpp
10988 +  AMDILEvergreenDevice.cpp
10989 +  AMDILFrameLowering.cpp
10990 +  AMDILIntrinsicInfo.cpp
10991 +  AMDILISelDAGToDAG.cpp
10992 +  AMDILISelLowering.cpp
10993 +  AMDILNIDevice.cpp
10994 +  AMDILPeepholeOptimizer.cpp
10995 +  AMDILSIDevice.cpp
10996 +  AMDGPUAsmPrinter.cpp
10997 +  AMDGPUMCInstLower.cpp
10998 +  AMDGPUSubtarget.cpp
10999 +  AMDGPUTargetMachine.cpp
11000 +  AMDGPUISelLowering.cpp
11001 +  AMDGPUConvertToISA.cpp
11002 +  AMDGPUInstrInfo.cpp
11003 +  AMDGPURegisterInfo.cpp
11004 +  R600ExpandSpecialInstrs.cpp
11005 +  R600InstrInfo.cpp
11006 +  R600ISelLowering.cpp
11007 +  R600LowerConstCopy.cpp
11008 +  R600MachineFunctionInfo.cpp
11009 +  R600RegisterInfo.cpp
11010 +  SIAssignInterpRegs.cpp
11011 +  SIInstrInfo.cpp
11012 +  SIISelLowering.cpp
11013 +  SILowerLiteralConstants.cpp
11014 +  SILowerControlFlow.cpp
11015 +  SIMachineFunctionInfo.cpp
11016 +  SIRegisterInfo.cpp
11017 +  SIFixSGPRLiveness.cpp
11018 +  )
11019 +
11020 +add_dependencies(LLVMR600CodeGen intrinsics_gen) # Must name the library created by add_llvm_target above.
11021 +
11022 +add_subdirectory(InstPrinter)
11023 +add_subdirectory(TargetInfo)
11024 +add_subdirectory(MCTargetDesc)
11025 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp llvm-r600/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
11026 --- llvm-3.2.src/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp      1970-01-01 01:00:00.000000000 +0100
11027 +++ llvm-r600/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp 2013-01-25 19:43:57.456716387 +0100
11028 @@ -0,0 +1,156 @@
11029 +//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===//
11030 +//
11031 +//                     The LLVM Compiler Infrastructure
11032 +//
11033 +// This file is distributed under the University of Illinois Open Source
11034 +// License. See LICENSE.TXT for details.
11035 +//
11036 +// \file
11037 +//===----------------------------------------------------------------------===//
11038 +
11039 +#include "AMDGPUInstPrinter.h"
11040 +#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
11041 +#include "llvm/MC/MCInst.h"
11042 +
11043 +using namespace llvm;
11044 +
11045 +void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
11046 +                             StringRef Annot) {
11047 +  printInstruction(MI, OS); // Print the instruction itself first.
11048 +
11049 +  printAnnotation(OS, Annot); // Then print the annotation text.
11050 +}
11051 +
11052 +void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
11053 +                                     raw_ostream &O) {
11054 +
11055 +  const MCOperand &Op = MI->getOperand(OpNo);
11056 +  if (Op.isReg()) {
11057 +    switch (Op.getReg()) {
11058 +    // This is the default predicate state, so we don't need to print it.
11059 +    case AMDGPU::PRED_SEL_OFF: break;
11060 +    default: O << getRegisterName(Op.getReg()); break;
11061 +    }
11062 +  } else if (Op.isImm()) {
11063 +    O << Op.getImm();
11064 +  } else if (Op.isFPImm()) {
11065 +    O << Op.getFPImm();
11066 +  } else {
11067 +    assert(!"unknown operand type in printOperand");
11068 +  }
11069 +}
11070 +
11071 +void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
11072 +                                        raw_ostream &O) {
11073 +  printOperand(MI, OpNo, O); // First of two consecutive operands.
11074 +  O  << ", ";
11075 +  printOperand(MI, OpNo + 1, O); // Second operand, printed after the ", " separator.
11076 +}
11077 +
11078 +void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
11079 +                                   raw_ostream &O, StringRef Asm) {
11080 +  // Stream Asm only when the immediate operand at OpNo is exactly 1.
11081 +  const MCOperand &Flag = MI->getOperand(OpNo);
11082 +  assert(Flag.isImm());
11083 +  if (Flag.getImm() == 1)
11084 +    O << Asm;
11085 +}
11086 +
11087 +void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
11088 +                                 raw_ostream &O) {
11089 +  printIfSet(MI, OpNo, O, "|"); // "|" is printed only when the operand is 1.
11090 +}
11091 +
11092 +void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
11093 +                                   raw_ostream &O) {
11094 +  printIfSet(MI, OpNo, O, "_SAT"); // Suffix appears only for operand == 1.
11095 +}
11096 +
11097 +void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
11098 +                                     raw_ostream &O) {
11099 +  union Literal { // One value viewed either as a float or as an int32_t.
11100 +    float f;
11101 +    int32_t i;
11102 +  } L;
11103 +  // Prints the integer, then its float view in parentheses (union punning).
11104 +  L.i = MI->getOperand(OpNo).getImm();
11105 +  O << L.i << "(" << L.f << ")";
11106 +}
11107 +
11108 +void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
11109 +                                  raw_ostream &O) {
11110 +  printIfSet(MI, OpNo, O, " *"); // " *" is printed only when operand == 1.
11111 +}
11112 +
11113 +void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
11114 +                                 raw_ostream &O) {
11115 +  printIfSet(MI, OpNo, O, "-"); // "-" is printed only when the operand is 1.
11116 +}
11117 +
11118 +void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
11119 +                                  raw_ostream &O) {
11120 +  switch (MI->getOperand(OpNo).getImm()) {
11121 +  default: break;
11122 +  case 1:
11123 +    O << " * 2.0";
11124 +    break;
11125 +  case 2:
11126 +    O << " * 4.0";
11127 +    break;
11128 +  case 3:
11129 +    O << " / 2.0";
11130 +    break;
11131 +  }
11132 +}
11133 +
11134 +void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo,
11135 +                                 raw_ostream &O) {
11136 +  // A zero immediate prints nothing; any other value prints " + <value>".
11137 +  const int64_t Rel = MI->getOperand(OpNo).getImm();
11138 +  if (Rel != 0)
11139 +    O << " + " << Rel;
11140 +}
11141 +
11142 +void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
11143 +                                            raw_ostream &O) {
11144 +  printIfSet(MI, OpNo, O, "ExecMask,"); // Printed only when the operand is 1.
11145 +}
11146 +
11147 +void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
11148 +                                        raw_ostream &O) {
11149 +  printIfSet(MI, OpNo, O, "Pred,"); // Printed only when the operand is 1.
11150 +}
11151 +
11152 +void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
11153 +                                   raw_ostream &O) {
11154 +  // Only a zero immediate is annotated; non-zero values print nothing.
11155 +  const bool Masked = MI->getOperand(OpNo).getImm() == 0;
11156 +  if (Masked)
11157 +    O << " (MASKED)";
11158 +}
11159 +
11160 +void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo,
11161 +                                  raw_ostream &O) {
11162 +  const char * chans = "XYZW";
11163 +  int sel = MI->getOperand(OpNo).getImm();
11164 +  // The low two bits pick the channel letter; the rest is the selector.
11165 +  int chan = sel & 3;
11166 +  sel >>= 2;
11167 +  // >= 512 prints "<sel >> 12>[<low 12 bits>]"; >= 448 prints sel - 448.
11168 +  if (sel >= 512) {
11169 +    sel -= 512;
11170 +    int cb = sel >> 12;
11171 +    sel &= 4095;
11172 +    O << cb << "[" << sel << "]";
11173 +  } else if (sel >= 448) {
11174 +    sel -= 448;
11175 +    O << sel;
11176 +  } else if (sel >= 0){
11177 +    O << sel;
11178 +  }
11179 +  // A negative selector prints nothing at all, not even the channel.
11180 +  if (sel >= 0)
11181 +    O << "." << chans[chan];
11182 +}
11183 +
11184 +#include "AMDGPUGenAsmWriter.inc"
11185 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h llvm-r600/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
11186 --- llvm-3.2.src/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h        1970-01-01 01:00:00.000000000 +0100
11187 +++ llvm-r600/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h   2013-01-25 19:43:57.456716387 +0100
11188 @@ -0,0 +1,53 @@
11189 +//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===//
11190 +//
11191 +//                     The LLVM Compiler Infrastructure
11192 +//
11193 +// This file is distributed under the University of Illinois Open Source
11194 +// License. See LICENSE.TXT for details.
11195 +//
11196 +//===----------------------------------------------------------------------===//
11197 +//
11198 +/// \file
11199 +//===----------------------------------------------------------------------===//
11200 +
11201 +#ifndef AMDGPUINSTPRINTER_H
11202 +#define AMDGPUINSTPRINTER_H
11203 +
11204 +#include "llvm/ADT/StringRef.h"
11205 +#include "llvm/MC/MCInstPrinter.h"
11206 +#include "llvm/Support/raw_ostream.h"
11207 +
11208 +namespace llvm {
11209 +
11210 +class AMDGPUInstPrinter : public MCInstPrinter {
11211 +public:
11212 +  AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
11213 +                     const MCRegisterInfo &MRI)
11214 +    : MCInstPrinter(MAI, MII, MRI) {}
11215 +
11216 +  // Autogenerated by tblgen (presumably in AMDGPUGenAsmWriter.inc).
11217 +  void printInstruction(const MCInst *MI, raw_ostream &O);
11218 +  static const char *getRegisterName(unsigned RegNo);
11219 +
11220 +  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
11221 +
11222 +private: // Per-operand printing helpers; OpNo is the operand index in MI.
11223 +  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11224 +  void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11225 +  void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm);
11226 +  void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11227 +  void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11228 +  void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11229 +  void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11230 +  void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11231 +  void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11232 +  void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11233 +  void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11234 +  void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11235 +  void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11236 +  void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11237 +};
11238 +
11239 +} // End namespace llvm
11240 +
11241 +#endif // AMDGPUINSTPRINTER_H
11242 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/InstPrinter/CMakeLists.txt llvm-r600/lib/Target/R600/InstPrinter/CMakeLists.txt
11243 --- llvm-3.2.src/lib/Target/R600/InstPrinter/CMakeLists.txt     1970-01-01 01:00:00.000000000 +0100
11244 +++ llvm-r600/lib/Target/R600/InstPrinter/CMakeLists.txt        2013-01-25 19:43:57.456716387 +0100
11245 @@ -0,0 +1,7 @@
11246 +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
11247 +
11248 +add_llvm_library(LLVMR600AsmPrinter
11249 +  AMDGPUInstPrinter.cpp
11250 +  )
11251 +# Use the same tablegen target name as MCTargetDesc/CMakeLists.txt.
11252 +add_dependencies(LLVMR600AsmPrinter AMDGPUCommonTableGen)
11253 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/InstPrinter/LLVMBuild.txt llvm-r600/lib/Target/R600/InstPrinter/LLVMBuild.txt
11254 --- llvm-3.2.src/lib/Target/R600/InstPrinter/LLVMBuild.txt      1970-01-01 01:00:00.000000000 +0100
11255 +++ llvm-r600/lib/Target/R600/InstPrinter/LLVMBuild.txt 2013-01-25 19:43:57.456716387 +0100
11256 @@ -0,0 +1,24 @@
11257 +;===- ./lib/Target/R600/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===;
11258 +;
11259 +;                     The LLVM Compiler Infrastructure
11260 +;
11261 +; This file is distributed under the University of Illinois Open Source
11262 +; License. See LICENSE.TXT for details.
11263 +;
11264 +;===------------------------------------------------------------------------===;
11265 +;
11266 +; This is an LLVMBuild description file for the components in this subdirectory.
11267 +;
11268 +; For more information on the LLVMBuild system, please see:
11269 +;
11270 +;   http://llvm.org/docs/LLVMBuild.html
11271 +;
11272 +;===------------------------------------------------------------------------===;
11273 +
11274 +[component_0]
11275 +type = Library
11276 +name = R600AsmPrinter
11277 +parent = R600
11278 +required_libraries = MC Support
11279 +add_to_library_groups = R600
11280 +
11281 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/InstPrinter/Makefile llvm-r600/lib/Target/R600/InstPrinter/Makefile
11282 --- llvm-3.2.src/lib/Target/R600/InstPrinter/Makefile   1970-01-01 01:00:00.000000000 +0100
11283 +++ llvm-r600/lib/Target/R600/InstPrinter/Makefile      2013-01-25 19:43:57.456716387 +0100
11284 @@ -0,0 +1,15 @@
11285 +##===- lib/Target/R600/InstPrinter/Makefile ----------------*- Makefile -*-===##
11286 +#
11287 +#                     The LLVM Compiler Infrastructure
11288 +#
11289 +# This file is distributed under the University of Illinois Open Source
11290 +# License. See LICENSE.TXT for details.
11291 +#
11292 +##===----------------------------------------------------------------------===##
11293 +LEVEL = ../../../..
11294 +LIBRARYNAME = LLVMR600AsmPrinter
11295 +
11296 +# Hack: we need to include 'main' R600 target directory to grab private headers
11297 +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
11298 +
11299 +include $(LEVEL)/Makefile.common
11300 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/LLVMBuild.txt llvm-r600/lib/Target/R600/LLVMBuild.txt
11301 --- llvm-3.2.src/lib/Target/R600/LLVMBuild.txt  1970-01-01 01:00:00.000000000 +0100
11302 +++ llvm-r600/lib/Target/R600/LLVMBuild.txt     2013-01-25 19:43:57.456716387 +0100
11303 @@ -0,0 +1,32 @@
11304 +;===- ./lib/Target/R600/LLVMBuild.txt --------------------------*- Conf -*--===;
11305 +;
11306 +;                     The LLVM Compiler Infrastructure
11307 +;
11308 +; This file is distributed under the University of Illinois Open Source
11309 +; License. See LICENSE.TXT for details.
11310 +;
11311 +;===------------------------------------------------------------------------===;
11312 +;
11313 +; This is an LLVMBuild description file for the components in this subdirectory.
11314 +;
11315 +; For more information on the LLVMBuild system, please see:
11316 +;
11317 +;   http://llvm.org/docs/LLVMBuild.html
11318 +;
11319 +;===------------------------------------------------------------------------===;
11320 +
11321 +[common]
11322 +subdirectories = InstPrinter MCTargetDesc TargetInfo
11323 +
11324 +[component_0]
11325 +type = TargetGroup
11326 +name = R600
11327 +parent = Target
11328 +has_asmprinter = 1
11329 +
11330 +[component_1]
11331 +type = Library
11332 +name = R600CodeGen
11333 +parent = R600
11334 +required_libraries = AsmPrinter CodeGen Core SelectionDAG Support Target MC R600AsmPrinter R600Desc R600Info
11335 +add_to_library_groups = R600
11336 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/Makefile llvm-r600/lib/Target/R600/Makefile
11337 --- llvm-3.2.src/lib/Target/R600/Makefile       1970-01-01 01:00:00.000000000 +0100
11338 +++ llvm-r600/lib/Target/R600/Makefile  2013-01-25 19:43:57.460049721 +0100
11339 @@ -0,0 +1,23 @@
11340 +##===- lib/Target/R600/Makefile ---------------------------*- Makefile -*-===##
11341 +#
11342 +#                     The LLVM Compiler Infrastructure
11343 +#
11344 +# This file is distributed under the University of Illinois Open Source
11345 +# License. See LICENSE.TXT for details.
11346 +#
11347 +##===----------------------------------------------------------------------===##
11348 +
11349 +LEVEL = ../../..
11350 +LIBRARYNAME = LLVMR600CodeGen
11351 +TARGET = AMDGPU
11352 +
11353 +# Make sure that tblgen is run, first thing.
11354 +BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \
11355 +               AMDGPUGenDAGISel.inc  AMDGPUGenSubtargetInfo.inc \
11356 +               AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \
11357 +               AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \
11358 +               AMDGPUGenAsmWriter.inc
11359 +
11360 +DIRS = InstPrinter TargetInfo MCTargetDesc
11361 +
11362 +include $(LEVEL)/Makefile.common
11363 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
11364 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp      1970-01-01 01:00:00.000000000 +0100
11365 +++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp 2013-01-25 19:43:57.456716387 +0100
11366 @@ -0,0 +1,90 @@
11367 +//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===//
11368 +//
11369 +//                     The LLVM Compiler Infrastructure
11370 +//
11371 +// This file is distributed under the University of Illinois Open Source
11372 +// License. See LICENSE.TXT for details.
11373 +//
11374 +/// \file
11375 +//===----------------------------------------------------------------------===//
11376 +
11377 +#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
11378 +#include "llvm/ADT/StringRef.h"
11379 +#include "llvm/MC/MCAsmBackend.h"
11380 +#include "llvm/MC/MCAssembler.h"
11381 +#include "llvm/MC/MCObjectWriter.h"
11382 +#include "llvm/MC/MCValue.h"
11383 +#include "llvm/Support/TargetRegistry.h"
11384 +
11385 +using namespace llvm;
11386 +
11387 +namespace {
11388 +
11389 +class AMDGPUMCObjectWriter : public MCObjectWriter {
11390 +public:
11391 +  AMDGPUMCObjectWriter(raw_ostream &OS) : MCObjectWriter(OS, true) { }
11392 +  virtual void ExecutePostLayoutBinding(MCAssembler &Asm,
11393 +                                        const MCAsmLayout &Layout) {
11394 +    //XXX: Implement if necessary.
11395 +  }
11396 +  virtual void RecordRelocation(const MCAssembler &Asm,
11397 +                                const MCAsmLayout &Layout,
11398 +                                const MCFragment *Fragment,
11399 +                                const MCFixup &Fixup,
11400 +                                MCValue Target, uint64_t &FixedValue) {
11401 +    assert(!"Not implemented");
11402 +  }
11403 +  // Defined out of line below: writes the data of every section in Asm.
11404 +  virtual void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout);
11405 +
11406 +};
11407 +
11408 +class AMDGPUAsmBackend : public MCAsmBackend {
11409 +public:
11410 +  AMDGPUAsmBackend(const Target &T)
11411 +    : MCAsmBackend() {}
11412 +
11413 +  virtual AMDGPUMCObjectWriter *createObjectWriter(raw_ostream &OS) const;
11414 +  virtual unsigned getNumFixupKinds() const { return 0; };
11415 +  virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
11416 +                          uint64_t Value) const;
11417 +  virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
11418 +                                    const MCInstFragment *DF,
11419 +                                    const MCAsmLayout &Layout) const {
11420 +    return false;
11421 +  }
11422 +  virtual void relaxInstruction(const MCInst &Inst, MCInst &Res) const {
11423 +    assert(!"Not implemented");
11424 +  }
11425 +  virtual bool mayNeedRelaxation(const MCInst &Inst) const { return false; }
11426 +  virtual bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
11427 +    return true;
11428 +  }
11429 +};
11430 +
11431 +} //End anonymous namespace
11432 +
11433 +void AMDGPUMCObjectWriter::WriteObject(MCAssembler &Asm,
11434 +                                       const MCAsmLayout &Layout) {
11435 +  // Emit the data of every section, in the assembler's iteration order.
11436 +  for (MCAssembler::iterator Sec = Asm.begin(); Sec != Asm.end(); ++Sec)
11437 +    Asm.writeSectionData(Sec, Layout);
11438 +}
11439 +
11440 +MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT,
11441 +                                           StringRef CPU) {
11442 +  return new AMDGPUAsmBackend(T); // TT and CPU are not consulted.
11443 +}
11444 +
11445 +AMDGPUMCObjectWriter * AMDGPUAsmBackend::createObjectWriter(
11446 +                                                        raw_ostream &OS) const {
11447 +  return new AMDGPUMCObjectWriter(OS); // Fresh heap-allocated writer on OS.
11448 +}
11449 +
11450 +void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
11451 +                                  unsigned DataSize, uint64_t Value) const {
11452 +  // Byte-wise little-endian store: no misaligned or host-endian uint16_t.
11453 +  assert(Fixup.getKind() == FK_PCRel_4);
11454 +  Data[Fixup.getOffset()] = static_cast<char>((Value - 4) / 4);
11455 +  Data[Fixup.getOffset() + 1] = static_cast<char>(((Value - 4) / 4) >> 8);
11456 +}
11457 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
11458 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp       1970-01-01 01:00:00.000000000 +0100
11459 +++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp  2013-01-25 19:43:57.456716387 +0100
11460 @@ -0,0 +1,85 @@
11461 +//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===//
11462 +//
11463 +//                     The LLVM Compiler Infrastructure
11464 +//
11465 +// This file is distributed under the University of Illinois Open Source
11466 +// License. See LICENSE.TXT for details.
11467 +//
11468 +/// \file
11469 +//===----------------------------------------------------------------------===//
11470 +
11471 +#include "AMDGPUMCAsmInfo.h"
11472 +
11473 +using namespace llvm;
11474 +AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() {
11475 +  HasSingleParameterDotFile = false;
11476 +  WeakDefDirective = 0;
11477 +  //===--- General Syntax (neither T nor TT is consulted) ---------------===//
11478 +  HasSubsectionsViaSymbols = true;
11479 +  HasMachoZeroFillDirective = false;
11480 +  HasMachoTBSSDirective = false;
11481 +  HasStaticCtorDtorReferenceInStaticMode = false;
11482 +  LinkerRequiresNonEmptyDwarfLines = true;
11483 +  MaxInstLength = 16;
11484 +  PCSymbol = "$";
11485 +  SeparatorString = "\n";
11486 +  CommentColumn = 40;
11487 +  CommentString = ";";
11488 +  LabelSuffix = ":";
11489 +  GlobalPrefix = "@";
11490 +  PrivateGlobalPrefix = ";.";
11491 +  LinkerPrivateGlobalPrefix = "!";
11492 +  InlineAsmStart = ";#ASMSTART";
11493 +  InlineAsmEnd = ";#ASMEND";
11494 +  AssemblerDialect = 0;
11495 +  AllowQuotesInName = false;
11496 +  AllowNameToStartWithDigit = false;
11497 +  AllowPeriodsInName = false;
11498 +
11499 +  //===--- Data Emission Directives -------------------------------------===//
11500 +  ZeroDirective = ".zero";
11501 +  AsciiDirective = ".ascii\t";
11502 +  AscizDirective = ".asciz\t";
11503 +  Data8bitsDirective = ".byte\t";
11504 +  Data16bitsDirective = ".short\t";
11505 +  Data32bitsDirective = ".long\t";
11506 +  Data64bitsDirective = ".quad\t";
11507 +  GPRel32Directive = 0;
11508 +  SunStyleELFSectionSwitchSyntax = true;
11509 +  UsesELFSectionDirectiveForBSS = true;
11510 +  HasMicrosoftFastStdCallMangling = false;
11511 +
11512 +  //===--- Alignment Information ----------------------------------------===//
11513 +  AlignDirective = ".align\t";
11514 +  AlignmentIsInBytes = true;
11515 +  TextAlignFillValue = 0;
11516 +
11517 +  //===--- Global Variable Emission Directives --------------------------===//
11518 +  GlobalDirective = ".global";
11519 +  ExternDirective = ".extern";
11520 +  HasSetDirective = false;
11521 +  HasAggressiveSymbolFolding = true;
11522 +  COMMDirectiveAlignmentIsInBytes = false;
11523 +  HasDotTypeDotSizeDirective = false;
11524 +  HasNoDeadStrip = true;
11525 +  HasSymbolResolver = false;
11526 +  WeakRefDirective = ".weakref\t";
11527 +  LinkOnceDirective = 0;
11528 +  //===--- Dwarf Emission Directives -----------------------------------===//
11529 +  HasLEB128 = true;
11530 +  SupportsDebugInformation = true;
11531 +  ExceptionsType = ExceptionHandling::None;
11532 +  DwarfUsesInlineInfoSection = false;
11533 +  DwarfSectionOffsetDirective = ".offset";
11534 +
11535 +}
11536 +
11537 +const char*
11538 +AMDGPUMCAsmInfo::getDataASDirective(unsigned int Size, unsigned int AS) const {
11539 +  return 0; // Always null, whatever Size and AS are.
11540 +}
11541 +
11542 +const MCSection*
11543 +AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const {
11544 +  return 0; // Always null; CTX is not consulted.
11545 +}
11546 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
11547 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h 1970-01-01 01:00:00.000000000 +0100
11548 +++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h    2013-01-25 19:43:57.456716387 +0100
11549 @@ -0,0 +1,30 @@
11550 +//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface  ----------===//
11551 +//
11552 +//                     The LLVM Compiler Infrastructure
11553 +//
11554 +// This file is distributed under the University of Illinois Open Source
11555 +// License. See LICENSE.TXT for details.
11556 +//
11557 +//===----------------------------------------------------------------------===//
11558 +//
11559 +/// \file
11560 +//
11561 +//===----------------------------------------------------------------------===//
11562 +
11563 +#ifndef AMDGPUMCASMINFO_H
11564 +#define AMDGPUMCASMINFO_H
11565 +
11566 +#include "llvm/MC/MCAsmInfo.h"
11567 +namespace llvm {
11568 +
11569 +class Target;
11570 +class StringRef;
11571 +
11572 +class AMDGPUMCAsmInfo : public MCAsmInfo { // MCAsmInfo settings for AMDGPU.
11573 +public:
11574 +  explicit AMDGPUMCAsmInfo(const Target &T, StringRef &TT);
11575 +  const char* getDataASDirective(unsigned int Size, unsigned int AS) const;
11576 +  const MCSection* getNonexecutableStackSection(MCContext &CTX) const;
11577 +};
11578 +} // namespace llvm
11579 +#endif // AMDGPUMCASMINFO_H
11580 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
11581 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h     1970-01-01 01:00:00.000000000 +0100
11582 +++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h        2013-01-25 19:43:57.456716387 +0100
11583 @@ -0,0 +1,60 @@
11584 +//===-- AMDGPUMCCodeEmitter.h - AMDGPU Code Emitter interface -------------===//
11585 +//
11586 +//                     The LLVM Compiler Infrastructure
11587 +//
11588 +// This file is distributed under the University of Illinois Open Source
11589 +// License. See LICENSE.TXT for details.
11590 +//
11591 +//===----------------------------------------------------------------------===//
11592 +//
11593 +/// \file
11594 +/// \brief CodeEmitter interface for R600 and SI codegen.
11595 +//
11596 +//===----------------------------------------------------------------------===//
11597 +
11598 +#ifndef AMDGPUCODEEMITTER_H
11599 +#define AMDGPUCODEEMITTER_H
11600 +
11601 +#include "llvm/MC/MCCodeEmitter.h"
11602 +#include "llvm/Support/raw_ostream.h"
11603 +
11604 +namespace llvm {
11605 +
11606 +class MCInst;
11607 +class MCOperand;
11608 +
11609 +class AMDGPUMCCodeEmitter : public MCCodeEmitter {
11610 +public:
11611 +  // Declared only; presumably defined by tblgen-generated code -- confirm.
11612 +  uint64_t getBinaryCodeForInstr(const MCInst &MI,
11613 +                                 SmallVectorImpl<MCFixup> &Fixups) const;
11614 +  // The virtual hooks below are overridable defaults returning 0 or Value.
11615 +  virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
11616 +                                     SmallVectorImpl<MCFixup> &Fixups) const {
11617 +    return 0;
11618 +  }
11619 +
11620 +  virtual unsigned GPR4AlignEncode(const MCInst  &MI, unsigned OpNo,
11621 +                                   SmallVectorImpl<MCFixup> &Fixups) const {
11622 +    return 0;
11623 +  }
11624 +  virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo,
11625 +                                   SmallVectorImpl<MCFixup> &Fixups) const {
11626 +    return 0;
11627 +  }
11628 +  virtual uint64_t VOPPostEncode(const MCInst &MI, uint64_t Value) const {
11629 +    return Value;
11630 +  }
11631 +  virtual uint64_t i32LiteralEncode(const MCInst &MI, unsigned OpNo,
11632 +                                   SmallVectorImpl<MCFixup> &Fixups) const {
11633 +    return 0;
11634 +  }
11635 +  virtual uint32_t SMRDmemriEncode(const MCInst &MI, unsigned OpNo,
11636 +                                   SmallVectorImpl<MCFixup> &Fixups) const {
11637 +    return 0;
11638 +  }
11639 +};
11640 +
11641 +} // End namespace llvm
11642 +
11643 +#endif // AMDGPUCODEEMITTER_H
11644 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
11645 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp    1970-01-01 01:00:00.000000000 +0100
11646 +++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp       2013-01-25 19:43:57.460049721 +0100
11647 @@ -0,0 +1,113 @@
11648 +//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===//
11649 +//
11650 +//                     The LLVM Compiler Infrastructure
11651 +//
11652 +// This file is distributed under the University of Illinois Open Source
11653 +// License. See LICENSE.TXT for details.
11654 +//
11655 +//===----------------------------------------------------------------------===//
11656 +//
11657 +/// \file
11658 +/// \brief This file provides AMDGPU specific target descriptions.
11659 +//
11660 +//===----------------------------------------------------------------------===//
11661 +
11662 +#include "AMDGPUMCTargetDesc.h"
11663 +#include "AMDGPUMCAsmInfo.h"
11664 +#include "InstPrinter/AMDGPUInstPrinter.h"
11665 +#include "llvm/MC/MachineLocation.h"
11666 +#include "llvm/MC/MCCodeGenInfo.h"
11667 +#include "llvm/MC/MCInstrInfo.h"
11668 +#include "llvm/MC/MCRegisterInfo.h"
11669 +#include "llvm/MC/MCStreamer.h"
11670 +#include "llvm/MC/MCSubtargetInfo.h"
11671 +#include "llvm/Support/ErrorHandling.h"
11672 +#include "llvm/Support/TargetRegistry.h"
11673 +
11674 +#define GET_INSTRINFO_MC_DESC
11675 +#include "AMDGPUGenInstrInfo.inc"
11676 +
11677 +#define GET_SUBTARGETINFO_MC_DESC
11678 +#include "AMDGPUGenSubtargetInfo.inc"
11679 +
11680 +#define GET_REGINFO_MC_DESC
11681 +#include "AMDGPUGenRegisterInfo.inc"
11682 +
11683 +using namespace llvm;
11684 +
11685 +static MCInstrInfo *createAMDGPUMCInstrInfo() {
11686 +  MCInstrInfo *InstrInfo = new MCInstrInfo(); // Returned heap-allocated.
11687 +  InitAMDGPUMCInstrInfo(InstrInfo);
11688 +  return InstrInfo;
11689 +}
11690 +
11691 +static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) {
11692 +  MCRegisterInfo *RegInfo = new MCRegisterInfo(); // TT is not consulted.
11693 +  InitAMDGPUMCRegisterInfo(RegInfo, 0);
11694 +  return RegInfo;
11695 +}
11696 +
11697 +static MCSubtargetInfo *createAMDGPUMCSubtargetInfo(StringRef TT, StringRef CPU,
11698 +                                                   StringRef FS) {
11699 +  MCSubtargetInfo *SubtargetInfo = new MCSubtargetInfo();
11700 +  InitAMDGPUMCSubtargetInfo(SubtargetInfo, TT, CPU, FS);
11701 +  return SubtargetInfo;
11702 +}
11703 +
11704 +static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM,
11705 +                                               CodeModel::Model CM,
11706 +                                               CodeGenOpt::Level OL) {
11707 +  MCCodeGenInfo *X = new MCCodeGenInfo();
11708 +  X->InitMCCodeGenInfo(RM, CM, OL);
11709 +  return X;
11710 +}
11711 +
11712 +static MCInstPrinter *createAMDGPUMCInstPrinter(const Target &T,
11713 +                                                unsigned SyntaxVariant,
11714 +                                                const MCAsmInfo &MAI,
11715 +                                                const MCInstrInfo &MII,
11716 +                                                const MCRegisterInfo &MRI,
11717 +                                                const MCSubtargetInfo &STI) {
11718 +  return new AMDGPUInstPrinter(MAI, MII, MRI); // Other arguments unused.
11719 +}
11720 +
11721 +static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII,
11722 +                                                const MCRegisterInfo &MRI,
11723 +                                                const MCSubtargetInfo &STI,
11724 +                                                MCContext &Ctx) {
11725 +  if (STI.getFeatureBits() & AMDGPU::Feature64BitPtr) {
11726 +    return createSIMCCodeEmitter(MCII, MRI, STI, Ctx);
11727 +  } else {
11728 +    return createR600MCCodeEmitter(MCII, MRI, STI, Ctx);
11729 +  }
11730 +}
11731 +
11732 +static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
11733 +                                    MCContext &Ctx, MCAsmBackend &MAB,
11734 +                                    raw_ostream &_OS,
11735 +                                    MCCodeEmitter *_Emitter,
11736 +                                    bool RelaxAll,
11737 +                                    bool NoExecStack) {
11738 +  return createPureStreamer(Ctx, MAB, _OS, _Emitter); // Other args unused.
11739 +}
11740 +
11741 +extern "C" void LLVMInitializeR600TargetMC() {
11742 +  // Hook the AMDGPU MC-layer factories into the TargetRegistry.
11743 +  RegisterMCAsmInfo<AMDGPUMCAsmInfo> Y(TheAMDGPUTarget);
11744 +
11745 +  TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDGPUMCCodeGenInfo);
11746 +
11747 +  TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDGPUMCInstrInfo);
11748 +
11749 +  TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDGPUMCRegisterInfo);
11750 +
11751 +  TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDGPUMCSubtargetInfo);
11752 +
11753 +  TargetRegistry::RegisterMCInstPrinter(TheAMDGPUTarget, createAMDGPUMCInstPrinter);
11754 +
11755 +  TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, createAMDGPUMCCodeEmitter);
11756 +
11757 +  TargetRegistry::RegisterMCAsmBackend(TheAMDGPUTarget, createAMDGPUAsmBackend);
11758 +
11759 +  TargetRegistry::RegisterMCObjectStreamer(TheAMDGPUTarget, createMCStreamer);
11760 +}
11761 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
11762 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h      1970-01-01 01:00:00.000000000 +0100
11763 +++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h 2013-01-25 19:43:57.460049721 +0100
11764 @@ -0,0 +1,55 @@
11765 +//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===//
11766 +//
11767 +//                     The LLVM Compiler Infrastructure
11768 +//
11769 +// This file is distributed under the University of Illinois Open Source
11770 +// License. See LICENSE.TXT for details.
11771 +//
11772 +//===----------------------------------------------------------------------===//
11773 +//
11774 +/// \file
11775 +/// \brief Provides AMDGPU specific target descriptions.
11776 +//
11777 +//===----------------------------------------------------------------------===//
11778 +//
11779 +
11780 +#ifndef AMDGPUMCTARGETDESC_H
11781 +#define AMDGPUMCTARGETDESC_H
11782 +
11783 +#include "llvm/ADT/StringRef.h"
11784 +
11785 +namespace llvm {
11786 +class MCAsmBackend;
11787 +class MCCodeEmitter;
11788 +class MCContext;
11789 +class MCInstrInfo;
11790 +class MCRegisterInfo;
11791 +class MCSubtargetInfo;
11792 +class Target;
11793 +
11794 +extern Target TheAMDGPUTarget;
11795 +
11796 +MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
11797 +                                       const MCRegisterInfo &MRI,
11798 +                                       const MCSubtargetInfo &STI,
11799 +                                       MCContext &Ctx);
11800 +
11801 +MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
11802 +                                     const MCRegisterInfo &MRI,
11803 +                                     const MCSubtargetInfo &STI,
11804 +                                     MCContext &Ctx);
11805 +
11806 +MCAsmBackend *createAMDGPUAsmBackend(const Target &T, StringRef TT,
11807 +                                     StringRef CPU);
11808 +} // End llvm namespace
11809 +
11810 +#define GET_REGINFO_ENUM
11811 +#include "AMDGPUGenRegisterInfo.inc"
11812 +
11813 +#define GET_INSTRINFO_ENUM
11814 +#include "AMDGPUGenInstrInfo.inc"
11815 +
11816 +#define GET_SUBTARGETINFO_ENUM
11817 +#include "AMDGPUGenSubtargetInfo.inc"
11818 +
11819 +#endif // AMDGPUMCTARGETDESC_H
11820 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/CMakeLists.txt llvm-r600/lib/Target/R600/MCTargetDesc/CMakeLists.txt
11821 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/CMakeLists.txt    1970-01-01 01:00:00.000000000 +0100
11822 +++ llvm-r600/lib/Target/R600/MCTargetDesc/CMakeLists.txt       2013-01-25 19:43:57.460049721 +0100
11823 @@ -0,0 +1,10 @@
11824 +
11825 +add_llvm_library(LLVMR600Desc
11826 +  AMDGPUAsmBackend.cpp
11827 +  AMDGPUMCTargetDesc.cpp
11828 +  AMDGPUMCAsmInfo.cpp
11829 +  R600MCCodeEmitter.cpp
11830 +  SIMCCodeEmitter.cpp
11831 +  )
11832 +
11833 +add_dependencies(LLVMR600Desc AMDGPUCommonTableGen)
11834 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/LLVMBuild.txt llvm-r600/lib/Target/R600/MCTargetDesc/LLVMBuild.txt
11835 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/LLVMBuild.txt     1970-01-01 01:00:00.000000000 +0100
11836 +++ llvm-r600/lib/Target/R600/MCTargetDesc/LLVMBuild.txt        2013-01-25 19:43:57.460049721 +0100
11837 @@ -0,0 +1,23 @@
11838 +;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===;
11839 +;
11840 +;                     The LLVM Compiler Infrastructure
11841 +;
11842 +; This file is distributed under the University of Illinois Open Source
11843 +; License. See LICENSE.TXT for details.
11844 +;
11845 +;===------------------------------------------------------------------------===;
11846 +;
11847 +; This is an LLVMBuild description file for the components in this subdirectory.
11848 +;
11849 +; For more information on the LLVMBuild system, please see:
11850 +;
11851 +;   http://llvm.org/docs/LLVMBuild.html
11852 +;
11853 +;===------------------------------------------------------------------------===;
11854 +
11855 +[component_0]
11856 +type = Library
11857 +name = R600Desc
11858 +parent = R600
11859 +required_libraries = R600AsmPrinter R600Info MC
11860 +add_to_library_groups = R600
11861 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/Makefile llvm-r600/lib/Target/R600/MCTargetDesc/Makefile
11862 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/Makefile  1970-01-01 01:00:00.000000000 +0100
11863 +++ llvm-r600/lib/Target/R600/MCTargetDesc/Makefile     2013-01-25 19:43:57.460049721 +0100
11864 @@ -0,0 +1,16 @@
11865 +##===- lib/Target/R600/MCTargetDesc/Makefile ----------------*- Makefile -*-===##
11866 +#
11867 +#                     The LLVM Compiler Infrastructure
11868 +#
11869 +# This file is distributed under the University of Illinois Open Source
11870 +# License. See LICENSE.TXT for details.
11871 +#
11872 +##===----------------------------------------------------------------------===##
11873 +
11874 +LEVEL = ../../../..
11875 +LIBRARYNAME = LLVMR600Desc
11876 +
11877 +# Hack: we need to include 'main' target directory to grab private headers
11878 +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
11879 +
11880 +include $(LEVEL)/Makefile.common
11881 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp llvm-r600/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
11882 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp     1970-01-01 01:00:00.000000000 +0100
11883 +++ llvm-r600/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp        2013-01-25 19:43:57.460049721 +0100
11884 @@ -0,0 +1,580 @@
11885 +//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===//
11886 +//
11887 +//                     The LLVM Compiler Infrastructure
11888 +//
11889 +// This file is distributed under the University of Illinois Open Source
11890 +// License. See LICENSE.TXT for details.
11891 +//
11892 +//===----------------------------------------------------------------------===//
11893 +//
11894 +/// \file
11895 +///
11896 +/// This code emitter outputs bytecode that is understood by the r600g driver
11897 +/// in the Mesa [1] project.  The bytecode is very similar to the hardware's ISA,
11898 +/// but it still needs to be run through a finalizer in order to be executed
11899 +/// by the GPU.
11900 +///
11901 +/// [1] http://www.mesa3d.org/
11902 +//
11903 +//===----------------------------------------------------------------------===//
11904 +
11905 +#include "R600Defines.h"
11906 +#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
11907 +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
11908 +#include "llvm/MC/MCCodeEmitter.h"
11909 +#include "llvm/MC/MCContext.h"
11910 +#include "llvm/MC/MCInst.h"
11911 +#include "llvm/MC/MCInstrInfo.h"
11912 +#include "llvm/MC/MCRegisterInfo.h"
11913 +#include "llvm/MC/MCSubtargetInfo.h"
11914 +#include "llvm/Support/raw_ostream.h"
11915 +
11916 +#include <stdio.h>
11917 +
11918 +#define SRC_BYTE_COUNT 11
11919 +#define DST_BYTE_COUNT 5
11920 +
11921 +using namespace llvm;
11922 +
11923 +namespace {
11924 +
11925 +class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
11926 +  R600MCCodeEmitter(const R600MCCodeEmitter &); // DO NOT IMPLEMENT
11927 +  void operator=(const R600MCCodeEmitter &); // DO NOT IMPLEMENT
11928 +  const MCInstrInfo &MCII;
11929 +  const MCRegisterInfo &MRI;
11930 +  const MCSubtargetInfo &STI;
11931 +  MCContext &Ctx;
11932 +
11933 +public:
11934 +
11935 +  R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
11936 +                    const MCSubtargetInfo &sti, MCContext &ctx)
11937 +    : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { }
11938 +
11939 +  /// \brief Encode the instruction and write it to the OS.
11940 +  virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
11941 +                         SmallVectorImpl<MCFixup> &Fixups) const;
11942 +
11943 +  /// \returns the encoding for an MCOperand.
11944 +  virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
11945 +                                     SmallVectorImpl<MCFixup> &Fixups) const;
11946 +private:
11947 +
11948 +  void EmitALUInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups,
11949 +                    raw_ostream &OS) const;
11950 +  void EmitSrc(const MCInst &MI, unsigned OpIdx, raw_ostream &OS) const;
11951 +  void EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, unsigned SelOpIdx,
11952 +                    raw_ostream &OS) const;
11953 +  void EmitDst(const MCInst &MI, raw_ostream &OS) const;
11954 +  void EmitTexInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups,
11955 +                    raw_ostream &OS) const;
11956 +  void EmitFCInstr(const MCInst &MI, raw_ostream &OS) const;
11957 +
11958 +  void EmitNullBytes(unsigned int byteCount, raw_ostream &OS) const;
11959 +
11960 +  void EmitByte(unsigned int byte, raw_ostream &OS) const;
11961 +
11962 +  void EmitTwoBytes(uint32_t bytes, raw_ostream &OS) const;
11963 +
11964 +  void Emit(uint32_t value, raw_ostream &OS) const;
11965 +  void Emit(uint64_t value, raw_ostream &OS) const;
11966 +
11967 +  unsigned getHWRegChan(unsigned reg) const;
11968 +  unsigned getHWReg(unsigned regNo) const;
11969 +
11970 +  bool isFCOp(unsigned opcode) const;
11971 +  bool isTexOp(unsigned opcode) const;
11972 +  bool isFlagSet(const MCInst &MI, unsigned Operand, unsigned Flag) const;
11973 +
11974 +};
11975 +
11976 +} // End anonymous namespace
11977 +
11978 +enum RegElement {
11979 +  ELEMENT_X = 0,
11980 +  ELEMENT_Y,
11981 +  ELEMENT_Z,
11982 +  ELEMENT_W
11983 +};
11984 +
11985 +enum InstrTypes { // Tag byte written first for each emitted instruction.
11986 +  INSTR_ALU = 0,  // ALU instruction (EmitALUInstr)
11987 +  INSTR_TEX,      // texture instruction (EmitTexInstr writes this value, 1)
11988 +  INSTR_FC,       // flow-control instruction (EmitFCInstr)
11989 +  INSTR_NATIVE,   // RAT_WRITE_CACHELESS_*: raw 64-bit encoding follows
11990 +  INSTR_VTX,      // VTX reads / constant loads: 64-bit encoding + 32-bit offset
11991 +  INSTR_EXPORT    // export instructions: raw 64-bit encoding follows
11992 +};
11993 +
11994 +enum FCInstr {
11995 +  FC_IF_PREDICATE = 0,
11996 +  FC_ELSE,
11997 +  FC_ENDIF,
11998 +  FC_BGNLOOP,
11999 +  FC_ENDLOOP,
12000 +  FC_BREAK_PREDICATE,
12001 +  FC_CONTINUE
12002 +};
12003 +
12004 +enum TextureTypes {
12005 +  TEXTURE_1D = 1,
12006 +  TEXTURE_2D,
12007 +  TEXTURE_3D,
12008 +  TEXTURE_CUBE,
12009 +  TEXTURE_RECT,
12010 +  TEXTURE_SHADOW1D,
12011 +  TEXTURE_SHADOW2D,
12012 +  TEXTURE_SHADOWRECT,
12013 +  TEXTURE_1D_ARRAY,
12014 +  TEXTURE_2D_ARRAY,
12015 +  TEXTURE_SHADOW1D_ARRAY,
12016 +  TEXTURE_SHADOW2D_ARRAY
12017 +};
12018 +
12019 +MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
12020 +                                           const MCRegisterInfo &MRI,
12021 +                                           const MCSubtargetInfo &STI,
12022 +                                           MCContext &Ctx) {
12023 +  return new R600MCCodeEmitter(MCII, MRI, STI, Ctx);
12024 +}
12025 +
12026 +void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
12027 +                                       SmallVectorImpl<MCFixup> &Fixups) const {
12028 +  if (isTexOp(MI.getOpcode())) {
12029 +    EmitTexInstr(MI, Fixups, OS);
12030 +  } else if (isFCOp(MI.getOpcode())){
12031 +    EmitFCInstr(MI, OS);
12032 +  } else if (MI.getOpcode() == AMDGPU::RETURN ||
12033 +    MI.getOpcode() == AMDGPU::BUNDLE ||
12034 +    MI.getOpcode() == AMDGPU::KILL) {
12035 +    return;
12036 +  } else {
12037 +    switch(MI.getOpcode()) {
12038 +    case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
12039 +    case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
12040 +      uint64_t inst = getBinaryCodeForInstr(MI, Fixups);
12041 +      EmitByte(INSTR_NATIVE, OS);
12042 +      Emit(inst, OS);
12043 +      break;
12044 +    }
12045 +    case AMDGPU::CONSTANT_LOAD_eg:
12046 +    case AMDGPU::VTX_READ_PARAM_8_eg:
12047 +    case AMDGPU::VTX_READ_PARAM_16_eg:
12048 +    case AMDGPU::VTX_READ_PARAM_32_eg:
12049 +    case AMDGPU::VTX_READ_GLOBAL_8_eg:
12050 +    case AMDGPU::VTX_READ_GLOBAL_32_eg:
12051 +    case AMDGPU::VTX_READ_GLOBAL_128_eg:
12052 +    case AMDGPU::TEX_VTX_CONSTBUF: {
12053 +      uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
12054 +      uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
12055 +
12056 +      EmitByte(INSTR_VTX, OS);
12057 +      Emit(InstWord01, OS);
12058 +      Emit(InstWord2, OS);
12059 +      break;
12060 +    }
12061 +    case AMDGPU::EG_ExportSwz:
12062 +    case AMDGPU::R600_ExportSwz:
12063 +    case AMDGPU::EG_ExportBuf:
12064 +    case AMDGPU::R600_ExportBuf: {
12065 +      uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
12066 +      EmitByte(INSTR_EXPORT, OS);
12067 +      Emit(Inst, OS);
12068 +      break;
12069 +    }
12070 +
12071 +    default:
12072 +      EmitALUInstr(MI, Fixups, OS);
12073 +      break;
12074 +    }
12075 +  }
12076 +}
12077 +
12078 +void R600MCCodeEmitter::EmitALUInstr(const MCInst &MI,
12079 +                                     SmallVectorImpl<MCFixup> &Fixups,
12080 +                                     raw_ostream &OS) const {
12081 +  const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode());
12082 +
12083 +  // Emit instruction type
12084 +  EmitByte(INSTR_ALU, OS);
12085 +
12086 +  uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
12087 +
12088 +  //older alu have different encoding for instructions with one or two src
12089 +  //parameters.
12090 +  if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) &&
12091 +      !(MCDesc.TSFlags & R600_InstFlag::OP3)) {
12092 +    uint64_t ISAOpCode = InstWord01 & (0x3FFULL << 39);
12093 +    InstWord01 &= ~(0x3FFULL << 39);
12094 +    InstWord01 |= ISAOpCode << 1;
12095 +  }
12096 +
12097 +  unsigned SrcNum = MCDesc.TSFlags & R600_InstFlag::OP3 ? 3 :
12098 +      MCDesc.TSFlags & R600_InstFlag::OP2 ? 2 : 1;
12099 +
12100 +  EmitByte(SrcNum, OS);
12101 +
12102 +  const unsigned SrcOps[3][2] = {
12103 +      {R600Operands::SRC0, R600Operands::SRC0_SEL},
12104 +      {R600Operands::SRC1, R600Operands::SRC1_SEL},
12105 +      {R600Operands::SRC2, R600Operands::SRC2_SEL}
12106 +  };
12107 +
12108 +  for (unsigned SrcIdx = 0; SrcIdx < SrcNum; ++SrcIdx) {
12109 +    unsigned RegOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][0]];
12110 +    unsigned SelOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][1]];
12111 +    EmitSrcISA(MI, RegOpIdx, SelOpIdx, OS);
12112 +  }
12113 +
12114 +  Emit(InstWord01, OS);
12115 +  return;
12116 +}
12117 +
12118 +void R600MCCodeEmitter::EmitSrc(const MCInst &MI, unsigned OpIdx,
12119 +                                raw_ostream &OS) const {
12120 +  const MCOperand &MO = MI.getOperand(OpIdx);
12121 +  union {
12122 +    float f;
12123 +    uint32_t i;
12124 +  } Value;
12125 +  Value.i = 0;
12126 +  // Emit the source select (2 bytes).  For GPRs, this is the register index.
12127 +  // For other potential instruction operands, (e.g. constant registers) the
12128 +  // value of the source select is defined in the r600isa docs.
12129 +  if (MO.isReg()) {
12130 +    unsigned reg = MO.getReg();
12131 +    EmitTwoBytes(getHWReg(reg), OS);
12132 +    if (reg == AMDGPU::ALU_LITERAL_X) {
12133 +      unsigned ImmOpIndex = MI.getNumOperands() - 1;
12134 +      MCOperand ImmOp = MI.getOperand(ImmOpIndex);
12135 +      if (ImmOp.isFPImm()) {
12136 +        Value.f = ImmOp.getFPImm();
12137 +      } else {
12138 +        assert(ImmOp.isImm());
12139 +        Value.i = ImmOp.getImm();
12140 +      }
12141 +    }
12142 +  } else {
12143 +    // XXX: Handle other operand types.
12144 +    EmitTwoBytes(0, OS);
12145 +  }
12146 +
12147 +  // Emit the source channel (1 byte)
12148 +  if (MO.isReg()) {
12149 +    EmitByte(getHWRegChan(MO.getReg()), OS);
12150 +  } else {
12151 +    EmitByte(0, OS);
12152 +  }
12153 +
12154 +  // XXX: Emit isNegated (1 byte)
12155 +  if ((!(isFlagSet(MI, OpIdx, MO_FLAG_ABS)))
12156 +      && (isFlagSet(MI, OpIdx, MO_FLAG_NEG) ||
12157 +     (MO.isReg() &&
12158 +      (MO.getReg() == AMDGPU::NEG_ONE || MO.getReg() == AMDGPU::NEG_HALF)))){
12159 +    EmitByte(1, OS);
12160 +  } else {
12161 +    EmitByte(0, OS);
12162 +  }
12163 +
12164 +  // Emit isAbsolute (1 byte)
12165 +  if (isFlagSet(MI, OpIdx, MO_FLAG_ABS)) {
12166 +    EmitByte(1, OS);
12167 +  } else {
12168 +    EmitByte(0, OS);
12169 +  }
12170 +
12171 +  // XXX: Emit relative addressing mode (1 byte)
12172 +  EmitByte(0, OS);
12173 +
12174 +  // Emit kc_bank, This will be adjusted later by r600_asm
12175 +  EmitByte(0, OS);
12176 +
12177 +  // Emit the literal value, if applicable (4 bytes).
12178 +  Emit(Value.i, OS);
12179 +
12180 +}
12181 +
12182 +void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned RegOpIdx,
12183 +                                   unsigned SelOpIdx, raw_ostream &OS) const {
12184 +  const MCOperand &RegMO = MI.getOperand(RegOpIdx);
12185 +  const MCOperand &SelMO = MI.getOperand(SelOpIdx);
12186 +  // Writes 9 bytes per source: type (1), select (4), literal (4).
12187 +  union {
12188 +    float f;
12189 +    uint32_t i;
12190 +  } InlineConstant; // lets a float immediate be emitted as its 32-bit pattern
12191 +  InlineConstant.i = 0;
12192 +  // Emit source type (1 byte) and source select (4 bytes). For ALU_CONST the
12193 +  // type is 1 and the select is the immediate of the SelOpIdx operand. For any
12194 +  // other register the type and select are 0 (GPR index is in the encoding).
12195 +  unsigned Reg = RegMO.getReg();
12196 +  if (Reg == AMDGPU::ALU_CONST) {
12197 +    EmitByte(1, OS);
12198 +    uint32_t Sel = SelMO.getImm();
12199 +    Emit(Sel, OS);
12200 +  } else {
12201 +    EmitByte(0, OS);
12202 +    Emit((uint32_t)0, OS);
12203 +  }
12204 +
12205 +  if (Reg == AMDGPU::ALU_LITERAL_X) {
12206 +    unsigned ImmOpIndex = MI.getNumOperands() - 1; // literal is last operand
12207 +    MCOperand ImmOp = MI.getOperand(ImmOpIndex);
12208 +    if (ImmOp.isFPImm()) {
12209 +      InlineConstant.f = ImmOp.getFPImm();
12210 +    } else {
12211 +      assert(ImmOp.isImm());
12212 +      InlineConstant.i = ImmOp.getImm();
12213 +    }
12214 +  }
12215 +
12216 +  // Emit the literal value (4 bytes); zero unless Reg is ALU_LITERAL_X.
12217 +  Emit(InlineConstant.i, OS);
12218 +}
12219 +
12220 +void R600MCCodeEmitter::EmitTexInstr(const MCInst &MI,
12221 +                                     SmallVectorImpl<MCFixup> &Fixups,
12222 +                                     raw_ostream &OS) const {
12223 +  // Serializes a texture instruction as a fixed sequence of single bytes.
12224 +  unsigned Opcode = MI.getOpcode();
12225 +  bool hasOffsets = (Opcode == AMDGPU::TEX_LD);
12226 +  unsigned OpOffset = hasOffsets ? 3 : 0; // TEX_LD: operands 2-4 are offsets
12227 +  int64_t Resource = MI.getOperand(OpOffset + 2).getImm();
12228 +  int64_t Sampler = MI.getOperand(OpOffset + 3).getImm();
12229 +  int64_t TextureType = MI.getOperand(OpOffset + 4).getImm();
12230 +  unsigned srcSelect[4] = {0, 1, 2, 3};
12231 +
12232 +  // Emit instruction type
12233 +  EmitByte(INSTR_TEX, OS);
12234 +
12235 +  // Emit instruction (only the low byte of the generated encoding is kept)
12236 +  EmitByte(getBinaryCodeForInstr(MI, Fixups), OS);
12237 +
12238 +  // Emit resource id
12239 +  EmitByte(Resource, OS);
12240 +
12241 +  // Emit source register
12242 +  EmitByte(getHWReg(MI.getOperand(1).getReg()), OS);
12243 +
12244 +  // XXX: Emit src isRelativeAddress
12245 +  EmitByte(0, OS);
12246 +
12247 +  // Emit destination register
12248 +  EmitByte(getHWReg(MI.getOperand(0).getReg()), OS);
12249 +
12250 +  // XXX: Emit dst isRelativeAddress
12251 +  EmitByte(0, OS);
12252 +
12253 +  // XXX: Emit dst select
12254 +  EmitByte(0, OS); // X
12255 +  EmitByte(1, OS); // Y
12256 +  EmitByte(2, OS); // Z
12257 +  EmitByte(3, OS); // W
12258 +
12259 +  // XXX: Emit lod bias
12260 +  EmitByte(0, OS);
12261 +
12262 +  // XXX: Emit coord types
12263 +  unsigned coordType[4] = {1, 1, 1, 1};
12264 +
12265 +  if (TextureType == TEXTURE_RECT
12266 +      || TextureType == TEXTURE_SHADOWRECT) {
12267 +    coordType[ELEMENT_X] = 0;
12268 +    coordType[ELEMENT_Y] = 0;
12269 +  }
12270 +
12271 +  if (TextureType == TEXTURE_1D_ARRAY
12272 +      || TextureType == TEXTURE_SHADOW1D_ARRAY) {
12273 +    if (Opcode == AMDGPU::TEX_SAMPLE_C_L || Opcode == AMDGPU::TEX_SAMPLE_C_LB) {
12274 +      coordType[ELEMENT_Y] = 0;
12275 +    } else {
12276 +      coordType[ELEMENT_Z] = 0;
12277 +      srcSelect[ELEMENT_Z] = ELEMENT_Y;
12278 +    }
12279 +  } else if (TextureType == TEXTURE_2D_ARRAY
12280 +             || TextureType == TEXTURE_SHADOW2D_ARRAY) {
12281 +    coordType[ELEMENT_Z] = 0;
12282 +  }
12283 +
12284 +  for (unsigned i = 0; i < 4; i++) {
12285 +    EmitByte(coordType[i], OS);
12286 +  }
12287 +
12288 +  // XXX: Emit offsets; '* 2' (not '<< 1') keeps negative offsets well defined
12289 +  if (hasOffsets)
12290 +    for (unsigned i = 2; i < 5; i++)
12291 +      EmitByte(MI.getOperand(i).getImm() * 2, OS);
12292 +  else
12293 +    EmitNullBytes(3, OS);
12294 +
12295 +  // Emit sampler id
12296 +  EmitByte(Sampler, OS);
12297 +
12298 +  // XXX:Emit source select
12299 +  if ((TextureType == TEXTURE_SHADOW1D
12300 +      || TextureType == TEXTURE_SHADOW2D
12301 +      || TextureType == TEXTURE_SHADOWRECT
12302 +      || TextureType == TEXTURE_SHADOW1D_ARRAY)
12303 +      && Opcode != AMDGPU::TEX_SAMPLE_C_L
12304 +      && Opcode != AMDGPU::TEX_SAMPLE_C_LB) {
12305 +    srcSelect[ELEMENT_W] = ELEMENT_Z;
12306 +  }
12307 +
12308 +  for (unsigned i = 0; i < 4; i++) {
12309 +    EmitByte(srcSelect[i], OS);
12310 +  }
12311 +}
12312 +
12313 +void R600MCCodeEmitter::EmitFCInstr(const MCInst &MI, raw_ostream &OS) const {
12314 +
12315 +  // Emit instruction type
12316 +  EmitByte(INSTR_FC, OS);
12317 +
12318 +  // Emit SRC
12319 +  unsigned NumOperands = MI.getNumOperands();
12320 +  if (NumOperands > 0) {
12321 +    assert(NumOperands == 1);
12322 +    EmitSrc(MI, 0, OS);
12323 +  } else {
12324 +    EmitNullBytes(SRC_BYTE_COUNT, OS);
12325 +  }
12326 +
12327 +  // Emit FC Instruction
12328 +  enum FCInstr instr;
12329 +  switch (MI.getOpcode()) {
12330 +  case AMDGPU::PREDICATED_BREAK:
12331 +    instr = FC_BREAK_PREDICATE;
12332 +    break;
12333 +  case AMDGPU::CONTINUE:
12334 +    instr = FC_CONTINUE;
12335 +    break;
12336 +  case AMDGPU::IF_PREDICATE_SET:
12337 +    instr = FC_IF_PREDICATE;
12338 +    break;
12339 +  case AMDGPU::ELSE:
12340 +    instr = FC_ELSE;
12341 +    break;
12342 +  case AMDGPU::ENDIF:
12343 +    instr = FC_ENDIF;
12344 +    break;
12345 +  case AMDGPU::ENDLOOP:
12346 +    instr = FC_ENDLOOP;
12347 +    break;
12348 +  case AMDGPU::WHILELOOP:
12349 +    instr = FC_BGNLOOP;
12350 +    break;
12351 +  default:
12352 +    abort();
12353 +    break;
12354 +  }
12355 +  EmitByte(instr, OS);
12356 +}
12357 +
12358 +void R600MCCodeEmitter::EmitNullBytes(unsigned int ByteCount,
12359 +                                      raw_ostream &OS) const {
12360 +  // Pad the stream with ByteCount zero bytes.
12361 +  for (unsigned int Remaining = ByteCount; Remaining != 0; --Remaining) {
12362 +    EmitByte(0, OS);
12363 +  }
12364 +}
12365 +
12366 +void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const {
12367 +  OS.write((uint8_t) Byte & 0xff);
12368 +}
12369 +
12370 +void R600MCCodeEmitter::EmitTwoBytes(uint32_t Bytes,
12371 +                                     raw_ostream &OS) const {
12372 +  OS.write((uint8_t) (Bytes & 0xff));        // bits 7:0 first
12373 +  OS.write((uint8_t) ((Bytes >> 8) & 0xff)); // then bits 15:8
12374 +}
12375 +
12376 +void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
12377 +  for (unsigned Shift = 0; Shift != 32; Shift += 8) { // low byte first
12378 +    OS.write((uint8_t) ((Value >> Shift) & 0xff));
12379 +  }
12380 +}
12381 +
12382 +void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const {
12383 +  for (unsigned Shift = 0; Shift != 64; Shift += 8) { // low byte first
12384 +    EmitByte((Value >> Shift) & 0xff, OS);
12385 +  }
12386 +}
12387 +
12388 +unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const {
12389 +  return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT;
12390 +}
12391 +
12392 +unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
12393 +  return MRI.getEncodingValue(RegNo) & HW_REG_MASK;
12394 +}
12395 +
12396 +uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
12397 +                                              const MCOperand &MO,
12398 +                                        SmallVectorImpl<MCFixup> &Fixup) const {
12399 +  if (MO.isReg()) {
12400 +    if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) {
12401 +      return MRI.getEncodingValue(MO.getReg());
12402 +    } else {
12403 +      return getHWReg(MO.getReg());
12404 +    }
12405 +  } else if (MO.isImm()) {
12406 +    return MO.getImm();
12407 +  } else {
12408 +    assert(0);
12409 +    return 0;
12410 +  }
12411 +}
12412 +
12413 +//===----------------------------------------------------------------------===//
12414 +// Encoding helper functions
12415 +//===----------------------------------------------------------------------===//
12416 +
12417 +bool R600MCCodeEmitter::isFCOp(unsigned opcode) const {
12418 +  switch(opcode) {
12419 +  default: return false;
12420 +  case AMDGPU::PREDICATED_BREAK:
12421 +  case AMDGPU::CONTINUE:
12422 +  case AMDGPU::IF_PREDICATE_SET:
12423 +  case AMDGPU::ELSE:
12424 +  case AMDGPU::ENDIF:
12425 +  case AMDGPU::ENDLOOP:
12426 +  case AMDGPU::WHILELOOP:
12427 +    return true;
12428 +  }
12429 +}
12430 +
12431 +bool R600MCCodeEmitter::isTexOp(unsigned opcode) const {
12432 +  switch(opcode) {
12433 +  default: return false;
12434 +  case AMDGPU::TEX_LD:
12435 +  case AMDGPU::TEX_GET_TEXTURE_RESINFO:
12436 +  case AMDGPU::TEX_SAMPLE:
12437 +  case AMDGPU::TEX_SAMPLE_C:
12438 +  case AMDGPU::TEX_SAMPLE_L:
12439 +  case AMDGPU::TEX_SAMPLE_C_L:
12440 +  case AMDGPU::TEX_SAMPLE_LB:
12441 +  case AMDGPU::TEX_SAMPLE_C_LB:
12442 +  case AMDGPU::TEX_SAMPLE_G:
12443 +  case AMDGPU::TEX_SAMPLE_C_G:
12444 +  case AMDGPU::TEX_GET_GRADIENTS_H:
12445 +  case AMDGPU::TEX_GET_GRADIENTS_V:
12446 +  case AMDGPU::TEX_SET_GRADIENTS_H:
12447 +  case AMDGPU::TEX_SET_GRADIENTS_V:
12448 +    return true;
12449 +  }
12450 +}
12451 +
12452 +bool R600MCCodeEmitter::isFlagSet(const MCInst &MI, unsigned Operand,
12453 +                                  unsigned Flag) const {
12454 +  const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode());
12455 +  unsigned FlagIndex = GET_FLAG_OPERAND_IDX(MCDesc.TSFlags);
12456 +  if (FlagIndex == 0) {
12457 +    return false;
12458 +  }
12459 +  assert(MI.getOperand(FlagIndex).isImm());
12460 +  return !!((MI.getOperand(FlagIndex).getImm() >>
12461 +            (NUM_MO_FLAGS * Operand)) & Flag);
12462 +}
12463 +
12464 +#include "AMDGPUGenMCCodeEmitter.inc"
12465 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp llvm-r600/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
12466 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp       1970-01-01 01:00:00.000000000 +0100
12467 +++ llvm-r600/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp  2013-01-25 19:43:57.460049721 +0100
12468 @@ -0,0 +1,298 @@
12469 +//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===//
12470 +//
12471 +//                     The LLVM Compiler Infrastructure
12472 +//
12473 +// This file is distributed under the University of Illinois Open Source
12474 +// License. See LICENSE.TXT for details.
12475 +//
12476 +//===----------------------------------------------------------------------===//
12477 +//
12478 +/// \file
12479 +/// \brief The SI code emitter produces machine code that can be executed
12480 +/// directly on the GPU device.
12481 +//
12482 +//===----------------------------------------------------------------------===//
12483 +
12484 +#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
12485 +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
12486 +#include "llvm/MC/MCCodeEmitter.h"
12487 +#include "llvm/MC/MCContext.h"
12488 +#include "llvm/MC/MCInst.h"
12489 +#include "llvm/MC/MCInstrInfo.h"
12490 +#include "llvm/MC/MCRegisterInfo.h"
12491 +#include "llvm/MC/MCSubtargetInfo.h"
12492 +#include "llvm/MC/MCFixup.h"
12493 +#include "llvm/Support/raw_ostream.h"
12494 +
12495 +#define VGPR_BIT(src_idx) (1ULL << (9 * (src_idx) - 1))
12496 +#define SI_INSTR_FLAGS_ENCODING_MASK 0xf
12497 +
12498 +// These must be kept in sync with SIInstructions.td and also the
12499 +// InstrEncodingInfo array in SIInstrInfo.cpp.
12500 +//
12501 +// NOTE: This enum is only used to identify the encoding type within LLVM,
12502 +// the actual encoding type that is part of the instruction format is different
12503 +namespace SIInstrEncodingType {
12504 +  enum Encoding {
12505 +    EXP = 0,
12506 +    LDS = 1,
12507 +    MIMG = 2,
12508 +    MTBUF = 3,
12509 +    MUBUF = 4,
12510 +    SMRD = 5,
12511 +    SOP1 = 6,
12512 +    SOP2 = 7,
12513 +    SOPC = 8,
12514 +    SOPK = 9,
12515 +    SOPP = 10,
12516 +    VINTRP = 11,
12517 +    VOP1 = 12,
12518 +    VOP2 = 13,
12519 +    VOP3 = 14,
12520 +    VOPC = 15
12521 +  };
12522 +}
12523 +
12524 +using namespace llvm;
12525 +
12526 +namespace {
12527 +class SIMCCodeEmitter : public  AMDGPUMCCodeEmitter {
12528 +  SIMCCodeEmitter(const SIMCCodeEmitter &); // DO NOT IMPLEMENT
12529 +  void operator=(const SIMCCodeEmitter &); // DO NOT IMPLEMENT
12530 +  const MCInstrInfo &MCII;
12531 +  const MCRegisterInfo &MRI;
12532 +  const MCSubtargetInfo &STI;
12533 +  MCContext &Ctx;
12534 +
12535 +public:
12536 +  SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
12537 +                  const MCSubtargetInfo &sti, MCContext &ctx)
12538 +    : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { }
12539 +
12540 +  ~SIMCCodeEmitter() { }
12541 +
12542 +  /// \brief Encode the instruction and write it to the OS.
12543 +  virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
12544 +                         SmallVectorImpl<MCFixup> &Fixups) const;
12545 +
12546 +  /// \returns the encoding for an MCOperand.
12547 +  virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
12548 +                                     SmallVectorImpl<MCFixup> &Fixups) const;
12549 +
12550 +public:
12551 +
12552 +  /// \brief Encode a sequence of registers with the correct alignment.
12553 +  unsigned GPRAlign(const MCInst &MI, unsigned OpNo, unsigned shift) const;
12554 +
12555 +  /// \brief Encoding for when 2 consecutive registers are used
12556 +  virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo,
12557 +                                   SmallVectorImpl<MCFixup> &Fixup) const;
12558 +
12559 +  /// \brief Encoding for when 4 consecutive registers are used
12560 +  virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo,
12561 +                                   SmallVectorImpl<MCFixup> &Fixup) const;
12562 +
12563 +  /// \brief Encoding for SMRD indexed loads
12564 +  virtual uint32_t SMRDmemriEncode(const MCInst &MI, unsigned OpNo,
12565 +                                   SmallVectorImpl<MCFixup> &Fixup) const;
12566 +
12567 +  /// \brief Post-Encoder method for VOP instructions
12568 +  virtual uint64_t VOPPostEncode(const MCInst &MI, uint64_t Value) const;
12569 +
12570 +private:
12571 +
12572 +  /// \returns the SIInstrEncodingType of this instruction.
12573 +  unsigned getEncodingType(const MCInst &MI) const;
12574 +
12575 +  /// \brief Get the size in bytes of this instruction's encoding.
12576 +  unsigned getEncodingBytes(const MCInst &MI) const;
12577 +
12578 +  /// \returns the hardware encoding for a register
12579 +  unsigned getRegBinaryCode(unsigned reg) const;
12580 +
12581 +  /// \brief Generated function that returns the hardware encoding for
12582 +  /// a register
12583 +  unsigned getHWRegNum(unsigned reg) const;
12584 +
12585 +};
12586 +
12587 +} // End anonymous namespace
12588 +
12589 +MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
12590 +                                           const MCRegisterInfo &MRI,
12591 +                                           const MCSubtargetInfo &STI,
12592 +                                           MCContext &Ctx) {
12593 +  return new SIMCCodeEmitter(MCII, MRI, STI, Ctx);
12594 +}
12595 +
12596 +void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
12597 +                                       SmallVectorImpl<MCFixup> &Fixups) const {
12598 +  uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups);
12599 +  unsigned bytes = getEncodingBytes(MI);
12600 +  for (unsigned i = 0; i < bytes; i++) {
12601 +    OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff));
12602 +  }
12603 +}
12604 +
12605 +uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
12606 +                                            const MCOperand &MO,
12607 +                                       SmallVectorImpl<MCFixup> &Fixups) const {
12608 +  if (MO.isReg()) {
12609 +    return getRegBinaryCode(MO.getReg());
12610 +  } else if (MO.isImm()) {
12611 +    return MO.getImm();
12612 +  } else if (MO.isFPImm()) {
12613 +    // XXX: Not all instructions can use inline literals
12614 +    // XXX: We should make sure this is a 32-bit constant
12615 +    union {
12616 +      float F;
12617 +      uint32_t I;
12618 +    } Imm;
12619 +    Imm.F = MO.getFPImm();
12620 +    return Imm.I;
12621 +  } else if (MO.isExpr()) {
12622 +    const MCExpr *Expr = MO.getExpr();
12623 +    MCFixupKind Kind = MCFixupKind(FK_PCRel_4);
12624 +    Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
12625 +    return 0;
12626 +  } else{
12627 +    llvm_unreachable("Encoding of this operand type is not supported yet.");
12628 +  }
12629 +  return 0;
12630 +}
12631 +
12632 +//===----------------------------------------------------------------------===//
12633 +// Custom Operand Encodings
12634 +//===----------------------------------------------------------------------===//
12635 +
12636 +unsigned SIMCCodeEmitter::GPRAlign(const MCInst &MI, unsigned OpNo,
12637 +                                   unsigned shift) const {
12638 +  // Hardware encoding of operand OpNo's register, shifted right by 'shift'.
12639 +  unsigned regCode = getRegBinaryCode(MI.getOperand(OpNo).getReg());
12640 +  return regCode >> shift;
12641 +}
12642 +unsigned SIMCCodeEmitter::GPR2AlignEncode(const MCInst &MI,
12643 +                                          unsigned OpNo ,
12644 +                                        SmallVectorImpl<MCFixup> &Fixup) const {
12645 +  return GPRAlign(MI, OpNo, 1);
12646 +}
12647 +
12648 +unsigned SIMCCodeEmitter::GPR4AlignEncode(const MCInst &MI,
12649 +                                          unsigned OpNo,
12650 +                                        SmallVectorImpl<MCFixup> &Fixup) const {
12651 +  return GPRAlign(MI, OpNo, 2);
12652 +}
12653 +
12654 +#define SMRD_OFFSET_MASK 0xff
12655 +#define SMRD_IMM_SHIFT 8
12656 +#define SMRD_SBASE_MASK 0x3f
12657 +#define SMRD_SBASE_SHIFT 9
12658 +/// This function is responsible for encoding the offset
12659 +/// and the base ptr for SMRD instructions. It should return a bit string in
12660 +/// this format:
12661 +///
12662 +/// OFFSET = bits{7-0}
12663 +/// IMM    = bits{8}
12664 +/// SBASE  = bits{14-9}
12665 +///
12666 +uint32_t SIMCCodeEmitter::SMRDmemriEncode(const MCInst &MI, unsigned OpNo,
12667 +                                        SmallVectorImpl<MCFixup> &Fixup) const {
12668 +  uint32_t Encoding;
12669 +
12670 +  const MCOperand &OffsetOp = MI.getOperand(OpNo + 1);
12671 +
12672 +  //XXX: Use this function for SMRD loads with register offsets
12673 +  assert(OffsetOp.isImm());
12674 +
12675 +  Encoding =
12676 +      (getMachineOpValue(MI, OffsetOp, Fixup) & SMRD_OFFSET_MASK)
12677 +    | (1 << SMRD_IMM_SHIFT) //XXX If the Offset is a register we shouldn't set this bit
12678 +    | ((GPR2AlignEncode(MI, OpNo, Fixup) & SMRD_SBASE_MASK) << SMRD_SBASE_SHIFT)
12679 +    ;
12680 +
12681 +  return Encoding;
12682 +}
12683 +
12684 +//===----------------------------------------------------------------------===//
12685 +// Post Encoder Callbacks
12686 +//===----------------------------------------------------------------------===//
12687 +
12688 +uint64_t SIMCCodeEmitter::VOPPostEncode(const MCInst &MI, uint64_t Value) const{ // returns Value with extra bits ORed in
12689 +  unsigned encodingType = getEncodingType(MI);
12690 +  unsigned numSrcOps; // number of source operands inspected below
12691 +  unsigned vgprBitOffset; // shift applied to VGPR_BIT(...) before ORing into Value
12692 +
12693 +  if (encodingType == SIInstrEncodingType::VOP3) {
12694 +    numSrcOps = 3;
12695 +    vgprBitOffset = 32;
12696 +  } else {
12697 +    numSrcOps = 1;
12698 +    vgprBitOffset = 0;
12699 +  }
12700 +
12701 +  // Sources are operands 1..numSrcOps; operand 0 is the destination register.
12702 +  for (unsigned opIdx = 1; opIdx < numSrcOps + 1; opIdx++) {
12703 +    const MCOperand &MO = MI.getOperand(opIdx); // NOTE(review): no check against the operand count - confirm callers guarantee it
12704 +    if (MO.isReg()) {
12705 +      unsigned reg = MI.getOperand(opIdx).getReg();
12706 +      if (AMDGPUMCRegisterClasses[AMDGPU::VReg_32RegClassID].contains(reg) ||
12707 +          AMDGPUMCRegisterClasses[AMDGPU::VReg_64RegClassID].contains(reg)) {
12708 +        Value |= (VGPR_BIT(opIdx)) << vgprBitOffset; // flag this source as a VReg_32/VReg_64 register
12709 +      }
12710 +    } else if (MO.isFPImm()) {
12711 +      union {
12712 +        float f;
12713 +        uint32_t i;
12714 +      } Imm;
12715 +      // XXX: Not all instructions can use inline literals
12716 +      // XXX: We should make sure this is a 32-bit constant
12717 +      Imm.f = MO.getFPImm();
12718 +      Value |= ((uint64_t)Imm.i) << 32; // float bit pattern goes into bits 63-32
12719 +    }
12720 +  }
12721 +  return Value;
12722 +}
12723 +
12724 +//===----------------------------------------------------------------------===//
12725 +// Encoding helper functions
12726 +//===----------------------------------------------------------------------===//
12727 +
12728 +unsigned SIMCCodeEmitter::getEncodingType(const MCInst &MI) const {
12729 +  return MCII.get(MI.getOpcode()).TSFlags & SI_INSTR_FLAGS_ENCODING_MASK; // encoding-type field of the opcode's TSFlags
12730 +}
12731 +
12732 +unsigned SIMCCodeEmitter::getEncodingBytes(const MCInst &MI) const {
12733 +  // Returns the encoded size of MI in bytes (4 or 8).
12734 +  // The literal-load opcodes below have no encoding type of their own, so
12735 +  // their size is specified manually before the encoding type is consulted.
12736 +  switch (MI.getOpcode()) {
12737 +  default: break;
12738 +  case AMDGPU::SI_LOAD_LITERAL_I32:
12739 +  case AMDGPU::SI_LOAD_LITERAL_F32:
12740 +    return 4;
12741 +  }
12742 +
12743 +  unsigned encoding_type = getEncodingType(MI);
12744 +  switch (encoding_type) {
12745 +    case SIInstrEncodingType::EXP:
12746 +    case SIInstrEncodingType::LDS:
12747 +    case SIInstrEncodingType::MUBUF:
12748 +    case SIInstrEncodingType::MTBUF:
12749 +    case SIInstrEncodingType::MIMG:
12750 +    case SIInstrEncodingType::VOP3:
12751 +      return 8; // these six encoding types are 8 bytes
12752 +    default:
12753 +      return 4; // every other encoding type is 4 bytes
12754 +  }
12755 +}
12756 +
12757 +
12758 +unsigned SIMCCodeEmitter::getRegBinaryCode(unsigned reg) const {
12759 +  switch (reg) { // three registers have fixed codes; all others come from MRI
12760 +    case AMDGPU::M0: return 124;
12761 +    case AMDGPU::SREG_LIT_0: return 128;
12762 +    case AMDGPU::SI_LITERAL_CONSTANT: return 255;
12763 +    default: return MRI.getEncodingValue(reg);
12764 +  }
12765 +}
12766 +
12767 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/Processors.td llvm-r600/lib/Target/R600/Processors.td
12768 --- llvm-3.2.src/lib/Target/R600/Processors.td  1970-01-01 01:00:00.000000000 +0100
12769 +++ llvm-r600/lib/Target/R600/Processors.td     2013-01-25 19:43:57.460049721 +0100
12770 @@ -0,0 +1,29 @@
12771 +//===-- Processors.td - R600 processor definitions ---------===//
12772 +//
12773 +//                     The LLVM Compiler Infrastructure
12774 +//
12775 +// This file is distributed under the University of Illinois Open Source
12776 +// License. See LICENSE.TXT for details.
12777 +//
12778 +//===----------------------------------------------------------------------===//
12779 +//
12780 +// Processors supported by the R600 backend.
12781 +//
12782 +//===----------------------------------------------------------------------===//
12783 +
12784 +class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features> // forwards all three arguments to Processor
12785 +: Processor<Name, itin, Features>;
12786 +def : Proc<"r600",       R600_EG_Itin, [FeatureR600ALUInst]>;
12787 +def : Proc<"rv710",      R600_EG_Itin, []>;
12788 +def : Proc<"rv730",      R600_EG_Itin, []>;
12789 +def : Proc<"rv770",      R600_EG_Itin, [FeatureFP64]>;
12790 +def : Proc<"cedar",      R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
12791 +def : Proc<"redwood",    R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
12792 +def : Proc<"juniper",    R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
12793 +def : Proc<"cypress",    R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
12794 +def : Proc<"barts",      R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
12795 +def : Proc<"turks",      R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
12796 +def : Proc<"caicos",     R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
12797 +def : Proc<"cayman",     R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
12798 +def : Proc<"SI", SI_Itin, [Feature64BitPtr]>; // the only entry that uses SI_Itin
12799 +
12800 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600Defines.h llvm-r600/lib/Target/R600/R600Defines.h
12801 --- llvm-3.2.src/lib/Target/R600/R600Defines.h  1970-01-01 01:00:00.000000000 +0100
12802 +++ llvm-r600/lib/Target/R600/R600Defines.h     2013-01-25 19:43:57.460049721 +0100
12803 @@ -0,0 +1,94 @@
12804 +//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===//
12805 +//
12806 +//                     The LLVM Compiler Infrastructure
12807 +//
12808 +// This file is distributed under the University of Illinois Open Source
12809 +// License. See LICENSE.TXT for details.
12810 +//
12811 +/// \file
12812 +//===----------------------------------------------------------------------===//
12813 +
12814 +#ifndef R600DEFINES_H_
12815 +#define R600DEFINES_H_
12816 +
12817 +#include "llvm/MC/MCRegisterInfo.h"
12818 +
12819 +// Operand flags: one bit each (bits 0-6); NUM_MO_FLAGS is the number of flags.
12820 +#define MO_FLAG_CLAMP (1 << 0)
12821 +#define MO_FLAG_NEG   (1 << 1)
12822 +#define MO_FLAG_ABS   (1 << 2)
12823 +#define MO_FLAG_MASK  (1 << 3)
12824 +#define MO_FLAG_PUSH  (1 << 4)
12825 +#define MO_FLAG_NOT_LAST  (1 << 5)
12826 +#define MO_FLAG_LAST  (1 << 6)
12827 +#define NUM_MO_FLAGS 7
12828 +
12829 +/// \brief Extract the operand index of the instruction flags operand, which
12830 +/// is the 2-bit field stored in bits 7-8 of \p Flags.
12831 +#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3)
12832 +
12833 +namespace R600_InstFlag {
12834 +  enum TIF { // single-bit instruction flags
12835 +    TRANS_ONLY = (1 << 0),
12836 +    TEX = (1 << 1),
12837 +    REDUCTION = (1 << 2),
12838 +    FC = (1 << 3),
12839 +    TRIG = (1 << 4),
12840 +    OP3 = (1 << 5),
12841 +    VECTOR = (1 << 6),
12842 +    // Bits 7 and 8 hold the flag operand index (see GET_FLAG_OPERAND_IDX).
12843 +    NATIVE_OPERANDS = (1 << 9),
12844 +    OP1 = (1 << 10),
12845 +    OP2 = (1 << 11)
12846 +  };
12847 +}
12848 +
12849 +#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS) // non-zero when the NATIVE_OPERANDS bit is set
12850 +
12851 +/// \brief Defines for extracting register information from a register encoding
12852 +#define HW_REG_MASK 0x1ff
12853 +#define HW_CHAN_SHIFT 9
12854 +
12855 +namespace R600Operands {
12856 +  enum Ops { // column indices into ALUOpTable
12857 +    DST,
12858 +    UPDATE_EXEC_MASK,
12859 +    UPDATE_PREDICATE,
12860 +    WRITE,
12861 +    OMOD,
12862 +    DST_REL,
12863 +    CLAMP,
12864 +    SRC0,
12865 +    SRC0_NEG,
12866 +    SRC0_REL,
12867 +    SRC0_ABS,
12868 +    SRC0_SEL,
12869 +    SRC1,
12870 +    SRC1_NEG,
12871 +    SRC1_REL,
12872 +    SRC1_ABS,
12873 +    SRC1_SEL,
12874 +    SRC2,
12875 +    SRC2_NEG,
12876 +    SRC2_REL,
12877 +    SRC2_SEL,
12878 +    LAST,
12879 +    PRED_SEL,
12880 +    IMM,
12881 +    COUNT // number of entries above; width of each ALUOpTable row
12882 + };
12883 +  // Three rows of per-operand values indexed by Ops; -1 marks an Ops entry the row does not have.
12884 +  const static int ALUOpTable[3][R600Operands::COUNT] = {
12885 +//            W        C     S  S  S  S     S  S  S  S     S  S  S
12886 +//            R  O  D  L  S  R  R  R  R  S  R  R  R  R  S  R  R  R  L  P
12887 +//   D  U     I  M  R  A  R  C  C  C  C  R  C  C  C  C  R  C  C  C  A  R  I
12888 +//   S  E  U  T  O  E  M  C  0  0  0  0  C  1  1  1  1  C  2  2  2  S  E  M
12889 +//   T  M  P  E  D  L  P  0  N  R  A  S  1  N  R  A  S  2  N  R  S  T  D  M
12890 +    {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,11,12},
12891 +    {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,13,14,15,16,-1,-1,-1,-1,17,18,19},
12892 +    {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17}
12893 +  };
12894 +
12895 +}
12896 +
12897 +#endif // R600DEFINES_H_
12898 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600ExpandSpecialInstrs.cpp llvm-r600/lib/Target/R600/R600ExpandSpecialInstrs.cpp
12899 --- llvm-3.2.src/lib/Target/R600/R600ExpandSpecialInstrs.cpp    1970-01-01 01:00:00.000000000 +0100
12900 +++ llvm-r600/lib/Target/R600/R600ExpandSpecialInstrs.cpp       2013-01-25 19:43:57.463383054 +0100
12901 @@ -0,0 +1,333 @@
12902 +//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===//
12903 +//
12904 +//                     The LLVM Compiler Infrastructure
12905 +//
12906 +// This file is distributed under the University of Illinois Open Source
12907 +// License. See LICENSE.TXT for details.
12908 +//
12909 +//===----------------------------------------------------------------------===//
12910 +//
12911 +/// \file
12912 +/// Vector, Reduction, and Cube instructions need to fill the entire instruction
12913 +/// group to work correctly.  This pass expands these individual instructions
12914 +/// into several instructions that will completely fill the instruction group.
12915 +//
12916 +//===----------------------------------------------------------------------===//
12917 +
12918 +#include "AMDGPU.h"
12919 +#include "R600Defines.h"
12920 +#include "R600InstrInfo.h"
12921 +#include "R600RegisterInfo.h"
12922 +#include "R600MachineFunctionInfo.h"
12923 +#include "llvm/CodeGen/MachineFunctionPass.h"
12924 +#include "llvm/CodeGen/MachineInstrBuilder.h"
12925 +#include "llvm/CodeGen/MachineRegisterInfo.h"
12926 +
12927 +using namespace llvm;
12928 +
12929 +namespace {
12930 +/// Machine function pass that expands the special R600 instructions in place.
12931 +class R600ExpandSpecialInstrsPass : public MachineFunctionPass {
12932 +
12933 +private:
12934 +  static char ID; // handed to the MachineFunctionPass constructor
12935 +  const R600InstrInfo *TII; // the target machine's instruction info, cast to the R600 type
12936 +
12937 +  bool ExpandInputPerspective(MachineInstr& MI); // true if MI was input_perspective and got expanded
12938 +  bool ExpandInputConstant(MachineInstr& MI); // true if MI was input_constant and got expanded
12939 +
12940 +public:
12941 +  R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
12942 +    TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { }
12943 +
12944 +  virtual bool runOnMachineFunction(MachineFunction &MF);
12945 +
12946 +  const char *getPassName() const {
12947 +    return "R600 Expand special instructions pass";
12948 +  }
12949 +};
12950 +
12951 +} // End anonymous namespace
12952 +
12953 +char R600ExpandSpecialInstrsPass::ID = 0; // storage for the static pass ID member
12954 +
12955 +FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
12956 +  return new R600ExpandSpecialInstrsPass(TM); // heap-allocated; the caller receives the raw pointer
12957 +}
12958 +
12959 +bool R600ExpandSpecialInstrsPass::ExpandInputPerspective(MachineInstr &MI) {
12960 +  const R600RegisterInfo &TRI = TII->getRegisterInfo();
12961 +  if (MI.getOpcode() != AMDGPU::input_perspective)
12962 +    return false;
12963 +
12964 +  MachineBasicBlock::iterator I = &MI;
12965 +  unsigned DstReg = MI.getOperand(0).getReg();
12966 +  R600MachineFunctionInfo *MFI = MI.getParent()->getParent()
12967 +      ->getInfo<R600MachineFunctionInfo>();
12968 +  unsigned IJIndexBase;
12969 +
12970 +  // In Evergreen ISA doc section 8.3.2 :
12971 +  // We need to interpolate XY and ZW in two different instruction groups.
12972 +  // An INTERP_* must occupy all 4 slots of an instruction group.
12973 +  // Output of INTERP_XY is written in X,Y slots
12974 +  // Output of INTERP_ZW is written in Z,W slots
12975 +  //
12976 +  // Thus interpolation requires the following sequences :
12977 +  //
12978 +  // AnyGPR.x = INTERP_ZW; (Write Masked Out)
12979 +  // AnyGPR.y = INTERP_ZW; (Write Masked Out)
12980 +  // DstGPR.z = INTERP_ZW;
12981 +  // DstGPR.w = INTERP_ZW; (End of first IG)
12982 +  // DstGPR.x = INTERP_XY;
12983 +  // DstGPR.y = INTERP_XY;
12984 +  // AnyGPR.z = INTERP_XY; (Write Masked Out)
12985 +  // AnyGPR.w = INTERP_XY; (Write Masked Out) (End of second IG)
12986 +  //
12987 +  switch (MI.getOperand(1).getImm()) {
12988 +  case 0:
12989 +    IJIndexBase = MFI->GetIJPerspectiveIndex();
12990 +    break;
12991 +  case 1:
12992 +    IJIndexBase = MFI->GetIJLinearIndex();
12993 +    break;
12994 +  default:
12995 +    assert(0 && "Unknow ij index");
12996 +  }
12997 +
12998 +  for (unsigned i = 0; i < 8; i++) {
12999 +    unsigned IJIndex = AMDGPU::R600_TReg32RegClass.getRegister(
13000 +        2 * IJIndexBase + ((i + 1) % 2));
13001 +    unsigned ReadReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
13002 +        MI.getOperand(2).getImm());
13003 +
13004 +
13005 +    unsigned Sel = AMDGPU::sel_x;
13006 +    switch (i % 4) {
13007 +    case 0:Sel = AMDGPU::sel_x;break;
13008 +    case 1:Sel = AMDGPU::sel_y;break;
13009 +    case 2:Sel = AMDGPU::sel_z;break;
13010 +    case 3:Sel = AMDGPU::sel_w;break;
13011 +    default:break;
13012 +    }
13013 +
13014 +    unsigned Res = TRI.getSubReg(DstReg, Sel);
13015 +
13016 +    unsigned Opcode = (i < 4)?AMDGPU::INTERP_ZW:AMDGPU::INTERP_XY;
13017 +
13018 +    MachineBasicBlock &MBB = *(MI.getParent());
13019 +    MachineInstr *NewMI =
13020 +        TII->buildDefaultInstruction(MBB, I, Opcode, Res, IJIndex, ReadReg);
13021 +
13022 +    if (!(i> 1 && i < 6)) {
13023 +      TII->addFlag(NewMI, 0, MO_FLAG_MASK);
13024 +    }
13025 +
13026 +    if (i % 4 !=  3)
13027 +      TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
13028 +  }
13029 +
13030 +  MI.eraseFromParent();
13031 +
13032 +  return true;
13033 +}
13034 +
13035 +bool R600ExpandSpecialInstrsPass::ExpandInputConstant(MachineInstr &MI) {
13036 +  const R600RegisterInfo &TRI = TII->getRegisterInfo();
13037 +  if (MI.getOpcode() != AMDGPU::input_constant)
13038 +    return false;
13039 +  // From here on MI is replaced by four INTERP_LOAD_P0 instructions and erased.
13040 +  MachineBasicBlock::iterator I = &MI;
13041 +  unsigned DstReg = MI.getOperand(0).getReg();
13042 +
13043 +  for (unsigned i = 0; i < 4; i++) { // one instruction per destination channel
13044 +    unsigned ReadReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
13045 +        MI.getOperand(1).getImm());
13046 +    // Destination channel for this iteration: x, y, z, then w.
13047 +    unsigned Sel = AMDGPU::sel_x;
13048 +    switch (i % 4) {
13049 +    case 0:Sel = AMDGPU::sel_x;break;
13050 +    case 1:Sel = AMDGPU::sel_y;break;
13051 +    case 2:Sel = AMDGPU::sel_z;break;
13052 +    case 3:Sel = AMDGPU::sel_w;break;
13053 +    default:break;
13054 +    }
13055 +
13056 +    unsigned Res = TRI.getSubReg(DstReg, Sel);
13057 +
13058 +    MachineBasicBlock &MBB = *(MI.getParent());
13059 +    MachineInstr *NewMI = TII->buildDefaultInstruction(
13060 +        MBB, I, AMDGPU::INTERP_LOAD_P0, Res, ReadReg);
13061 +
13062 +    if (i % 4 !=  3) // all but the fourth instruction are flagged NOT_LAST
13063 +      TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
13064 +  }
13065 +
13066 +  MI.eraseFromParent();
13067 +
13068 +  return true;
13069 +}
13070 +
13071 +bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
13072 +  // Walk every instruction of every block and expand the special ones in place.
13073 +  const R600RegisterInfo &TRI = TII->getRegisterInfo();
13074 +
13075 +  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
13076 +                                                  BB != BB_E; ++BB) {
13077 +    MachineBasicBlock &MBB = *BB;
13078 +    MachineBasicBlock::iterator I = MBB.begin();
13079 +    while (I != MBB.end()) {
13080 +      MachineInstr &MI = *I;
13081 +      I = llvm::next(I); // advance first: MI may be erased, and new instructions are inserted before I
13082 +
13083 +      switch (MI.getOpcode()) {
13084 +      default: break;
13085 +      // Expand PRED_X to one of the PRED_SET instructions.
13086 +      case AMDGPU::PRED_X: {
13087 +        uint64_t Flags = MI.getOperand(3).getImm();
13088 +        // The native opcode used by PRED_X is stored as an immediate in the
13089 +        // third operand.
13090 +        MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
13091 +                                            MI.getOperand(2).getImm(), // opcode
13092 +                                            MI.getOperand(0).getReg(), // dst
13093 +                                            MI.getOperand(1).getReg(), // src0
13094 +                                            AMDGPU::ZERO);             // src1
13095 +        TII->addFlag(PredSet, 0, MO_FLAG_MASK);
13096 +        if (Flags & MO_FLAG_PUSH) { // PUSH selects update_exec_mask, otherwise update_predicate
13097 +          TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1);
13098 +        } else {
13099 +          TII->setImmOperand(PredSet, R600Operands::UPDATE_PREDICATE, 1);
13100 +        }
13101 +        MI.eraseFromParent();
13102 +        continue;
13103 +        }
13104 +      case AMDGPU::BREAK: // expands to PRED_SETE_INT followed by PREDICATED_BREAK
13105 +        MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
13106 +                                          AMDGPU::PRED_SETE_INT,
13107 +                                          AMDGPU::PREDICATE_BIT,
13108 +                                          AMDGPU::ZERO,
13109 +                                          AMDGPU::ZERO);
13110 +        TII->addFlag(PredSet, 0, MO_FLAG_MASK);
13111 +        TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1);
13112 +
13113 +        BuildMI(MBB, I, MBB.findDebugLoc(I),
13114 +                TII->get(AMDGPU::PREDICATED_BREAK))
13115 +                .addReg(AMDGPU::PREDICATE_BIT);
13116 +        MI.eraseFromParent();
13117 +        continue;
13118 +    }
13119 +    // Each helper checks the opcode itself and returns true only if it expanded MI.
13120 +    if (ExpandInputPerspective(MI))
13121 +      continue;
13122 +    if (ExpandInputConstant(MI))
13123 +      continue;
13124 +
13125 +      bool IsReduction = TII->isReductionOp(MI.getOpcode());
13126 +      bool IsVector = TII->isVector(MI);
13127 +      bool IsCube = TII->isCubeOp(MI.getOpcode());
13128 +      if (!IsReduction && !IsVector && !IsCube) {
13129 +        continue;
13130 +      }
13131 +
13132 +      // Expand the instruction
13133 +      //
13134 +      // Reduction instructions:
13135 +      // T0_X = DP4 T1_XYZW, T2_XYZW
13136 +      // becomes:
13137 +      // T0_X = DP4 T1_X, T2_X
13138 +      // T0_Y (write masked) = DP4 T1_Y, T2_Y
13139 +      // T0_Z (write masked) = DP4 T1_Z, T2_Z
13140 +      // T0_W (write masked) = DP4 T1_W, T2_W
13141 +      //
13142 +      // Vector instructions:
13143 +      // T0_X = MULLO_INT T1_X, T2_X
13144 +      // becomes:
13145 +      // T0_X = MULLO_INT T1_X, T2_X
13146 +      // T0_Y (write masked) = MULLO_INT T1_X, T2_X
13147 +      // T0_Z (write masked) = MULLO_INT T1_X, T2_X
13148 +      // T0_W (write masked) = MULLO_INT T1_X, T2_X
13149 +      //
13150 +      // Cube instructions:
13151 +      // T0_XYZW = CUBE T1_XYZW
13152 +      // becomes:
13153 +      // T0_X = CUBE T1_Z, T1_Y
13154 +      // T0_Y = CUBE T1_Z, T1_X
13155 +      // T0_Z = CUBE T1_X, T1_Z
13156 +      // T0_W = CUBE T1_Y, T1_Z
13157 +      for (unsigned Chan = 0; Chan < 4; Chan++) {
13158 +        unsigned DstReg = MI.getOperand(
13159 +                            TII->getOperandIdx(MI, R600Operands::DST)).getReg();
13160 +        unsigned Src0 = MI.getOperand(
13161 +                           TII->getOperandIdx(MI, R600Operands::SRC0)).getReg();
13162 +        unsigned Src1 = 0;
13163 +
13164 +        // Determine the correct source registers
13165 +        if (!IsCube) {
13166 +          int Src1Idx = TII->getOperandIdx(MI, R600Operands::SRC1);
13167 +          if (Src1Idx != -1) {
13168 +            Src1 = MI.getOperand(Src1Idx).getReg();
13169 +          }
13170 +        }
13171 +        if (IsReduction) {
13172 +          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
13173 +          Src0 = TRI.getSubReg(Src0, SubRegIndex);
13174 +          Src1 = TRI.getSubReg(Src1, SubRegIndex);
13175 +        } else if (IsCube) {
13176 +          static const int CubeSrcSwz[] = {2, 2, 0, 1}; // src0 channel per Chan; src1 reads the table reversed
13177 +          unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]);
13178 +          unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
13179 +          Src1 = TRI.getSubReg(Src0, SubRegIndex1);
13180 +          Src0 = TRI.getSubReg(Src0, SubRegIndex0);
13181 +        }
13182 +
13183 +        // Determine the correct destination registers;
13184 +        bool Mask = false;
13185 +        bool NotLast = true;
13186 +        if (IsCube) {
13187 +          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
13188 +          DstReg = TRI.getSubReg(DstReg, SubRegIndex);
13189 +        } else {
13190 +          // Mask the write if the original instruction does not write to
13191 +          // the current Channel.
13192 +          Mask = (Chan != TRI.getHWRegChan(DstReg));
13193 +          unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
13194 +          DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
13195 +        }
13196 +
13197 +        // Set the IsLast bit
13198 +        NotLast = (Chan != 3 );
13199 +
13200 +        // Add the new instruction
13201 +        unsigned Opcode = MI.getOpcode();
13202 +        switch (Opcode) { // pseudo opcodes map to their *_real counterparts; others are reused as-is
13203 +        case AMDGPU::CUBE_r600_pseudo:
13204 +          Opcode = AMDGPU::CUBE_r600_real;
13205 +          break;
13206 +        case AMDGPU::CUBE_eg_pseudo:
13207 +          Opcode = AMDGPU::CUBE_eg_real;
13208 +          break;
13209 +        case AMDGPU::DOT4_r600_pseudo:
13210 +          Opcode = AMDGPU::DOT4_r600_real;
13211 +          break;
13212 +        case AMDGPU::DOT4_eg_pseudo:
13213 +          Opcode = AMDGPU::DOT4_eg_real;
13214 +          break;
13215 +        default:
13216 +          break;
13217 +        }
13218 +
13219 +        MachineInstr *NewMI =
13220 +          TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1);
13221 +
13222 +        NewMI->setIsInsideBundle(Chan != 0); // every channel after the first is marked inside the bundle
13223 +        if (Mask) {
13224 +          TII->addFlag(NewMI, 0, MO_FLAG_MASK);
13225 +        }
13226 +        if (NotLast) {
13227 +          TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
13228 +        }
13229 +      }
13230 +      MI.eraseFromParent();
13231 +    }
13232 +  }
13233 +  return false; // NOTE(review): always reports "not modified" even after expansions - confirm this is intended
13234 +}
13235 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600InstrInfo.cpp llvm-r600/lib/Target/R600/R600InstrInfo.cpp
13236 --- llvm-3.2.src/lib/Target/R600/R600InstrInfo.cpp      1970-01-01 01:00:00.000000000 +0100
13237 +++ llvm-r600/lib/Target/R600/R600InstrInfo.cpp 2013-01-25 19:43:57.466716387 +0100
13238 @@ -0,0 +1,655 @@
13239 +//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===//
13240 +//
13241 +//                     The LLVM Compiler Infrastructure
13242 +//
13243 +// This file is distributed under the University of Illinois Open Source
13244 +// License. See LICENSE.TXT for details.
13245 +//
13246 +//===----------------------------------------------------------------------===//
13247 +//
13248 +/// \file
13249 +/// \brief R600 Implementation of TargetInstrInfo.
13250 +//
13251 +//===----------------------------------------------------------------------===//
13252 +
13253 +#include "R600InstrInfo.h"
13254 +#include "AMDGPUTargetMachine.h"
13255 +#include "AMDGPUSubtarget.h"
13256 +#include "R600Defines.h"
13257 +#include "R600RegisterInfo.h"
13258 +#include "llvm/CodeGen/MachineInstrBuilder.h"
13259 +
13260 +#define GET_INSTRINFO_CTOR
13261 +#include "AMDGPUGenDFAPacketizer.inc"
13262 +
13263 +using namespace llvm;
13264 +
13265 +R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm)
13266 +  : AMDGPUInstrInfo(tm),
13267 +    RI(tm, *this) // the register info member is constructed with a reference back to this object
13268 +  { }
13269 +
13270 +const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const {
13271 +  return RI; // the register info member owned by this object
13272 +}
13273 +
13274 +bool R600InstrInfo::isTrig(const MachineInstr &MI) const {
13275 +  return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG; // true when the TRIG flag is set for MI's opcode
13276 +}
13277 +
13278 +bool R600InstrInfo::isVector(const MachineInstr &MI) const {
13279 +  return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; // true when the VECTOR flag is set for MI's opcode
13280 +}
13281 +
13282 +void
13283 +R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
13284 +                           MachineBasicBlock::iterator MI, DebugLoc DL,
13285 +                           unsigned DestReg, unsigned SrcReg,
13286 +                           bool KillSrc) const {
13287 +  if (AMDGPU::R600_Reg128RegClass.contains(DestReg)
13288 +      && AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
13289 +    for (unsigned I = 0; I < 4; I++) { // 128-bit copy: one MOV per channel sub-register
13290 +      unsigned SubRegIndex = RI.getSubRegFromChannel(I);
13291 +      buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
13292 +                              RI.getSubReg(DestReg, SubRegIndex),
13293 +                              RI.getSubReg(SrcReg, SubRegIndex))
13294 +                              .addReg(DestReg,
13295 +                                      RegState::Define | RegState::Implicit); // NOTE(review): KillSrc is not applied on this path
13296 +    }
13297 +  } else {
13298 +
13299 +    // A 128-bit register paired with a non-128-bit one cannot be copied.
13300 +    assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg)
13301 +           && !AMDGPU::R600_Reg128RegClass.contains(SrcReg));
13302 +
13303 +    MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
13304 +                                                  DestReg, SrcReg);
13305 +    NewMI->getOperand(getOperandIdx(*NewMI, R600Operands::SRC0))
13306 +                                    .setIsKill(KillSrc);
13307 +  }
13308 +}
13309 +
13310 +MachineInstr * R600InstrInfo::getMovImmInstr(MachineFunction *MF,
13311 +                                             unsigned DstReg, int64_t Imm) const {
13312 +  MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::MOV), DebugLoc()); // created here, not inserted into any block
13313 +  MachineInstrBuilder(MI).addReg(DstReg, RegState::Define);
13314 +  MachineInstrBuilder(MI).addReg(AMDGPU::ALU_LITERAL_X);
13315 +  MachineInstrBuilder(MI).addImm(Imm);
13316 +  MachineInstrBuilder(MI).addReg(0); // PREDICATE_BIT
13317 +
13318 +  return MI;
13319 +}
13320 +
13321 +unsigned R600InstrInfo::getIEQOpcode() const {
13322 +  return AMDGPU::SETE_INT; // opcode returned for integer equality
13323 +}
13324 +
13325 +bool R600InstrInfo::isMov(unsigned Opcode) const {
13326 +  // True for MOV and its two immediate forms; false for every other opcode.
13327 +
13328 +  switch(Opcode) {
13329 +  default: return false;
13330 +  case AMDGPU::MOV:
13331 +  case AMDGPU::MOV_IMM_F32:
13332 +  case AMDGPU::MOV_IMM_I32:
13333 +    return true;
13334 +  }
13335 +}
13336 +
13337 +// Some instructions act as placeholders that emulate operations the GPU
13338 +// hardware performs automatically. This function reports whether an opcode
13339 +// falls into that category (currently RETURN and RESERVE_REG).
13340 +bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const {
13341 +  switch (Opcode) {
13342 +  default: return false;
13343 +  case AMDGPU::RETURN:
13344 +  case AMDGPU::RESERVE_REG:
13345 +    return true;
13346 +  }
13347 +}
13348 +
13349 +bool R600InstrInfo::isReductionOp(unsigned Opcode) const {
13350 +  switch(Opcode) { // only the two DOT4 pseudo opcodes count as reductions
13351 +    default: return false;
13352 +    case AMDGPU::DOT4_r600_pseudo:
13353 +    case AMDGPU::DOT4_eg_pseudo:
13354 +      return true;
13355 +  }
13356 +}
13357 +
13358 +bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
13359 +  switch(Opcode) { // matches both the pseudo and the real CUBE opcodes
13360 +    default: return false;
13361 +    case AMDGPU::CUBE_r600_pseudo:
13362 +    case AMDGPU::CUBE_r600_real:
13363 +    case AMDGPU::CUBE_eg_pseudo:
13364 +    case AMDGPU::CUBE_eg_real:
13365 +      return true;
13366 +  }
13367 +}
13368 +
13369 +bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
13370 +  unsigned TargetFlags = get(Opcode).TSFlags;
13371 +  // An opcode is an ALU instruction if any of the OP1, OP2 or OP3 flags is set.
13372 +  return ((TargetFlags & R600_InstFlag::OP1) |
13373 +          (TargetFlags & R600_InstFlag::OP2) |
13374 +          (TargetFlags & R600_InstFlag::OP3));
13375 +}
13376 +
13377 +DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM,
13378 +    const ScheduleDAG *DAG) const { // DAG is not used
13379 +  const InstrItineraryData *II = TM->getInstrItineraryData();
13380 +  return TM->getSubtarget<AMDGPUSubtarget>().createDFAPacketizer(II); // built by the subtarget from TM's itinerary data
13381 +}
13382 +
13383 +static bool
13384 +isPredicateSetter(unsigned Opcode) { // PRED_X is the only opcode treated as a predicate setter
13385 +  switch (Opcode) {
13386 +  case AMDGPU::PRED_X:
13387 +    return true;
13388 +  default:
13389 +    return false;
13390 +  }
13391 +}
13392 +
13393 +static MachineInstr *
13394 +findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
13395 +                             MachineBasicBlock::iterator I) {
13396 +  while (I != MBB.begin()) { // scan backwards from just before I to the start of the block
13397 +    --I;
13398 +    MachineInstr *MI = I;
13399 +    if (isPredicateSetter(MI->getOpcode()))
13400 +      return MI;
13401 +  }
13402 +
13403 +  return NULL; // no predicate setter precedes I in this block
13404 +}
13405 +
13406 +bool
13407 +R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
13408 +                             MachineBasicBlock *&TBB,
13409 +                             MachineBasicBlock *&FBB,
13410 +                             SmallVectorImpl<MachineOperand> &Cond,
13411 +                             bool AllowModify) const {
13412 +  // Most of the following comes from the ARM implementation of AnalyzeBranch.
13413 +  // Returns false when the block's branches were understood, true otherwise.
13414 +  // If the block has no terminators, it just falls into the block after it.
13415 +  MachineBasicBlock::iterator I = MBB.end();
13416 +  if (I == MBB.begin())
13417 +    return false;
13418 +  --I;
13419 +  while (I->isDebugValue()) { // skip trailing debug values
13420 +    if (I == MBB.begin())
13421 +      return false;
13422 +    --I;
13423 +  }
13424 +  if (static_cast<MachineInstr *>(I)->getOpcode() != AMDGPU::JUMP) {
13425 +    return false;
13426 +  }
13427 +
13428 +  // Get the last instruction in the block.
13429 +  MachineInstr *LastInst = I;
13430 +
13431 +  // If there is only one terminator instruction, process it.
13432 +  unsigned LastOpc = LastInst->getOpcode();
13433 +  if (I == MBB.begin() ||
13434 +      static_cast<MachineInstr *>(--I)->getOpcode() != AMDGPU::JUMP) {
13435 +    if (LastOpc == AMDGPU::JUMP) {
13436 +      if(!isPredicated(LastInst)) {
13437 +        TBB = LastInst->getOperand(0).getMBB();
13438 +        return false;
13439 +      } else {
13440 +        MachineInstr *predSet = I; // NOTE(review): I is still LastInst when the begin() test above short-circuited
13441 +        while (!isPredicateSetter(predSet->getOpcode())) { // NOTE(review): no MBB.begin() guard; relies on an earlier predicate setter
13442 +          predSet = --I;
13443 +        }
13444 +        TBB = LastInst->getOperand(0).getMBB();
13445 +        Cond.push_back(predSet->getOperand(1)); // Cond[0]: operand 1 of the predicate setter
13446 +        Cond.push_back(predSet->getOperand(2)); // Cond[1]: operand 2 of the predicate setter
13447 +        Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); // Cond[2]: PRED_SEL_ONE register
13448 +        return false;
13449 +      }
13450 +    }
13451 +    return true;  // Can't handle indirect branch.
13452 +  }
13453 +
13454 +  // Get the instruction before it if it is a terminator.
13455 +  MachineInstr *SecondLastInst = I;
13456 +  unsigned SecondLastOpc = SecondLastInst->getOpcode();
13457 +
13458 +  // If the block ends with a predicated JUMP followed by an unpredicated JUMP, handle it.
13459 +  if (SecondLastOpc == AMDGPU::JUMP &&
13460 +      isPredicated(SecondLastInst) &&
13461 +      LastOpc == AMDGPU::JUMP &&
13462 +      !isPredicated(LastInst)) {
13463 +    MachineInstr *predSet = --I;
13464 +    while (!isPredicateSetter(predSet->getOpcode())) { // NOTE(review): no MBB.begin() guard on this scan either
13465 +      predSet = --I;
13466 +    }
13467 +    TBB = SecondLastInst->getOperand(0).getMBB();
13468 +    FBB = LastInst->getOperand(0).getMBB();
13469 +    Cond.push_back(predSet->getOperand(1));
13470 +    Cond.push_back(predSet->getOperand(2));
13471 +    Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
13472 +    return false;
13473 +  }
13474 +
13475 +  // Otherwise, can't handle this.
13476 +  return true;
13477 +}
13478 +
13479 +int R600InstrInfo::getBranchInstr(const MachineOperand &op) const {
13480 +  const MachineInstr *MI = op.getParent();  // Instruction that owns op.
13481 +
13482 +  switch (MI->getDesc().OpInfo->RegClass) {  // RegClass of the first OpInfo entry.
13483 +  default: // FIXME: any other register class falls through to the i32 branch; confirm this is intended.
13484 +  case AMDGPU::GPRI32RegClassID: return AMDGPU::BRANCH_COND_i32;
13485 +  case AMDGPU::GPRF32RegClassID: return AMDGPU::BRANCH_COND_f32;
13486 +  };
13487 +}
13488 +
13489 +unsigned  // Returns the number of JUMP instructions built: 1 or 2.
13490 +R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
13491 +                            MachineBasicBlock *TBB,
13492 +                            MachineBasicBlock *FBB,
13493 +                            const SmallVectorImpl<MachineOperand> &Cond,
13494 +                            DebugLoc DL) const {
13495 +  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
13496 +
13497 +  if (FBB == 0) {  // Only one target: emit a single JUMP to TBB.
13498 +    if (Cond.empty()) {  // Unconditional: the predicate operand is register 0.
13499 +      BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB).addReg(0);
13500 +      return 1;
13501 +    } else {  // Conditional: update the predicate setter, then jump on PREDICATE_BIT.
13502 +      MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
13503 +      assert(PredSet && "No previous predicate !");
13504 +      addFlag(PredSet, 0, MO_FLAG_PUSH);
13505 +      PredSet->getOperand(2).setImm(Cond[1].getImm());  // Copy Cond[1]'s immediate into operand 2.
13506 +
13507 +      BuildMI(&MBB, DL, get(AMDGPU::JUMP))
13508 +             .addMBB(TBB)
13509 +             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
13510 +      return 1;
13511 +    }
13512 +  } else {  // Two targets: predicated JUMP to TBB, then unpredicated JUMP to FBB.
13513 +    MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
13514 +    assert(PredSet && "No previous predicate !");
13515 +    addFlag(PredSet, 0, MO_FLAG_PUSH);
13516 +    PredSet->getOperand(2).setImm(Cond[1].getImm());  // NOTE(review): Cond is not checked for emptiness on this path.
13517 +    BuildMI(&MBB, DL, get(AMDGPU::JUMP))
13518 +            .addMBB(TBB)
13519 +            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
13520 +    BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB).addReg(0);
13521 +    return 2;
13522 +  }
13523 +}
13524 +
13525 +unsigned  // Returns how many trailing JUMP instructions were erased: 0, 1 or 2.
13526 +R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
13527 +
13528 +  // Note: we leave the PRED* instructions in place.
13529 +  // They may be needed when predicating instructions.
13530 +
13531 +  MachineBasicBlock::iterator I = MBB.end();
13532 +
13533 +  if (I == MBB.begin()) {  // Empty block: nothing to remove.
13534 +    return 0;
13535 +  }
13536 +  --I;  // Step back to the last instruction.
13537 +  switch (I->getOpcode()) {
13538 +  default:
13539 +    return 0;  // The block does not end in a JUMP.
13540 +  case AMDGPU::JUMP:
13541 +    if (isPredicated(I)) {
13542 +      MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
13543 +      clearFlag(predSet, 0, MO_FLAG_PUSH);  // NOTE(review): predSet is not null-checked (InsertBranch asserts on it).
13544 +    }
13545 +    I->eraseFromParent();
13546 +    break;
13547 +  }
13548 +  I = MBB.end();  // Restart from the end; the instruction I referred to was erased.
13549 +
13550 +  if (I == MBB.begin()) {  // The erased JUMP was the only instruction.
13551 +    return 1;
13552 +  }
13553 +  --I;
13554 +  switch (I->getOpcode()) {
13555 +    // FIXME: JUMP is the only opcode handled here; confirm no other case is needed.
13556 +  default:
13557 +    return 1;  // Only one trailing JUMP was present.
13558 +  case AMDGPU::JUMP:
13559 +    if (isPredicated(I)) {
13560 +      MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
13561 +      clearFlag(predSet, 0, MO_FLAG_PUSH);  // NOTE(review): same unchecked predSet as above.
13562 +    }
13563 +    I->eraseFromParent();
13564 +    break;
13565 +  }
13566 +  return 2;
13567 +}
13568 +
13569 +bool  // True when MI's predicate operand is PRED_SEL_ONE, PRED_SEL_ZERO or PREDICATE_BIT.
13570 +R600InstrInfo::isPredicated(const MachineInstr *MI) const {
13571 +  int idx = MI->findFirstPredOperandIdx();
13572 +  if (idx < 0)  // Negative index: treated as "no predicate operand".
13573 +    return false;
13574 +
13575 +  unsigned Reg = MI->getOperand(idx).getReg();
13576 +  switch (Reg) {
13577 +  default: return false;  // Any other register counts as not predicated.
13578 +  case AMDGPU::PRED_SEL_ONE:
13579 +  case AMDGPU::PRED_SEL_ZERO:
13580 +  case AMDGPU::PREDICATE_BIT:
13581 +    return true;
13582 +  }
13583 +}
13584 +
13585 +bool
13586 +R600InstrInfo::isPredicable(MachineInstr *MI) const {
13587 +  // XXX: KILL* instructions can be predicated, but they must be the last
13588 +  // instruction in a clause, so this means any instructions after them cannot
13589 +  // be predicated.  Until we have proper support for instruction clauses in the
13590 +  // backend, we will mark KILL* instructions as unpredicable.
13591 +
13592 +  if (MI->getOpcode() == AMDGPU::KILLGT) {  // KILLGT is the only opcode rejected here.
13593 +    return false;
13594 +  } else {
13595 +    return AMDGPUInstrInfo::isPredicable(MI);  // Everything else: defer to the base class.
13596 +  }
13597 +}
13598 +
13599 +
13600 +bool  // Single-block overload.
13601 +R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
13602 +                                   unsigned NumCyles,
13603 +                                   unsigned ExtraPredCycles,
13604 +                                   const BranchProbability &Probability) const{
13605 +  return true;  // Always profitable; none of the arguments are consulted.
13606 +}
13607 +
13608 +bool  // Two-block (true/false side) overload.
13609 +R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
13610 +                                   unsigned NumTCycles,
13611 +                                   unsigned ExtraTCycles,
13612 +                                   MachineBasicBlock &FMBB,
13613 +                                   unsigned NumFCycles,
13614 +                                   unsigned ExtraFCycles,
13615 +                                   const BranchProbability &Probability) const {
13616 +  return true;  // Always profitable; none of the arguments are consulted.
13617 +}
13618 +
13619 +bool
13620 +R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
13621 +                                         unsigned NumCyles,
13622 +                                         const BranchProbability &Probability)
13623 +                                         const {
13624 +  return true;  // Always profitable; none of the arguments are consulted.
13625 +}
13626 +
13627 +bool
13628 +R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
13629 +                                         MachineBasicBlock &FMBB) const {
13630 +  return false;  // Never profitable; both blocks are ignored.
13631 +}
13632 +
13633 +
13634 +bool  // Returns false after reversing Cond in place, true when it cannot be reversed.
13635 +R600InstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
13636 +  MachineOperand &MO = Cond[1];  // Immediate holding one of the OPCODE_IS_* values.
13637 +  switch (MO.getImm()) {
13638 +  case OPCODE_IS_ZERO_INT:
13639 +    MO.setImm(OPCODE_IS_NOT_ZERO_INT);
13640 +    break;
13641 +  case OPCODE_IS_NOT_ZERO_INT:
13642 +    MO.setImm(OPCODE_IS_ZERO_INT);
13643 +    break;
13644 +  case OPCODE_IS_ZERO:
13645 +    MO.setImm(OPCODE_IS_NOT_ZERO);
13646 +    break;
13647 +  case OPCODE_IS_NOT_ZERO:
13648 +    MO.setImm(OPCODE_IS_ZERO);
13649 +    break;
13650 +  default:
13651 +    return true;  // Unrecognized immediate: Cond is left unchanged.
13652 +  }
13653 +
13654 +  MachineOperand &MO2 = Cond[2];  // Register operand: PRED_SEL_ZERO or PRED_SEL_ONE.
13655 +  switch (MO2.getReg()) {
13656 +  case AMDGPU::PRED_SEL_ZERO:
13657 +    MO2.setReg(AMDGPU::PRED_SEL_ONE);
13658 +    break;
13659 +  case AMDGPU::PRED_SEL_ONE:
13660 +    MO2.setReg(AMDGPU::PRED_SEL_ZERO);
13661 +    break;
13662 +  default:
13663 +    return true;  // NOTE(review): Cond[1] has already been flipped when this failure is reported.
13664 +  }
13665 +  return false;
13666 +}
13667 +
13668 +bool
13669 +R600InstrInfo::DefinesPredicate(MachineInstr *MI,
13670 +                                std::vector<MachineOperand> &Pred) const {
13671 +  return isPredicateSetter(MI->getOpcode());  // Decided by opcode alone; Pred is not written.
13672 +}
13673 +
13674 +
13675 +bool
13676 +R600InstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
13677 +                       const SmallVectorImpl<MachineOperand> &Pred2) const {
13678 +  return false;  // Always false; neither predicate list is inspected.
13679 +}
13680 +
13681 +
13682 +bool  // Returns true when MI was predicated, false when it has no predicate operand.
13683 +R600InstrInfo::PredicateInstruction(MachineInstr *MI,
13684 +                      const SmallVectorImpl<MachineOperand> &Pred) const {
13685 +  int PIdx = MI->findFirstPredOperandIdx();
13686 +
13687 +  if (PIdx != -1) {
13688 +    MachineOperand &PMO = MI->getOperand(PIdx);
13689 +    PMO.setReg(Pred[2].getReg());  // Pred[2] supplies the register for the predicate operand.
13690 +    MachineInstrBuilder(MI).addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
13691 +    return true;  // The line above also appended an implicit PREDICATE_BIT operand.
13692 +  }
13693 +
13694 +  return false;
13695 +}
13696 +
13697 +unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
13698 +                                            const MachineInstr *MI,
13699 +                                            unsigned *PredCost) const {
13700 +  if (PredCost)  // PredCost may be null; it is written only when provided.
13701 +    *PredCost = 2;
13702 +  return 2;  // Fixed latency; ItinData and MI are not consulted.
13703 +}
13704 +
13705 +MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB,
13706 +                                                  MachineBasicBlock::iterator I,
13707 +                                                  unsigned Opcode,
13708 +                                                  unsigned DstReg,
13709 +                                                  unsigned Src0Reg,
13710 +                                                  unsigned Src1Reg) const {
13711 +  MachineInstrBuilder MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode),
13712 +    DstReg);           // $dst
13713 +
13714 +  if (Src1Reg) {  // These two operands are emitted only when a second source is given.
13715 +    MIB.addImm(0)     // $update_exec_mask
13716 +       .addImm(0);    // $update_predicate
13717 +  }
13718 +  MIB.addImm(1)        // $write
13719 +     .addImm(0)        // $omod
13720 +     .addImm(0)        // $dst_rel
13721 +     .addImm(0)        // $dst_clamp
13722 +     .addReg(Src0Reg)  // $src0
13723 +     .addImm(0)        // $src0_neg
13724 +     .addImm(0)        // $src0_rel
13725 +     .addImm(0)        // $src0_abs
13726 +     .addImm(-1);       // $src0_sel
13727 +
13728 +  if (Src1Reg) {  // Second source and its modifiers; skipped when Src1Reg is 0.
13729 +    MIB.addReg(Src1Reg) // $src1
13730 +       .addImm(0)       // $src1_neg
13731 +       .addImm(0)       // $src1_rel
13732 +       .addImm(0)       // $src1_abs
13733 +       .addImm(-1);      // $src1_sel
13734 +  }
13735 +
13736 +  // XXX: The r600g finalizer expects this to be 1. Once we've moved the
13737 +  // scheduling to the backend, we can change the default to 0.
13738 +  MIB.addImm(1)        // $last
13739 +      .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel
13740 +      .addImm(0);        // $literal
13741 +
13742 +  return MIB;
13743 +}
13744 +
13745 +MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
13746 +                                         MachineBasicBlock::iterator I,
13747 +                                         unsigned DstReg,
13748 +                                         uint64_t Imm) const {
13749 +  MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg,
13750 +                                                  AMDGPU::ALU_LITERAL_X);  // MOV DstReg, ALU_LITERAL_X
13751 +  setImmOperand(MovImm, R600Operands::IMM, Imm);  // Store Imm in the IMM operand (setImmOperand takes int64_t).
13752 +  return MovImm;
13753 +}
13754 +
13755 +int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
13756 +                                 R600Operands::Ops Op) const {
13757 +  return getOperandIdx(MI.getOpcode(), Op);  // Forward to the opcode-based overload.
13758 +}
13759 +
13760 +int R600InstrInfo::getOperandIdx(unsigned Opcode,
13761 +                                 R600Operands::Ops Op) const {
13762 +  unsigned TargetFlags = get(Opcode).TSFlags;
13763 +  unsigned OpTableIdx;
13764 +
13765 +  if (!HAS_NATIVE_OPERANDS(TargetFlags)) {  // Non-native: fixed positions for dst and src0..src2.
13766 +    switch (Op) {
13767 +    case R600Operands::DST: return 0;
13768 +    case R600Operands::SRC0: return 1;
13769 +    case R600Operands::SRC1: return 2;
13770 +    case R600Operands::SRC2: return 3;
13771 +    default:
13772 +      assert(!"Unknown operand type for instruction");
13773 +      return -1;  // Reached only when asserts are compiled out.
13774 +    }
13775 +  }
13776 +
13777 +  if (TargetFlags & R600_InstFlag::OP1) {  // Native: choose the ALUOpTable row by operand count.
13778 +    OpTableIdx = 0;
13779 +  } else if (TargetFlags & R600_InstFlag::OP2) {
13780 +    OpTableIdx = 1;
13781 +  } else {
13782 +    assert((TargetFlags & R600_InstFlag::OP3) && "OP1, OP2, or OP3 not defined "
13783 +                                                 "for this instruction");
13784 +    OpTableIdx = 2;
13785 +  }
13786 +
13787 +  return R600Operands::ALUOpTable[OpTableIdx][Op];  // Op is used directly as the column index.
13788 +}
13789 +
13790 +void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op,
13791 +                                  int64_t Imm) const {
13792 +  int Idx = getOperandIdx(*MI, Op);  // Position of Op within MI's operand list.
13793 +  assert(Idx != -1 && "Operand not supported for this instruction.");
13794 +  assert(MI->getOperand(Idx).isImm());  // The target operand must already be an immediate.
13795 +  MI->getOperand(Idx).setImm(Imm);
13796 +}
13797 +
13798 +//===----------------------------------------------------------------------===//
13799 +// Instruction flag getters/setters
13800 +//===----------------------------------------------------------------------===//
13801 +
13802 +bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const {
13803 +  return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0;  // Index 0 means "no flag operand".
13804 +}
13805 +
13806 +MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
13807 +                                         unsigned Flag) const {
13808 +  unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
13809 +  int FlagIndex = 0;
13810 +  if (Flag != 0) {
13811 +    // If we pass something other than the default value of Flag to this
13812 +    // function, it means we want to set a flag on an instruction
13813 +    // that uses native encoding.
13814 +    assert(HAS_NATIVE_OPERANDS(TargetFlags));
13815 +    bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3;
13816 +    switch (Flag) {  // Map the flag to the operand that stores it.
13817 +    case MO_FLAG_CLAMP:
13818 +      FlagIndex = getOperandIdx(*MI, R600Operands::CLAMP);
13819 +      break;
13820 +    case MO_FLAG_MASK:
13821 +      FlagIndex = getOperandIdx(*MI, R600Operands::WRITE);
13822 +      break;
13823 +    case MO_FLAG_NOT_LAST:
13824 +    case MO_FLAG_LAST:
13825 +      FlagIndex = getOperandIdx(*MI, R600Operands::LAST);
13826 +      break;
13827 +    case MO_FLAG_NEG:
13828 +      switch (SrcIdx) {
13829 +      case 0: FlagIndex = getOperandIdx(*MI, R600Operands::SRC0_NEG); break;
13830 +      case 1: FlagIndex = getOperandIdx(*MI, R600Operands::SRC1_NEG); break;
13831 +      case 2: FlagIndex = getOperandIdx(*MI, R600Operands::SRC2_NEG); break;
13832 +      }  // NOTE(review): no default; any other SrcIdx leaves FlagIndex at 0.
13833 +      break;
13834 +
13835 +    case MO_FLAG_ABS:
13836 +      assert(!IsOP3 && "Cannot set absolute value modifier for OP3 "
13837 +                       "instructions.");
13838 +      switch (SrcIdx) {
13839 +      case 0: FlagIndex = getOperandIdx(*MI, R600Operands::SRC0_ABS); break;
13840 +      case 1: FlagIndex = getOperandIdx(*MI, R600Operands::SRC1_ABS); break;
13841 +      }  // NOTE(review): no default; any other SrcIdx leaves FlagIndex at 0.
13842 +      break;
13843 +
13844 +    default:
13845 +      FlagIndex = -1;  // Unknown flag: caught by the assert below.
13846 +      break;
13847 +    }
13848 +    assert(FlagIndex != -1 && "Flag not supported for this instruction");
13849 +  } else {  // Flag == 0: use the flag operand index encoded in TSFlags.
13850 +      FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags);
13851 +      assert(FlagIndex != 0 &&
13852 +         "Instruction flags not supported for this instruction");
13853 +  }
13854 +
13855 +  MachineOperand &FlagOp = MI->getOperand(FlagIndex);
13856 +  assert(FlagOp.isImm());  // Flags are always stored in an immediate operand.
13857 +  return FlagOp;
13858 +}
13859 +
13860 +void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand,
13861 +                            unsigned Flag) const {
13862 +  unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
13863 +  if (Flag == 0) {  // No flag requested: nothing to do.
13864 +    return;
13865 +  }
13866 +  if (HAS_NATIVE_OPERANDS(TargetFlags)) {
13867 +    MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag);
13868 +    if (Flag == MO_FLAG_NOT_LAST) {
13869 +      clearFlag(MI, Operand, MO_FLAG_LAST);  // NOT_LAST is expressed by clearing LAST.
13870 +    } else if (Flag == MO_FLAG_MASK) {
13871 +      clearFlag(MI, Operand, Flag);  // MASK is applied by clearing, not setting, its operand.
13872 +    } else {
13873 +      FlagOp.setImm(1);  // Every other native flag sets its operand to 1.
13874 +    }
13875 +  } else {  // Non-native: OR Flag into the shared immediate, shifted by NUM_MO_FLAGS * Operand.
13876 +      MachineOperand &FlagOp = getFlagOp(MI, Operand);
13877 +      FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand)));
13878 +  }
13879 +}
13880 +
13881 +void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand,
13882 +                              unsigned Flag) const {
13883 +  unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
13884 +  if (HAS_NATIVE_OPERANDS(TargetFlags)) {  // Native: the flag has its own operand; zero it.
13885 +    MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag);
13886 +    FlagOp.setImm(0);
13887 +  } else {  // Non-native: mask the flag's bits out of the shared immediate.
13888 +    MachineOperand &FlagOp = getFlagOp(MI);
13889 +    unsigned InstFlags = FlagOp.getImm();
13890 +    InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand));  // Same shift as addFlag uses when setting.
13891 +    FlagOp.setImm(InstFlags);
13892 +  }
13893 +}
13894 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600InstrInfo.h llvm-r600/lib/Target/R600/R600InstrInfo.h
13895 --- llvm-3.2.src/lib/Target/R600/R600InstrInfo.h        1970-01-01 01:00:00.000000000 +0100
13896 +++ llvm-r600/lib/Target/R600/R600InstrInfo.h   2013-01-25 19:43:57.466716387 +0100
13897 @@ -0,0 +1,169 @@
13898 +//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===//
13899 +//
13900 +//                     The LLVM Compiler Infrastructure
13901 +//
13902 +// This file is distributed under the University of Illinois Open Source
13903 +// License. See LICENSE.TXT for details.
13904 +//
13905 +//===----------------------------------------------------------------------===//
13906 +//
13907 +/// \file
13908 +/// \brief Interface definition for R600InstrInfo
13909 +//
13910 +//===----------------------------------------------------------------------===//
13911 +
13912 +#ifndef R600INSTRUCTIONINFO_H_
13913 +#define R600INSTRUCTIONINFO_H_
13914 +
13915 +#include "AMDIL.h"
13916 +#include "AMDGPUInstrInfo.h"
13917 +#include "R600Defines.h"
13918 +#include "R600RegisterInfo.h"
13919 +
13920 +#include <map>
13921 +
13922 +namespace llvm {
13923 +
13924 +  class AMDGPUTargetMachine;
13925 +  class DFAPacketizer;
13926 +  class ScheduleDAG;
13927 +  class MachineFunction;
13928 +  class MachineInstr;
13929 +  class MachineInstrBuilder;
13930 +
13931 +  class R600InstrInfo : public AMDGPUInstrInfo {
13932 +  private:
13933 +  const R600RegisterInfo RI;
13934 +
13935 +  int getBranchInstr(const MachineOperand &op) const;
13936 +
13937 +  public:
13938 +  explicit R600InstrInfo(AMDGPUTargetMachine &tm);
13939 +
13940 +  const R600RegisterInfo &getRegisterInfo() const;
13941 +  virtual void copyPhysReg(MachineBasicBlock &MBB,
13942 +                           MachineBasicBlock::iterator MI, DebugLoc DL,
13943 +                           unsigned DestReg, unsigned SrcReg,
13944 +                           bool KillSrc) const;
13945 +
13946 +  bool isTrig(const MachineInstr &MI) const;
13947 +  bool isPlaceHolderOpcode(unsigned opcode) const;
13948 +  bool isReductionOp(unsigned opcode) const;
13949 +  bool isCubeOp(unsigned opcode) const;
13950 +
13951 +  /// \returns true if this \p Opcode represents an ALU instruction.
13952 +  bool isALUInstr(unsigned Opcode) const;
13953 +
13954 +  /// \brief Vector instructions are instructions that must fill all
13955 +  /// instruction slots within an instruction group.
13956 +  bool isVector(const MachineInstr &MI) const;
13957 +
13958 +  virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg,
13959 +                                        int64_t Imm) const;
13960 +
13961 +  virtual unsigned getIEQOpcode() const;
13962 +  virtual bool isMov(unsigned Opcode) const;
13963 +
13964 +  DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM,
13965 +                                           const ScheduleDAG *DAG) const;
13966 +
13967 +  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
13968 +
13969 +  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
13970 +                     SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const;
13971 +
13972 +  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const;
13973 +
13974 +  unsigned RemoveBranch(MachineBasicBlock &MBB) const;
13975 +
13976 +  bool isPredicated(const MachineInstr *MI) const;
13977 +
13978 +  bool isPredicable(MachineInstr *MI) const;
13979 +
13980 +  bool
13981 +   isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
13982 +                             const BranchProbability &Probability) const;
13983 +
13984 +  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
13985 +                           unsigned ExtraPredCycles,
13986 +                           const BranchProbability &Probability) const ;
13987 +
13988 +  bool
13989 +   isProfitableToIfCvt(MachineBasicBlock &TMBB,
13990 +                       unsigned NumTCycles, unsigned ExtraTCycles,
13991 +                       MachineBasicBlock &FMBB,
13992 +                       unsigned NumFCycles, unsigned ExtraFCycles,
13993 +                       const BranchProbability &Probability) const;
13994 +
13995 +  bool DefinesPredicate(MachineInstr *MI,
13996 +                                  std::vector<MachineOperand> &Pred) const;
13997 +
13998 +  bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
13999 +                         const SmallVectorImpl<MachineOperand> &Pred2) const;
14000 +
14001 +  bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
14002 +                                          MachineBasicBlock &FMBB) const;
14003 +
14004 +  bool PredicateInstruction(MachineInstr *MI,
14005 +                        const SmallVectorImpl<MachineOperand> &Pred) const;
14006 +
14007 +  unsigned int getInstrLatency(const InstrItineraryData *ItinData,
14008 +                               const MachineInstr *MI,
14009 +                               unsigned *PredCost = 0) const;
14010 +
14011 +  virtual int getInstrLatency(const InstrItineraryData *ItinData,
14012 +                              SDNode *Node) const { return 1;}
14013 +
14014 +  /// You can use this function to avoid manually specifying each instruction
14015 +  /// modifier operand when building a new instruction.
14016 +  ///
14017 +  /// \returns a MachineInstrBuilder for the new instruction, with all the
14018 +  /// instruction modifiers initialized to their default values.
14019 +  MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB,
14020 +                                              MachineBasicBlock::iterator I,
14021 +                                              unsigned Opcode,
14022 +                                              unsigned DstReg,
14023 +                                              unsigned Src0Reg,
14024 +                                              unsigned Src1Reg = 0) const;
14025 +
14026 +  MachineInstr *buildMovImm(MachineBasicBlock &BB,
14027 +                                  MachineBasicBlock::iterator I,
14028 +                                  unsigned DstReg,
14029 +                                  uint64_t Imm) const;
14030 +
14031 +  /// \brief Get the index of Op in the MachineInstr.
14032 +  ///
14033 +  /// \returns -1 if the Instruction does not contain the specified \p Op.
14034 +  int getOperandIdx(const MachineInstr &MI, R600Operands::Ops Op) const;
14035 +
14036 +  /// \brief Get the index of \p Op for the given Opcode.
14037 +  ///
14038 +  /// \returns -1 if the Instruction does not contain the specified \p Op.
14039 +  int getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const;
14040 +
14041 +  /// \brief Helper function for setting instruction flag values.
14042 +  void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) const;
14043 +
14044 +  /// \returns true if this instruction has an operand for storing target flags.
14045 +  bool hasFlagOperand(const MachineInstr &MI) const;
14046 +
14047 +  ///\brief Add one of the MO_FLAG* flags to the specified \p Operand.
14048 +  void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
14049 +
14050 +  ///\brief Determine if the specified \p Flag is set on this \p Operand.
14051 +  bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const;
14052 +
14053 +  /// \param SrcIdx The register source to set the flag on (e.g. src0, src1, src2)
14054 +  /// \param Flag The flag being set.
14055 +  ///
14056 +  /// \returns the operand containing the flags for this instruction.
14057 +  MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0,
14058 +                            unsigned Flag = 0) const;
14059 +
14060 +  /// \brief Clear the specified flag on the instruction.
14061 +  void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
14062 +};
14063 +
14064 +} // End llvm namespace
14065 +
14066 +#endif // R600INSTRUCTIONINFO_H_
14067 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600Instructions.td llvm-r600/lib/Target/R600/R600Instructions.td
14068 --- llvm-3.2.src/lib/Target/R600/R600Instructions.td    1970-01-01 01:00:00.000000000 +0100
14069 +++ llvm-r600/lib/Target/R600/R600Instructions.td       2013-01-25 19:43:57.466716387 +0100
14070 @@ -0,0 +1,1843 @@
14071 +//===-- R600Instructions.td - R600 Instruction defs  -------*- tablegen -*-===//
14072 +//
14073 +//                     The LLVM Compiler Infrastructure
14074 +//
14075 +// This file is distributed under the University of Illinois Open Source
14076 +// License. See LICENSE.TXT for details.
14077 +//
14078 +//===----------------------------------------------------------------------===//
14079 +//
14080 +// R600 Tablegen instruction definitions
14081 +//
14082 +//===----------------------------------------------------------------------===//
14083 +
14084 +include "R600Intrinsics.td"
14085 +
14086 +class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern,
14087 +                InstrItinClass itin>
14088 +    : AMDGPUInst <outs, ins, asm, pattern> {
14089 +
14090 +  field bits<64> Inst;
14091 +  bit Trig = 0;
14092 +  bit Op3 = 0;
14093 +  bit isVector = 0;
14094 +  bits<2> FlagOperandIdx = 0;  // presumably read back via GET_FLAG_OPERAND_IDX -- confirm in R600Defines.h
14095 +  bit Op1 = 0;
14096 +  bit Op2 = 0;
14097 +  bit HasNativeOperands = 0;
14098 +
14099 +  bits<11> op_code = inst;  // The opcode is kept here; Inst itself is not assigned in this class.
14100 +  //let Inst = inst;
14101 +  let Namespace = "AMDGPU";
14102 +  let OutOperandList = outs;
14103 +  let InOperandList = ins;
14104 +  let AsmString = asm;
14105 +  let Pattern = pattern;
14106 +  let Itinerary = itin;
14107 +
14108 +  let TSFlags{4} = Trig;  // TSFlags bits 3-0 are not assigned in this class.
14109 +  let TSFlags{5} = Op3;
14110 +
14111 +  // Vector instructions are instructions that must fill all slots in an
14112 +  // instruction group
14113 +  let TSFlags{6} = isVector;
14114 +  let TSFlags{8-7} = FlagOperandIdx;
14115 +  let TSFlags{9} = HasNativeOperands;
14116 +  let TSFlags{10} = Op1;
14117 +  let TSFlags{11} = Op2;
14118 +}
14119 +
14120 +class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> :
14121 +    AMDGPUInst <outs, ins, asm, pattern> {
14122 +  field bits<64> Inst;  // 64-bit encoding field; no bits are assigned in this class.
14123 +
14124 +  let Namespace = "AMDGPU";
14125 +}
14126 +
14127 +def MEMxi : Operand<iPTR> {
14128 +  let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index);  // R600_TReg32_X pointer + i32 immediate index.
14129 +  let PrintMethod = "printMemOperand";
14130 +}
14131 +
14132 +def MEMrr : Operand<iPTR> {
14133 +  let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index);  // Pointer and index are both R600_Reg32.
14134 +}
14135 +
14136 +// Operands for non-registers
14137 +
14138 +class InstFlag<string PM = "printOperand", int Default = 0>
14139 +    : OperandWithDefaultOps <i32, (ops (i32 Default))> {  // i32 operand whose default value is Default.
14140 +  let PrintMethod = PM;  // Print method name; "printOperand" unless overridden.
14141 +}
14142 +
14143 +// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers
14144 +def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> {  // Default is -1, so this is not built on InstFlag.
14145 +  let PrintMethod = "printSel";
14146 +}
14147 +
14148 +def LITERAL : InstFlag<"printLiteral">;  // Default 0 (InstFlag's default).
14149 +
14150 +def WRITE : InstFlag <"printWrite", 1>;  // The only flag in this group that defaults to 1.
14151 +def OMOD : InstFlag <"printOMOD">;
14152 +def REL : InstFlag <"printRel">;
14153 +def CLAMP : InstFlag <"printClamp">;
14154 +def NEG : InstFlag <"printNeg">;
14155 +def ABS : InstFlag <"printAbs">;
14156 +def UEM : InstFlag <"printUpdateExecMask">;  // UEM: update exec mask (per its print method name).
14157 +def UP : InstFlag <"printUpdatePred">;  // UP: update predicate (per its print method name).
14158 +
14159 +// XXX: The r600g finalizer in Mesa expects last to be one in most cases.
14160 +// Once we start using the packetizer in this backend we should have this
14161 +// default to 0.
14162 +def LAST : InstFlag<"printLast", 1>;
14163 +
14164 +def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>;  // All five patterns below pass empty lists for the last two arguments.
14165 +def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
14166 +def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
14167 +def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>;
14168 +def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>;
14169 +
14170 +class R600ALU_Word0 {
14171 +  field bits<32> Word0;  // All 32 bits are assigned by the let statements below.
14172 +
14173 +  bits<11> src0;
14174 +  bits<1>  src0_neg;
14175 +  bits<1>  src0_rel;
14176 +  bits<11> src1;
14177 +  bits<1>  src1_rel;
14178 +  bits<1>  src1_neg;
14179 +  bits<3>  index_mode = 0;
14180 +  bits<2>  pred_sel;
14181 +  bits<1>  last;
14182 +
14183 +  bits<9>  src0_sel  = src0{8-0};  // Low 9 bits of the 11-bit source value.
14184 +  bits<2>  src0_chan = src0{10-9};  // Top 2 bits of the 11-bit source value.
14185 +  bits<9>  src1_sel  = src1{8-0};
14186 +  bits<2>  src1_chan = src1{10-9};
14187 +
14188 +  let Word0{8-0}   = src0_sel;
14189 +  let Word0{9}     = src0_rel;
14190 +  let Word0{11-10} = src0_chan;
14191 +  let Word0{12}    = src0_neg;
14192 +  let Word0{21-13} = src1_sel;
14193 +  let Word0{22}    = src1_rel;
14194 +  let Word0{24-23} = src1_chan;
14195 +  let Word0{25}    = src1_neg;
14196 +  let Word0{28-26} = index_mode;
14197 +  let Word0{30-29} = pred_sel;
14198 +  let Word0{31}    = last;
14199 +}
14200 +
14201 +class R600ALU_Word1 {
14202 +  field bits<32> Word1;  // Only bits 31-18 are assigned here; bits 17-0 are left unassigned by this class.
14203 +
14204 +  bits<11> dst;
14205 +  bits<3>  bank_swizzle = 0;
14206 +  bits<1>  dst_rel;
14207 +  bits<1>  clamp;
14208 +
14209 +  bits<7>  dst_sel  = dst{6-0};  // NOTE(review): dst bits 8-7 are not encoded anywhere in this class -- confirm intended.
14210 +  bits<2>  dst_chan = dst{10-9};
14211 +
14212 +  let Word1{20-18} = bank_swizzle;
14213 +  let Word1{27-21} = dst_sel;
14214 +  let Word1{28}    = dst_rel;
14215 +  let Word1{30-29} = dst_chan;
14216 +  let Word1{31}    = clamp;
14217 +}
14218 +
14219 +class R600ALU_Word1_OP2 <bits<11> alu_inst> : R600ALU_Word1{
14220 +
14221 +  bits<1>  src0_abs;
14222 +  bits<1>  src1_abs;
14223 +  bits<1>  update_exec_mask;
14224 +  bits<1>  update_pred;
14225 +  bits<1>  write;
14226 +  bits<2>  omod;
14227 +
14228 +  let Word1{0}     = src0_abs;  // This class assigns Word1 bits 17-0.
14229 +  let Word1{1}     = src1_abs;
14230 +  let Word1{2}     = update_exec_mask;
14231 +  let Word1{3}     = update_pred;
14232 +  let Word1{4}     = write;
14233 +  let Word1{6-5}   = omod;
14234 +  let Word1{17-7}  = alu_inst;  // 11-bit opcode.
14235 +}
14236 +
14237 +class R600ALU_Word1_OP3 <bits<5> alu_inst> : R600ALU_Word1{
14238 +
14239 +  bits<11> src2;
14240 +  bits<1>  src2_rel;
14241 +  bits<1>  src2_neg;
14242 +
14243 +  bits<9>  src2_sel = src2{8-0};
14244 +  bits<2>  src2_chan = src2{10-9};
14245 +
14246 +  let Word1{8-0}   = src2_sel;  // This class assigns Word1 bits 17-0; there are no abs/omod/write fields here.
14247 +  let Word1{9}     = src2_rel;
14248 +  let Word1{11-10} = src2_chan;
14249 +  let Word1{12}    = src2_neg;
14250 +  let Word1{17-13} = alu_inst;  // 5-bit opcode (the OP2 class uses 11 bits).
14251 +}
14252 +
14253 +class VTX_WORD0 {
14254 +  field bits<32> Word0;  // All 32 bits are assigned by the let statements below.
14255 +  bits<7> SRC_GPR;
14256 +  bits<5> VC_INST;
14257 +  bits<2> FETCH_TYPE;
14258 +  bits<1> FETCH_WHOLE_QUAD;
14259 +  bits<8> BUFFER_ID;
14260 +  bits<1> SRC_REL;
14261 +  bits<2> SRC_SEL_X;
14262 +  bits<6> MEGA_FETCH_COUNT;
14263 +
14264 +  let Word0{4-0}   = VC_INST;  // Fields are laid out in bit order, not declaration order.
14265 +  let Word0{6-5}   = FETCH_TYPE;
14266 +  let Word0{7}     = FETCH_WHOLE_QUAD;
14267 +  let Word0{15-8}  = BUFFER_ID;
14268 +  let Word0{22-16} = SRC_GPR;
14269 +  let Word0{23}    = SRC_REL;
14270 +  let Word0{25-24} = SRC_SEL_X;
14271 +  let Word0{31-26} = MEGA_FETCH_COUNT;
14272 +}
14273 +
14274 +class VTX_WORD1_GPR {
14275 +  field bits<32> Word1;  // All 32 bits are assigned below (bit 8 is fixed to 0).
14276 +  bits<7> DST_GPR;
14277 +  bits<1> DST_REL;
14278 +  bits<3> DST_SEL_X;
14279 +  bits<3> DST_SEL_Y;
14280 +  bits<3> DST_SEL_Z;
14281 +  bits<3> DST_SEL_W;
14282 +  bits<1> USE_CONST_FIELDS;
14283 +  bits<6> DATA_FORMAT;
14284 +  bits<2> NUM_FORMAT_ALL;
14285 +  bits<1> FORMAT_COMP_ALL;
14286 +  bits<1> SRF_MODE_ALL;
14287 +
14288 +  let Word1{6-0} = DST_GPR;
14289 +  let Word1{7}    = DST_REL;
14290 +  let Word1{8}    = 0; // Reserved
14291 +  let Word1{11-9} = DST_SEL_X;  // One 3-bit select per destination component X, Y, Z, W.
14292 +  let Word1{14-12} = DST_SEL_Y;
14293 +  let Word1{17-15} = DST_SEL_Z;
14294 +  let Word1{20-18} = DST_SEL_W;
14295 +  let Word1{21}    = USE_CONST_FIELDS;
14296 +  let Word1{27-22} = DATA_FORMAT;
14297 +  let Word1{29-28} = NUM_FORMAT_ALL;
14298 +  let Word1{30}    = FORMAT_COMP_ALL;
14299 +  let Word1{31}    = SRF_MODE_ALL;
14300 +}
14301 +
14302 +/*
14303 +XXX: R600 subtarget uses a slightly different encoding than the other
14304 +subtargets.  We currently handle this in R600MCCodeEmitter, but we may
14305 +want to use these instruction classes in the future.
14306 +
14307 +class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 {
14308 +
14309 +  bits<1>  fog_merge;
14310 +  bits<10> alu_inst;
14311 +
14312 +  let Inst{37}    = fog_merge;
14313 +  let Inst{39-38} = omod;
14314 +  let Inst{49-40} = alu_inst;
14315 +}
14316 +
14317 +class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 {
14318 +
14319 +  bits<11> alu_inst;
14320 +
14321 +  let Inst{38-37} = omod;
14322 +  let Inst{49-39} = alu_inst;
14323 +}
14324 +*/
14325 +
14326 +def R600_Pred : PredicateOperand<i32, (ops R600_Predicate),
14327 +                                     (ops PRED_SEL_OFF)>;
14328 +
14329 +
14330 +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
14331 +
14332 +// Class for instructions with only one source register.
14333 +// If you add new ins to this instruction, make sure they are listed before
14334 +// $literal, because the backend currently assumes that the last operand is
14335 +// a literal.  Also be sure to update the enum R600Op1OperandIndex::ROI in
14336 +// R600Defines.h, R600InstrInfo::buildDefaultInstruction(),
14337 +// and R600InstrInfo::getOperandIdx().
14338 +class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
14339 +                InstrItinClass itin = AnyALU> :
14340 +    InstR600 <0, // NOTE(review): R600_2OP passes inst here; confirm the literal 0 is intended
14341 +              (outs R600_Reg32:$dst),
14342 +              (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
14343 +                   R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
14344 +                   LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
14345 +              !strconcat(opName,
14346 +                   "$clamp $dst$write$dst_rel$omod, "
14347 +                   "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, "
14348 +                   "$literal $pred_sel$last"),
14349 +              pattern,
14350 +              itin>,
14351 +    R600ALU_Word0,
14352 +    R600ALU_Word1_OP2 <inst> {
14353 +  // (ins) has no src1, update_exec_mask or update_pred operand, so those fields are fixed to 0.
14354 +  let src1 = 0;
14355 +  let src1_rel = 0;
14356 +  let src1_neg = 0;
14357 +  let src1_abs = 0;
14358 +  let update_exec_mask = 0;
14359 +  let update_pred = 0;
14360 +  let HasNativeOperands = 1;
14361 +  let Op1 = 1;
14362 +  let DisableEncoding = "$literal";
14363 +  // 64-bit encoding: Word0 fills the low half, Word1 the high half.
14364 +  let Inst{31-0}  = Word0;
14365 +  let Inst{63-32} = Word1;
14366 +}
14367 +
14368 +class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
14369 +                    InstrItinClass itin = AnyALU> :
14370 +    R600_1OP <inst, opName,
14371 +              [(set R600_Reg32:$dst, (node R600_Reg32:$src0))], itin
14372 +>; // R600_1OP with the pattern dst = node(src0); itin is forwarded to R600_1OP.
14373 +
14374 +// If you add or change the operands for R600_2OP instructions, you must
14375 +// also update the R600Op2OperandIndex::ROI enum in R600Defines.h,
14376 +// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx().
14377 +class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
14378 +                InstrItinClass itin = AnyALU> :
14379 +  InstR600 <inst,
14380 +          (outs R600_Reg32:$dst),
14381 +          (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write,
14382 +               OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
14383 +               R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
14384 +               R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel,
14385 +               LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
14386 +          !strconcat(opName,
14387 +                "$clamp $update_exec_mask$update_pred$dst$write$dst_rel$omod, "
14388 +                "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, "
14389 +                "$src1_neg$src1_abs$src1$src1_sel$src1_abs$src1_rel, "
14390 +                "$literal $pred_sel$last"),
14391 +          pattern,
14392 +          itin>,
14393 +    R600ALU_Word0,
14394 +    R600ALU_Word1_OP2 <inst> {
14395 +
14396 +  let HasNativeOperands = 1;
14397 +  let Op2 = 1;
14398 +  let DisableEncoding = "$literal";
14399 +
14400 +  let Inst{31-0}  = Word0;
14401 +  let Inst{63-32} = Word1;
14402 +}
14403 +
14404 +class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
14405 +                       InstrItinClass itin = AnyALU> :
14406 +    R600_2OP <inst, opName,
14407 +              [(set R600_Reg32:$dst, (node R600_Reg32:$src0,
14408 +                                           R600_Reg32:$src1))], itin
14409 +>; // R600_2OP with the pattern dst = node(src0, src1); itin is forwarded to R600_2OP.
14410 +
14411 +// If you add or change the operands for R600_3OP instructions, you must
14412 +// also update the R600Op3OperandIndex::ROI enum in R600Defines.h,
14413 +// R600InstrInfo::buildDefaultInstruction(), and
14414 +// R600InstrInfo::getOperandIdx().
14415 +class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
14416 +                InstrItinClass itin = AnyALU> :
14417 +  InstR600 <0,
14418 +          (outs R600_Reg32:$dst),
14419 +          (ins REL:$dst_rel, CLAMP:$clamp,
14420 +               R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel,
14421 +               R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel,
14422 +               R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel,
14423 +               LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
14424 +          !strconcat(opName, "$clamp $dst$dst_rel, "
14425 +                             "$src0_neg$src0$src0_sel$src0_rel, "
14426 +                             "$src1_neg$src1$src1_sel$src1_rel, "
14427 +                             "$src2_neg$src2$src2_sel$src2_rel, "
14428 +                             "$literal $pred_sel$last"),
14429 +          pattern,
14430 +          itin>,
14431 +    R600ALU_Word0,
14432 +    R600ALU_Word1_OP3<inst>{
14433 +
14434 +  let HasNativeOperands = 1;
14435 +  let DisableEncoding = "$literal";
14436 +  let Op3 = 1;
14437 +
14438 +  let Inst{31-0}  = Word0;
14439 +  let Inst{63-32} = Word1;
14440 +}
14441 +
14442 +class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
14443 +                      InstrItinClass itin = VecALU> :
14444 +  InstR600 <inst,
14445 +          (outs R600_Reg32:$dst),
14446 +          ins,
14447 +          asm,
14448 +          pattern,
14449 +          itin>;
14450 +
14451 +class R600_TEX <bits<11> inst, string opName, list<dag> pattern,
14452 +                InstrItinClass itin = AnyALU> :
14453 +  InstR600 <inst,
14454 +          (outs R600_Reg128:$dst),
14455 +          (ins R600_Reg128:$src0, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
14456 +          !strconcat(opName, "$dst, $src0, $resourceId, $samplerId, $textureTarget"),
14457 +          pattern,
14458 +          itin>{
14459 +    let Inst {10-0} = inst;
14460 +  }
14461 +
14462 +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
14463 +
14464 +def TEX_SHADOW : PatLeaf<
14465 +  (imm), // Accepts immediates whose value, truncated to 32 bits, is 6..8 or 11..13.
14466 +  [{const uint32_t Target = static_cast<uint32_t>(N->getZExtValue());
14467 +    return (Target >= 6 && Target <= 8) || (Target >= 11 && Target <= 13);
14468 +  }]
14469 +>;
14470 +
14471 +def TEX_RECT : PatLeaf<
14472 +  (imm), // Accepts only the immediate whose value, truncated to 32 bits, is 5.
14473 +  [{const uint32_t Target = static_cast<uint32_t>(N->getZExtValue());
14474 +    return Target == 5;
14475 +  }]
14476 +>;
14477 +
14478 +class EG_CF_RAT <bits <8> cf_inst, bits <6> rat_inst, bits<4> rat_id, dag outs,
14479 +                 dag ins, string asm, list<dag> pattern> :
14480 +    InstR600ISA <outs, ins, asm, pattern> { // Fields declared below get no value in this class.
14481 +  bits<7>  RW_GPR;
14482 +  bits<7>  INDEX_GPR;
14483 +
14484 +  bits<2>  RIM;
14485 +  bits<2>  TYPE;
14486 +  bits<1>  RW_REL;
14487 +  bits<2>  ELEM_SIZE;
14488 +
14489 +  bits<12> ARRAY_SIZE;
14490 +  bits<4>  COMP_MASK;
14491 +  bits<4>  BURST_COUNT;
14492 +  bits<1>  VPM;
14493 +  bits<1>  eop;
14494 +  bits<1>  MARK;
14495 +  bits<1>  BARRIER;
14496 +
14497 +  // CF_ALLOC_EXPORT_WORD0_RAT: Inst{31-0}; rat_id and rat_inst come from the template arguments.
14498 +  let Inst{3-0}   = rat_id;
14499 +  let Inst{9-4}   = rat_inst;
14500 +  let Inst{10}    = 0; // Reserved
14501 +  let Inst{12-11} = RIM;
14502 +  let Inst{14-13} = TYPE;
14503 +  let Inst{21-15} = RW_GPR;
14504 +  let Inst{22}    = RW_REL;
14505 +  let Inst{29-23} = INDEX_GPR;
14506 +  let Inst{31-30} = ELEM_SIZE;
14507 +
14508 +  // CF_ALLOC_EXPORT_WORD1_BUF: Inst{63-32}; cf_inst comes from the template arguments.
14509 +  let Inst{43-32} = ARRAY_SIZE;
14510 +  let Inst{47-44} = COMP_MASK;
14511 +  let Inst{51-48} = BURST_COUNT;
14512 +  let Inst{52}    = VPM;
14513 +  let Inst{53}    = eop;
14514 +  let Inst{61-54} = cf_inst;
14515 +  let Inst{62}    = MARK;
14516 +  let Inst{63}    = BARRIER;
14517 +}
14518 +
14519 +class LoadParamFrag <PatFrag load_type> : PatFrag <
14520 +  (ops node:$ptr), (load_type node:$ptr),
14521 +  [{ return isParamLoad(dyn_cast<LoadSDNode>(N)); }]
14522 +>;
14523 +
14524 +def load_param : LoadParamFrag<load>;
14525 +def load_param_zexti8 : LoadParamFrag<zextloadi8>;
14526 +def load_param_zexti16 : LoadParamFrag<zextloadi16>;
14527 +
14528 +def isR600 : Predicate<"Subtarget.device()"
14529 +                            "->getGeneration() == AMDGPUDeviceInfo::HD4XXX">;
14530 +def isR700 : Predicate<"Subtarget.device()"
14531 +                            "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&"
14532 +                            "Subtarget.device()->getDeviceFlag()"
14533 +                            ">= OCL_DEVICE_RV710">;
14534 +def isEG : Predicate<
14535 +  "Subtarget.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX && "
14536 +  "Subtarget.device()->getGeneration() < AMDGPUDeviceInfo::HD7XXX && "
14537 +  "Subtarget.device()->getDeviceFlag() != OCL_DEVICE_CAYMAN">;
14538 +
14539 +def isCayman : Predicate<"Subtarget.device()"
14540 +                            "->getDeviceFlag() == OCL_DEVICE_CAYMAN">;
14541 +def isEGorCayman : Predicate<"Subtarget.device()"
14542 +                            "->getGeneration() == AMDGPUDeviceInfo::HD5XXX"
14543 +                            "|| Subtarget.device()->getGeneration() =="
14544 +                            "AMDGPUDeviceInfo::HD6XXX">;
14545 +
14546 +def isR600toCayman : Predicate<
14547 +                     "Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX">;
14548 +
14549 +//===----------------------------------------------------------------------===//
14550 +// R600 SDNodes
14551 +//===----------------------------------------------------------------------===//
14552 +
14553 +def INTERP: SDNode<"AMDGPUISD::INTERP",
14554 +  SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisInt<1>, SDTCisInt<2>]>
14555 +  >;
14556 +
14557 +def INTERP_P0: SDNode<"AMDGPUISD::INTERP_P0",
14558 +  SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisInt<1>]>
14559 +  >;
14560 +
14561 +def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
14562 +  SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
14563 +  [SDNPMayLoad]
14564 +>;
14565 +
14566 +//===----------------------------------------------------------------------===//
14567 +// Interpolation Instructions
14568 +//===----------------------------------------------------------------------===//
14569 +
14570 +let usesCustomInserter = 1 in {
14571 +def input_perspective :  AMDGPUShaderInst <
14572 +  (outs R600_Reg128:$dst),
14573 +  (ins i32imm:$src0, i32imm:$src1),
14574 +  "input_perspective $src0 $src1 : dst",
14575 +  [(set R600_Reg128:$dst, (INTERP (i32 imm:$src0), (i32 imm:$src1)))]>;
14576 +}  // End usesCustomInserter = 1
14577 +
14578 +def input_constant :  AMDGPUShaderInst <
14579 +  (outs R600_Reg128:$dst),
14580 +  (ins i32imm:$src),
14581 +  "input_constant $src : dst", // mnemonic matches the def name, distinct from input_perspective
14582 +  [(set R600_Reg128:$dst, (INTERP_P0 (i32 imm:$src)))]>;
14583 +
14584 +
14585 +
14586 +def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
14587 +  let bank_swizzle = 5;
14588 +}
14589 +
14590 +def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> {
14591 +  let bank_swizzle = 5;
14592 +}
14593 +
14594 +def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>;
14595 +
14596 +//===----------------------------------------------------------------------===//
14597 +// Export Instructions
14598 +//===----------------------------------------------------------------------===//
14599 +
14600 +def ExportType : SDTypeProfile<0, 5, [SDTCisFP<0>, SDTCisInt<1>]>;
14601 +
14602 +def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType,
14603 +  [SDNPHasChain, SDNPSideEffect]>;
14604 +
14605 +class ExportWord0 { // Mixin providing the 32-bit Word0 shared by ExportSwzInst and ExportBufInst.
14606 +  field bits<32> Word0;
14607 +
14608 +  bits<13> arraybase;
14609 +  bits<2> type;
14610 +  bits<7> gpr;
14611 +  bits<2> elem_size;
14612 +
14613 +  let Word0{12-0} = arraybase;
14614 +  let Word0{14-13} = type;
14615 +  let Word0{21-15} = gpr;
14616 +  let Word0{22} = 0; // RW_REL: fixed to 0 in this class
14617 +  let Word0{29-23} = 0; // INDEX_GPR: fixed to 0 in this class
14618 +  let Word0{31-30} = elem_size;
14619 +}
14620 +
14621 +class ExportSwzWord1 { // Mixin providing Word1 for exports that carry per-channel swizzles.
14622 +  field bits<32> Word1;
14623 +
14624 +  bits<3> sw_x;
14625 +  bits<3> sw_y;
14626 +  bits<3> sw_z;
14627 +  bits<3> sw_w;
14628 +  bits<1> eop;
14629 +  bits<8> inst;
14630 +  // Only Word1{11-0} is assigned here; eop, inst and the upper bits are set by defs such as R600_ExportSwz.
14631 +  let Word1{2-0} = sw_x;
14632 +  let Word1{5-3} = sw_y;
14633 +  let Word1{8-6} = sw_z;
14634 +  let Word1{11-9} = sw_w;
14635 +}
14636 +
14637 +class ExportBufWord1 { // Mixin providing Word1 for exports that carry arraySize and compMask.
14638 +  field bits<32> Word1;
14639 +
14640 +  bits<12> arraySize;
14641 +  bits<4> compMask;
14642 +  bits<1> eop;
14643 +  bits<8> inst;
14644 +  // Only Word1{15-0} is assigned here; eop and inst are declared but not encoded by this class.
14645 +  let Word1{11-0} = arraySize;
14646 +  let Word1{15-12} = compMask;
14647 +}
14648 +
14649 +multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> { // Selects export intrinsics and EXPORT nodes to ExportInst.
14650 +  def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg),
14651 +    (ExportInst
14652 +        (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sel_x),
14653 +        0, 61, 0, 7, 7, 7, cf_inst, 0)
14654 +  >;
14655 +  // Operand order, assuming ExportInst is an ExportSwzInst: gpr, type, arraybase, sw_x, sw_y, sw_z, sw_w, inst, eop.
14656 +  def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg),
14657 +    (ExportInst
14658 +        (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sel_x),
14659 +        0, 61, 7, 0, 7, 7, cf_inst, 0)
14660 +  >;
14661 +
14662 +  def : Pat<(int_R600_store_pixel_dummy),
14663 +    (ExportInst
14664 +        (v4f32 (IMPLICIT_DEF)), 0, 0, 7, 7, 7, 7, cf_inst, 0)
14665 +  >;
14666 +
14667 +  def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 0),
14668 +    (i32 imm:$type), (i32 imm:$arraybase), (i32 imm)),
14669 +        (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase,
14670 +        0, 1, 2, 3, cf_inst, 0)
14671 +  >;
14672 +  def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 1), // same expansion as the (i32 0) pattern above
14673 +    (i32 imm:$type), (i32 imm:$arraybase), (i32 imm)),
14674 +        (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase,
14675 +        0, 1, 2, 3, cf_inst, 0)
14676 +  >;
14677 +
14678 +  def : Pat<(int_R600_store_swizzle (v4f32 R600_Reg128:$src), imm:$arraybase,
14679 +      imm:$type),
14680 +    (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase,
14681 +        0, 1, 2, 3, cf_inst, 0)
14682 +  >;
14683 +}
14684 +
14685 +multiclass SteamOutputExportPattern<Instruction ExportInst, // NOTE(review): "Steam" looks like a typo for "Stream"
14686 +    bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> {
14687 +// Stream0: third intrinsic operand is (i32 0), so buf0inst is passed to ExportInst
14688 +  def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
14689 +      (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)),
14690 +      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
14691 +      4095, imm:$mask, buf0inst, 0)>;
14692 +// Stream1: third intrinsic operand is (i32 1), so buf1inst is passed to ExportInst
14693 +  def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
14694 +      (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)),
14695 +      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
14696 +      4095, imm:$mask, buf1inst, 0)>;
14697 +// Stream2: third intrinsic operand is (i32 2), so buf2inst is passed to ExportInst
14698 +  def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
14699 +      (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)),
14700 +      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
14701 +      4095, imm:$mask, buf2inst, 0)>;
14702 +// Stream3: third intrinsic operand is (i32 3), so buf3inst is passed to ExportInst
14703 +  def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
14704 +      (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)),
14705 +      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
14706 +      4095, imm:$mask, buf3inst, 0)>;
14707 +}
14708 +
14709 +let isTerminator = 1, usesCustomInserter = 1 in {
14710 +// Export instruction whose operand names match the ExportWord0 / ExportSwzWord1 fields.
14711 +class ExportSwzInst : InstR600ISA<(
14712 +    outs),
14713 +    (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
14714 +    i32imm:$sw_x, i32imm:$sw_y, i32imm:$sw_z, i32imm:$sw_w, i32imm:$inst,
14715 +    i32imm:$eop),
14716 +    !strconcat("EXPORT", " $gpr"),
14717 +    []>, ExportWord0, ExportSwzWord1 {
14718 +  let elem_size = 3; // ExportBufInst uses 0 here
14719 +  let Inst{31-0} = Word0;
14720 +  let Inst{63-32} = Word1;
14721 +}
14722 +
14723 +} // End isTerminator = 1, usesCustomInserter = 1
14724 +
14725 +class ExportBufInst : InstR600ISA<(
14726 +    outs),
14727 +    (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
14728 +    i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop),
14729 +    !strconcat("EXPORT", " $gpr"),
14730 +    []>, ExportWord0, ExportBufWord1 { // NOTE(review): unlike ExportSwzInst, not inside the isTerminator/usesCustomInserter let - confirm intended
14731 +  let elem_size = 0; // ExportSwzInst uses 3 here
14732 +  let Inst{31-0} = Word0;
14733 +  let Inst{63-32} = Word1;
14734 +}
14735 +
14736 +let Predicates = [isR600toCayman] in {
14737 +
14738 +//===----------------------------------------------------------------------===//
14739 +// Common Instructions R600, R700, Evergreen, Cayman
14740 +//===----------------------------------------------------------------------===//
14741 +
14742 +def ADD : R600_2OP_Helper <0x0, "ADD", fadd>;
14743 +// Non-IEEE MUL: 0 * anything = 0
14744 +def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>;
14745 +def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>;
14746 +def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax>;
14747 +def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>;
14748 +
14749 +// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td,
14750 +// so some of the instruction names don't match the asm string.
14751 +// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics.
14752 +def SETE : R600_2OP <
14753 +  0x08, "SETE",
14754 +  [(set R600_Reg32:$dst,
14755 +   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
14756 +             COND_EQ))]
14757 +>;
14758 +
14759 +def SGT : R600_2OP <
14760 +  0x09, "SETGT",
14761 +  [(set R600_Reg32:$dst,
14762 +   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
14763 +              COND_GT))]
14764 +>;
14765 +
14766 +def SGE : R600_2OP <
14767 +  0xA, "SETGE",
14768 +  [(set R600_Reg32:$dst,
14769 +   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
14770 +              COND_GE))]
14771 +>;
14772 +
14773 +def SNE : R600_2OP <
14774 +  0xB, "SETNE",
14775 +  [(set R600_Reg32:$dst,
14776 +   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
14777 +    COND_NE))]
14778 +>;
14779 +
14780 +def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
14781 +def TRUNC : R600_1OP_Helper <0x11, "TRUNC", int_AMDGPU_trunc>;
14782 +def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>;
14783 +def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>;
14784 +def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>;
14785 +
14786 +def MOV : R600_1OP <0x19, "MOV", []>;
14787 +
14788 +let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
14789 +// Pseudo with one immediate input and an R600_Reg32 result; no asm string and no pattern.
14790 +class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst < // NOTE(review): vt is not referenced in the body
14791 +  (outs R600_Reg32:$dst),
14792 +  (ins immType:$imm),
14793 +  "",
14794 +  []
14795 +>;
14796 +
14797 +} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1
14798 +
14799 +def MOV_IMM_I32 : MOV_IMM<i32, i32imm>;
14800 +def : Pat <
14801 +  (imm:$val),
14802 +  (MOV_IMM_I32 imm:$val)
14803 +>;
14804 +
14805 +def MOV_IMM_F32 : MOV_IMM<f32, f32imm>;
14806 +def : Pat <
14807 +  (fpimm:$val),
14808 +  (MOV_IMM_F32  fpimm:$val)
14809 +>;
14810 +
14811 +def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>;
14812 +def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>;
14813 +def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>;
14814 +def PRED_SETNE : R600_2OP <0x23, "PRED_SETNE", []>;
14815 +
14816 +let hasSideEffects = 1 in {
14817 +
14818 +def KILLGT : R600_2OP <0x2D, "KILLGT", []>;
14819 +
14820 +} // end hasSideEffects
14821 +
14822 +def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>;
14823 +def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>;
14824 +def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>;
14825 +def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>;
14826 +def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>;
14827 +def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>;
14828 +def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", AMDGPUsmax>;
14829 +def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", AMDGPUsmin>;
14830 +def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", AMDGPUumax>;
14831 +def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", AMDGPUumin>;
14832 +
14833 +def SETE_INT : R600_2OP <
14834 +  0x3A, "SETE_INT",
14835 +  [(set (i32 R600_Reg32:$dst),
14836 +   (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETEQ))]
14837 +>;
14838 +
14839 +def SETGT_INT : R600_2OP <
14840 +  0x3B, "SGT_INT", // NOTE(review): sibling defs print their def name (SETE_INT, SETGE_INT, ...); confirm "SGT_INT" is intended
14841 +  [(set (i32 R600_Reg32:$dst),
14842 +   (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGT))]
14843 +>;
14844 +
14845 +def SETGE_INT : R600_2OP <
14846 +  0x3C, "SETGE_INT",
14847 +  [(set (i32 R600_Reg32:$dst),
14848 +   (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGE))]
14849 +>;
14850 +
14851 +def SETNE_INT : R600_2OP <
14852 +  0x3D, "SETNE_INT",
14853 +  [(set (i32 R600_Reg32:$dst),
14854 +   (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETNE))]
14855 +>;
14856 +
14857 +def SETGT_UINT : R600_2OP <
14858 +  0x3E, "SETGT_UINT",
14859 +  [(set (i32 R600_Reg32:$dst),
14860 +   (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGT))]
14861 +>;
14862 +
14863 +def SETGE_UINT : R600_2OP <
14864 +  0x3F, "SETGE_UINT",
14865 +  [(set (i32 R600_Reg32:$dst),
14866 +    (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGE))]
14867 +>;
14868 +
14869 +def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>;
14870 +def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGT_INT", []>; // mnemonic must differ from PRED_SETGE_INT (0x44)
14871 +def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>;
14872 +def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>;
14873 +
14874 +def CNDE_INT : R600_3OP <
14875 +  0x1C, "CNDE_INT",
14876 +  [(set (i32 R600_Reg32:$dst),
14877 +   (selectcc (i32 R600_Reg32:$src0), 0,
14878 +       (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2),
14879 +       COND_EQ))]
14880 +>;
14881 +
14882 +def CNDGE_INT : R600_3OP <
14883 +  0x1E, "CNDGE_INT",
14884 +  [(set (i32 R600_Reg32:$dst),
14885 +   (selectcc (i32 R600_Reg32:$src0), 0,
14886 +       (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2),
14887 +       COND_GE))]
14888 +>;
14889 +
14890 +def CNDGT_INT : R600_3OP <
14891 +  0x1D, "CNDGT_INT",
14892 +  [(set (i32 R600_Reg32:$dst),
14893 +   (selectcc (i32 R600_Reg32:$src0), 0,
14894 +       (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2),
14895 +       COND_GT))]
14896 +>;
14897 +
14898 +//===----------------------------------------------------------------------===//
14899 +// Texture instructions
14900 +//===----------------------------------------------------------------------===//
14901 +
14902 +def TEX_LD : R600_TEX <
14903 +  0x03, "TEX_LD",
14904 +  [(set R600_Reg128:$dst, (int_AMDGPU_txf R600_Reg128:$src0, imm:$src1, imm:$src2, imm:$src3, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
14905 +> {
14906 +let AsmString = "TEX_LD $dst, $src0, $src1, $src2, $src3, $resourceId, $samplerId, $textureTarget";
14907 +let InOperandList = (ins R600_Reg128:$src0, i32imm:$src1, i32imm:$src2, i32imm:$src3, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget);
14908 +}
14909 +
14910 +def TEX_GET_TEXTURE_RESINFO : R600_TEX <
14911 +  0x04, "TEX_GET_TEXTURE_RESINFO",
14912 +  [(set R600_Reg128:$dst, (int_AMDGPU_txq R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
14913 +>;
14914 +
14915 +def TEX_GET_GRADIENTS_H : R600_TEX <
14916 +  0x07, "TEX_GET_GRADIENTS_H",
14917 +  [(set R600_Reg128:$dst, (int_AMDGPU_ddx R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
14918 +>;
14919 +
14920 +def TEX_GET_GRADIENTS_V : R600_TEX <
14921 +  0x08, "TEX_GET_GRADIENTS_V",
14922 +  [(set R600_Reg128:$dst, (int_AMDGPU_ddy R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
14923 +>;
14924 +
14925 +def TEX_SET_GRADIENTS_H : R600_TEX <
14926 +  0x0B, "TEX_SET_GRADIENTS_H",
14927 +  []
14928 +>;
14929 +
14930 +def TEX_SET_GRADIENTS_V : R600_TEX <
14931 +  0x0C, "TEX_SET_GRADIENTS_V",
14932 +  []
14933 +>;
14934 +
14935 +def TEX_SAMPLE : R600_TEX <
14936 +  0x10, "TEX_SAMPLE",
14937 +  [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
14938 +>;
14939 +
14940 +def TEX_SAMPLE_C : R600_TEX <
14941 +  0x18, "TEX_SAMPLE_C",
14942 +  [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
14943 +>;
14944 +
14945 +def TEX_SAMPLE_L : R600_TEX <
14946 +  0x11, "TEX_SAMPLE_L",
14947 +  [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
14948 +>;
14949 +
14950 +def TEX_SAMPLE_C_L : R600_TEX <
14951 +  0x19, "TEX_SAMPLE_C_L",
14952 +  [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
14953 +>;
14954 +
14955 +def TEX_SAMPLE_LB : R600_TEX <
14956 +  0x12, "TEX_SAMPLE_LB",
14957 +  [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0,imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
14958 +>;
14959 +
14960 +def TEX_SAMPLE_C_LB : R600_TEX <
14961 +  0x1A, "TEX_SAMPLE_C_LB",
14962 +  [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
14963 +>;
14964 +
14965 +def TEX_SAMPLE_G : R600_TEX <
14966 +  0x14, "TEX_SAMPLE_G",
14967 +  []
14968 +>;
14969 +
14970 +def TEX_SAMPLE_C_G : R600_TEX <
14971 +  0x1C, "TEX_SAMPLE_C_G",
14972 +  []
14973 +>;
14974 +
14975 +//===----------------------------------------------------------------------===//
14976 +// Helper classes for common instructions
14977 +//===----------------------------------------------------------------------===//
14978 +
14979 +class MUL_LIT_Common <bits<5> inst> : R600_3OP <
14980 +  inst, "MUL_LIT",
14981 +  []
14982 +>;
14983 +
14984 +class MULADD_Common <bits<5> inst> : R600_3OP <
14985 +  inst, "MULADD",
14986 +  [(set (f32 R600_Reg32:$dst),
14987 +   (IL_mad R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2))]
14988 +>;
14989 +
14990 +class CNDE_Common <bits<5> inst> : R600_3OP <
14991 +  inst, "CNDE",
14992 +  [(set R600_Reg32:$dst,
14993 +   (selectcc (f32 R600_Reg32:$src0), FP_ZERO,
14994 +       (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2),
14995 +       COND_EQ))]
14996 +>;
14997 +
14998 +class CNDGT_Common <bits<5> inst> : R600_3OP <
14999 +  inst, "CNDGT",
15000 +  [(set R600_Reg32:$dst,
15001 +   (selectcc (f32 R600_Reg32:$src0), FP_ZERO,
15002 +       (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2),
15003 +       COND_GT))]
15004 +>;
15005 +
15006 +class CNDGE_Common <bits<5> inst> : R600_3OP <
15007 +  inst, "CNDGE",
15008 +  [(set R600_Reg32:$dst,
15009 +   (selectcc (f32 R600_Reg32:$src0), FP_ZERO,
15010 +       (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2),
15011 +       COND_GE))]
15012 +>;
15013 +
15014 +multiclass DOT4_Common <bits<11> inst> {
15015 +  // _pseudo: takes two R600_Reg128 sources and carries the int_AMDGPU_dp4 selection pattern.
15016 +  def _pseudo : R600_REDUCTION <inst,
15017 +    (ins R600_Reg128:$src0, R600_Reg128:$src1),
15018 +    "DOT4 $dst $src0, $src1",
15019 +    [(set R600_Reg32:$dst, (int_AMDGPU_dp4 R600_Reg128:$src0, R600_Reg128:$src1))]
15020 +  >;
15021 +  // _real: pattern-less R600_2OP with the same opcode; presumably what _pseudo is expanded into - TODO confirm.
15022 +  def _real : R600_2OP <inst, "DOT4", []>;
15023 +}
15024 +
15025 +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
15026 +multiclass CUBE_Common <bits<11> inst> {
15027 +  // _pseudo: R600_Reg128 in and out, carries the int_AMDGPU_cube selection pattern.
15028 +  def _pseudo : InstR600 <
15029 +    inst,
15030 +    (outs R600_Reg128:$dst),
15031 +    (ins R600_Reg128:$src),
15032 +    "CUBE $dst $src",
15033 +    [(set R600_Reg128:$dst, (int_AMDGPU_cube R600_Reg128:$src))],
15034 +    VecALU
15035 +  > {
15036 +    let isPseudo = 1;
15037 +  }
15038 +  // _real: pattern-less R600_2OP with the same opcode; presumably what _pseudo is expanded into - TODO confirm.
15039 +  def _real : R600_2OP <inst, "CUBE", []>;
15040 +}
15041 +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
15042 +
15043 +class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
15044 +  inst, "EXP_IEEE", fexp2
15045 +>;
15046 +
15047 +class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper <
15048 +  inst, "FLT_TO_INT", fp_to_sint
15049 +>;
15050 +
15051 +class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
15052 +  inst, "INT_TO_FLT", sint_to_fp
15053 +>;
15054 +
15055 +class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper <
15056 +  inst, "FLT_TO_UINT", fp_to_uint
15057 +>;
15058 +
15059 +class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
15060 +  inst, "UINT_TO_FLT", uint_to_fp
15061 +>;
15062 +
15063 +class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP <
15064 +  inst, "LOG_CLAMPED", []
15065 +>;
15066 +
15067 +class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
15068 +  inst, "LOG_IEEE", flog2
15069 +>;
15070 +
15071 +class LSHL_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHL", shl>;
15072 +class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>;
15073 +class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>;
15074 +class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper <
15075 +  inst, "MULHI_INT", mulhs
15076 +>;
15077 +class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper <
15078 +  inst, "MULHI", mulhu // NOTE(review): signed sibling prints "MULHI_INT"; confirm "MULHI" (no _UINT suffix) is intended
15079 +>;
15080 +class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper <
15081 +  inst, "MULLO_INT", mul
15082 +>;
15083 +class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []>;
15084 +
15085 +class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP <
15086 +  inst, "RECIP_CLAMPED", []
15087 +>;
15088 +
15089 +class RECIP_IEEE_Common <bits<11> inst> : R600_1OP <
15090 +  inst, "RECIP_IEEE", [(set R600_Reg32:$dst, (fdiv FP_ONE, R600_Reg32:$src0))]
15091 +>;
15092 +
15093 +class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
15094 +  inst, "RECIP_UINT", AMDGPUurecip
15095 +>;
15096 +
15097 +class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
15098 +  inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq
15099 +>;
15100 +
15101 +class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP <
15102 +  inst, "RECIPSQRT_IEEE", []
15103 +>;
15104 +
15105 +class SIN_Common <bits<11> inst> : R600_1OP <
15106 +  inst, "SIN", []>{
15107 +  let Trig = 1;
15108 +}
15109 +
15110 +class COS_Common <bits<11> inst> : R600_1OP <
15111 +  inst, "COS", []> {
15112 +  let Trig = 1;
15113 +}
15114 +
15115 +//===----------------------------------------------------------------------===//
15116 +// Helper patterns for complex intrinsics
15117 +//===----------------------------------------------------------------------===//
15118 +
15119 +multiclass DIV_Common <InstR600 recip_ieee> { // Selects both division forms to src0 * recip_ieee(src1).
15120 +def : Pat<
15121 +  (int_AMDGPU_div R600_Reg32:$src0, R600_Reg32:$src1),
15122 +  (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
15123 +>;
15124 +
15125 +def : Pat<
15126 +  (fdiv R600_Reg32:$src0, R600_Reg32:$src1),
15127 +  (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) // NOTE(review): MUL is the "MUL NON-IEEE" def (0 * anything = 0), not MUL_IEEE - confirm for fdiv
15128 +>;
15129 +}
15130 +
15131 +class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> : Pat <
15132 +  (int_TGSI_lit_z R600_Reg32:$src_x, R600_Reg32:$src_y, R600_Reg32:$src_w),
15133 +  (exp_ieee (mul_lit (log_clamped (MAX R600_Reg32:$src_y, (f32 ZERO))), R600_Reg32:$src_w, R600_Reg32:$src_x))
15134 +>;
15135 +
15136 +//===----------------------------------------------------------------------===//
15137 +// R600 / R700 Instructions
15138 +//===----------------------------------------------------------------------===//
15139 +
15140 +let Predicates = [isR600] in {
15141 +
15142 +  def MUL_LIT_r600 : MUL_LIT_Common<0x0C>;
15143 +  def MULADD_r600 : MULADD_Common<0x10>;
15144 +  def CNDE_r600 : CNDE_Common<0x18>;
15145 +  def CNDGT_r600 : CNDGT_Common<0x19>;
15146 +  def CNDGE_r600 : CNDGE_Common<0x1A>;
15147 +  defm DOT4_r600 : DOT4_Common<0x50>;
15148 +  defm CUBE_r600 : CUBE_Common<0x52>;
15149 +  def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>;
15150 +  def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>;
15151 +  def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>;
15152 +  def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>;
15153 +  def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>;
15154 +  def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>;
15155 +  def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>;
15156 +  def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>;
15157 +  def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>;
15158 +  def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>;
15159 +  def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>;
15160 +  def SIN_r600 : SIN_Common<0x6E>;
15161 +  def COS_r600 : COS_Common<0x6F>;
15162 +  def ASHR_r600 : ASHR_Common<0x70>;
15163 +  def LSHR_r600 : LSHR_Common<0x71>;
15164 +  def LSHL_r600 : LSHL_Common<0x72>;
15165 +  def MULLO_INT_r600 : MULLO_INT_Common<0x73>;
15166 +  def MULHI_INT_r600 : MULHI_INT_Common<0x74>;
15167 +  def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>;
15168 +  def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>;
15169 +  def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>;
15170 +
15171 +  defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
15172 +  def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
15173 +
15174 +  def : Pat<(fsqrt R600_Reg32:$src),
15175 +    (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_r600 R600_Reg32:$src))>;
15176 +
15177 +  def R600_ExportSwz : ExportSwzInst {
15178 +    let Word1{20-17} = 1; // BURST_COUNT
15179 +    let Word1{21} = eop;
15180 +    let Word1{22} = 1; // VALID_PIXEL_MODE
15181 +    let Word1{30-23} = inst;
15182 +    let Word1{31} = 1; // BARRIER
15183 +  }
15184 +  defm : ExportPattern<R600_ExportSwz, 39>;
15185 +
15186 +  def R600_ExportBuf : ExportBufInst {
15187 +    let Word1{20-17} = 1; // BURST_COUNT
15188 +    let Word1{21} = eop;
15189 +    let Word1{22} = 1; // VALID_PIXEL_MODE
15190 +    let Word1{30-23} = inst;
15191 +    let Word1{31} = 1; // BARRIER
15192 +  }
15193 +  defm : SteamOutputExportPattern<R600_ExportBuf, 0x20, 0x21, 0x22, 0x23>;
15194 +}
15195 +
15196 +// Helper pattern for normalizing inputs to trigonometric instructions for R700+
15197 +// cards.
15198 +class COS_PAT <InstR600 trig> : Pat<
15199 +  (fcos R600_Reg32:$src),
15200 +  (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
15201 +>;
15202 +
15203 +class SIN_PAT <InstR600 trig> : Pat<
15204 +  (fsin R600_Reg32:$src),
15205 +  (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
15206 +>;
15207 +
15208 +//===----------------------------------------------------------------------===//
15209 +// R700 Only instructions
15210 +//===----------------------------------------------------------------------===//
15211 +
15212 +let Predicates = [isR700] in {
15213 +  def SIN_r700 : SIN_Common<0x6E>;
15214 +  def COS_r700 : COS_Common<0x6F>;
15215 +
15216 +  // R700 normalizes inputs to SIN/COS the same as EG
15217 +  def : SIN_PAT <SIN_r700>;
15218 +  def : COS_PAT <COS_r700>;
15219 +}
15220 +
15221 +//===----------------------------------------------------------------------===//
15222 +// Evergreen Only instructions
15223 +//===----------------------------------------------------------------------===//
15224 +
15225 +let Predicates = [isEG] in {
15226 +
15227 +def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>;
15228 +defm DIV_eg : DIV_Common<RECIP_IEEE_eg>;
15229 +
15230 +def MULLO_INT_eg : MULLO_INT_Common<0x8F>;
15231 +def MULHI_INT_eg : MULHI_INT_Common<0x90>;
15232 +def MULLO_UINT_eg : MULLO_UINT_Common<0x91>;
15233 +def MULHI_UINT_eg : MULHI_UINT_Common<0x92>;
15234 +def RECIP_UINT_eg : RECIP_UINT_Common<0x94>;
15235 +def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>;
15236 +def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
15237 +def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
15238 +def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
15239 +def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
15240 +def SIN_eg : SIN_Common<0x8D>;
15241 +def COS_eg : COS_Common<0x8E>;
15242 +
15243 +def : SIN_PAT <SIN_eg>;
15244 +def : COS_PAT <COS_eg>;
15245 +def : Pat<(fsqrt R600_Reg32:$src),
15246 +  (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_eg R600_Reg32:$src))>;
15247 +} // End Predicates = [isEG]
15248 +
15249 +//===----------------------------------------------------------------------===//
15250 +// Evergreen / Cayman Instructions
15251 +//===----------------------------------------------------------------------===//
15252 +
15253 +let Predicates = [isEGorCayman] in {
15254 +
15255 +  // BFE_UINT - bit_extract, an optimization for mask and shift
15256 +  // Src0 = Input
15257 +  // Src1 = Offset
15258 +  // Src2 = Width
15259 +  //
15260 +  // bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width)
15261 +  //
15262 +  // Example Usage:
15263 +  // (Offset, Width)
15264 +  //
15265 +  // (0, 8)           = (Input << 24) >> 24  = (Input &  0xff)       >> 0
15266 +  // (8, 8)           = (Input << 16) >> 24  = (Input &  0xffff)     >> 8
15267 +  // (16,8)           = (Input <<  8) >> 24  = (Input &  0xffffff)   >> 16
15268 +  // (24,8)           = (Input <<  0) >> 24  = (Input &  0xffffffff) >> 24
15269 +  def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT",
15270 +    [(set R600_Reg32:$dst, (int_AMDIL_bit_extract_u32 R600_Reg32:$src0,
15271 +                                                      R600_Reg32:$src1,
15272 +                                                      R600_Reg32:$src2))],
15273 +    VecALU
15274 +  >;
15275 +
15276 +  def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT",
15277 +    [(set R600_Reg32:$dst, (AMDGPUbitalign R600_Reg32:$src0, R600_Reg32:$src1,
15278 +                                          R600_Reg32:$src2))],
15279 +    VecALU
15280 +  >;
15281 +
15282 +  def MULADD_eg : MULADD_Common<0x14>;
15283 +  def ASHR_eg : ASHR_Common<0x15>;
15284 +  def LSHR_eg : LSHR_Common<0x16>;
15285 +  def LSHL_eg : LSHL_Common<0x17>;
15286 +  def CNDE_eg : CNDE_Common<0x19>;
15287 +  def CNDGT_eg : CNDGT_Common<0x1A>;
15288 +  def CNDGE_eg : CNDGE_Common<0x1B>;
15289 +  def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
15290 +  def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
15291 +  defm DOT4_eg : DOT4_Common<0xBE>;
15292 +  defm CUBE_eg : CUBE_Common<0xC0>;
15293 +
15294 +  def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>;
15295 +
15296 +  def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
15297 +    let Pattern = [];
15298 +  }
15299 +
15300 +  def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>;
15301 +
15302 +  def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> {
15303 +    let Pattern = [];
15304 +  }
15305 +
15306 +  def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
15307 +
15308 +  // TRUNC is used for the FLT_TO_INT instructions to work around a
15309 +  // perceived problem where the rounding modes are applied differently
15310 +  // depending on the instruction and the slot they are in.
15311 +  // See:
15312 +  // https://bugs.freedesktop.org/show_bug.cgi?id=50232
15313 +  // Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c
15314 +  //
15315 +  // XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes,
15316 +  // which do not need to be truncated since the fp values are 0.0f or 1.0f.
15317 +  // We should look into handling these cases separately.
15318 +  def : Pat<(fp_to_sint R600_Reg32:$src0),
15319 +    (FLT_TO_INT_eg (TRUNC R600_Reg32:$src0))>;
15320 +
15321 +  def : Pat<(fp_to_uint R600_Reg32:$src0),
15322 +    (FLT_TO_UINT_eg (TRUNC R600_Reg32:$src0))>;
15323 +
15324 +  def EG_ExportSwz : ExportSwzInst {
15325 +    let Word1{19-16} = 1; // BURST_COUNT
15326 +    let Word1{20} = 1; // VALID_PIXEL_MODE
15327 +    let Word1{21} = eop;
15328 +    let Word1{29-22} = inst;
15329 +    let Word1{30} = 0; // MARK
15330 +    let Word1{31} = 1; // BARRIER
15331 +  }
15332 +  defm : ExportPattern<EG_ExportSwz, 83>;
15333 +
15334 +  def EG_ExportBuf : ExportBufInst {
15335 +    let Word1{19-16} = 1; // BURST_COUNT
15336 +    let Word1{20} = 1; // VALID_PIXEL_MODE
15337 +    let Word1{21} = eop;
15338 +    let Word1{29-22} = inst;
15339 +    let Word1{30} = 0; // MARK
15340 +    let Word1{31} = 1; // BARRIER
15341 +  }
15342 +  defm : SteamOutputExportPattern<EG_ExportBuf, 0x40, 0x41, 0x42, 0x43>;
15343 +
15344 +//===----------------------------------------------------------------------===//
15345 +// Memory read/write instructions
15346 +//===----------------------------------------------------------------------===//
15347 +let usesCustomInserter = 1 in {
15348 +
15349 +class RAT_WRITE_CACHELESS_eg <dag ins, bits<4> comp_mask, string name,
15350 +                              list<dag> pattern>
15351 +    : EG_CF_RAT <0x57, 0x2, 0, (outs), ins,
15352 +                 !strconcat(name, " $rw_gpr, $index_gpr, $eop"), pattern> {
15353 +  let RIM         = 0;
15354 +  // XXX: Have a separate instruction for non-indexed writes.
15355 +  let TYPE        = 1;
15356 +  let RW_REL      = 0;
15357 +  let ELEM_SIZE   = 0;
15358 +
15359 +  let ARRAY_SIZE  = 0;
15360 +  let COMP_MASK   = comp_mask;
15361 +  let BURST_COUNT = 0;
15362 +  let VPM         = 0;
15363 +  let MARK        = 0;
15364 +  let BARRIER     = 1;
15365 +}
15366 +
15367 +} // End usesCustomInserter = 1
15368 +
15369 +// 32-bit store
15370 +def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg <
15371 +  (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
15372 +  0x1, "RAT_WRITE_CACHELESS_32_eg",
15373 +  [(global_store (i32 R600_TReg32_X:$rw_gpr), R600_TReg32_X:$index_gpr)]
15374 +>;
15375 +
15376 +//128-bit store
15377 +def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
15378 +  (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
15379 +  0xf, "RAT_WRITE_CACHELESS_128",
15380 +  [(global_store (v4i32 R600_Reg128:$rw_gpr), R600_TReg32_X:$index_gpr)]
15381 +>;
15382 +
15383 +class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
15384 +    : InstR600ISA <outs, (ins MEMxi:$ptr), name#" $dst, $ptr", pattern>,
15385 +      VTX_WORD1_GPR, VTX_WORD0 {
15386 +
15387 +  // Static fields
15388 +  let VC_INST = 0;
15389 +  let FETCH_TYPE = 2;
15390 +  let FETCH_WHOLE_QUAD = 0;
15391 +  let BUFFER_ID = buffer_id;
15392 +  let SRC_REL = 0;
15393 +  // XXX: We can infer this field based on the SRC_GPR.  This would allow us
15394 +  // to store vertex addresses in any channel, not just X.
15395 +  let SRC_SEL_X = 0;
15396 +  let DST_REL = 0;
15397 +  // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL,
15398 +  // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored,
15399 +  // however, based on my testing if USE_CONST_FIELDS is set, then all
15400 +  // these fields need to be set to 0.
15401 +  let USE_CONST_FIELDS = 0;
15402 +  let NUM_FORMAT_ALL = 1;
15403 +  let FORMAT_COMP_ALL = 0;
15404 +  let SRF_MODE_ALL = 0;
15405 +
15406 +  let Inst{31-0} = Word0;
15407 +  let Inst{63-32} = Word1;
15408 +  // LLVM can only encode 64-bit instructions, so these fields are manually
15409 +  // encoded in R600CodeEmitter
15410 +  //
15411 +  // bits<16> OFFSET;
15412 +  // bits<2>  ENDIAN_SWAP = 0;
15413 +  // bits<1>  CONST_BUF_NO_STRIDE = 0;
15414 +  // bits<1>  MEGA_FETCH = 0;
15415 +  // bits<1>  ALT_CONST = 0;
15416 +  // bits<2>  BUFFER_INDEX_MODE = 0;
15417 +
15418 +
15419 +
15420 +  // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
15421 +  // is done in R600CodeEmitter)
15422 +  //
15423 +  // Inst{79-64} = OFFSET;
15424 +  // Inst{81-80} = ENDIAN_SWAP;
15425 +  // Inst{82}    = CONST_BUF_NO_STRIDE;
15426 +  // Inst{83}    = MEGA_FETCH;
15427 +  // Inst{84}    = ALT_CONST;
15428 +  // Inst{86-85} = BUFFER_INDEX_MODE;
15429 +  // Inst{95-87} = 0; Reserved
15430 +
15431 +  // VTX_WORD3 (Padding)
15432 +  //
15433 +  // Inst{127-96} = 0;
15434 +}
15435 +
15436 +class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
15437 +    : VTX_READ_eg <"VTX_READ_8", buffer_id, (outs R600_TReg32_X:$dst),
15438 +                   pattern> {
15439 +
15440 +  let MEGA_FETCH_COUNT = 1;
15441 +  let DST_SEL_X = 0;
15442 +  let DST_SEL_Y = 7;   // Masked
15443 +  let DST_SEL_Z = 7;   // Masked
15444 +  let DST_SEL_W = 7;   // Masked
15445 +  let DATA_FORMAT = 1; // FMT_8
15446 +}
15447 +
15448 +class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern>
15449 +    : VTX_READ_eg <"VTX_READ_16", buffer_id, (outs R600_TReg32_X:$dst),
15450 +                    pattern> {
15451 +  let MEGA_FETCH_COUNT = 2;
15452 +  let DST_SEL_X = 0;
15453 +  let DST_SEL_Y = 7;   // Masked
15454 +  let DST_SEL_Z = 7;   // Masked
15455 +  let DST_SEL_W = 7;   // Masked
15456 +  let DATA_FORMAT = 5; // FMT_16
15457 +
15458 +}
15459 +
15460 +class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
15461 +    : VTX_READ_eg <"VTX_READ_32", buffer_id, (outs R600_TReg32_X:$dst),
15462 +                   pattern> {
15463 +
15464 +  let MEGA_FETCH_COUNT = 4;
15465 +  let DST_SEL_X        = 0;
15466 +  let DST_SEL_Y        = 7;   // Masked
15467 +  let DST_SEL_Z        = 7;   // Masked
15468 +  let DST_SEL_W        = 7;   // Masked
15469 +  let DATA_FORMAT      = 0xD; // COLOR_32
15470 +
15471 +  // This is not really necessary, but there were some GPU hangs that appeared
15472 +  // to be caused by ALU instructions in the next instruction group that wrote
15473 +  // to the $ptr registers of the VTX_READ.
15474 +  // e.g.
15475 +  // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
15476 +  // %T2_X<def> = MOV %ZERO
15477 +  //Adding this constraint prevents this from happening.
15478 +  let Constraints = "$ptr.ptr = $dst";
15479 +}
15480 +
15481 +class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
15482 +    : VTX_READ_eg <"VTX_READ_128", buffer_id, (outs R600_Reg128:$dst),
15483 +                   pattern> {
15484 +
15485 +  let MEGA_FETCH_COUNT = 16;
15486 +  let DST_SEL_X        =  0;
15487 +  let DST_SEL_Y        =  1;
15488 +  let DST_SEL_Z        =  2;
15489 +  let DST_SEL_W        =  3;
15490 +  let DATA_FORMAT      =  0x22; // COLOR_32_32_32_32
15491 +
15492 +  // XXX: Need to force VTX_READ_128 instructions to write to the same register
15493 +  // that holds its buffer address to avoid potential hangs.  We can't use
15494 +  // the same constraint as VTX_READ_32_eg, because the $ptr.ptr and $dst
15495 +  // registers are different sizes.
15496 +}
15497 +
15498 +//===----------------------------------------------------------------------===//
15499 +// VTX Read from parameter memory space
15500 +//===----------------------------------------------------------------------===//
15501 +
15502 +def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0,
15503 +  [(set (i32 R600_TReg32_X:$dst), (load_param_zexti8 ADDRVTX_READ:$ptr))]
15504 +>;
15505 +
15506 +def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0,
15507 +  [(set (i32 R600_TReg32_X:$dst), (load_param_zexti16 ADDRVTX_READ:$ptr))]
15508 +>;
15509 +
15510 +def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
15511 +  [(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))]
15512 +>;
15513 +
15514 +//===----------------------------------------------------------------------===//
15515 +// VTX Read from global memory space
15516 +//===----------------------------------------------------------------------===//
15517 +
15518 +// 8-bit reads
15519 +def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1,
15520 +  [(set (i32 R600_TReg32_X:$dst), (zextloadi8_global ADDRVTX_READ:$ptr))]
15521 +>;
15522 +
15523 +// 32-bit reads
15524 +def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
15525 +  [(set (i32 R600_TReg32_X:$dst), (global_load ADDRVTX_READ:$ptr))]
15526 +>;
15527 +
15528 +// 128-bit reads
15529 +def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
15530 +  [(set (v4i32 R600_Reg128:$dst), (global_load ADDRVTX_READ:$ptr))]
15531 +>;
15532 +
15533 +//===----------------------------------------------------------------------===//
15534 +// Constant Loads
15535 +// XXX: We are currently storing all constants in the global address space.
15536 +//===----------------------------------------------------------------------===//
15537 +
15538 +def CONSTANT_LOAD_eg : VTX_READ_32_eg <1,
15539 +  [(set (i32 R600_TReg32_X:$dst), (constant_load ADDRVTX_READ:$ptr))]
15540 +>;
15541 +
15542 +}
15543 +
15544 +let Predicates = [isCayman] in {
15545 +
15546 +let isVector = 1 in {
15547 +
15548 +def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
15549 +
15550 +def MULLO_INT_cm : MULLO_INT_Common<0x8F>;
15551 +def MULHI_INT_cm : MULHI_INT_Common<0x90>;
15552 +def MULLO_UINT_cm : MULLO_UINT_Common<0x91>;
15553 +def MULHI_UINT_cm : MULHI_UINT_Common<0x92>;
15554 +def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>;
15555 +def EXP_IEEE_cm : EXP_IEEE_Common<0x81>;
15556 +def LOG_IEEE_cm : LOG_IEEE_Common<0x83>;
15557 +def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>;
15558 +def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>;
15559 +def SIN_cm : SIN_Common<0x8D>;
15560 +def COS_cm : COS_Common<0x8E>;
15561 +} // End isVector = 1
15562 +
15563 +def : SIN_PAT <SIN_cm>;
15564 +def : COS_PAT <COS_cm>;
15565 +
15566 +defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
15567 +
15568 +// RECIP_UINT emulation for Cayman
15569 +def : Pat <
15570 +  (AMDGPUurecip R600_Reg32:$src0),
15571 +  (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg R600_Reg32:$src0)),
15572 +                            (MOV_IMM_I32 0x4f800000)))
15573 +>;
15574 +
15575 +
15576 +def : Pat<(fsqrt R600_Reg32:$src),
15577 +  (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm R600_Reg32:$src))>;
15578 +
15579 +} // End isCayman
15580 +
15581 +//===----------------------------------------------------------------------===//
15582 +// Branch Instructions
15583 +//===----------------------------------------------------------------------===//
15584 +
15585 +
15586 +def IF_PREDICATE_SET  : ILFormat<(outs), (ins GPRI32:$src),
15587 +  "IF_PREDICATE_SET $src", []>;
15588 +
15589 +def PREDICATED_BREAK : ILFormat<(outs), (ins GPRI32:$src),
15590 +  "PREDICATED_BREAK $src", []>;
15591 +
15592 +//===----------------------------------------------------------------------===//
15593 +// Pseudo instructions
15594 +//===----------------------------------------------------------------------===//
15595 +
15596 +let isPseudo = 1 in {
15597 +
15598 +def PRED_X : InstR600 <
15599 +  0, (outs R600_Predicate_Bit:$dst),
15600 +  (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags),
15601 +  "", [], NullALU> {
15602 +  let FlagOperandIdx = 3;
15603 +}
15604 +
15605 +let isTerminator = 1, isBranch = 1, isBarrier = 1 in {
15606 +
15607 +def JUMP : InstR600 <0x10,
15608 +          (outs),
15609 +          (ins brtarget:$target, R600_Pred:$p),
15610 +          "JUMP $target ($p)",
15611 +          [], AnyALU
15612 +  >;
15613 +
15614 +}  // End isTerminator = 1, isBranch = 1, isBarrier = 1
15615 +
15616 +let usesCustomInserter = 1 in {
15617 +
15618 +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in {
15619 +
15620 +def MASK_WRITE : AMDGPUShaderInst <
15621 +    (outs),
15622 +    (ins R600_Reg32:$src),
15623 +    "MASK_WRITE $src",
15624 +    []
15625 +>;
15626 +
15627 +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1
15628 +
15629 +
15630 +def RESERVE_REG : AMDGPUShaderInst <
15631 +  (outs),
15632 +  (ins i32imm:$src),
15633 +  "RESERVE_REG $src",
15634 +  [(int_AMDGPU_reserve_reg imm:$src)]
15635 +>;
15636 +def TXD: AMDGPUShaderInst <
15637 +  (outs R600_Reg128:$dst),
15638 +  (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
15639 +  "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
15640 +  [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
15641 +>;
15642 +
15643 +def TXD_SHADOW: AMDGPUShaderInst <
15644 +  (outs R600_Reg128:$dst),
15645 +  (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
15646 +  "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
15647 +  [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
15648 +>;
15649 +
15650 +} // End usesCustomInserter = 1
15651 +} // End isPseudo = 1
15652 +
15653 +def CLAMP_R600 :  CLAMP <R600_Reg32>;
15654 +def FABS_R600 : FABS<R600_Reg32>;
15655 +def FNEG_R600 : FNEG<R600_Reg32>;
15656 +
15657 +//===---------------------------------------------------------------------===//
15658 +// Return instruction
15659 +//===---------------------------------------------------------------------===//
15660 +let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in {
15661 +  def RETURN          : ILFormat<(outs), (ins variable_ops),
15662 +      "RETURN", [(IL_retflag)]>;
15663 +}
15664 +
15665 +
15666 +//===----------------------------------------------------------------------===//
15667 +// Constant Buffer Addressing Support
15668 +//===----------------------------------------------------------------------===//
15669 +
15670 +let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"  in {
15671 +def CONST_COPY : Instruction {
15672 +  let OutOperandList = (outs R600_Reg32:$dst);
15673 +  let InOperandList = (ins i32imm:$src);
15674 +  let Pattern = [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))];
15675 +  let AsmString = "CONST_COPY";
15676 +  let neverHasSideEffects = 1;
15677 +  let isAsCheapAsAMove = 1;
15678 +  let Itinerary = NullALU;
15679 +}
15680 +} // end isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"
15681 +
15682 +def TEX_VTX_CONSTBUF :
15683 +  InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr), "VTX_READ_eg $dst, $ptr",
15684 +      [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr))]>,
15685 +  VTX_WORD1_GPR, VTX_WORD0 {
15686 +
15687 +  let VC_INST = 0;
15688 +  let FETCH_TYPE = 2;
15689 +  let FETCH_WHOLE_QUAD = 0;
15690 +  let BUFFER_ID = 0;
15691 +  let SRC_REL = 0;
15692 +  let SRC_SEL_X = 0;
15693 +  let DST_REL = 0;
15694 +  let USE_CONST_FIELDS = 0;
15695 +  let NUM_FORMAT_ALL = 2;
15696 +  let FORMAT_COMP_ALL = 1;
15697 +  let SRF_MODE_ALL = 1;
15698 +  let MEGA_FETCH_COUNT = 16;
15699 +  let DST_SEL_X        = 0;
15700 +  let DST_SEL_Y        = 1;
15701 +  let DST_SEL_Z        = 2;
15702 +  let DST_SEL_W        = 3;
15703 +  let DATA_FORMAT      = 35;
15704 +
15705 +  let Inst{31-0} = Word0;
15706 +  let Inst{63-32} = Word1;
15707 +
15708 +// LLVM can only encode 64-bit instructions, so these fields are manually
15709 +// encoded in R600CodeEmitter
15710 +//
15711 +// bits<16> OFFSET;
15712 +// bits<2>  ENDIAN_SWAP = 0;
15713 +// bits<1>  CONST_BUF_NO_STRIDE = 0;
15714 +// bits<1>  MEGA_FETCH = 0;
15715 +// bits<1>  ALT_CONST = 0;
15716 +// bits<2>  BUFFER_INDEX_MODE = 0;
15717 +
15718 +
15719 +
15720 +// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
15721 +// is done in R600CodeEmitter)
15722 +//
15723 +// Inst{79-64} = OFFSET;
15724 +// Inst{81-80} = ENDIAN_SWAP;
15725 +// Inst{82}    = CONST_BUF_NO_STRIDE;
15726 +// Inst{83}    = MEGA_FETCH;
15727 +// Inst{84}    = ALT_CONST;
15728 +// Inst{86-85} = BUFFER_INDEX_MODE;
15729 +// Inst{95-87} = 0; Reserved
15730 +
15731 +// VTX_WORD3 (Padding)
15732 +//
15733 +// Inst{127-96} = 0;
15734 +}
15735 +
15736 +
15737 +//===--------------------------------------------------------------------===//
15738 +// Instructions support
15739 +//===--------------------------------------------------------------------===//
15740 +//===---------------------------------------------------------------------===//
15741 +// Custom Inserter for Branches and returns, this eventually will be a
15742 +// separate pass
15743 +//===---------------------------------------------------------------------===//
15744 +let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
15745 +  def BRANCH : ILFormat<(outs), (ins brtarget:$target),
15746 +      "; Pseudo unconditional branch instruction",
15747 +      [(br bb:$target)]>;
15748 +  defm BRANCH_COND : BranchConditional<IL_brcond>;
15749 +}
15750 +
15751 +//===---------------------------------------------------------------------===//
15752 +// Flow and Program control Instructions
15753 +//===---------------------------------------------------------------------===//
15754 +let isTerminator=1 in {
15755 +  def SWITCH      : ILFormat< (outs), (ins GPRI32:$src),
15756 +  !strconcat("SWITCH", " $src"), []>;
15757 +  def CASE        : ILFormat< (outs), (ins GPRI32:$src),
15758 +      !strconcat("CASE", " $src"), []>;
15759 +  def BREAK       : ILFormat< (outs), (ins),
15760 +      "BREAK", []>;
15761 +  def CONTINUE    : ILFormat< (outs), (ins),
15762 +      "CONTINUE", []>;
15763 +  def DEFAULT     : ILFormat< (outs), (ins),
15764 +      "DEFAULT", []>;
15765 +  def ELSE        : ILFormat< (outs), (ins),
15766 +      "ELSE", []>;
15767 +  def ENDSWITCH   : ILFormat< (outs), (ins),
15768 +      "ENDSWITCH", []>;
15769 +  def ENDMAIN     : ILFormat< (outs), (ins),
15770 +      "ENDMAIN", []>;
15771 +  def END         : ILFormat< (outs), (ins),
15772 +      "END", []>;
15773 +  def ENDFUNC     : ILFormat< (outs), (ins),
15774 +      "ENDFUNC", []>;
15775 +  def ENDIF       : ILFormat< (outs), (ins),
15776 +      "ENDIF", []>;
15777 +  def WHILELOOP   : ILFormat< (outs), (ins),
15778 +      "WHILE", []>;
15779 +  def ENDLOOP     : ILFormat< (outs), (ins),
15780 +      "ENDLOOP", []>;
15781 +  def FUNC        : ILFormat< (outs), (ins),
15782 +      "FUNC", []>;
15783 +  def RETDYN      : ILFormat< (outs), (ins),
15784 +      "RET_DYN", []>;
15785 +  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
15786 +  defm IF_LOGICALNZ  : BranchInstr<"IF_LOGICALNZ">;
15787 +  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
15788 +  defm IF_LOGICALZ   : BranchInstr<"IF_LOGICALZ">;
15789 +  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
15790 +  defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">;
15791 +  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
15792 +  defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">;
15793 +  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
15794 +  defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">;
15795 +  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
15796 +  defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">;
15797 +  defm IFC         : BranchInstr2<"IFC">;
15798 +  defm BREAKC      : BranchInstr2<"BREAKC">;
15799 +  defm CONTINUEC   : BranchInstr2<"CONTINUEC">;
15800 +}
15801 +
15802 +//===----------------------------------------------------------------------===//
15803 +// ISel Patterns
15804 +//===----------------------------------------------------------------------===//
15805 +
15806 +//CNDGE_INT extra pattern
15807 +def : Pat <
15808 +  (selectcc (i32 R600_Reg32:$src0), -1, (i32 R600_Reg32:$src1),
15809 +                                        (i32 R600_Reg32:$src2), COND_GT),
15810 +  (CNDGE_INT R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2)
15811 +>;
15812 +
15813 +// KIL Patterns
15814 +def KILP : Pat <
15815 +  (int_AMDGPU_kilp),
15816 +  (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
15817 +>;
15818 +
15819 +def KIL : Pat <
15820 +  (int_AMDGPU_kill R600_Reg32:$src0),
15821 +  (MASK_WRITE (KILLGT (f32 ZERO), (f32 R600_Reg32:$src0)))
15822 +>;
15823 +
15824 +// SGT Reverse args
15825 +def : Pat <
15826 +  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LT),
15827 +  (SGT R600_Reg32:$src1, R600_Reg32:$src0)
15828 +>;
15829 +
15830 +// SGE Reverse args
15831 +def : Pat <
15832 +  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LE),
15833 +  (SGE R600_Reg32:$src1, R600_Reg32:$src0)
15834 +>;
15835 +
15836 +// SETGT_INT reverse args
15837 +def : Pat <
15838 +  (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLT),
15839 +  (SETGT_INT R600_Reg32:$src1, R600_Reg32:$src0)
15840 +>;
15841 +
15842 +// SETGE_INT reverse args
15843 +def : Pat <
15844 +  (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLE),
15845 +  (SETGE_INT R600_Reg32:$src1, R600_Reg32:$src0)
15846 +>;
15847 +
15848 +// SETGT_UINT reverse args
15849 +def : Pat <
15850 +  (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULT),
15851 +  (SETGT_UINT R600_Reg32:$src1, R600_Reg32:$src0)
15852 +>;
15853 +
15854 +// SETGE_UINT reverse args
15855 +def : Pat <
15856 +  (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULE),
15857 +  (SETGE_UINT R600_Reg32:$src1, R600_Reg32:$src0)
15858 +>;
15859 +
15860 +// The next two patterns are special cases for handling 'true if ordered' and
15861 +// 'true if unordered' conditionals.  The assumption here is that the behavior of
15862 +// SETE and SNE conforms to the Direct3D 10 rules for floating point values
15863 +// described here:
15864 +// http://msdn.microsoft.com/en-us/library/windows/desktop/cc308050.aspx#alpha_32_bit
15865 +// We assume that SETE returns false when one of the operands is NAN and
15866 +// SNE returns true when one of the operands is NAN
15867 +
15868 +//SETE - 'true if ordered'
15869 +def : Pat <
15870 +  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETO),
15871 +  (SETE R600_Reg32:$src0, R600_Reg32:$src1)
15872 +>;
15873 +
15874 +//SNE - 'true if unordered'
15875 +def : Pat <
15876 +  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETUO),
15877 +  (SNE R600_Reg32:$src0, R600_Reg32:$src1)
15878 +>;
15879 +
15880 +def : Extract_Element <f32, v4f32, R600_Reg128, 0, sel_x>;
15881 +def : Extract_Element <f32, v4f32, R600_Reg128, 1, sel_y>;
15882 +def : Extract_Element <f32, v4f32, R600_Reg128, 2, sel_z>;
15883 +def : Extract_Element <f32, v4f32, R600_Reg128, 3, sel_w>;
15884 +
15885 +def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 0, sel_x>;
15886 +def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 1, sel_y>;
15887 +def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 2, sel_z>;
15888 +def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 3, sel_w>;
15889 +
15890 +def : Extract_Element <i32, v4i32, R600_Reg128, 0, sel_x>;
15891 +def : Extract_Element <i32, v4i32, R600_Reg128, 1, sel_y>;
15892 +def : Extract_Element <i32, v4i32, R600_Reg128, 2, sel_z>;
15893 +def : Extract_Element <i32, v4i32, R600_Reg128, 3, sel_w>;
15894 +
15895 +def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 0, sel_x>;
15896 +def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 1, sel_y>;
15897 +def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 2, sel_z>;
15898 +def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sel_w>;
15899 +
15900 +def : Vector_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
15901 +def : Vector_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
15902 +
15903 +// bitconvert patterns
15904 +
15905 +def : BitConvert <i32, f32, R600_Reg32>;
15906 +def : BitConvert <f32, i32, R600_Reg32>;
15907 +def : BitConvert <v4f32, v4i32, R600_Reg128>;
15908 +def : BitConvert <v4i32, v4f32, R600_Reg128>;
15909 +
15910 +// DWORDADDR pattern
15911 +def : DwordAddrPat  <i32, R600_Reg32>;
15912 +
15913 +} // End isR600toCayman Predicate
15914 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600Intrinsics.td llvm-r600/lib/Target/R600/R600Intrinsics.td
15915 --- llvm-3.2.src/lib/Target/R600/R600Intrinsics.td      1970-01-01 01:00:00.000000000 +0100
15916 +++ llvm-r600/lib/Target/R600/R600Intrinsics.td 2013-01-25 19:43:57.466716387 +0100
15917 @@ -0,0 +1,34 @@
15918 +//===-- R600Intrinsics.td - R600 Intrinsic defs -------*- tablegen -*-----===//
15919 +//
15920 +//                     The LLVM Compiler Infrastructure
15921 +//
15922 +// This file is distributed under the University of Illinois Open Source
15923 +// License. See LICENSE.TXT for details.
15924 +//
15925 +//===----------------------------------------------------------------------===//
15926 +//
15927 +// R600 Intrinsic Definitions
15928 +//
15929 +//===----------------------------------------------------------------------===//
15930 +// Each Intrinsic<> lists <result types>, <operand types>, <properties>.
15931 +let TargetPrefix = "R600", isTarget = 1 in {  // applies to every def below
15932 +  def int_R600_load_input : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
15933 +  def int_R600_load_input_perspective :  // float result, one i32 operand
15934 +    Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>;
15935 +  def int_R600_load_input_constant :  // float result, one i32 operand
15936 +    Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>;
15937 +  def int_R600_load_input_linear :  // float result, one i32 operand
15938 +    Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>;
15939 +  def int_R600_store_swizzle :  // no result; v4f32 plus two i32 operands
15940 +    Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
15941 +  def int_R600_store_stream_output :  // no result; v4f32 plus three i32
15942 +    Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
15943 +  def int_R600_store_pixel_color :  // no result; float plus one i32 operand
15944 +      Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
15945 +  def int_R600_store_pixel_depth :  // no result; single float operand
15946 +      Intrinsic<[], [llvm_float_ty], []>;
15947 +  def int_R600_store_pixel_stencil :  // no result; single float operand
15948 +      Intrinsic<[], [llvm_float_ty], []>;
15949 +  def int_R600_store_pixel_dummy :  // no result and no operands
15950 +      Intrinsic<[], [], []>;
15951 +}
15952 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600ISelLowering.cpp llvm-r600/lib/Target/R600/R600ISelLowering.cpp
15953 --- llvm-3.2.src/lib/Target/R600/R600ISelLowering.cpp   1970-01-01 01:00:00.000000000 +0100
15954 +++ llvm-r600/lib/Target/R600/R600ISelLowering.cpp      2013-01-25 19:43:57.463383054 +0100
15955 @@ -0,0 +1,997 @@
15956 +//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
15957 +//
15958 +//                     The LLVM Compiler Infrastructure
15959 +//
15960 +// This file is distributed under the University of Illinois Open Source
15961 +// License. See LICENSE.TXT for details.
15962 +//
15963 +//===----------------------------------------------------------------------===//
15964 +//
15965 +/// \file
15966 +/// \brief Custom DAG lowering for R600
15967 +//
15968 +//===----------------------------------------------------------------------===//
15969 +
15970 +#include "R600ISelLowering.h"
15971 +#include "R600Defines.h"
15972 +#include "R600InstrInfo.h"
15973 +#include "R600MachineFunctionInfo.h"
15974 +#include "llvm/Argument.h"
15975 +#include "llvm/Function.h"
15976 +#include "llvm/CodeGen/MachineInstrBuilder.h"
15977 +#include "llvm/CodeGen/MachineRegisterInfo.h"
15978 +#include "llvm/CodeGen/SelectionDAG.h"
15979 +
15980 +using namespace llvm;
15981 +/// \brief Registers the R600 register classes and per-type operation actions.
15982 +R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
15983 +    AMDGPUTargetLowering(TM),
15984 +    TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
15985 +  setOperationAction(ISD::MUL, MVT::i64, Expand);
15986 +  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
15987 +  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
15988 +  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
15989 +  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
15990 +  computeRegisterProperties();
15991 +  // The v4f32 and v4i32 operations listed next are all marked Expand.
15992 +  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
15993 +  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
15994 +  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
15995 +  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
15996 +
15997 +  setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
15998 +  setOperationAction(ISD::AND,  MVT::v4i32, Expand);
15999 +  setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
16000 +  setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
16001 +  setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
16002 +  setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
16003 +  setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
16004 +  setOperationAction(ISD::UREM, MVT::v4i32, Expand);
16005 +  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
16006 +  // Opcodes marked Custom below have matching cases in LowerOperation().
16007 +  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
16008 +  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
16009 +
16010 +  setOperationAction(ISD::FSUB, MVT::f32, Expand);
16011 +
16012 +  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
16013 +  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
16014 +  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
16015 +  setOperationAction(ISD::FPOW, MVT::f32, Custom);
16016 +
16017 +  setOperationAction(ISD::ROTL, MVT::i32, Custom);
16018 +
16019 +  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
16020 +  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
16021 +
16022 +  setOperationAction(ISD::SETCC, MVT::i32, Custom);
16023 +  setOperationAction(ISD::SETCC, MVT::f32, Custom);
16024 +  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);  // LowerFPTOUINT()
16025 +
16026 +  setOperationAction(ISD::SELECT, MVT::i32, Custom);
16027 +  setOperationAction(ISD::SELECT, MVT::f32, Custom);
16028 +
16029 +  setOperationAction(ISD::STORE, MVT::i32, Custom);
16030 +  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
16031 +
16032 +  setOperationAction(ISD::LOAD, MVT::i32, Custom);
16033 +  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
16034 +  setTargetDAGCombine(ISD::FP_ROUND);
16035 +  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
16036 +
16037 +  setSchedulingPreference(Sched::VLIW);
16038 +}
16039 +/// \brief Expands the pseudo instructions handled by the switch below.
16040 +MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
16041 +    MachineInstr * MI, MachineBasicBlock * BB) const {
16042 +  MachineFunction * MF = BB->getParent();
16043 +  MachineRegisterInfo &MRI = MF->getRegInfo();
16044 +  MachineBasicBlock::iterator I = *MI;
16045 +  // A case that breaks has MI erased after the switch; 'return BB' keeps MI.
16046 +  switch (MI->getOpcode()) {
16047 +  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
16048 +  case AMDGPU::SHADER_TYPE: break; // Emits nothing; MI is only erased.
16049 +  case AMDGPU::CLAMP_R600: { // Emitted as a MOV tagged with MO_FLAG_CLAMP.
16050 +    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
16051 +                                                   AMDGPU::MOV,
16052 +                                                   MI->getOperand(0).getReg(),
16053 +                                                   MI->getOperand(1).getReg());
16054 +    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
16055 +    break;
16056 +  }
16057 +  // FABS is likewise emitted as a MOV, tagged with MO_FLAG_ABS.
16058 +  case AMDGPU::FABS_R600: {
16059 +    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
16060 +                                                    AMDGPU::MOV,
16061 +                                                    MI->getOperand(0).getReg(),
16062 +                                                    MI->getOperand(1).getReg());
16063 +    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
16064 +    break;
16065 +  }
16066 +  // FNEG is emitted as a MOV tagged with MO_FLAG_NEG.
16067 +  case AMDGPU::FNEG_R600: {
16068 +    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
16069 +                                                    AMDGPU::MOV,
16070 +                                                    MI->getOperand(0).getReg(),
16071 +                                                    MI->getOperand(1).getReg());
16072 +    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
16073 +    break;
16074 +  }
16075 +  // MASK_WRITE adds MO_FLAG_MASK to the instruction defining its operand.
16076 +  case AMDGPU::MASK_WRITE: {
16077 +    unsigned maskedRegister = MI->getOperand(0).getReg();
16078 +    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
16079 +    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
16080 +    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
16081 +    break;
16082 +  }
16083 +  // Immediate moves: the f32 form passes the constant's raw bit pattern.
16084 +  case AMDGPU::MOV_IMM_F32:
16085 +    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
16086 +                     MI->getOperand(1).getFPImm()->getValueAPF()
16087 +                         .bitcastToAPInt().getZExtValue());
16088 +    break;
16089 +  case AMDGPU::MOV_IMM_I32:
16090 +    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
16091 +                     MI->getOperand(1).getImm());
16092 +    break;
16093 +
16094 +  // RAT writes are re-emitted with an extra end-of-program immediate.
16095 +  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
16096 +  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
16097 +    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
16098 +    // NOTE(review): assumes MI is never last in BB (next(I) is dereferenced).
16099 +    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
16100 +            .addOperand(MI->getOperand(0))
16101 +            .addOperand(MI->getOperand(1))
16102 +            .addImm(EOP); // Set End of program bit
16103 +    break;
16104 +  }
16105 +  // Reserve the TReg32 at the given index plus the Reg128 at index / 4.
16106 +  case AMDGPU::RESERVE_REG: {
16107 +    R600MachineFunctionInfo * MFI = MF->getInfo<R600MachineFunctionInfo>();
16108 +    int64_t ReservedIndex = MI->getOperand(0).getImm();
16109 +    unsigned ReservedReg =
16110 +                         AMDGPU::R600_TReg32RegClass.getRegister(ReservedIndex);
16111 +    MFI->ReservedRegs.push_back(ReservedReg);
16112 +    unsigned SuperReg =
16113 +          AMDGPU::R600_Reg128RegClass.getRegister(ReservedIndex / 4);
16114 +    MFI->ReservedRegs.push_back(SuperReg);
16115 +    break;
16116 +  }
16117 +  // TXD: set H and V gradients (into T0, T1), then emit TEX_SAMPLE_G.
16118 +  case AMDGPU::TXD: {
16119 +    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
16120 +    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
16121 +
16122 +    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
16123 +            .addOperand(MI->getOperand(3))
16124 +            .addOperand(MI->getOperand(4))
16125 +            .addOperand(MI->getOperand(5))
16126 +            .addOperand(MI->getOperand(6));
16127 +    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
16128 +            .addOperand(MI->getOperand(2))
16129 +            .addOperand(MI->getOperand(4))
16130 +            .addOperand(MI->getOperand(5))
16131 +            .addOperand(MI->getOperand(6));
16132 +    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
16133 +            .addOperand(MI->getOperand(0))
16134 +            .addOperand(MI->getOperand(1))
16135 +            .addOperand(MI->getOperand(4))
16136 +            .addOperand(MI->getOperand(5))
16137 +            .addOperand(MI->getOperand(6))
16138 +            .addReg(T0, RegState::Implicit)
16139 +            .addReg(T1, RegState::Implicit);
16140 +    break;
16141 +  }
16142 +  // TXD_SHADOW: same sequence as TXD, but ending with TEX_SAMPLE_C_G.
16143 +  case AMDGPU::TXD_SHADOW: {
16144 +    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
16145 +    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
16146 +
16147 +    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
16148 +            .addOperand(MI->getOperand(3))
16149 +            .addOperand(MI->getOperand(4))
16150 +            .addOperand(MI->getOperand(5))
16151 +            .addOperand(MI->getOperand(6));
16152 +    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
16153 +            .addOperand(MI->getOperand(2))
16154 +            .addOperand(MI->getOperand(4))
16155 +            .addOperand(MI->getOperand(5))
16156 +            .addOperand(MI->getOperand(6));
16157 +    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
16158 +            .addOperand(MI->getOperand(0))
16159 +            .addOperand(MI->getOperand(1))
16160 +            .addOperand(MI->getOperand(4))
16161 +            .addOperand(MI->getOperand(5))
16162 +            .addOperand(MI->getOperand(6))
16163 +            .addReg(T0, RegState::Implicit)
16164 +            .addReg(T1, RegState::Implicit);
16165 +    break;
16166 +  }
16167 +  // BRANCH becomes a JUMP with register 0 in place of PREDICATE_BIT.
16168 +  case AMDGPU::BRANCH:
16169 +      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
16170 +              .addOperand(MI->getOperand(0))
16171 +              .addReg(0);
16172 +      break;
16173 +  // Conditional branches: PRED_X defines PREDICATE_BIT, the JUMP kills it.
16174 +  case AMDGPU::BRANCH_COND_f32: {
16175 +    MachineInstr *NewMI =
16176 +      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
16177 +              AMDGPU::PREDICATE_BIT)
16178 +              .addOperand(MI->getOperand(1))
16179 +              .addImm(OPCODE_IS_NOT_ZERO)
16180 +              .addImm(0); // Flags
16181 +    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
16182 +    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
16183 +            .addOperand(MI->getOperand(0))
16184 +            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
16185 +    break;
16186 +  }
16187 +  // Same as the f32 form, but the test is OPCODE_IS_NOT_ZERO_INT.
16188 +  case AMDGPU::BRANCH_COND_i32: {
16189 +    MachineInstr *NewMI =
16190 +      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
16191 +            AMDGPU::PREDICATE_BIT)
16192 +            .addOperand(MI->getOperand(1))
16193 +            .addImm(OPCODE_IS_NOT_ZERO_INT)
16194 +            .addImm(0); // Flags
16195 +    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
16196 +    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
16197 +           .addOperand(MI->getOperand(0))
16198 +            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
16199 +    break;
16200 +  }
16201 +  // input_perspective only updates the function info; MI itself is kept.
16202 +  case AMDGPU::input_perspective: {
16203 +    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
16204 +
16205 +    // XXX: Be more fine-grained about register reservation
16206 +    for (unsigned i = 0; i < 4; i ++) {
16207 +      unsigned ReservedReg = AMDGPU::R600_TReg32RegClass.getRegister(i);
16208 +      MFI->ReservedRegs.push_back(ReservedReg);
16209 +    }
16210 +    // Operand 1 selects which interpolation flag is set on the function info.
16211 +    switch (MI->getOperand(1).getImm()) {
16212 +    case 0:// Perspective
16213 +      MFI->HasPerspectiveInterpolation = true;
16214 +      break;
16215 +    case 1:// Linear
16216 +      MFI->HasLinearInterpolation = true;
16217 +      break;
16218 +    default:
16219 +      assert(0 && "Unknow ij index");
16220 +    }
16221 +
16222 +    return BB;
16223 +  }
16224 +
16225 +  case AMDGPU::EG_ExportSwz:
16226 +  case AMDGPU::R600_ExportSwz: {
16227 +    // Instruction is left unmodified if it's not the last one of its type
16228 +    bool isLastInstructionOfItsType = true;
16229 +    unsigned InstExportType = MI->getOperand(1).getImm();
16230 +    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
16231 +         EndBlock = BB->end(); NextExportInst != EndBlock;
16232 +         NextExportInst = llvm::next(NextExportInst)) {
16233 +      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
16234 +          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
16235 +        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
16236 +            .getImm();
16237 +        if (CurrentInstExportType == InstExportType) {
16238 +          isLastInstructionOfItsType = false;
16239 +          break;
16240 +        }
16241 +      }
16242 +    }
16243 +    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
16244 +    if (!EOP && !isLastInstructionOfItsType)
16245 +      return BB;
16246 +    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
16247 +    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
16248 +            .addOperand(MI->getOperand(0))
16249 +            .addOperand(MI->getOperand(1))
16250 +            .addOperand(MI->getOperand(2))
16251 +            .addOperand(MI->getOperand(3))
16252 +            .addOperand(MI->getOperand(4))
16253 +            .addOperand(MI->getOperand(5))
16254 +            .addOperand(MI->getOperand(6))
16255 +            .addImm(CfInst)
16256 +            .addImm(EOP);
16257 +    break;
16258 +  }
16259 +  }
16260 +  // Reached only by cases that break: drop the original pseudo instruction.
16261 +  MI->eraseFromParent();
16262 +  return BB;
16263 +}
16264 +
16265 +//===----------------------------------------------------------------------===//
16266 +// Custom DAG Lowering Operations
16267 +//===----------------------------------------------------------------------===//
16268 +
16269 +using namespace llvm::Intrinsic;
16270 +using namespace llvm::AMDGPUIntrinsic;
16271 +
16272 +static SDValue
16273 +InsertScalarToRegisterExport(SelectionDAG &DAG, DebugLoc DL, SDNode **ExportMap,
16274 +    unsigned Slot, unsigned Channel, unsigned Inst, unsigned Type,
16275 +    SDValue Scalar, SDValue Chain) {
16276 +  if (!ExportMap[Slot]) {
16277 +    SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
16278 +      DL, MVT::v4f32,
16279 +      DAG.getUNDEF(MVT::v4f32),
16280 +      Scalar,
16281 +      DAG.getConstant(Channel, MVT::i32));
16282 +
16283 +    unsigned Mask = 1 << Channel;
16284 +
16285 +    const SDValue Ops[] = {Chain, Vector, DAG.getConstant(Inst, MVT::i32),
16286 +        DAG.getConstant(Type, MVT::i32), DAG.getConstant(Slot, MVT::i32),
16287 +        DAG.getConstant(Mask, MVT::i32)};
16288 +
16289 +    SDValue Res =  DAG.getNode(
16290 +        AMDGPUISD::EXPORT,
16291 +        DL,
16292 +        MVT::Other,
16293 +        Ops, 6);
16294 +     ExportMap[Slot] = Res.getNode();
16295 +     return Res;
16296 +  }
16297 +
16298 +  SDNode *ExportInstruction = (SDNode *) ExportMap[Slot] ;
16299 +  SDValue PreviousVector = ExportInstruction->getOperand(1);
16300 +  SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
16301 +      DL, MVT::v4f32,
16302 +      PreviousVector,
16303 +      Scalar,
16304 +      DAG.getConstant(Channel, MVT::i32));
16305 +
16306 +  unsigned Mask = dyn_cast<ConstantSDNode>(ExportInstruction->getOperand(5))
16307 +      ->getZExtValue();
16308 +  Mask |= (1 << Channel);
16309 +
16310 +  const SDValue Ops[] = {ExportInstruction->getOperand(0), Vector,
16311 +      DAG.getConstant(Inst, MVT::i32),
16312 +      DAG.getConstant(Type, MVT::i32),
16313 +      DAG.getConstant(Slot, MVT::i32),
16314 +      DAG.getConstant(Mask, MVT::i32)};
16315 +
16316 +  DAG.UpdateNodeOperands(ExportInstruction,
16317 +      Ops, 6);
16318 +
16319 +  return Chain;
16320 +
16321 +}
16322 +/// \brief Dispatches custom lowering for \p Op based on its opcode.
16323 +SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
16324 +  switch (Op.getOpcode()) {
16325 +  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
16326 +  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
16327 +  case ISD::ROTL: return LowerROTL(Op, DAG);
16328 +  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
16329 +  case ISD::SELECT: return LowerSELECT(Op, DAG);
16330 +  case ISD::SETCC: return LowerSETCC(Op, DAG);
16331 +  case ISD::STORE: return LowerSTORE(Op, DAG);
16332 +  case ISD::LOAD: return LowerLOAD(Op, DAG);
16333 +  case ISD::FPOW: return LowerFPOW(Op, DAG);
16334 +  case ISD::INTRINSIC_VOID: { // Operand 0 is the chain, operand 1 the id.
16335 +    SDValue Chain = Op.getOperand(0);
16336 +    unsigned IntrinsicID =
16337 +                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
16338 +    switch (IntrinsicID) {
16339 +    case AMDGPUIntrinsic::AMDGPU_store_output: {
16340 +      MachineFunction &MF = DAG.getMachineFunction();
16341 +      MachineRegisterInfo &MRI = MF.getRegInfo();
16342 +      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
16343 +      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
16344 +      if (!MRI.isLiveOut(Reg)) {
16345 +        MRI.addLiveOut(Reg);
16346 +      }
16347 +      return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
16348 +    }
16349 +    case AMDGPUIntrinsic::R600_store_pixel_color: {
16350 +      MachineFunction &MF = DAG.getMachineFunction();
16351 +      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
16352 +      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
16353 +      // RegIndex / 4 is the export slot, RegIndex % 4 the channel within it.
16354 +      SDNode **OutputsMap = MFI->Outputs;
16355 +      return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
16356 +          RegIndex / 4, RegIndex % 4, 0, 0, Op.getOperand(2),
16357 +          Chain);
16358 +
16359 +    }
16360 +
16361 +    // default for switch(IntrinsicID)
16362 +    default: break;
16363 +    }
16364 +    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
16365 +    break;
16366 +  }
16367 +  case ISD::INTRINSIC_WO_CHAIN: { // No chain: operand 0 is the intrinsic id.
16368 +    unsigned IntrinsicID =
16369 +                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
16370 +    EVT VT = Op.getValueType();
16371 +    DebugLoc DL = Op.getDebugLoc();
16372 +    switch(IntrinsicID) {
16373 +    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
16374 +    case AMDGPUIntrinsic::R600_load_input: {
16375 +      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
16376 +      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
16377 +      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
16378 +    }
16379 +    case AMDGPUIntrinsic::R600_load_input_perspective: {
16380 +      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
16381 +      if (slot < 0)
16382 +        return DAG.getUNDEF(MVT::f32);
16383 +      SDValue FullVector = DAG.getNode(
16384 +          AMDGPUISD::INTERP,
16385 +          DL, MVT::v4f32,
16386 +          DAG.getConstant(0, MVT::i32), DAG.getConstant(slot / 4 , MVT::i32));
16387 +      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
16388 +        DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32));
16389 +    }
16390 +    case AMDGPUIntrinsic::R600_load_input_linear: {
16391 +      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
16392 +      if (slot < 0)
16393 +        return DAG.getUNDEF(MVT::f32);
16394 +      SDValue FullVector = DAG.getNode(
16395 +        AMDGPUISD::INTERP,
16396 +        DL, MVT::v4f32,
16397 +        DAG.getConstant(1, MVT::i32), DAG.getConstant(slot / 4 , MVT::i32));
16398 +      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
16399 +        DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32));
16400 +    }
16401 +    case AMDGPUIntrinsic::R600_load_input_constant: {
16402 +      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
16403 +      if (slot < 0)
16404 +        return DAG.getUNDEF(MVT::f32);
16405 +      SDValue FullVector = DAG.getNode(
16406 +        AMDGPUISD::INTERP_P0,
16407 +        DL, MVT::v4f32,
16408 +        DAG.getConstant(slot / 4 , MVT::i32));
16409 +      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
16410 +          DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32));
16411 +    }
16412 +    // Implicit parameters, read by dword index via LowerImplicitParameter().
16413 +    case r600_read_ngroups_x:
16414 +      return LowerImplicitParameter(DAG, VT, DL, 0);
16415 +    case r600_read_ngroups_y:
16416 +      return LowerImplicitParameter(DAG, VT, DL, 1);
16417 +    case r600_read_ngroups_z:
16418 +      return LowerImplicitParameter(DAG, VT, DL, 2);
16419 +    case r600_read_global_size_x:
16420 +      return LowerImplicitParameter(DAG, VT, DL, 3);
16421 +    case r600_read_global_size_y:
16422 +      return LowerImplicitParameter(DAG, VT, DL, 4);
16423 +    case r600_read_global_size_z:
16424 +      return LowerImplicitParameter(DAG, VT, DL, 5);
16425 +    case r600_read_local_size_x:
16426 +      return LowerImplicitParameter(DAG, VT, DL, 6);
16427 +    case r600_read_local_size_y:
16428 +      return LowerImplicitParameter(DAG, VT, DL, 7);
16429 +    case r600_read_local_size_z:
16430 +      return LowerImplicitParameter(DAG, VT, DL, 8);
16431 +    // tgid_{x,y,z} use live-in T1_{X,Y,Z}; tidig_{x,y,z} use T0_{X,Y,Z}.
16432 +    case r600_read_tgid_x:
16433 +      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
16434 +                                  AMDGPU::T1_X, VT);
16435 +    case r600_read_tgid_y:
16436 +      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
16437 +                                  AMDGPU::T1_Y, VT);
16438 +    case r600_read_tgid_z:
16439 +      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
16440 +                                  AMDGPU::T1_Z, VT);
16441 +    case r600_read_tidig_x:
16442 +      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
16443 +                                  AMDGPU::T0_X, VT);
16444 +    case r600_read_tidig_y:
16445 +      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
16446 +                                  AMDGPU::T0_Y, VT);
16447 +    case r600_read_tidig_z:
16448 +      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
16449 +                                  AMDGPU::T0_Z, VT);
16450 +    }
16451 +    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
16452 +    break;
16453 +  }
16454 +  } // end switch(Op.getOpcode())
16455 +  return SDValue(); // Only reached for INTRINSIC_VOID ids not handled above.
16456 +}
16457 +
16458 +void R600TargetLowering::ReplaceNodeResults(SDNode *N,
16459 +                                            SmallVectorImpl<SDValue> &Results,
16460 +                                            SelectionDAG &DAG) const {
16461 +  switch (N->getOpcode()) {
16462 +  default: return;
16463 +  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
16464 +    return;
16465 +  case ISD::LOAD: {
16466 +    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
16467 +    Results.push_back(SDValue(Node, 0));
16468 +    Results.push_back(SDValue(Node, 1));
16469 +    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
16470 +    // function
16471 +    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
16472 +    return;
16473 +  }
16474 +  }
16475 +}
16476 +/// \brief Builds the i1 comparison (Op != 0.0f) as a SETCC node.
16477 +SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
16478 +  // Operands of the comparison: the input value against an f32 zero.
16479 +  const DebugLoc DL = Op.getDebugLoc();
16480 +  const SDValue FpZero = DAG.getConstantFP(0.0f, MVT::f32);
16481 +  const SDValue NotEqual = DAG.getCondCode(ISD::SETNE);
16482 +
16483 +  SDValue Compare = DAG.getNode(ISD::SETCC, DL, MVT::i1, Op, FpZero, NotEqual);
16484 +  return Compare;
16485 +}
16486 +
16487 +SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
16488 +  SDValue Chain = Op.getOperand(0);
16489 +  SDValue CC = Op.getOperand(1);
16490 +  SDValue LHS   = Op.getOperand(2);
16491 +  SDValue RHS   = Op.getOperand(3);
16492 +  SDValue JumpT  = Op.getOperand(4);
16493 +  SDValue CmpValue;
16494 +  SDValue Result;
16495 +
16496 +  if (LHS.getValueType() == MVT::i32) {
16497 +    CmpValue = DAG.getNode(
16498 +        ISD::SELECT_CC,
16499 +        Op.getDebugLoc(),
16500 +        MVT::i32,
16501 +        LHS, RHS,
16502 +        DAG.getConstant(-1, MVT::i32),
16503 +        DAG.getConstant(0, MVT::i32),
16504 +        CC);
16505 +  } else if (LHS.getValueType() == MVT::f32) {
16506 +    CmpValue = DAG.getNode(
16507 +        ISD::SELECT_CC,
16508 +        Op.getDebugLoc(),
16509 +        MVT::f32,
16510 +        LHS, RHS,
16511 +        DAG.getConstantFP(1.0f, MVT::f32),
16512 +        DAG.getConstantFP(0.0f, MVT::f32),
16513 +        CC);
16514 +  } else {
16515 +    assert(0 && "Not valid type for br_cc");
16516 +  }
16517 +  Result = DAG.getNode(
16518 +      AMDGPUISD::BRANCH_COND,
16519 +      CmpValue.getDebugLoc(),
16520 +      MVT::Other, Chain,
16521 +      JumpT, CmpValue);
16522 +  return Result;
16523 +}
16524 +
16525 +SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
16526 +                                                   DebugLoc DL,
16527 +                                                   unsigned DwordOffset) const {
16528 +  unsigned ByteOffset = DwordOffset * 4;
16529 +  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
16530 +                                      AMDGPUAS::PARAM_I_ADDRESS);
16531 +
16532 +  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
16533 +  assert(isInt<16>(ByteOffset));
16534 +
16535 +  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
16536 +                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
16537 +                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
16538 +                     false, false, false, 0);
16539 +}
16540 +
16541 +SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
16542 +  DebugLoc DL = Op.getDebugLoc();
16543 +  EVT VT = Op.getValueType();
16544 +
16545 +  return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
16546 +                     Op.getOperand(0),
16547 +                     Op.getOperand(0),
16548 +                     DAG.getNode(ISD::SUB, DL, VT,
16549 +                                 DAG.getConstant(32, MVT::i32),
16550 +                                 Op.getOperand(1)));
16551 +}
16552 +/// \brief Returns true if \p Op is an integer or FP constant equal to zero.
16553 +bool R600TargetLowering::isZero(SDValue Op) const {
16554 +  // Integer constant: zero exactly when it is the null value.
16555 +  if (ConstantSDNode *IntConst = dyn_cast<ConstantSDNode>(Op))
16556 +    return IntConst->isNullValue();
16557 +  // Floating-point constant: defer to ConstantFPSDNode::isZero().
16558 +  if (ConstantFPSDNode *FpConst = dyn_cast<ConstantFPSDNode>(Op))
16559 +    return FpConst->isZero();
16560 +  return false;  // Anything that is not a constant node.
16561 +}
16562 +
16563 +SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
16564 +  DebugLoc DL = Op.getDebugLoc();
16565 +  EVT VT = Op.getValueType();
16566 +
16567 +  SDValue LHS = Op.getOperand(0);
16568 +  SDValue RHS = Op.getOperand(1);
16569 +  SDValue True = Op.getOperand(2);
16570 +  SDValue False = Op.getOperand(3);
16571 +  SDValue CC = Op.getOperand(4);
16572 +  SDValue Temp;
16573 +
16574 +  // LHS and RHS are guaranteed to be the same value type
16575 +  EVT CompareVT = LHS.getValueType();
16576 +
16577 +  // Check if we can lower this to a native operation.
16578 +
16579 +  // Try to lower to a CND* instruction:
16580 +  // CND* instructions requires RHS to be zero.  Some SELECT_CC nodes that
16581 +  // can be lowered to CND* instructions can also be lowered to SET*
16582 +  // instructions.  CND* instructions are cheaper, because they don't
16583 +  // require additional instructions to convert their result to the correct
16584 +  // value type, so this check should be first.
16585 +  if (isZero(LHS) || isZero(RHS)) {
16586 +    SDValue Cond = (isZero(LHS) ? RHS : LHS);
16587 +    SDValue Zero = (isZero(LHS) ? LHS : RHS);
16588 +    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
16589 +    if (CompareVT != VT) {
16590 +      // Bitcast True / False to the correct types.  This will end up being
16591 +      // a nop, but it allows us to define only a single pattern in the
16592 +      // .TD files for each CND* instruction rather than having to have
16593 +      // one pattern for integer True/False and one for fp True/False
16594 +      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
16595 +      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
16596 +    }
16597 +    if (isZero(LHS)) {
16598 +      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
16599 +    }
16600 +
16601 +    switch (CCOpcode) {
16602 +    case ISD::SETONE:
16603 +    case ISD::SETUNE:
16604 +    case ISD::SETNE:
16605 +    case ISD::SETULE:
16606 +    case ISD::SETULT:
16607 +    case ISD::SETOLE:
16608 +    case ISD::SETOLT:
16609 +    case ISD::SETLE:
16610 +    case ISD::SETLT:
16611 +      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
16612 +      Temp = True;
16613 +      True = False;
16614 +      False = Temp;
16615 +      break;
16616 +    default:
16617 +      break;
16618 +    }
16619 +    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
16620 +        Cond, Zero,
16621 +        True, False,
16622 +        DAG.getCondCode(CCOpcode));
16623 +    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
16624 +  }
16625 +
16626 +  // Try to lower to a SET* instruction:
16627 +  // We need all the operands of SELECT_CC to have the same value type, so if
16628 +  // necessary we need to change True and False to be the same type as LHS and
16629 +  // RHS, and then convert the result of the select_cc back to the correct type.
16630 +
16631 +  // Move hardware True/False values to the correct operand.
16632 +  if (isHWTrueValue(False) && isHWFalseValue(True)) {
16633 +    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
16634 +    std::swap(False, True);
16635 +    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
16636 +  }
16637 +
16638 +  if (isHWTrueValue(True) && isHWFalseValue(False)) {
16639 +    if (CompareVT !=  VT) {
16640 +      if (VT == MVT::f32 && CompareVT == MVT::i32) {
16641 +        SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
16642 +            LHS, RHS,
16643 +            DAG.getConstant(-1, MVT::i32),
16644 +            DAG.getConstant(0, MVT::i32),
16645 +            CC);
16646 +        // Convert integer values of true (-1) and false (0) to fp values of
16647 +        // true (1.0f) and false (0.0f).
16648 +        SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean,
16649 +                                                  DAG.getConstant(1, MVT::i32));
16650 +        return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB);
16651 +      } else if (VT == MVT::i32 && CompareVT == MVT::f32) {
16652 +        SDValue BoolAsFlt = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
16653 +            LHS, RHS,
16654 +            DAG.getConstantFP(1.0f, MVT::f32),
16655 +            DAG.getConstantFP(0.0f, MVT::f32),
16656 +            CC);
16657 +        // Convert fp values of true (1.0f) and false (0.0f) to integer values
16658 +        // of true (-1) and false (0).
16659 +        SDValue Neg = DAG.getNode(ISD::FNEG, DL, MVT::f32, BoolAsFlt);
16660 +        return DAG.getNode(ISD::FP_TO_SINT, DL, VT, Neg);
16661 +      } else {
16662 +        // I don't think there will be any other type pairings.
16663 +        assert(!"Unhandled operand type parings in SELECT_CC");
16664 +      }
16665 +    } else {
16666 +      // This SELECT_CC is already legal.
16667 +      return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
16668 +    }
16669 +  }
16670 +
16671 +  // Possible Min/Max pattern
16672 +  SDValue MinMax = LowerMinMax(Op, DAG);
16673 +  if (MinMax.getNode()) {
16674 +    return MinMax;
16675 +  }
16676 +
16677 +  // If we make it this for it means we have no native instructions to handle
16678 +  // this SELECT_CC, so we must lower it.
16679 +  SDValue HWTrue, HWFalse;
16680 +
16681 +  if (CompareVT == MVT::f32) {
16682 +    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
16683 +    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
16684 +  } else if (CompareVT == MVT::i32) {
16685 +    HWTrue = DAG.getConstant(-1, CompareVT);
16686 +    HWFalse = DAG.getConstant(0, CompareVT);
16687 +  }
16688 +  else {
16689 +    assert(!"Unhandled value type in LowerSELECT_CC");
16690 +  }
16691 +
16692 +  // Lower this unsupported SELECT_CC into a combination of two supported
16693 +  // SELECT_CC operations.
16694 +  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
16695 +
16696 +  return DAG.getNode(ISD::SELECT_CC, DL, VT,
16697 +      Cond, HWFalse,
16698 +      True, False,
16699 +      DAG.getCondCode(ISD::SETNE));
16700 +}
16701 +/// Lower SELECT(Cond, T, F) to SELECT_CC(Cond, 0, T, F, SETNE).
16702 +SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
16703 +  return DAG.getNode(ISD::SELECT_CC,
16704 +      Op.getDebugLoc(),
16705 +      Op.getValueType(),
16706 +      Op.getOperand(0),              // LHS: the select condition
16707 +      DAG.getConstant(0, MVT::i32),  // RHS: i32 zero
16708 +      Op.getOperand(1),              // result when condition != 0
16709 +      Op.getOperand(2),              // result when condition == 0
16710 +      DAG.getCondCode(ISD::SETNE));
16711 +}
16712 +/// Lower SETCC to a SELECT_CC producing an i32 that is 1 (true) or 0 (false).
16713 +SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
16714 +  SDValue Cond;
16715 +  SDValue LHS = Op.getOperand(0);
16716 +  SDValue RHS = Op.getOperand(1);
16717 +  SDValue CC  = Op.getOperand(2);
16718 +  DebugLoc DL = Op.getDebugLoc();
16719 +  assert(Op.getValueType() == MVT::i32);
16720 +  if (LHS.getValueType() == MVT::i32) {
16721 +    Cond = DAG.getNode(              // integer compare: -1 if true, else 0
16722 +        ISD::SELECT_CC,
16723 +        Op.getDebugLoc(),
16724 +        MVT::i32,
16725 +        LHS, RHS,
16726 +        DAG.getConstant(-1, MVT::i32),
16727 +        DAG.getConstant(0, MVT::i32),
16728 +        CC);
16729 +  } else if (LHS.getValueType() == MVT::f32) {
16730 +    Cond = DAG.getNode(              // fp compare: 1.0f if true, else 0.0f
16731 +        ISD::SELECT_CC,
16732 +        Op.getDebugLoc(),
16733 +        MVT::f32,
16734 +        LHS, RHS,
16735 +        DAG.getConstantFP(1.0f, MVT::f32),
16736 +        DAG.getConstantFP(0.0f, MVT::f32),
16737 +        CC);
16738 +    Cond = DAG.getNode(              // convert 1.0f / 0.0f to integer 1 / 0
16739 +        ISD::FP_TO_SINT,
16740 +        DL,
16741 +        MVT::i32,
16742 +        Cond);
16743 +  } else {
16744 +    assert(0 && "Not valid type for set_cc");
16745 +  }
16746 +  Cond = DAG.getNode(                // keep bit 0 only: -1 -> 1, 1 -> 1, 0 -> 0
16747 +      ISD::AND,
16748 +      DL,
16749 +      MVT::i32,
16750 +      DAG.getConstant(1, MVT::i32),
16751 +      Cond);
16752 +  return Cond;
16753 +}
16754 +/// Custom-lower global stores so they address memory in dwords, not bytes.
16755 +SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
16756 +  DebugLoc DL = Op.getDebugLoc();
16757 +  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
16758 +  SDValue Chain = Op.getOperand(0);
16759 +  SDValue Value = Op.getOperand(1);
16760 +  SDValue Ptr = Op.getOperand(2);
16761 +  // Only global stores whose pointer is not yet a DWORDADDR are rewritten.
16762 +  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
16763 +      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
16764 +    // Convert pointer from byte address to dword address.
16765 +    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
16766 +                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
16767 +                                  Ptr, DAG.getConstant(2, MVT::i32)));
16768 +    // NOTE(review): with asserts disabled this path returns Chain unchanged.
16769 +    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
16770 +      assert(!"Truncated and indexed stores not supported yet");
16771 +    } else {
16772 +      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
16773 +    }
16774 +    return Chain;
16775 +  }
16776 +  return SDValue(); // all other stores: empty SDValue, nothing built here
16777 +}
16778 +
16779 +// Returns 512 + (kc_bank << 12) for CONSTANT_BUFFER_<kc_bank>, else -1.
16780 +static int
16781 +ConstantAddressBlock(unsigned AddressSpace) {
16782 +  switch (AddressSpace) {
16783 +  case AMDGPUAS::CONSTANT_BUFFER_0:
16784 +    return 512;
16785 +  case AMDGPUAS::CONSTANT_BUFFER_1:
16786 +    return 512 + 4096;
16787 +  case AMDGPUAS::CONSTANT_BUFFER_2:
16788 +    return 512 + 4096 * 2;
16789 +  case AMDGPUAS::CONSTANT_BUFFER_3:
16790 +    return 512 + 4096 * 3;
16791 +  case AMDGPUAS::CONSTANT_BUFFER_4:
16792 +    return 512 + 4096 * 4;
16793 +  case AMDGPUAS::CONSTANT_BUFFER_5:
16794 +    return 512 + 4096 * 5;
16795 +  case AMDGPUAS::CONSTANT_BUFFER_6:
16796 +    return 512 + 4096 * 6;
16797 +  case AMDGPUAS::CONSTANT_BUFFER_7:
16798 +    return 512 + 4096 * 7;
16799 +  case AMDGPUAS::CONSTANT_BUFFER_8:
16800 +    return 512 + 4096 * 8;
16801 +  case AMDGPUAS::CONSTANT_BUFFER_9:
16802 +    return 512 + 4096 * 9;
16803 +  case AMDGPUAS::CONSTANT_BUFFER_10:
16804 +    return 512 + 4096 * 10;
16805 +  case AMDGPUAS::CONSTANT_BUFFER_11:
16806 +    return 512 + 4096 * 11;
16807 +  case AMDGPUAS::CONSTANT_BUFFER_12:
16808 +    return 512 + 4096 * 12;
16809 +  case AMDGPUAS::CONSTANT_BUFFER_13:
16810 +    return 512 + 4096 * 13;
16811 +  case AMDGPUAS::CONSTANT_BUFFER_14:
16812 +    return 512 + 4096 * 14;
16813 +  case AMDGPUAS::CONSTANT_BUFFER_15:
16814 +    return 512 + 4096 * 15;
16815 +  default:
16816 +    return -1; // not one of the 16 constant-buffer address spaces
16817 +  }
16818 +}
16819 +/// Lower loads from constant-buffer address spaces to CONST_ADDRESS nodes.
16820 +SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
16821 +{
16822 +  EVT VT = Op.getValueType();
16823 +  DebugLoc DL = Op.getDebugLoc();
16824 +  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
16825 +  SDValue Chain = Op.getOperand(0);
16826 +  SDValue Ptr = Op.getOperand(1);
16827 +  SDValue LoweredLoad; // NOTE(review): never used in this function
16828 +  // ConstantBlock is -1 unless the load targets a CONSTANT_BUFFER_* space.
16829 +  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
16830 +  if (ConstantBlock > -1) {
16831 +    SDValue Result;
16832 +    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
16833 +        dyn_cast<Constant>(LoadNode->getSrcValue())) {
16834 +      SDValue Slots[4];
16835 +      for (unsigned i = 0; i < 4; i++) {
16836 +        // We want Const position encoded with the following formula :
16837 +        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
16838 +        // const_index is Ptr computed by llvm using an alignment of 16.
16839 +        // Thus we add ((512 + (kc_bank << 12)) * 4 + chan) * 4 here and
16840 +        // then div by 4 at the ISel step
16841 +        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
16842 +            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
16843 +        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
16844 +      }
16845 +      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
16846 +    } else {
16847 +      // non constant ptr can't be folded; keep it as a single v4i32 load
16848 +      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
16849 +          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32))
16850 +          );
16851 +    }
16852 +    // Scalar loads use only element 0 of the four-element result.
16853 +    if (!VT.isVector()) {
16854 +      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
16855 +          DAG.getConstant(0, MVT::i32));
16856 +    }
16857 +    // Hand back the value together with the incoming chain.
16858 +    SDValue MergedValues[2] = {
16859 +        Result,
16860 +        Chain
16861 +    };
16862 +    return DAG.getMergeValues(MergedValues, 2, DL);
16863 +  }
16864 +  // Any other address space: return an empty SDValue.
16865 +  return SDValue();
16866 +}
16867 +/// Lower FPOW as exp2(Op1 * log2(Op0)), i.e. pow(x, y) = 2^(y * log2(x)).
16868 +SDValue R600TargetLowering::LowerFPOW(SDValue Op,
16869 +    SelectionDAG &DAG) const {
16870 +  DebugLoc DL = Op.getDebugLoc();
16871 +  EVT VT = Op.getValueType();
16872 +  SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0));
16873 +  SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase);
16874 +  return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase);
16875 +}
16876 +/// Load each formal argument from the PARAM_I address space, from byte 36 on.
16877 +/// XXX Only kernel functions are supported, so we can assume for now that
16878 +/// every function is a kernel function, but in the future we should use
16879 +/// separate calling conventions for kernel and non-kernel functions.
16880 +SDValue R600TargetLowering::LowerFormalArguments(
16881 +                                      SDValue Chain,
16882 +                                      CallingConv::ID CallConv,
16883 +                                      bool isVarArg,
16884 +                                      const SmallVectorImpl<ISD::InputArg> &Ins,
16885 +                                      DebugLoc DL, SelectionDAG &DAG,
16886 +                                      SmallVectorImpl<SDValue> &InVals) const {
16887 +  unsigned ParamOffsetBytes = 36; // presumably 9 implicit dwords; TODO confirm
16888 +  Function::const_arg_iterator FuncArg =
16889 +                            DAG.getMachineFunction().getFunction()->arg_begin();
16890 +  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
16891 +    EVT VT = Ins[i].VT;
16892 +    Type *ArgType = FuncArg->getType();
16893 +    unsigned ArgSizeInBits = ArgType->isPointerTy() ? // pointers count as 32
16894 +                             32 : ArgType->getPrimitiveSizeInBits();
16895 +    unsigned ArgBytes = ArgSizeInBits >> 3; // bits to bytes
16896 +    EVT ArgVT;
16897 +    if (ArgSizeInBits < VT.getSizeInBits()) {
16898 +      assert(!ArgType->isFloatTy() &&
16899 +             "Extending floating point arguments not supported yet");
16900 +      ArgVT = MVT::getIntegerVT(ArgSizeInBits); // zero-extended by the load
16901 +    } else {
16902 +      ArgVT = VT;
16903 +    }
16904 +    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
16905 +                                                    AMDGPUAS::PARAM_I_ADDRESS);
16906 +    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
16907 +                                DAG.getConstant(ParamOffsetBytes, MVT::i32),
16908 +                                       MachinePointerInfo(new Argument(PtrTy)),
16909 +                                       ArgVT, false, false, ArgBytes);
16910 +    InVals.push_back(Arg); // NOTE(review): Argument is never freed
16911 +    ParamOffsetBytes += ArgBytes; // next argument starts right after this one
16912 +  }
16913 +  return Chain; // NOTE(review): the loads use DAG.getRoot(), not Chain
16914 +}
16915 +/// Scalar compares yield i32; vector compares yield integer-element vectors.
16916 +EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
16917 +   if (!VT.isVector()) return MVT::i32;
16918 +   return VT.changeVectorElementTypeToInteger();
16919 +}
16920 +
16921 +//===----------------------------------------------------------------------===//
16922 +// Custom DAG Optimizations
16923 +//===----------------------------------------------------------------------===//
16924 +/// Target DAG combines for FP_ROUND and EXTRACT_VECTOR_ELT; else empty SDValue.
16925 +SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
16926 +                                              DAGCombinerInfo &DCI) const {
16927 +  SelectionDAG &DAG = DCI.DAG;
16928 +
16929 +  switch (N->getOpcode()) {
16930 +  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
16931 +  case ISD::FP_ROUND: {
16932 +      SDValue Arg = N->getOperand(0);
16933 +      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
16934 +        return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
16935 +                           Arg.getOperand(0));
16936 +      }
16937 +      break;
16938 +    }
16939 +  // Extract_vec (Build_vector) generated by custom lowering
16940 +  // also needs to be customly combined
16941 +  case ISD::EXTRACT_VECTOR_ELT: {
16942 +    SDValue Arg = N->getOperand(0);
16943 +    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
16944 +      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
16945 +        unsigned Element = Const->getZExtValue();
16946 +        return Arg->getOperand(Element); // NOTE(review): no range check
16947 +      }
16948 +    }
16949 +  } // last case: leaving it without a break just exits the switch
16950 +  }
16951 +  return SDValue(); // nothing combined
16952 +}
16953 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600ISelLowering.h llvm-r600/lib/Target/R600/R600ISelLowering.h
16954 --- llvm-3.2.src/lib/Target/R600/R600ISelLowering.h     1970-01-01 01:00:00.000000000 +0100
16955 +++ llvm-r600/lib/Target/R600/R600ISelLowering.h        2013-01-25 19:43:57.463383054 +0100
16956 @@ -0,0 +1,73 @@
16957 +//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===//
16958 +//
16959 +//                     The LLVM Compiler Infrastructure
16960 +//
16961 +// This file is distributed under the University of Illinois Open Source
16962 +// License. See LICENSE.TXT for details.
16963 +//
16964 +//===----------------------------------------------------------------------===//
16965 +//
16966 +/// \file
16967 +/// \brief R600 DAG Lowering interface definition
16968 +//
16969 +//===----------------------------------------------------------------------===//
16970 +
16971 +#ifndef R600ISELLOWERING_H
16972 +#define R600ISELLOWERING_H
16973 +
16974 +#include "AMDGPUISelLowering.h"
16975 +
16976 +namespace llvm {
16977 +
16978 +class R600InstrInfo;
16979 +/// R600 implementation of the DAG lowering hooks on AMDGPUTargetLowering.
16980 +class R600TargetLowering : public AMDGPUTargetLowering {
16981 +public:
16982 +  R600TargetLowering(TargetMachine &TM);
16983 +  virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
16984 +      MachineBasicBlock * BB) const;
16985 +  virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
16986 +  virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
16987 +  void ReplaceNodeResults(SDNode * N,
16988 +      SmallVectorImpl<SDValue> &Results,
16989 +      SelectionDAG &DAG) const;
16990 +  virtual SDValue LowerFormalArguments(
16991 +                                      SDValue Chain,
16992 +                                      CallingConv::ID CallConv,
16993 +                                      bool isVarArg,
16994 +                                      const SmallVectorImpl<ISD::InputArg> &Ins,
16995 +                                      DebugLoc DL, SelectionDAG &DAG,
16996 +                                      SmallVectorImpl<SDValue> &InVals) const;
16997 +  virtual EVT getSetCCResultType(EVT VT) const; // i32 for scalar VT
16998 +private:
16999 +  const R600InstrInfo * TII;
17000 +
17001 +  /// Each OpenCL kernel has nine implicit parameters that are stored in the
17002 +  /// first nine dwords of a Vertex Buffer.  These implicit parameters are
17003 +  /// lowered to load instructions which retrieve the values from the Vertex
17004 +  /// Buffer.
17005 +  SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
17006 +                                 DebugLoc DL, unsigned DwordOffset) const;
17007 +
17008 +  void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
17009 +      MachineRegisterInfo & MRI, unsigned dword_offset) const;
17010 +
17011 +  SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
17012 +
17013 +  /// \brief Lower ROTL opcode to BITALIGN
17014 +  SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
17015 +  // Per-opcode custom lowering helpers.
17016 +  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
17017 +  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
17018 +  SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
17019 +  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
17020 +  SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
17021 +  SDValue LowerFPOW(SDValue Op, SelectionDAG &DAG) const;
17022 +  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
17023 +
17024 +  bool isZero(SDValue Op) const;
17025 +};
17026 +
17027 +} // End namespace llvm;
17028 +
17029 +#endif // R600ISELLOWERING_H
17030 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600LowerConstCopy.cpp llvm-r600/lib/Target/R600/R600LowerConstCopy.cpp
17031 --- llvm-3.2.src/lib/Target/R600/R600LowerConstCopy.cpp 1970-01-01 01:00:00.000000000 +0100
17032 +++ llvm-r600/lib/Target/R600/R600LowerConstCopy.cpp    2013-01-25 19:43:57.466716387 +0100
17033 @@ -0,0 +1,74 @@
17034 +//===-- R600LowerConstCopy.cpp - Propagate ConstCopy / lower them to MOV---===//
17035 +//
17036 +//                     The LLVM Compiler Infrastructure
17037 +//
17038 +// This file is distributed under the University of Illinois Open Source
17039 +// License. See LICENSE.TXT for details.
17040 +//
17041 +//===----------------------------------------------------------------------===//
17042 +//
17043 +/// \file
17044 +/// This pass is intended to handle remaining ConstCopy pseudo MachineInstr.
17045 +/// ISel will fold each Const Buffer read inside scalar ALU. However it cannot
17046 +/// fold them inside vector instruction, like DOT4 or Cube ; ISel emits
17047 +/// ConstCopy instead. This pass (executed after ExpandingSpecialInstr) will try
17048 +/// to fold them if possible or replace them by MOV otherwise.
17049 +/// TODO : Implement the folding part, using Copy Propagation algorithm.
17050 +//
17051 +//===----------------------------------------------------------------------===//
17052 +
17053 +#include "AMDGPU.h"
17054 +#include "llvm/CodeGen/MachineFunction.h"
17055 +#include "llvm/CodeGen/MachineFunctionPass.h"
17056 +#include "R600InstrInfo.h"
17057 +#include "llvm/GlobalValue.h"
17058 +#include "llvm/CodeGen/MachineInstrBuilder.h"
17059 +
17060 +namespace llvm {
17061 +/// Machine function pass that rewrites CONST_COPY pseudos into MOVs.
17062 +class R600LowerConstCopy : public MachineFunctionPass {
17063 +private:
17064 +  static char ID;
17065 +  const R600InstrInfo *TII;
17066 +public:
17067 +  R600LowerConstCopy(TargetMachine &tm);
17068 +  virtual bool runOnMachineFunction(MachineFunction &MF);
17069 +  // NOTE(review): the pass name does not mention ConstCopy lowering; confirm.
17070 +  const char *getPassName() const { return "R600 Eliminate Symbolic Operand"; }
17071 +};
17072 +
17073 +char R600LowerConstCopy::ID = 0; // handed to MachineFunctionPass(ID) below
17074 +
17075 +/// Store \p tm's instruction info, cast to R600InstrInfo, for later use.
17076 +R600LowerConstCopy::R600LowerConstCopy(TargetMachine &tm) :
17077 +    MachineFunctionPass(ID),
17078 +    TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo()))
17079 +{
17080 +}
17081 +
17082 +bool R600LowerConstCopy::runOnMachineFunction(MachineFunction &MF) {
17083 +  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
17084 +                                                  BB != BB_E; ++BB) {
17085 +    MachineBasicBlock &MBB = *BB;
17086 +    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
17087 +                                                      I != E;) {
17088 +      MachineInstr &MI = *I;
17089 +      I = llvm::next(I);
17090 +      if (MI.getOpcode() != AMDGPU::CONST_COPY)
17091 +        continue;
17092 +      MachineInstr *NewMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::MOV,
17093 +          MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
17094 +      NewMI->getOperand(9).setImm(MI.getOperand(1).getImm());
17095 +      MI.eraseFromParent();
17096 +    }
17097 +  }
17098 +  return false;
17099 +}
17100 +/// Factory: \returns a newly allocated R600LowerConstCopy pass for \p tm.
17101 +FunctionPass *createR600LowerConstCopy(TargetMachine &tm) {
17102 +  return new R600LowerConstCopy(tm);
17103 +}
17104 +
17105 +}
17106 +
17107 +
17108 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600MachineFunctionInfo.cpp llvm-r600/lib/Target/R600/R600MachineFunctionInfo.cpp
17109 --- llvm-3.2.src/lib/Target/R600/R600MachineFunctionInfo.cpp    1970-01-01 01:00:00.000000000 +0100
17110 +++ llvm-r600/lib/Target/R600/R600MachineFunctionInfo.cpp       2013-01-25 19:43:57.470049720 +0100
17111 @@ -0,0 +1,33 @@
17112 +//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===//
17113 +//
17114 +//                     The LLVM Compiler Infrastructure
17115 +//
17116 +// This file is distributed under the University of Illinois Open Source
17117 +// License. See LICENSE.TXT for details.
17118 +//
17119 +/// \file
17120 +//===----------------------------------------------------------------------===//
17121 +
17122 +#include "R600MachineFunctionInfo.h"
17123 +
17124 +using namespace llvm;
17125 +/// Both interpolation flags start false and Outputs is zero-filled.
17126 +R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
17127 +  : MachineFunctionInfo(),
17128 +    HasLinearInterpolation(false),
17129 +    HasPerspectiveInterpolation(false) {
17130 +    memset(Outputs, 0, sizeof(Outputs)); // clear every Outputs slot
17131 +  }
17132 +/// \returns 0; asserts that perspective interpolation is in use.
17133 +unsigned R600MachineFunctionInfo::GetIJPerspectiveIndex() const {
17134 +  assert(HasPerspectiveInterpolation);
17135 +  return 0;
17136 +}
17137 +/// \returns 1 when perspective interpolation is also used, otherwise 0.
17138 +unsigned R600MachineFunctionInfo::GetIJLinearIndex() const {
17139 +  assert(HasLinearInterpolation);
17140 +  if (HasPerspectiveInterpolation)
17141 +    return 1; // index 0 is what GetIJPerspectiveIndex() returns
17142 +  else
17143 +    return 0;
17144 +}
17145 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600MachineFunctionInfo.h llvm-r600/lib/Target/R600/R600MachineFunctionInfo.h
17146 --- llvm-3.2.src/lib/Target/R600/R600MachineFunctionInfo.h      1970-01-01 01:00:00.000000000 +0100
17147 +++ llvm-r600/lib/Target/R600/R600MachineFunctionInfo.h 2013-01-25 19:43:57.470049720 +0100
17148 @@ -0,0 +1,38 @@
17149 +//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=//
17150 +//
17151 +//                     The LLVM Compiler Infrastructure
17152 +//
17153 +// This file is distributed under the University of Illinois Open Source
17154 +// License. See LICENSE.TXT for details.
17155 +//
17156 +//===----------------------------------------------------------------------===//
17157 +//
17158 +/// \file
17159 +//===----------------------------------------------------------------------===//
17160 +
17161 +#ifndef R600MACHINEFUNCTIONINFO_H
17162 +#define R600MACHINEFUNCTIONINFO_H
17163 +
17164 +#include "llvm/CodeGen/MachineFunction.h"
17165 +#include "llvm/CodeGen/SelectionDAG.h"
17166 +#include <vector>
17167 +
17168 +namespace llvm {
17169 +/// Per-function R600 state: reserved registers, outputs, interpolation flags.
17170 +class R600MachineFunctionInfo : public MachineFunctionInfo {
17171 +
17172 +public:
17173 +  R600MachineFunctionInfo(const MachineFunction &MF);
17174 +  std::vector<unsigned> ReservedRegs; // also reserved by getReservedRegs()
17175 +  SDNode *Outputs[16];
17176 +  bool HasLinearInterpolation;
17177 +  bool HasPerspectiveInterpolation;
17178 +  // Both getters assert that the matching Has*Interpolation flag is set.
17179 +  unsigned GetIJLinearIndex() const;
17180 +  unsigned GetIJPerspectiveIndex() const;
17181 +
17182 +};
17183 +
17184 +} // End llvm namespace
17185 +
17186 +#endif //R600MACHINEFUNCTIONINFO_H
17187 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600RegisterInfo.cpp llvm-r600/lib/Target/R600/R600RegisterInfo.cpp
17188 --- llvm-3.2.src/lib/Target/R600/R600RegisterInfo.cpp   1970-01-01 01:00:00.000000000 +0100
17189 +++ llvm-r600/lib/Target/R600/R600RegisterInfo.cpp      2013-01-25 19:43:57.470049720 +0100
17190 @@ -0,0 +1,85 @@
17191 +//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===//
17192 +//
17193 +//                     The LLVM Compiler Infrastructure
17194 +//
17195 +// This file is distributed under the University of Illinois Open Source
17196 +// License. See LICENSE.TXT for details.
17197 +//
17198 +//===----------------------------------------------------------------------===//
17199 +//
17200 +/// \file
17201 +/// \brief R600 implementation of the TargetRegisterInfo class.
17202 +//
17203 +//===----------------------------------------------------------------------===//
17204 +
17205 +#include "R600RegisterInfo.h"
17206 +#include "AMDGPUTargetMachine.h"
17207 +#include "R600Defines.h"
17208 +#include "R600MachineFunctionInfo.h"
17209 +
17210 +using namespace llvm;
17211 +/// Forward both arguments to AMDGPURegisterInfo and keep references to them.
17212 +R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm,
17213 +    const TargetInstrInfo &tii)
17214 +: AMDGPURegisterInfo(tm, tii),
17215 +  TM(tm),
17216 +  TII(tii)
17217 +  { }
17218 +/// Reserve the special registers plus every register in MFI->ReservedRegs.
17219 +BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
17220 +  BitVector Reserved(getNumRegs());
17221 +  const R600MachineFunctionInfo * MFI = MF.getInfo<R600MachineFunctionInfo>();
17222 +  // These special registers are reserved for every function.
17223 +  Reserved.set(AMDGPU::ZERO);
17224 +  Reserved.set(AMDGPU::HALF);
17225 +  Reserved.set(AMDGPU::ONE);
17226 +  Reserved.set(AMDGPU::ONE_INT);
17227 +  Reserved.set(AMDGPU::NEG_HALF);
17228 +  Reserved.set(AMDGPU::NEG_ONE);
17229 +  Reserved.set(AMDGPU::PV_X);
17230 +  Reserved.set(AMDGPU::ALU_LITERAL_X);
17231 +  Reserved.set(AMDGPU::ALU_CONST);
17232 +  Reserved.set(AMDGPU::PREDICATE_BIT);
17233 +  Reserved.set(AMDGPU::PRED_SEL_OFF);
17234 +  Reserved.set(AMDGPU::PRED_SEL_ZERO);
17235 +  Reserved.set(AMDGPU::PRED_SEL_ONE);
17236 +  // Then the registers recorded for this particular function.
17237 +  for (std::vector<unsigned>::const_iterator I = MFI->ReservedRegs.begin(),
17238 +                                    E = MFI->ReservedRegs.end(); I != E; ++I) {
17239 +    Reserved.set(*I);
17240 +  }
17241 +
17242 +  return Reserved;
17243 +}
17244 +/// Map GPRF32/GPRI32 to R600_Reg32; any other class is returned unchanged.
17245 +const TargetRegisterClass *
17246 +R600RegisterInfo::getISARegClass(const TargetRegisterClass * rc) const {
17247 +  switch (rc->getID()) {
17248 +  case AMDGPU::GPRF32RegClassID:
17249 +  case AMDGPU::GPRI32RegClassID:
17250 +    return &AMDGPU::R600_Reg32RegClass;
17251 +  default: return rc;
17252 +  }
17253 +}
17254 +/// \returns the encoding value of \p reg shifted right by HW_CHAN_SHIFT.
17255 +unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
17256 +  return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
17257 +}
17258 +/// \returns R600_TReg32 for every value type (default shares the i32 case).
17259 +const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
17260 +                                                                   MVT VT) const {
17261 +  switch(VT.SimpleTy) {
17262 +  default:
17263 +  case MVT::i32: return &AMDGPU::R600_TReg32RegClass;
17264 +  }
17265 +}
17266 +/// Map channel 0-3 to sel_x/sel_y/sel_z/sel_w; other values assert, return 0.
17267 +unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) const {
17268 +  switch (Channel) {
17269 +    default: assert(!"Invalid channel index"); return 0;
17270 +    case 0: return AMDGPU::sel_x;
17271 +    case 1: return AMDGPU::sel_y;
17272 +    case 2: return AMDGPU::sel_z;
17273 +    case 3: return AMDGPU::sel_w;
17274 +  }
17275 +}
17276 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600RegisterInfo.h llvm-r600/lib/Target/R600/R600RegisterInfo.h
17277 --- llvm-3.2.src/lib/Target/R600/R600RegisterInfo.h     1970-01-01 01:00:00.000000000 +0100
17278 +++ llvm-r600/lib/Target/R600/R600RegisterInfo.h        2013-01-25 19:43:57.470049720 +0100
17279 @@ -0,0 +1,55 @@
17280 +//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===//
17281 +//
17282 +//                     The LLVM Compiler Infrastructure
17283 +//
17284 +// This file is distributed under the University of Illinois Open Source
17285 +// License. See LICENSE.TXT for details.
17286 +//
17287 +//===----------------------------------------------------------------------===//
17288 +//
17289 +/// \file
17290 +/// \brief Interface definition for R600RegisterInfo
17291 +//
17292 +//===----------------------------------------------------------------------===//
17293 +
17294 +#ifndef R600REGISTERINFO_H_
17295 +#define R600REGISTERINFO_H_
17296 +
17297 +#include "AMDGPUTargetMachine.h"
17298 +#include "AMDGPURegisterInfo.h"
17299 +
17300 +namespace llvm {
17301 +
17302 +class R600TargetMachine;
17303 +class TargetInstrInfo;
17304 +/// Register info for R600; keeps the target machine and instruction info.
17305 +struct R600RegisterInfo : public AMDGPURegisterInfo {
17306 +  AMDGPUTargetMachine &TM;
17307 +  const TargetInstrInfo &TII;
17308 +
17309 +  R600RegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
17310 +  /// \returns the set of reserved registers for \p MF.
17311 +  virtual BitVector getReservedRegs(const MachineFunction &MF) const;
17312 +
17313 +  /// \param RC is an AMDIL reg class.
17314 +  ///
17315 +  /// \returns the R600 reg class that is equivalent to \p RC.
17316 +  virtual const TargetRegisterClass *getISARegClass(
17317 +    const TargetRegisterClass *RC) const;
17318 +
17319 +  /// \brief get the HW encoding for a register's channel.
17320 +  unsigned getHWRegChan(unsigned reg) const;
17321 +
17322 +  /// \brief get the register class of the specified type to use in the
17323 +  /// CFGStructurizer
17324 +  virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
17325 +
17326 +  /// \returns the sub reg enum value for the given \p Channel
17327 +  /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sel_x)
17328 +  unsigned getSubRegFromChannel(unsigned Channel) const;
17329 +
17330 +};
17331 +
17332 +} // End namespace llvm
17333 +
17334 +#endif // R600REGISTERINFO_H_
17335 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600RegisterInfo.td llvm-r600/lib/Target/R600/R600RegisterInfo.td
17336 --- llvm-3.2.src/lib/Target/R600/R600RegisterInfo.td    1970-01-01 01:00:00.000000000 +0100
17337 +++ llvm-r600/lib/Target/R600/R600RegisterInfo.td       2013-01-25 19:43:57.470049720 +0100
17338 @@ -0,0 +1,101 @@
17339 +// Base class for R600 registers with an explicit 16-bit hardware encoding.
17340 +class R600Reg <string name, bits<16> encoding> : Register<name> {
17341 +  let Namespace = "AMDGPU";
17342 +  let HWEncoding = encoding;
17343 +}
17344 +// Register whose encoding packs sel in bits 8-0 and the channel in bits 10-9.
17345 +class R600RegWithChan <string name, bits<9> sel, string chan> :
17346 +    Register <name> {
17347 +  // Map the channel letter to 0-3; anything other than X/Y/Z/W becomes 0.
17348 +  field bits<2> chan_encoding = !if(!eq(chan, "X"), 0,
17349 +                                !if(!eq(chan, "Y"), 1,
17350 +                                !if(!eq(chan, "Z"), 2,
17351 +                                !if(!eq(chan, "W"), 3, 0))));
17352 +  let HWEncoding{8-0}  = sel;
17353 +  let HWEncoding{10-9} = chan_encoding;
17354 +  let Namespace = "AMDGPU";
17355 +}
17356 +// Register built from subregisters indexed by sel_x, sel_y, sel_z and sel_w.
17357 +class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
17358 +    RegisterWithSubRegs<n, subregs> {
17359 +  let Namespace = "AMDGPU";
17360 +  let SubRegIndices = [sel_x, sel_y, sel_z, sel_w];
17361 +  let HWEncoding = encoding;
17362 +}
17363 +// Define T0-T127: four 32-bit channel registers and one XYZW register each.
17364 +foreach Index = 0-127 in {
17365 +  foreach Chan = [ "X", "Y", "Z", "W" ] in {
17366 +    // 32-bit Temporary Registers
17367 +    def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>;
17368 +  }
17369 +  // 128-bit Temporary Registers
17370 +  def T#Index#_XYZW : R600Reg_128 <"T"#Index#".XYZW",
17371 +                                   [!cast<Register>("T"#Index#"_X"),
17372 +                                    !cast<Register>("T"#Index#"_Y"),
17373 +                                    !cast<Register>("T"#Index#"_Z"),
17374 +                                    !cast<Register>("T"#Index#"_W")],
17375 +                                   Index>;
17376 +}
17377 +
17378 +// Array Base Register holding input in FS
17379 +foreach Index = 448-464 in {
17380 +  def ArrayBase#Index :  R600Reg<"ARRAY_BASE", Index>; // one shared asm name
17381 +}
17382 +
17383 +
17384 +// Special Registers
17385 +// NOTE(review): ONE/NEG_ONE (249) and HALF/NEG_HALF (252) share encodings.
17386 +def ZERO : R600Reg<"0.0", 248>;
17387 +def ONE : R600Reg<"1.0", 249>;
17388 +def NEG_ONE : R600Reg<"-1.0", 249>;
17389 +def ONE_INT : R600Reg<"1", 250>;
17390 +def HALF : R600Reg<"0.5", 252>;
17391 +def NEG_HALF : R600Reg<"-0.5", 252>;
17392 +def ALU_LITERAL_X : R600Reg<"literal.x", 253>;
17393 +def PV_X : R600Reg<"pv.x", 254>;
17394 +def PREDICATE_BIT : R600Reg<"PredicateBit", 0>;
17395 +def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
17396 +def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
17397 +def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>;
17398 +
17399 +def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
17400 +                          (add (sequence "ArrayBase%u", 448, 464))>;
17401 +// special registers for ALU src operands
17402 +// const buffer reference, SRCx_SEL contains index
17403 +def ALU_CONST : R600Reg<"CBuf", 0>;
17404 +// interpolation param reference, SRCx_SEL contains index
17405 +def ALU_PARAM : R600Reg<"Param", 0>;
17406 +
17407 +def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32,
17408 +                                   (add (sequence "T%u_X", 0, 127))>;
17409 +
17410 +def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32,
17411 +                                   (add (sequence "T%u_Y", 0, 127))>;
17412 +
17413 +def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32,
17414 +                                   (add (sequence "T%u_Z", 0, 127))>;
17415 +
17416 +def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32,
17417 +                                   (add (sequence "T%u_W", 0, 127))>;
17418 +
17419 +def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
17420 +                                   (interleave R600_TReg32_X, R600_TReg32_Y,
17421 +                                               R600_TReg32_Z, R600_TReg32_W)>;
17422 +
17423 +def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
17424 +    R600_TReg32,
17425 +    R600_ArrayBase,
17426 +    ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
17427 +    ALU_CONST, ALU_PARAM
17428 +    )>;
17429 +
17430 +def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
17431 +    PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>;
17432 +
17433 +def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add
17434 +    PREDICATE_BIT)>;
17435 +
17436 +def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
17437 +                                (add (sequence "T%u_XYZW", 0, 127))> {
17438 +  let CopyCost = -1;
17439 +}
17440 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600Schedule.td llvm-r600/lib/Target/R600/R600Schedule.td
17441 --- llvm-3.2.src/lib/Target/R600/R600Schedule.td        1970-01-01 01:00:00.000000000 +0100
17442 +++ llvm-r600/lib/Target/R600/R600Schedule.td   2013-01-25 19:43:57.470049720 +0100
17443 @@ -0,0 +1,36 @@
17444 +//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===//
17445 +//
17446 +//                     The LLVM Compiler Infrastructure
17447 +//
17448 +// This file is distributed under the University of Illinois Open Source
17449 +// License. See LICENSE.TXT for details.
17450 +//
17451 +//===----------------------------------------------------------------------===//
17452 +//
17453 +// R600 has a VLIW architecture.  On pre-cayman cards there are 5 instruction
17454 +// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS.  For cayman cards, the TRANS
17455 +// slot has been removed.
17456 +//
17457 +//===----------------------------------------------------------------------===//
17458 +
17459 +
17460 +def ALU_X : FuncUnit;
17461 +def ALU_Y : FuncUnit;
17462 +def ALU_Z : FuncUnit;
17463 +def ALU_W : FuncUnit;
17464 +def TRANS : FuncUnit;
17465 +
17466 +def AnyALU : InstrItinClass;
17467 +def VecALU : InstrItinClass;
17468 +def TransALU : InstrItinClass;
17469 +
17470 +def R600_EG_Itin : ProcessorItineraries < // args: functional units, bypasses, itinerary data
17471 +  [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL], // ALU_NULL is not defined in this file
17472 +  [],
17473 +  [
17474 +    InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>,
17475 +    InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>, // every vector slot, no TRANS
17476 +    InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>,
17477 +    InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]> // NullALU is not defined in this file
17478 +  ]
17479 +>;
17480 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIAnnotateControlFlow.cpp llvm-r600/lib/Target/R600/SIAnnotateControlFlow.cpp
17481 --- llvm-3.2.src/lib/Target/R600/SIAnnotateControlFlow.cpp      1970-01-01 01:00:00.000000000 +0100
17482 +++ llvm-r600/lib/Target/R600/SIAnnotateControlFlow.cpp 2013-01-25 19:43:57.470049720 +0100
17483 @@ -0,0 +1,330 @@
17484 +//===-- SIAnnotateControlFlow.cpp - Annotate the control flow -------------===//
17485 +//
17486 +//                     The LLVM Compiler Infrastructure
17487 +//
17488 +// This file is distributed under the University of Illinois Open Source
17489 +// License. See LICENSE.TXT for details.
17490 +//
17491 +//===----------------------------------------------------------------------===//
17492 +//
17493 +/// \file
17494 +/// Annotates the control flow with hardware specific intrinsics.
17495 +//
17496 +//===----------------------------------------------------------------------===//
17497 +
17498 +#include "AMDGPU.h"
17499 +
17500 +#include "llvm/Pass.h"
17501 +#include "llvm/Module.h"
17502 +#include "llvm/Analysis/Dominators.h"
17503 +#include "llvm/Transforms/Utils/BasicBlockUtils.h"
17504 +#include "llvm/ADT/DepthFirstIterator.h"
17505 +#include "llvm/Transforms/Utils/SSAUpdater.h"
17506 +
17507 +using namespace llvm;
17508 +
17509 +namespace {
17510 +
17511 +// Complex types used in this pass
17512 +typedef std::pair<BasicBlock *, Value *> StackEntry;
17513 +typedef SmallVector<StackEntry, 16> StackVector;
17514 +
17515 +// Intrinsic names the control flow is annotated with
17516 +static const char *IfIntrinsic = "llvm.SI.if";
17517 +static const char *ElseIntrinsic = "llvm.SI.else";
17518 +static const char *BreakIntrinsic = "llvm.SI.break";
17519 +static const char *IfBreakIntrinsic = "llvm.SI.if.break";
17520 +static const char *ElseBreakIntrinsic = "llvm.SI.else.break";
17521 +static const char *LoopIntrinsic = "llvm.SI.loop";
17522 +static const char *EndCfIntrinsic = "llvm.SI.end.cf";
17523 +
17524 +class SIAnnotateControlFlow : public FunctionPass {
17525 +
17526 +  static char ID;
17527 +
17528 +  Type *Boolean;
17529 +  Type *Void;
17530 +  Type *Int64;
17531 +  Type *ReturnStruct;
17532 +
17533 +  ConstantInt *BoolTrue;
17534 +  ConstantInt *BoolFalse;
17535 +  UndefValue *BoolUndef;
17536 +  Constant *Int64Zero;
17537 +
17538 +  Constant *If;
17539 +  Constant *Else;
17540 +  Constant *Break;
17541 +  Constant *IfBreak;
17542 +  Constant *ElseBreak;
17543 +  Constant *Loop;
17544 +  Constant *EndCf;
17545 +
17546 +  DominatorTree *DT;
17547 +  StackVector Stack;
17548 +  SSAUpdater PhiInserter;
17549 +
17550 +  bool isTopOfStack(BasicBlock *BB);
17551 +
17552 +  Value *popSaved();
17553 +
17554 +  void push(BasicBlock *BB, Value *Saved);
17555 +
17556 +  bool isElse(PHINode *Phi);
17557 +
17558 +  void eraseIfUnused(PHINode *Phi);
17559 +
17560 +  void openIf(BranchInst *Term);
17561 +
17562 +  void insertElse(BranchInst *Term);
17563 +
17564 +  void handleLoopCondition(Value *Cond);
17565 +
17566 +  void handleLoop(BranchInst *Term);
17567 +
17568 +  void closeControlFlow(BasicBlock *BB);
17569 +
17570 +public:
17571 +  SIAnnotateControlFlow():
17572 +    FunctionPass(ID) { }
17573 +
17574 +  virtual bool doInitialization(Module &M);
17575 +
17576 +  virtual bool runOnFunction(Function &F);
17577 +
17578 +  virtual const char *getPassName() const {
17579 +    return "SI annotate control flow";
17580 +  }
17581 +
17582 +  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
17583 +    AU.addRequired<DominatorTree>();
17584 +    AU.addPreserved<DominatorTree>();
17585 +    FunctionPass::getAnalysisUsage(AU);
17586 +  }
17587 +
17588 +};
17589 +
17590 +} // end anonymous namespace
17591 +
17592 +char SIAnnotateControlFlow::ID = 0;
17593 +
17594 +/// \brief Initialize all the types and constants used in the pass
17595 +bool SIAnnotateControlFlow::doInitialization(Module &M) {
17596 +  LLVMContext &Context = M.getContext();
17597 +
17598 +  Void = Type::getVoidTy(Context);
17599 +  Boolean = Type::getInt1Ty(Context);
17600 +  Int64 = Type::getInt64Ty(Context);
17601 +  ReturnStruct = StructType::get(Boolean, Int64, (Type *)0);
17602 +
17603 +  BoolTrue = ConstantInt::getTrue(Context);
17604 +  BoolFalse = ConstantInt::getFalse(Context);
17605 +  BoolUndef = UndefValue::get(Boolean);
17606 +  Int64Zero = ConstantInt::get(Int64, 0);
17607 +
17608 +  If = M.getOrInsertFunction(
17609 +    IfIntrinsic, ReturnStruct, Boolean, (Type *)0);
17610 +
17611 +  Else = M.getOrInsertFunction(
17612 +    ElseIntrinsic, ReturnStruct, Int64, (Type *)0);
17613 +
17614 +  Break = M.getOrInsertFunction(
17615 +    BreakIntrinsic, Int64, Int64, (Type *)0);
17616 +
17617 +  IfBreak = M.getOrInsertFunction(
17618 +    IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)0);
17619 +
17620 +  ElseBreak = M.getOrInsertFunction(
17621 +    ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)0);
17622 +
17623 +  Loop = M.getOrInsertFunction(
17624 +    LoopIntrinsic, Boolean, Int64, (Type *)0);
17625 +
17626 +  EndCf = M.getOrInsertFunction(
17627 +    EndCfIntrinsic, Void, Int64, (Type *)0);
17628 +
17629 +  return false;
17630 +}
17631 +
17632 +/// \brief Is BB the last block saved on the stack? False when the stack is empty.
17633 +bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) {
17634 +  return !Stack.empty() && Stack.back().first == BB; // back() must not be called on an empty stack
17635 +}
17636 +
17637 +/// \brief Pop the last saved value from the control flow stack
17638 +Value *SIAnnotateControlFlow::popSaved() {
17639 +  return Stack.pop_back_val().second;
17640 +}
17641 +
17642 +/// \brief Push a BB and saved value to the control flow stack
17643 +void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) {
17644 +  Stack.push_back(std::make_pair(BB, Saved));
17645 +}
17646 +
17647 +/// \brief Can the condition represented by this PHI node be treated like
17648 +/// an "Else" block?
17649 +bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
17650 +  BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock();
17651 +  for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
17652 +    if (Phi->getIncomingBlock(i) == IDom) {
17653 +      // The edge coming from the immediate dominator must carry 'true'
17654 +      if (Phi->getIncomingValue(i) != BoolTrue)
17655 +        return false;
17656 +
17657 +    } else {
17658 +      if (Phi->getIncomingValue(i) != BoolFalse)
17659 +        return false;
17660 +      // Every other incoming edge must carry 'false'
17661 +    }
17662 +  }
17663 +  return true;
17664 +}
17665 +
17666 +/// \brief Erase "Phi" from its parent block if it has no uses left
17667 +void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
17668 +  if (!Phi->hasNUsesOrMore(1))
17669 +    Phi->eraseFromParent();
17670 +}
17671 +
17672 +/// \brief Open a new "If" block by routing Term's condition through the If intrinsic
17673 +void SIAnnotateControlFlow::openIf(BranchInst *Term) {
17674 +  Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); // inserted before Term
17675 +  Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); // element 0: new branch condition
17676 +  push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); // element 1: saved value
17677 +}
17678 +
17679 +/// \brief Close the last "If" block and open a new "Else" block
17680 +void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
17681 +  Value *Ret = CallInst::Create(Else, popSaved(), "", Term); // consumes the value saved by the last push
17682 +  Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); // element 0: new branch condition
17683 +  push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); // element 1: saved value
17684 +}
17685 +
17686 +/// \brief Recursively handle the condition leading to a loop
17687 +void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) {
17688 +  if (PHINode *Phi = dyn_cast<PHINode>(Cond)) {
17689 +
17690 +    // Handle all non constant incoming values first
17691 +    for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
17692 +      Value *Incoming = Phi->getIncomingValue(i);
17693 +      if (isa<ConstantInt>(Incoming))
17694 +        continue;
17695 +
17696 +      Phi->setIncomingValue(i, BoolFalse);
17697 +      handleLoopCondition(Incoming);
17698 +    }
17699 +
17700 +    BasicBlock *Parent = Phi->getParent();
17701 +    BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
17702 +
17703 +    for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
17704 +
17705 +      Value *Incoming = Phi->getIncomingValue(i);
17706 +      if (Incoming != BoolTrue)
17707 +        continue;
17708 +
17709 +      BasicBlock *From = Phi->getIncomingBlock(i);
17710 +      if (From == IDom) {
17711 +        CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
17712 +        if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
17713 +          Value *Args[] = {
17714 +            OldEnd->getArgOperand(0),
17715 +            PhiInserter.GetValueAtEndOfBlock(Parent)
17716 +          };
17717 +          Value *Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
17718 +          PhiInserter.AddAvailableValue(Parent, Ret);
17719 +          continue;
17720 +        }
17721 +      }
17722 +
17723 +      TerminatorInst *Insert = From->getTerminator();
17724 +      Value *Arg = PhiInserter.GetValueAtEndOfBlock(From);
17725 +      Value *Ret = CallInst::Create(Break, Arg, "", Insert);
17726 +      PhiInserter.AddAvailableValue(From, Ret);
17727 +    }
17728 +    eraseIfUnused(Phi);
17729 +
17730 +  } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
17731 +    BasicBlock *Parent = Inst->getParent();
17732 +    TerminatorInst *Insert = Parent->getTerminator();
17733 +    Value *Args[] = { Cond, PhiInserter.GetValueAtEndOfBlock(Parent) };
17734 +    Value *Ret = CallInst::Create(IfBreak, Args, "", Insert);
17735 +    PhiInserter.AddAvailableValue(Parent, Ret);
17736 +
17737 +  } else {
17738 +    assert(0 && "Unhandled loop condition!");
17739 +  }
17740 +}
17741 +
17742 +/// \brief Handle a back edge (loop)
17743 +void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
17744 +  BasicBlock *Target = Term->getSuccessor(1);
17745 +  PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front());
17746 +
17747 +  PhiInserter.Initialize(Int64, "");
17748 +  PhiInserter.AddAvailableValue(Target, Broken);
17749 +
17750 +  Value *Cond = Term->getCondition();
17751 +  Term->setCondition(BoolTrue);
17752 +  handleLoopCondition(Cond);
17753 +
17754 +  BasicBlock *BB = Term->getParent();
17755 +  Value *Arg = PhiInserter.GetValueAtEndOfBlock(BB);
17756 +  for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
17757 +       PI != PE; ++PI) {
17758 +
17759 +    Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI);
17760 +  }
17761 +
17762 +  Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
17763 +  push(Term->getSuccessor(0), Arg);
17764 +}
17765 +
17766 +/// \brief Close the last opened control flow: pop the saved value and pass it to EndCf
17767 +void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
17768 +  CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); // call goes at BB's first insertion point
17769 +}
17770 +
17771 +/// \brief Annotate the control flow with intrinsics so the backend can
17772 +/// recognize if/then/else and loops. Always reports the function as changed.
17773 +bool SIAnnotateControlFlow::runOnFunction(Function &F) {
17774 +  DT = &getAnalysis<DominatorTree>();
17775 +  // Walk the blocks depth first, starting at the entry block
17776 +  for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
17777 +       E = df_end(&F.getEntryBlock()); I != E; ++I) {
17778 +
17779 +    BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator());
17780 +    // No conditional branch here: only close pending control flow, if any
17781 +    if (!Term || Term->isUnconditional()) {
17782 +      if (isTopOfStack(*I))
17783 +        closeControlFlow(*I);
17784 +      continue;
17785 +    }
17786 +    // Successor 1 was already visited by the traversal: handled as a loop
17787 +    if (I.nodeVisited(Term->getSuccessor(1))) {
17788 +      if (isTopOfStack(*I))
17789 +        closeControlFlow(*I);
17790 +      handleLoop(Term);
17791 +      continue;
17792 +    }
17793 +    // This block is the saved target: either turn it into an "Else" or close it
17794 +    if (isTopOfStack(*I)) {
17795 +      PHINode *Phi = dyn_cast<PHINode>(Term->getCondition());
17796 +      if (Phi && Phi->getParent() == *I && isElse(Phi)) {
17797 +        insertElse(Term);
17798 +        eraseIfUnused(Phi);
17799 +        continue;
17800 +      }
17801 +      closeControlFlow(*I);
17802 +    }
17803 +    openIf(Term);
17804 +  }
17805 +  // Every opened construct must have been closed by now
17806 +  assert(Stack.empty());
17807 +  return true;
17808 +}
17809 +
17810 +/// \brief Create the annotation pass
17811 +FunctionPass *llvm::createSIAnnotateControlFlowPass() {
17812 +  return new SIAnnotateControlFlow();
17813 +}
17814 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIAssignInterpRegs.cpp llvm-r600/lib/Target/R600/SIAssignInterpRegs.cpp
17815 --- llvm-3.2.src/lib/Target/R600/SIAssignInterpRegs.cpp 1970-01-01 01:00:00.000000000 +0100
17816 +++ llvm-r600/lib/Target/R600/SIAssignInterpRegs.cpp    2013-01-25 19:43:57.470049720 +0100
17817 @@ -0,0 +1,152 @@
17818 +//===-- SIAssignInterpRegs.cpp - Assign interpolation registers -----------===//
17819 +//
17820 +//                     The LLVM Compiler Infrastructure
17821 +//
17822 +// This file is distributed under the University of Illinois Open Source
17823 +// License. See LICENSE.TXT for details.
17824 +//
17825 +//===----------------------------------------------------------------------===//
17826 +//
17827 +/// \file
17828 +/// \brief This pass maps the pseudo interpolation registers to the correct physical
17829 +/// registers.
17830 +//
17831 +/// Prior to executing a fragment shader, the GPU loads interpolation
17832 +/// parameters into physical registers.  The specific physical register that each
17833 +/// interpolation parameter ends up in depends on the type of the interpolation
17834 +/// parameter as well as how many interpolation parameters are used by the
17835 +/// shader.
17836 +//
17837 +//===----------------------------------------------------------------------===//
17838 +
17839 +
17840 +
17841 +#include "AMDGPU.h"
17842 +#include "AMDIL.h"
17843 +#include "SIMachineFunctionInfo.h"
17844 +#include "llvm/CodeGen/MachineFunctionPass.h"
17845 +#include "llvm/CodeGen/MachineInstrBuilder.h"
17846 +#include "llvm/CodeGen/MachineRegisterInfo.h"
17847 +
17848 +using namespace llvm;
17849 +
17850 +namespace {
17851 +
17852 +class SIAssignInterpRegsPass : public MachineFunctionPass {
17853 +
17854 +private:
17855 +  static char ID;
17856 +  TargetMachine &TM;
17857 +
17858 +  void addLiveIn(MachineFunction * MF,  MachineRegisterInfo & MRI,
17859 +                 unsigned physReg, unsigned virtReg);
17860 +
17861 +public:
17862 +  SIAssignInterpRegsPass(TargetMachine &tm) :
17863 +    MachineFunctionPass(ID), TM(tm) { }
17864 +
17865 +  virtual bool runOnMachineFunction(MachineFunction &MF);
17866 +
17867 +  const char *getPassName() const { return "SI Assign intrpolation registers"; }
17868 +};
17869 +
17870 +} // End anonymous namespace
17871 +
17872 +char SIAssignInterpRegsPass::ID = 0;
17873 +
17874 +#define INTERP_VALUES 16
17875 +#define REQUIRED_VALUE_MAX_INDEX 7
17876 +
17877 +struct InterpInfo {
17878 +  bool Enabled;
17879 +  unsigned Regs[3];
17880 +  unsigned RegCount;
17881 +};
17882 +
17883 +
17884 +FunctionPass *llvm::createSIAssignInterpRegsPass(TargetMachine &tm) {
17885 +  return new SIAssignInterpRegsPass(tm);
17886 +}
17887 +
17888 +bool SIAssignInterpRegsPass::runOnMachineFunction(MachineFunction &MF) {
17889 +  // One entry per interpolation value: {Enabled, pseudo registers, register count}
17890 +  struct InterpInfo InterpUse[INTERP_VALUES] = {
17891 +    {false, {AMDGPU::PERSP_SAMPLE_I, AMDGPU::PERSP_SAMPLE_J}, 2},
17892 +    {false, {AMDGPU::PERSP_CENTER_I, AMDGPU::PERSP_CENTER_J}, 2},
17893 +    {false, {AMDGPU::PERSP_CENTROID_I, AMDGPU::PERSP_CENTROID_J}, 2},
17894 +    {false, {AMDGPU::PERSP_I_W, AMDGPU::PERSP_J_W, AMDGPU::PERSP_1_W}, 3},
17895 +    {false, {AMDGPU::LINEAR_SAMPLE_I, AMDGPU::LINEAR_SAMPLE_J}, 2},
17896 +    {false, {AMDGPU::LINEAR_CENTER_I, AMDGPU::LINEAR_CENTER_J}, 2},
17897 +    {false, {AMDGPU::LINEAR_CENTROID_I, AMDGPU::LINEAR_CENTROID_J}, 2},
17898 +    {false, {AMDGPU::LINE_STIPPLE_TEX_COORD}, 1},
17899 +    {false, {AMDGPU::POS_X_FLOAT}, 1},
17900 +    {false, {AMDGPU::POS_Y_FLOAT}, 1},
17901 +    {false, {AMDGPU::POS_Z_FLOAT}, 1},
17902 +    {false, {AMDGPU::POS_W_FLOAT}, 1},
17903 +    {false, {AMDGPU::FRONT_FACE}, 1},
17904 +    {false, {AMDGPU::ANCILLARY}, 1},
17905 +    {false, {AMDGPU::SAMPLE_COVERAGE}, 1},
17906 +    {false, {AMDGPU::POS_FIXED_PT}, 1}
17907 +  };
17908 +
17909 +  SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
17910 +  // This pass is only needed for pixel shaders.
17911 +  if (MFI->ShaderType != ShaderType::PIXEL) {
17912 +    return false;
17913 +  }
17914 +  MachineRegisterInfo &MRI = MF.getRegInfo();
17915 +  bool ForceEnable = true; // cleared once any entry up to REQUIRED_VALUE_MAX_INDEX is in use
17916 +
17917 +  // First pass, mark the interpolation values that are used.
17918 +  for (unsigned InterpIdx = 0; InterpIdx < INTERP_VALUES; InterpIdx++) {
17919 +    for (unsigned RegIdx = 0; RegIdx < InterpUse[InterpIdx].RegCount;
17920 +                                                               RegIdx++) {
17921 +      InterpUse[InterpIdx].Enabled = InterpUse[InterpIdx].Enabled ||
17922 +                            !MRI.use_empty(InterpUse[InterpIdx].Regs[RegIdx]);
17923 +      if (InterpUse[InterpIdx].Enabled &&
17924 +          InterpIdx <= REQUIRED_VALUE_MAX_INDEX) {
17925 +        ForceEnable = false;
17926 +      }
17927 +    }
17928 +  }
17929 +
17930 +  // At least one interpolation mode must be enabled or else the GPU will hang.
17931 +  if (ForceEnable) {
17932 +    InterpUse[0].Enabled = true;
17933 +  }
17934 +
17935 +  unsigned UsedVgprs = 0; // index of the next register taken from VReg_32RegClass
17936 +
17937 +  // Second pass, replace with VGPRs.
17938 +  for (unsigned InterpIdx = 0; InterpIdx < INTERP_VALUES; InterpIdx++) {
17939 +    if (!InterpUse[InterpIdx].Enabled) {
17940 +      continue;
17941 +    }
17942 +    MFI->SPIPSInputAddr |= (1 << InterpIdx); // record the enabled entry as bit InterpIdx
17943 +
17944 +    for (unsigned RegIdx = 0; RegIdx < InterpUse[InterpIdx].RegCount;
17945 +                                                  RegIdx++, UsedVgprs++) {
17946 +      unsigned NewReg = AMDGPU::VReg_32RegClass.getRegister(UsedVgprs);
17947 +      unsigned VirtReg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
17948 +      MRI.replaceRegWith(InterpUse[InterpIdx].Regs[RegIdx], VirtReg);
17949 +      addLiveIn(&MF, MRI, NewReg, VirtReg);
17950 +    }
17951 +  }
17952 +  // NOTE(review): false is returned although registers were rewritten above -- confirm intended
17953 +  return false;
17954 +}
17955 +
17956 +void SIAssignInterpRegsPass::addLiveIn(MachineFunction * MF, // makes physReg live-in, feeding virtReg
17957 +                           MachineRegisterInfo & MRI,
17958 +                           unsigned physReg, unsigned virtReg) {
17959 +    const TargetInstrInfo * TII = TM.getInstrInfo();
17960 +    if (!MRI.isLiveIn(physReg)) { // first request: register the pair and copy at the top of the first block
17961 +      MRI.addLiveIn(physReg, virtReg);
17962 +      MF->front().addLiveIn(physReg);
17963 +      BuildMI(MF->front(), MF->front().begin(), DebugLoc(),
17964 +              TII->get(TargetOpcode::COPY), virtReg)
17965 +                .addReg(physReg);
17966 +    } else { // already live-in: reuse the virtual register recorded for physReg
17967 +      MRI.replaceRegWith(virtReg, MRI.getLiveInVirtReg(physReg));
17968 +    }
17969 +}
17970 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInsertWaits.cpp llvm-r600/lib/Target/R600/SIInsertWaits.cpp
17971 --- llvm-3.2.src/lib/Target/R600/SIInsertWaits.cpp      1970-01-01 01:00:00.000000000 +0100
17972 +++ llvm-r600/lib/Target/R600/SIInsertWaits.cpp 2013-01-25 19:43:57.473383054 +0100
17973 @@ -0,0 +1,353 @@
17974 +//===-- SIInsertWaits.cpp - Insert wait instructions for memory access ----===//
17975 +//
17976 +//                     The LLVM Compiler Infrastructure
17977 +//
17978 +// This file is distributed under the University of Illinois Open Source
17979 +// License. See LICENSE.TXT for details.
17980 +//
17981 +//===----------------------------------------------------------------------===//
17982 +//
17983 +/// \file
17984 +/// \brief Insert wait instructions for memory reads and writes.
17985 +///
17986 +/// Memory reads and writes are issued asynchronously, so we need to insert
17987 +/// S_WAITCNT instructions when we want to access any of their results or
17988 +/// overwrite any register that's used asynchronously.
17989 +//
17990 +//===----------------------------------------------------------------------===//
17991 +
17992 +#include "AMDGPU.h"
17993 +#include "SIInstrInfo.h"
17994 +#include "SIMachineFunctionInfo.h"
17995 +#include "llvm/CodeGen/MachineFunction.h"
17996 +#include "llvm/CodeGen/MachineFunctionPass.h"
17997 +#include "llvm/CodeGen/MachineInstrBuilder.h"
17998 +#include "llvm/CodeGen/MachineRegisterInfo.h"
17999 +
18000 +using namespace llvm;
18001 +
18002 +namespace {
18003 +
18004 +/// \brief One variable for each of the hardware counters, by name or by index
18005 +typedef union {
18006 +  struct {
18007 +    unsigned VM; // overlays Array[0]
18008 +    unsigned EXP; // overlays Array[1]
18009 +    unsigned LGKM; // overlays Array[2]
18010 +  } Named;
18011 +  unsigned Array[3];
18012 +  // Array lets the pass loop over all three counters uniformly
18013 +} Counters;
18014 +
18015 +typedef Counters RegCounters[512];
18016 +typedef std::pair<unsigned, unsigned> RegInterval;
18017 +
18018 +class SIInsertWaits : public MachineFunctionPass {
18019 +
18020 +private:
18021 +  static char ID;
18022 +  const SIInstrInfo *TII;
18023 +  const SIRegisterInfo &TRI;
18024 +  const MachineRegisterInfo *MRI;
18025 +
18026 +  /// \brief Constant hardware limits
18027 +  static const Counters WaitCounts;
18028 +
18029 +  /// \brief Constant zero value
18030 +  static const Counters ZeroCounts;
18031 +
18032 +  /// \brief Counter values we have already waited on.
18033 +  Counters WaitedOn;
18034 +
18035 +  /// \brief Counter values for last instruction issued.
18036 +  Counters LastIssued;
18037 +
18038 +  /// \brief Registers used by async instructions.
18039 +  RegCounters UsedRegs;
18040 +
18041 +  /// \brief Registers defined by async instructions.
18042 +  RegCounters DefinedRegs;
18043 +
18044 +  /// \brief Different export instruction types seen since last wait.
18045 +  unsigned ExpInstrTypesSeen;
18046 +
18047 +  /// \brief Get increment/decrement amount for this instruction.
18048 +  Counters getHwCounts(MachineInstr &MI);
18049 +
18050 +  /// \brief Is operand relevant for async execution?
18051 +  bool isOpRelevant(MachineOperand &Op);
18052 +
18053 +  /// \brief Get register interval an operand affects.
18054 +  RegInterval getRegInterval(MachineOperand &Op);
18055 +
18056 +  /// \brief Handle instructions async components
18057 +  void pushInstruction(MachineInstr &MI);
18058 +
18059 +  /// \brief Insert the actual wait instruction
18060 +  bool insertWait(MachineBasicBlock &MBB,
18061 +                  MachineBasicBlock::iterator I,
18062 +                  const Counters &Counts);
18063 +
18064 +  /// \brief Resolve all operand dependencies to counter requirements
18065 +  Counters handleOperands(MachineInstr &MI);
18066 +
18067 +public:
18068 +  SIInsertWaits(TargetMachine &tm) :
18069 +    MachineFunctionPass(ID),
18070 +    TII(static_cast<const SIInstrInfo*>(tm.getInstrInfo())),
18071 +    TRI(TII->getRegisterInfo()) { }
18072 +
18073 +  virtual bool runOnMachineFunction(MachineFunction &MF);
18074 +
18075 +  const char *getPassName() const {
18076 +    return "SI insert wait  instructions";
18077 +  }
18078 +
18079 +};
18080 +
18081 +} // End anonymous namespace
18082 +
18083 +char SIInsertWaits::ID = 0;
18084 +
18085 +const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };
18086 +const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
18087 +
18088 +FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
18089 +  return new SIInsertWaits(tm);
18090 +}
18091 +
18092 +Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
18093 +
18094 +  uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags;
18095 +  Counters Result;
18096 +
18097 +  Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
18098 +
18099 +  // Only consider stores or EXP for EXP_CNT
18100 +  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
18101 +      (MI.getOpcode() == AMDGPU::EXP || !MI.getDesc().mayStore()));
18102 +
18103 +  // LGKM may uses larger values
18104 +  if (TSFlags & SIInstrFlags::LGKM_CNT) {
18105 +
18106 +    MachineOperand &Op = MI.getOperand(0);
18107 +    assert(Op.isReg() && "First LGKM operand must be a register!");
18108 +
18109 +    unsigned Reg = Op.getReg();
18110 +    unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
18111 +    Result.Named.LGKM = Size > 4 ? 2 : 1;
18112 +
18113 +  } else {
18114 +    Result.Named.LGKM = 0;
18115 +  }
18116 +
18117 +  return Result;
18118 +}
18119 +
18120 +bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
18121 +  // Decides whether pushInstruction has to track Op's registers
18122 +  // Constants are always irrelevant
18123 +  if (!Op.isReg())
18124 +    return false;
18125 +
18126 +  // Defines are always relevant
18127 +  if (Op.isDef())
18128 +    return true;
18129 +
18130 +  // For exports all registers are relevant
18131 +  MachineInstr &MI = *Op.getParent();
18132 +  if (MI.getOpcode() == AMDGPU::EXP)
18133 +    return true;
18134 +
18135 +  // For stores the stored value is also relevant
18136 +  if (!MI.getDesc().mayStore())
18137 +    return false;
18138 +
18139 +  for (MachineInstr::mop_iterator I = MI.operands_begin(),
18140 +       E = MI.operands_end(); I != E; ++I) {
18141 +    // Returns at the first register use: Op is relevant only if identical to it
18142 +    if (I->isReg() && I->isUse())
18143 +      return Op.isIdenticalTo(*I);
18144 +  }
18145 +
18146 +  return false;
18147 +}
18148 +
18149 +RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
18150 +  // Non-register operands yield the empty interval (0, 0)
18151 +  if (!Op.isReg())
18152 +    return std::make_pair(0, 0);
18153 +
18154 +  unsigned Reg = Op.getReg();
18155 +  unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
18156 +  // presumably Size is in bytes, so Size / 4 counts 32-bit registers -- TODO confirm
18157 +  assert(Size >= 4);
18158 +
18159 +  RegInterval Result;
18160 +  Result.first = TRI.getEncodingValue(Reg);
18161 +  Result.second = Result.first + Size / 4; // callers iterate first <= j < second
18162 +
18163 +  return Result;
18164 +}
18165 +
18166 +void SIInsertWaits::pushInstruction(MachineInstr &MI) {
18167 +  // Accounts for MI in LastIssued and records which registers it touches
18168 +  // Get the hardware counter increments and sum them up
18169 +  Counters Increment = getHwCounts(MI);
18170 +  unsigned Sum = 0;
18171 +
18172 +  for (unsigned i = 0; i < 3; ++i) {
18173 +    LastIssued.Array[i] += Increment.Array[i];
18174 +    Sum += Increment.Array[i];
18175 +  }
18176 +
18177 +  // If we don't increase anything then that's it
18178 +  if (Sum == 0)
18179 +    return;
18180 +
18181 +  // Remember which export instructions we have seen (bit 1: EXP, bit 2: anything else)
18182 +  if (Increment.Named.EXP) {
18183 +    ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2;
18184 +  }
18185 +
18186 +  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
18187 +
18188 +    MachineOperand &Op = MI.getOperand(i);
18189 +    if (!isOpRelevant(Op))
18190 +      continue;
18191 +
18192 +    RegInterval Interval = getRegInterval(Op);
18193 +    for (unsigned j = Interval.first; j < Interval.second; ++j) {
18194 +      // j indexes the 512-entry RegCounters tables
18195 +      // Remember which registers we define
18196 +      if (Op.isDef())
18197 +        DefinedRegs[j] = LastIssued;
18198 +
18199 +      // and which one we are using
18200 +      if (Op.isUse())
18201 +        UsedRegs[j] = LastIssued;
18202 +    }
18203 +  }
18204 +}
18205 +
18206 +bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
18207 +                               MachineBasicBlock::iterator I,
18208 +                               const Counters &Required) {
18209 +  // Emits S_WAITCNT before I when Required exceeds WaitedOn; true if emitted.
18210 +  // End of program? No need to wait on anything
18211 +  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
18212 +    return false;
18213 +
18214 +  // Figure out if the async instructions execute in order
18215 +  bool Ordered[3];
18216 +
18217 +  // VM_CNT is always ordered
18218 +  Ordered[0] = true;
18219 +  // NOTE(review): next test looks inverted relative to its comment; confirm.
18220 +  // EXP_CNT is unordered if we have both EXP & VM-writes
18221 +  Ordered[1] = ExpInstrTypesSeen == 3;
18222 +
18223 +  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
18224 +  Ordered[2] = false;
18225 +
18226 +  // The values we are going to put into the S_WAITCNT instruction
18227 +  Counters Counts = WaitCounts;
18228 +
18229 +  // Do we really need to wait?
18230 +  bool NeedWait = false;
18231 +
18232 +  for (unsigned i = 0; i < 3; ++i) {
18233 +    // Nothing to do for a counter whose requirement was already waited on.
18234 +    if (Required.Array[i] <= WaitedOn.Array[i])
18235 +      continue;
18236 +
18237 +    NeedWait = true;
18238 +
18239 +    if (Ordered[i]) {
18240 +      unsigned Value = LastIssued.Array[i] - Required.Array[i];
18241 +
18242 +      // adjust the value to the real hardware possibilities
18243 +      Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);
18244 +
18245 +    } else
18246 +      Counts.Array[i] = 0;
18247 +
18248 +    // Remember on what we have waited on
18249 +    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
18250 +  }
18251 +
18252 +  if (!NeedWait)
18253 +    return false;
18254 +
18255 +  // Reset EXP_CNT instruction types
18256 +  if (Counts.Named.EXP == 0)
18257 +    ExpInstrTypesSeen = 0;
18258 +
18259 +  // Build the wait instruction: VM in bits 3:0, EXP in 6:4, LGKM in 10:8
18260 +  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
18261 +          .addImm((Counts.Named.VM & 0xF) |
18262 +                  ((Counts.Named.EXP & 0x7) << 4) |
18263 +                  ((Counts.Named.LGKM & 0x7) << 8));
18264 +
18265 +  return true;
18266 +}
18267 +
18268 +/// \brief Element-wise maximum of \p Src into \p Dst (handleOperands helper).
18269 +static void increaseCounters(Counters &Dst, const Counters &Src) {
18270 +  for (unsigned Idx = 0; Idx != 3; ++Idx)
18271 +    if (Dst.Array[Idx] < Src.Array[Idx])
18272 +      Dst.Array[Idx] = Src.Array[Idx];
18273 +}
18274 +
18275 +Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
18276 +
18277 +  Counters Result = ZeroCounts;
18278 +
18279 +  // For each register affected by this
18280 +  // instruction increase the result sequence
18281 +  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
18282 +
18283 +    MachineOperand &Op = MI.getOperand(i);
18284 +    RegInterval Interval = getRegInterval(Op);
18285 +    for (unsigned j = Interval.first; j < Interval.second; ++j) {
18286 +
18287 +      if (Op.isDef())
18288 +        increaseCounters(Result, UsedRegs[j]);
18289 +
18290 +      if (Op.isUse())
18291 +        increaseCounters(Result, DefinedRegs[j]);
18292 +    }
18293 +  }
18294 +
18295 +  return Result;
18296 +}
18297 +
18298 +bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
18299 +
18300 +  bool Changes = false;
18301 +
18302 +  MRI = &MF.getRegInfo();
18303 +
18304 +  WaitedOn = ZeroCounts;
18305 +  LastIssued = ZeroCounts;
18306 +
18307 +  memset(&UsedRegs, 0, sizeof(UsedRegs));
18308 +  memset(&DefinedRegs, 0, sizeof(DefinedRegs));
18309 +
18310 +  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
18311 +       BI != BE; ++BI) {
18312 +
18313 +    MachineBasicBlock &MBB = *BI;
18314 +    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
18315 +         I != E; ++I) {
18316 +
18317 +      Changes |= insertWait(MBB, I, handleOperands(*I));
18318 +      pushInstruction(*I);
18319 +    }
18320 +
18321 +    // Wait for everything at the end of the MBB
18322 +    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
18323 +  }
18324 +
18325 +  return Changes;
18326 +}
18327 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInstrFormats.td llvm-r600/lib/Target/R600/SIInstrFormats.td
18328 --- llvm-3.2.src/lib/Target/R600/SIInstrFormats.td      1970-01-01 01:00:00.000000000 +0100
18329 +++ llvm-r600/lib/Target/R600/SIInstrFormats.td 2013-01-25 19:43:57.473383054 +0100
18330 @@ -0,0 +1,146 @@
18331 +//===-- SIInstrFormats.td - SI Instruction Formats ------------------------===//
18332 +//
18333 +//                     The LLVM Compiler Infrastructure
18334 +//
18335 +// This file is distributed under the University of Illinois Open Source
18336 +// License. See LICENSE.TXT for details.
18337 +//
18338 +//===----------------------------------------------------------------------===//
18339 +//
18340 +// SI Instruction format definitions.
18341 +//
18342 +// Instructions with _32 take 32-bit operands.
18343 +// Instructions with _64 take 64-bit operands.
18344 +//
18345 +// VOP_* instructions can use either a 32-bit or 64-bit encoding.  The 32-bit
18346 +// encoding is the standard encoding, but instructions that make use of
18347 +// any of the instruction modifiers must use the 64-bit encoding.
18348 +//
18349 +// Instructions with _e32 use the 32-bit encoding.
18350 +// Instructions with _e64 use the 64-bit encoding.
18351 +//
18352 +//===----------------------------------------------------------------------===//
18353 +// VOP3b with two register sources; the other inputs are InstFlag operands.
18354 +class VOP3b_2IN <bits<9> op, string opName, RegisterClass dstClass,
18355 +                 RegisterClass src0Class, RegisterClass src1Class,
18356 +                 list<dag> pattern>
18357 +  : VOP3b <op, (outs dstClass:$vdst),
18358 +               (ins src0Class:$src0, src1Class:$src1, InstFlag:$src2, InstFlag:$sdst,
18359 +                    InstFlag:$omod, InstFlag:$neg),
18360 +           opName, pattern
18361 +>;
18362 +
18363 +// VOP3b_2IN producing an SReg_1 from an AllReg_32 and a VReg_32 source.
18364 +class VOP3_1_32 <bits<9> op, string opName, list<dag> pattern>
18365 +  : VOP3b_2IN <op, opName, SReg_1, AllReg_32, VReg_32, pattern>;
18366 +// VOP3 on 32-bit registers: three register sources and four i32 immediates.
18367 +class VOP3_32 <bits<9> op, string opName, list<dag> pattern>
18368 +  : VOP3 <op, (outs VReg_32:$dst), (ins AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2, i32imm:$src3, i32imm:$src4, i32imm:$src5, i32imm:$src6), opName, pattern>;
18369 +// VOP3 on 64-bit registers: three register sources and four i32 immediates.
18370 +class VOP3_64 <bits<9> op, string opName, list<dag> pattern>
18371 +  : VOP3 <op, (outs VReg_64:$dst), (ins AllReg_64:$src0, VReg_64:$src1, VReg_64:$src2, i32imm:$src3, i32imm:$src4, i32imm:$src5, i32imm:$src6), opName, pattern>;
18372 +
18373 +// SOP1/SOP2 wrappers; the suffix names the SReg class of $dst and the sources.
18374 +class SOP1_32 <bits<8> op, string opName, list<dag> pattern>
18375 +  : SOP1 <op, (outs SReg_32:$dst), (ins SReg_32:$src0), opName, pattern>;
18376 +
18377 +class SOP1_64 <bits<8> op, string opName, list<dag> pattern>
18378 +  : SOP1 <op, (outs SReg_64:$dst), (ins SReg_64:$src0), opName, pattern>;
18379 +
18380 +class SOP2_32 <bits<7> op, string opName, list<dag> pattern>
18381 +  : SOP2 <op, (outs SReg_32:$dst), (ins SReg_32:$src0, SReg_32:$src1), opName, pattern>;
18382 +
18383 +class SOP2_64 <bits<7> op, string opName, list<dag> pattern>
18384 +  : SOP2 <op, (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), opName, pattern>;
18385 +// SOP2 on two SReg_64 sources whose SReg_1 result is named $vcc.
18386 +class SOP2_VCC <bits<7> op, string opName, list<dag> pattern>
18387 +  : SOP2 <op, (outs SReg_1:$vcc), (ins SReg_64:$src0, SReg_64:$src1), opName, pattern>;
18388 +// VOP1 with one arc-class source ($src0) and a vrc-class destination.
18389 +class VOP1_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
18390 +                   string opName, list<dag> pattern> :
18391 +  VOP1 <
18392 +    op, (outs vrc:$dst), (ins arc:$src0), opName, pattern
18393 +  >;
18394 +// _e32 is the VOP1 form; _e64 is VOP3 with opcode {1, 1, op{6-0}}, no pattern.
18395 +multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern> {
18396 +  def _e32: VOP1_Helper <op, VReg_32, AllReg_32, opName, pattern>;
18397 +  def _e64 : VOP3_32 <{1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
18398 +                      opName, []
18399 +  >;
18400 +}
18401 +// 64-bit register counterpart of VOP1_32 (VReg_64 / AllReg_64 / VOP3_64).
18402 +multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern> {
18403 +
18404 +  def _e32 : VOP1_Helper <op, VReg_64, AllReg_64, opName, pattern>;
18405 +
18406 +  def _e64 : VOP3_64 <
18407 +    {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
18408 +    opName, []
18409 +  >;
18410 +}
18411 +// VOP2 with an arc-class $src0, a vrc-class $src1 and a vrc-class destination.
18412 +class VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc,
18413 +                   string opName, list<dag> pattern> :
18414 +  VOP2 <
18415 +    op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1), opName, pattern
18416 +  >;
18417 +// _e32 is the VOP2 form; _e64 is VOP3 with opcode {1, 0, 0, op{5-0}}, no pattern.
18418 +multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern> {
18419 +
18420 +  def _e32 : VOP2_Helper <op, VReg_32, AllReg_32, opName, pattern>;
18421 +
18422 +  def _e64 : VOP3_32 <{1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
18423 +                      opName, []
18424 +  >;
18425 +}
18426 +// 64-bit register counterpart of VOP2_32 (VReg_64 / AllReg_64 / VOP3_64).
18427 +multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern> {
18428 +  def _e32: VOP2_Helper <op, VReg_64, AllReg_64, opName, pattern>;
18429 +
18430 +  def _e64 : VOP3_64 <
18431 +    {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
18432 +    opName, []
18433 +  >;
18434 +}
18435 +// SOPK wrappers: SReg_32 or SReg_64 destination with one i16 immediate source.
18436 +class SOPK_32 <bits<5> op, string opName, list<dag> pattern>
18437 +  : SOPK <op, (outs SReg_32:$dst), (ins i16imm:$src0), opName, pattern>;
18438 +
18439 +class SOPK_64 <bits<5> op, string opName, list<dag> pattern>
18440 +  : SOPK <op, (outs SReg_64:$dst), (ins i16imm:$src0), opName, pattern>;
18441 +// VOPC compare of an arc-class $src0 with a vrc-class $src1; no outs passed.
18442 +class VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
18443 +                 string opName, list<dag> pattern> :
18444 +  VOPC <
18445 +    op, (ins arc:$src0, vrc:$src1), opName, pattern
18446 +  >;
18447 +// NOTE(review): op is bits<9> here but bits<8> in VOPC_64; confirm intent.
18448 +multiclass VOPC_32 <bits<9> op, string opName, list<dag> pattern> {
18449 +  // _e32 uses the low 8 bits of op; _e64 passes all 9 bits and the pattern.
18450 +  def _e32 : VOPC_Helper <
18451 +    {op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
18452 +    VReg_32, AllReg_32, opName, pattern
18453 +  >;
18454 +
18455 +  def _e64 : VOP3_1_32 <
18456 +    op,
18457 +    opName, pattern
18458 +  >;
18459 +}
18460 +
18461 +multiclass VOPC_64 <bits<8> op, string opName, list<dag> pattern> {
18462 +  // _e64 is VOP3_64 with opcode {0, op{7-0}} and an empty pattern.
18463 +  def _e32 : VOPC_Helper <op, VReg_64, AllReg_64, opName, pattern>;
18464 +
18465 +  def _e64 : VOP3_64 <
18466 +    {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
18467 +    opName, []
18468 +  >;
18469 +}
18470 +// SOPC compares of two SReg_32 / SReg_64 sources with an SCCReg result.
18471 +class SOPC_32 <bits<7> op, string opName, list<dag> pattern>
18472 +  : SOPC <op, (outs SCCReg:$dst), (ins SReg_32:$src0, SReg_32:$src1), opName, pattern>;
18473 +
18474 +class SOPC_64 <bits<7> op, string opName, list<dag> pattern>
18475 +  : SOPC <op, (outs SCCReg:$dst), (ins SReg_64:$src0, SReg_64:$src1), opName, pattern>;
18476 +
18477 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInstrInfo.cpp llvm-r600/lib/Target/R600/SIInstrInfo.cpp
18478 --- llvm-3.2.src/lib/Target/R600/SIInstrInfo.cpp        1970-01-01 01:00:00.000000000 +0100
18479 +++ llvm-r600/lib/Target/R600/SIInstrInfo.cpp   2013-01-25 19:43:57.473383054 +0100
18480 @@ -0,0 +1,89 @@
18481 +//===-- SIInstrInfo.cpp - SI Instruction Information  ---------------------===//
18482 +//
18483 +//                     The LLVM Compiler Infrastructure
18484 +//
18485 +// This file is distributed under the University of Illinois Open Source
18486 +// License. See LICENSE.TXT for details.
18487 +//
18488 +//===----------------------------------------------------------------------===//
18489 +//
18490 +/// \file
18491 +/// \brief SI Implementation of TargetInstrInfo.
18492 +//
18493 +//===----------------------------------------------------------------------===//
18494 +
18495 +
18496 +#include "SIInstrInfo.h"
18497 +#include "AMDGPUTargetMachine.h"
18498 +#include "llvm/CodeGen/MachineInstrBuilder.h"
18499 +#include "llvm/CodeGen/MachineRegisterInfo.h"
18500 +#include "llvm/MC/MCInstrDesc.h"
18501 +
18502 +#include <stdio.h>
18503 +
18504 +using namespace llvm;
18505 +
18506 +// Forwards tm to the AMDGPU base and builds RI from tm and this object.
18507 +SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm)
18508 +    : AMDGPUInstrInfo(tm), RI(tm, *this) {
18509 +}
18510 +
18511 +/// \returns the SIRegisterInfo instance embedded in this object.
18512 +const SIRegisterInfo &
18513 +SIInstrInfo::getRegisterInfo() const { return RI; }
18514 +
18515 +void
18516 +SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
18517 +                           MachineBasicBlock::iterator MI, DebugLoc DL,
18518 +                           unsigned DestReg, unsigned SrcReg,
18519 +                           bool KillSrc) const {
18520 +  // If we are trying to copy to or from SCC, there is a bug somewhere else in
18521 +  // the backend.  While it may be theoretically possible to do this, it should
18522 +  // never be necessary.
18523 +  assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);
18524 +
18525 +  if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
18526 +    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
18527 +    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
18528 +            .addReg(SrcReg, getKillRegState(KillSrc));
18529 +  } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) {
18530 +    assert(AMDGPU::VReg_32RegClass.contains(SrcReg) ||
18531 +           AMDGPU::SReg_32RegClass.contains(SrcReg));
18532 +    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
18533 +            .addReg(SrcReg, getKillRegState(KillSrc));
18534 +  } else {
18535 +    assert(AMDGPU::SReg_32RegClass.contains(DestReg));
18536 +    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
18537 +    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
18538 +            .addReg(SrcReg, getKillRegState(KillSrc));
18539 +  }
18540 +}
18541 +
18542 +MachineInstr * SIInstrInfo::getMovImmInstr(MachineFunction *MF, unsigned DstReg,
18543 +                                           int64_t Imm) const {
18544 +  // Create the V_MOV_IMM_I32 (it is not inserted into a block here), then
18545 +  // attach its operands: the defined destination first, the immediate second.
18546 +  MachineInstr *MovImm =
18547 +      MF->CreateMachineInstr(get(AMDGPU::V_MOV_IMM_I32), DebugLoc());
18548 +  MachineInstrBuilder(MovImm).addReg(DstReg, RegState::Define).addImm(Imm);
18549 +  return MovImm;
18550 +}
18551 +
18552 +bool SIInstrInfo::isMov(unsigned Opcode) const {
18553 +  // Reports whether Opcode is one of the seven move opcodes listed here.
18554 +  // Scalar register moves.
18555 +  if (Opcode == AMDGPU::S_MOV_B32 || Opcode == AMDGPU::S_MOV_B64)
18556 +    return true;
18557 +  // Vector register moves, _e32 and _e64 variants.
18558 +  if (Opcode == AMDGPU::V_MOV_B32_e32 || Opcode == AMDGPU::V_MOV_B32_e64)
18559 +    return true;
18560 +  // Moves of an immediate.
18561 +  return Opcode == AMDGPU::V_MOV_IMM_F32 ||
18562 +         Opcode == AMDGPU::V_MOV_IMM_I32 ||
18563 +         Opcode == AMDGPU::S_MOV_IMM_I32;
18564 +}
18565 +
18566 +bool
18567 +SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
18568 +  return !(RC == &AMDGPU::EXECRegRegClass); // false only for EXECReg
18569 +}
18570 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInstrInfo.h llvm-r600/lib/Target/R600/SIInstrInfo.h
18571 --- llvm-3.2.src/lib/Target/R600/SIInstrInfo.h  1970-01-01 01:00:00.000000000 +0100
18572 +++ llvm-r600/lib/Target/R600/SIInstrInfo.h     2013-01-25 19:43:57.476716387 +0100
18573 @@ -0,0 +1,64 @@
18574 +//===-- SIInstrInfo.h - SI Instruction Info Interface ---------------------===//
18575 +//
18576 +//                     The LLVM Compiler Infrastructure
18577 +//
18578 +// This file is distributed under the University of Illinois Open Source
18579 +// License. See LICENSE.TXT for details.
18580 +//
18581 +//===----------------------------------------------------------------------===//
18582 +//
18583 +/// \file
18584 +/// \brief Interface definition for SIInstrInfo.
18585 +//
18586 +//===----------------------------------------------------------------------===//
18587 +
18588 +
18589 +#ifndef SIINSTRINFO_H
18590 +#define SIINSTRINFO_H
18591 +
18592 +#include "AMDGPUInstrInfo.h"
18593 +#include "SIRegisterInfo.h"
18594 +
18595 +namespace llvm {
18596 +
18597 +class SIInstrInfo : public AMDGPUInstrInfo {
18598 +private:
18599 +  const SIRegisterInfo RI;
18600 +
18601 +public:
18602 +  explicit SIInstrInfo(AMDGPUTargetMachine &tm);
18603 +  /// \returns the SIRegisterInfo owned by this object.
18604 +  const SIRegisterInfo &getRegisterInfo() const;
18605 +  /// Emits S_MOV_B64, V_MOV_B32_e32 or S_MOV_B32 copying SrcReg to DestReg.
18606 +  virtual void copyPhysReg(MachineBasicBlock &MBB,
18607 +                           MachineBasicBlock::iterator MI, DebugLoc DL,
18608 +                           unsigned DestReg, unsigned SrcReg,
18609 +                           bool KillSrc) const;
18610 +  // NOTE(review): the getEncoding* methods have no body in SIInstrInfo.cpp.
18611 +  /// \returns the encoding type of this instruction.
18612 +  unsigned getEncodingType(const MachineInstr &MI) const;
18613 +
18614 +  /// \returns the size of this instruction's encoding in number of bytes.
18615 +  unsigned getEncodingBytes(const MachineInstr &MI) const;
18616 +  /// \returns a new V_MOV_IMM_I32 that defines DstReg from Imm.
18617 +  virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg,
18618 +                                        int64_t Imm) const;
18619 +  /// Placeholder that asserts "Implement" and otherwise returns 0.
18620 +  virtual unsigned getIEQOpcode() const { assert(!"Implement"); return 0;}
18621 +  virtual bool isMov(unsigned Opcode) const;
18622 +  /// \returns false only for the EXECReg register class.
18623 +  virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
18624 +  };
18625 +
18626 +} // End namespace llvm
18627 +
18628 +namespace SIInstrFlags {
18629 +  enum Flags {
18630 +    // First 4 bits are the instruction encoding
18631 +    VM_CNT = 1 << 4,   // same bit as TSFlags{4} in SIInstrInfo.td
18632 +    EXP_CNT = 1 << 5,  // same bit as TSFlags{5} in SIInstrInfo.td
18633 +    LGKM_CNT = 1 << 6  // same bit as TSFlags{6} in SIInstrInfo.td
18634 +  };
18635 +}
18636 +
18637 +#endif //SIINSTRINFO_H
18638 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInstrInfo.td llvm-r600/lib/Target/R600/SIInstrInfo.td
18639 --- llvm-3.2.src/lib/Target/R600/SIInstrInfo.td 1970-01-01 01:00:00.000000000 +0100
18640 +++ llvm-r600/lib/Target/R600/SIInstrInfo.td    2013-01-25 19:43:57.476716387 +0100
18641 @@ -0,0 +1,591 @@
18642 +//===-- SIInstrInfo.td - SI Instruction Encodings ---------*- tablegen -*--===//
18643 +//
18644 +//                     The LLVM Compiler Infrastructure
18645 +//
18646 +// This file is distributed under the University of Illinois Open Source
18647 +// License. See LICENSE.TXT for details.
18648 +//
18649 +//===----------------------------------------------------------------------===//
18650 +
18651 +//===----------------------------------------------------------------------===//
18652 +// SI DAG Profiles
18653 +//===----------------------------------------------------------------------===//
18654 +def SDTVCCBinaryOp : SDTypeProfile<1, 2, [ // 1 result, 2 operands
18655 +  SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2> // integer; operands share a type
18656 +]>;
18657 +
18658 +//===----------------------------------------------------------------------===//
18659 +// SI DAG Nodes
18660 +//===----------------------------------------------------------------------===//
18661 +
18662 +// AND operation on the 64-bit wide VCC
18663 +def SIsreg1_and : SDNode<"SIISD::VCC_AND", SDTVCCBinaryOp,
18664 +  [SDNPCommutative, SDNPAssociative]
18665 +>;
18666 +
18667 +// Special bitcast node for sharing VCC register between VALU and SALU
18668 +def SIsreg1_bitcast : SDNode<"SIISD::VCC_BITCAST",
18669 +  SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>
18670 +>;
18671 +
18672 +// Same SIISD::VCC_AND node as SIsreg1_and, exposed under the vcc name
18673 +def SIvcc_and : SDNode<"SIISD::VCC_AND", SDTVCCBinaryOp,
18674 +  [SDNPCommutative, SDNPAssociative]
18675 +>;
18676 +
18677 +// Same SIISD::VCC_BITCAST node as SIsreg1_bitcast, exposed under the vcc name
18678 +def SIvcc_bitcast : SDNode<"SIISD::VCC_BITCAST",
18679 +  SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>
18680 +>;
18681 +
18682 +class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
18683 +    AMDGPUInst<outs, ins, asm, pattern> {
18684 +  // Common SI base: packs the encoding type and three counter flags in TSFlags.
18685 +  field bits<4> EncodingType = 0;
18686 +  field bits<1> VM_CNT = 0;
18687 +  field bits<1> EXP_CNT = 0;
18688 +  field bits<1> LGKM_CNT = 0;
18689 +
18690 +  let TSFlags{3-0} = EncodingType;
18691 +  let TSFlags{4} = VM_CNT;
18692 +  let TSFlags{5} = EXP_CNT;
18693 +  let TSFlags{6} = LGKM_CNT;
18694 +}
18695 +
18696 +class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> :
18697 +    InstSI <outs, ins, asm, pattern> {
18698 +  // InstSI with a 32-bit wide Inst field.
18699 +  field bits<32> Inst;
18700 +}
18701 +
18702 +class Enc64 <dag outs, dag ins, string asm, list<dag> pattern> :
18703 +    InstSI <outs, ins, asm, pattern> {
18704 +  // InstSI with a 64-bit wide Inst field.
18705 +  field bits<64> Inst;
18706 +}
18707 +// Operand of type vt with sub-operands opInfo, encoded by "encodeOperand".
18708 +class SIOperand <ValueType vt, dag opInfo>: Operand <vt> {
18709 +  let EncoderMethod = "encodeOperand";
18710 +  let MIOperandInfo = opInfo;
18711 +}
18712 +// i16 immediate accepted when isInt<16>(Imm) holds.
18713 +def IMM16bit : ImmLeaf <
18714 +  i16,
18715 +  [{return isInt<16>(Imm);}]
18716 +>;
18717 +// i32 immediate whose value, cast to int32_t, lies in [0, 0xff].
18718 +def IMM8bit : ImmLeaf <
18719 +  i32,
18720 +  [{return (int32_t)Imm >= 0 && (int32_t)Imm <= 0xff;}]
18721 +>;
18722 +// i16 immediate whose value, cast to int16_t, lies in [0, 0xfff].
18723 +def IMM12bit : ImmLeaf <
18724 +  i16,
18725 +  [{return (int16_t)Imm >= 0 && (int16_t)Imm <= 0xfff;}]
18726 +>;
18727 +// i64 immediate accepted when isInt<32>(Imm) holds.
18728 +def IMM32bitIn64bit : ImmLeaf <
18729 +  i64,
18730 +  [{return isInt<32>(Imm);}]
18731 +>;
18732 +// vAny operand holding one rc register, encoded by "GPR4AlignEncode".
18733 +class GPR4Align <RegisterClass rc> : Operand <vAny> {
18734 +  let EncoderMethod = "GPR4AlignEncode";
18735 +  let MIOperandInfo = (ops rc:$reg);
18736 +}
18737 +// Operand of type vt holding one rc register, encoded by "GPR2AlignEncode".
18738 +class GPR2Align <RegisterClass rc, ValueType vt> : Operand <vt> {
18739 +  let EncoderMethod = "GPR2AlignEncode";
18740 +  let MIOperandInfo = (ops rc:$reg);
18741 +}
18742 +// Pointer operand made of an SReg_64 and an SReg_32 (uses GPR2AlignEncode).
18743 +def SMRDmemrr : Operand<iPTR> {
18744 +  let MIOperandInfo = (ops SReg_64, SReg_32);
18745 +  let EncoderMethod = "GPR2AlignEncode";
18746 +}
18747 +// Pointer operand made of an SReg_64 and an i32 immediate.
18748 +def SMRDmemri : Operand<iPTR> {
18749 +  let MIOperandInfo = (ops SReg_64, i32imm);
18750 +  let EncoderMethod = "SMRDmemriEncode";
18751 +}
18752 +// i64 address patterns with 2 operands, matched by the named Select* functions.
18753 +def ADDR_Reg     : ComplexPattern<i64, 2, "SelectADDRReg", [], []>;
18754 +def ADDR_Offset8 : ComplexPattern<i64, 2, "SelectADDR8BitOffset", [], []>;
18755 +
18756 +let Uses = [EXEC] in {
18757 +
18758 +def EXP : Enc64<
18759 +  (outs),
18760 +  (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
18761 +       VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
18762 +  "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
18763 +  [] > {
18764 +  // 64-bit EXP encoding, no outputs, no pattern; Inst{31-26} is fixed at 0x3e.
18765 +  bits<4> EN;
18766 +  bits<6> TGT;
18767 +  bits<1> COMPR;
18768 +  bits<1> DONE;
18769 +  bits<1> VM;
18770 +  bits<8> VSRC0;
18771 +  bits<8> VSRC1;
18772 +  bits<8> VSRC2;
18773 +  bits<8> VSRC3;
18774 +
18775 +  let Inst{3-0} = EN;
18776 +  let Inst{9-4} = TGT;
18777 +  let Inst{10} = COMPR;
18778 +  let Inst{11} = DONE;
18779 +  let Inst{12} = VM;
18780 +  let Inst{31-26} = 0x3e;
18781 +  let Inst{39-32} = VSRC0;
18782 +  let Inst{47-40} = VSRC1;
18783 +  let Inst{55-48} = VSRC2;
18784 +  let Inst{63-56} = VSRC3;
18785 +  let EncodingType = 0; //SIInstrEncodingType::EXP
18786 +  // Sets the EXP_CNT flag, which InstSI places in TSFlags{5}.
18787 +  let EXP_CNT = 1;
18788 +}
18789 +
18790 +class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
18791 +    Enc64 <outs, ins, asm, pattern> {
18792 +  // 64-bit MIMG encoding; Inst{31-26} is fixed at 0x3c, op sits in Inst{24-18}.
18793 +  bits<8> VDATA;
18794 +  bits<4> DMASK;
18795 +  bits<1> UNORM;
18796 +  bits<1> GLC;
18797 +  bits<1> DA;
18798 +  bits<1> R128;
18799 +  bits<1> TFE;
18800 +  bits<1> LWE;
18801 +  bits<1> SLC;
18802 +  bits<8> VADDR;
18803 +  bits<5> SRSRC;
18804 +  bits<5> SSAMP;
18805 +
18806 +  let Inst{11-8} = DMASK;
18807 +  let Inst{12} = UNORM;
18808 +  let Inst{13} = GLC;
18809 +  let Inst{14} = DA;
18810 +  let Inst{15} = R128;
18811 +  let Inst{16} = TFE;
18812 +  let Inst{17} = LWE;
18813 +  let Inst{24-18} = op;
18814 +  let Inst{25} = SLC;
18815 +  let Inst{31-26} = 0x3c;
18816 +  let Inst{39-32} = VADDR;
18817 +  let Inst{47-40} = VDATA;
18818 +  let Inst{52-48} = SRSRC;
18819 +  let Inst{57-53} = SSAMP;
18820 +  let EncodingType = 2; //SIInstrEncodingType::MIMG
18821 +  // Sets both the VM_CNT and EXP_CNT flags (TSFlags{4} and TSFlags{5}).
18822 +  let VM_CNT = 1;
18823 +  let EXP_CNT = 1;
18824 +}
18825 +
18826 +class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
18827 +    Enc64<outs, ins, asm, pattern> {
18828 +  // 64-bit MTBUF encoding; Inst{31-26} is fixed at 0x3a, op sits in Inst{18-16}.
18829 +  bits<8> VDATA;
18830 +  bits<12> OFFSET;
18831 +  bits<1> OFFEN;
18832 +  bits<1> IDXEN;
18833 +  bits<1> GLC;
18834 +  bits<1> ADDR64;
18835 +  bits<4> DFMT;
18836 +  bits<3> NFMT;
18837 +  bits<8> VADDR;
18838 +  bits<5> SRSRC;
18839 +  bits<1> SLC;
18840 +  bits<1> TFE;
18841 +  bits<8> SOFFSET;
18842 +
18843 +  let Inst{11-0} = OFFSET;
18844 +  let Inst{12} = OFFEN;
18845 +  let Inst{13} = IDXEN;
18846 +  let Inst{14} = GLC;
18847 +  let Inst{15} = ADDR64;
18848 +  let Inst{18-16} = op;
18849 +  let Inst{22-19} = DFMT;
18850 +  let Inst{25-23} = NFMT;
18851 +  let Inst{31-26} = 0x3a; //encoding
18852 +  let Inst{39-32} = VADDR;
18853 +  let Inst{47-40} = VDATA;
18854 +  let Inst{52-48} = SRSRC;
18855 +  let Inst{54} = SLC;
18856 +  let Inst{55} = TFE;
18857 +  let Inst{63-56} = SOFFSET;
18858 +  let EncodingType = 3; //SIInstrEncodingType::MTBUF
18859 +  // Sets both the VM_CNT and EXP_CNT flags (TSFlags{4} and TSFlags{5}).
18860 +  let VM_CNT = 1;
18861 +  let EXP_CNT = 1;
18862 +
18863 +  let neverHasSideEffects = 1;
18864 +}
18865 +
18866 +class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
18867 +    Enc64<outs, ins, asm, pattern> {
18868 +  // 64-bit MUBUF encoding; Inst{31-26} is fixed at 0x38, op sits in Inst{24-18}.
18869 +  bits<8> VDATA;
18870 +  bits<12> OFFSET;
18871 +  bits<1> OFFEN;
18872 +  bits<1> IDXEN;
18873 +  bits<1> GLC;
18874 +  bits<1> ADDR64;
18875 +  bits<1> LDS;
18876 +  bits<8> VADDR;
18877 +  bits<5> SRSRC;
18878 +  bits<1> SLC;
18879 +  bits<1> TFE;
18880 +  bits<8> SOFFSET;
18881 +
18882 +  let Inst{11-0} = OFFSET;
18883 +  let Inst{12} = OFFEN;
18884 +  let Inst{13} = IDXEN;
18885 +  let Inst{14} = GLC;
18886 +  let Inst{15} = ADDR64;
18887 +  let Inst{16} = LDS;
18888 +  let Inst{24-18} = op;
18889 +  let Inst{31-26} = 0x38; //encoding
18890 +  let Inst{39-32} = VADDR;
18891 +  let Inst{47-40} = VDATA;
18892 +  let Inst{52-48} = SRSRC;
18893 +  let Inst{54} = SLC;
18894 +  let Inst{55} = TFE;
18895 +  let Inst{63-56} = SOFFSET;
18896 +  let EncodingType = 4; //SIInstrEncodingType::MUBUF
18897 +  // Sets both the VM_CNT and EXP_CNT flags (TSFlags{4} and TSFlags{5}).
18898 +  let VM_CNT = 1;
18899 +  let EXP_CNT = 1;
18900 +
18901 +  let neverHasSideEffects = 1;
18902 +}
18903 +
18904 +} // End Uses = [EXEC]
18905 +
18906 +class SMRD <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
18907 +    Enc32<outs, ins, asm, pattern> {
18908 +  // 32-bit SMRD encoding; PTR is split into OFFSET, IMM and SBASE below.
18909 +  bits<7> SDST;
18910 +  bits<15> PTR;
18911 +  bits<8> OFFSET = PTR{7-0};
18912 +  bits<1> IMM    = PTR{8};
18913 +  bits<6> SBASE  = PTR{14-9};
18914 +
18915 +  let Inst{7-0} = OFFSET;
18916 +  let Inst{8} = IMM;
18917 +  let Inst{14-9} = SBASE;
18918 +  let Inst{21-15} = SDST;
18919 +  let Inst{26-22} = op;
18920 +  let Inst{31-27} = 0x18; //encoding
18921 +  let EncodingType = 5; //SIInstrEncodingType::SMRD
18922 +  // Sets the LGKM_CNT flag, which InstSI places in TSFlags{6}.
18923 +  let LGKM_CNT = 1;
18924 +}
18925 +
18926 +class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
18927 +    Enc32<outs, ins, asm, pattern> {
18928 +  // 32-bit SOP1 encoding; Inst{31-23} is fixed at 0x17d, op sits in Inst{15-8}.
18929 +  bits<7> SDST;
18930 +  bits<8> SSRC0;
18931 +
18932 +  let Inst{7-0} = SSRC0;
18933 +  let Inst{15-8} = op;
18934 +  let Inst{22-16} = SDST;
18935 +  let Inst{31-23} = 0x17d; //encoding;
18936 +  let EncodingType = 6; //SIInstrEncodingType::SOP1
18937 +  // Declared as neither loading, storing nor having side effects.
18938 +  let mayLoad = 0;
18939 +  let mayStore = 0;
18940 +  let hasSideEffects = 0;
18941 +}
18942 +
18943 +class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
18944 +    Enc32 <outs, ins, asm, pattern> {
18945 +  // 32-bit SOP2 encoding; Inst{31-30} is fixed at 0x2, op sits in Inst{29-23}.
18946 +  bits<7> SDST;
18947 +  bits<8> SSRC0;
18948 +  bits<8> SSRC1;
18949 +
18950 +  let Inst{7-0} = SSRC0;
18951 +  let Inst{15-8} = SSRC1;
18952 +  let Inst{22-16} = SDST;
18953 +  let Inst{29-23} = op;
18954 +  let Inst{31-30} = 0x2; // encoding
18955 +  let EncodingType = 7; // SIInstrEncodingType::SOP2
18956 +  // Declared as neither loading, storing nor having side effects.
18957 +  let mayLoad = 0;
18958 +  let mayStore = 0;
18959 +  let hasSideEffects = 0;
18960 +}
18961 +
18962 +class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
18963 +  Enc32<outs, ins, asm, pattern> {
18964 +  // 32-bit SOPC encoding; Inst{31-23} is fixed at 0x17e, op sits in Inst{22-16}.
18965 +  bits<8> SSRC0;
18966 +  bits<8> SSRC1;
18967 +
18968 +  let Inst{7-0} = SSRC0;
18969 +  let Inst{15-8} = SSRC1;
18970 +  let Inst{22-16} = op;
18971 +  let Inst{31-23} = 0x17e;
18972 +  let EncodingType = 8; // SIInstrEncodingType::SOPC
18973 +  // $dst is excluded from the encoding; no loads, stores or side effects.
18974 +  let DisableEncoding = "$dst";
18975 +  let mayLoad = 0;
18976 +  let mayStore = 0;
18977 +  let hasSideEffects = 0;
18978 +}
18979 +
18980 +class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
18981 +   Enc32 <outs, ins , asm, pattern> {
18982 +  // 32-bit SOPK encoding; Inst{31-28} is fixed at 0xb, op sits in Inst{27-23}.
18983 +  bits <7> SDST;
18984 +  bits <16> SIMM16;
18985 +
18986 +  let Inst{15-0} = SIMM16;
18987 +  let Inst{22-16} = SDST;
18988 +  let Inst{27-23} = op;
18989 +  let Inst{31-28} = 0xb; //encoding
18990 +  let EncodingType = 9; // SIInstrEncodingType::SOPK
18991 +  // Declared as neither loading, storing nor having side effects.
18992 +  let mayLoad = 0;
18993 +  let mayStore = 0;
18994 +  let hasSideEffects = 0;
18995 +}
18996 +
18997 +class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : Enc32 <
18998 +  (outs),
18999 +  ins,
19000 +  asm,
19001 +  pattern > {
19002 +  // 32-bit SOPP encoding with no outputs; Inst{31-23} is fixed at 0x17f.
19003 +  bits <16> SIMM16;
19004 +
19005 +  let Inst{15-0} = SIMM16;
19006 +  let Inst{22-16} = op;
19007 +  let Inst{31-23} = 0x17f; // encoding
19008 +  let EncodingType = 10; // SIInstrEncodingType::SOPP
19009 +  // Declared as neither loading, storing nor having side effects.
19010 +  let mayLoad = 0;
19011 +  let mayStore = 0;
19012 +  let hasSideEffects = 0;
19013 +}
19014 +
19015 +let Uses = [EXEC] in {
19016 +
19017 +class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
19018 +    Enc32 <outs, ins, asm, pattern> {
19019 +  // 32-bit VINTRP encoding; Inst{31-26} is fixed at 0x32, op sits in Inst{17-16}.
19020 +  bits<8> VDST;
19021 +  bits<8> VSRC;
19022 +  bits<2> ATTRCHAN;
19023 +  bits<6> ATTR;
19024 +
19025 +  let Inst{7-0} = VSRC;
19026 +  let Inst{9-8} = ATTRCHAN;
19027 +  let Inst{15-10} = ATTR;
19028 +  let Inst{17-16} = op;
19029 +  let Inst{25-18} = VDST;
19030 +  let Inst{31-26} = 0x32; // encoding
19031 +  let EncodingType = 11; // SIInstrEncodingType::VINTRP
19032 +  // Marked as a load (mayLoad = 1) with no stores and no side effects.
19033 +  let neverHasSideEffects = 1;
19034 +  let mayLoad = 1;
19035 +  let mayStore = 0;
19036 +}
19037 +
19038 +class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
19039 +    Enc32 <outs, ins, asm, pattern> {
19040 +  // 32-bit VOP1 encoding; Inst{31-25} is fixed at 0x3f, op sits in Inst{16-9}.
19041 +  bits<8> VDST;
19042 +  bits<9> SRC0;
19043 +
19044 +  let Inst{8-0} = SRC0;
19045 +  let Inst{16-9} = op;
19046 +  let Inst{24-17} = VDST;
19047 +  let Inst{31-25} = 0x3f; //encoding
19048 +
19049 +  let EncodingType = 12; // SIInstrEncodingType::VOP1
19050 +  let PostEncoderMethod = "VOPPostEncode";
19051 +  // Declared as neither loading, storing nor having side effects.
19052 +  let mayLoad = 0;
19053 +  let mayStore = 0;
19054 +  let hasSideEffects = 0;
19055 +}
19056 +
19057 +class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
19058 +    Enc32 <outs, ins, asm, pattern> {
19059 +  // 32-bit VOP2 encoding; Inst{31} is fixed at 0, op sits in Inst{30-25}.
19060 +  bits<8> VDST;
19061 +  bits<9> SRC0;
19062 +  bits<8> VSRC1;
19063 +
19064 +  let Inst{8-0} = SRC0;
19065 +  let Inst{16-9} = VSRC1;
19066 +  let Inst{24-17} = VDST;
19067 +  let Inst{30-25} = op;
19068 +  let Inst{31} = 0x0; //encoding
19069 +
19070 +  let EncodingType = 13; // SIInstrEncodingType::VOP2
19071 +  let PostEncoderMethod = "VOPPostEncode";
19072 +  // Declared as neither loading, storing nor having side effects.
19073 +  let mayLoad = 0;
19074 +  let mayStore = 0;
19075 +  let hasSideEffects = 0;
19076 +}
19077 +
19078 +class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
19079 +    Enc64 <outs, ins, asm, pattern> {
19080 +  // 64-bit VOP3 encoding; Inst{31-26} is fixed at 0x34, op sits in Inst{25-17}.
19081 +  bits<8> VDST;
19082 +  bits<9> SRC0;
19083 +  bits<9> SRC1;
19084 +  bits<9> SRC2;
19085 +  bits<3> ABS;
19086 +  bits<1> CLAMP;
19087 +  bits<2> OMOD;
19088 +  bits<3> NEG;
19089 +
19090 +  let Inst{7-0} = VDST;
19091 +  let Inst{10-8} = ABS;
19092 +  let Inst{11} = CLAMP;
19093 +  let Inst{25-17} = op;
19094 +  let Inst{31-26} = 0x34; //encoding
19095 +  let Inst{40-32} = SRC0;
19096 +  let Inst{49-41} = SRC1;
19097 +  let Inst{58-50} = SRC2;
19098 +  let Inst{60-59} = OMOD;
19099 +  let Inst{63-61} = NEG;
19100 +
19101 +  let EncodingType = 14; // SIInstrEncodingType::VOP3
19102 +  let PostEncoderMethod = "VOPPostEncode";
19103 +  // Declared as neither loading, storing nor having side effects.
19104 +  let mayLoad = 0;
19105 +  let mayStore = 0;
19106 +  let hasSideEffects = 0;
19107 +}
19108 +
19109 +class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
19110 +    Enc64 <outs, ins, asm, pattern> {
19111 +  // VOP3 layout with SDST in Inst{14-8} in place of ABS/CLAMP; same type 14.
19112 +  bits<8> VDST;
19113 +  bits<9> SRC0;
19114 +  bits<9> SRC1;
19115 +  bits<9> SRC2;
19116 +  bits<7> SDST;
19117 +  bits<2> OMOD;
19118 +  bits<3> NEG;
19119 +
19120 +  let Inst{7-0} = VDST;
19121 +  let Inst{14-8} = SDST;
19122 +  let Inst{25-17} = op;
19123 +  let Inst{31-26} = 0x34; //encoding
19124 +  let Inst{40-32} = SRC0;
19125 +  let Inst{49-41} = SRC1;
19126 +  let Inst{58-50} = SRC2;
19127 +  let Inst{60-59} = OMOD;
19128 +  let Inst{63-61} = NEG;
19129 +
19130 +  let EncodingType = 14; // SIInstrEncodingType::VOP3
19131 +  let PostEncoderMethod = "VOPPostEncode";
19132 +  // Declared as neither loading, storing nor having side effects.
19133 +  let mayLoad = 0;
19134 +  let mayStore = 0;
19135 +  let hasSideEffects = 0;
19136 +}
19137 +
19138 +class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
19139 +    Enc32 <(outs VCCReg:$dst), ins, asm, pattern> {
19140 +  // 32-bit VOPC encoding; result is VCCReg:$dst, which is left unencoded.
19141 +  bits<9> SRC0;
19142 +  bits<8> VSRC1;
19143 +
19144 +  let Inst{8-0} = SRC0;
19145 +  let Inst{16-9} = VSRC1;
19146 +  let Inst{24-17} = op;
19147 +  let Inst{31-25} = 0x3e;
19148 +
19149 +  let EncodingType = 15; //SIInstrEncodingType::VOPC
19150 +  let PostEncoderMethod = "VOPPostEncode";
19151 +  let DisableEncoding = "$dst";
19152 +  let mayLoad = 0;
19153 +  let mayStore = 0;
19154 +  let hasSideEffects = 0;
19155 +}
19156 +
19157 +} // End Uses = [EXEC]
19158 +// MIMG load with VReg_128 data and address, SReg_256 $srsrc, SReg_128 $ssamp.
19159 +class MIMG_Load_Helper <bits<7> op, string asm> : MIMG <
19160 +  op,
19161 +  (outs VReg_128:$vdata),
19162 +  (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
19163 +       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_128:$vaddr,
19164 +       GPR4Align<SReg_256>:$srsrc, GPR4Align<SReg_128>:$ssamp),
19165 +  asm,
19166 +  []> {
19167 +  let mayLoad = 1;
19168 +  let mayStore = 0;
19169 +}
19170 +// MUBUF load into a regClass $dst; marked mayLoad, defined without a pattern.
19171 +class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF <
19172 +  op,
19173 +  (outs regClass:$dst),
19174 +  (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
19175 +       i1imm:$lds, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc, i1imm:$slc,
19176 +       i1imm:$tfe, SReg_32:$soffset),
19177 +  asm,
19178 +  []> {
19179 +  let mayLoad = 1;
19180 +  let mayStore = 0;
19181 +}
19182 +
19183 +class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
19184 +  op,
19185 +  (outs regClass:$dst), // result register class is chosen per instantiation
19186 +  (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
19187 +       i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc,
19188 +       i1imm:$slc, i1imm:$tfe, SReg_32:$soffset),
19189 +  asm,
19190 +  []> {
19191 +  let mayLoad = 1; // flagged as a load; mayStore is cleared below
19192 +  let mayStore = 0;
19193 +}
19194 +
19195 +class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
19196 +  op,
19197 +  (outs), // no result operands; $vdata is the first input below
19198 +  (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
19199 +   i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr,
19200 +   GPR4Align<SReg_128>:$srsrc, i1imm:$slc, i1imm:$tfe, SReg_32:$soffset),
19201 +  asm,
19202 +  []> {
19203 +  let mayStore = 1; // flagged as a store; mayLoad is cleared below
19204 +  let mayLoad = 0;
19205 +}
19206 +
19207 +multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass dstClass,
19208 +                        ValueType vt> { // emits an _IMM and an _SGPR def
19209 +  def _IMM : SMRD < // address operand SMRDmemri, matched by ADDR_Offset8
19210 +              op,
19211 +              (outs dstClass:$dst),
19212 +              (ins SMRDmemri:$src0),
19213 +              asm,
19214 +              [(set (vt dstClass:$dst), (constant_load ADDR_Offset8:$src0))]
19215 +  >;
19216 +  // Same op, asm and result type as _IMM; only the address operand differs.
19217 +  def _SGPR : SMRD < // address operand SMRDmemrr, matched by ADDR_Reg
19218 +              op,
19219 +              (outs dstClass:$dst),
19220 +              (ins SMRDmemrr:$src0),
19221 +              asm,
19222 +              [(set (vt dstClass:$dst), (constant_load ADDR_Reg:$src0))]
19223 +  >;
19224 +}
19225 +
19226 +multiclass SMRD_32 <bits<5> op, string asm, RegisterClass dstClass> {
19227 +  defm _F32 : SMRD_Helper <op, asm, dstClass, f32>; // load pattern typed f32
19228 +  defm _I32 : SMRD_Helper <op, asm, dstClass, i32>; // load pattern typed i32
19229 +}
19230 +
19231 +include "SIInstrFormats.td"
19232 +include "SIInstructions.td"
19233 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInstructions.td llvm-r600/lib/Target/R600/SIInstructions.td
19234 --- llvm-3.2.src/lib/Target/R600/SIInstructions.td      1970-01-01 01:00:00.000000000 +0100
19235 +++ llvm-r600/lib/Target/R600/SIInstructions.td 2013-01-25 19:43:57.480049720 +0100
19236 @@ -0,0 +1,1357 @@
19237 +//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
19238 +//
19239 +//                     The LLVM Compiler Infrastructure
19240 +//
19241 +// This file is distributed under the University of Illinois Open Source
19242 +// License. See LICENSE.TXT for details.
19243 +//
19244 +//===----------------------------------------------------------------------===//
19245 +// This file was originally auto-generated from a GPU register header file and
19246 +// all the instruction definitions were originally commented out.  Instructions
19247 +// that are not yet supported remain commented out.
19248 +//===----------------------------------------------------------------------===//
19249 +
19250 +def isSI : Predicate<"Subtarget.device()" // holds only for generation HD7XXX
19251 +                            "->getGeneration() == AMDGPUDeviceInfo::HD7XXX">;
19252 +
19253 +let Predicates = [isSI] in {
19254 +
19255 +let neverHasSideEffects = 1 in {
19256 +def S_MOV_B32 : SOP1_32 <0x00000003, "S_MOV_B32", []>;
19257 +def S_MOV_B64 : SOP1_64 <0x00000004, "S_MOV_B64", []>;
19258 +def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>;
19259 +def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>;
19260 +def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", []>;
19261 +def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>;
19262 +def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>;
19263 +def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>;
19264 +def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", []>;
19265 +def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>;
19266 +} // End neverHasSideEffects = 1
19267 +////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>;
19268 +////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>;
19269 +////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>;
19270 +////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>;
19271 +////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>;
19272 +////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>;
19273 +////def S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>;
19274 +////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>;
19275 +//def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", []>;
19276 +//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>;
19277 +def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>;
19278 +//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>;
19279 +//def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", []>;
19280 +//def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", []>;
19281 +////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>;
19282 +////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>;
19283 +////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>;
19284 +////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>;
19285 +def S_GETPC_B64 : SOP1_64 <0x0000001f, "S_GETPC_B64", []>;
19286 +def S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>;
19287 +def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>;
19288 +def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>;
19289 +
19290 +let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC] in {
19291 +// Every def in this scope is marked as both reading and writing EXEC.
19292 +def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "S_AND_SAVEEXEC_B64", []>;
19293 +def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "S_OR_SAVEEXEC_B64", []>;
19294 +def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "S_XOR_SAVEEXEC_B64", []>;
19295 +def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "S_ANDN2_SAVEEXEC_B64", []>;
19296 +def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "S_ORN2_SAVEEXEC_B64", []>;
19297 +def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "S_NAND_SAVEEXEC_B64", []>;
19298 +def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "S_NOR_SAVEEXEC_B64", []>;
19299 +def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "S_XNOR_SAVEEXEC_B64", []>;
19300 +
19301 +} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC]
19302 +
19303 +def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "S_QUADMASK_B32", []>;
19304 +def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "S_QUADMASK_B64", []>;
19305 +def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "S_MOVRELS_B32", []>;
19306 +def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "S_MOVRELS_B64", []>;
19307 +def S_MOVRELD_B32 : SOP1_32 <0x00000030, "S_MOVRELD_B32", []>;
19308 +def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>;
19309 +//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "S_CBRANCH_JOIN", []>;
19310 +def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>;
19311 +def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>;
19312 +def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>;
19313 +def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>;
19314 +def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>;
19315 +
19316 +/*
19317 +This instruction is disabled for now until we can figure out how to teach
19318 +the instruction selector to choose correctly between the S_CMP* and V_CMP*
19319 +instructions.
19320 +
19321 +When this instruction is enabled the code generator sometimes produces this
19322 +invalid sequence:
19323 +
19324 +SCC = S_CMPK_EQ_I32 SGPR0, imm
19325 +VCC = COPY SCC
19326 +VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1
19327 +
19328 +def S_CMPK_EQ_I32 : SOPK <
19329 +  0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1),
19330 +  "S_CMPK_EQ_I32",
19331 +  [(set SCCReg:$dst, (setcc SReg_32:$src0, imm:$src1, SETEQ))]
19332 +>;
19333 +*/
19334 +
19335 +def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>;
19336 +def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>;
19337 +def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>;
19338 +def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "S_CMPK_LT_I32", []>;
19339 +def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "S_CMPK_LE_I32", []>;
19340 +def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "S_CMPK_EQ_U32", []>;
19341 +def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "S_CMPK_LG_U32", []>;
19342 +def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>;
19343 +def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>;
19344 +def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>;
19345 +def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>;
19346 +def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
19347 +def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>;
19348 +//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>;
19349 +def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>;
19350 +def S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>;
19351 +def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>;
19352 +//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>;
19353 +//def EXP : EXP_ <0x00000000, "EXP", []>;
19354 +
19355 +defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32", []>;
19356 +defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", []>;
19357 +def : Pat <
19358 +  (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_LT)),
19359 +  (V_CMP_LT_F32_e64 AllReg_32:$src0, VReg_32:$src1)
19360 +>;
19361 +defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", []>;
19362 +def : Pat <
19363 +  (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_EQ)),
19364 +  (V_CMP_EQ_F32_e64 AllReg_32:$src0, VReg_32:$src1)
19365 +>;
19366 +defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", []>;
19367 +def : Pat <
19368 +  (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_LE)),
19369 +  (V_CMP_LE_F32_e64 AllReg_32:$src0, VReg_32:$src1)
19370 +>;
19371 +defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", []>;
19372 +def : Pat <
19373 +  (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_GT)),
19374 +  (V_CMP_GT_F32_e64 AllReg_32:$src0, VReg_32:$src1)
19375 +>;
19376 +defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32", []>;
19377 +// No ISel pattern for V_CMP_LG_F32: f32 COND_NE is selected by the
19378 +// V_CMP_NEQ_F32 pattern further down. A second pattern on the identical
19379 +// source DAG was ambiguous, and LG ("less or greater") is an ordered compare
19380 +// that is false for NaN operands, so it is the wrong match for "not equal".
19381 +defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", []>;
19382 +def : Pat <
19383 +  (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_GE)),
19384 +  (V_CMP_GE_F32_e64 AllReg_32:$src0, VReg_32:$src1)
19385 +>;
19386 +defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", []>;
19387 +defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", []>;
19388 +defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32", []>;
19389 +defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32", []>;
19390 +defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32", []>;
19391 +defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32", []>;
19392 +defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", []>;
19393 +def : Pat <
19394 +  (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_NE)),
19395 +  (V_CMP_NEQ_F32_e64 AllReg_32:$src0, VReg_32:$src1)
19396 +>;
19397 +defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32", []>;
19398 +defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32", []>;
19399 +
19400 +//Side effect is writing to EXEC
19401 +let hasSideEffects = 1 in {
19402 +
19403 +defm V_CMPX_F_F32 : VOPC_32 <0x00000010, "V_CMPX_F_F32", []>;
19404 +defm V_CMPX_LT_F32 : VOPC_32 <0x00000011, "V_CMPX_LT_F32", []>;
19405 +defm V_CMPX_EQ_F32 : VOPC_32 <0x00000012, "V_CMPX_EQ_F32", []>;
19406 +defm V_CMPX_LE_F32 : VOPC_32 <0x00000013, "V_CMPX_LE_F32", []>;
19407 +defm V_CMPX_GT_F32 : VOPC_32 <0x00000014, "V_CMPX_GT_F32", []>;
19408 +defm V_CMPX_LG_F32 : VOPC_32 <0x00000015, "V_CMPX_LG_F32", []>;
19409 +defm V_CMPX_GE_F32 : VOPC_32 <0x00000016, "V_CMPX_GE_F32", []>;
19410 +defm V_CMPX_O_F32 : VOPC_32 <0x00000017, "V_CMPX_O_F32", []>;
19411 +defm V_CMPX_U_F32 : VOPC_32 <0x00000018, "V_CMPX_U_F32", []>;
19412 +defm V_CMPX_NGE_F32 : VOPC_32 <0x00000019, "V_CMPX_NGE_F32", []>;
19413 +defm V_CMPX_NLG_F32 : VOPC_32 <0x0000001a, "V_CMPX_NLG_F32", []>;
19414 +defm V_CMPX_NGT_F32 : VOPC_32 <0x0000001b, "V_CMPX_NGT_F32", []>;
19415 +defm V_CMPX_NLE_F32 : VOPC_32 <0x0000001c, "V_CMPX_NLE_F32", []>;
19416 +defm V_CMPX_NEQ_F32 : VOPC_32 <0x0000001d, "V_CMPX_NEQ_F32", []>;
19417 +defm V_CMPX_NLT_F32 : VOPC_32 <0x0000001e, "V_CMPX_NLT_F32", []>;
19418 +defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32", []>;
19419 +
19420 +} // End hasSideEffects = 1
19421 +
19422 +defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64", []>;
19423 +defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", []>;
19424 +defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", []>;
19425 +defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", []>;
19426 +defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", []>;
19427 +defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64", []>;
19428 +defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", []>;
19429 +defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64", []>;
19430 +defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64", []>;
19431 +defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64", []>;
19432 +defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64", []>;
19433 +defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64", []>;
19434 +defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64", []>;
19435 +defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", []>;
19436 +defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64", []>;
19437 +defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64", []>;
19438 +
19439 +//Side effect is writing to EXEC
19440 +let hasSideEffects = 1 in {
19441 +
19442 +defm V_CMPX_F_F64 : VOPC_64 <0x00000030, "V_CMPX_F_F64", []>;
19443 +defm V_CMPX_LT_F64 : VOPC_64 <0x00000031, "V_CMPX_LT_F64", []>;
19444 +defm V_CMPX_EQ_F64 : VOPC_64 <0x00000032, "V_CMPX_EQ_F64", []>;
19445 +defm V_CMPX_LE_F64 : VOPC_64 <0x00000033, "V_CMPX_LE_F64", []>;
19446 +defm V_CMPX_GT_F64 : VOPC_64 <0x00000034, "V_CMPX_GT_F64", []>;
19447 +defm V_CMPX_LG_F64 : VOPC_64 <0x00000035, "V_CMPX_LG_F64", []>;
19448 +defm V_CMPX_GE_F64 : VOPC_64 <0x00000036, "V_CMPX_GE_F64", []>;
19449 +defm V_CMPX_O_F64 : VOPC_64 <0x00000037, "V_CMPX_O_F64", []>;
19450 +defm V_CMPX_U_F64 : VOPC_64 <0x00000038, "V_CMPX_U_F64", []>;
19451 +defm V_CMPX_NGE_F64 : VOPC_64 <0x00000039, "V_CMPX_NGE_F64", []>;
19452 +defm V_CMPX_NLG_F64 : VOPC_64 <0x0000003a, "V_CMPX_NLG_F64", []>;
19453 +defm V_CMPX_NGT_F64 : VOPC_64 <0x0000003b, "V_CMPX_NGT_F64", []>;
19454 +defm V_CMPX_NLE_F64 : VOPC_64 <0x0000003c, "V_CMPX_NLE_F64", []>;
19455 +defm V_CMPX_NEQ_F64 : VOPC_64 <0x0000003d, "V_CMPX_NEQ_F64", []>;
19456 +defm V_CMPX_NLT_F64 : VOPC_64 <0x0000003e, "V_CMPX_NLT_F64", []>;
19457 +defm V_CMPX_TRU_F64 : VOPC_64 <0x0000003f, "V_CMPX_TRU_F64", []>;
19458 +
19459 +} // End hasSideEffects = 1
19460 +
19461 +defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32", []>;
19462 +defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32", []>;
19463 +defm V_CMPS_EQ_F32 : VOPC_32 <0x00000042, "V_CMPS_EQ_F32", []>;
19464 +defm V_CMPS_LE_F32 : VOPC_32 <0x00000043, "V_CMPS_LE_F32", []>;
19465 +defm V_CMPS_GT_F32 : VOPC_32 <0x00000044, "V_CMPS_GT_F32", []>;
19466 +defm V_CMPS_LG_F32 : VOPC_32 <0x00000045, "V_CMPS_LG_F32", []>;
19467 +defm V_CMPS_GE_F32 : VOPC_32 <0x00000046, "V_CMPS_GE_F32", []>;
19468 +defm V_CMPS_O_F32 : VOPC_32 <0x00000047, "V_CMPS_O_F32", []>;
19469 +defm V_CMPS_U_F32 : VOPC_32 <0x00000048, "V_CMPS_U_F32", []>;
19470 +defm V_CMPS_NGE_F32 : VOPC_32 <0x00000049, "V_CMPS_NGE_F32", []>;
19471 +defm V_CMPS_NLG_F32 : VOPC_32 <0x0000004a, "V_CMPS_NLG_F32", []>;
19472 +defm V_CMPS_NGT_F32 : VOPC_32 <0x0000004b, "V_CMPS_NGT_F32", []>;
19473 +defm V_CMPS_NLE_F32 : VOPC_32 <0x0000004c, "V_CMPS_NLE_F32", []>;
19474 +defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32", []>;
19475 +defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32", []>;
19476 +defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32", []>;
19477 +defm V_CMPSX_F_F32 : VOPC_32 <0x00000050, "V_CMPSX_F_F32", []>;
19478 +defm V_CMPSX_LT_F32 : VOPC_32 <0x00000051, "V_CMPSX_LT_F32", []>;
19479 +defm V_CMPSX_EQ_F32 : VOPC_32 <0x00000052, "V_CMPSX_EQ_F32", []>;
19480 +defm V_CMPSX_LE_F32 : VOPC_32 <0x00000053, "V_CMPSX_LE_F32", []>;
19481 +defm V_CMPSX_GT_F32 : VOPC_32 <0x00000054, "V_CMPSX_GT_F32", []>;
19482 +defm V_CMPSX_LG_F32 : VOPC_32 <0x00000055, "V_CMPSX_LG_F32", []>;
19483 +defm V_CMPSX_GE_F32 : VOPC_32 <0x00000056, "V_CMPSX_GE_F32", []>;
19484 +defm V_CMPSX_O_F32 : VOPC_32 <0x00000057, "V_CMPSX_O_F32", []>;
19485 +defm V_CMPSX_U_F32 : VOPC_32 <0x00000058, "V_CMPSX_U_F32", []>;
19486 +defm V_CMPSX_NGE_F32 : VOPC_32 <0x00000059, "V_CMPSX_NGE_F32", []>;
19487 +defm V_CMPSX_NLG_F32 : VOPC_32 <0x0000005a, "V_CMPSX_NLG_F32", []>;
19488 +defm V_CMPSX_NGT_F32 : VOPC_32 <0x0000005b, "V_CMPSX_NGT_F32", []>;
19489 +defm V_CMPSX_NLE_F32 : VOPC_32 <0x0000005c, "V_CMPSX_NLE_F32", []>;
19490 +defm V_CMPSX_NEQ_F32 : VOPC_32 <0x0000005d, "V_CMPSX_NEQ_F32", []>;
19491 +defm V_CMPSX_NLT_F32 : VOPC_32 <0x0000005e, "V_CMPSX_NLT_F32", []>;
19492 +defm V_CMPSX_TRU_F32 : VOPC_32 <0x0000005f, "V_CMPSX_TRU_F32", []>;
19493 +defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64", []>;
19494 +defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64", []>;
19495 +defm V_CMPS_EQ_F64 : VOPC_64 <0x00000062, "V_CMPS_EQ_F64", []>;
19496 +defm V_CMPS_LE_F64 : VOPC_64 <0x00000063, "V_CMPS_LE_F64", []>;
19497 +defm V_CMPS_GT_F64 : VOPC_64 <0x00000064, "V_CMPS_GT_F64", []>;
19498 +defm V_CMPS_LG_F64 : VOPC_64 <0x00000065, "V_CMPS_LG_F64", []>;
19499 +defm V_CMPS_GE_F64 : VOPC_64 <0x00000066, "V_CMPS_GE_F64", []>;
19500 +defm V_CMPS_O_F64 : VOPC_64 <0x00000067, "V_CMPS_O_F64", []>;
19501 +defm V_CMPS_U_F64 : VOPC_64 <0x00000068, "V_CMPS_U_F64", []>;
19502 +defm V_CMPS_NGE_F64 : VOPC_64 <0x00000069, "V_CMPS_NGE_F64", []>;
19503 +defm V_CMPS_NLG_F64 : VOPC_64 <0x0000006a, "V_CMPS_NLG_F64", []>;
19504 +defm V_CMPS_NGT_F64 : VOPC_64 <0x0000006b, "V_CMPS_NGT_F64", []>;
19505 +defm V_CMPS_NLE_F64 : VOPC_64 <0x0000006c, "V_CMPS_NLE_F64", []>;
19506 +defm V_CMPS_NEQ_F64 : VOPC_64 <0x0000006d, "V_CMPS_NEQ_F64", []>;
19507 +defm V_CMPS_NLT_F64 : VOPC_64 <0x0000006e, "V_CMPS_NLT_F64", []>;
19508 +defm V_CMPS_TRU_F64 : VOPC_64 <0x0000006f, "V_CMPS_TRU_F64", []>;
19509 +defm V_CMPSX_F_F64 : VOPC_64 <0x00000070, "V_CMPSX_F_F64", []>;
19510 +defm V_CMPSX_LT_F64 : VOPC_64 <0x00000071, "V_CMPSX_LT_F64", []>;
19511 +defm V_CMPSX_EQ_F64 : VOPC_64 <0x00000072, "V_CMPSX_EQ_F64", []>;
19512 +defm V_CMPSX_LE_F64 : VOPC_64 <0x00000073, "V_CMPSX_LE_F64", []>;
19513 +defm V_CMPSX_GT_F64 : VOPC_64 <0x00000074, "V_CMPSX_GT_F64", []>;
19514 +defm V_CMPSX_LG_F64 : VOPC_64 <0x00000075, "V_CMPSX_LG_F64", []>;
19515 +defm V_CMPSX_GE_F64 : VOPC_64 <0x00000076, "V_CMPSX_GE_F64", []>;
19516 +defm V_CMPSX_O_F64 : VOPC_64 <0x00000077, "V_CMPSX_O_F64", []>;
19517 +defm V_CMPSX_U_F64 : VOPC_64 <0x00000078, "V_CMPSX_U_F64", []>;
19518 +defm V_CMPSX_NGE_F64 : VOPC_64 <0x00000079, "V_CMPSX_NGE_F64", []>;
19519 +defm V_CMPSX_NLG_F64 : VOPC_64 <0x0000007a, "V_CMPSX_NLG_F64", []>;
19520 +defm V_CMPSX_NGT_F64 : VOPC_64 <0x0000007b, "V_CMPSX_NGT_F64", []>;
19521 +defm V_CMPSX_NLE_F64 : VOPC_64 <0x0000007c, "V_CMPSX_NLE_F64", []>;
19522 +defm V_CMPSX_NEQ_F64 : VOPC_64 <0x0000007d, "V_CMPSX_NEQ_F64", []>;
19523 +defm V_CMPSX_NLT_F64 : VOPC_64 <0x0000007e, "V_CMPSX_NLT_F64", []>;
19524 +defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64", []>;
19525 +defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32", []>;
19526 +defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", []>;
19527 +def : Pat <
19528 +  (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_LT)),
19529 +  (V_CMP_LT_I32_e64 AllReg_32:$src0, VReg_32:$src1)
19530 +>;
19531 +defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", []>;
19532 +def : Pat <
19533 +  (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_EQ)),
19534 +  (V_CMP_EQ_I32_e64 AllReg_32:$src0, VReg_32:$src1)
19535 +>;
19536 +defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", []>;
19537 +def : Pat <
19538 +  (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_LE)),
19539 +  (V_CMP_LE_I32_e64 AllReg_32:$src0, VReg_32:$src1)
19540 +>;
19541 +defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", []>;
19542 +def : Pat <
19543 +  (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_GT)),
19544 +  (V_CMP_GT_I32_e64 AllReg_32:$src0, VReg_32:$src1)
19545 +>;
19546 +defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", []>;
19547 +def : Pat <
19548 +  (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_NE)),
19549 +  (V_CMP_NE_I32_e64 AllReg_32:$src0, VReg_32:$src1)
19550 +>;
19551 +defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", []>;
19552 +def : Pat <
19553 +  (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_GE)),
19554 +  (V_CMP_GE_I32_e64 AllReg_32:$src0, VReg_32:$src1)
19555 +>;
19556 +defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32", []>;
19557 +
19558 +let hasSideEffects = 1 in {
19559 +
19560 +defm V_CMPX_F_I32 : VOPC_32 <0x00000090, "V_CMPX_F_I32", []>;
19561 +defm V_CMPX_LT_I32 : VOPC_32 <0x00000091, "V_CMPX_LT_I32", []>;
19562 +defm V_CMPX_EQ_I32 : VOPC_32 <0x00000092, "V_CMPX_EQ_I32", []>;
19563 +defm V_CMPX_LE_I32 : VOPC_32 <0x00000093, "V_CMPX_LE_I32", []>;
19564 +defm V_CMPX_GT_I32 : VOPC_32 <0x00000094, "V_CMPX_GT_I32", []>;
19565 +defm V_CMPX_NE_I32 : VOPC_32 <0x00000095, "V_CMPX_NE_I32", []>;
19566 +defm V_CMPX_GE_I32 : VOPC_32 <0x00000096, "V_CMPX_GE_I32", []>;
19567 +defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32", []>;
19568 +
19569 +} // End hasSideEffects
19570 +
19571 +defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64", []>;
19572 +defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", []>;
19573 +defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64", []>;
19574 +defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64", []>;
19575 +defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64", []>;
19576 +defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", []>;
19577 +defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", []>;
19578 +defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64", []>;
19579 +
19580 +let hasSideEffects = 1 in {
19581 +
19582 +defm V_CMPX_F_I64 : VOPC_64 <0x000000b0, "V_CMPX_F_I64", []>;
19583 +defm V_CMPX_LT_I64 : VOPC_64 <0x000000b1, "V_CMPX_LT_I64", []>;
19584 +defm V_CMPX_EQ_I64 : VOPC_64 <0x000000b2, "V_CMPX_EQ_I64", []>;
19585 +defm V_CMPX_LE_I64 : VOPC_64 <0x000000b3, "V_CMPX_LE_I64", []>;
19586 +defm V_CMPX_GT_I64 : VOPC_64 <0x000000b4, "V_CMPX_GT_I64", []>;
19587 +defm V_CMPX_NE_I64 : VOPC_64 <0x000000b5, "V_CMPX_NE_I64", []>;
19588 +defm V_CMPX_GE_I64 : VOPC_64 <0x000000b6, "V_CMPX_GE_I64", []>;
19589 +defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64", []>;
19590 +
19591 +} // End hasSideEffects
19592 +
19593 +defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32", []>;
19594 +defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", []>;
19595 +defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32", []>;
19596 +defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32", []>;
19597 +defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32", []>;
19598 +defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", []>;
19599 +defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", []>;
19600 +defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32", []>;
19601 +
19602 +let hasSideEffects = 1 in {
19603 +
19604 +defm V_CMPX_F_U32 : VOPC_32 <0x000000d0, "V_CMPX_F_U32", []>;
19605 +defm V_CMPX_LT_U32 : VOPC_32 <0x000000d1, "V_CMPX_LT_U32", []>;
19606 +defm V_CMPX_EQ_U32 : VOPC_32 <0x000000d2, "V_CMPX_EQ_U32", []>;
19607 +defm V_CMPX_LE_U32 : VOPC_32 <0x000000d3, "V_CMPX_LE_U32", []>;
19608 +defm V_CMPX_GT_U32 : VOPC_32 <0x000000d4, "V_CMPX_GT_U32", []>;
19609 +defm V_CMPX_NE_U32 : VOPC_32 <0x000000d5, "V_CMPX_NE_U32", []>;
19610 +defm V_CMPX_GE_U32 : VOPC_32 <0x000000d6, "V_CMPX_GE_U32", []>;
19611 +defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32", []>;
19612 +
19613 +} // End hasSideEffects
19614 +
19615 +defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64", []>;
19616 +defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", []>;
19617 +defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64", []>;
19618 +defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64", []>;
19619 +defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64", []>;
19620 +defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", []>;
19621 +defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", []>;
19622 +defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64", []>;
19623 +defm V_CMPX_F_U64 : VOPC_64 <0x000000f0, "V_CMPX_F_U64", []>; // NOTE(review): V_CMPX defs, yet not in a hasSideEffects = 1 scope like the F32/F64/I32/I64/U32 V_CMPX groups - confirm
19624 +defm V_CMPX_LT_U64 : VOPC_64 <0x000000f1, "V_CMPX_LT_U64", []>;
19625 +defm V_CMPX_EQ_U64 : VOPC_64 <0x000000f2, "V_CMPX_EQ_U64", []>;
19626 +defm V_CMPX_LE_U64 : VOPC_64 <0x000000f3, "V_CMPX_LE_U64", []>;
19627 +defm V_CMPX_GT_U64 : VOPC_64 <0x000000f4, "V_CMPX_GT_U64", []>;
19628 +defm V_CMPX_NE_U64 : VOPC_64 <0x000000f5, "V_CMPX_NE_U64", []>;
19629 +defm V_CMPX_GE_U64 : VOPC_64 <0x000000f6, "V_CMPX_GE_U64", []>;
19630 +defm V_CMPX_T_U64 : VOPC_64 <0x000000f7, "V_CMPX_T_U64", []>;
19631 +defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32", []>;
19632 +defm V_CMPX_CLASS_F32 : VOPC_32 <0x00000098, "V_CMPX_CLASS_F32", []>;
19633 +defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64", []>;
19634 +defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64", []>;
19635 +//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
19636 +//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
19637 +//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>;
19638 +def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
19639 +//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>;
19640 +//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
19641 +//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>;
19642 +//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>;
19643 +//def BUFFER_LOAD_UBYTE : MUBUF_ <0x00000008, "BUFFER_LOAD_UBYTE", []>;
19644 +//def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>;
19645 +//def BUFFER_LOAD_USHORT : MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>;
19646 +//def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>;
19647 +//def BUFFER_LOAD_DWORD : MUBUF_ <0x0000000c, "BUFFER_LOAD_DWORD", []>;
19648 +//def BUFFER_LOAD_DWORDX2 : MUBUF_DWORDX2 <0x0000000d, "BUFFER_LOAD_DWORDX2", []>;
19649 +//def BUFFER_LOAD_DWORDX4 : MUBUF_DWORDX4 <0x0000000e, "BUFFER_LOAD_DWORDX4", []>;
19650 +//def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>;
19651 +//def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>;
19652 +//def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>;
19653 +//def BUFFER_STORE_DWORDX2 : MUBUF_DWORDX2 <0x0000001d, "BUFFER_STORE_DWORDX2", []>;
19654 +//def BUFFER_STORE_DWORDX4 : MUBUF_DWORDX4 <0x0000001e, "BUFFER_STORE_DWORDX4", []>;
19655 +//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
19656 +//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
19657 +//def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>;
19658 +//def BUFFER_ATOMIC_SUB : MUBUF_ <0x00000033, "BUFFER_ATOMIC_SUB", []>;
19659 +//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "BUFFER_ATOMIC_RSUB", []>;
19660 +//def BUFFER_ATOMIC_SMIN : MUBUF_ <0x00000035, "BUFFER_ATOMIC_SMIN", []>;
19661 +//def BUFFER_ATOMIC_UMIN : MUBUF_ <0x00000036, "BUFFER_ATOMIC_UMIN", []>;
19662 +//def BUFFER_ATOMIC_SMAX : MUBUF_ <0x00000037, "BUFFER_ATOMIC_SMAX", []>;
19663 +//def BUFFER_ATOMIC_UMAX : MUBUF_ <0x00000038, "BUFFER_ATOMIC_UMAX", []>;
19664 +//def BUFFER_ATOMIC_AND : MUBUF_ <0x00000039, "BUFFER_ATOMIC_AND", []>;
19665 +//def BUFFER_ATOMIC_OR : MUBUF_ <0x0000003a, "BUFFER_ATOMIC_OR", []>;
19666 +//def BUFFER_ATOMIC_XOR : MUBUF_ <0x0000003b, "BUFFER_ATOMIC_XOR", []>;
19667 +//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "BUFFER_ATOMIC_INC", []>;
19668 +//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "BUFFER_ATOMIC_DEC", []>;
19669 +//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "BUFFER_ATOMIC_FCMPSWAP", []>;
19670 +//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "BUFFER_ATOMIC_FMIN", []>;
19671 +//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "BUFFER_ATOMIC_FMAX", []>;
19672 +//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "BUFFER_ATOMIC_SWAP_X2", []>;
19673 +//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "BUFFER_ATOMIC_CMPSWAP_X2", []>;
19674 +//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "BUFFER_ATOMIC_ADD_X2", []>;
19675 +//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "BUFFER_ATOMIC_SUB_X2", []>;
19676 +//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "BUFFER_ATOMIC_RSUB_X2", []>;
19677 +//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "BUFFER_ATOMIC_SMIN_X2", []>;
19678 +//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "BUFFER_ATOMIC_UMIN_X2", []>;
19679 +//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "BUFFER_ATOMIC_SMAX_X2", []>;
19680 +//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "BUFFER_ATOMIC_UMAX_X2", []>;
19681 +//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "BUFFER_ATOMIC_AND_X2", []>;
19682 +//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "BUFFER_ATOMIC_OR_X2", []>;
19683 +//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "BUFFER_ATOMIC_XOR_X2", []>;
19684 +//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "BUFFER_ATOMIC_INC_X2", []>;
19685 +//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "BUFFER_ATOMIC_DEC_X2", []>;
19686 +//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "BUFFER_ATOMIC_FCMPSWAP_X2", []>;
19687 +//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "BUFFER_ATOMIC_FMIN_X2", []>;
19688 +//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>;
19689 +//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>;
19690 +//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>;
19691 +//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>;
19692 +//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>;
19693 +//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>;
19694 +def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", VReg_128>;
19695 +//def TBUFFER_STORE_FORMAT_X : MTBUF_ <0x00000004, "TBUFFER_STORE_FORMAT_X", []>;
19696 +//def TBUFFER_STORE_FORMAT_XY : MTBUF_ <0x00000005, "TBUFFER_STORE_FORMAT_XY", []>;
19697 +//def TBUFFER_STORE_FORMAT_XYZ : MTBUF_ <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", []>;
19698 +//def TBUFFER_STORE_FORMAT_XYZW : MTBUF_ <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", []>;
19699 +
19700 +defm S_LOAD_DWORD : SMRD_32 <0x00000000, "S_LOAD_DWORD", SReg_32>;
19701 +
19702 +//def S_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000001, "S_LOAD_DWORDX2", []>;
19703 +defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128, v4i32>;
19704 +defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256, v8i32>;
19705 +//def S_LOAD_DWORDX16 : SMRD_DWORDX16 <0x00000004, "S_LOAD_DWORDX16", []>;
19706 +//def S_BUFFER_LOAD_DWORD : SMRD_ <0x00000008, "S_BUFFER_LOAD_DWORD", []>;
19707 +//def S_BUFFER_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000009, "S_BUFFER_LOAD_DWORDX2", []>;
19708 +//def S_BUFFER_LOAD_DWORDX4 : SMRD_DWORDX4 <0x0000000a, "S_BUFFER_LOAD_DWORDX4", []>;
19709 +//def S_BUFFER_LOAD_DWORDX8 : SMRD_DWORDX8 <0x0000000b, "S_BUFFER_LOAD_DWORDX8", []>;
19710 +//def S_BUFFER_LOAD_DWORDX16 : SMRD_DWORDX16 <0x0000000c, "S_BUFFER_LOAD_DWORDX16", []>;
19711 +
19712 +//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
19713 +//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
19714 +//def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>;
19715 +//def IMAGE_LOAD_MIP : MIMG_NoPattern_ <"IMAGE_LOAD_MIP", 0x00000001>;
19716 +//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>;
19717 +//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>;
19718 +//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>;
19719 +//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK_SGN", 0x00000005>;
19720 +//def IMAGE_STORE : MIMG_NoPattern_ <"IMAGE_STORE", 0x00000008>;
19721 +//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>;
19722 +//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>;
19723 +//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>;
19724 +//def IMAGE_GET_RESINFO : MIMG_NoPattern_ <"IMAGE_GET_RESINFO", 0x0000000e>;
19725 +//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>;
19726 +//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>;
19727 +//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>;
19728 +//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_SUB", 0x00000012>;
19729 +//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_RSUB", 0x00000013>;
19730 +//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMIN", 0x00000014>;
19731 +//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMIN", 0x00000015>;
19732 +//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMAX", 0x00000016>;
19733 +//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMAX", 0x00000017>;
19734 +//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"IMAGE_ATOMIC_AND", 0x00000018>;
19735 +//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"IMAGE_ATOMIC_OR", 0x00000019>;
19736 +//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"IMAGE_ATOMIC_XOR", 0x0000001a>;
19737 +//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"IMAGE_ATOMIC_INC", 0x0000001b>;
19738 +//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"IMAGE_ATOMIC_DEC", 0x0000001c>;
19739 +//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>;
19740 +//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>;
19741 +//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>;
19742 +def IMAGE_SAMPLE : MIMG_Load_Helper <0x00000020, "IMAGE_SAMPLE">;
19743 +//def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>;
19744 +def IMAGE_SAMPLE_D : MIMG_Load_Helper <0x00000022, "IMAGE_SAMPLE_D">;
19745 +//def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>;
19746 +def IMAGE_SAMPLE_L : MIMG_Load_Helper <0x00000024, "IMAGE_SAMPLE_L">;
19747 +def IMAGE_SAMPLE_B : MIMG_Load_Helper <0x00000025, "IMAGE_SAMPLE_B">;
19748 +//def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>;
19749 +//def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>;
19750 +//def IMAGE_SAMPLE_C : MIMG_NoPattern_ <"IMAGE_SAMPLE_C", 0x00000028>;
19751 +//def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>;
19752 +//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D", 0x0000002a>;
19753 +//def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>;
19754 +//def IMAGE_SAMPLE_C_L : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L", 0x0000002c>;
19755 +//def IMAGE_SAMPLE_C_B : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B", 0x0000002d>;
19756 +//def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>;
19757 +//def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>;
19758 +//def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>;
19759 +//def IMAGE_SAMPLE_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL_O", 0x00000031>;
19760 +//def IMAGE_SAMPLE_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_O", 0x00000032>;
19761 +//def IMAGE_SAMPLE_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL_O", 0x00000033>;
19762 +//def IMAGE_SAMPLE_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_L_O", 0x00000034>;
19763 +//def IMAGE_SAMPLE_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_O", 0x00000035>;
19764 +//def IMAGE_SAMPLE_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL_O", 0x00000036>;
19765 +//def IMAGE_SAMPLE_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ_O", 0x00000037>;
19766 +//def IMAGE_SAMPLE_C_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_O", 0x00000038>;
19767 +//def IMAGE_SAMPLE_C_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL_O", 0x00000039>;
19768 +//def IMAGE_SAMPLE_C_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_O", 0x0000003a>;
19769 +//def IMAGE_SAMPLE_C_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL_O", 0x0000003b>;
19770 +//def IMAGE_SAMPLE_C_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L_O", 0x0000003c>;
19771 +//def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>;
19772 +//def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>;
19773 +//def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 0x0000003f>;
19774 +//def IMAGE_GATHER4 : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4", 0x00000040>;
19775 +//def IMAGE_GATHER4_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL", 0x00000041>;
19776 +//def IMAGE_GATHER4_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L", 0x00000044>;
19777 +//def IMAGE_GATHER4_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B", 0x00000045>;
19778 +//def IMAGE_GATHER4_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL", 0x00000046>;
19779 +//def IMAGE_GATHER4_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ", 0x00000047>;
19780 +//def IMAGE_GATHER4_C : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C", 0x00000048>;
19781 +//def IMAGE_GATHER4_C_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL", 0x00000049>;
19782 +//def IMAGE_GATHER4_C_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L", 0x0000004c>;
19783 +//def IMAGE_GATHER4_C_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B", 0x0000004d>;
19784 +//def IMAGE_GATHER4_C_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL", 0x0000004e>;
19785 +//def IMAGE_GATHER4_C_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ", 0x0000004f>;
19786 +//def IMAGE_GATHER4_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_O", 0x00000050>;
19787 +//def IMAGE_GATHER4_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL_O", 0x00000051>;
19788 +//def IMAGE_GATHER4_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L_O", 0x00000054>;
19789 +//def IMAGE_GATHER4_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_O", 0x00000055>;
19790 +//def IMAGE_GATHER4_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL_O", 0x00000056>;
19791 +//def IMAGE_GATHER4_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ_O", 0x00000057>;
19792 +//def IMAGE_GATHER4_C_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_O", 0x00000058>;
19793 +//def IMAGE_GATHER4_C_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL_O", 0x00000059>;
19794 +//def IMAGE_GATHER4_C_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L_O", 0x0000005c>;
19795 +//def IMAGE_GATHER4_C_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_O", 0x0000005d>;
19796 +//def IMAGE_GATHER4_C_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL_O", 0x0000005e>;
19797 +//def IMAGE_GATHER4_C_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ_O", 0x0000005f>;
19798 +//def IMAGE_GET_LOD : MIMG_NoPattern_ <"IMAGE_GET_LOD", 0x00000060>;
19799 +//def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>;
19800 +//def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>;
19801 +//def IMAGE_SAMPLE_C_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD", 0x0000006a>;
19802 +//def IMAGE_SAMPLE_C_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL", 0x0000006b>;
19803 +//def IMAGE_SAMPLE_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_O", 0x0000006c>;
19804 +//def IMAGE_SAMPLE_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL_O", 0x0000006d>;
19805 +//def IMAGE_SAMPLE_C_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_O", 0x0000006e>;
19806 +//def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>;
19807 +//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>;
19808 +//def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>;
19809 +//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>;
19810 +
19811 +let neverHasSideEffects = 1 in {
19812 +defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>;
19813 +}  // End neverHasSideEffects
19814 +defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>;
19815 +//defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>;
19816 +//defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>;
19817 +defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32",
19818 +  [(set VReg_32:$dst, (sint_to_fp AllReg_32:$src0))]
19819 +>;
19820 +//defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>;
19821 +//defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>;
19822 +defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32",
19823 +  [(set VReg_32:$dst, (fp_to_sint AllReg_32:$src0))]
19824 +>;
19825 +defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
19826 +////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>;
19827 +//defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", []>;
19828 +//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
19829 +//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
19830 +//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
19831 +//defm V_CVT_F32_F64 : VOP1_32 <0x0000000f, "V_CVT_F32_F64", []>;
19832 +//defm V_CVT_F64_F32 : VOP1_64 <0x00000010, "V_CVT_F64_F32", []>;
19833 +//defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>;
19834 +//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>;
19835 +//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>;
19836 +//defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>;
19837 +//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>;
19838 +//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>;
19839 +defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32",
19840 +  [(set VReg_32:$dst, (AMDGPUfract AllReg_32:$src0))]
19841 +>;
19842 +defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", []>;
19843 +defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", []>;
19844 +defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32",
19845 +  [(set VReg_32:$dst, (frint AllReg_32:$src0))]
19846 +>;
19847 +defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32",
19848 +  [(set VReg_32:$dst, (ffloor AllReg_32:$src0))]
19849 +>;
19850 +defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32",
19851 +  [(set VReg_32:$dst, (fexp2 AllReg_32:$src0))]
19852 +>;
19853 +defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>;
19854 +defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", []>;
19855 +defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>;
19856 +defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>;
19857 +defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32",  // only RCP variant carrying a pattern
19858 +  [(set VReg_32:$dst, (fdiv FP_ONE, AllReg_32:$src0))]  // matches FP_ONE / $src0 (FP_ONE defined elsewhere; presumably 1.0)
19859 +>;
19860 +defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>;
19861 +defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>;
19862 +defm V_RSQ_LEGACY_F32 : VOP1_32 <
19863 +  0x0000002d, "V_RSQ_LEGACY_F32",
19864 +  [(set VReg_32:$dst, (int_AMDGPU_rsq AllReg_32:$src0))]  // the rsq intrinsic selects the LEGACY form
19865 +>;
19866 +defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>;  // no pattern on the def itself
19867 +defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", []>;
19868 +defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>;
19869 +defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", []>;
19870 +defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>;
19871 +defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32", []>;
19872 +defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64", []>;
19873 +defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32", []>;
19874 +defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32", []>;
19875 +defm V_NOT_B32 : VOP1_32 <0x00000037, "V_NOT_B32", []>;
19876 +defm V_BFREV_B32 : VOP1_32 <0x00000038, "V_BFREV_B32", []>;
19877 +defm V_FFBH_U32 : VOP1_32 <0x00000039, "V_FFBH_U32", []>;
19878 +defm V_FFBL_B32 : VOP1_32 <0x0000003a, "V_FFBL_B32", []>;
19879 +defm V_FFBH_I32 : VOP1_32 <0x0000003b, "V_FFBH_I32", []>;
19880 +//defm V_FREXP_EXP_I32_F64 : VOP1_32 <0x0000003c, "V_FREXP_EXP_I32_F64", []>;
19881 +defm V_FREXP_MANT_F64 : VOP1_64 <0x0000003d, "V_FREXP_MANT_F64", []>;
19882 +defm V_FRACT_F64 : VOP1_64 <0x0000003e, "V_FRACT_F64", []>;
19883 +//defm V_FREXP_EXP_I32_F32 : VOP1_32 <0x0000003f, "V_FREXP_EXP_I32_F32", []>;
19884 +defm V_FREXP_MANT_F32 : VOP1_32 <0x00000040, "V_FREXP_MANT_F32", []>;
19885 +//def V_CLREXCP : VOP1_ <0x00000041, "V_CLREXCP", []>;
19886 +defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>;
19887 +defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>;
19888 +defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>;
19889 +
19890 +def V_INTERP_P1_F32 : VINTRP <  // VINTRP opcode 0; empty pattern list, so not matched by a def-level pattern
19891 +  0x00000000,
19892 +  (outs VReg_32:$dst),
19893 +  (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
19894 +  "V_INTERP_P1_F32",
19895 +  []> {
19896 +  let DisableEncoding = "$m0";  // $m0 remains an input operand but is excluded from the encoding
19897 +}
19898 +
19899 +def V_INTERP_P2_F32 : VINTRP <  // VINTRP opcode 1; empty pattern list
19900 +  0x00000001,
19901 +  (outs VReg_32:$dst),
19902 +  (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
19903 +  "V_INTERP_P2_F32",
19904 +  []> {
19905 +
19906 +  let Constraints = "$src0 = $dst";  // $src0 is tied to the result register
19907 +  let DisableEncoding = "$src0,$m0";  // neither the tied input nor $m0 is encoded
19908 +
19909 +}
19910 +
19911 +def V_INTERP_MOV_F32 : VINTRP <  // VINTRP opcode 2; empty pattern list
19912 +  0x00000002,
19913 +  (outs VReg_32:$dst),
19914 +  (ins i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
19915 +  "V_INTERP_MOV_F32",
19916 +  []> {
19917 +  let VSRC = 0;  // no VReg input in (ins ...); VSRC (presumably the VINTRP source field) is pinned to 0
19918 +  let DisableEncoding = "$m0";  // $m0 remains an input operand but is excluded from the encoding
19919 +}
19920 +
19921 +//def S_NOP : SOPP_ <0x00000000, "S_NOP", []>;
19922 +
19923 +let isTerminator = 1 in {
19924 +
19925 +def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM",  // SOPP opcode 1, takes no operands
19926 +  [(IL_retflag)]> {  // selected for the IL_retflag node
19927 +  let SIMM16 = 0;  // no immediate operand to supply it, so the field is fixed to 0
19928 +  let isBarrier = 1;  // control flow does not fall through past this instruction
19929 +  let hasCtrlDep = 1;
19930 +}
19931 +
19932 +let isBranch = 1 in {
19933 +def S_BRANCH : SOPP <  // SOPP opcode 2
19934 +  0x00000002, (ins brtarget:$target), "S_BRANCH",
19935 +  [(br bb:$target)]> {  // matches the unconditional `br` node
19936 +  let isBarrier = 1;  // unconditional: control flow does not fall through
19937 +}
19938 +
19939 +let DisableEncoding = "$scc" in {  // $scc is listed as an input but kept out of the encoding
19940 +def S_CBRANCH_SCC0 : SOPP <  // SOPP opcode 4; empty pattern list
19941 +  0x00000004, (ins brtarget:$target, SCCReg:$scc),
19942 +  "S_CBRANCH_SCC0", []
19943 +>;
19944 +def S_CBRANCH_SCC1 : SOPP <  // SOPP opcode 5; empty pattern list
19945 +  0x00000005, (ins brtarget:$target, SCCReg:$scc),
19946 +  "S_CBRANCH_SCC1",
19947 +  []
19948 +>;
19949 +} // End DisableEncoding = "$scc"
19950 +
19951 +def S_CBRANCH_VCCZ : SOPP <  // SOPP opcode 6; branch with VCC as a dependency operand
19952 +  0x00000006, (ins brtarget:$target, VCCReg:$vcc),
19953 +  "S_CBRANCH_VCCZ",
19954 +  []
19955 +> { let DisableEncoding = "$vcc"; }  // like $scc/$exec in the sibling branches: only $target is encoded
19956 +def S_CBRANCH_VCCNZ : SOPP <  // SOPP opcode 7; branch with VCC as a dependency operand
19957 +  0x00000007, (ins brtarget:$target, VCCReg:$vcc),
19958 +  "S_CBRANCH_VCCNZ",
19959 +  []
19960 +> { let DisableEncoding = "$vcc"; }  // like $scc/$exec in the sibling branches: only $target is encoded
19961 +
19962 +let DisableEncoding = "$exec" in {  // $exec is listed as an input but kept out of the encoding
19963 +def S_CBRANCH_EXECZ : SOPP <  // SOPP opcode 8; empty pattern list
19964 +  0x00000008, (ins brtarget:$target, EXECReg:$exec),
19965 +  "S_CBRANCH_EXECZ",
19966 +  []
19967 +>;
19968 +def S_CBRANCH_EXECNZ : SOPP <  // SOPP opcode 9; empty pattern list
19969 +  0x00000009, (ins brtarget:$target, EXECReg:$exec),
19970 +  "S_CBRANCH_EXECNZ",
19971 +  []
19972 +>;
19973 +} // End DisableEncoding = "$exec"
19974 +
19975 +
19976 +} // End isBranch = 1
19977 +} // End isTerminator = 1
19978 +
19979 +//def S_BARRIER : SOPP_ <0x0000000a, "S_BARRIER", []>;
19980 +let hasSideEffects = 1 in {  // S_WAITCNT has no outputs; presumably flagged so it is not treated as dead code
19981 +def S_WAITCNT : SOPP <0x0000000c, (ins i32imm:$simm16), "S_WAITCNT $simm16",  // SOPP opcode 0xc, one immediate operand
19982 +  []
19983 +>;
19984 +} // End hasSideEffects
19985 +//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>;
19986 +//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>;
19987 +//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>;
19988 +//def S_SENDMSG : SOPP_ <0x00000010, "S_SENDMSG", []>;
19989 +//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>;
19990 +//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>;
19991 +//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>;
19992 +//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>;
19993 +//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>;
19994 +//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>;
19995 +
19996 +def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst),  // VOP2 form, opcode 0
19997 +  (ins AllReg_32:$src0, VReg_32:$src1, VCCReg:$vcc), "V_CNDMASK_B32_e32",
19998 +  []  // no pattern here; the select patterns are attached to V_CNDMASK_B32_e64
19999 +>{
20000 +  let DisableEncoding = "$vcc";  // $vcc stays an input operand but is excluded from the encoding
20001 +}
20002 +
20003 +def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst),  // VOP3 form: the condition is an explicit SReg_1 operand
20004 +  (ins VReg_32:$src0, VReg_32:$src1, SReg_1:$src2, InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
20005 +  "V_CNDMASK_B32_e64",
20006 +  [(set (i32 VReg_32:$dst), (select SReg_1:$src2, VReg_32:$src1, VReg_32:$src0))]  // i32: $src2 ? $src1 : $src0
20007 +>;
20008 +
20009 +// f32 pattern for V_CNDMASK_B32_e64: same select ($src2 ? $src1 : $src0) typed as f32.
20010 +def : Pat <
20011 +  (f32 (select SReg_1:$src2, VReg_32:$src1, VReg_32:$src0)),
20012 +  (V_CNDMASK_B32_e64 VReg_32:$src0, VReg_32:$src1, SReg_1:$src2)  // InstFlag operands omitted; presumably they have defaults (TODO confirm)
20013 +>;
20014 +
20015 +defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>;
20016 +defm V_WRITELANE_B32 : VOP2_32 <0x00000002, "V_WRITELANE_B32", []>;
20017 +
20018 +defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32", []>;  // pattern supplied by the separate Pat that follows
20019 +def : Pat <
20020 +  (f32 (fadd AllReg_32:$src0, VReg_32:$src1)),
20021 +  (V_ADD_F32_e32  AllReg_32:$src0, VReg_32:$src1)  // targets the _e32 record produced by the defm
20022 +>;
20023 +
20024 +defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32", []>;  // pattern supplied by the separate Pat that follows
20025 +def : Pat <
20026 +  (f32 (fsub AllReg_32:$src0, VReg_32:$src1)),
20027 +  (V_SUB_F32_e32  AllReg_32:$src0, VReg_32:$src1)  // targets the _e32 record produced by the defm
20028 +>;
20029 +defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", []>;
20030 +defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>;
20031 +defm V_MUL_LEGACY_F32 : VOP2_32 <
20032 +  0x00000007, "V_MUL_LEGACY_F32",
20033 +  [(set VReg_32:$dst, (int_AMDGPU_mul AllReg_32:$src0, VReg_32:$src1))]
20034 +>;
20035 +
20036 +defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32",
20037 +  [(set VReg_32:$dst, (fmul AllReg_32:$src0, VReg_32:$src1))]
20038 +>;
20039 +//defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", []>;
20040 +//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>;
20041 +//defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", []>;
20042 +//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>;
20043 +defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32",
20044 +  [(set VReg_32:$dst, (AMDGPUfmin AllReg_32:$src0, VReg_32:$src1))]
20045 +>;
20046 +
20047 +defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32",
20048 +  [(set VReg_32:$dst, (AMDGPUfmax AllReg_32:$src0, VReg_32:$src1))]
20049 +>;
20050 +defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>;
20051 +defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>;
20052 +defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>;
20053 +defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>;
20054 +defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>;
20055 +defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>;
20056 +defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>;
20057 +defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", []>;
20058 +defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>;
20059 +defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", []>;
20060 +defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>;
20061 +defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", []>;
20062 +defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32",
20063 +  [(set VReg_32:$dst, (and AllReg_32:$src0, VReg_32:$src1))]
20064 +>;
20065 +defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32",
20066 +  [(set VReg_32:$dst, (or AllReg_32:$src0, VReg_32:$src1))]
20067 +>;
20068 +defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32",
20069 +  [(set VReg_32:$dst, (xor AllReg_32:$src0, VReg_32:$src1))]
20070 +>;
20071 +defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", []>;
20072 +defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
20073 +defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
20074 +defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
20075 +//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
20076 +//defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
20077 +//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
20078 +let Defs = [VCC] in { // Carry-out goes to VCC (applies to every add/sub variant in this group)
20079 +defm V_ADD_I32 : VOP2_32 <0x00000025, "V_ADD_I32",
20080 +  [(set VReg_32:$dst, (add (i32 AllReg_32:$src0), (i32 VReg_32:$src1)))]
20081 +>;
20082 +defm V_SUB_I32 : VOP2_32 <0x00000026, "V_SUB_I32",
20083 +  [(set VReg_32:$dst, (sub (i32 AllReg_32:$src0), (i32 VReg_32:$src1)))]
20084 +>;
20085 +defm V_SUBREV_I32 : VOP2_32 <0x00000027, "V_SUBREV_I32", []>;  // also writes its carry-out to VCC
20086 +let Uses = [VCC] in defm V_ADDC_U32 : VOP2_32 <0x00000028, "V_ADDC_U32", []>;  // carry-in comes from VCC
20087 +let Uses = [VCC] in defm V_SUBB_U32 : VOP2_32 <0x00000029, "V_SUBB_U32", []>;  // borrow-in comes from VCC
20088 +let Uses = [VCC] in defm V_SUBBREV_U32 : VOP2_32 <0x0000002a, "V_SUBBREV_U32", []>;  // borrow-in comes from VCC
20089 +} // End Defs = [VCC]
20090 +defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>;
20091 +////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "V_CVT_PKACCUM_U8_F32", []>;
20092 +////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>;
20093 +////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>;
20094 +defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32",
20095 + [(set VReg_32:$dst, (int_SI_packf16 AllReg_32:$src0, VReg_32:$src1))]
20096 +>;
20097 +////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>;
20098 +////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>;
20099 +def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32", []>;
20100 +def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32", []>;
20101 +def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32", []>;
20102 +def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32", []>;
20103 +def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32", []>;
20104 +def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32", []>;
20105 +def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32", []>;
20106 +def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32", []>;
20107 +def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32", []>;
20108 +def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32", []>;
20109 +def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32", []>;
20110 +def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32", []>;
20111 +////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>;
20112 +////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>;
20113 +////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>;
20114 +////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>;
20115 +//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>;
20116 +
20117 +let neverHasSideEffects = 1 in {
20118 +
20119 +def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>;
20120 +def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>;
20121 +//def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", []>;
20122 +//def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", []>;
20123 +
20124 +} // End neverHasSideEffects
20125 +def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>;
20126 +def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>;
20127 +def V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>;
20128 +def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
20129 +def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>;
20130 +def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>;
20131 +def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>;
20132 +def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>;
20133 +def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>;
20134 +//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
20135 +def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
20136 +def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>;
20137 +def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
20138 +////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>;
20139 +////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>;
20140 +////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>;
20141 +////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>;
20142 +////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>;
20143 +////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>;
20144 +////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>;
20145 +////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>;
20146 +////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>;
20147 +//def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>;
20148 +//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>;
20149 +//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
20150 +def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
20151 +////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
20152 +def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
20153 +def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>;
20154 +def V_LSHL_B64 : VOP3_64 <0x00000161, "V_LSHL_B64", []>;
20155 +def V_LSHR_B64 : VOP3_64 <0x00000162, "V_LSHR_B64", []>;
20156 +def V_ASHR_I64 : VOP3_64 <0x00000163, "V_ASHR_I64", []>;
20157 +def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>;
20158 +def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>;
20159 +def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>;
20160 +def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>;
20161 +def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>;
20162 +def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>;
20163 +def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>;
20164 +def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>;
20165 +def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
20166 +def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
20167 +def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
20168 +def V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>;
20169 +def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>;
20170 +//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
20171 +//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
20172 +//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
20173 +def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>;
20174 +def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>;
20175 +def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>;
20176 +def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", []>;
20177 +def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", []>;
20178 +def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", []>;
20179 +def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", []>;
20180 +def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", []>;
20181 +def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", []>;
20182 +def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", []>;
20183 +def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", []>;
20184 +
20185 +def S_CSELECT_B32 : SOP2 <  // SOP2 opcode 0xa
20186 +  0x0000000a, (outs SReg_32:$dst),
20187 +  (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32",  // NOTE(review): $scc is not in a DisableEncoding list, unlike S_CBRANCH_SCC* - confirm
20188 +  [(set (i32 SReg_32:$dst), (select SCCReg:$scc, SReg_32:$src0, SReg_32:$src1))]  // i32: $scc ? $src0 : $src1
20189 +>;
20190 +
20191 +def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>;  // 64-bit form, no pattern
20192 +
20193 +// f32 pattern for S_CSELECT_B32: same select ($scc ? $src0 : $src1) typed as f32.
20194 +def : Pat <
20195 +  (f32 (select SCCReg:$scc, SReg_32:$src0, SReg_32:$src1)),
20196 +  (S_CSELECT_B32 SReg_32:$src0, SReg_32:$src1, SCCReg:$scc)
20197 +>;
20198 +
20199 +def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", []>;
20200 +
20201 +def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64",  // plain 64-bit `and`, result in SReg_64
20202 +  [(set SReg_64:$dst, (and SReg_64:$src0, SReg_64:$src1))]
20203 +>;
20204 +def S_AND_VCC : SOP2_VCC <0x0000000f, "S_AND_B64",  // same opcode and asm name as S_AND_B64; differs in class and result ($vcc)
20205 +  [(set SReg_1:$vcc, (SIvcc_and SReg_64:$src0, SReg_64:$src1))]  // matches the SIvcc_and node, producing an SReg_1
20206 +>;
20207 +def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", []>;
20208 +def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", []>;
20209 +def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>;
20210 +def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>;
20211 +def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>;
20212 +def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>;
20213 +def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>;
20214 +def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>;
20215 +def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>;
20216 +def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>;
20217 +def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>;
20218 +def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>;
20219 +def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>;
20220 +def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>;
20221 +def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", []>;
20222 +def S_LSHL_B64 : SOP2_64 <0x0000001f, "S_LSHL_B64", []>;
20223 +def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", []>;
20224 +def S_LSHR_B64 : SOP2_64 <0x00000021, "S_LSHR_B64", []>;
20225 +def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", []>;
20226 +def S_ASHR_I64 : SOP2_64 <0x00000023, "S_ASHR_I64", []>;
20227 +def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>;
20228 +def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>;
20229 +def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>;
20230 +def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>;
20231 +def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>;
20232 +def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>;
20233 +def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>;
20234 +//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>;
20235 +def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
20236 +
20237 +class V_MOV_IMM <Operand immType, SDNode immNode> : InstSI <  // template: immediate operand type + the node to match
20238 +  (outs VReg_32:$dst),
20239 +  (ins immType:$src0),
20240 +  "V_MOV_IMM",
20241 +   [(set VReg_32:$dst, (immNode:$src0))]  // materialises a matched immediate into a VReg_32
20242 +>;
20243 +
20244 +let isCodeGenOnly = 1, isPseudo = 1 in {  // immediate-move pseudos: everything in this scope is a codegen-only pseudo
20245 +
20246 +def V_MOV_IMM_I32 : V_MOV_IMM<i32imm, imm>;  // integer immediate -> VReg_32
20247 +def V_MOV_IMM_F32 : V_MOV_IMM<f32imm, fpimm>;  // floating-point immediate -> VReg_32
20248 +
20249 +def S_MOV_IMM_I32 : InstSI <  // integer immediate -> SReg_32
20250 +  (outs SReg_32:$dst),
20251 +  (ins i32imm:$src0),
20252 +  "S_MOV_IMM_I32",
20253 +  [(set SReg_32:$dst, (imm:$src0))]
20254 +>;
20255 +
20256 +// i64 immediates aren't really supported in hardware, but LLVM will use the i64
20257 +// type for indices on load and store instructions.  The pattern for
20258 +// S_MOV_IMM_I64 will only match i64 immediates that can fit into 32-bits,
20259 +// which the hardware can handle.
20260 +def S_MOV_IMM_I64 : InstSI <
20261 +  (outs SReg_64:$dst),
20262 +  (ins i64imm:$src0),
20263 +  "S_MOV_IMM_I64 $dst, $src0",
20264 +  [(set SReg_64:$dst, (IMM32bitIn64bit:$src0))]  // IMM32bitIn64bit (defined elsewhere) restricts which i64 immediates match
20265 +>;
20266 +
20267 +} // End isCodeGenOnly, isPseudo = 1
20268 +
20269 +class SI_LOAD_LITERAL<Operand ImmType> :  // 32-bit record whose whole encoding is the immediate operand
20270 +    Enc32 <(outs), (ins ImmType:$imm), "LOAD_LITERAL $imm", []> {
20271 +
20272 +  bits<32> imm;
20273 +  let Inst{31-0} = imm;  // all 32 encoding bits come from $imm
20274 +}
20275 +
20276 +def SI_LOAD_LITERAL_I32 : SI_LOAD_LITERAL<i32imm>;  // integer literal
20277 +def SI_LOAD_LITERAL_F32 : SI_LOAD_LITERAL<f32imm>;  // floating-point literal
20278 +
20279 +let isCodeGenOnly = 1, isPseudo = 1 in {
20280 +
20281 +def SET_M0 : InstSI <  // pseudo selected for the int_SI_set_M0 intrinsic
20282 +  (outs SReg_32:$dst),
20283 +  (ins i32imm:$src0),
20284 +  "SET_M0",
20285 +  [(set SReg_32:$dst, (int_SI_set_M0 imm:$src0))]  // immediate argument only; result is modelled as an SReg_32
20286 +>;
20287 +
20288 +def LOAD_CONST : AMDGPUShaderInst <
20289 +  (outs GPRF32:$dst),
20290 +  (ins i32imm:$src),
20291 +  "LOAD_CONST $dst, $src",
20292 +  [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
20293 +>;
20294 +
20295 +let usesCustomInserter = 1 in {
20296 +
20297 +def SI_V_CNDLT : InstSI <
20298 +  (outs VReg_32:$dst),
20299 +  (ins VReg_32:$src0, VReg_32:$src1, VReg_32:$src2),
20300 +  "SI_V_CNDLT $dst, $src0, $src1, $src2",
20301 +  [(set VReg_32:$dst, (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2))]
20302 +>;
20303 +
20304 +def SI_INTERP : InstSI <
20305 +  (outs VReg_32:$dst),
20306 +  (ins VReg_32:$i, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, SReg_32:$params),
20307 +  "SI_INTERP $dst, $i, $j, $attr_chan, $attr, $params",
20308 +  []
20309 +>;
20310 +
20311 +def SI_INTERP_CONST : InstSI <
20312 +  (outs VReg_32:$dst),
20313 +  (ins i32imm:$attr_chan, i32imm:$attr, SReg_32:$params),
20314 +  "SI_INTERP_CONST $dst, $attr_chan, $attr, $params",
20315 +  [(set VReg_32:$dst, (int_SI_fs_interp_constant imm:$attr_chan,
20316 +                                                 imm:$attr, SReg_32:$params))]
20317 +>;
20318 +
20319 +def SI_WQM : InstSI <
20320 +  (outs),
20321 +  (ins),
20322 +  "SI_WQM",
20323 +  [(int_SI_wqm)]
20324 +>;
20325 +
20326 +} // end usesCustomInserter
20327 +
20328 +// SI Pseudo instructions. These are used by the CFG structurizer pass
20329 +// and should be lowered to ISA instructions prior to codegen.
20330 +
20331 +let mayLoad = 1, mayStore = 1, hasSideEffects = 1,
20332 +    Uses = [EXEC], Defs = [EXEC] in {
20333 +
20334 +let isBranch = 1, isTerminator = 1 in {
20335 +
20336 +def SI_IF : InstSI <
20337 +  (outs SReg_64:$dst),
20338 +  (ins SReg_1:$vcc, brtarget:$target),
20339 +  "SI_IF",
20340 +  [(set SReg_64:$dst, (int_SI_if SReg_1:$vcc, bb:$target))]
20341 +>;
20342 +
20343 +def SI_ELSE : InstSI <
20344 +  (outs SReg_64:$dst),
20345 +  (ins SReg_64:$src, brtarget:$target),
20346 +  "SI_ELSE",
20347 +  [(set SReg_64:$dst, (int_SI_else SReg_64:$src, bb:$target))]> {
20348 +
20349 +  let Constraints = "$src = $dst";
20350 +}
20351 +
20352 +def SI_LOOP : InstSI <
20353 +  (outs),
20354 +  (ins SReg_64:$saved, brtarget:$target),
20355 +  "SI_LOOP",
20356 +  [(int_SI_loop SReg_64:$saved, bb:$target)]
20357 +>;
20358 +
20359 +} // end isBranch = 1, isTerminator = 1
20360 +
20361 +def SI_BREAK : InstSI < // pseudo selected for int_SI_break
20362 +  (outs SReg_64:$dst),
20363 +  (ins SReg_64:$src),
20364 +  "SI_BREAK",
20365 +  [(set SReg_64:$dst, (int_SI_break SReg_64:$src))]
20366 +>;
20367 +
20368 +def SI_IF_BREAK : InstSI <
20369 +  (outs SReg_64:$dst),
20370 +  (ins SReg_1:$vcc, SReg_64:$src),
20371 +  "SI_IF_BREAK",
20372 +  [(set SReg_64:$dst, (int_SI_if_break SReg_1:$vcc, SReg_64:$src))]
20373 +>;
20374 +
20375 +def SI_ELSE_BREAK : InstSI <
20376 +  (outs SReg_64:$dst),
20377 +  (ins SReg_64:$src0, SReg_64:$src1),
20378 +  "SI_ELSE_BREAK",
20379 +  [(set SReg_64:$dst, (int_SI_else_break SReg_64:$src0, SReg_64:$src1))]
20380 +>;
20381 +
20382 +def SI_END_CF : InstSI <
20383 +  (outs),
20384 +  (ins SReg_64:$saved),
20385 +  "SI_END_CF",
20386 +  [(int_SI_end_cf SReg_64:$saved)]
20387 +>;
20388 +
20389 +def SI_KILL : InstSI < // pseudo selected for int_AMDGPU_kill
20390 +  (outs),
20391 +  (ins VReg_32:$src),
20392 +  "SI_KILL $src",
20393 +  [(int_AMDGPU_kill VReg_32:$src)]
20394 +>;
20395 +
20396 +} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
20397 +  // Uses = [EXEC], Defs = [EXEC]
20398 +
20399 +} // end IsCodeGenOnly, isPseudo
20400 +
20401 +def : Pat <
20402 +  (int_AMDGPU_kilp),
20403 +  (SI_KILL (V_MOV_IMM_I32 0xbf800000))
20404 +>;
20405 +
20406 +/* int_SI_vs_load_input */
20407 +def : Pat<
20408 +  (int_SI_vs_load_input SReg_128:$tlst, IMM12bit:$attr_offset,
20409 +                        VReg_32:$buf_idx_vgpr),
20410 +  (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0,
20411 +                           VReg_32:$buf_idx_vgpr, SReg_128:$tlst,
20412 +                           0, 0, (i32 SREG_LIT_0))
20413 +>;
20414 +
20415 +/* int_SI_export */
20416 +def : Pat <
20417 +  (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
20418 +                 VReg_32:$src0,VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
20419 +  (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm,
20420 +       VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3)
20421 +>;
20422 +
20423 +/* int_SI_sample */
20424 +def : Pat <
20425 +  (int_SI_sample imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler, imm),
20426 +  (IMAGE_SAMPLE imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord,
20427 +                SReg_256:$rsrc, SReg_128:$sampler)
20428 +>;
20429 +
20430 +def : Pat <
20431 +  (int_SI_sample imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler, TEX_RECT),
20432 +  (IMAGE_SAMPLE imm:$writemask, 1, 0, 0, 0, 0, 0, 0, VReg_128:$coord,
20433 +                SReg_256:$rsrc, SReg_128:$sampler)
20434 +>;
20435 +
20436 +/* int_SI_sample_lod */
20437 +def : Pat <
20438 +  (int_SI_sample_lod imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler, imm),
20439 +  (IMAGE_SAMPLE_L imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord,
20440 +                  SReg_256:$rsrc, SReg_128:$sampler)
20441 +>;
20442 +
20443 +/* int_SI_sample_bias */
20444 +def : Pat <
20445 +  (int_SI_sample_bias imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler, imm),
20446 +  (IMAGE_SAMPLE_B imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord,
20447 +                  SReg_256:$rsrc, SReg_128:$sampler)
20448 +>;
20449 +
20450 +def CLAMP_SI : CLAMP<VReg_32>;
20451 +def FABS_SI : FABS<VReg_32>;
20452 +def FNEG_SI : FNEG<VReg_32>;
20453 +
20454 +def : Extract_Element <f32, v4f32, VReg_128, 0, sel_x>;
20455 +def : Extract_Element <f32, v4f32, VReg_128, 1, sel_y>;
20456 +def : Extract_Element <f32, v4f32, VReg_128, 2, sel_z>;
20457 +def : Extract_Element <f32, v4f32, VReg_128, 3, sel_w>;
20458 +
20459 +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 4, sel_x>;
20460 +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 5, sel_y>;
20461 +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 6, sel_z>;
20462 +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 7, sel_w>;
20463 +
20464 +def : Vector_Build <v4f32, VReg_128, f32, VReg_32>;
20465 +def : Vector_Build <v4i32, SReg_128, i32, SReg_32>;
20466 +
20467 +def : BitConvert <i32, f32, SReg_32>;
20468 +def : BitConvert <i32, f32, VReg_32>;
20469 +
20470 +def : BitConvert <f32, i32, SReg_32>;
20471 +def : BitConvert <f32, i32, VReg_32>;
20472 +
20473 +def : Pat <
20474 +  (i64 (SIsreg1_bitcast SReg_1:$vcc)),
20475 +  (S_MOV_B64 (COPY_TO_REGCLASS SReg_1:$vcc, SReg_64))
20476 +>;
20477 +
20478 +def : Pat <
20479 +  (i1 (SIsreg1_bitcast SReg_64:$vcc)),
20480 +  (COPY_TO_REGCLASS SReg_64:$vcc, SReg_1)
20481 +>;
20482 +
20483 +def : Pat <
20484 +  (i64 (SIvcc_bitcast VCCReg:$vcc)),
20485 +  (S_MOV_B64 (COPY_TO_REGCLASS VCCReg:$vcc, SReg_64))
20486 +>;
20487 +
20488 +def : Pat <
20489 +  (i1 (SIvcc_bitcast SReg_64:$vcc)),
20490 +  (COPY_TO_REGCLASS SReg_64:$vcc, VCCReg)
20491 +>;
20492 +
20493 +/********** ===================== **********/
20494 +/********** Interpolation Patterns *********/
20495 +/********** ===================== **********/
20496 +
20497 +def : Pat <
20498 +  (int_SI_fs_interp_linear_center imm:$attr_chan, imm:$attr, SReg_32:$params),
20499 +  (SI_INTERP (f32 LINEAR_CENTER_I), (f32 LINEAR_CENTER_J), imm:$attr_chan,
20500 +             imm:$attr, SReg_32:$params)
20501 +>;
20502 +
20503 +def : Pat <
20504 +  (int_SI_fs_interp_linear_centroid imm:$attr_chan, imm:$attr, SReg_32:$params),
20505 +  (SI_INTERP (f32 LINEAR_CENTROID_I), (f32 LINEAR_CENTROID_J), imm:$attr_chan,
20506 +             imm:$attr, SReg_32:$params)
20507 +>;
20508 +
20509 +def : Pat <
20510 +  (int_SI_fs_interp_persp_center imm:$attr_chan, imm:$attr, SReg_32:$params),
20511 +  (SI_INTERP (f32 PERSP_CENTER_I), (f32 PERSP_CENTER_J), imm:$attr_chan,
20512 +             imm:$attr, SReg_32:$params)
20513 +>;
20514 +
20515 +def : Pat <
20516 +  (int_SI_fs_interp_persp_centroid imm:$attr_chan, imm:$attr, SReg_32:$params),
20517 +  (SI_INTERP (f32 PERSP_CENTROID_I), (f32 PERSP_CENTROID_J), imm:$attr_chan,
20518 +             imm:$attr, SReg_32:$params)
20519 +>;
20520 +
20521 +def : Pat <
20522 +  (int_SI_fs_read_face),
20523 +  (f32 FRONT_FACE)
20524 +>;
20525 +
20526 +def : Pat <
20527 +  (int_SI_fs_read_pos 0),
20528 +  (f32 POS_X_FLOAT)
20529 +>;
20530 +
20531 +def : Pat <
20532 +  (int_SI_fs_read_pos 1),
20533 +  (f32 POS_Y_FLOAT)
20534 +>;
20535 +
20536 +def : Pat <
20537 +  (int_SI_fs_read_pos 2),
20538 +  (f32 POS_Z_FLOAT)
20539 +>;
20540 +
20541 +def : Pat <
20542 +  (int_SI_fs_read_pos 3),
20543 +  (f32 POS_W_FLOAT)
20544 +>;
20545 +
20546 +/********** ================== **********/
20547 +/********** Intrinsic Patterns **********/
20548 +/********** ================== **********/
20549 +
20550 +/* llvm.AMDGPU.pow */
20551 +/* XXX: We are using IEEE MUL, not the 0 * anything = 0 MUL, is this correct? */
20552 +def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_F32_e32, VReg_32>;
20553 +
20554 +def : Pat <
20555 +  (int_AMDGPU_div AllReg_32:$src0, AllReg_32:$src1),
20556 +  (V_MUL_LEGACY_F32_e32 AllReg_32:$src0, (V_RCP_LEGACY_F32_e32 AllReg_32:$src1))
20557 +>;
20558 +
20559 +def : Pat<
20560 +  (fdiv AllReg_32:$src0, AllReg_32:$src1),
20561 +  (V_MUL_F32_e32 AllReg_32:$src0, (V_RCP_F32_e32 AllReg_32:$src1))
20562 +>;
20563 +
20564 +def : Pat <
20565 +  (int_AMDGPU_cube VReg_128:$src),
20566 +  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
20567 +    (V_CUBETC_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x),
20568 +                  (EXTRACT_SUBREG VReg_128:$src, sel_y),
20569 +                  (EXTRACT_SUBREG VReg_128:$src, sel_z),
20570 +                  0, 0, 0, 0), sel_x),
20571 +    (V_CUBESC_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x),
20572 +                  (EXTRACT_SUBREG VReg_128:$src, sel_y),
20573 +                  (EXTRACT_SUBREG VReg_128:$src, sel_z),
20574 +                  0, 0, 0, 0), sel_y),
20575 +    (V_CUBEMA_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x),
20576 +                  (EXTRACT_SUBREG VReg_128:$src, sel_y),
20577 +                  (EXTRACT_SUBREG VReg_128:$src, sel_z),
20578 +                  0, 0, 0, 0), sel_z),
20579 +    (V_CUBEID_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x),
20580 +                  (EXTRACT_SUBREG VReg_128:$src, sel_y),
20581 +                  (EXTRACT_SUBREG VReg_128:$src, sel_z),
20582 +                  0, 0, 0, 0), sel_w)
20583 +>;
20584 +
20585 +/********** ================== **********/
20586 +/**********   VOP3 Patterns    **********/
20587 +/********** ================== **********/
20588 +
20589 +def : Pat <(f32 (IL_mad AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2)),
20590 +           (V_MAD_LEGACY_F32 AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2,
20591 +            0, 0, 0, 0)>;
20592 +
20593 +} // End isSI predicate
20594 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIIntrinsics.td llvm-r600/lib/Target/R600/SIIntrinsics.td
20595 --- llvm-3.2.src/lib/Target/R600/SIIntrinsics.td        1970-01-01 01:00:00.000000000 +0100
20596 +++ llvm-r600/lib/Target/R600/SIIntrinsics.td   2013-01-25 19:43:57.480049720 +0100
20597 @@ -0,0 +1,54 @@
20598 +//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===//
20599 +//
20600 +//                     The LLVM Compiler Infrastructure
20601 +//
20602 +// This file is distributed under the University of Illinois Open Source
20603 +// License. See LICENSE.TXT for details.
20604 +//
20605 +//===----------------------------------------------------------------------===//
20606 +//
20607 +// SI Intrinsic Definitions
20608 +//
20609 +//===----------------------------------------------------------------------===//
20610 +
20611 +
20612 +let TargetPrefix = "SI", isTarget = 1 in {
20613 +
20614 +  def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
20615 +  def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
20616 +  /* XXX: We may need a separate intrinsic here for loading integer values */
20617 +  def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_i64_ty, llvm_i32_ty], []>;
20618 +  def int_SI_vs_load_buffer_index : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>;
20619 +  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v4i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem]> ;
20620 +  def int_SI_wqm : Intrinsic <[], [], []>;
20621 +
20622 +  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_v4f32_ty, llvm_v8i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrReadMem]>;
20623 +
20624 +  def int_SI_sample : Sample;
20625 +  def int_SI_sample_bias : Sample;
20626 +  def int_SI_sample_lod : Sample;
20627 +
20628 +  /* Interpolation Intrinsics */
20629 +
20630 +  def int_SI_set_M0 : Intrinsic <[llvm_i32_ty], [llvm_i32_ty]>;
20631 +  class Interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>;
20632 +
20633 +  def int_SI_fs_interp_linear_center : Interp;
20634 +  def int_SI_fs_interp_linear_centroid : Interp;
20635 +  def int_SI_fs_interp_persp_center : Interp;
20636 +  def int_SI_fs_interp_persp_centroid : Interp;
20637 +  def int_SI_fs_interp_constant : Interp;
20638 +
20639 +  def int_SI_fs_read_face : Intrinsic <[llvm_float_ty], [], [IntrNoMem]>;
20640 +  def int_SI_fs_read_pos : Intrinsic <[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
20641 +
20642 +  /* Control flow Intrinsics */
20643 +
20644 +  def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
20645 +  def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
20646 +  def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
20647 +  def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
20648 +  def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
20649 +  def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
20650 +  def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
20651 +}
20652 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIISelLowering.cpp llvm-r600/lib/Target/R600/SIISelLowering.cpp
20653 --- llvm-3.2.src/lib/Target/R600/SIISelLowering.cpp     1970-01-01 01:00:00.000000000 +0100
20654 +++ llvm-r600/lib/Target/R600/SIISelLowering.cpp        2013-01-25 19:43:57.470049720 +0100
20655 @@ -0,0 +1,486 @@
20656 +//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
20657 +//
20658 +//                     The LLVM Compiler Infrastructure
20659 +//
20660 +// This file is distributed under the University of Illinois Open Source
20661 +// License. See LICENSE.TXT for details.
20662 +//
20663 +//===----------------------------------------------------------------------===//
20664 +//
20665 +/// \file
20666 +/// \brief Custom DAG lowering for SI
20667 +//
20668 +//===----------------------------------------------------------------------===//
20669 +
20670 +#include "SIISelLowering.h"
20671 +#include "AMDIL.h"
20672 +#include "AMDILIntrinsicInfo.h"
20673 +#include "SIInstrInfo.h"
20674 +#include "SIMachineFunctionInfo.h"
20675 +#include "SIRegisterInfo.h"
20676 +#include "llvm/CodeGen/MachineInstrBuilder.h"
20677 +#include "llvm/CodeGen/MachineRegisterInfo.h"
20678 +#include "llvm/CodeGen/SelectionDAG.h"
20679 +
20680 +using namespace llvm;
20681 +
20682 +SITargetLowering::SITargetLowering(TargetMachine &TM) :
20683 +    AMDGPUTargetLowering(TM),
20684 +    TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())) {
20685 +  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
20686 +  addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
20687 +  addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass);
20688 +  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
20689 +  addRegisterClass(MVT::i1, &AMDGPU::SCCRegRegClass);
20690 +  addRegisterClass(MVT::i1, &AMDGPU::VCCRegRegClass);
20691 +  // NOTE(review): MVT::i1 is registered with two classes above -- confirm.
20692 +  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
20693 +  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
20694 +
20695 +  computeRegisterProperties();
20696 +
20697 +  setOperationAction(ISD::AND, MVT::i1, Custom); // see Loweri1ContextSwitch
20698 +
20699 +  setOperationAction(ISD::ADD, MVT::i64, Legal);
20700 +  setOperationAction(ISD::ADD, MVT::i32, Legal);
20701 +
20702 +  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
20703 +
20704 +  // We need to custom lower loads from the USER_SGPR address space, so we can
20705 +  // add the SGPRs as livein registers.
20706 +  setOperationAction(ISD::LOAD, MVT::i32, Custom);
20707 +  setOperationAction(ISD::LOAD, MVT::i64, Custom);
20708 +
20709 +  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); // see LowerSELECT_CC
20710 +  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
20711 +
20712 +  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
20713 +  setTargetDAGCombine(ISD::SELECT_CC); // handled in PerformDAGCombine
20714 +
20715 +  setTargetDAGCombine(ISD::SETCC);
20716 +}
20717 +
20718 +MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
20719 +    MachineInstr * MI, MachineBasicBlock * BB) const {
20720 +  // Expands the pseudo MI in place using the TII member (no shadowing local).
20721 +  MachineRegisterInfo & MRI = BB->getParent()->getRegInfo();
20722 +  MachineBasicBlock::iterator I = MI;
20723 +
20724 +  switch (MI->getOpcode()) {
20725 +  default:
20726 +    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
20727 +  case AMDGPU::BRANCH: return BB;
20728 +  case AMDGPU::CLAMP_SI:
20729 +    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
20730 +           .addOperand(MI->getOperand(0))
20731 +           .addOperand(MI->getOperand(1))
20732 +           // VSRC1-2 are unused, but we still need to fill all the
20733 +           // operand slots, so we just reuse the VSRC0 operand
20734 +           .addOperand(MI->getOperand(1))
20735 +           .addOperand(MI->getOperand(1))
20736 +           .addImm(0) // ABS
20737 +           .addImm(1) // CLAMP
20738 +           .addImm(0) // OMOD
20739 +           .addImm(0); // NEG
20740 +    MI->eraseFromParent();
20741 +    break;
20742 +
20743 +  case AMDGPU::FABS_SI:
20744 +    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
20745 +           .addOperand(MI->getOperand(0))
20746 +           .addOperand(MI->getOperand(1))
20747 +           // VSRC1-2 are unused, but we still need to fill all the
20748 +           // operand slots, so we just reuse the VSRC0 operand
20749 +           .addOperand(MI->getOperand(1))
20750 +           .addOperand(MI->getOperand(1))
20751 +           .addImm(1) // ABS
20752 +           .addImm(0) // CLAMP
20753 +           .addImm(0) // OMOD
20754 +           .addImm(0); // NEG
20755 +    MI->eraseFromParent();
20756 +    break;
20757 +
20758 +  case AMDGPU::FNEG_SI:
20759 +    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
20760 +           .addOperand(MI->getOperand(0))
20761 +           .addOperand(MI->getOperand(1))
20762 +           // VSRC1-2 are unused, but we still need to fill all the
20763 +           // operand slots, so we just reuse the VSRC0 operand
20764 +           .addOperand(MI->getOperand(1))
20765 +           .addOperand(MI->getOperand(1))
20766 +           .addImm(0) // ABS
20767 +           .addImm(0) // CLAMP
20768 +           .addImm(0) // OMOD
20769 +           .addImm(1); // NEG
20770 +    MI->eraseFromParent();
20771 +    break;
20772 +  case AMDGPU::SHADER_TYPE:
20773 +    BB->getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType =
20774 +                                        MI->getOperand(0).getImm();
20775 +    MI->eraseFromParent();
20776 +    break;
20777 +
20778 +  case AMDGPU::SI_INTERP:
20779 +    LowerSI_INTERP(MI, *BB, I, MRI);
20780 +    break;
20781 +  case AMDGPU::SI_INTERP_CONST:
20782 +    LowerSI_INTERP_CONST(MI, *BB, I, MRI);
20783 +    break;
20784 +  case AMDGPU::SI_WQM:
20785 +    LowerSI_WQM(MI, *BB, I, MRI);
20786 +    break;
20787 +  case AMDGPU::SI_V_CNDLT:
20788 +    LowerSI_V_CNDLT(MI, *BB, I, MRI);
20789 +    break;
20790 +  }
20791 +  return BB;
20792 +}
20793 +
20794 +void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
20795 +    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
20796 +  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC)
20797 +          .addReg(AMDGPU::EXEC);
20798 +  // EXEC = S_WQM_B64(EXEC) has been emitted before I; drop the pseudo MI.
20799 +  MI->eraseFromParent();
20800 +}
20801 +
20802 +void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
20803 +    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
20804 +  unsigned tmp = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
20805 +  unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass);
20806 +  MachineOperand dst = MI->getOperand(0);
20807 +  MachineOperand iReg = MI->getOperand(1);
20808 +  MachineOperand jReg = MI->getOperand(2);
20809 +  MachineOperand attr_chan = MI->getOperand(3);
20810 +  MachineOperand attr = MI->getOperand(4);
20811 +  MachineOperand params = MI->getOperand(5);
20812 +  // Copy the params operand into a virtual register of the M0 class.
20813 +  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0)
20814 +          .addOperand(params);
20815 +  // tmp = V_INTERP_P1_F32(i, attr_chan, attr, M0)
20816 +  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P1_F32), tmp)
20817 +          .addOperand(iReg)
20818 +          .addOperand(attr_chan)
20819 +          .addOperand(attr)
20820 +          .addReg(M0);
20821 +  // dst = V_INTERP_P2_F32(tmp, j, attr_chan, attr, M0)
20822 +  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P2_F32))
20823 +          .addOperand(dst)
20824 +          .addReg(tmp)
20825 +          .addOperand(jReg)
20826 +          .addOperand(attr_chan)
20827 +          .addOperand(attr)
20828 +          .addReg(M0);
20829 +
20830 +  MI->eraseFromParent();
20831 +}
20832 +
20833 +void SITargetLowering::LowerSI_INTERP_CONST(MachineInstr *MI,
20834 +    MachineBasicBlock &BB, MachineBasicBlock::iterator I,
20835 +    MachineRegisterInfo &MRI) const {
20836 +  MachineOperand dst = MI->getOperand(0);
20837 +  MachineOperand attr_chan = MI->getOperand(1);
20838 +  MachineOperand attr = MI->getOperand(2);
20839 +  MachineOperand params = MI->getOperand(3);
20840 +  unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass);
20841 +  // Copy the params operand into a virtual register of the M0 class.
20842 +  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0)
20843 +          .addOperand(params);
20844 +  // dst = V_INTERP_MOV_F32(attr_chan, attr, M0)
20845 +  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_MOV_F32))
20846 +          .addOperand(dst)
20847 +          .addOperand(attr_chan)
20848 +          .addOperand(attr)
20849 +          .addReg(M0);
20850 +
20851 +  MI->eraseFromParent();
20852 +}
20853 +
20854 +void SITargetLowering::LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB,
20855 +    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
20856 +  unsigned VCC = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
20857 +  // VCC = V_CMP_GT_F32(SREG_LIT_0, src0); SREG_LIT_0 presumably reads as 0.
20858 +  BuildMI(BB, I, BB.findDebugLoc(I),
20859 +          TII->get(AMDGPU::V_CMP_GT_F32_e32),
20860 +          VCC)
20861 +          .addReg(AMDGPU::SREG_LIT_0)
20862 +          .addOperand(MI->getOperand(1));
20863 +  // dst = V_CNDMASK_B32(src2, src1, VCC); note src2 is passed before src1.
20864 +  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CNDMASK_B32_e32))
20865 +          .addOperand(MI->getOperand(0))
20866 +          .addOperand(MI->getOperand(3))
20867 +          .addOperand(MI->getOperand(2))
20868 +          .addReg(VCC);
20869 +
20870 +  MI->eraseFromParent();
20871 +}
20872 +
20873 +EVT SITargetLowering::getSetCCResultType(EVT VT) const {
20874 +  return MVT::i1; // VT is ignored: the result type is i1 for every input
20875 +}
20876 +
20877 +//===----------------------------------------------------------------------===//
20878 +// Custom DAG Lowering Operations
20879 +//===----------------------------------------------------------------------===//
20880 +
20881 +SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
20882 +  switch (Op.getOpcode()) {
20883 +  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
20884 +  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
20885 +  case ISD::LOAD: return LowerLOAD(Op, DAG);
20886 +  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
20887 +  case ISD::AND: return Loweri1ContextSwitch(Op, DAG, ISD::AND);
20888 +  case ISD::INTRINSIC_WO_CHAIN: { // operand 0 carries the intrinsic ID
20889 +    unsigned IntrinsicID =
20890 +                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20891 +    EVT VT = Op.getValueType();
20892 +    switch (IntrinsicID) {
20893 +    case AMDGPUIntrinsic::SI_vs_load_buffer_index:
20894 +      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
20895 +                                  AMDGPU::VGPR0, VT);
20896 +    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
20897 +    }
20898 +    break; // not reached: both inner cases return
20899 +  }
20900 +  }
20901 +  return SDValue();
20902 +}
20903 +
20904 +/// \brief The function is for lowering i1 operations on the
20905 +/// VCC register.
20906 +///
20907 +/// In the VALU context, VCC is a one bit register, but in the
20908 +/// SALU context the VCC is a 64-bit register (1-bit per thread).  Since only
20909 +/// the SALU can perform operations on the VCC register, we need to promote
20910 +/// the operand types from i1 to i64 in order for tablegen to be able to match
20911 +/// this operation to the correct SALU instruction.  We do this promotion by
20912 +/// wrapping each operand in a SIISD::VCC_BITCAST node; the i64 result of
20913 +/// \p VCCNode is converted back to i1 with another SIISD::VCC_BITCAST.
20914 +SDValue SITargetLowering::Loweri1ContextSwitch(SDValue Op,
20915 +                                               SelectionDAG &DAG,
20916 +                                               unsigned VCCNode) const {
20917 +  DebugLoc DL = Op.getDebugLoc();
20918 +
20919 +  SDValue OpNode = DAG.getNode(VCCNode, DL, MVT::i64,
20920 +                               DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64,
20921 +                                           Op.getOperand(0)),
20922 +                               DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64,
20923 +                                           Op.getOperand(1)));
20924 +
20925 +  return DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i1, OpNode);
20926 +}
20927 +
20928 +/// \brief Helper for LowerBRCOND: first user of \p Value with \p Opcode, or 0.
20929 +static SDNode *findUser(SDValue Value, unsigned Opcode) {
20930 +
20931 +  SDNode *Parent = Value.getNode();
20932 +  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
20933 +       I != E; ++I) {
20934 +    // Only uses of exactly Value count; other uses of Parent are skipped.
20935 +    if (I.getUse().get() != Value)
20936 +      continue;
20937 +
20938 +    if (I->getOpcode() == Opcode)
20939 +      return *I;
20940 +  }
20941 +  return 0;
20942 +}
20943 +
20944 +/// Rewrites the control flow intrinsic feeding \p BRCOND so that the branch
20945 +/// destination becomes its last operand; swaps targets with BR when needed.
20946 +SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
20947 +                                      SelectionDAG &DAG) const {
20948 +
20949 +  DebugLoc DL = BRCOND.getDebugLoc();
20950 +
20951 +  SDNode *Intr = BRCOND.getOperand(1).getNode();
20952 +  SDValue Target = BRCOND.getOperand(2);
20953 +  SDNode *BR = 0;
20954 +
20955 +  if (Intr->getOpcode() == ISD::SETCC) {
20956 +    // As long as we negate the condition everything is fine
20957 +    SDNode *SetCC = Intr;
20958 +    assert(SetCC->getConstantOperandVal(1) == 1);
20959 +    assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
20960 +           ISD::SETNE);
20961 +    Intr = SetCC->getOperand(0).getNode();
20962 +
20963 +  } else {
20964 +    // Get the target from BR if we don't negate the condition
20965 +    BR = findUser(BRCOND, ISD::BR);
20966 +    assert(BR && "BRCOND is expected to be followed by a BR user");
20967 +    Target = BR->getOperand(1);
20968 +  }
20969 +
20970 +  assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
20971 +
20972 +  // Result types of the new call: every value of Intr except the first
20973 +  SmallVector<EVT, 4> Res;
20974 +  for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
20975 +    Res.push_back(Intr->getValueType(i));
20976 +
20977 +  // operands of the new intrinsic call
20978 +  SmallVector<SDValue, 4> Ops;
20979 +  Ops.push_back(BRCOND.getOperand(0));
20980 +  for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
20981 +    Ops.push_back(Intr->getOperand(i));
20982 +  Ops.push_back(Target);
20983 +
20984 +  // build the new intrinsic call
20985 +  SDNode *Result = DAG.getNode(
20986 +    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
20987 +    DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode();
20988 +
20989 +  if (BR) {
20990 +    // Give the branch instruction our target
20991 +    SDValue BROps[] = {
20992 +      BR->getOperand(0),
20993 +      BRCOND.getOperand(2)
20994 +    };
20995 +    DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), BROps, 2);
20996 +  }
20997 +
20998 +  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
20999 +
21000 +  // Copy the intrinsic results to registers
21001 +  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
21002 +    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
21003 +    if (!CopyToReg)
21004 +      continue;
21005 +
21006 +    Chain = DAG.getCopyToReg(
21007 +      Chain, DL,
21008 +      CopyToReg->getOperand(1),
21009 +      SDValue(Result, i - 1),
21010 +      SDValue());
21011 +
21012 +    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
21013 +  }
21014 +
21015 +  // Remove the old intrinsic from the chain
21016 +  DAG.ReplaceAllUsesOfValueWith(
21017 +    SDValue(Intr, Intr->getNumValues() - 1),
21018 +    Intr->getOperand(0));
21019 +
21020 +  return Chain;
21021 +}
21022 +
21023 +/// Replaces loads from the USER_SGPR address space with live-in SGPRs; an
21024 +/// empty SDValue is returned in every case, including for all other loads.
21025 +SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
21026 +  EVT VT = Op.getValueType();
21027 +  LoadSDNode *Ptr = cast<LoadSDNode>(Op);
21028 +  unsigned AddrSpace = Ptr->getPointerInfo().getAddrSpace();
21029 +
21030 +  // We only need to lower USER_SGPR address space loads
21031 +  if (AddrSpace != AMDGPUAS::USER_SGPR_ADDRESS) {
21032 +    return SDValue();
21033 +  }
21034 +
21035 +  // Loads from the USER_SGPR address space can only have constant value
21036 +  // pointers.
21037 +  ConstantSDNode *BasePtr = cast<ConstantSDNode>(Ptr->getBasePtr());
21038 +
21039 +  unsigned TypeDwordWidth = VT.getSizeInBits() / 32;
21040 +  const TargetRegisterClass * dstClass;
21041 +  switch (TypeDwordWidth) {
21042 +    default:
21043 +      assert(!"USER_SGPR value size not implemented");
21044 +      return SDValue();
21045 +    case 1:
21046 +      dstClass = &AMDGPU::SReg_32RegClass;
21047 +      break;
21048 +    case 2:
21049 +      dstClass = &AMDGPU::SReg_64RegClass;
21050 +      break;
21051 +  }
21052 +  // Presumably the pointer counts dwords; scale it to an index into dstClass.
21053 +  uint64_t Index = BasePtr->getZExtValue();
21054 +  assert(Index % TypeDwordWidth == 0 && "USER_SGPR not properly aligned");
21055 +  unsigned SGPRIndex = Index / TypeDwordWidth;
21056 +  unsigned Reg = dstClass->getRegister(SGPRIndex);
21057 +
21058 +  // Uses of the load are redirected here, so no replacement value is returned.
21059 +  DAG.ReplaceAllUsesOfValueWith(Op, CreateLiveInRegister(DAG, dstClass, Reg,
21060 +                                                         VT));
21061 +  return SDValue();
21062 +}
21063 +
21064 +SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
21065 +  SDValue LHS = Op.getOperand(0);
21066 +  SDValue RHS = Op.getOperand(1);
21067 +  SDValue True = Op.getOperand(2);
21068 +  SDValue False = Op.getOperand(3);
21069 +  SDValue CC = Op.getOperand(4);
21070 +  EVT VT = Op.getValueType();
21071 +  DebugLoc DL = Op.getDebugLoc();
21072 +
21073 +  // Possible Min/Max pattern
21074 +  SDValue MinMax = LowerMinMax(Op, DAG);
21075 +  if (MinMax.getNode()) {
21076 +    return MinMax;
21077 +  }
21078 +  // Otherwise split the node into an i1 SETCC feeding a SELECT.
21079 +  SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
21080 +  return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
21081 +}
21082 +
21083 +//===----------------------------------------------------------------------===//
21084 +// Custom DAG optimizations
21085 +//===----------------------------------------------------------------------===//
21086 +
21087 +SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
21088 +                                            DAGCombinerInfo &DCI) const {
21089 +  SelectionDAG &DAG = DCI.DAG;
21090 +  DebugLoc DL = N->getDebugLoc();
21091 +  EVT VT = N->getValueType(0);
21092 +  // Returns the replacement for N, or an empty SDValue if nothing matched.
21093 +  switch (N->getOpcode()) {
21094 +    default: break;
21095 +    case ISD::SELECT_CC: {
21096 +      // Operands: 0 = lhs, 1 = rhs, 2 = true value, 3 = false value, 4 = cc.
21097 +      ConstantSDNode *True, *False;
21098 +      // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
21099 +      if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
21100 +          && (False = dyn_cast<ConstantSDNode>(N->getOperand(3)))
21101 +          && True->isAllOnesValue()
21102 +          && False->isNullValue()
21103 +          && VT == MVT::i1) {
21104 +        return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0),
21105 +                           N->getOperand(1), N->getOperand(4));
21106 +
21107 +      }
21108 +      break;
21109 +    }
21110 +    case ISD::SETCC: {
21111 +      SDValue Arg0 = N->getOperand(0);
21112 +      SDValue Arg1 = N->getOperand(1);
21113 +      SDValue CC = N->getOperand(2);
21114 +      ConstantSDNode * C = NULL;
21115 +      // cast<> (checked in asserts builds): the result is dereferenced at once.
21116 +      ISD::CondCode CCOp = cast<CondCodeSDNode>(CC)->get();
21117 +      // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
21118 +      if (VT == MVT::i1
21119 +          && Arg0.getOpcode() == ISD::SIGN_EXTEND
21120 +          && Arg0.getOperand(0).getValueType() == MVT::i1
21121 +          && (C = dyn_cast<ConstantSDNode>(Arg1))
21122 +          && C->isNullValue()
21123 +          && CCOp == ISD::SETNE) {
21124 +        return SimplifySetCC(VT, Arg0.getOperand(0),
21125 +                             DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
21126 +      }
21127 +      break;
21128 +    }
21129 +  }
21130 +  return SDValue();
21131 +}
21132 +
21133 +#define NODE_NAME_CASE(node) case SIISD::node: return #node;
21134 +/// \returns the name of an SIISD node, deferring to the base class otherwise.
21135 +const char* SITargetLowering::getTargetNodeName(unsigned Opcode) const {
21136 +  switch (Opcode) {
21137 +  NODE_NAME_CASE(VCC_AND)
21138 +  NODE_NAME_CASE(VCC_BITCAST)
21139 +  }
21140 +  return AMDGPUTargetLowering::getTargetNodeName(Opcode);
21141 +}
21142 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIISelLowering.h llvm-r600/lib/Target/R600/SIISelLowering.h
21143 --- llvm-3.2.src/lib/Target/R600/SIISelLowering.h       1970-01-01 01:00:00.000000000 +0100
21144 +++ llvm-r600/lib/Target/R600/SIISelLowering.h  2013-01-25 19:43:57.473383054 +0100
21145 @@ -0,0 +1,55 @@
21146 +//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===//
21147 +//
21148 +//                     The LLVM Compiler Infrastructure
21149 +//
21150 +// This file is distributed under the University of Illinois Open Source
21151 +// License. See LICENSE.TXT for details.
21152 +//
21153 +//===----------------------------------------------------------------------===//
21154 +//
21155 +/// \file
21156 +/// \brief SI DAG Lowering interface definition
21157 +//
21158 +//===----------------------------------------------------------------------===//
21159 +
21160 +#ifndef SIISELLOWERING_H
21161 +#define SIISELLOWERING_H
21162 +
21163 +#include "AMDGPUISelLowering.h"
21164 +#include "SIInstrInfo.h"
21165 +
21166 +namespace llvm {
21167 +
21168 +class SITargetLowering : public AMDGPUTargetLowering {
21169 +  const SIInstrInfo * TII;
21170 +  // Private helpers that take a MachineInstr, its block and a position I.
21171 +  void LowerMOV_IMM(MachineInstr *MI, MachineBasicBlock &BB,
21172 +              MachineBasicBlock::iterator I, unsigned Opcode) const;
21173 +  void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
21174 +              MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
21175 +  void LowerSI_INTERP_CONST(MachineInstr *MI, MachineBasicBlock &BB,
21176 +              MachineBasicBlock::iterator I, MachineRegisterInfo &MRI) const;
21177 +  void LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
21178 +              MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
21179 +  void LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB,
21180 +              MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
21181 +  // Private SelectionDAG helpers; each one returns an SDValue.
21182 +  SDValue Loweri1ContextSwitch(SDValue Op, SelectionDAG &DAG,
21183 +                                           unsigned VCCNode) const;
21184 +  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
21185 +  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
21186 +  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
21187 +  // Public interface: the constructor and the virtual lowering hooks.
21188 +public:
21189 +  SITargetLowering(TargetMachine &tm);
21190 +  virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
21191 +                                              MachineBasicBlock * BB) const;
21192 +  virtual EVT getSetCCResultType(EVT VT) const;
21193 +  virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
21194 +  virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
21195 +  virtual const char* getTargetNodeName(unsigned Opcode) const;
21196 +};
21197 +
21198 +} // End namespace llvm
21199 +
21200 +#endif //SIISELLOWERING_H
21201 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SILowerControlFlow.cpp llvm-r600/lib/Target/R600/SILowerControlFlow.cpp
21202 --- llvm-3.2.src/lib/Target/R600/SILowerControlFlow.cpp 1970-01-01 01:00:00.000000000 +0100
21203 +++ llvm-r600/lib/Target/R600/SILowerControlFlow.cpp    2013-01-25 19:43:57.480049720 +0100
21204 @@ -0,0 +1,372 @@
21205 +//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
21206 +//
21207 +//                     The LLVM Compiler Infrastructure
21208 +//
21209 +// This file is distributed under the University of Illinois Open Source
21210 +// License. See LICENSE.TXT for details.
21211 +//
21212 +//===----------------------------------------------------------------------===//
21213 +//
21214 +/// \file
21215 +/// \brief This pass lowers the pseudo control flow instructions to real
21216 +/// machine instructions.
21217 +///
21218 +/// All control flow is handled using predicated instructions and
21219 +/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
21220 +/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
21221 +/// by writing to the 64-bit EXEC register (each bit corresponds to a
21222 +/// single vector ALU).  Typically, for predicates, a vector ALU will write
21223 +/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
21224 +/// Vector ALU) and then the ScalarALU will AND the VCC register with the
21225 +/// EXEC to update the predicates.
21226 +///
21227 +/// For example:
21228 +/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
21229 +/// %SGPR0 = SI_IF %VCC
21230 +///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
21231 +/// %SGPR0 = SI_ELSE %SGPR0
21232 +///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
21233 +/// SI_END_CF %SGPR0
21234 +///
21235 +/// becomes:
21236 +///
21237 +/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
21238 +/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
21239 +/// S_CBRANCH_EXECZ label0            // This instruction is an optional
21240 +///                                   // optimization which allows us to
21241 +///                                   // branch if all the bits of
21242 +///                                   // EXEC are zero.
21243 +/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
21244 +///
21245 +/// label0:
21246 +/// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0  // Restore the exec mask for the Else block
21247 +/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
21248 +/// S_CBRANCH_EXECZ label1             // Use our branch optimization
21249 +///                                    // instruction again.
21250 +/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0  // Do the ELSE block
21251 +/// label1:
21252 +/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
21253 +//===----------------------------------------------------------------------===//
21254 +
21255 +#include "AMDGPU.h"
21256 +#include "SIInstrInfo.h"
21257 +#include "SIMachineFunctionInfo.h"
21258 +#include "llvm/CodeGen/MachineFunction.h"
21259 +#include "llvm/CodeGen/MachineFunctionPass.h"
21260 +#include "llvm/CodeGen/MachineInstrBuilder.h"
21261 +#include "llvm/CodeGen/MachineRegisterInfo.h"
21262 +
21263 +using namespace llvm;
21264 +
21265 +namespace {
21266 +
21267 +class SILowerControlFlowPass : public MachineFunctionPass {
21268 +  // Lowers the SI control-flow pseudos; see runOnMachineFunction().
21269 +private:
21270 +  static const unsigned SkipThreshold = 12; // shouldSkip() instruction limit
21271 +
21272 +  static char ID;
21273 +  const TargetInstrInfo *TII;
21274 +  // Helpers that decide on and emit the optional skip branches.
21275 +  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
21276 +
21277 +  void Skip(MachineInstr &From, MachineOperand &To);
21278 +  void SkipIfDead(MachineInstr &MI);
21279 +  // One lowering routine per control-flow pseudo instruction.
21280 +  void If(MachineInstr &MI);
21281 +  void Else(MachineInstr &MI);
21282 +  void Break(MachineInstr &MI);
21283 +  void IfBreak(MachineInstr &MI);
21284 +  void ElseBreak(MachineInstr &MI);
21285 +  void Loop(MachineInstr &MI);
21286 +  void EndCf(MachineInstr &MI);
21287 +  // Lowering of SI_KILL and handling of S_BRANCH.
21288 +  void Kill(MachineInstr &MI);
21289 +  void Branch(MachineInstr &MI);
21290 +
21291 +public:
21292 +  SILowerControlFlowPass(TargetMachine &tm) :
21293 +    MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
21294 +
21295 +  virtual bool runOnMachineFunction(MachineFunction &MF);
21296 +
21297 +  const char *getPassName() const {
21298 +    return "SI Lower control flow instructions";
21299 +  }
21300 +
21301 +};
21302 +
21303 +} // End anonymous namespace
21304 +
21305 +char SILowerControlFlowPass::ID = 0;
21306 +// Factory for the pass: returns a new heap-allocated instance built from tm.
21307 +FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
21308 +  return new SILowerControlFlowPass(tm);
21309 +}
21310 +
21311 +bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
21312 +                                        MachineBasicBlock *To) {
21313 +  // Follow the first-successor chain from From towards To and count
21314 +  // instructions; the answer is true once SkipThreshold have been seen.
21315 +  unsigned Seen = 0;
21316 +  MachineBasicBlock *Cur = From;
21317 +  while (Cur != To && !Cur->succ_empty()) {
21318 +    MachineBasicBlock::iterator It = Cur->begin(), End = Cur->end();
21319 +    for (; Seen < SkipThreshold && It != End; ++It) {
21320 +      // Only bundle headers and instructions that are not bundled count.
21321 +      if (!It->isBundle() && It->isBundled())
21322 +        continue;
21323 +      if (++Seen >= SkipThreshold)
21324 +        return true;
21325 +    }
21326 +    Cur = *Cur->succ_begin();
21327 +  }
21328 +  return false;
21329 +}
21330 +
21331 +void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
21332 +  // S_CBRANCH_EXECZ <To>, EXEC goes in front of From if shouldSkip() agrees.
21333 +  MachineBasicBlock &Parent = *From.getParent();
21334 +  if (shouldSkip(*Parent.succ_begin(), To.getMBB())) {
21335 +    BuildMI(Parent, &From, From.getDebugLoc(),
21336 +            TII->get(AMDGPU::S_CBRANCH_EXECZ))
21337 +            .addOperand(To)
21338 +            .addReg(AMDGPU::EXEC);
21339 +  }
21340 +}
21341 +
21342 +void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
21343 +  // Emits, right after MI, code that ends the wavefront when EXEC is zero.
21344 +  MachineBasicBlock &MBB = *MI.getParent();
21345 +  DebugLoc DL = MI.getDebugLoc();
21346 +  // Nothing is emitted unless shouldSkip() reports enough code up to the end.
21347 +  if (!shouldSkip(&MBB, &MBB.getParent()->back()))
21348 +    return;
21349 +  // The new instructions are inserted after MI, not in front of it.
21350 +  MachineBasicBlock::iterator Insert = &MI;
21351 +  ++Insert;
21352 +
21353 +  // If the exec mask is non-zero, skip the next two instructions
21354 +  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
21355 +          .addImm(3) // NOTE(review): 3 vs. "two instructions" - confirm unit
21356 +          .addReg(AMDGPU::EXEC);
21357 +
21358 +  // Exec mask is zero: Export to NULL target...
21359 +  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
21360 +          .addImm(0)
21361 +          .addImm(0x09) // V_008DFC_SQ_EXP_NULL
21362 +          .addImm(0)
21363 +          .addImm(1)
21364 +          .addImm(1)
21365 +          .addReg(AMDGPU::SREG_LIT_0)
21366 +          .addReg(AMDGPU::SREG_LIT_0)
21367 +          .addReg(AMDGPU::SREG_LIT_0)
21368 +          .addReg(AMDGPU::SREG_LIT_0);
21369 +
21370 +  // ... and terminate wavefront
21371 +  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
21372 +}
21373 +
21374 +void SILowerControlFlowPass::If(MachineInstr &MI) {
21375 +  // SI_IF operands: 0 = result register, 1 = condition mask, 2 = skip target.
21376 +  const unsigned Dst = MI.getOperand(0).getReg();
21377 +  const unsigned Cond = MI.getOperand(1).getReg();
21378 +  MachineBasicBlock &Block = *MI.getParent();
21379 +  const DebugLoc Loc = MI.getDebugLoc();
21380 +  // Dst = S_AND_SAVEEXEC_B64 Cond
21381 +  BuildMI(Block, &MI, Loc, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Dst)
21382 +          .addReg(Cond);
21383 +  // Dst = S_XOR_B64 EXEC, Dst
21384 +  BuildMI(Block, &MI, Loc, TII->get(AMDGPU::S_XOR_B64), Dst)
21385 +          .addReg(AMDGPU::EXEC)
21386 +          .addReg(Dst);
21387 +  // Possibly branch over the guarded code, then retire the pseudo.
21388 +  Skip(MI, MI.getOperand(2));
21389 +  MI.eraseFromParent();
21390 +}
21391 +
21392 +void SILowerControlFlowPass::Else(MachineInstr &MI) {
21393 +  // SI_ELSE operands: 0 = destination, 1 = saved EXEC, 2 = skip target.
21394 +  const unsigned Dst = MI.getOperand(0).getReg();
21395 +  const unsigned Saved = MI.getOperand(1).getReg();
21396 +  MachineBasicBlock &Block = *MI.getParent();
21397 +  const DebugLoc Loc = MI.getDebugLoc();
21398 +  // Dst = S_OR_SAVEEXEC_B64 Saved
21399 +  BuildMI(Block, &MI, Loc, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
21400 +          .addReg(Saved);
21401 +  // EXEC = S_XOR_B64 EXEC, Dst
21402 +  BuildMI(Block, &MI, Loc, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
21403 +          .addReg(AMDGPU::EXEC)
21404 +          .addReg(Dst);
21405 +  // Possibly branch over the guarded code, then retire the pseudo.
21406 +  Skip(MI, MI.getOperand(2));
21407 +  MI.eraseFromParent();
21408 +}
21409 +
21410 +void SILowerControlFlowPass::Break(MachineInstr &MI) {
21411 +  // SI_BREAK operands: 0 = destination register, 1 = source register.
21412 +  MachineOperand &DstOp = MI.getOperand(0);
21413 +  MachineOperand &SrcOp = MI.getOperand(1);
21414 +
21415 +  // Dst = S_OR_B64 EXEC, Src -- emitted immediately before the pseudo.
21416 +  BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(),
21417 +          TII->get(AMDGPU::S_OR_B64), DstOp.getReg())
21418 +          .addReg(AMDGPU::EXEC)
21419 +          .addReg(SrcOp.getReg());
21420 +
21421 +  MI.eraseFromParent();
21422 +}
21423 +
21424 +void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
21425 +  // SI_IF_BREAK operands: 0 = destination, 1 = condition mask, 2 = source.
21426 +  const unsigned Out = MI.getOperand(0).getReg();
21427 +  const unsigned Cond = MI.getOperand(1).getReg();
21428 +  const unsigned In = MI.getOperand(2).getReg();
21429 +  MachineBasicBlock &Parent = *MI.getParent();
21430 +
21431 +  // Out = S_OR_B64 Cond, In
21432 +  BuildMI(Parent, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_OR_B64), Out)
21433 +          .addReg(Cond)
21434 +          .addReg(In);
21435 +
21436 +  MI.eraseFromParent();
21437 +}
21438 +
21439 +void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
21440 +  // SI_ELSE_BREAK operands: 0 = destination, 1 = saved mask, 2 = source.
21441 +  const unsigned Out = MI.getOperand(0).getReg();
21442 +  const unsigned SavedMask = MI.getOperand(1).getReg();
21443 +  const unsigned In = MI.getOperand(2).getReg();
21444 +  MachineBasicBlock &Parent = *MI.getParent();
21445 +
21446 +  // Out = S_OR_B64 SavedMask, In
21447 +  BuildMI(Parent, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_OR_B64), Out)
21448 +          .addReg(SavedMask)
21449 +          .addReg(In);
21450 +
21451 +  MI.eraseFromParent();
21452 +}
21453 +
21454 +void SILowerControlFlowPass::Loop(MachineInstr &MI) {
21455 +  // SI_LOOP operands: 0 = mask register, 1 = branch target.
21456 +  MachineBasicBlock &Parent = *MI.getParent();
21457 +  const DebugLoc Loc = MI.getDebugLoc();
21458 +  const unsigned Mask = MI.getOperand(0).getReg();
21459 +  // EXEC = S_ANDN2_B64 EXEC, Mask
21460 +  BuildMI(Parent, &MI, Loc, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
21461 +          .addReg(AMDGPU::EXEC)
21462 +          .addReg(Mask);
21463 +  // S_CBRANCH_EXECNZ <target>, EXEC
21464 +  BuildMI(Parent, &MI, Loc, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
21465 +          .addOperand(MI.getOperand(1))
21466 +          .addReg(AMDGPU::EXEC);
21467 +  MI.eraseFromParent();
21468 +}
21469 +
21470 +void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
21471 +  // SI_END_CF operand 0 is the mask register OR-ed back into EXEC.
21472 +  MachineBasicBlock &Parent = *MI.getParent();
21473 +  const unsigned Mask = MI.getOperand(0).getReg();
21474 +
21475 +  // EXEC = S_OR_B64 EXEC, Mask, inserted at the block's first non-PHI slot.
21476 +  BuildMI(Parent, Parent.getFirstNonPHI(), MI.getDebugLoc(),
21477 +          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
21478 +          .addReg(AMDGPU::EXEC)
21479 +          .addReg(Mask);
21480 +  MI.eraseFromParent();
21481 +}
21482 +
21483 +void SILowerControlFlowPass::Branch(MachineInstr &MI) {
21484 +  // An S_BRANCH to the next block in layout is dropped; others are unhandled.
21485 +  MachineBasicBlock *Target = MI.getOperand(0).getMBB();
21486 +  if (Target == MI.getParent()->getNextNode())
21487 +    MI.eraseFromParent();
21488 +  else
21489 +    assert(0 && "S_BRANCH to a non-fallthrough block is not supported");
21490 +}
21491 +
21492 +void SILowerControlFlowPass::Kill(MachineInstr &MI) {
21493 +  // Replaces SI_KILL with V_CMPX_LE_F32_e32 of SREG_LIT_0 against operand 0.
21494 +  MachineBasicBlock &MBB = *MI.getParent();
21495 +  DebugLoc DL = MI.getDebugLoc();
21496 +
21497 +  // Kill is only allowed in pixel shaders
21498 +  MachineFunction &MF = *MBB.getParent();
21499 +  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
21500 +  assert(Info->ShaderType == ShaderType::PIXEL && "kill in non-pixel shader");
21501 +  (void)Info; // Only read by the assert; avoids an unused warning with NDEBUG.
21502 +  // Clear this pixel from the exec mask if the operand is negative
21503 +  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
21504 +          .addReg(AMDGPU::SREG_LIT_0)
21505 +          .addOperand(MI.getOperand(0));
21506 +
21507 +  MI.eraseFromParent();
21508 +}
21509 +
21510 +bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
21511 +  // Replaces every control-flow pseudo in MF; always reports a change.
21512 +  bool HaveKill = false; // SI_KILL seen at Depth > 0, skip code still owed
21513 +  unsigned Depth = 0;    // SI_IF/SI_LOOP seen so far minus SI_END_CF seen
21514 +
21515 +  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
21516 +       BI != BE; ++BI) {
21517 +    // Next starts equal to I so end() of an empty block is never advanced.
21518 +    MachineBasicBlock &MBB = *BI;
21519 +    for (MachineBasicBlock::iterator I = MBB.begin(), Next = I;
21520 +         I != MBB.end(); I = Next) {
21521 +      // Fetch the successor first: the handlers below erase MI.
21522 +      Next = llvm::next(I);
21523 +      MachineInstr &MI = *I;
21524 +      switch (MI.getOpcode()) {
21525 +        default: break;
21526 +        case AMDGPU::SI_IF:
21527 +          ++Depth;
21528 +          If(MI);
21529 +          break;
21530 +
21531 +        case AMDGPU::SI_ELSE:
21532 +          Else(MI);
21533 +          break;
21534 +
21535 +        case AMDGPU::SI_BREAK:
21536 +          Break(MI);
21537 +          break;
21538 +
21539 +        case AMDGPU::SI_IF_BREAK:
21540 +          IfBreak(MI);
21541 +          break;
21542 +
21543 +        case AMDGPU::SI_ELSE_BREAK:
21544 +          ElseBreak(MI);
21545 +          break;
21546 +
21547 +        case AMDGPU::SI_LOOP:
21548 +          ++Depth;
21549 +          Loop(MI);
21550 +          break;
21551 +
21552 +        case AMDGPU::SI_END_CF:
21553 +          if (--Depth == 0 && HaveKill) {
21554 +            SkipIfDead(MI);
21555 +            HaveKill = false;
21556 +          }
21557 +          EndCf(MI);
21558 +          break;
21559 +
21560 +        case AMDGPU::SI_KILL:
21561 +          if (Depth == 0)
21562 +            SkipIfDead(MI);
21563 +          else
21564 +            HaveKill = true;
21565 +          Kill(MI);
21566 +          break;
21567 +
21568 +        case AMDGPU::S_BRANCH:
21569 +          Branch(MI);
21570 +          break;
21571 +      }
21572 +    }
21573 +  }
21574 +
21575 +  return true;
21576 +}
21577 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SILowerLiteralConstants.cpp llvm-r600/lib/Target/R600/SILowerLiteralConstants.cpp
21578 --- llvm-3.2.src/lib/Target/R600/SILowerLiteralConstants.cpp    1970-01-01 01:00:00.000000000 +0100
21579 +++ llvm-r600/lib/Target/R600/SILowerLiteralConstants.cpp       2013-01-25 19:43:57.480049720 +0100
21580 @@ -0,0 +1,108 @@
21581 +//===-- SILowerLiteralConstants.cpp - Lower intrs using literal constants--===//
21582 +//
21583 +//                     The LLVM Compiler Infrastructure
21584 +//
21585 +// This file is distributed under the University of Illinois Open Source
21586 +// License. See LICENSE.TXT for details.
21587 +//
21588 +//===----------------------------------------------------------------------===//
21589 +//
21590 +/// \file
21591 +/// \brief This pass performs the following transformation on instructions with
21592 +/// literal constants:
21593 +///
21594 +/// %VGPR0 = V_MOV_IMM_I32 1
21595 +///
21596 +/// becomes:
21597 +///
21598 +/// BUNDLE
21599 +///   * %VGPR0 = V_MOV_B32_e32 SI_LITERAL_CONSTANT
21600 +///   * SI_LOAD_LITERAL 1
21601 +///
21602 +/// The resulting sequence matches exactly how the hardware handles immediate
21603 +/// operands, so this transformation greatly simplifies the code generator.
21604 +///
21605 +/// Only the *_MOV_IMM_* instructions support immediate operands at the
21606 +/// moment, but when support for immediate operands is added to other
21607 +/// instructions, they will be lowered here as well.
21608 +//===----------------------------------------------------------------------===//
21609 +
21610 +#include "AMDGPU.h"
21611 +#include "llvm/CodeGen/MachineFunction.h"
21612 +#include "llvm/CodeGen/MachineFunctionPass.h"
21613 +#include "llvm/CodeGen/MachineInstrBuilder.h"
21614 +#include "llvm/CodeGen/MachineInstrBundle.h"
21615 +
21616 +using namespace llvm;
21617 +
21618 +namespace {
21619 +
21620 +class SILowerLiteralConstantsPass : public MachineFunctionPass {
21621 +  // Pass that rewrites the *_MOV_IMM_* pseudos; see runOnMachineFunction().
21622 +private:
21623 +  static char ID;
21624 +  const TargetInstrInfo *TII;
21625 +  // TII is taken from the target machine when the pass is constructed.
21626 +public:
21627 +  SILowerLiteralConstantsPass(TargetMachine &tm) :
21628 +    MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
21629 +
21630 +  virtual bool runOnMachineFunction(MachineFunction &MF);
21631 +
21632 +  const char *getPassName() const {
21633 +    return "SI Lower literal constants pass";
21634 +  }
21635 +};
21636 +
21637 +} // End anonymous namespace
21638 +
21639 +char SILowerLiteralConstantsPass::ID = 0;
21640 +// Factory for the pass: returns a new heap-allocated instance built from tm.
21641 +FunctionPass *llvm::createSILowerLiteralConstantsPass(TargetMachine &tm) {
21642 +  return new SILowerLiteralConstantsPass(tm);
21643 +}
21644 +
21645 +bool SILowerLiteralConstantsPass::runOnMachineFunction(MachineFunction &MF) {
21646 +  bool Changed = false; // Becomes true once any pseudo has been rewritten.
21647 +  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
21648 +                                                  BB != BB_E; ++BB) {
21649 +    MachineBasicBlock &MBB = *BB;
21650 +    // Next starts equal to I so end() of an empty block is never advanced;
21651 +    // it is refreshed before MI can be erased.
21652 +    for (MachineBasicBlock::iterator I = MBB.begin(), Next = I;
21653 +                               I != MBB.end(); I = Next) {
21654 +      Next = llvm::next(I);
21655 +      MachineInstr &MI = *I;
21656 +      switch (MI.getOpcode()) {
21657 +      default: break;
21658 +      case AMDGPU::S_MOV_IMM_I32:
21659 +      case AMDGPU::S_MOV_IMM_I64:
21660 +      case AMDGPU::V_MOV_IMM_F32:
21661 +      case AMDGPU::V_MOV_IMM_I32: {
21662 +          MachineOperand LiteralOp = MI.getOperand(1);
21663 +          // Destinations in VReg_32 get V_MOV_B32_e32, all others S_MOV_B32.
21664 +          unsigned MovOpcode =
21665 +            AMDGPU::VReg_32RegClass.contains(MI.getOperand(0).getReg()) ?
21666 +            AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
21667 +          // A literal operand that is not an immediate uses the F32 loader.
21668 +          unsigned LoadLiteralOpcode = LiteralOp.isImm() ?
21669 +            AMDGPU::SI_LOAD_LITERAL_I32 : AMDGPU::SI_LOAD_LITERAL_F32;
21670 +          MachineInstr *First =
21671 +            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(MovOpcode),
21672 +                    MI.getOperand(0).getReg())
21673 +                    .addReg(AMDGPU::SI_LITERAL_CONSTANT);
21674 +          MachineInstr *Last =
21675 +            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(LoadLiteralOpcode))
21676 +                    .addOperand(MI.getOperand(1));
21677 +          // Bundle the move with its literal, then drop the pseudo.
21678 +          Last->setIsInsideBundle();
21679 +          llvm::finalizeBundle(MBB, First, Last);
21680 +          MI.eraseFromParent();
21681 +          Changed = true;
21682 +          break;
21683 +        }
21684 +      }
21685 +    }
21686 +  }
21687 +  return Changed;
21688 +}
21689 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIMachineFunctionInfo.cpp llvm-r600/lib/Target/R600/SIMachineFunctionInfo.cpp
21690 --- llvm-3.2.src/lib/Target/R600/SIMachineFunctionInfo.cpp      1970-01-01 01:00:00.000000000 +0100
21691 +++ llvm-r600/lib/Target/R600/SIMachineFunctionInfo.cpp 2013-01-25 19:43:57.480049720 +0100
21692 @@ -0,0 +1,20 @@
21693 +//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===//
21694 +//
21695 +//                     The LLVM Compiler Infrastructure
21696 +//
21697 +// This file is distributed under the University of Illinois Open Source
21698 +// License. See LICENSE.TXT for details.
21699 +//
21700 +/// \file
21701 +//===----------------------------------------------------------------------===//
21702 +
21703 +
21704 +#include "SIMachineFunctionInfo.h"
21705 +
21706 +using namespace llvm;
21707 +
21708 +SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
21709 +  : MachineFunctionInfo(),   // MF itself is not read by this constructor
21710 +    SPIPSInputAddr(0),       // both public fields start out as zero
21711 +    ShaderType(0)
21712 +  { }
21713 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIMachineFunctionInfo.h llvm-r600/lib/Target/R600/SIMachineFunctionInfo.h
21714 --- llvm-3.2.src/lib/Target/R600/SIMachineFunctionInfo.h        1970-01-01 01:00:00.000000000 +0100
21715 +++ llvm-r600/lib/Target/R600/SIMachineFunctionInfo.h   2013-01-25 19:43:57.480049720 +0100
21716 @@ -0,0 +1,34 @@
21717 +//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==//
21718 +//
21719 +//                     The LLVM Compiler Infrastructure
21720 +//
21721 +// This file is distributed under the University of Illinois Open Source
21722 +// License. See LICENSE.TXT for details.
21723 +//
21724 +//===----------------------------------------------------------------------===//
21725 +//
21726 +/// \file
21727 +//
21728 +//===----------------------------------------------------------------------===//
21729 +
21730 +
21731 +#ifndef SIMACHINEFUNCTIONINFO_H_
21732 +#define SIMACHINEFUNCTIONINFO_H_
21733 +
21734 +#include "llvm/CodeGen/MachineFunction.h"
21735 +
21736 +namespace llvm {
21737 +
21738 +/// This class keeps track of the SPI_PS_INPUT_ADDR config register, which
21739 +/// tells the hardware which interpolation parameters to load.
21740 +class SIMachineFunctionInfo : public MachineFunctionInfo {
21741 +public:
21742 +  SIMachineFunctionInfo(const MachineFunction &MF);
21743 +  unsigned SPIPSInputAddr; ///< Value tracked for the register named above.
21744 +  unsigned ShaderType;     ///< Checked against ShaderType::PIXEL for SI_KILL.
21745 +};
21746 +
21747 +} // End namespace llvm
21748 +
21749 +
21750 +#endif // SIMACHINEFUNCTIONINFO_H_
21751 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIRegisterInfo.cpp llvm-r600/lib/Target/R600/SIRegisterInfo.cpp
21752 --- llvm-3.2.src/lib/Target/R600/SIRegisterInfo.cpp     1970-01-01 01:00:00.000000000 +0100
21753 +++ llvm-r600/lib/Target/R600/SIRegisterInfo.cpp        2013-01-25 19:43:57.480049720 +0100
21754 @@ -0,0 +1,48 @@
21755 +//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
21756 +//
21757 +//                     The LLVM Compiler Infrastructure
21758 +//
21759 +// This file is distributed under the University of Illinois Open Source
21760 +// License. See LICENSE.TXT for details.
21761 +//
21762 +//===----------------------------------------------------------------------===//
21763 +//
21764 +/// \file
21765 +/// \brief SI implementation of the TargetRegisterInfo class.
21766 +//
21767 +//===----------------------------------------------------------------------===//
21768 +
21769 +
21770 +#include "SIRegisterInfo.h"
21771 +#include "AMDGPUTargetMachine.h"
21772 +
21773 +using namespace llvm;
21774 +
21775 +SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm,
21776 +    const TargetInstrInfo &tii)
21777 +: AMDGPURegisterInfo(tm, tii),   // the base class receives both arguments
21778 +  TM(tm),                        // reference members bound to the arguments
21779 +  TII(tii)
21780 +  { }
21781 +
21782 +BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
21783 +  // One bit per register, none of them set: nothing is reserved.
21784 +  return BitVector(getNumRegs());
21785 +}
21786 +
21787 +const TargetRegisterClass *
21788 +SIRegisterInfo::getISARegClass(const TargetRegisterClass * rc) const {
21789 +  // GPRF32 is the only class that is remapped (to VReg_32); every other
21790 +  // class is handed back unchanged.
21791 +  if (rc->getID() == AMDGPU::GPRF32RegClassID)
21792 +    return &AMDGPU::VReg_32RegClass;
21793 +  return rc;
21794 +}
21795 +
21796 +const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
21797 +                                                                   MVT VT) const {
21798 +  // The value type does not influence the result at present: MVT::i32 and
21799 +  // every other type are all given the VReg_32 class.
21800 +  const TargetRegisterClass *RC = &AMDGPU::VReg_32RegClass;
21801 +  return RC;
21802 +}
21803 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIRegisterInfo.h llvm-r600/lib/Target/R600/SIRegisterInfo.h
21804 --- llvm-3.2.src/lib/Target/R600/SIRegisterInfo.h       1970-01-01 01:00:00.000000000 +0100
21805 +++ llvm-r600/lib/Target/R600/SIRegisterInfo.h  2013-01-25 19:43:57.483383054 +0100
21806 @@ -0,0 +1,47 @@
21807 +//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===//
21808 +//
21809 +//                     The LLVM Compiler Infrastructure
21810 +//
21811 +// This file is distributed under the University of Illinois Open Source
21812 +// License. See LICENSE.TXT for details.
21813 +//
21814 +//===----------------------------------------------------------------------===//
21815 +//
21816 +/// \file
21817 +/// \brief Interface definition for SIRegisterInfo
21818 +//
21819 +//===----------------------------------------------------------------------===//
21820 +
21821 +
21822 +#ifndef SIREGISTERINFO_H_
21823 +#define SIREGISTERINFO_H_
21824 +
21825 +#include "AMDGPURegisterInfo.h"
21826 +
21827 +namespace llvm {
21828 +
21829 +class AMDGPUTargetMachine;
21830 +class TargetInstrInfo;
21831 +
21832 +struct SIRegisterInfo : public AMDGPURegisterInfo {
21833 +  AMDGPUTargetMachine &TM;
21834 +  const TargetInstrInfo &TII;
21835 +  /// Stores \p tm and \p tii and forwards both to AMDGPURegisterInfo.
21836 +  SIRegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
21837 +  /// \returns a bit vector sized by getNumRegs() in which no bit is set.
21838 +  virtual BitVector getReservedRegs(const MachineFunction &MF) const;
21839 +
21840 +  /// \param RC is an AMDIL reg class.
21841 +  ///
21842 +  /// \returns the SI register class that is equivalent to \p RC.
21843 +  virtual const TargetRegisterClass *
21844 +    getISARegClass(const TargetRegisterClass *RC) const;
21845 +
21846 +  /// \brief get the register class of the specified type to use in the
21847 +  /// CFGStructurizer
21848 +  virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
21849 +};
21850 +
21851 +} // End namespace llvm
21852 +
21853 +#endif // SIREGISTERINFO_H_
21854 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIRegisterInfo.td llvm-r600/lib/Target/R600/SIRegisterInfo.td
21855 --- llvm-3.2.src/lib/Target/R600/SIRegisterInfo.td      1970-01-01 01:00:00.000000000 +0100
21856 +++ llvm-r600/lib/Target/R600/SIRegisterInfo.td 2013-01-25 19:43:57.483383054 +0100
21857 @@ -0,0 +1,167 @@
21858 +
21859 +let Namespace = "AMDGPU" in {
21860 +  def low : SubRegIndex;  // low/high: used by SI_64 and SGPR_64 in this file
21861 +  def high : SubRegIndex;
21862 +  // Numbered sub-register indices sub0 .. sub7.
21863 +  def sub0 : SubRegIndex;
21864 +  def sub1 : SubRegIndex;
21865 +  def sub2 : SubRegIndex;
21866 +  def sub3 : SubRegIndex;
21867 +  def sub4 : SubRegIndex;
21868 +  def sub5 : SubRegIndex;
21869 +  def sub6 : SubRegIndex;
21870 +  def sub7 : SubRegIndex;
21871 +}
21872 +
21873 +class SIReg <string n, bits<16> encoding = 0> : Register<n> {
21874 +  let Namespace = "AMDGPU";
21875 +  let HWEncoding = encoding; // 0 whenever a def passes no encoding
21876 +}
21877 +
21878 +class SI_64 <string n, list<Register> subregs, bits<16> encoding> : RegisterWithSubRegs<n, subregs> {
21879 +  let Namespace = "AMDGPU";
21880 +  let SubRegIndices = [low, high]; // pairs with the two subregs (see EXEC)
21881 +  let HWEncoding = encoding;
21882 +}
21883 +
21884 +class SGPR_32 <bits<16> num, string name> : SIReg<name, num>;
21885 +// Same shape as SGPR_32: num is forwarded as the SIReg encoding.
21886 +class VGPR_32 <bits<16> num, string name> : SIReg<name, num>;
21887 +
21888 +// Special registers; the second argument is the SIReg hardware encoding.
21889 +def VCC : SIReg<"VCC", 106>;
21890 +def EXEC_LO : SIReg <"EXEC LO", 126>;
21891 +def EXEC_HI : SIReg <"EXEC HI", 127>;
21892 +def EXEC : SI_64<"EXEC", [EXEC_LO, EXEC_HI], 126>;
21893 +def SCC : SIReg<"SCC", 253>;
21894 +def SREG_LIT_0 : SIReg <"S LIT 0", 128>;
21895 +def SI_LITERAL_CONSTANT : SIReg<"LITERAL CONSTANT", 255>;
21896 +def M0 : SIReg <"M0", 124>;
21897 +
21898 +// Interpolation registers (no encoding passed, so SIReg's default of 0 is used)
21899 +def PERSP_SAMPLE_I : SIReg <"PERSP_SAMPLE_I">;
21900 +def PERSP_SAMPLE_J : SIReg <"PERSP_SAMPLE_J">;
21901 +def PERSP_CENTER_I : SIReg <"PERSP_CENTER_I">;
21902 +def PERSP_CENTER_J : SIReg <"PERSP_CENTER_J">;
21903 +def PERSP_CENTROID_I : SIReg <"PERSP_CENTROID_I">;
21904 +def PERSP_CENTROID_J : SIReg <"PERSP_CENTROID_J">;
21905 +def PERSP_I_W : SIReg <"PERSP_I_W">;
21906 +def PERSP_J_W : SIReg <"PERSP_J_W">;
21907 +def PERSP_1_W : SIReg <"PERSP_1_W">;
21908 +def LINEAR_SAMPLE_I : SIReg <"LINEAR_SAMPLE_I">;
21909 +def LINEAR_SAMPLE_J : SIReg <"LINEAR_SAMPLE_J">;
21910 +def LINEAR_CENTER_I : SIReg <"LINEAR_CENTER_I">;
21911 +def LINEAR_CENTER_J : SIReg <"LINEAR_CENTER_J">;
21912 +def LINEAR_CENTROID_I : SIReg <"LINEAR_CENTROID_I">;
21913 +def LINEAR_CENTROID_J : SIReg <"LINEAR_CENTROID_J">;
21914 +def LINE_STIPPLE_TEX_COORD : SIReg <"LINE_STIPPLE_TEX_COORD">;
21915 +def POS_X_FLOAT : SIReg <"POS_X_FLOAT">;
21916 +def POS_Y_FLOAT : SIReg <"POS_Y_FLOAT">;
21917 +def POS_Z_FLOAT : SIReg <"POS_Z_FLOAT">;
21918 +def POS_W_FLOAT : SIReg <"POS_W_FLOAT">;
21919 +def FRONT_FACE : SIReg <"FRONT_FACE">;
21920 +def ANCILLARY : SIReg <"ANCILLARY">;
21921 +def SAMPLE_COVERAGE : SIReg <"SAMPLE_COVERAGE">;
21922 +def POS_FIXED_PT : SIReg <"POS_FIXED_PT">;
21923 +
21924 +// SGPR 32-bit registers
21925 +foreach Index = 0-101 in {
21926 +  def SGPR#Index : SGPR_32 <Index, "SGPR"#Index>;
21927 +}
21928 +
21929 +def SGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
21930 +                            (add (sequence "SGPR%u", 0, 101))>;
21931 +
21932 +// SGPR 64-bit registers
21933 +def SGPR_64 : RegisterTuples<[low, high],
21934 +                             [(add (decimate SGPR_32, 2)),
21935 +                              (add(decimate (rotl SGPR_32, 1), 2))]>;
21936 +
21937 +// SGPR 128-bit registers
21938 +def SGPR_128 : RegisterTuples<[sel_x, sel_y, sel_z, sel_w],
21939 +                              [(add (decimate SGPR_32, 4)),
21940 +                               (add (decimate (rotl SGPR_32, 1), 4)),
21941 +                               (add (decimate (rotl SGPR_32, 2), 4)),
21942 +                               (add (decimate (rotl SGPR_32, 3), 4))]>;
21943 +
21944 +// SGPR 256-bit registers
21945 +def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
21946 +                              [(add (decimate SGPR_32, 8)),
21947 +                               (add (decimate (rotl SGPR_32, 1), 8)),
21948 +                               (add (decimate (rotl SGPR_32, 2), 8)),
21949 +                               (add (decimate (rotl SGPR_32, 3), 8)),
21950 +                               (add (decimate (rotl SGPR_32, 4), 8)),
21951 +                               (add (decimate (rotl SGPR_32, 5), 8)),
21952 +                               (add (decimate (rotl SGPR_32, 6), 8)),
21953 +                               (add (decimate (rotl SGPR_32, 7), 8))]>;
21954 +
21955 +// VGPR 32-bit registers
21956 +foreach Index = 0-255 in {
21957 +  def VGPR#Index : VGPR_32 <Index, "VGPR"#Index>;
21958 +}
21959 +
21960 +def VGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
21961 +                            (add (sequence "VGPR%u", 0, 255))>;
21962 +
21963 +// VGPR 64-bit registers
21964 +def VGPR_64 : RegisterTuples<[low, high],
21965 +                             [(add VGPR_32),
21966 +                              (add (rotl VGPR_32, 1))]>;
21967 +
21968 +// VGPR 128-bit registers
21969 +def VGPR_128 : RegisterTuples<[sel_x, sel_y, sel_z, sel_w],
21970 +                              [(add VGPR_32),
21971 +                               (add (rotl VGPR_32, 1)),
21972 +                               (add (rotl VGPR_32, 2)),
21973 +                               (add (rotl VGPR_32, 3))]>;
21974 +
21975 +// Register class for all scalar registers (SGPRs + Special Registers)
21976 +def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
21977 +    (add SGPR_32,  SREG_LIT_0, M0, EXEC_LO, EXEC_HI)
21978 +>;
21979 +
21980 +def SReg_64 : RegisterClass<"AMDGPU", [i64], 64, (add SGPR_64, VCC, EXEC)>;
21981 +
21982 +def SReg_1 : RegisterClass<"AMDGPU", [i1], 1, (add VCC, SGPR_64, EXEC)>;
21983 +
21984 +def SReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add SGPR_128)>;
21985 +
21986 +def SReg_256 : RegisterClass<"AMDGPU", [v8i32], 256, (add SGPR_256)>;
21987 +
21988 +// Register class for all vector registers (VGPRs + Interpolation Registers)
21989 +def VReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
21990 +    (add VGPR_32,
21991 +    PERSP_SAMPLE_I, PERSP_SAMPLE_J,
21992 +    PERSP_CENTER_I, PERSP_CENTER_J,
21993 +    PERSP_CENTROID_I, PERSP_CENTROID_J,
21994 +    PERSP_I_W, PERSP_J_W, PERSP_1_W,
21995 +    LINEAR_SAMPLE_I, LINEAR_SAMPLE_J,
21996 +    LINEAR_CENTER_I, LINEAR_CENTER_J,
21997 +    LINEAR_CENTROID_I, LINEAR_CENTROID_J,
21998 +    LINE_STIPPLE_TEX_COORD,
21999 +    POS_X_FLOAT,
22000 +    POS_Y_FLOAT,
22001 +    POS_Z_FLOAT,
22002 +    POS_W_FLOAT,
22003 +    FRONT_FACE,
22004 +    ANCILLARY,
22005 +    SAMPLE_COVERAGE,
22006 +    POS_FIXED_PT
22007 +    )
22008 +>;
22009 +
22010 +def VReg_64 : RegisterClass<"AMDGPU", [i64], 64, (add VGPR_64)>;
22011 +
22012 +def VReg_128 : RegisterClass<"AMDGPU", [v4f32], 128, (add VGPR_128)>;
22013 +
22014 +// AllReg_* - A set of all scalar and vector registers of a given width.
22015 +def AllReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, (add VReg_32, SReg_32)>;
22016 +
22017 +def AllReg_64 : RegisterClass<"AMDGPU", [f64, i64], 64, (add SReg_64, VReg_64)>;
22018 +
22019 +// Special register classes for predicates and the M0 register
22020 +def SCCReg : RegisterClass<"AMDGPU", [i1], 1, (add SCC)>;
22021 +def VCCReg : RegisterClass<"AMDGPU", [i1], 1, (add VCC)>;
22022 +def EXECReg : RegisterClass<"AMDGPU", [i1], 1, (add EXEC)>;
22023 +def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>;
22024 +
22025 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SISchedule.td llvm-r600/lib/Target/R600/SISchedule.td
22026 --- llvm-3.2.src/lib/Target/R600/SISchedule.td  1970-01-01 01:00:00.000000000 +0100
22027 +++ llvm-r600/lib/Target/R600/SISchedule.td     2013-01-25 19:43:57.483383054 +0100
22028 @@ -0,0 +1,15 @@
22029 +//===-- SISchedule.td - SI Scheduling definitions -------------------------===//
22030 +//
22031 +//                     The LLVM Compiler Infrastructure
22032 +//
22033 +// This file is distributed under the University of Illinois Open Source
22034 +// License. See LICENSE.TXT for details.
22035 +//
22036 +//===----------------------------------------------------------------------===//
22037 +//
22038 +// TODO: This is just a place holder for now.
22039 +//
22040 +//===----------------------------------------------------------------------===//
22041 +
22042 +
22043 +def SI_Itin : ProcessorItineraries <[], [], []>;
22044 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp llvm-r600/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
22045 --- llvm-3.2.src/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp        1970-01-01 01:00:00.000000000 +0100
22046 +++ llvm-r600/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp   2013-01-25 19:43:57.483383054 +0100
22047 @@ -0,0 +1,26 @@
22048 +//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===//
22049 +//
22050 +//                     The LLVM Compiler Infrastructure
22051 +//
22052 +// This file is distributed under the University of Illinois Open Source
22053 +// License. See LICENSE.TXT for details.
22054 +//
22055 +//===----------------------------------------------------------------------===//
22056 +//
22057 +/// \file
22058 +//
22059 +//===----------------------------------------------------------------------===//
22060 +
22061 +#include "AMDGPU.h"
22062 +#include "llvm/Support/TargetRegistry.h"
22063 +
22064 +using namespace llvm;
22065 +
22066 +/// \brief The target for the AMDGPU backend
22067 +Target llvm::TheAMDGPUTarget;
22068 +
22069 +/// \brief C-linkage hook that registers TheAMDGPUTarget as "r600" (Triple::r600)
22070 +extern "C" void LLVMInitializeR600TargetInfo() {
22071 +  RegisterTarget<Triple::r600, false>
22072 +    R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX");
22073 +}
22074 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/TargetInfo/CMakeLists.txt llvm-r600/lib/Target/R600/TargetInfo/CMakeLists.txt
22075 --- llvm-3.2.src/lib/Target/R600/TargetInfo/CMakeLists.txt      1970-01-01 01:00:00.000000000 +0100
22076 +++ llvm-r600/lib/Target/R600/TargetInfo/CMakeLists.txt 2013-01-25 19:43:57.483383054 +0100
22077 @@ -0,0 +1,7 @@
22078 +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
22079 +
22080 +add_llvm_library(LLVMR600Info
22081 +  AMDGPUTargetInfo.cpp
22082 +  )
22083 +
22084 +add_dependencies(LLVMR600Info AMDGPUCommonTableGen intrinsics_gen)
22085 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/TargetInfo/LLVMBuild.txt llvm-r600/lib/Target/R600/TargetInfo/LLVMBuild.txt
22086 --- llvm-3.2.src/lib/Target/R600/TargetInfo/LLVMBuild.txt       1970-01-01 01:00:00.000000000 +0100
22087 +++ llvm-r600/lib/Target/R600/TargetInfo/LLVMBuild.txt  2013-01-25 19:43:57.483383054 +0100
22088 @@ -0,0 +1,23 @@
22089 +;===- ./lib/Target/R600/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===;
22090 +;
22091 +;                     The LLVM Compiler Infrastructure
22092 +;
22093 +; This file is distributed under the University of Illinois Open Source
22094 +; License. See LICENSE.TXT for details.
22095 +;
22096 +;===------------------------------------------------------------------------===;
22097 +;
22098 +; This is an LLVMBuild description file for the components in this subdirectory.
22099 +;
22100 +; For more information on the LLVMBuild system, please see:
22101 +;
22102 +;   http://llvm.org/docs/LLVMBuild.html
22103 +;
22104 +;===------------------------------------------------------------------------===;
22105 +
22106 +[component_0]
22107 +type = Library
22108 +name = R600Info
22109 +parent = R600
22110 +required_libraries = MC Support
22111 +add_to_library_groups = R600
22112 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/TargetInfo/Makefile llvm-r600/lib/Target/R600/TargetInfo/Makefile
22113 --- llvm-3.2.src/lib/Target/R600/TargetInfo/Makefile    1970-01-01 01:00:00.000000000 +0100
22114 +++ llvm-r600/lib/Target/R600/TargetInfo/Makefile       2013-01-25 19:43:57.483383054 +0100
22115 @@ -0,0 +1,15 @@
22116 +##===- lib/Target/R600/TargetInfo/Makefile ------------------*- Makefile -*-===##
22117 +#
22118 +#                     The LLVM Compiler Infrastructure
22119 +#
22120 +# This file is distributed under the University of Illinois Open Source
22121 +# License. See LICENSE.TXT for details.
22122 +#
22123 +##===----------------------------------------------------------------------===##
22124 +LEVEL = ../../../..
22125 +LIBRARYNAME = LLVMR600Info
22126 +
22127 +# Hack: we need to include 'main' target directory to grab private headers
22128 +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
22129 +
22130 +include $(LEVEL)/Makefile.common
22131 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/add.v4i32.ll llvm-r600/test/CodeGen/R600/add.v4i32.ll
22132 --- llvm-3.2.src/test/CodeGen/R600/add.v4i32.ll 1970-01-01 01:00:00.000000000 +0100
22133 +++ llvm-r600/test/CodeGen/R600/add.v4i32.ll    2013-01-25 19:43:58.460049700 +0100
22134 @@ -0,0 +1,15 @@
22135 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22136 +
22137 +;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22138 +;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22139 +;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22140 +;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22141 +
22142 +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
22143 +  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
22144 +  %a = load <4 x i32> addrspace(1) * %in
22145 +  %b = load <4 x i32> addrspace(1) * %b_ptr
22146 +  %result = add <4 x i32> %a, %b
22147 +  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
22148 +  ret void
22149 +}
22150 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/and.v4i32.ll llvm-r600/test/CodeGen/R600/and.v4i32.ll
22151 --- llvm-3.2.src/test/CodeGen/R600/and.v4i32.ll 1970-01-01 01:00:00.000000000 +0100
22152 +++ llvm-r600/test/CodeGen/R600/and.v4i32.ll    2013-01-25 19:43:58.460049700 +0100
22153 @@ -0,0 +1,15 @@
22154 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22155 +
22156 +;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22157 +;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22158 +;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22159 +;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22160 +
22161 +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
22162 +  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
22163 +  %a = load <4 x i32> addrspace(1) * %in
22164 +  %b = load <4 x i32> addrspace(1) * %b_ptr
22165 +  %result = and <4 x i32> %a, %b
22166 +  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
22167 +  ret void
22168 +}
22169 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll llvm-r600/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll
22170 --- llvm-3.2.src/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll    1970-01-01 01:00:00.000000000 +0100
22171 +++ llvm-r600/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll       2013-01-25 19:43:58.460049700 +0100
22172 @@ -0,0 +1,33 @@
22173 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22174 +
22175 +;CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22176 +
22177 +; This test is for a bug in
22178 +; DAGCombiner::reduceBuildVecConvertToConvertBuildVec() where
22179 +; the wrong type was being passed to
22180 +; TargetLowering::getOperationAction() when checking the legality of
22181 +; ISD::UINT_TO_FP and ISD::SINT_TO_FP opcodes.
22182 +
22183 +define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
22184 +entry:
22185 +  %ptr = getelementptr i32 addrspace(1)* %in, i32 1
22186 +  %sint = load i32 addrspace(1) * %in
22187 +  %conv = sitofp i32 %sint to float
22188 +  %0 = insertelement <4 x float> undef, float %conv, i32 0
22189 +  %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
22190 +  store <4 x float> %splat, <4 x float> addrspace(1)* %out
22191 +  ret void
22192 +}
22193 +
22194 +;CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22195 +
22196 +define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
22197 +entry:
22198 +  %ptr = getelementptr i32 addrspace(1)* %in, i32 1
22199 +  %uint = load i32 addrspace(1) * %in
22200 +  %conv = uitofp i32 %uint to float
22201 +  %0 = insertelement <4 x float> undef, float %conv, i32 0
22202 +  %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
22203 +  store <4 x float> %splat, <4 x float> addrspace(1)* %out
22204 +  ret void
22205 +}
22206 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fabs.ll llvm-r600/test/CodeGen/R600/fabs.ll
22207 --- llvm-3.2.src/test/CodeGen/R600/fabs.ll      1970-01-01 01:00:00.000000000 +0100
22208 +++ llvm-r600/test/CodeGen/R600/fabs.ll 2013-01-25 19:43:58.460049700 +0100
22209 @@ -0,0 +1,16 @@
22210 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22211 +
22212 +;CHECK: MOV T{{[0-9]+\.[XYZW], \|T[0-9]+\.[XYZW]\|}}
22213 +
22214 +define void @test() {
22215 +   %r0 = call float @llvm.R600.load.input(i32 0)
22216 +   %r1 = call float @fabs( float %r0)
22217 +   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
22218 +   ret void
22219 +}
22220 +
22221 +declare float @llvm.R600.load.input(i32) readnone
22222 +
22223 +declare void @llvm.AMDGPU.store.output(float, i32)
22224 +
22225 +declare float @fabs(float ) readnone
22226 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fadd.ll llvm-r600/test/CodeGen/R600/fadd.ll
22227 --- llvm-3.2.src/test/CodeGen/R600/fadd.ll      1970-01-01 01:00:00.000000000 +0100
22228 +++ llvm-r600/test/CodeGen/R600/fadd.ll 2013-01-25 19:43:58.460049700 +0100
22229 @@ -0,0 +1,16 @@
22230 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22231 +
22232 +; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22233 +
22234 +define void @test() {
22235 +   %r0 = call float @llvm.R600.load.input(i32 0)
22236 +   %r1 = call float @llvm.R600.load.input(i32 1)
22237 +   %r2 = fadd float %r0, %r1
22238 +   call void @llvm.AMDGPU.store.output(float %r2, i32 0)
22239 +   ret void
22240 +}
22241 +
22242 +declare float @llvm.R600.load.input(i32) readnone
22243 +
22244 +declare void @llvm.AMDGPU.store.output(float, i32)
22245 +
22246 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fadd.v4f32.ll llvm-r600/test/CodeGen/R600/fadd.v4f32.ll
22247 --- llvm-3.2.src/test/CodeGen/R600/fadd.v4f32.ll        1970-01-01 01:00:00.000000000 +0100
22248 +++ llvm-r600/test/CodeGen/R600/fadd.v4f32.ll   2013-01-25 19:43:58.460049700 +0100
22249 @@ -0,0 +1,15 @@
22250 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22251 +
22252 +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22253 +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22254 +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22255 +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22256 +
22257 +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
22258 +  %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
22259 +  %a = load <4 x float> addrspace(1) * %in
22260 +  %b = load <4 x float> addrspace(1) * %b_ptr
22261 +  %result = fadd <4 x float> %a, %b
22262 +  store <4 x float> %result, <4 x float> addrspace(1)* %out
22263 +  ret void
22264 +}
22265 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fcmp-cnde-int-args.ll llvm-r600/test/CodeGen/R600/fcmp-cnde-int-args.ll
22266 --- llvm-3.2.src/test/CodeGen/R600/fcmp-cnde-int-args.ll        1970-01-01 01:00:00.000000000 +0100
22267 +++ llvm-r600/test/CodeGen/R600/fcmp-cnde-int-args.ll   2013-01-25 19:43:58.460049700 +0100
22268 @@ -0,0 +1,16 @@
22269 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22270 +
22271 +; This test checks a bug in R600TargetLowering::LowerSELECT_CC where the
22272 +; chance to optimize the fcmp + select instructions to CNDE was missed
22273 +; due to the fact that the operands to fcmp and select had different types
22274 +
22275 +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, 0.0, -1}}
22276 +
22277 +define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
22278 +entry:
22279 +  %0 = load float addrspace(1)* %in
22280 +  %cmp = fcmp oeq float %0, 0.000000e+00
22281 +  %value = select i1 %cmp, i32 -1, i32 0
22282 +  store i32 %value, i32 addrspace(1)* %out
22283 +  ret void
22284 +}
22285 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fcmp-cnd.ll llvm-r600/test/CodeGen/R600/fcmp-cnd.ll
22286 --- llvm-3.2.src/test/CodeGen/R600/fcmp-cnd.ll  1970-01-01 01:00:00.000000000 +0100
22287 +++ llvm-r600/test/CodeGen/R600/fcmp-cnd.ll     2013-01-25 19:43:58.460049700 +0100
22288 @@ -0,0 +1,14 @@
22289 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22290 +
22291 +;Not checking arguments 2 and 3 to CNDE, because they may change between
22292 +;registers and literal.x depending on what the optimizer does.
22293 +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22294 +
22295 +define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
22296 +entry:
22297 +  %0 = load float addrspace(1)* %in
22298 +  %cmp = fcmp oeq float %0, 0.000000e+00
22299 +  %value = select i1 %cmp, i32 2, i32 3
22300 +  store i32 %value, i32 addrspace(1)* %out
22301 +  ret void
22302 +}
22303 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fcmp.ll llvm-r600/test/CodeGen/R600/fcmp.ll
22304 --- llvm-3.2.src/test/CodeGen/R600/fcmp.ll      1970-01-01 01:00:00.000000000 +0100
22305 +++ llvm-r600/test/CodeGen/R600/fcmp.ll 2013-01-25 19:43:58.460049700 +0100
22306 @@ -0,0 +1,16 @@
22307 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22308 +
22309 +;CHECK: SETE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22310 +;CHECK: MOV T{{[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
22311 +;CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22312 +
22313 +define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
22314 +entry:
22315 +  %0 = load float addrspace(1)* %in
22316 +  %arrayidx1 = getelementptr inbounds float addrspace(1)* %in, i32 1
22317 +  %1 = load float addrspace(1)* %arrayidx1
22318 +  %cmp = fcmp oeq float %0, %1
22319 +  %sext = sext i1 %cmp to i32
22320 +  store i32 %sext, i32 addrspace(1)* %out
22321 +  ret void
22322 +}
22323 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fdiv.v4f32.ll llvm-r600/test/CodeGen/R600/fdiv.v4f32.ll
22324 --- llvm-3.2.src/test/CodeGen/R600/fdiv.v4f32.ll        1970-01-01 01:00:00.000000000 +0100
22325 +++ llvm-r600/test/CodeGen/R600/fdiv.v4f32.ll   2013-01-25 19:43:58.460049700 +0100
22326 @@ -0,0 +1,19 @@
22327 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22328 +
22329 +;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22330 +;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22331 +;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22332 +;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22333 +;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22334 +;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22335 +;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22336 +;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22337 +
22338 +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
22339 +  %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
22340 +  %a = load <4 x float> addrspace(1) * %in
22341 +  %b = load <4 x float> addrspace(1) * %b_ptr
22342 +  %result = fdiv <4 x float> %a, %b
22343 +  store <4 x float> %result, <4 x float> addrspace(1)* %out
22344 +  ret void
22345 +}
22346 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/floor.ll llvm-r600/test/CodeGen/R600/floor.ll
22347 --- llvm-3.2.src/test/CodeGen/R600/floor.ll     1970-01-01 01:00:00.000000000 +0100
22348 +++ llvm-r600/test/CodeGen/R600/floor.ll        2013-01-25 19:43:58.463383033 +0100
22349 @@ -0,0 +1,16 @@
22350 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22351 +
22352 +;CHECK: FLOOR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22353 +
22354 +define void @test() {
22355 +   %r0 = call float @llvm.R600.load.input(i32 0)
22356 +   %r1 = call float @floor(float %r0)
22357 +   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
22358 +   ret void
22359 +}
22360 +
22361 +declare float @llvm.R600.load.input(i32) readnone
22362 +
22363 +declare void @llvm.AMDGPU.store.output(float, i32)
22364 +
22365 +declare float @floor(float) readonly
22366 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fmax.ll llvm-r600/test/CodeGen/R600/fmax.ll
22367 --- llvm-3.2.src/test/CodeGen/R600/fmax.ll      1970-01-01 01:00:00.000000000 +0100
22368 +++ llvm-r600/test/CodeGen/R600/fmax.ll 2013-01-25 19:43:58.463383033 +0100
22369 @@ -0,0 +1,16 @@
22370 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22371 +
22372 +;CHECK: MAX T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22373 +
22374 +define void @test() {
22375 +   %r0 = call float @llvm.R600.load.input(i32 0)
22376 +   %r1 = call float @llvm.R600.load.input(i32 1)
22377 +   %r2 = fcmp uge float %r0, %r1
22378 +   %r3 = select i1 %r2, float %r0, float %r1
22379 +   call void @llvm.AMDGPU.store.output(float %r3, i32 0)
22380 +   ret void
22381 +}
22382 +
22383 +declare float @llvm.R600.load.input(i32) readnone
22384 +
22385 +declare void @llvm.AMDGPU.store.output(float, i32)
22386 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fmin.ll llvm-r600/test/CodeGen/R600/fmin.ll
22387 --- llvm-3.2.src/test/CodeGen/R600/fmin.ll      1970-01-01 01:00:00.000000000 +0100
22388 +++ llvm-r600/test/CodeGen/R600/fmin.ll 2013-01-25 19:43:58.463383033 +0100
22389 @@ -0,0 +1,16 @@
22390 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22391 +
22392 +;CHECK: MIN T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22393 +
22394 +define void @test() {
22395 +   %r0 = call float @llvm.R600.load.input(i32 0)
22396 +   %r1 = call float @llvm.R600.load.input(i32 1)
22397 +   %r2 = fcmp uge float %r0, %r1
22398 +   %r3 = select i1 %r2, float %r1, float %r0
22399 +   call void @llvm.AMDGPU.store.output(float %r3, i32 0)
22400 +   ret void
22401 +}
22402 +
22403 +declare float @llvm.R600.load.input(i32) readnone
22404 +
22405 +declare void @llvm.AMDGPU.store.output(float, i32)
22406 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fmul.ll llvm-r600/test/CodeGen/R600/fmul.ll
22407 --- llvm-3.2.src/test/CodeGen/R600/fmul.ll      1970-01-01 01:00:00.000000000 +0100
22408 +++ llvm-r600/test/CodeGen/R600/fmul.ll 2013-01-25 19:43:58.463383033 +0100
22409 @@ -0,0 +1,16 @@
22410 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22411 +
22412 +; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22413 +
22414 +define void @test() {
22415 +   %r0 = call float @llvm.R600.load.input(i32 0)
22416 +   %r1 = call float @llvm.R600.load.input(i32 1)
22417 +   %r2 = fmul float %r0, %r1
22418 +   call void @llvm.AMDGPU.store.output(float %r2, i32 0)
22419 +   ret void
22420 +}
22421 +
22422 +declare float @llvm.R600.load.input(i32) readnone
22423 +
22424 +declare void @llvm.AMDGPU.store.output(float, i32)
22425 +
22426 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fmul.v4f32.ll llvm-r600/test/CodeGen/R600/fmul.v4f32.ll
22427 --- llvm-3.2.src/test/CodeGen/R600/fmul.v4f32.ll        1970-01-01 01:00:00.000000000 +0100
22428 +++ llvm-r600/test/CodeGen/R600/fmul.v4f32.ll   2013-01-25 19:43:58.463383033 +0100
22429 @@ -0,0 +1,15 @@
22430 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22431 +
22432 +;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22433 +;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22434 +;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22435 +;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22436 +
22437 +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
22438 +  %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
22439 +  %a = load <4 x float> addrspace(1) * %in
22440 +  %b = load <4 x float> addrspace(1) * %b_ptr
22441 +  %result = fmul <4 x float> %a, %b
22442 +  store <4 x float> %result, <4 x float> addrspace(1)* %out
22443 +  ret void
22444 +}
22445 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fsub.ll llvm-r600/test/CodeGen/R600/fsub.ll
22446 --- llvm-3.2.src/test/CodeGen/R600/fsub.ll      1970-01-01 01:00:00.000000000 +0100
22447 +++ llvm-r600/test/CodeGen/R600/fsub.ll 2013-01-25 19:43:58.463383033 +0100
22448 @@ -0,0 +1,17 @@
22449 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22450 +
22451 +; CHECK: MOV T{{[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
22452 +; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22453 +
22454 +define void @test() {
22455 +   %r0 = call float @llvm.R600.load.input(i32 0)
22456 +   %r1 = call float @llvm.R600.load.input(i32 1)
22457 +   %r2 = fsub float %r0, %r1
22458 +   call void @llvm.AMDGPU.store.output(float %r2, i32 0)
22459 +   ret void
22460 +}
22461 +
22462 +declare float @llvm.R600.load.input(i32) readnone
22463 +
22464 +declare void @llvm.AMDGPU.store.output(float, i32)
22465 +
22466 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fsub.v4f32.ll llvm-r600/test/CodeGen/R600/fsub.v4f32.ll
22467 --- llvm-3.2.src/test/CodeGen/R600/fsub.v4f32.ll        1970-01-01 01:00:00.000000000 +0100
22468 +++ llvm-r600/test/CodeGen/R600/fsub.v4f32.ll   2013-01-25 19:43:58.463383033 +0100
22469 @@ -0,0 +1,15 @@
22470 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22471 +
22472 +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22473 +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22474 +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22475 +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22476 +
22477 +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
22478 +  %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
22479 +  %a = load <4 x float> addrspace(1) * %in
22480 +  %b = load <4 x float> addrspace(1) * %b_ptr
22481 +  %result = fsub <4 x float> %a, %b
22482 +  store <4 x float> %result, <4 x float> addrspace(1)* %out
22483 +  ret void
22484 +}
22485 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/i8_to_double_to_float.ll llvm-r600/test/CodeGen/R600/i8_to_double_to_float.ll
22486 --- llvm-3.2.src/test/CodeGen/R600/i8_to_double_to_float.ll     1970-01-01 01:00:00.000000000 +0100
22487 +++ llvm-r600/test/CodeGen/R600/i8_to_double_to_float.ll        2013-01-25 19:43:58.463383033 +0100
22488 @@ -0,0 +1,11 @@
22489 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22490 +
22491 +;CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22492 +
22493 +define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) {
22494 +  %1 = load i8 addrspace(1)* %in
22495 +  %2 = uitofp i8 %1 to double
22496 +  %3 = fptrunc double %2 to float
22497 +  store float %3, float addrspace(1)* %out
22498 +  ret void
22499 +}
22500 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/icmp-select-sete-reverse-args.ll llvm-r600/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
22501 --- llvm-3.2.src/test/CodeGen/R600/icmp-select-sete-reverse-args.ll     1970-01-01 01:00:00.000000000 +0100
22502 +++ llvm-r600/test/CodeGen/R600/icmp-select-sete-reverse-args.ll        2013-01-25 19:43:58.463383033 +0100
22503 @@ -0,0 +1,18 @@
22504 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22505 +
22506 +;Test that a select with reversed True/False values is correctly lowered
22507 +;to a SETNE_INT.  There should only be one SETNE_INT instruction.
22508 +
22509 +;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22510 +;CHECK-NOT: SETNE_INT
22511 +
22512 +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
22513 +entry:
22514 +  %0 = load i32 addrspace(1)* %in
22515 +  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %in, i32 1
22516 +  %1 = load i32 addrspace(1)* %arrayidx1
22517 +  %cmp = icmp eq i32 %0, %1
22518 +  %value = select i1 %cmp, i32 0, i32 -1
22519 +  store i32 %value, i32 addrspace(1)* %out
22520 +  ret void
22521 +}
22522 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/literals.ll llvm-r600/test/CodeGen/R600/literals.ll
22523 --- llvm-3.2.src/test/CodeGen/R600/literals.ll  1970-01-01 01:00:00.000000000 +0100
22524 +++ llvm-r600/test/CodeGen/R600/literals.ll     2013-01-25 19:43:58.463383033 +0100
22525 @@ -0,0 +1,30 @@
22526 +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22527 +
22528 +; Test using an integer literal constant.
22529 +; Generated ASM should be:
22530 +; ADD_INT REG literal.x, 5
22531 +; or
22532 +; ADD_INT literal.x REG, 5
22533 +
22534 +; CHECK: ADD_INT {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} 5
22535 +define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
22536 +entry:
22537 +  %0 = add i32 5, %in
22538 +  store i32 %0, i32 addrspace(1)* %out
22539 +  ret void
22540 +}
22541 +
22542 +; Test using a float literal constant.
22543 +; Generated ASM should be:
22544 +; ADD REG literal.x, 5.0
22545 +; or
22546 +; ADD literal.x REG, 5.0
22547 +
22548 +; CHECK: ADD {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} {{[0-9]+}}(5.0
22549 +define void @float_literal(float addrspace(1)* %out, float %in) {
22550 +entry:
22551 +  %0 = fadd float 5.0, %in
22552 +  store float %0, float addrspace(1)* %out
22553 +  ret void
22554 +}
22555 +
22556 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/lit.local.cfg llvm-r600/test/CodeGen/R600/lit.local.cfg
22557 --- llvm-3.2.src/test/CodeGen/R600/lit.local.cfg        1970-01-01 01:00:00.000000000 +0100
22558 +++ llvm-r600/test/CodeGen/R600/lit.local.cfg   2013-01-25 19:43:58.463383033 +0100
22559 @@ -0,0 +1,13 @@
22560 +config.suffixes = ['.ll', '.c', '.cpp']
22561 +
22562 +def getRoot(config):
22563 +    if not config.parent:
22564 +        return config
22565 +    return getRoot(config.parent)
22566 +
22567 +root = getRoot(config)
22568 +
22569 +targets = set(root.targets_to_build.split())
22570 +if not 'R600' in targets:
22571 +    config.unsupported = True
22572 +
22573 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/llvm.AMDGPU.mul.ll llvm-r600/test/CodeGen/R600/llvm.AMDGPU.mul.ll
22574 --- llvm-3.2.src/test/CodeGen/R600/llvm.AMDGPU.mul.ll   1970-01-01 01:00:00.000000000 +0100
22575 +++ llvm-r600/test/CodeGen/R600/llvm.AMDGPU.mul.ll      2013-01-25 19:43:58.463383033 +0100
22576 @@ -0,0 +1,17 @@
22577 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22578 +
22579 +;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22580 +
22581 +define void @test() { ; multiplies two loaded inputs with llvm.AMDGPU.mul
22582 +   %r0 = call float @llvm.R600.load.input(i32 0)
22583 +   %r1 = call float @llvm.R600.load.input(i32 1)
22584 +   %r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1) ; intrinsic under test, expected as MUL NON-IEEE
22585 +   call void @llvm.AMDGPU.store.output(float %r2, i32 0) ; passes the product to store.output so it has a use
22586 +   ret void
22587 +}
22588 +
22589 +declare float @llvm.R600.load.input(i32) readnone
22590 +
22591 +declare void @llvm.AMDGPU.store.output(float, i32)
22592 +
22593 +declare float @llvm.AMDGPU.mul(float ,float ) readnone
22594 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/llvm.AMDGPU.trunc.ll llvm-r600/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
22595 --- llvm-3.2.src/test/CodeGen/R600/llvm.AMDGPU.trunc.ll 1970-01-01 01:00:00.000000000 +0100
22596 +++ llvm-r600/test/CodeGen/R600/llvm.AMDGPU.trunc.ll    2013-01-25 19:43:58.463383033 +0100
22597 @@ -0,0 +1,16 @@
22598 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22599 +
22600 +;CHECK: TRUNC T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22601 +
22602 +define void @test() { ; applies llvm.AMDGPU.trunc to one loaded input
22603 +   %r0 = call float @llvm.R600.load.input(i32 0)
22604 +   %r1 = call float @llvm.AMDGPU.trunc( float %r0) ; intrinsic under test, expected as TRUNC
22605 +   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
22606 +   ret void
22607 +}
22608 +
22609 +declare float @llvm.R600.load.input(i32) readnone
22610 +
22611 +declare void @llvm.AMDGPU.store.output(float, i32)
22612 +
22613 +declare float @llvm.AMDGPU.trunc(float ) readnone
22614 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/llvm.cos.ll llvm-r600/test/CodeGen/R600/llvm.cos.ll
22615 --- llvm-3.2.src/test/CodeGen/R600/llvm.cos.ll  1970-01-01 01:00:00.000000000 +0100
22616 +++ llvm-r600/test/CodeGen/R600/llvm.cos.ll     2013-01-25 19:43:58.463383033 +0100
22617 @@ -0,0 +1,16 @@
22618 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22619 +
22620 +;CHECK: COS T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22621 +
22622 +define void @test() { ; applies llvm.cos.f32 to one loaded input
22623 +   %r0 = call float @llvm.R600.load.input(i32 0)
22624 +   %r1 = call float @llvm.cos.f32(float %r0) ; intrinsic under test, expected as COS
22625 +   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
22626 +   ret void
22627 +}
22628 +
22629 +declare float @llvm.cos.f32(float) readnone
22630 +
22631 +declare float @llvm.R600.load.input(i32) readnone
22632 +
22633 +declare void @llvm.AMDGPU.store.output(float, i32)
22634 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/llvm.pow.ll llvm-r600/test/CodeGen/R600/llvm.pow.ll
22635 --- llvm-3.2.src/test/CodeGen/R600/llvm.pow.ll  1970-01-01 01:00:00.000000000 +0100
22636 +++ llvm-r600/test/CodeGen/R600/llvm.pow.ll     2013-01-25 19:43:58.466716366 +0100
22637 @@ -0,0 +1,19 @@
22638 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22639 +
22640 +;CHECK: LOG_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22641 +;CHECK-NEXT: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22642 +;CHECK-NEXT: EXP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22643 +
22644 +define void @test() { ; raises one loaded input to the power of another
22645 +   %r0 = call float @llvm.R600.load.input(i32 0)
22646 +   %r1 = call float @llvm.R600.load.input(i32 1)
22647 +   %r2 = call float @llvm.pow.f32( float %r0, float %r1) ; expected to become LOG_IEEE, MUL_IEEE, EXP_IEEE on consecutive lines
22648 +   call void @llvm.AMDGPU.store.output(float %r2, i32 0)
22649 +   ret void
22650 +}
22650 +}
22651 +
22652 +declare float @llvm.R600.load.input(i32) readnone
22653 +
22654 +declare void @llvm.AMDGPU.store.output(float, i32)
22655 +
22656 +declare float @llvm.pow.f32(float ,float ) readonly
22657 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/llvm.sin.ll llvm-r600/test/CodeGen/R600/llvm.sin.ll
22658 --- llvm-3.2.src/test/CodeGen/R600/llvm.sin.ll  1970-01-01 01:00:00.000000000 +0100
22659 +++ llvm-r600/test/CodeGen/R600/llvm.sin.ll     2013-01-25 19:43:58.466716366 +0100
22660 @@ -0,0 +1,16 @@
22661 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22662 +
22663 +;CHECK: SIN T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22664 +
22665 +define void @test() { ; applies llvm.sin.f32 to one loaded input
22666 +   %r0 = call float @llvm.R600.load.input(i32 0)
22667 +   %r1 = call float @llvm.sin.f32( float %r0) ; intrinsic under test, expected as SIN
22668 +   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
22669 +   ret void
22670 +}
22671 +
22672 +declare float @llvm.sin.f32(float) readnone
22673 +
22674 +declare float @llvm.R600.load.input(i32) readnone
22675 +
22676 +declare void @llvm.AMDGPU.store.output(float, i32)
22677 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/load.constant_addrspace.f32.ll llvm-r600/test/CodeGen/R600/load.constant_addrspace.f32.ll
22678 --- llvm-3.2.src/test/CodeGen/R600/load.constant_addrspace.f32.ll       1970-01-01 01:00:00.000000000 +0100
22679 +++ llvm-r600/test/CodeGen/R600/load.constant_addrspace.f32.ll  2013-01-25 19:43:58.466716366 +0100
22680 @@ -0,0 +1,9 @@
22681 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22682 +
22683 +;CHECK: VTX_READ_32 T{{[0-9]+\.X, T[0-9]+\.X}}
22684 +
22685 +define void @test(float addrspace(1)* %out, float addrspace(2)* %in) { ; copies one float from address space 2 to address space 1
22686 +  %1 = load float addrspace(2)* %in ; address space 2 load (the "constant" address space per the file name), expected as VTX_READ_32
22687 +  store float %1, float addrspace(1)* %out
22688 +  ret void
22689 +}
22690 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/load.i8.ll llvm-r600/test/CodeGen/R600/load.i8.ll
22691 --- llvm-3.2.src/test/CodeGen/R600/load.i8.ll   1970-01-01 01:00:00.000000000 +0100
22692 +++ llvm-r600/test/CodeGen/R600/load.i8.ll      2013-01-25 19:43:58.466716366 +0100
22693 @@ -0,0 +1,10 @@
22694 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22695 +
22696 +;CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
22697 +
22698 +define void @test(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { ; loads one i8 and stores it widened to i32
22699 +  %1 = load i8 addrspace(1)* %in ; byte load, expected as VTX_READ_8
22700 +  %2 = zext i8 %1 to i32 ; zero-extend to 32 bits
22701 +  store i32 %2, i32 addrspace(1)* %out
22702 +  ret void
22703 +}
22704 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/reciprocal.ll llvm-r600/test/CodeGen/R600/reciprocal.ll
22705 --- llvm-3.2.src/test/CodeGen/R600/reciprocal.ll        1970-01-01 01:00:00.000000000 +0100
22706 +++ llvm-r600/test/CodeGen/R600/reciprocal.ll   2013-01-25 19:43:58.466716366 +0100
22707 @@ -0,0 +1,16 @@
22708 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22709 +
22710 +;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22711 +
22712 +define void @test() { ; computes the reciprocal of one loaded input
22713 +   %r0 = call float @llvm.R600.load.input(i32 0)
22714 +   %r1 = fdiv float 1.0, %r0 ; 1.0 / x, expected as RECIP_IEEE
22715 +   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
22716 +   ret void
22717 +}
22718 +
22719 +declare float @llvm.R600.load.input(i32) readnone
22720 +
22721 +declare void @llvm.AMDGPU.store.output(float, i32)
22722 +
22723 +declare float @llvm.AMDGPU.rcp(float ) readnone ; NOTE(review): @test in this file never calls this; the reciprocal comes from the fdiv
22724 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/sdiv.ll llvm-r600/test/CodeGen/R600/sdiv.ll
22725 --- llvm-3.2.src/test/CodeGen/R600/sdiv.ll      1970-01-01 01:00:00.000000000 +0100
22726 +++ llvm-r600/test/CodeGen/R600/sdiv.ll 2013-01-25 19:43:58.466716366 +0100
22727 @@ -0,0 +1,21 @@
22728 +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22729 +
22730 +; The code generated by sdiv is long and complex and may frequently change.
22731 +; The goal of this test is to make sure the ISel doesn't fail.
22732 +;
22733 +; This program was previously failing to compile when one of the selectcc
22734 +; opcodes generated by the sdiv lowering was being legalized and optimized to:
22735 +; selectcc Remainder -1, 0, -1, SETGT
22736 +; This was fixed by adding an additional pattern in R600Instructions.td to
22737 +; match this pattern with a CNDGE_INT.
22738 +
22739 +; CHECK: RETURN
22740 +
22741 +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; signed-divides in[0] by in[1] and stores the quotient
22742 +  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1 ; address of the second i32 (the denominator)
22743 +  %num = load i32 addrspace(1) * %in
22744 +  %den = load i32 addrspace(1) * %den_ptr
22745 +  %result = sdiv i32 %num, %den ; operation whose lowering must not fail instruction selection
22746 +  store i32 %result, i32 addrspace(1)* %out
22747 +  ret void
22748 +}
22749 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/selectcc_cnde_int.ll llvm-r600/test/CodeGen/R600/selectcc_cnde_int.ll
22750 --- llvm-3.2.src/test/CodeGen/R600/selectcc_cnde_int.ll 1970-01-01 01:00:00.000000000 +0100
22751 +++ llvm-r600/test/CodeGen/R600/selectcc_cnde_int.ll    2013-01-25 19:43:58.466716366 +0100
22752 @@ -0,0 +1,11 @@
22753 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22754 +
22755 +;CHECK-NOT: SETE_INT
22756 +;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1, literal.x, 2}}
22757 +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; stores 1 if the loaded i32 is zero, else 2
22758 +  %1 = load i32 addrspace(1)* %in
22759 +  %2 = icmp eq i32 %1, 0 ; compare against zero; no separate SETE_INT is expected for it
22760 +  %3 = select i1 %2, i32 1, i32 2 ; expected to fold with the compare into one CNDE_INT
22761 +  store i32 %3, i32 addrspace(1)* %out
22762 +  ret void
22763 +}
22764 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/selectcc_cnde.ll llvm-r600/test/CodeGen/R600/selectcc_cnde.ll
22765 --- llvm-3.2.src/test/CodeGen/R600/selectcc_cnde.ll     1970-01-01 01:00:00.000000000 +0100
22766 +++ llvm-r600/test/CodeGen/R600/selectcc_cnde.ll        2013-01-25 19:43:58.466716366 +0100
22767 @@ -0,0 +1,11 @@
22768 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22769 +
22770 +;CHECK-NOT: SETE
22771 +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1.0, literal.x, [-0-9]+\(2.0}}
22772 +define void @test(float addrspace(1)* %out, float addrspace(1)* %in) { ; stores 1.0 if the loaded float equals 0.0, else 2.0
22773 +  %1 = load float addrspace(1)* %in
22774 +  %2 = fcmp oeq float %1, 0.0 ; ordered-equal compare against zero; no separate SETE is expected for it
22775 +  %3 = select i1 %2, float 1.0, float 2.0 ; expected to fold with the compare into one CNDE
22776 +  store float %3, float addrspace(1)* %out
22777 +  ret void
22778 +}
22779 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/selectcc-icmp-select-float.ll llvm-r600/test/CodeGen/R600/selectcc-icmp-select-float.ll
22780 --- llvm-3.2.src/test/CodeGen/R600/selectcc-icmp-select-float.ll        1970-01-01 01:00:00.000000000 +0100
22781 +++ llvm-r600/test/CodeGen/R600/selectcc-icmp-select-float.ll   2013-01-25 19:43:58.466716366 +0100
22782 @@ -0,0 +1,15 @@
22783 +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22784 +
22785 +; Note additional optimizations may cause this SGT to be replaced with a
22786 +; CND* instruction.
22787 +; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, -1}}
22788 +; Test a selectcc with i32 LHS/RHS and float True/False
22789 +
22790 +define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { ; integer compare selecting between float values
22791 +entry:
22792 +  %0 = load i32 addrspace(1)* %in
22793 +  %1 = icmp sge i32 %0, 0 ; signed x >= 0; the expected SGT_INT compares against -1 instead
22794 +  %2 = select i1 %1, float 1.0, float 0.0 ; float true/false values chosen by an i32 condition
22795 +  store float %2, float addrspace(1)* %out
22796 +  ret void
22797 +}
22798 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/setcc.v4i32.ll llvm-r600/test/CodeGen/R600/setcc.v4i32.ll
22799 --- llvm-3.2.src/test/CodeGen/R600/setcc.v4i32.ll       1970-01-01 01:00:00.000000000 +0100
22800 +++ llvm-r600/test/CodeGen/R600/setcc.v4i32.ll  2013-01-25 19:43:58.466716366 +0100
22801 @@ -0,0 +1,12 @@
22802 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22803 +;CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22804 +
22805 +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ; element-wise equality of two <4 x i32> vectors
22806 +  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 ; address of the second vector
22807 +  %a = load <4 x i32> addrspace(1) * %in
22808 +  %b = load <4 x i32> addrspace(1) * %b_ptr
22809 +  %result = icmp eq <4 x i32> %a, %b ; vector compare, expected to produce SETE_INT
22810 +  %sext = sext <4 x i1> %result to <4 x i32> ; sign-extend each i1 lane to i32
22811 +  store <4 x i32> %sext, <4 x i32> addrspace(1)* %out
22812 +  ret void
22813 +}
22814 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/short-args.ll llvm-r600/test/CodeGen/R600/short-args.ll
22815 --- llvm-3.2.src/test/CodeGen/R600/short-args.ll        1970-01-01 01:00:00.000000000 +0100
22816 +++ llvm-r600/test/CodeGen/R600/short-args.ll   2013-01-25 19:43:58.466716366 +0100
22817 @@ -0,0 +1,37 @@
22818 +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22819 +
22820 +; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
22821 +
22822 +define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { ; i8 argument without an extension attribute
22823 +entry:
22824 +  %0 = zext i8 %in to i32 ; zero-extend the argument to 32 bits
22825 +  store i32 %0, i32 addrspace(1)* %out, align 4
22826 +  ret void
22827 +}
22828 +
22829 +; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
22830 +
22831 +define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { ; i8 argument carrying the zeroext attribute
22832 +entry:
22833 +  %0 = zext i8 %in to i32 ; zero-extend the argument to 32 bits
22834 +  store i32 %0, i32 addrspace(1)* %out, align 4
22835 +  ret void
22836 +}
22837 +
22838 +; CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
22839 +
22840 +define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { ; i16 argument without an extension attribute
22841 +entry:
22842 +  %0 = zext i16 %in to i32 ; zero-extend the argument to 32 bits
22843 +  store i32 %0, i32 addrspace(1)* %out, align 4
22844 +  ret void
22845 +}
22846 +
22847 +; CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
22848 +
22849 +define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { ; i16 argument carrying the zeroext attribute
22850 +entry:
22851 +  %0 = zext i16 %in to i32 ; zero-extend the argument to 32 bits
22852 +  store i32 %0, i32 addrspace(1)* %out, align 4
22853 +  ret void
22854 +}
22855 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/store.v4f32.ll llvm-r600/test/CodeGen/R600/store.v4f32.ll
22856 --- llvm-3.2.src/test/CodeGen/R600/store.v4f32.ll       1970-01-01 01:00:00.000000000 +0100
22857 +++ llvm-r600/test/CodeGen/R600/store.v4f32.ll  2013-01-25 19:43:58.466716366 +0100
22858 @@ -0,0 +1,9 @@
22859 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22860 +
22861 +;CHECK: RAT_WRITE_CACHELESS_128 T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
22862 +
22863 +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { ; copies one <4 x float> from %in to %out
22864 +  %1 = load <4 x float> addrspace(1) * %in
22865 +  store <4 x float> %1, <4 x float> addrspace(1)* %out ; 128-bit store, expected as RAT_WRITE_CACHELESS_128
22866 +  ret void
22867 +}
22868 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/store.v4i32.ll llvm-r600/test/CodeGen/R600/store.v4i32.ll
22869 --- llvm-3.2.src/test/CodeGen/R600/store.v4i32.ll       1970-01-01 01:00:00.000000000 +0100
22870 +++ llvm-r600/test/CodeGen/R600/store.v4i32.ll  2013-01-25 19:43:58.466716366 +0100
22871 @@ -0,0 +1,9 @@
22872 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22873 +
22874 +;CHECK: RAT_WRITE_CACHELESS_128 T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
22875 +
22876 +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ; copies one <4 x i32> from %in to %out
22877 +  %1 = load <4 x i32> addrspace(1) * %in
22878 +  store <4 x i32> %1, <4 x i32> addrspace(1)* %out ; 128-bit store, expected as RAT_WRITE_CACHELESS_128
22879 +  ret void
22880 +}
22881 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/udiv.v4i32.ll llvm-r600/test/CodeGen/R600/udiv.v4i32.ll
22882 --- llvm-3.2.src/test/CodeGen/R600/udiv.v4i32.ll        1970-01-01 01:00:00.000000000 +0100
22883 +++ llvm-r600/test/CodeGen/R600/udiv.v4i32.ll   2013-01-25 19:43:58.466716366 +0100
22884 @@ -0,0 +1,15 @@
22885 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22886 +
22887 +;The code generated by udiv is long and complex and may frequently change.
22888 +;The goal of this test is to make sure the ISel doesn't fail when it gets
22889 +;a v4i32 udiv
22890 +;CHECK: RETURN
22891 +
22892 +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ; element-wise unsigned division of two <4 x i32> vectors
22893 +  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 ; address of the second vector (the divisor)
22894 +  %a = load <4 x i32> addrspace(1) * %in
22895 +  %b = load <4 x i32> addrspace(1) * %b_ptr
22896 +  %result = udiv <4 x i32> %a, %b ; vector udiv that instruction selection must handle
22897 +  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
22898 +  ret void
22899 +}
22900 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/urem.v4i32.ll llvm-r600/test/CodeGen/R600/urem.v4i32.ll
22901 --- llvm-3.2.src/test/CodeGen/R600/urem.v4i32.ll        1970-01-01 01:00:00.000000000 +0100
22902 +++ llvm-r600/test/CodeGen/R600/urem.v4i32.ll   2013-01-25 19:43:58.470049700 +0100
22903 @@ -0,0 +1,15 @@
22904 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22905 +
22906 +;The code generated by urem is long and complex and may frequently change.
22907 +;The goal of this test is to make sure the ISel doesn't fail when it gets
22908 +;a v4i32 urem
22909 +;CHECK: RETURN
22910 +
22911 +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ; element-wise unsigned remainder of two <4 x i32> vectors
22912 +  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 ; address of the second vector (the divisor)
22913 +  %a = load <4 x i32> addrspace(1) * %in
22914 +  %b = load <4 x i32> addrspace(1) * %b_ptr
22915 +  %result = urem <4 x i32> %a, %b ; vector urem that instruction selection must handle
22916 +  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
22917 +  ret void
22918 +}
22919 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/vec4-expand.ll llvm-r600/test/CodeGen/R600/vec4-expand.ll
22920 --- llvm-3.2.src/test/CodeGen/R600/vec4-expand.ll       1970-01-01 01:00:00.000000000 +0100
22921 +++ llvm-r600/test/CodeGen/R600/vec4-expand.ll  2013-01-25 19:43:58.470049700 +0100
22922 @@ -0,0 +1,49 @@
22923 +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22924 +
22925 +; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22926 +; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22927 +; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22928 +; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22929 +
22930 +define void @fp_to_sint(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { ; converts <4 x float> to signed <4 x i32>
22931 +  %value = load <4 x float> addrspace(1) * %in
22932 +  %result = fptosi <4 x float> %value to <4 x i32> ; expected to yield four FLT_TO_INT instructions
22933 +  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
22934 +  ret void
22935 +}
22936 +
22937 +; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22938 +; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22939 +; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22940 +; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22941 +
22942 +define void @fp_to_uint(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { ; converts <4 x float> to unsigned <4 x i32>
22943 +  %value = load <4 x float> addrspace(1) * %in
22944 +  %result = fptoui <4 x float> %value to <4 x i32> ; expected to yield four FLT_TO_UINT instructions
22945 +  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
22946 +  ret void
22947 +}
22948 +
22949 +; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22950 +; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22951 +; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22952 +; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22953 +
22954 +define void @sint_to_fp(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ; converts signed <4 x i32> to <4 x float>
22955 +  %value = load <4 x i32> addrspace(1) * %in
22956 +  %result = sitofp <4 x i32> %value to <4 x float> ; expected to yield four INT_TO_FLT instructions
22957 +  store <4 x float> %result, <4 x float> addrspace(1)* %out
22958 +  ret void
22959 +}
22960 +
22961 +; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22962 +; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22963 +; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22964 +; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22965 +
22966 +define void @uint_to_fp(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ; converts unsigned <4 x i32> to <4 x float>
22967 +  %value = load <4 x i32> addrspace(1) * %in
22968 +  %result = uitofp <4 x i32> %value to <4 x float> ; expected to yield four UINT_TO_FLT instructions
22969 +  store <4 x float> %result, <4 x float> addrspace(1)* %out
22970 +  ret void
22971 +}
22972 diff -Nur -x .git llvm-3.2.src/test/CodeGen/SI/sanity.ll llvm-r600/test/CodeGen/SI/sanity.ll
22973 --- llvm-3.2.src/test/CodeGen/SI/sanity.ll      1970-01-01 01:00:00.000000000 +0100
22974 +++ llvm-r600/test/CodeGen/SI/sanity.ll 2013-01-25 19:43:58.470049700 +0100
22975 @@ -0,0 +1,37 @@
22976 +;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
22977 +
22978 +; CHECK: S_ENDPGM
22979 +
22980 +define void @main() { ; loads two inputs via llvm.SI.vs.load.input and exports both
22981 +main_body:
22982 +  call void @llvm.AMDGPU.shader.type(i32 1) ; NOTE(review): presumably selects a vertex shader given the llvm.SI.vs.* calls below - confirm
22983 +  %0 = load <4 x i32> addrspace(2)* addrspace(8)* inttoptr (i32 6 to <4 x i32> addrspace(2)* addrspace(8)*)
22984 +  %1 = getelementptr <4 x i32> addrspace(2)* %0, i32 0 ; element 0 behind the pointer loaded above
22985 +  %2 = load <4 x i32> addrspace(2)* %1
22986 +  %3 = call i32 @llvm.SI.vs.load.buffer.index()
22987 +  %4 = call <4 x float> @llvm.SI.vs.load.input(<4 x i32> %2, i32 0, i32 %3) ; first input vector
22988 +  %5 = extractelement <4 x float> %4, i32 0
22989 +  %6 = extractelement <4 x float> %4, i32 1
22990 +  %7 = extractelement <4 x float> %4, i32 2
22991 +  %8 = extractelement <4 x float> %4, i32 3
22992 +  %9 = load <4 x i32> addrspace(2)* addrspace(8)* inttoptr (i32 6 to <4 x i32> addrspace(2)* addrspace(8)*)
22993 +  %10 = getelementptr <4 x i32> addrspace(2)* %9, i32 1 ; element 1 behind the same pointer
22994 +  %11 = load <4 x i32> addrspace(2)* %10
22995 +  %12 = call i32 @llvm.SI.vs.load.buffer.index()
22996 +  %13 = call <4 x float> @llvm.SI.vs.load.input(<4 x i32> %11, i32 0, i32 %12) ; second input vector
22997 +  %14 = extractelement <4 x float> %13, i32 0
22998 +  %15 = extractelement <4 x float> %13, i32 1
22999 +  %16 = extractelement <4 x float> %13, i32 2
23000 +  %17 = extractelement <4 x float> %13, i32 3
23001 +  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %14, float %15, float %16, float %17) ; exports the four lanes of the second input
23002 +  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %5, float %6, float %7, float %8) ; exports the four lanes of the first input
23003 +  ret void
23004 +}
23005 +
23006 +declare void @llvm.AMDGPU.shader.type(i32)
23007 +
23008 +declare i32 @llvm.SI.vs.load.buffer.index() readnone
23009 +
23010 +declare <4 x float> @llvm.SI.vs.load.input(<4 x i32>, i32, i32)
23011 +
23012 +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
23013 diff -Nur -x .git llvm-3.2.src/test/CodeGen/X86/cvtv2f32.ll llvm-r600/test/CodeGen/X86/cvtv2f32.ll
23014 --- llvm-3.2.src/test/CodeGen/X86/cvtv2f32.ll   2012-10-24 06:14:18.000000000 +0200
23015 +++ llvm-r600/test/CodeGen/X86/cvtv2f32.ll      2013-01-25 19:43:58.856716358 +0100
23016 @@ -1,3 +1,7 @@
23017 +; A bug fix in the DAGCombiner made this test fail, so marking as xfail
23018 +; until this can be investigated further.
23019 +; XFAIL: *
23020 +
23021  ; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s
23022
23023  define <2 x float> @foo(i32 %x, i32 %y, <2 x float> %v) {