From: Marcin Krol Date: Mon, 29 Apr 2013 11:35:04 +0000 (+0000) Subject: - from PLD, TLDized X-Git-Url: https://git.tld-linux.org/?a=commitdiff_plain;h=cd83bbc0833500036d5e66f05dd9389d679c608c;p=packages%2Fllvm.git - from PLD, TLDized --- cd83bbc0833500036d5e66f05dd9389d679c608c diff --git a/llvm-2.6-timestamp.patch b/llvm-2.6-timestamp.patch new file mode 100644 index 0000000..ab0979e --- /dev/null +++ b/llvm-2.6-timestamp.patch @@ -0,0 +1,11 @@ +--- llvm-2.6/Makefile.rules.timestamp 2009-08-19 18:04:44.000000000 -0400 ++++ llvm-2.6/Makefile.rules 2009-09-09 02:10:38.287389725 -0400 +@@ -672,7 +672,7 @@ + + ProgInstall = $(INSTALL) $(Install.StripFlag) -m 0755 + ScriptInstall = $(INSTALL) -m 0755 +-DataInstall = $(INSTALL) -m 0644 ++DataInstall = $(INSTALL) -p -m 0644 + + # When compiling under Mingw/Cygwin, the tblgen tool expects Windows + # paths. In this case, the SYSPATH function (defined in diff --git a/llvm-config.patch b/llvm-config.patch new file mode 100644 index 0000000..abdbdb9 --- /dev/null +++ b/llvm-config.patch @@ -0,0 +1,115 @@ +--- llvm-3.1.src/tools/llvm-config/llvm-config.cpp.orig 2012-05-16 00:06:08.000000000 +0200 ++++ llvm-3.1.src/tools/llvm-config/llvm-config.cpp 2012-10-12 17:44:41.041037043 +0200 +@@ -234,11 +234,11 @@ + break; + case CMakeStyle: + ActiveBinDir = ActiveObjRoot + "/bin"; +- ActiveLibDir = ActiveObjRoot + "/lib"; ++ ActiveLibDir = LLVM_LIBDIR; + break; + case CMakeBuildModeStyle: + ActiveBinDir = ActiveObjRoot + "/bin/" + LLVM_BUILDMODE; +- ActiveLibDir = ActiveObjRoot + "/lib/" + LLVM_BUILDMODE; ++ ActiveLibDir = LLVM_LIBDIR "/" LLVM_BUILDMODE; + break; + } + +@@ -249,7 +249,7 @@ + ActivePrefix = CurrentExecPrefix; + ActiveIncludeDir = ActivePrefix + "/include"; + ActiveBinDir = ActivePrefix + "/bin"; +- ActiveLibDir = ActivePrefix + "/lib"; ++ ActiveLibDir = LLVM_LIBDIR; + ActiveIncludeOption = "-I" + ActiveIncludeDir; + } + +--- llvm-3.1.src/autoconf/configure.ac.orig 2012-05-11 22:48:57.000000000 +0200 ++++ 
llvm-3.1.src/autoconf/configure.ac 2012-10-12 17:39:00.668599306 +0200 +@@ -1472,13 +1472,13 @@ + fi + eval LLVM_PREFIX="${prefix}"; + eval LLVM_BINDIR="${prefix}/bin"; +-eval LLVM_LIBDIR="${prefix}/lib"; ++eval LLVM_LIBDIR="${libdir}"; + eval LLVM_DATADIR="${prefix}/share/llvm"; + eval LLVM_DOCSDIR="${prefix}/share/doc/llvm"; + eval LLVM_ETCDIR="${prefix}/etc/llvm"; + eval LLVM_INCLUDEDIR="${prefix}/include"; +-eval LLVM_INFODIR="${prefix}/info"; +-eval LLVM_MANDIR="${prefix}/man"; ++eval LLVM_INFODIR="${datadir}/info"; ++eval LLVM_MANDIR="${datadir}/man"; + LLVM_CONFIGTIME=`date` + AC_SUBST(LLVM_PREFIX) + AC_SUBST(LLVM_BINDIR) +--- llvm-3.2.src/tools/clang/lib/Driver/ToolChains.cpp.orig 2013-01-26 17:40:15.003203777 +0100 ++++ llvm-3.2.src/tools/clang/lib/Driver/ToolChains.cpp 2013-01-26 18:49:17.313536763 +0100 +@@ -2154,7 +2154,7 @@ Linux::Linux(const Driver &D, const llvm + // host system, and a more minimal sysroot available that is the target of + // the cross. + if (StringRef(LibPath).startswith(SysRoot)) { +- addPathIfExists(LibPath + "/../" + GCCTriple.str() + "/lib/../" + Multilib, ++ addPathIfExists(LibPath + "/../" + GCCTriple.str() + "/" + Multilib, + Paths); + addPathIfExists(LibPath + "/" + MultiarchTriple, Paths); + addPathIfExists(LibPath + "/../" + Multilib, Paths); +@@ -2166,14 +2166,14 @@ Linux::Linux(const Driver &D, const llvm + } + } + addPathIfExists(SysRoot + "/lib/" + MultiarchTriple, Paths); +- addPathIfExists(SysRoot + "/lib/../" + Multilib, Paths); +- addPathIfExists(SysRoot + "/usr/lib/" + MultiarchTriple, Paths); +- addPathIfExists(SysRoot + "/usr/lib/../" + Multilib, Paths); ++ addPathIfExists(SysRoot + "/" + Multilib, Paths); ++ addPathIfExists(SysRoot + LLVM_LIBDIR "/" + MultiarchTriple, Paths); ++ addPathIfExists(SysRoot + "/usr/" + Multilib, Paths); + + // Try walking via the GCC triple path in case of multiarch GCC + // installations with strange symlinks. 
+ if (GCCInstallation.isValid()) +- addPathIfExists(SysRoot + "/usr/lib/" + GCCInstallation.getTriple().str() + ++ addPathIfExists(SysRoot + LLVM_LIBDIR "/" + GCCInstallation.getTriple().str() + + "/../../" + Multilib, Paths); + + // Add the non-multilib suffixed paths (if potentially different). +@@ -2189,7 +2189,7 @@ Linux::Linux(const Driver &D, const llvm + } + } + addPathIfExists(SysRoot + "/lib", Paths); +- addPathIfExists(SysRoot + "/usr/lib", Paths); ++ addPathIfExists(SysRoot + LLVM_LIBDIR, Paths); + } + + bool Linux::HasNativeLLVMSupport() const { +--- llvm-3.2.src/tools/clang/lib/Driver/Tools.cpp.orig 2012-11-21 08:56:23.000000000 +0100 ++++ llvm-3.2.src/tools/clang/lib/Driver/Tools.cpp 2013-01-26 18:43:56.952167604 +0100 +@@ -218,7 +218,7 @@ static void addProfileRT(const ToolChain + // libprofile_rt.so. We used to use the -l:libprofile_rt.a syntax, but that is + // not supported by old linkers. + std::string ProfileRT = +- std::string(TC.getDriver().Dir) + "/../lib/libprofile_rt.a"; ++ LLVM_LIBDIR "/libprofile_rt.a"; + + CmdArgs.push_back(Args.MakeArgString(ProfileRT)); + } +@@ -4881,9 +4881,9 @@ void solaris::Link::ConstructJob(Compila + const ArgList &Args, + const char *LinkingOutput) const { + // FIXME: Find a real GCC, don't hard-code versions here +- std::string GCCLibPath = "/usr/gcc/4.5/lib/gcc/"; ++ std::string GCCLibPath = LLVM_LIBDIR "/gcc/"; + const llvm::Triple &T = getToolChain().getTriple(); +- std::string LibPath = "/usr/lib/"; ++ std::string LibPath = LLVM_LIBDIR "/"; + llvm::Triple::ArchType Arch = T.getArch(); + switch (Arch) { + case llvm::Triple::x86: +@@ -6049,7 +6049,7 @@ void linuxtools::Link::ConstructJob(Comp + // forward. 
+ if (D.IsUsingLTO(Args) || Args.hasArg(options::OPT_use_gold_plugin)) { + CmdArgs.push_back("-plugin"); +- std::string Plugin = ToolChain.getDriver().Dir + "/../lib/LLVMgold.so"; ++ std::string Plugin = LLVM_LIBDIR "/LLVMgold.so"; + CmdArgs.push_back(Args.MakeArgString(Plugin)); + } + diff --git a/llvm-r600.patch b/llvm-r600.patch new file mode 100644 index 0000000..0957c01 --- /dev/null +++ b/llvm-r600.patch @@ -0,0 +1,23023 @@ +diff -Nur -x .git llvm-3.2.src/autoconf/configure.ac llvm-r600/autoconf/configure.ac +--- llvm-3.2.src/autoconf/configure.ac 2012-11-21 17:13:35.000000000 +0100 ++++ llvm-r600/autoconf/configure.ac 2013-01-25 19:43:56.096716416 +0100 +@@ -751,6 +751,11 @@ + + if test ${enableval} != "disable" + then ++ if test ${enableval} = "AMDGPU" ++ then ++ AC_MSG_ERROR([The AMDGPU target has been renamed to R600, please reconfigure with --enable-experimental-targets=R600]) ++ enableval="R600" ++ fi + TARGETS_TO_BUILD="$enableval $TARGETS_TO_BUILD" + fi + +diff -Nur -x .git llvm-3.2.src/configure llvm-r600/configure +--- llvm-3.2.src/configure 2012-11-21 17:13:35.000000000 +0100 ++++ llvm-r600/configure 2013-01-25 19:43:56.173383081 +0100 +@@ -5473,6 +5473,13 @@ + + if test ${enableval} != "disable" + then ++ if test ${enableval} = "AMDGPU" ++ then ++ { { echo "$as_me:$LINENO: error: The AMDGPU target has been renamed to R600, please reconfigure with --enable-experimental-targets=R600" >&5 ++echo "$as_me: error: The AMDGPU target has been renamed to R600, please reconfigure with --enable-experimental-targets=R600" >&2;} ++ { (exit 1); exit 1; }; } ++ enableval="R600" ++ fi + TARGETS_TO_BUILD="$enableval $TARGETS_TO_BUILD" + fi + +@@ -10316,7 +10323,7 @@ + lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 + lt_status=$lt_dlunknown + cat > conftest.$ac_ext < ++ : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>, ++ GCCBuiltin; ++ ++multiclass R600ReadPreloadRegisterIntrinsic_xyz { ++ def _x : R600ReadPreloadRegisterIntrinsic; ++ def _y : 
R600ReadPreloadRegisterIntrinsic; ++ def _z : R600ReadPreloadRegisterIntrinsic; ++} ++ ++defm int_r600_read_global_size : R600ReadPreloadRegisterIntrinsic_xyz < ++ "__builtin_r600_read_global_size">; ++defm int_r600_read_local_size : R600ReadPreloadRegisterIntrinsic_xyz < ++ "__builtin_r600_read_local_size">; ++defm int_r600_read_ngroups : R600ReadPreloadRegisterIntrinsic_xyz < ++ "__builtin_r600_read_ngroups">; ++defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz < ++ "__builtin_r600_read_tgid">; ++defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz < ++ "__builtin_r600_read_tidig">; ++} // End TargetPrefix = "r600" +diff -Nur -x .git llvm-3.2.src/include/llvm/Intrinsics.td llvm-r600/include/llvm/Intrinsics.td +--- llvm-3.2.src/include/llvm/Intrinsics.td 2012-10-20 01:00:20.000000000 +0200 ++++ llvm-r600/include/llvm/Intrinsics.td 2013-01-25 19:43:56.426716409 +0100 +@@ -469,3 +469,4 @@ + include "llvm/IntrinsicsHexagon.td" + include "llvm/IntrinsicsNVVM.td" + include "llvm/IntrinsicsMips.td" ++include "llvm/IntrinsicsR600.td" +diff -Nur -x .git llvm-3.2.src/lib/CodeGen/SelectionDAG/DAGCombiner.cpp llvm-r600/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +--- llvm-3.2.src/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 2012-11-26 18:01:12.000000000 +0100 ++++ llvm-r600/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 2013-01-25 19:43:56.720049736 +0100 +@@ -8514,11 +8514,8 @@ + if (Opcode == ISD::DELETED_NODE && + (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) { + Opcode = Opc; +- // If not supported by target, bail out. 
+- if (TLI.getOperationAction(Opcode, VT) != TargetLowering::Legal && +- TLI.getOperationAction(Opcode, VT) != TargetLowering::Custom) +- return SDValue(); + } ++ + if (Opc != Opcode) + return SDValue(); + +@@ -8543,6 +8540,10 @@ + assert(SrcVT != MVT::Other && "Cannot determine source type!"); + + EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars); ++ ++ if (!TLI.isOperationLegalOrCustom(Opcode, NVT)) ++ return SDValue(); ++ + SmallVector Opnds; + for (unsigned i = 0; i != NumInScalars; ++i) { + SDValue In = N->getOperand(i); +diff -Nur -x .git llvm-3.2.src/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp llvm-r600/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +--- llvm-3.2.src/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp 2012-10-24 19:25:11.000000000 +0200 ++++ llvm-r600/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp 2013-01-25 19:43:56.733383069 +0100 +@@ -731,9 +731,10 @@ + return; + } + case TargetLowering::Promote: { +- assert(VT.isVector() && "Unknown legal promote case!"); +- Value = DAG.getNode(ISD::BITCAST, dl, +- TLI.getTypeToPromoteTo(ISD::STORE, VT), Value); ++ EVT NVT = TLI.getTypeToPromoteTo(ISD::STORE, VT); ++ assert(NVT.getSizeInBits() == VT.getSizeInBits() && ++ "Can only promote stores to same size type"); ++ Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value); + SDValue Result = + DAG.getStore(Chain, dl, Value, Ptr, + ST->getPointerInfo(), isVolatile, +@@ -889,10 +890,9 @@ + break; + } + case TargetLowering::Promote: { +- // Only promote a load of vector type to another. +- assert(VT.isVector() && "Cannot promote this load!"); +- // Change base type to a different vector type. 
+ EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT); ++ assert(NVT.getSizeInBits() == VT.getSizeInBits() && ++ "Can only promote loads to same size type"); + + SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(), + LD->isVolatile(), LD->isNonTemporal(), +diff -Nur -x .git llvm-3.2.src/lib/Target/LLVMBuild.txt llvm-r600/lib/Target/LLVMBuild.txt +--- llvm-3.2.src/lib/Target/LLVMBuild.txt 2012-07-16 20:19:46.000000000 +0200 ++++ llvm-r600/lib/Target/LLVMBuild.txt 2013-01-25 19:43:57.173383060 +0100 +@@ -16,7 +16,7 @@ + ;===------------------------------------------------------------------------===; + + [common] +-subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC Sparc X86 XCore ++subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC R600 Sparc X86 XCore + + ; This is a special group whose required libraries are extended (by llvm-build) + ; with the best execution engine (the native JIT, if available, or the +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUAsmPrinter.cpp llvm-r600/lib/Target/R600/AMDGPUAsmPrinter.cpp +--- llvm-3.2.src/lib/Target/R600/AMDGPUAsmPrinter.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUAsmPrinter.cpp 2013-01-25 19:43:57.423383055 +0100 +@@ -0,0 +1,138 @@ ++//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer --------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// ++/// The AMDGPUAsmPrinter is used to print both assembly string and also binary ++/// code. When passed an MCAsmStreamer it prints assembly and when passed ++/// an MCObjectStreamer it outputs binary code. 
++// ++//===----------------------------------------------------------------------===// ++// ++ ++ ++#include "AMDGPUAsmPrinter.h" ++#include "AMDGPU.h" ++#include "SIMachineFunctionInfo.h" ++#include "SIRegisterInfo.h" ++#include "llvm/MC/MCStreamer.h" ++#include "llvm/Target/TargetLoweringObjectFile.h" ++#include "llvm/Support/TargetRegistry.h" ++ ++using namespace llvm; ++ ++ ++static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm, ++ MCStreamer &Streamer) { ++ return new AMDGPUAsmPrinter(tm, Streamer); ++} ++ ++extern "C" void LLVMInitializeR600AsmPrinter() { ++ TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass); ++} ++ ++/// We need to override this function so we can avoid ++/// the call to EmitFunctionHeader(), which the MCPureStreamer can't handle. ++bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { ++ const AMDGPUSubtarget &STM = TM.getSubtarget(); ++ if (STM.dumpCode()) { ++#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) ++ MF.dump(); ++#endif ++ } ++ SetupMachineFunction(MF); ++ OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); ++ if (STM.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { ++ EmitProgramInfo(MF); ++ } ++ EmitFunctionBody(); ++ return false; ++} ++ ++void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) { ++ unsigned MaxSGPR = 0; ++ unsigned MaxVGPR = 0; ++ bool VCCUsed = false; ++ const SIRegisterInfo * RI = ++ static_cast(TM.getRegisterInfo()); ++ ++ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); ++ BB != BB_E; ++BB) { ++ MachineBasicBlock &MBB = *BB; ++ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); ++ I != E; ++I) { ++ MachineInstr &MI = *I; ++ ++ unsigned numOperands = MI.getNumOperands(); ++ for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { ++ MachineOperand & MO = MI.getOperand(op_idx); ++ unsigned maxUsed; ++ unsigned width = 0; ++ bool isSGPR = false; ++ unsigned reg; ++ unsigned hwReg; ++ if 
(!MO.isReg()) { ++ continue; ++ } ++ reg = MO.getReg(); ++ if (reg == AMDGPU::VCC) { ++ VCCUsed = true; ++ continue; ++ } ++ switch (reg) { ++ default: break; ++ case AMDGPU::EXEC: ++ case AMDGPU::SI_LITERAL_CONSTANT: ++ case AMDGPU::SREG_LIT_0: ++ case AMDGPU::M0: ++ continue; ++ } ++ ++ if (AMDGPU::SReg_32RegClass.contains(reg)) { ++ isSGPR = true; ++ width = 1; ++ } else if (AMDGPU::VReg_32RegClass.contains(reg)) { ++ isSGPR = false; ++ width = 1; ++ } else if (AMDGPU::SReg_64RegClass.contains(reg)) { ++ isSGPR = true; ++ width = 2; ++ } else if (AMDGPU::VReg_64RegClass.contains(reg)) { ++ isSGPR = false; ++ width = 2; ++ } else if (AMDGPU::SReg_128RegClass.contains(reg)) { ++ isSGPR = true; ++ width = 4; ++ } else if (AMDGPU::VReg_128RegClass.contains(reg)) { ++ isSGPR = false; ++ width = 4; ++ } else if (AMDGPU::SReg_256RegClass.contains(reg)) { ++ isSGPR = true; ++ width = 8; ++ } else { ++ assert(!"Unknown register class"); ++ } ++ hwReg = RI->getEncodingValue(reg); ++ maxUsed = hwReg + width - 1; ++ if (isSGPR) { ++ MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; ++ } else { ++ MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR; ++ } ++ } ++ } ++ } ++ if (VCCUsed) { ++ MaxSGPR += 2; ++ } ++ SIMachineFunctionInfo * MFI = MF.getInfo(); ++ OutStreamer.EmitIntValue(MaxSGPR + 1, 4); ++ OutStreamer.EmitIntValue(MaxVGPR + 1, 4); ++ OutStreamer.EmitIntValue(MFI->SPIPSInputAddr, 4); ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUAsmPrinter.h llvm-r600/lib/Target/R600/AMDGPUAsmPrinter.h +--- llvm-3.2.src/lib/Target/R600/AMDGPUAsmPrinter.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUAsmPrinter.h 2013-01-25 19:43:57.426716388 +0100 +@@ -0,0 +1,44 @@ ++//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code -------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief AMDGPU Assembly printer class. ++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef AMDGPU_ASMPRINTER_H ++#define AMDGPU_ASMPRINTER_H ++ ++#include "llvm/CodeGen/AsmPrinter.h" ++ ++namespace llvm { ++ ++class AMDGPUAsmPrinter : public AsmPrinter { ++ ++public: ++ explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) ++ : AsmPrinter(TM, Streamer) { } ++ ++ virtual bool runOnMachineFunction(MachineFunction &MF); ++ ++ virtual const char *getPassName() const { ++ return "AMDGPU Assembly Printer"; ++ } ++ ++ /// \brief Emit register usage information so that the GPU driver ++ /// can correctly setup the GPU state. ++ void EmitProgramInfo(MachineFunction &MF); ++ ++ /// Implemented in AMDGPUMCInstLower.cpp ++ virtual void EmitInstruction(const MachineInstr *MI); ++}; ++ ++} // End anonymous llvm ++ ++#endif //AMDGPU_ASMPRINTER_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUCodeEmitter.h llvm-r600/lib/Target/R600/AMDGPUCodeEmitter.h +--- llvm-3.2.src/lib/Target/R600/AMDGPUCodeEmitter.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUCodeEmitter.h 2013-01-25 19:43:57.426716388 +0100 +@@ -0,0 +1,49 @@ ++//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief CodeEmitter interface for R600 and SI codegen. 
++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef AMDGPUCODEEMITTER_H ++#define AMDGPUCODEEMITTER_H ++ ++namespace llvm { ++ ++class AMDGPUCodeEmitter { ++public: ++ uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const; ++ virtual uint64_t getMachineOpValue(const MachineInstr &MI, ++ const MachineOperand &MO) const { return 0; } ++ virtual unsigned GPR4AlignEncode(const MachineInstr &MI, ++ unsigned OpNo) const { ++ return 0; ++ } ++ virtual unsigned GPR2AlignEncode(const MachineInstr &MI, ++ unsigned OpNo) const { ++ return 0; ++ } ++ virtual uint64_t VOPPostEncode(const MachineInstr &MI, ++ uint64_t Value) const { ++ return Value; ++ } ++ virtual uint64_t i32LiteralEncode(const MachineInstr &MI, ++ unsigned OpNo) const { ++ return 0; ++ } ++ virtual uint32_t SMRDmemriEncode(const MachineInstr &MI, unsigned OpNo) ++ const { ++ return 0; ++ } ++}; ++ ++} // End namespace llvm ++ ++#endif // AMDGPUCODEEMITTER_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUConvertToISA.cpp llvm-r600/lib/Target/R600/AMDGPUConvertToISA.cpp +--- llvm-3.2.src/lib/Target/R600/AMDGPUConvertToISA.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUConvertToISA.cpp 2013-01-25 19:43:57.426716388 +0100 +@@ -0,0 +1,62 @@ ++//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief This pass lowers AMDIL machine instructions to the appropriate ++/// hardware instructions. 
++// ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPU.h" ++#include "AMDGPUInstrInfo.h" ++#include "llvm/CodeGen/MachineFunctionPass.h" ++ ++using namespace llvm; ++ ++namespace { ++ ++class AMDGPUConvertToISAPass : public MachineFunctionPass { ++ ++private: ++ static char ID; ++ TargetMachine &TM; ++ ++public: ++ AMDGPUConvertToISAPass(TargetMachine &tm) : ++ MachineFunctionPass(ID), TM(tm) { } ++ ++ virtual bool runOnMachineFunction(MachineFunction &MF); ++ ++ virtual const char *getPassName() const {return "AMDGPU Convert to ISA";} ++ ++}; ++ ++} // End anonymous namespace ++ ++char AMDGPUConvertToISAPass::ID = 0; ++ ++FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) { ++ return new AMDGPUConvertToISAPass(tm); ++} ++ ++bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF) { ++ const AMDGPUInstrInfo * TII = ++ static_cast(TM.getInstrInfo()); ++ ++ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); ++ BB != BB_E; ++BB) { ++ MachineBasicBlock &MBB = *BB; ++ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); ++ I != E; ++I) { ++ MachineInstr &MI = *I; ++ TII->convertToISA(MI, MF, MBB.findDebugLoc(I)); ++ } ++ } ++ return false; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPU.h llvm-r600/lib/Target/R600/AMDGPU.h +--- llvm-3.2.src/lib/Target/R600/AMDGPU.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPU.h 2013-01-25 19:43:57.423383055 +0100 +@@ -0,0 +1,51 @@ ++//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++/// \file ++//===----------------------------------------------------------------------===// ++ ++#ifndef AMDGPU_H ++#define AMDGPU_H ++ ++#include "AMDGPUTargetMachine.h" ++#include "llvm/Support/TargetRegistry.h" ++#include "llvm/Target/TargetMachine.h" ++ ++namespace llvm { ++ ++class FunctionPass; ++class AMDGPUTargetMachine; ++ ++// R600 Passes ++FunctionPass* createR600KernelParametersPass(const DataLayout *TD); ++FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); ++FunctionPass *createR600LowerConstCopy(TargetMachine &tm); ++ ++// SI Passes ++FunctionPass *createSIAnnotateControlFlowPass(); ++FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm); ++FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); ++FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); ++FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm); ++FunctionPass *createSIInsertWaits(TargetMachine &tm); ++ ++// Passes common to R600 and SI ++Pass *createAMDGPUStructurizeCFGPass(); ++FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm); ++ ++} // End namespace llvm ++ ++namespace ShaderType { ++ enum Type { ++ PIXEL = 0, ++ VERTEX = 1, ++ GEOMETRY = 2, ++ COMPUTE = 3 ++ }; ++} ++ ++#endif // AMDGPU_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.cpp llvm-r600/lib/Target/R600/AMDGPUInstrInfo.cpp +--- llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUInstrInfo.cpp 2013-01-25 19:43:57.426716388 +0100 +@@ -0,0 +1,257 @@ ++//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Implementation of the TargetInstrInfo class that is common to all ++/// AMD GPUs. ++// ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPUInstrInfo.h" ++#include "AMDGPURegisterInfo.h" ++#include "AMDGPUTargetMachine.h" ++#include "AMDIL.h" ++#include "llvm/CodeGen/MachineFrameInfo.h" ++#include "llvm/CodeGen/MachineInstrBuilder.h" ++#include "llvm/CodeGen/MachineRegisterInfo.h" ++ ++#define GET_INSTRINFO_CTOR ++#include "AMDGPUGenInstrInfo.inc" ++ ++using namespace llvm; ++ ++AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm) ++ : AMDGPUGenInstrInfo(0,0), RI(tm, *this), TM(tm) { } ++ ++const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { ++ return RI; ++} ++ ++bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, ++ unsigned &SrcReg, unsigned &DstReg, ++ unsigned &SubIdx) const { ++// TODO: Implement this function ++ return false; ++} ++ ++unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, ++ int &FrameIndex) const { ++// TODO: Implement this function ++ return 0; ++} ++ ++unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, ++ int &FrameIndex) const { ++// TODO: Implement this function ++ return 0; ++} ++ ++bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI, ++ const MachineMemOperand *&MMO, ++ int &FrameIndex) const { ++// TODO: Implement this function ++ return false; ++} ++unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI, ++ int &FrameIndex) const { ++// TODO: Implement this function ++ return 0; ++} ++unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI, ++ int &FrameIndex) const { ++// TODO: Implement this function ++ return 0; ++} ++bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI, ++ const MachineMemOperand *&MMO, ++ int &FrameIndex) const { 
++// TODO: Implement this function ++ return false; ++} ++ ++MachineInstr * ++AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, ++ MachineBasicBlock::iterator &MBBI, ++ LiveVariables *LV) const { ++// TODO: Implement this function ++ return NULL; ++} ++bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter, ++ MachineBasicBlock &MBB) const { ++ while (iter != MBB.end()) { ++ switch (iter->getOpcode()) { ++ default: ++ break; ++ case AMDGPU::BRANCH_COND_i32: ++ case AMDGPU::BRANCH_COND_f32: ++ case AMDGPU::BRANCH: ++ return true; ++ }; ++ ++iter; ++ } ++ return false; ++} ++ ++MachineBasicBlock::iterator skipFlowControl(MachineBasicBlock *MBB) { ++ MachineBasicBlock::iterator tmp = MBB->end(); ++ if (!MBB->size()) { ++ return MBB->end(); ++ } ++ while (--tmp) { ++ if (tmp->getOpcode() == AMDGPU::ENDLOOP ++ || tmp->getOpcode() == AMDGPU::ENDIF ++ || tmp->getOpcode() == AMDGPU::ELSE) { ++ if (tmp == MBB->begin()) { ++ return tmp; ++ } else { ++ continue; ++ } ++ } else { ++ return ++tmp; ++ } ++ } ++ return MBB->end(); ++} ++ ++void ++AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MI, ++ unsigned SrcReg, bool isKill, ++ int FrameIndex, ++ const TargetRegisterClass *RC, ++ const TargetRegisterInfo *TRI) const { ++ assert(!"Not Implemented"); ++} ++ ++void ++AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MI, ++ unsigned DestReg, int FrameIndex, ++ const TargetRegisterClass *RC, ++ const TargetRegisterInfo *TRI) const { ++ assert(!"Not Implemented"); ++} ++ ++MachineInstr * ++AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, ++ MachineInstr *MI, ++ const SmallVectorImpl &Ops, ++ int FrameIndex) const { ++// TODO: Implement this function ++ return 0; ++} ++MachineInstr* ++AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, ++ MachineInstr *MI, ++ const SmallVectorImpl &Ops, ++ MachineInstr *LoadMI) const { ++ // TODO: 
Implement this function ++ return 0; ++} ++bool ++AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, ++ const SmallVectorImpl &Ops) const { ++ // TODO: Implement this function ++ return false; ++} ++bool ++AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, ++ unsigned Reg, bool UnfoldLoad, ++ bool UnfoldStore, ++ SmallVectorImpl &NewMIs) const { ++ // TODO: Implement this function ++ return false; ++} ++ ++bool ++AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, ++ SmallVectorImpl &NewNodes) const { ++ // TODO: Implement this function ++ return false; ++} ++ ++unsigned ++AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, ++ bool UnfoldLoad, bool UnfoldStore, ++ unsigned *LoadRegIndex) const { ++ // TODO: Implement this function ++ return 0; ++} ++ ++bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, ++ int64_t Offset1, int64_t Offset2, ++ unsigned NumLoads) const { ++ assert(Offset2 > Offset1 ++ && "Second offset should be larger than first offset!"); ++ // If we have less than 16 loads in a row, and the offsets are within 16, ++ // then schedule together. 
++ // TODO: Make the loads schedule near if it fits in a cacheline ++ return (NumLoads < 16 && (Offset2 - Offset1) < 16); ++} ++ ++bool ++AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl &Cond) ++ const { ++ // TODO: Implement this function ++ return true; ++} ++void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MI) const { ++ // TODO: Implement this function ++} ++ ++bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const { ++ // TODO: Implement this function ++ return false; ++} ++bool ++AMDGPUInstrInfo::SubsumesPredicate(const SmallVectorImpl &Pred1, ++ const SmallVectorImpl &Pred2) ++ const { ++ // TODO: Implement this function ++ return false; ++} ++ ++bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI, ++ std::vector &Pred) const { ++ // TODO: Implement this function ++ return false; ++} ++ ++bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const { ++ // TODO: Implement this function ++ return MI->getDesc().isPredicable(); ++} ++ ++bool ++AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { ++ // TODO: Implement this function ++ return true; ++} ++ ++void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF, ++ DebugLoc DL) const { ++ MachineRegisterInfo &MRI = MF.getRegInfo(); ++ const AMDGPURegisterInfo & RI = getRegisterInfo(); ++ ++ for (unsigned i = 0; i < MI.getNumOperands(); i++) { ++ MachineOperand &MO = MI.getOperand(i); ++ // Convert dst regclass to one that is supported by the ISA ++ if (MO.isReg() && MO.isDef()) { ++ if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) { ++ const TargetRegisterClass * oldRegClass = MRI.getRegClass(MO.getReg()); ++ const TargetRegisterClass * newRegClass = RI.getISARegClass(oldRegClass); ++ ++ assert(newRegClass); ++ ++ MRI.setRegClass(MO.getReg(), newRegClass); ++ } ++ } ++ } ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.h llvm-r600/lib/Target/R600/AMDGPUInstrInfo.h +--- 
llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUInstrInfo.h 2013-01-25 19:43:57.430049721 +0100 +@@ -0,0 +1,149 @@ ++//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Contains the definition of a TargetInstrInfo class that is common ++/// to all AMD GPUs. ++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef AMDGPUINSTRUCTIONINFO_H ++#define AMDGPUINSTRUCTIONINFO_H ++ ++#include "AMDGPURegisterInfo.h" ++#include "AMDGPUInstrInfo.h" ++#include "llvm/Target/TargetInstrInfo.h" ++ ++#include ++ ++#define GET_INSTRINFO_HEADER ++#define GET_INSTRINFO_ENUM ++#include "AMDGPUGenInstrInfo.inc" ++ ++#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT ++#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT ++#define OPCODE_IS_ZERO AMDGPU::PRED_SETE ++#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE ++ ++namespace llvm { ++ ++class AMDGPUTargetMachine; ++class MachineFunction; ++class MachineInstr; ++class MachineInstrBuilder; ++ ++class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { ++private: ++ const AMDGPURegisterInfo RI; ++ TargetMachine &TM; ++ bool getNextBranchInstr(MachineBasicBlock::iterator &iter, ++ MachineBasicBlock &MBB) const; ++public: ++ explicit AMDGPUInstrInfo(TargetMachine &tm); ++ ++ virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; ++ ++ bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, ++ unsigned &DstReg, unsigned &SubIdx) const; ++ ++ unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; ++ unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, ++ int &FrameIndex) const; ++ 
bool hasLoadFromStackSlot(const MachineInstr *MI, ++ const MachineMemOperand *&MMO, ++ int &FrameIndex) const; ++ unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; ++ unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, ++ int &FrameIndex) const; ++ bool hasStoreFromStackSlot(const MachineInstr *MI, ++ const MachineMemOperand *&MMO, ++ int &FrameIndex) const; ++ ++ MachineInstr * ++ convertToThreeAddress(MachineFunction::iterator &MFI, ++ MachineBasicBlock::iterator &MBBI, ++ LiveVariables *LV) const; ++ ++ ++ virtual void copyPhysReg(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MI, DebugLoc DL, ++ unsigned DestReg, unsigned SrcReg, ++ bool KillSrc) const = 0; ++ ++ void storeRegToStackSlot(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MI, ++ unsigned SrcReg, bool isKill, int FrameIndex, ++ const TargetRegisterClass *RC, ++ const TargetRegisterInfo *TRI) const; ++ void loadRegFromStackSlot(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MI, ++ unsigned DestReg, int FrameIndex, ++ const TargetRegisterClass *RC, ++ const TargetRegisterInfo *TRI) const; ++ ++protected: ++ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, ++ MachineInstr *MI, ++ const SmallVectorImpl &Ops, ++ int FrameIndex) const; ++ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, ++ MachineInstr *MI, ++ const SmallVectorImpl &Ops, ++ MachineInstr *LoadMI) const; ++public: ++ bool canFoldMemoryOperand(const MachineInstr *MI, ++ const SmallVectorImpl &Ops) const; ++ bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, ++ unsigned Reg, bool UnfoldLoad, bool UnfoldStore, ++ SmallVectorImpl &NewMIs) const; ++ bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, ++ SmallVectorImpl &NewNodes) const; ++ unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, ++ bool UnfoldLoad, bool UnfoldStore, ++ unsigned *LoadRegIndex = 0) const; ++ bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, ++ int64_t Offset1, int64_t 
Offset2, ++ unsigned NumLoads) const; ++ ++ bool ReverseBranchCondition(SmallVectorImpl &Cond) const; ++ void insertNoop(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MI) const; ++ bool isPredicated(const MachineInstr *MI) const; ++ bool SubsumesPredicate(const SmallVectorImpl &Pred1, ++ const SmallVectorImpl &Pred2) const; ++ bool DefinesPredicate(MachineInstr *MI, ++ std::vector &Pred) const; ++ bool isPredicable(MachineInstr *MI) const; ++ bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const; ++ ++ // Helper functions that check the opcode for status information ++ bool isLoadInst(llvm::MachineInstr *MI) const; ++ bool isExtLoadInst(llvm::MachineInstr *MI) const; ++ bool isSWSExtLoadInst(llvm::MachineInstr *MI) const; ++ bool isSExtLoadInst(llvm::MachineInstr *MI) const; ++ bool isZExtLoadInst(llvm::MachineInstr *MI) const; ++ bool isAExtLoadInst(llvm::MachineInstr *MI) const; ++ bool isStoreInst(llvm::MachineInstr *MI) const; ++ bool isTruncStoreInst(llvm::MachineInstr *MI) const; ++ ++ virtual MachineInstr* getMovImmInstr(MachineFunction *MF, unsigned DstReg, ++ int64_t Imm) const = 0; ++ virtual unsigned getIEQOpcode() const = 0; ++ virtual bool isMov(unsigned opcode) const = 0; ++ ++ /// \brief Convert the AMDIL MachineInstr to a supported ISA ++ /// MachineInstr ++ virtual void convertToISA(MachineInstr & MI, MachineFunction &MF, ++ DebugLoc DL) const; ++ ++}; ++ ++} // End llvm namespace ++ ++#endif // AMDGPUINSTRUCTIONINFO_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.td llvm-r600/lib/Target/R600/AMDGPUInstrInfo.td +--- llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUInstrInfo.td 2013-01-25 19:43:57.430049721 +0100 +@@ -0,0 +1,74 @@ ++//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++//
License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This file contains DAG node definitions for the AMDGPU target. ++// ++//===----------------------------------------------------------------------===// ++ ++//===----------------------------------------------------------------------===// ++// AMDGPU DAG Profiles ++//===----------------------------------------------------------------------===// ++ ++def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ ++ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> ++]>; ++ ++//===----------------------------------------------------------------------===// ++// AMDGPU DAG Nodes ++// ++ ++// out = (((a << 32) | b) >> c) ++// ++// Can be used to optimize rotl: ++// rotl(a, b) = bitalign(a, a, 32 - b) ++def AMDGPUbitalign : SDNode<"AMDGPUISD::BITALIGN", AMDGPUDTIntTernaryOp>; ++ ++// The argument to this node is a dword address. ++def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; ++ ++// out = a - floor(a) ++def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; ++ ++// out = max(a, b) a and b are floats ++def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp, ++ [SDNPCommutative, SDNPAssociative] ++>; ++ ++// out = max(a, b) a and b are signed ints ++def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, ++ [SDNPCommutative, SDNPAssociative] ++>; ++ ++// out = max(a, b) a and b are unsigned ints ++def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp, ++ [SDNPCommutative, SDNPAssociative] ++>; ++ ++// out = min(a, b) a and b are floats ++def AMDGPUfmin : SDNode<"AMDGPUISD::FMIN", SDTFPBinOp, ++ [SDNPCommutative, SDNPAssociative] ++>; ++ ++// out = min(a, b) a and b are signed ints ++def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp, ++ [SDNPCommutative, SDNPAssociative] ++>; ++ ++// out = min(a, b) a and b are unsigned ints ++def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp, ++ [SDNPCommutative,
SDNPAssociative] ++>; ++ ++// urecip - This operation is a helper for integer division, it returns the ++// result of 1 / a as a fractional unsigned integer. ++// out = (2^32 / a) + e ++// e is rounding error ++def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>; ++ ++def fpow : SDNode<"ISD::FPOW", SDTFPBinOp>; +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUInstructions.td llvm-r600/lib/Target/R600/AMDGPUInstructions.td +--- llvm-3.2.src/lib/Target/R600/AMDGPUInstructions.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUInstructions.td 2013-01-25 19:43:57.430049721 +0100 +@@ -0,0 +1,190 @@ ++//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This file contains instruction defs that are common to all hw codegen ++// targets. 
++// ++//===----------------------------------------------------------------------===// ++ ++class AMDGPUInst pattern> : Instruction { ++ field bits<16> AMDILOp = 0; ++ field bits<3> Gen = 0; ++ ++ let Namespace = "AMDGPU"; ++ let OutOperandList = outs; ++ let InOperandList = ins; ++ let AsmString = asm; ++ let Pattern = pattern; ++ let Itinerary = NullALU; ++ let TSFlags{42-40} = Gen; ++ let TSFlags{63-48} = AMDILOp; ++} ++ ++class AMDGPUShaderInst pattern> ++ : AMDGPUInst { ++ ++ field bits<32> Inst = 0xffffffff; ++ ++} ++ ++def InstFlag : OperandWithDefaultOps ; ++ ++def COND_EQ : PatLeaf < ++ (cond), ++ [{switch(N->get()){{default: return false; ++ case ISD::SETOEQ: case ISD::SETUEQ: ++ case ISD::SETEQ: return true;}}}] ++>; ++ ++def COND_NE : PatLeaf < ++ (cond), ++ [{switch(N->get()){{default: return false; ++ case ISD::SETONE: case ISD::SETUNE: ++ case ISD::SETNE: return true;}}}] ++>; ++def COND_GT : PatLeaf < ++ (cond), ++ [{switch(N->get()){{default: return false; ++ case ISD::SETOGT: case ISD::SETUGT: ++ case ISD::SETGT: return true;}}}] ++>; ++ ++def COND_GE : PatLeaf < ++ (cond), ++ [{switch(N->get()){{default: return false; ++ case ISD::SETOGE: case ISD::SETUGE: ++ case ISD::SETGE: return true;}}}] ++>; ++ ++def COND_LT : PatLeaf < ++ (cond), ++ [{switch(N->get()){{default: return false; ++ case ISD::SETOLT: case ISD::SETULT: ++ case ISD::SETLT: return true;}}}] ++>; ++ ++def COND_LE : PatLeaf < ++ (cond), ++ [{switch(N->get()){{default: return false; ++ case ISD::SETOLE: case ISD::SETULE: ++ case ISD::SETLE: return true;}}}] ++>; ++ ++//===----------------------------------------------------------------------===// ++// Load/Store Pattern Fragments ++//===----------------------------------------------------------------------===// ++ ++def zextloadi8_global : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{ ++ return isGlobalLoad(dyn_cast(N)); ++}]>; ++ ++class Constants { ++int TWO_PI = 0x40c90fdb; ++int PI = 0x40490fdb; ++int TWO_PI_INV = 
0x3e22f983; ++} ++def CONST : Constants; ++ ++def FP_ZERO : PatLeaf < ++ (fpimm), ++ [{return N->getValueAPF().isZero();}] ++>; ++ ++def FP_ONE : PatLeaf < ++ (fpimm), ++ [{return N->isExactlyValue(1.0);}] ++>; ++ ++let isCodeGenOnly = 1, isPseudo = 1, usesCustomInserter = 1 in { ++ ++class CLAMP : AMDGPUShaderInst < ++ (outs rc:$dst), ++ (ins rc:$src0), ++ "CLAMP $dst, $src0", ++ [(set rc:$dst, (int_AMDIL_clamp rc:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] ++>; ++ ++class FABS : AMDGPUShaderInst < ++ (outs rc:$dst), ++ (ins rc:$src0), ++ "FABS $dst, $src0", ++ [(set rc:$dst, (fabs rc:$src0))] ++>; ++ ++class FNEG : AMDGPUShaderInst < ++ (outs rc:$dst), ++ (ins rc:$src0), ++ "FNEG $dst, $src0", ++ [(set rc:$dst, (fneg rc:$src0))] ++>; ++ ++def SHADER_TYPE : AMDGPUShaderInst < ++ (outs), ++ (ins i32imm:$type), ++ "SHADER_TYPE $type", ++ [(int_AMDGPU_shader_type imm:$type)] ++>; ++ ++} // End isCodeGenOnly = 1, isPseudo = 1, usesCustomInserter = 1 ++ ++/* Generic helper patterns for intrinsics */ ++/* -------------------------------------- */ ++ ++class POW_Common : Pat < ++ (fpow rc:$src0, rc:$src1), ++ (exp_ieee (mul rc:$src1, (log_ieee rc:$src0))) ++>; ++ ++/* Other helper patterns */ ++/* --------------------- */ ++ ++/* Extract element pattern */ ++class Extract_Element : Pat< ++ (sub_type (vector_extract (vec_type vec_class:$src), sub_idx)), ++ (EXTRACT_SUBREG vec_class:$src, sub_reg) ++>; ++ ++/* Insert element pattern */ ++class Insert_Element : Pat < ++ ++ (vec_type (vector_insert (vec_type vec_class:$vec), ++ (elem_type elem_class:$elem), sub_idx)), ++ (INSERT_SUBREG vec_class:$vec, elem_class:$elem, sub_reg) ++>; ++ ++// Vector Build pattern ++class Vector_Build : Pat < ++ (vecType (build_vector (elemType elemClass:$x), (elemType elemClass:$y), ++ (elemType elemClass:$z), (elemType elemClass:$w))), ++ (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG ++ (vecType (IMPLICIT_DEF)), elemClass:$x, sel_x), elemClass:$y, sel_y), ++ elemClass:$z, sel_z),
elemClass:$w, sel_w) ++>; ++ ++// bitconvert pattern ++class BitConvert : Pat < ++ (dt (bitconvert (st rc:$src0))), ++ (dt rc:$src0) ++>; ++ ++class DwordAddrPat : Pat < ++ (vt (AMDGPUdwordaddr (vt rc:$addr))), ++ (vt rc:$addr) ++>; ++ ++include "R600Instructions.td" ++ ++include "SIInstrInfo.td" ++ +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUIntrinsics.td llvm-r600/lib/Target/R600/AMDGPUIntrinsics.td +--- llvm-3.2.src/lib/Target/R600/AMDGPUIntrinsics.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUIntrinsics.td 2013-01-25 19:43:57.430049721 +0100 +@@ -0,0 +1,62 @@ ++//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*-----------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This file defines intrinsics that are used by all hw codegen targets. 
++// ++//===----------------------------------------------------------------------===// ++ ++let TargetPrefix = "AMDGPU", isTarget = 1 in { ++ ++ def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; ++ def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>; ++ def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>; ++ def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; ++ def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; ++ ++ def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>; ++ def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; ++ def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; ++ def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; ++ def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; ++ def int_AMDGPU_kilp : Intrinsic<[], [], []>; ++ def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; ++ def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; ++ def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; ++ def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; ++ def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; ++ def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; ++ def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; ++ def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; ++ def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; ++ def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, 
llvm_float_ty], [IntrNoMem]>; ++ def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; ++ def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; ++ def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; ++ def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; ++ def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; ++ def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; ++ def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; ++ def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; ++ def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; ++ def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; ++ def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; ++ def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; ++ def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; ++ def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; ++ def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; ++ ++ def int_AMDGPU_shader_type : Intrinsic<[], [llvm_i32_ty], []>; ++} ++ ++let TargetPrefix = "TGSI", isTarget = 1 in { ++ ++ def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>; ++} ++ ++include "SIIntrinsics.td" +diff -Nur 
-x .git llvm-3.2.src/lib/Target/R600/AMDGPUISelLowering.cpp llvm-r600/lib/Target/R600/AMDGPUISelLowering.cpp +--- llvm-3.2.src/lib/Target/R600/AMDGPUISelLowering.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUISelLowering.cpp 2013-01-25 19:43:57.426716388 +0100 +@@ -0,0 +1,418 @@ ++//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief This is the parent TargetLowering class for hardware code gen ++/// targets. ++// ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPUISelLowering.h" ++#include "AMDILIntrinsicInfo.h" ++#include "llvm/CodeGen/MachineFunction.h" ++#include "llvm/CodeGen/MachineRegisterInfo.h" ++#include "llvm/CodeGen/SelectionDAG.h" ++#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" ++ ++using namespace llvm; ++ ++AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : ++ TargetLowering(TM, new TargetLoweringObjectFileELF()) { ++ ++ // Initialize target lowering borrowed from AMDIL ++ InitAMDILLowering(); ++ ++ // We need to custom lower some of the intrinsics ++ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); ++ ++ // Library functions. These default to Expand, but we have instructions ++ // for them. 
++ setOperationAction(ISD::FCEIL, MVT::f32, Legal); ++ setOperationAction(ISD::FEXP2, MVT::f32, Legal); ++ setOperationAction(ISD::FPOW, MVT::f32, Legal); ++ setOperationAction(ISD::FLOG2, MVT::f32, Legal); ++ setOperationAction(ISD::FABS, MVT::f32, Legal); ++ setOperationAction(ISD::FFLOOR, MVT::f32, Legal); ++ setOperationAction(ISD::FRINT, MVT::f32, Legal); ++ ++ // Lower floating point store/load to integer store/load to reduce the number ++ // of patterns in tablegen. ++ setOperationAction(ISD::STORE, MVT::f32, Promote); ++ AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); ++ ++ setOperationAction(ISD::STORE, MVT::v4f32, Promote); ++ AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); ++ ++ setOperationAction(ISD::LOAD, MVT::f32, Promote); ++ AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); ++ ++ setOperationAction(ISD::LOAD, MVT::v4f32, Promote); ++ AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); ++ ++ setOperationAction(ISD::UDIV, MVT::i32, Expand); ++ setOperationAction(ISD::UDIVREM, MVT::i32, Custom); ++ setOperationAction(ISD::UREM, MVT::i32, Expand); ++} ++ ++//===---------------------------------------------------------------------===// ++// TargetLowering Callbacks ++//===---------------------------------------------------------------------===// ++ ++SDValue AMDGPUTargetLowering::LowerFormalArguments( ++ SDValue Chain, ++ CallingConv::ID CallConv, ++ bool isVarArg, ++ const SmallVectorImpl &Ins, ++ DebugLoc DL, SelectionDAG &DAG, ++ SmallVectorImpl &InVals) const { ++ for (unsigned i = 0, e = Ins.size(); i < e; ++i) { ++ InVals.push_back(SDValue()); ++ } ++ return Chain; ++} ++ ++SDValue AMDGPUTargetLowering::LowerReturn( ++ SDValue Chain, ++ CallingConv::ID CallConv, ++ bool isVarArg, ++ const SmallVectorImpl &Outs, ++ const SmallVectorImpl &OutVals, ++ DebugLoc DL, SelectionDAG &DAG) const { ++ return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain); ++} ++ 
++//===---------------------------------------------------------------------===// ++// Target specific lowering ++//===---------------------------------------------------------------------===// ++ ++SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) ++ const { ++ switch (Op.getOpcode()) { ++ default: ++ Op.getNode()->dump(); ++ assert(0 && "Custom lowering code for this" ++ "instruction is not implemented yet!"); ++ break; ++ // AMDIL DAG lowering ++ case ISD::SDIV: return LowerSDIV(Op, DAG); ++ case ISD::SREM: return LowerSREM(Op, DAG); ++ case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); ++ case ISD::BRCOND: return LowerBRCOND(Op, DAG); ++ // AMDGPU DAG lowering ++ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); ++ case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); ++ } ++ return Op; ++} ++ ++SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, ++ SelectionDAG &DAG) const { ++ unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); ++ DebugLoc DL = Op.getDebugLoc(); ++ EVT VT = Op.getValueType(); ++ ++ switch (IntrinsicID) { ++ default: return Op; ++ case AMDGPUIntrinsic::AMDIL_abs: ++ return LowerIntrinsicIABS(Op, DAG); ++ case AMDGPUIntrinsic::AMDIL_exp: ++ return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); ++ case AMDGPUIntrinsic::AMDGPU_lrp: ++ return LowerIntrinsicLRP(Op, DAG); ++ case AMDGPUIntrinsic::AMDIL_fraction: ++ return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); ++ case AMDGPUIntrinsic::AMDIL_mad: ++ return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1), ++ Op.getOperand(2), Op.getOperand(3)); ++ case AMDGPUIntrinsic::AMDIL_max: ++ return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1), ++ Op.getOperand(2)); ++ case AMDGPUIntrinsic::AMDGPU_imax: ++ return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), ++ Op.getOperand(2)); ++ case AMDGPUIntrinsic::AMDGPU_umax: ++ return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1), ++ 
Op.getOperand(2)); ++ case AMDGPUIntrinsic::AMDIL_min: ++ return DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1), ++ Op.getOperand(2)); ++ case AMDGPUIntrinsic::AMDGPU_imin: ++ return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1), ++ Op.getOperand(2)); ++ case AMDGPUIntrinsic::AMDGPU_umin: ++ return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1), ++ Op.getOperand(2)); ++ case AMDGPUIntrinsic::AMDIL_round_nearest: ++ return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); ++ } ++} ++ ++///IABS(a) = SMAX(sub(0, a), a) ++SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, ++ SelectionDAG &DAG) const { ++ ++ DebugLoc DL = Op.getDebugLoc(); ++ EVT VT = Op.getValueType(); ++ SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), ++ Op.getOperand(1)); ++ ++ return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1)); ++} ++ ++/// Linear Interpolation ++/// LRP(a, b, c) = muladd(a, b, (1 - a) * c) ++SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, ++ SelectionDAG &DAG) const { ++ DebugLoc DL = Op.getDebugLoc(); ++ EVT VT = Op.getValueType(); ++ SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, ++ DAG.getConstantFP(1.0f, MVT::f32), ++ Op.getOperand(1)); ++ SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, ++ Op.getOperand(3)); ++ return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1), ++ Op.getOperand(2), ++ OneSubAC); ++} ++ ++/// \brief Generate Min/Max node ++SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op, ++ SelectionDAG &DAG) const { ++ DebugLoc DL = Op.getDebugLoc(); ++ EVT VT = Op.getValueType(); ++ ++ SDValue LHS = Op.getOperand(0); ++ SDValue RHS = Op.getOperand(1); ++ SDValue True = Op.getOperand(2); ++ SDValue False = Op.getOperand(3); ++ SDValue CC = Op.getOperand(4); ++ ++ if (VT != MVT::f32 || ++ !((LHS == True && RHS == False) || (LHS == False && RHS == True))) { ++ return SDValue(); ++ } ++ ++ ISD::CondCode CCOpcode = cast(CC)->get(); ++ switch (CCOpcode) { ++ case ISD::SETOEQ: 
++ case ISD::SETONE: ++ case ISD::SETUNE: ++ case ISD::SETNE: ++ case ISD::SETUEQ: ++ case ISD::SETEQ: ++ case ISD::SETFALSE: ++ case ISD::SETFALSE2: ++ case ISD::SETTRUE: ++ case ISD::SETTRUE2: ++ case ISD::SETUO: ++ case ISD::SETO: ++ assert(0 && "Operation should already be optimised !"); ++ case ISD::SETULE: ++ case ISD::SETULT: ++ case ISD::SETOLE: ++ case ISD::SETOLT: ++ case ISD::SETLE: ++ case ISD::SETLT: { ++ if (LHS == True) ++ return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS); ++ else ++ return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS); ++ } ++ case ISD::SETGT: ++ case ISD::SETGE: ++ case ISD::SETUGE: ++ case ISD::SETOGE: ++ case ISD::SETUGT: ++ case ISD::SETOGT: { ++ if (LHS == True) ++ return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS); ++ else ++ return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS); ++ } ++ case ISD::SETCC_INVALID: ++ assert(0 && "Invalid setcc condcode !"); ++ } ++ return Op; ++} ++ ++ ++ ++SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, ++ SelectionDAG &DAG) const { ++ DebugLoc DL = Op.getDebugLoc(); ++ EVT VT = Op.getValueType(); ++ ++ SDValue Num = Op.getOperand(0); ++ SDValue Den = Op.getOperand(1); ++ ++ SmallVector Results; ++ ++ // RCP = URECIP(Den) = 2^32 / Den + e ++ // e is rounding error. ++ SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); ++ ++ // RCP_LO = umulo(RCP, Den) ++ SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den); ++ ++ // RCP_HI = mulhu(RCP, Den) ++ SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); ++ ++ // NEG_RCP_LO = -RCP_LO ++ SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), ++ RCP_LO); ++ ++ // ABS_RCP_LO = (RCP_HI == 0 ?
NEG_RCP_LO : RCP_LO) ++ SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), ++ NEG_RCP_LO, RCP_LO, ++ ISD::SETEQ); ++ // Calculate the rounding error from the URECIP instruction ++ // E = mulhu(ABS_RCP_LO, RCP) ++ SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); ++ ++ // RCP_A_E = RCP + E ++ SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); ++ ++ // RCP_S_E = RCP - E ++ SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); ++ ++ // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E) ++ SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), ++ RCP_A_E, RCP_S_E, ++ ISD::SETEQ); ++ // Quotient = mulhu(Tmp0, Num) ++ SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); ++ ++ // Num_S_Remainder = Quotient * Den ++ SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den); ++ ++ // Remainder = Num - Num_S_Remainder ++ SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); ++ ++ // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) ++ SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, ++ DAG.getConstant(-1, VT), ++ DAG.getConstant(0, VT), ++ ISD::SETGE); ++ // Remainder_GE_Zero = (Remainder >= 0 ? -1 : 0) ++ SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Remainder, ++ DAG.getConstant(0, VT), ++ DAG.getConstant(-1, VT), ++ DAG.getConstant(0, VT), ++ ISD::SETGE); ++ // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero ++ SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, ++ Remainder_GE_Zero); ++ ++ // Calculate Division result: ++ ++ // Quotient_A_One = Quotient + 1 ++ SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, ++ DAG.getConstant(1, VT)); ++ ++ // Quotient_S_One = Quotient - 1 ++ SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, ++ DAG.getConstant(1, VT)); ++ ++ // Div = (Tmp1 == 0 ?
Quotient : Quotient_A_One) ++ SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), ++ Quotient, Quotient_A_One, ISD::SETEQ); ++ ++ // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) ++ Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), ++ Quotient_S_One, Div, ISD::SETEQ); ++ ++ // Calculate Rem result: ++ ++ // Remainder_S_Den = Remainder - Den ++ SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); ++ ++ // Remainder_A_Den = Remainder + Den ++ SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); ++ ++ // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) ++ SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), ++ Remainder, Remainder_S_Den, ISD::SETEQ); ++ ++ // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem) ++ Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), ++ Remainder_A_Den, Rem, ISD::SETEQ); ++ SDValue Ops[2]; ++ Ops[0] = Div; ++ Ops[1] = Rem; ++ return DAG.getMergeValues(Ops, 2, DL); ++} ++ ++//===----------------------------------------------------------------------===// ++// Helper functions ++//===----------------------------------------------------------------------===// ++ ++bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { ++ if (ConstantFPSDNode * CFP = dyn_cast(Op)) { ++ return CFP->isExactlyValue(1.0); ++ } ++ if (ConstantSDNode *C = dyn_cast(Op)) { ++ return C->isAllOnesValue(); ++ } ++ return false; ++} ++ ++bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { ++ if (ConstantFPSDNode * CFP = dyn_cast(Op)) { ++ return CFP->getValueAPF().isZero(); ++ } ++ if (ConstantSDNode *C = dyn_cast(Op)) { ++ return C->isNullValue(); ++ } ++ return false; ++} ++ ++SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, ++ const TargetRegisterClass *RC, ++ unsigned Reg, EVT VT) const { ++ MachineFunction &MF = DAG.getMachineFunction(); ++ MachineRegisterInfo &MRI = MF.getRegInfo(); ++ unsigned VirtualRegister; ++ if 
(!MRI.isLiveIn(Reg)) { ++ VirtualRegister = MRI.createVirtualRegister(RC); ++ MRI.addLiveIn(Reg, VirtualRegister); ++ } else { ++ VirtualRegister = MRI.getLiveInVirtReg(Reg); ++ } ++ return DAG.getRegister(VirtualRegister, VT); ++} ++ ++#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; ++ ++const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { ++ switch (Opcode) { ++ default: return 0; ++ // AMDIL DAG nodes ++ NODE_NAME_CASE(MAD); ++ NODE_NAME_CASE(CALL); ++ NODE_NAME_CASE(UMUL); ++ NODE_NAME_CASE(DIV_INF); ++ NODE_NAME_CASE(RET_FLAG); ++ NODE_NAME_CASE(BRANCH_COND); ++ ++ // AMDGPU DAG nodes ++ NODE_NAME_CASE(DWORDADDR) ++ NODE_NAME_CASE(FRACT) ++ NODE_NAME_CASE(FMAX) ++ NODE_NAME_CASE(SMAX) ++ NODE_NAME_CASE(UMAX) ++ NODE_NAME_CASE(FMIN) ++ NODE_NAME_CASE(SMIN) ++ NODE_NAME_CASE(UMIN) ++ NODE_NAME_CASE(URECIP) ++ NODE_NAME_CASE(INTERP) ++ NODE_NAME_CASE(INTERP_P0) ++ NODE_NAME_CASE(EXPORT) ++ NODE_NAME_CASE(CONST_ADDRESS) ++ } ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUISelLowering.h llvm-r600/lib/Target/R600/AMDGPUISelLowering.h +--- llvm-3.2.src/lib/Target/R600/AMDGPUISelLowering.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUISelLowering.h 2013-01-25 19:43:57.426716388 +0100 +@@ -0,0 +1,145 @@ ++//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Interface definition of the TargetLowering class that is common ++/// to all AMD GPUs. 
++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef AMDGPUISELLOWERING_H ++#define AMDGPUISELLOWERING_H ++ ++#include "llvm/Target/TargetLowering.h" ++ ++namespace llvm { ++ ++class MachineRegisterInfo; ++ ++class AMDGPUTargetLowering : public TargetLowering { ++private: ++ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; ++ ++protected: ++ ++ /// \brief Helper function that adds Reg to the LiveIn list of the DAG's ++ /// MachineFunction. ++ /// ++ /// \returns a RegisterSDNode representing Reg. ++ SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, ++ unsigned Reg, EVT VT) const; ++ ++ bool isHWTrueValue(SDValue Op) const; ++ bool isHWFalseValue(SDValue Op) const; ++ ++public: ++ AMDGPUTargetLowering(TargetMachine &TM); ++ ++ virtual SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, ++ bool isVarArg, ++ const SmallVectorImpl &Ins, ++ DebugLoc DL, SelectionDAG &DAG, ++ SmallVectorImpl &InVals) const; ++ ++ virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, ++ bool isVarArg, ++ const SmallVectorImpl &Outs, ++ const SmallVectorImpl &OutVals, ++ DebugLoc DL, SelectionDAG &DAG) const; ++ ++ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const; ++ virtual const char* getTargetNodeName(unsigned Opcode) const; ++ ++// Functions defined in AMDILISelLowering.cpp ++public: ++ ++ /// \brief Determine which of the bits specified in \p Mask are known to be ++ /// either zero or one and return them in the \p KnownZero and \p KnownOne ++ /// bitsets. 
++ virtual void computeMaskedBitsForTargetNode(const SDValue Op, ++ APInt &KnownZero, ++ APInt &KnownOne, ++ const SelectionDAG &DAG, ++ unsigned Depth = 0) const; ++ ++ virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, ++ const CallInst &I, unsigned Intrinsic) const; ++ ++ /// We want to mark f32/f64 floating point values as legal. ++ bool isFPImmLegal(const APFloat &Imm, EVT VT) const; ++ ++ /// We don't want to shrink f64/f32 constants. ++ bool ShouldShrinkFPConstant(EVT VT) const; ++ ++private: ++ void InitAMDILLowering(); ++ SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerSREM8(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerSREM16(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; ++ EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const; ++ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; ++}; ++ ++namespace AMDGPUISD { ++ ++enum { ++ // AMDIL ISD Opcodes ++ FIRST_NUMBER = ISD::BUILTIN_OP_END, ++ MAD, // 32bit Fused Multiply Add instruction ++ CALL, // Function call based on a single integer ++ UMUL, // 32bit unsigned multiplication ++ DIV_INF, // Divide with infinity returned on zero divisor ++ RET_FLAG, ++ BRANCH_COND, ++ // End AMDIL ISD Opcodes ++ BITALIGN, ++ DWORDADDR, ++ FRACT, ++ FMAX, ++ SMAX, ++ UMAX, ++ FMIN, ++ SMIN, ++ UMIN, ++ URECIP, ++ INTERP, ++ INTERP_P0, ++ EXPORT, ++ CONST_ADDRESS, ++ LAST_AMDGPU_ISD_NUMBER ++}; ++ ++ ++} // End namespace AMDGPUISD ++ ++namespace SIISD { ++ ++enum { ++ SI_FIRST = 
AMDGPUISD::LAST_AMDGPU_ISD_NUMBER, ++ VCC_AND, ++ VCC_BITCAST ++}; ++ ++} // End namespace SIISD ++ ++} // End namespace llvm ++ ++#endif // AMDGPUISELLOWERING_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUMCInstLower.cpp llvm-r600/lib/Target/R600/AMDGPUMCInstLower.cpp +--- llvm-3.2.src/lib/Target/R600/AMDGPUMCInstLower.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUMCInstLower.cpp 2013-01-25 19:43:57.430049721 +0100 +@@ -0,0 +1,83 @@ ++//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst. ++// ++//===----------------------------------------------------------------------===// ++// ++ ++#include "AMDGPUMCInstLower.h" ++#include "AMDGPUAsmPrinter.h" ++#include "R600InstrInfo.h" ++#include "llvm/CodeGen/MachineBasicBlock.h" ++#include "llvm/CodeGen/MachineInstr.h" ++#include "llvm/Constants.h" ++#include "llvm/MC/MCInst.h" ++#include "llvm/MC/MCStreamer.h" ++#include "llvm/MC/MCExpr.h" ++#include "llvm/Support/ErrorHandling.h" ++ ++using namespace llvm; ++ ++AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx): ++ Ctx(ctx) ++{ } ++ ++void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { ++ OutMI.setOpcode(MI->getOpcode()); ++ ++ for (unsigned i = 0, e = MI->getNumExplicitOperands(); i != e; ++i) { ++ const MachineOperand &MO = MI->getOperand(i); ++ ++ MCOperand MCOp; ++ switch (MO.getType()) { ++ default: ++ llvm_unreachable("unknown operand type"); ++ case MachineOperand::MO_FPImmediate: { ++ const APFloat &FloatValue = MO.getFPImm()->getValueAPF(); ++ assert(&FloatValue.getSemantics() == &APFloat::IEEEsingle && ++ "Only floating point 
immediates are supported at the moment."); ++ MCOp = MCOperand::CreateFPImm(FloatValue.convertToFloat()); ++ break; ++ } ++ case MachineOperand::MO_Immediate: ++ MCOp = MCOperand::CreateImm(MO.getImm()); ++ break; ++ case MachineOperand::MO_Register: ++ MCOp = MCOperand::CreateReg(MO.getReg()); ++ break; ++ case MachineOperand::MO_MachineBasicBlock: ++ MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( ++ MO.getMBB()->getSymbol(), Ctx)); ++ } ++ OutMI.addOperand(MCOp); ++ } ++} ++ ++void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { ++ AMDGPUMCInstLower MCInstLowering(OutContext); ++ ++ if (MI->isBundle()) { ++ const MachineBasicBlock *MBB = MI->getParent(); ++ MachineBasicBlock::const_instr_iterator I = MI; ++ ++I; ++ while (I != MBB->end() && I->isInsideBundle()) { ++ MCInst MCBundleInst; ++ const MachineInstr *BundledInst = I; ++ MCInstLowering.lower(BundledInst, MCBundleInst); ++ OutStreamer.EmitInstruction(MCBundleInst); ++ ++I; ++ } ++ } else { ++ MCInst TmpInst; ++ MCInstLowering.lower(MI, TmpInst); ++ OutStreamer.EmitInstruction(TmpInst); ++ } ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUMCInstLower.h llvm-r600/lib/Target/R600/AMDGPUMCInstLower.h +--- llvm-3.2.src/lib/Target/R600/AMDGPUMCInstLower.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUMCInstLower.h 2013-01-25 19:43:57.430049721 +0100 +@@ -0,0 +1,34 @@ ++//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++/// \file ++//===----------------------------------------------------------------------===// ++ ++#ifndef AMDGPU_MCINSTLOWER_H ++#define AMDGPU_MCINSTLOWER_H ++ ++namespace llvm { ++ ++class MCInst; ++class MCContext; ++class MachineInstr; ++ ++class AMDGPUMCInstLower { ++ ++ MCContext &Ctx; ++ ++public: ++ AMDGPUMCInstLower(MCContext &ctx); ++ ++ /// \brief Lower a MachineInstr to an MCInst ++ void lower(const MachineInstr *MI, MCInst &OutMI) const; ++ ++}; ++ ++} // End namespace llvm ++ ++#endif //AMDGPU_MCINSTLOWER_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.cpp llvm-r600/lib/Target/R600/AMDGPURegisterInfo.cpp +--- llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPURegisterInfo.cpp 2013-01-25 19:43:57.430049721 +0100 +@@ -0,0 +1,51 @@ ++//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Parent TargetRegisterInfo class common to all hw codegen targets. ++// ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPURegisterInfo.h" ++#include "AMDGPUTargetMachine.h" ++ ++using namespace llvm; ++ ++AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm, ++ const TargetInstrInfo &tii) ++: AMDGPUGenRegisterInfo(0), ++ TM(tm), ++ TII(tii) ++ { } ++ ++//===----------------------------------------------------------------------===// ++// Function handling callbacks - Functions are a seldom used feature of GPUS, so ++// they are not supported at this time. 
++//===----------------------------------------------------------------------===// ++ ++const uint16_t AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; ++ ++const uint16_t* AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) ++ const { ++ return &CalleeSavedReg; ++} ++ ++void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, ++ int SPAdj, ++ RegScavenger *RS) const { ++ assert(!"Subroutines not supported yet"); ++} ++ ++unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { ++ assert(!"Subroutines not supported yet"); ++ return 0; ++} ++ ++#define GET_REGINFO_TARGET_DESC ++#include "AMDGPUGenRegisterInfo.inc" +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.h llvm-r600/lib/Target/R600/AMDGPURegisterInfo.h +--- llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPURegisterInfo.h 2013-01-25 19:43:57.430049721 +0100 +@@ -0,0 +1,63 @@ ++//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief TargetRegisterInfo interface that is implemented by all hw codegen ++/// targets. 
++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef AMDGPUREGISTERINFO_H ++#define AMDGPUREGISTERINFO_H ++ ++#include "llvm/ADT/BitVector.h" ++#include "llvm/Target/TargetRegisterInfo.h" ++ ++#define GET_REGINFO_HEADER ++#define GET_REGINFO_ENUM ++#include "AMDGPUGenRegisterInfo.inc" ++ ++namespace llvm { ++ ++class AMDGPUTargetMachine; ++class TargetInstrInfo; ++ ++struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { ++ TargetMachine &TM; ++ const TargetInstrInfo &TII; ++ static const uint16_t CalleeSavedReg; ++ ++ AMDGPURegisterInfo(TargetMachine &tm, const TargetInstrInfo &tii); ++ ++ virtual BitVector getReservedRegs(const MachineFunction &MF) const { ++ assert(!"Unimplemented"); return BitVector(); ++ } ++ ++ /// \param RC is an AMDIL reg class. ++ /// ++ /// \returns The ISA reg class that is equivalent to \p RC. ++ virtual const TargetRegisterClass * getISARegClass( ++ const TargetRegisterClass * RC) const { ++ assert(!"Unimplemented"); return NULL; ++ } ++ ++ virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { ++ assert(!"Unimplemented"); return NULL; ++ } ++ ++ const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const; ++ void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, ++ RegScavenger *RS) const; ++ unsigned getFrameRegister(const MachineFunction &MF) const; ++ ++}; ++ ++} // End namespace llvm ++ ++#endif // AMDGPUREGISTERINFO_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.td llvm-r600/lib/Target/R600/AMDGPURegisterInfo.td +--- llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPURegisterInfo.td 2013-01-25 19:43:57.433383055 +0100 +@@ -0,0 +1,22 @@ ++//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source
++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// Tablegen register definitions common to all hw codegen targets. ++// ++//===----------------------------------------------------------------------===// ++ ++let Namespace = "AMDGPU" in { ++ def sel_x : SubRegIndex; ++ def sel_y : SubRegIndex; ++ def sel_z : SubRegIndex; ++ def sel_w : SubRegIndex; ++} ++ ++include "R600RegisterInfo.td" ++include "SIRegisterInfo.td" +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUStructurizeCFG.cpp llvm-r600/lib/Target/R600/AMDGPUStructurizeCFG.cpp +--- llvm-3.2.src/lib/Target/R600/AMDGPUStructurizeCFG.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUStructurizeCFG.cpp 2013-01-25 19:43:57.433383055 +0100 +@@ -0,0 +1,714 @@ ++//===-- AMDGPUStructurizeCFG.cpp - ------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// The pass implemented in this file transforms the programs control flow ++/// graph into a form that's suitable for code generation on hardware that ++/// implements control flow by execution masking. This currently includes all ++/// AMD GPUs but may as well be useful for other types of hardware. ++// ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPU.h" ++#include "llvm/Module.h" ++#include "llvm/ADT/SCCIterator.h" ++#include "llvm/Analysis/RegionIterator.h" ++#include "llvm/Analysis/RegionInfo.h" ++#include "llvm/Analysis/RegionPass.h" ++#include "llvm/Transforms/Utils/SSAUpdater.h" ++ ++using namespace llvm; ++ ++namespace { ++ ++// Definition of the complex types used in this pass. 
++ ++typedef std::pair BBValuePair; ++typedef ArrayRef BBVecRef; ++ ++typedef SmallVector RNVector; ++typedef SmallVector BBVector; ++typedef SmallVector BBValueVector; ++ ++typedef DenseMap PhiMap; ++typedef DenseMap BBPhiMap; ++typedef DenseMap BBPredicates; ++typedef DenseMap PredMap; ++typedef DenseMap VisitedMap; ++ ++// The name for newly created blocks. ++ ++static const char *FlowBlockName = "Flow"; ++ ++/// @brief Transforms the control flow graph on one single entry/exit region ++/// at a time. ++/// ++/// After the transform all "If"/"Then"/"Else" style control flow looks like ++/// this: ++/// ++/// \verbatim ++/// 1 ++/// || ++/// | | ++/// 2 | ++/// | / ++/// |/ ++/// 3 ++/// || Where: ++/// | | 1 = "If" block, calculates the condition ++/// 4 | 2 = "Then" subregion, runs if the condition is true ++/// | / 3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow ++/// |/ 4 = "Else" optional subregion, runs if the condition is false ++/// 5 5 = "End" block, also rejoins the control flow ++/// \endverbatim ++/// ++/// Control flow is expressed as a branch where the true exit goes into the ++/// "Then"/"Else" region, while the false exit skips the region. ++/// The condition for the optional "Else" region is expressed as a PHI node. ++/// The incoming values of the PHI node are true for the "If" edge and false ++/// for the "Then" edge. ++/// ++/// Additionally to that even complicated loops look like this: ++/// ++/// \verbatim ++/// 1 ++/// || ++/// | | ++/// 2 ^ Where: ++/// | / 1 = "Entry" block ++/// |/ 2 = "Loop" optional subregion, with all exits at "Flow" block ++/// 3 3 = "Flow" block, with back edge to entry block ++/// | ++/// \endverbatim ++/// ++/// The back edge of the "Flow" block is always on the false side of the branch ++/// while the true side continues the general flow.
So the loop condition ++/// consist of a network of PHI nodes where the true incoming values expresses ++/// breaks and the false values expresses continue states. ++class AMDGPUStructurizeCFG : public RegionPass { ++ ++ static char ID; ++ ++ Type *Boolean; ++ ConstantInt *BoolTrue; ++ ConstantInt *BoolFalse; ++ UndefValue *BoolUndef; ++ ++ Function *Func; ++ Region *ParentRegion; ++ ++ DominatorTree *DT; ++ ++ RNVector Order; ++ VisitedMap Visited; ++ PredMap Predicates; ++ BBPhiMap DeletedPhis; ++ BBVector FlowsInserted; ++ ++ BasicBlock *LoopStart; ++ BasicBlock *LoopEnd; ++ BBPredicates LoopPred; ++ ++ void orderNodes(); ++ ++ void buildPredicate(BranchInst *Term, unsigned Idx, ++ BBPredicates &Pred, bool Invert); ++ ++ void analyzeBlock(BasicBlock *BB); ++ ++ void analyzeLoop(BasicBlock *BB, unsigned &LoopIdx); ++ ++ void collectInfos(); ++ ++ bool dominatesPredicates(BasicBlock *A, BasicBlock *B); ++ ++ void killTerminator(BasicBlock *BB); ++ ++ RegionNode *skipChained(RegionNode *Node); ++ ++ void delPhiValues(BasicBlock *From, BasicBlock *To); ++ ++ void addPhiValues(BasicBlock *From, BasicBlock *To); ++ ++ BasicBlock *getNextFlow(BasicBlock *Prev); ++ ++ bool isPredictableTrue(BasicBlock *Prev, BasicBlock *Node); ++ ++ BasicBlock *wireFlowBlock(BasicBlock *Prev, RegionNode *Node); ++ ++ void createFlow(); ++ ++ void insertConditions(); ++ ++ void rebuildSSA(); ++ ++public: ++ AMDGPUStructurizeCFG(): ++ RegionPass(ID) { ++ ++ initializeRegionInfoPass(*PassRegistry::getPassRegistry()); ++ } ++ ++ virtual bool doInitialization(Region *R, RGPassManager &RGM); ++ ++ virtual bool runOnRegion(Region *R, RGPassManager &RGM); ++ ++ virtual const char *getPassName() const { ++ return "AMDGPU simplify control flow"; ++ } ++ ++ void getAnalysisUsage(AnalysisUsage &AU) const { ++ ++ AU.addRequired(); ++ AU.addPreserved(); ++ RegionPass::getAnalysisUsage(AU); ++ } ++ ++}; ++ ++} // end anonymous namespace ++ ++char AMDGPUStructurizeCFG::ID = 0; ++ ++/// \brief 
Initialize the types and constants used in the pass ++bool AMDGPUStructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) { ++ LLVMContext &Context = R->getEntry()->getContext(); ++ ++ Boolean = Type::getInt1Ty(Context); ++ BoolTrue = ConstantInt::getTrue(Context); ++ BoolFalse = ConstantInt::getFalse(Context); ++ BoolUndef = UndefValue::get(Boolean); ++ ++ return false; ++} ++ ++/// \brief Build up the general order of nodes ++void AMDGPUStructurizeCFG::orderNodes() { ++ scc_iterator I = scc_begin(ParentRegion), ++ E = scc_end(ParentRegion); ++ for (Order.clear(); I != E; ++I) { ++ std::vector &Nodes = *I; ++ Order.append(Nodes.begin(), Nodes.end()); ++ } ++} ++ ++/// \brief Build blocks and loop predicates ++void AMDGPUStructurizeCFG::buildPredicate(BranchInst *Term, unsigned Idx, ++ BBPredicates &Pred, bool Invert) { ++ Value *True = Invert ? BoolFalse : BoolTrue; ++ Value *False = Invert ? BoolTrue : BoolFalse; ++ ++ RegionInfo *RI = ParentRegion->getRegionInfo(); ++ BasicBlock *BB = Term->getParent(); ++ ++ // Handle the case where multiple regions start at the same block ++ Region *R = BB != ParentRegion->getEntry() ? 
++ RI->getRegionFor(BB) : ParentRegion; ++ ++ if (R == ParentRegion) { ++ // It's a top level block in our region ++ Value *Cond = True; ++ if (Term->isConditional()) { ++ BasicBlock *Other = Term->getSuccessor(!Idx); ++ ++ if (Visited.count(Other)) { ++ if (!Pred.count(Other)) ++ Pred[Other] = False; ++ ++ if (!Pred.count(BB)) ++ Pred[BB] = True; ++ return; ++ } ++ Cond = Term->getCondition(); ++ ++ if (Idx != Invert) ++ Cond = BinaryOperator::CreateNot(Cond, "", Term); ++ } ++ ++ Pred[BB] = Cond; ++ ++ } else if (ParentRegion->contains(R)) { ++ // It's a block in a sub region ++ while(R->getParent() != ParentRegion) ++ R = R->getParent(); ++ ++ Pred[R->getEntry()] = True; ++ ++ } else { ++ // It's a branch from outside into our parent region ++ Pred[BB] = True; ++ } ++} ++ ++/// \brief Analyze the successors of each block and build up predicates ++void AMDGPUStructurizeCFG::analyzeBlock(BasicBlock *BB) { ++ pred_iterator PI = pred_begin(BB), PE = pred_end(BB); ++ BBPredicates &Pred = Predicates[BB]; ++ ++ for (; PI != PE; ++PI) { ++ BranchInst *Term = cast((*PI)->getTerminator()); ++ ++ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { ++ BasicBlock *Succ = Term->getSuccessor(i); ++ if (Succ != BB) ++ continue; ++ buildPredicate(Term, i, Pred, false); ++ } ++ } ++} ++ ++/// \brief Analyze the conditions leading to loop to a previous block ++void AMDGPUStructurizeCFG::analyzeLoop(BasicBlock *BB, unsigned &LoopIdx) { ++ BranchInst *Term = cast(BB->getTerminator()); ++ ++ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { ++ BasicBlock *Succ = Term->getSuccessor(i); ++ ++ // Ignore it if it's not a back edge ++ if (!Visited.count(Succ)) ++ continue; ++ ++ buildPredicate(Term, i, LoopPred, true); ++ ++ LoopEnd = BB; ++ if (Visited[Succ] < LoopIdx) { ++ LoopIdx = Visited[Succ]; ++ LoopStart = Succ; ++ } ++ } ++} ++ ++/// \brief Collect various loop and predicate infos ++void AMDGPUStructurizeCFG::collectInfos() { ++ unsigned Number = 0, 
LoopIdx = ~0; ++ ++ // Reset predicate ++ Predicates.clear(); ++ ++ // and loop infos ++ LoopStart = LoopEnd = 0; ++ LoopPred.clear(); ++ ++ RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend(); ++ for (Visited.clear(); OI != OE; Visited[(*OI++)->getEntry()] = ++Number) { ++ ++ // Analyze all the conditions leading to a node ++ analyzeBlock((*OI)->getEntry()); ++ ++ if ((*OI)->isSubRegion()) ++ continue; ++ ++ // Find the first/last loop nodes and loop predicates ++ analyzeLoop((*OI)->getNodeAs(), LoopIdx); ++ } ++} ++ ++/// \brief Does A dominate all the predicates of B ? ++bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *A, BasicBlock *B) { ++ BBPredicates &Preds = Predicates[B]; ++ for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); ++ PI != PE; ++PI) { ++ ++ if (!DT->dominates(A, PI->first)) ++ return false; ++ } ++ return true; ++} ++ ++/// \brief Remove phi values from all successors and then remove the terminator. ++void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) { ++ TerminatorInst *Term = BB->getTerminator(); ++ if (!Term) ++ return; ++ ++ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); ++ SI != SE; ++SI) { ++ ++ delPhiValues(BB, *SI); ++ } ++ ++ Term->eraseFromParent(); ++} ++ ++/// First: Skip forward to the first region node that either isn't a subregion or not ++/// dominating its exit, remove all the skipped nodes from the node order.
++/// ++/// Second: Handle the first successor directly if the resulting node's successor ++/// predicates are still dominated by the original entry ++RegionNode *AMDGPUStructurizeCFG::skipChained(RegionNode *Node) { ++ BasicBlock *Entry = Node->getEntry(); ++ ++ // Skip forward as long as it is just a linear flow ++ while (true) { ++ BasicBlock *Entry = Node->getEntry(); ++ BasicBlock *Exit; ++ ++ if (Node->isSubRegion()) { ++ Exit = Node->getNodeAs()->getExit(); ++ } else { ++ TerminatorInst *Term = Entry->getTerminator(); ++ if (Term->getNumSuccessors() != 1) ++ break; ++ Exit = Term->getSuccessor(0); ++ } ++ ++ // It's a back edge, break here so we can insert a loop node ++ if (!Visited.count(Exit)) ++ return Node; ++ ++ // More than node edges are pointing to exit ++ if (!DT->dominates(Entry, Exit)) ++ return Node; ++ ++ RegionNode *Next = ParentRegion->getNode(Exit); ++ RNVector::iterator I = std::find(Order.begin(), Order.end(), Next); ++ assert(I != Order.end()); ++ ++ Visited.erase(Next->getEntry()); ++ Order.erase(I); ++ Node = Next; ++ } ++ ++ BasicBlock *BB = Node->getEntry(); ++ TerminatorInst *Term = BB->getTerminator(); ++ if (Term->getNumSuccessors() != 2) ++ return Node; ++ ++ // Our node has exactly two successors, check if we can handle ++ // any of them directly ++ BasicBlock *Succ = Term->getSuccessor(0); ++ if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ)) { ++ Succ = Term->getSuccessor(1); ++ if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ)) ++ return Node; ++ } else { ++ BasicBlock *Succ2 = Term->getSuccessor(1); ++ if (Visited.count(Succ2) && Visited[Succ] > Visited[Succ2] && ++ dominatesPredicates(Entry, Succ2)) ++ Succ = Succ2; ++ } ++ ++ RegionNode *Next = ParentRegion->getNode(Succ); ++ RNVector::iterator E = Order.end(); ++ RNVector::iterator I = std::find(Order.begin(), E, Next); ++ assert(I != E); ++ ++ killTerminator(BB); ++ FlowsInserted.push_back(BB); ++ Visited.erase(Succ); ++ Order.erase(I); ++ return
ParentRegion->getNode(wireFlowBlock(BB, Next)); ++} ++ ++/// \brief Remove all PHI values coming from "From" into "To" and remember ++/// them in DeletedPhis ++void AMDGPUStructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) { ++ PhiMap &Map = DeletedPhis[To]; ++ for (BasicBlock::iterator I = To->begin(), E = To->end(); ++ I != E && isa(*I);) { ++ ++ PHINode &Phi = cast(*I++); ++ while (Phi.getBasicBlockIndex(From) != -1) { ++ Value *Deleted = Phi.removeIncomingValue(From, false); ++ Map[&Phi].push_back(std::make_pair(From, Deleted)); ++ } ++ } ++} ++ ++/// \brief Add the PHI values back once we knew the new predecessor ++void AMDGPUStructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { ++ if (!DeletedPhis.count(To)) ++ return; ++ ++ PhiMap &Map = DeletedPhis[To]; ++ SSAUpdater Updater; ++ ++ for (PhiMap::iterator I = Map.begin(), E = Map.end(); I != E; ++I) { ++ ++ PHINode *Phi = I->first; ++ Updater.Initialize(Phi->getType(), ""); ++ BasicBlock *Fallback = To; ++ bool HaveFallback = false; ++ ++ for (BBValueVector::iterator VI = I->second.begin(), VE = I->second.end(); ++ VI != VE; ++VI) { ++ ++ Updater.AddAvailableValue(VI->first, VI->second); ++ BasicBlock *Dom = DT->findNearestCommonDominator(Fallback, VI->first); ++ if (Dom == VI->first) ++ HaveFallback = true; ++ else if (Dom != Fallback) ++ HaveFallback = false; ++ Fallback = Dom; ++ } ++ if (!HaveFallback) { ++ Value *Undef = UndefValue::get(Phi->getType()); ++ Updater.AddAvailableValue(Fallback, Undef); ++ } ++ ++ Phi->addIncoming(Updater.GetValueAtEndOfBlock(From), From); ++ } ++ DeletedPhis.erase(To); ++} ++ ++/// \brief Create a new flow node and update dominator tree and region info ++BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Prev) { ++ LLVMContext &Context = Func->getContext(); ++ BasicBlock *Insert = Order.empty() ? 
ParentRegion->getExit() : ++ Order.back()->getEntry(); ++ BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName, ++ Func, Insert); ++ DT->addNewBlock(Flow, Prev); ++ ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion); ++ FlowsInserted.push_back(Flow); ++ return Flow; ++} ++ ++/// \brief Can we predict that this node will always be called? ++bool AMDGPUStructurizeCFG::isPredictableTrue(BasicBlock *Prev, ++ BasicBlock *Node) { ++ BBPredicates &Preds = Predicates[Node]; ++ bool Dominated = false; ++ ++ for (BBPredicates::iterator I = Preds.begin(), E = Preds.end(); ++ I != E; ++I) { ++ ++ if (I->second != BoolTrue) ++ return false; ++ ++ if (!Dominated && DT->dominates(I->first, Prev)) ++ Dominated = true; ++ } ++ return Dominated; ++} ++ ++/// \brief Wire up the new control flow by inserting or updating the branch ++/// instructions at node exits ++BasicBlock *AMDGPUStructurizeCFG::wireFlowBlock(BasicBlock *Prev, ++ RegionNode *Node) { ++ BasicBlock *Entry = Node->getEntry(); ++ ++ if (LoopStart == Entry) { ++ LoopStart = Prev; ++ LoopPred[Prev] = BoolTrue; ++ } ++ ++ // Wire it up temporary, skipChained may recurse into us ++ BranchInst::Create(Entry, Prev); ++ DT->changeImmediateDominator(Entry, Prev); ++ addPhiValues(Prev, Entry); ++ ++ Node = skipChained(Node); ++ ++ BasicBlock *Next = getNextFlow(Prev); ++ if (!isPredictableTrue(Prev, Entry)) { ++ // Let Prev point to entry and next block ++ Prev->getTerminator()->eraseFromParent(); ++ BranchInst::Create(Entry, Next, BoolUndef, Prev); ++ } else { ++ DT->changeImmediateDominator(Next, Entry); ++ } ++ ++ // Let node exit(s) point to next block ++ if (Node->isSubRegion()) { ++ Region *SubRegion = Node->getNodeAs(); ++ BasicBlock *Exit = SubRegion->getExit(); ++ ++ // Find all the edges from the sub region to the exit ++ BBVector ToDo; ++ for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { ++ if (SubRegion->contains(*I)) ++ ToDo.push_back(*I); ++ } ++ ++ // Modify the 
edges to point to the new flow block ++ for (BBVector::iterator I = ToDo.begin(), E = ToDo.end(); I != E; ++I) { ++ delPhiValues(*I, Exit); ++ TerminatorInst *Term = (*I)->getTerminator(); ++ Term->replaceUsesOfWith(Exit, Next); ++ } ++ ++ // Update the region info ++ SubRegion->replaceExit(Next); ++ ++ } else { ++ BasicBlock *BB = Node->getNodeAs(); ++ killTerminator(BB); ++ BranchInst::Create(Next, BB); ++ ++ if (BB == LoopEnd) ++ LoopEnd = 0; ++ } ++ ++ return Next; ++} ++ ++/// Destroy node order and visited map, build up flow order instead. ++/// After this function control flow looks like it should be, but ++/// branches only have undefined conditions. ++void AMDGPUStructurizeCFG::createFlow() { ++ DeletedPhis.clear(); ++ ++ BasicBlock *Prev = Order.pop_back_val()->getEntry(); ++ assert(Prev == ParentRegion->getEntry() && "Incorrect node order!"); ++ Visited.erase(Prev); ++ ++ if (LoopStart == Prev) { ++ // Loop starts at entry, split entry so that we can predicate it ++ BasicBlock::iterator Insert = Prev->getFirstInsertionPt(); ++ BasicBlock *Split = Prev->splitBasicBlock(Insert, FlowBlockName); ++ DT->addNewBlock(Split, Prev); ++ ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion); ++ Predicates[Split] = Predicates[Prev]; ++ Order.push_back(ParentRegion->getBBNode(Split)); ++ LoopPred[Prev] = BoolTrue; ++ ++ } else if (LoopStart == Order.back()->getEntry()) { ++ // Loop starts behind entry, split entry so that we can jump to it ++ Instruction *Term = Prev->getTerminator(); ++ BasicBlock *Split = Prev->splitBasicBlock(Term, FlowBlockName); ++ DT->addNewBlock(Split, Prev); ++ ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion); ++ Prev = Split; ++ } ++ ++ killTerminator(Prev); ++ FlowsInserted.clear(); ++ FlowsInserted.push_back(Prev); ++ ++ while (!Order.empty()) { ++ RegionNode *Node = Order.pop_back_val(); ++ Visited.erase(Node->getEntry()); ++ Prev = wireFlowBlock(Prev, Node); ++ if (LoopStart && !LoopEnd) { ++ // Create an extra 
loop end node ++ LoopEnd = Prev; ++ Prev = getNextFlow(LoopEnd); ++ BranchInst::Create(Prev, LoopStart, BoolUndef, LoopEnd); ++ addPhiValues(LoopEnd, LoopStart); ++ } ++ } ++ ++ BasicBlock *Exit = ParentRegion->getExit(); ++ BranchInst::Create(Exit, Prev); ++ addPhiValues(Prev, Exit); ++ if (DT->dominates(ParentRegion->getEntry(), Exit)) ++ DT->changeImmediateDominator(Exit, Prev); ++ ++ if (LoopStart && LoopEnd) { ++ BBVector::iterator FI = std::find(FlowsInserted.begin(), ++ FlowsInserted.end(), ++ LoopStart); ++ for (; *FI != LoopEnd; ++FI) { ++ addPhiValues(*FI, (*FI)->getTerminator()->getSuccessor(0)); ++ } ++ } ++ ++ assert(Order.empty()); ++ assert(Visited.empty()); ++ assert(DeletedPhis.empty()); ++} ++ ++/// \brief Insert the missing branch conditions ++void AMDGPUStructurizeCFG::insertConditions() { ++ SSAUpdater PhiInserter; ++ ++ for (BBVector::iterator FI = FlowsInserted.begin(), FE = FlowsInserted.end(); ++ FI != FE; ++FI) { ++ ++ BranchInst *Term = cast((*FI)->getTerminator()); ++ if (Term->isUnconditional()) ++ continue; ++ ++ PhiInserter.Initialize(Boolean, ""); ++ PhiInserter.AddAvailableValue(&Func->getEntryBlock(), BoolFalse); ++ ++ BasicBlock *Succ = Term->getSuccessor(0); ++ BBPredicates &Preds = (*FI == LoopEnd) ? LoopPred : Predicates[Succ]; ++ for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); ++ PI != PE; ++PI) { ++ ++ PhiInserter.AddAvailableValue(PI->first, PI->second); ++ } ++ ++ Term->setCondition(PhiInserter.GetValueAtEndOfBlock(*FI)); ++ } ++} ++ ++/// Handle a rare case where the disintegrated nodes instructions ++/// no longer dominate all their uses. 
Not sure if this is really nessasary ++void AMDGPUStructurizeCFG::rebuildSSA() { ++ SSAUpdater Updater; ++ for (Region::block_iterator I = ParentRegion->block_begin(), ++ E = ParentRegion->block_end(); ++ I != E; ++I) { ++ ++ BasicBlock *BB = *I; ++ for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); ++ II != IE; ++II) { ++ ++ bool Initialized = false; ++ for (Use *I = &II->use_begin().getUse(), *Next; I; I = Next) { ++ ++ Next = I->getNext(); ++ ++ Instruction *User = cast(I->getUser()); ++ if (User->getParent() == BB) { ++ continue; ++ ++ } else if (PHINode *UserPN = dyn_cast(User)) { ++ if (UserPN->getIncomingBlock(*I) == BB) ++ continue; ++ } ++ ++ if (DT->dominates(II, User)) ++ continue; ++ ++ if (!Initialized) { ++ Value *Undef = UndefValue::get(II->getType()); ++ Updater.Initialize(II->getType(), ""); ++ Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); ++ Updater.AddAvailableValue(BB, II); ++ Initialized = true; ++ } ++ Updater.RewriteUseAfterInsertions(*I); ++ } ++ } ++ } ++} ++ ++/// \brief Run the transformation for each region found ++bool AMDGPUStructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) { ++ if (R->isTopLevelRegion()) ++ return false; ++ ++ Func = R->getEntry()->getParent(); ++ ParentRegion = R; ++ ++ DT = &getAnalysis(); ++ ++ orderNodes(); ++ collectInfos(); ++ createFlow(); ++ insertConditions(); ++ rebuildSSA(); ++ ++ Order.clear(); ++ Visited.clear(); ++ Predicates.clear(); ++ DeletedPhis.clear(); ++ FlowsInserted.clear(); ++ ++ return true; ++} ++ ++/// \brief Create the pass ++Pass *llvm::createAMDGPUStructurizeCFGPass() { ++ return new AMDGPUStructurizeCFG(); ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUSubtarget.cpp llvm-r600/lib/Target/R600/AMDGPUSubtarget.cpp +--- llvm-3.2.src/lib/Target/R600/AMDGPUSubtarget.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUSubtarget.cpp 2013-01-25 19:43:57.433383055 +0100 +@@ -0,0 +1,87 @@ ++//===-- AMDGPUSubtarget.cpp - AMDGPU 
Subtarget Information ----------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Implements the AMDGPU specific subclass of TargetSubtarget. ++// ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPUSubtarget.h" ++ ++using namespace llvm; ++ ++#define GET_SUBTARGETINFO_ENUM ++#define GET_SUBTARGETINFO_TARGET_DESC ++#define GET_SUBTARGETINFO_CTOR ++#include "AMDGPUGenSubtargetInfo.inc" ++ ++AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : ++ AMDGPUGenSubtargetInfo(TT, CPU, FS), DumpCode(false) { ++ InstrItins = getInstrItineraryForCPU(CPU); ++ ++ memset(CapsOverride, 0, sizeof(*CapsOverride) ++ * AMDGPUDeviceInfo::MaxNumberCapabilities); ++ // Default card ++ StringRef GPU = CPU; ++ Is64bit = false; ++ DefaultSize[0] = 64; ++ DefaultSize[1] = 1; ++ DefaultSize[2] = 1; ++ ParseSubtargetFeatures(GPU, FS); ++ DevName = GPU; ++ Device = AMDGPUDeviceInfo::getDeviceFromName(DevName, this, Is64bit); ++} ++ ++AMDGPUSubtarget::~AMDGPUSubtarget() { ++ delete Device; ++} ++ ++bool ++AMDGPUSubtarget::isOverride(AMDGPUDeviceInfo::Caps caps) const { ++ assert(caps < AMDGPUDeviceInfo::MaxNumberCapabilities && ++ "Caps index is out of bounds!"); ++ return CapsOverride[caps]; ++} ++bool ++AMDGPUSubtarget::is64bit() const { ++ return Is64bit; ++} ++bool ++AMDGPUSubtarget::isTargetELF() const { ++ return false; ++} ++size_t ++AMDGPUSubtarget::getDefaultSize(uint32_t dim) const { ++ if (dim > 3) { ++ return 1; ++ } else { ++ return DefaultSize[dim]; ++ } ++} ++ ++std::string ++AMDGPUSubtarget::getDataLayout() const { ++ if (!Device) { ++ return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16" ++ "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32" ++ 
"-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64" ++ "-v96:128:128-v128:128:128-v192:256:256-v256:256:256" ++ "-v512:512:512-v1024:1024:1024-v2048:2048:2048-a0:0:64"); ++ } ++ return Device->getDataLayout(); ++} ++ ++std::string ++AMDGPUSubtarget::getDeviceName() const { ++ return DevName; ++} ++const AMDGPUDevice * ++AMDGPUSubtarget::device() const { ++ return Device; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUSubtarget.h llvm-r600/lib/Target/R600/AMDGPUSubtarget.h +--- llvm-3.2.src/lib/Target/R600/AMDGPUSubtarget.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUSubtarget.h 2013-01-25 19:43:57.433383055 +0100 +@@ -0,0 +1,65 @@ ++//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//==-----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief AMDGPU specific subclass of TargetSubtarget. 
++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef AMDGPUSUBTARGET_H ++#define AMDGPUSUBTARGET_H ++#include "AMDILDevice.h" ++#include "llvm/ADT/StringExtras.h" ++#include "llvm/ADT/StringRef.h" ++#include "llvm/Target/TargetSubtargetInfo.h" ++ ++#define GET_SUBTARGETINFO_HEADER ++#include "AMDGPUGenSubtargetInfo.inc" ++ ++#define MAX_CB_SIZE (1 << 16) ++ ++namespace llvm { ++ ++class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { ++private: ++ bool CapsOverride[AMDGPUDeviceInfo::MaxNumberCapabilities]; ++ const AMDGPUDevice *Device; ++ size_t DefaultSize[3]; ++ std::string DevName; ++ bool Is64bit; ++ bool Is32on64bit; ++ bool DumpCode; ++ bool R600ALUInst; ++ ++ InstrItineraryData InstrItins; ++ ++public: ++ AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS); ++ virtual ~AMDGPUSubtarget(); ++ ++ const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } ++ virtual void ParseSubtargetFeatures(llvm::StringRef CPU, llvm::StringRef FS); ++ ++ bool isOverride(AMDGPUDeviceInfo::Caps) const; ++ bool is64bit() const; ++ ++ // Helper functions to simplify if statements ++ bool isTargetELF() const; ++ const AMDGPUDevice* device() const; ++ std::string getDataLayout() const; ++ std::string getDeviceName() const; ++ virtual size_t getDefaultSize(uint32_t dim) const; ++ bool dumpCode() const { return DumpCode; } ++ bool r600ALUEncoding() const { return R600ALUInst; } ++ ++}; ++ ++} // End namespace llvm ++ ++#endif // AMDGPUSUBTARGET_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUTargetMachine.cpp llvm-r600/lib/Target/R600/AMDGPUTargetMachine.cpp +--- llvm-3.2.src/lib/Target/R600/AMDGPUTargetMachine.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUTargetMachine.cpp 2013-01-25 19:43:57.433383055 +0100 +@@ -0,0 +1,148 @@ ++//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This 
file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief The AMDGPU target machine contains all of the hardware specific ++/// information needed to emit code for R600 and SI GPUs. ++// ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPUTargetMachine.h" ++#include "AMDGPU.h" ++#include "R600ISelLowering.h" ++#include "R600InstrInfo.h" ++#include "SIISelLowering.h" ++#include "SIInstrInfo.h" ++#include "llvm/Analysis/Passes.h" ++#include "llvm/Analysis/Verifier.h" ++#include "llvm/CodeGen/MachineFunctionAnalysis.h" ++#include "llvm/CodeGen/MachineModuleInfo.h" ++#include "llvm/CodeGen/Passes.h" ++#include "llvm/MC/MCAsmInfo.h" ++#include "llvm/PassManager.h" ++#include "llvm/Support/TargetRegistry.h" ++#include "llvm/Support/raw_os_ostream.h" ++#include "llvm/Transforms/IPO.h" ++#include "llvm/Transforms/Scalar.h" ++#include ++ ++using namespace llvm; ++ ++extern "C" void LLVMInitializeR600Target() { ++ // Register the target ++ RegisterTargetMachine X(TheAMDGPUTarget); ++} ++ ++AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT, ++ StringRef CPU, StringRef FS, ++ TargetOptions Options, ++ Reloc::Model RM, CodeModel::Model CM, ++ CodeGenOpt::Level OptLevel ++) ++: ++ LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel), ++ Subtarget(TT, CPU, FS), ++ Layout(Subtarget.getDataLayout()), ++ FrameLowering(TargetFrameLowering::StackGrowsUp, ++ Subtarget.device()->getStackAlignment(), 0), ++ IntrinsicInfo(this), ++ InstrItins(&Subtarget.getInstrItineraryData()) { ++ // TLInfo uses InstrInfo so it must be initialized after. 
++ if (Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { ++ InstrInfo = new R600InstrInfo(*this); ++ TLInfo = new R600TargetLowering(*this); ++ } else { ++ InstrInfo = new SIInstrInfo(*this); ++ TLInfo = new SITargetLowering(*this); ++ } ++} ++ ++AMDGPUTargetMachine::~AMDGPUTargetMachine() { ++} ++ ++namespace { ++class AMDGPUPassConfig : public TargetPassConfig { ++public: ++ AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM) ++ : TargetPassConfig(TM, PM) {} ++ ++ AMDGPUTargetMachine &getAMDGPUTargetMachine() const { ++ return getTM(); ++ } ++ ++ virtual bool addPreISel(); ++ virtual bool addInstSelector(); ++ virtual bool addPreRegAlloc(); ++ virtual bool addPostRegAlloc(); ++ virtual bool addPreSched2(); ++ virtual bool addPreEmitPass(); ++}; ++} // End of anonymous namespace ++ ++TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) { ++ return new AMDGPUPassConfig(this, PM); ++} ++ ++bool ++AMDGPUPassConfig::addPreISel() { ++ const AMDGPUSubtarget &ST = TM->getSubtarget(); ++ if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { ++ addPass(createAMDGPUStructurizeCFGPass()); ++ addPass(createSIAnnotateControlFlowPass()); ++ } ++ return false; ++} ++ ++bool AMDGPUPassConfig::addInstSelector() { ++ addPass(createAMDGPUPeepholeOpt(*TM)); ++ addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); ++ return false; ++} ++ ++bool AMDGPUPassConfig::addPreRegAlloc() { ++ const AMDGPUSubtarget &ST = TM->getSubtarget(); ++ ++ if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { ++ addPass(createSIAssignInterpRegsPass(*TM)); ++ } ++ addPass(createAMDGPUConvertToISAPass(*TM)); ++ return false; ++} ++ ++bool AMDGPUPassConfig::addPostRegAlloc() { ++ const AMDGPUSubtarget &ST = TM->getSubtarget(); ++ ++ if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { ++ addPass(createSIInsertWaits(*TM)); ++ } ++ return false; ++} ++ ++bool AMDGPUPassConfig::addPreSched2() { ++ ++ addPass(&IfConverterID); ++ 
return false; ++} ++ ++bool AMDGPUPassConfig::addPreEmitPass() { ++ const AMDGPUSubtarget &ST = TM->getSubtarget(); ++ if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { ++ addPass(createAMDGPUCFGPreparationPass(*TM)); ++ addPass(createAMDGPUCFGStructurizerPass(*TM)); ++ addPass(createR600ExpandSpecialInstrsPass(*TM)); ++ addPass(createR600LowerConstCopy(*TM)); ++ addPass(&FinalizeMachineBundlesID); ++ } else { ++ addPass(createSILowerLiteralConstantsPass(*TM)); ++ addPass(createSILowerControlFlowPass(*TM)); ++ } ++ ++ return false; ++} ++ +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUTargetMachine.h llvm-r600/lib/Target/R600/AMDGPUTargetMachine.h +--- llvm-3.2.src/lib/Target/R600/AMDGPUTargetMachine.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPUTargetMachine.h 2013-01-25 19:43:57.433383055 +0100 +@@ -0,0 +1,70 @@ ++//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief The AMDGPU TargetMachine interface definition for hw codgen targets. 
++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef AMDGPU_TARGET_MACHINE_H ++#define AMDGPU_TARGET_MACHINE_H ++ ++#include "AMDGPUInstrInfo.h" ++#include "AMDGPUSubtarget.h" ++#include "AMDILFrameLowering.h" ++#include "AMDILIntrinsicInfo.h" ++#include "R600ISelLowering.h" ++#include "llvm/ADT/OwningPtr.h" ++#include "llvm/DataLayout.h" ++ ++namespace llvm { ++ ++MCAsmInfo* createMCAsmInfo(const Target &T, StringRef TT); ++ ++class AMDGPUTargetMachine : public LLVMTargetMachine { ++ ++ AMDGPUSubtarget Subtarget; ++ const DataLayout Layout; ++ AMDGPUFrameLowering FrameLowering; ++ AMDGPUIntrinsicInfo IntrinsicInfo; ++ const AMDGPUInstrInfo * InstrInfo; ++ AMDGPUTargetLowering * TLInfo; ++ const InstrItineraryData* InstrItins; ++ ++public: ++ AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS, ++ StringRef CPU, ++ TargetOptions Options, ++ Reloc::Model RM, CodeModel::Model CM, ++ CodeGenOpt::Level OL); ++ ~AMDGPUTargetMachine(); ++ virtual const AMDGPUFrameLowering* getFrameLowering() const { ++ return &FrameLowering; ++ } ++ virtual const AMDGPUIntrinsicInfo* getIntrinsicInfo() const { ++ return &IntrinsicInfo; ++ } ++ virtual const AMDGPUInstrInfo *getInstrInfo() const {return InstrInfo;} ++ virtual const AMDGPUSubtarget *getSubtargetImpl() const {return &Subtarget; } ++ virtual const AMDGPURegisterInfo *getRegisterInfo() const { ++ return &InstrInfo->getRegisterInfo(); ++ } ++ virtual AMDGPUTargetLowering * getTargetLowering() const { ++ return TLInfo; ++ } ++ virtual const InstrItineraryData* getInstrItineraryData() const { ++ return InstrItins; ++ } ++ virtual const DataLayout* getDataLayout() const { return &Layout; } ++ virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); ++}; ++ ++} // End namespace llvm ++ ++#endif // AMDGPU_TARGET_MACHINE_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPU.td llvm-r600/lib/Target/R600/AMDGPU.td +--- llvm-3.2.src/lib/Target/R600/AMDGPU.td 
1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDGPU.td 2013-01-25 19:43:57.423383055 +0100 +@@ -0,0 +1,40 @@ ++//===-- AMDIL.td - AMDIL Tablegen files --*- tablegen -*-------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//==-----------------------------------------------------------------------===// ++ ++// Include AMDIL TD files ++include "AMDILBase.td" ++ ++ ++def AMDGPUInstrInfo : InstrInfo { ++ let guessInstructionProperties = 1; ++} ++ ++//===----------------------------------------------------------------------===// ++// Declare the target which we are implementing ++//===----------------------------------------------------------------------===// ++def AMDGPUAsmWriter : AsmWriter { ++ string AsmWriterClassName = "InstPrinter"; ++ int Variant = 0; ++ bit isMCAsmWriter = 1; ++} ++ ++def AMDGPU : Target { ++ // Pull in Instruction Info: ++ let InstructionSet = AMDGPUInstrInfo; ++ let AssemblyWriters = [AMDGPUAsmWriter]; ++} ++ ++// Include AMDGPU TD files ++include "R600Schedule.td" ++include "SISchedule.td" ++include "Processors.td" ++include "AMDGPUInstrInfo.td" ++include "AMDGPUIntrinsics.td" ++include "AMDGPURegisterInfo.td" ++include "AMDGPUInstructions.td" +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDIL7XXDevice.cpp llvm-r600/lib/Target/R600/AMDIL7XXDevice.cpp +--- llvm-3.2.src/lib/Target/R600/AMDIL7XXDevice.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDIL7XXDevice.cpp 2013-01-25 19:43:57.433383055 +0100 +@@ -0,0 +1,115 @@ ++//===-- AMDIL7XXDevice.cpp - Device Info for 7XX GPUs ---------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++// \file ++//==-----------------------------------------------------------------------===// ++#include "AMDIL7XXDevice.h" ++#include "AMDGPUSubtarget.h" ++#include "AMDILDevice.h" ++ ++using namespace llvm; ++ ++AMDGPU7XXDevice::AMDGPU7XXDevice(AMDGPUSubtarget *ST) : AMDGPUDevice(ST) { ++ setCaps(); ++ std::string name = mSTM->getDeviceName(); ++ if (name == "rv710") { ++ DeviceFlag = OCL_DEVICE_RV710; ++ } else if (name == "rv730") { ++ DeviceFlag = OCL_DEVICE_RV730; ++ } else { ++ DeviceFlag = OCL_DEVICE_RV770; ++ } ++} ++ ++AMDGPU7XXDevice::~AMDGPU7XXDevice() { ++} ++ ++void AMDGPU7XXDevice::setCaps() { ++ mSWBits.set(AMDGPUDeviceInfo::LocalMem); ++} ++ ++size_t AMDGPU7XXDevice::getMaxLDSSize() const { ++ if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { ++ return MAX_LDS_SIZE_700; ++ } ++ return 0; ++} ++ ++size_t AMDGPU7XXDevice::getWavefrontSize() const { ++ return AMDGPUDevice::HalfWavefrontSize; ++} ++ ++uint32_t AMDGPU7XXDevice::getGeneration() const { ++ return AMDGPUDeviceInfo::HD4XXX; ++} ++ ++uint32_t AMDGPU7XXDevice::getResourceID(uint32_t DeviceID) const { ++ switch (DeviceID) { ++ default: ++ assert(0 && "ID type passed in is unknown!"); ++ break; ++ case GLOBAL_ID: ++ case CONSTANT_ID: ++ case RAW_UAV_ID: ++ case ARENA_UAV_ID: ++ break; ++ case LDS_ID: ++ if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { ++ return DEFAULT_LDS_ID; ++ } ++ break; ++ case SCRATCH_ID: ++ if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) { ++ return DEFAULT_SCRATCH_ID; ++ } ++ break; ++ case GDS_ID: ++ assert(0 && "GDS UAV ID is not supported on this chip"); ++ if (usesHardware(AMDGPUDeviceInfo::RegionMem)) { ++ return DEFAULT_GDS_ID; ++ } ++ break; ++ }; ++ ++ return 0; ++} ++ ++uint32_t AMDGPU7XXDevice::getMaxNumUAVs() const { ++ return 1; ++} ++ ++AMDGPU770Device::AMDGPU770Device(AMDGPUSubtarget *ST): AMDGPU7XXDevice(ST) { ++ setCaps(); ++} ++ ++AMDGPU770Device::~AMDGPU770Device() { ++} ++ ++void AMDGPU770Device::setCaps() { ++ if 
(mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) { ++ mSWBits.set(AMDGPUDeviceInfo::FMA); ++ mHWBits.set(AMDGPUDeviceInfo::DoubleOps); ++ } ++ mSWBits.set(AMDGPUDeviceInfo::BarrierDetect); ++ mHWBits.reset(AMDGPUDeviceInfo::LongOps); ++ mSWBits.set(AMDGPUDeviceInfo::LongOps); ++ mSWBits.set(AMDGPUDeviceInfo::LocalMem); ++} ++ ++size_t AMDGPU770Device::getWavefrontSize() const { ++ return AMDGPUDevice::WavefrontSize; ++} ++ ++AMDGPU710Device::AMDGPU710Device(AMDGPUSubtarget *ST) : AMDGPU7XXDevice(ST) { ++} ++ ++AMDGPU710Device::~AMDGPU710Device() { ++} ++ ++size_t AMDGPU710Device::getWavefrontSize() const { ++ return AMDGPUDevice::QuarterWavefrontSize; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDIL7XXDevice.h llvm-r600/lib/Target/R600/AMDIL7XXDevice.h +--- llvm-3.2.src/lib/Target/R600/AMDIL7XXDevice.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDIL7XXDevice.h 2013-01-25 19:43:57.436716388 +0100 +@@ -0,0 +1,72 @@ ++//==-- AMDIL7XXDevice.h - Define 7XX Device Device for AMDIL ---*- C++ -*--===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//==-----------------------------------------------------------------------===// ++/// \file ++/// \brief Interface for the subtarget data classes. ++/// ++/// This file will define the interface that each generation needs to ++/// implement in order to correctly answer queries on the capabilities of the ++/// specific hardware. 
++//===----------------------------------------------------------------------===// ++#ifndef AMDIL7XXDEVICEIMPL_H ++#define AMDIL7XXDEVICEIMPL_H ++#include "AMDILDevice.h" ++ ++namespace llvm { ++class AMDGPUSubtarget; ++ ++//===----------------------------------------------------------------------===// ++// 7XX generation of devices and their respective sub classes ++//===----------------------------------------------------------------------===// ++ ++/// \brief The AMDGPU7XXDevice class represents the generic 7XX device. ++/// ++/// All 7XX devices are derived from this class. The AMDGPU7XX device will only ++/// support the minimal features that are required to be considered OpenCL 1.0 ++/// compliant and nothing more. ++class AMDGPU7XXDevice : public AMDGPUDevice { ++public: ++ AMDGPU7XXDevice(AMDGPUSubtarget *ST); ++ virtual ~AMDGPU7XXDevice(); ++ virtual size_t getMaxLDSSize() const; ++ virtual size_t getWavefrontSize() const; ++ virtual uint32_t getGeneration() const; ++ virtual uint32_t getResourceID(uint32_t DeviceID) const; ++ virtual uint32_t getMaxNumUAVs() const; ++ ++protected: ++ virtual void setCaps(); ++}; ++ ++/// \brief The AMDGPU770Device class represents the RV770 chip and it's ++/// derivative cards. ++/// ++/// The difference between this device and the base class is this device device ++/// adds support for double precision and has a larger wavefront size. ++class AMDGPU770Device : public AMDGPU7XXDevice { ++public: ++ AMDGPU770Device(AMDGPUSubtarget *ST); ++ virtual ~AMDGPU770Device(); ++ virtual size_t getWavefrontSize() const; ++private: ++ virtual void setCaps(); ++}; ++ ++/// \brief The AMDGPU710Device class derives from the 7XX base class. ++/// ++/// This class is a smaller derivative, so we need to overload some of the ++/// functions in order to correctly specify this information. 
++class AMDGPU710Device : public AMDGPU7XXDevice { ++public: ++ AMDGPU710Device(AMDGPUSubtarget *ST); ++ virtual ~AMDGPU710Device(); ++ virtual size_t getWavefrontSize() const; ++}; ++ ++} // namespace llvm ++#endif // AMDILDEVICEIMPL_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILBase.td llvm-r600/lib/Target/R600/AMDILBase.td +--- llvm-3.2.src/lib/Target/R600/AMDILBase.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILBase.td 2013-01-25 19:43:57.436716388 +0100 +@@ -0,0 +1,85 @@ ++//===- AMDIL.td - AMDIL Target Machine -------------*- tablegen -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// Target-independent interfaces which we are implementing ++//===----------------------------------------------------------------------===// ++ ++include "llvm/Target/Target.td" ++ ++// Dummy Instruction itineraries for pseudo instructions ++def ALU_NULL : FuncUnit; ++def NullALU : InstrItinClass; ++ ++//===----------------------------------------------------------------------===// ++// AMDIL Subtarget features. 
++//===----------------------------------------------------------------------===// ++def FeatureFP64 : SubtargetFeature<"fp64", ++ "CapsOverride[AMDGPUDeviceInfo::DoubleOps]", ++ "true", ++ "Enable 64bit double precision operations">; ++def FeatureByteAddress : SubtargetFeature<"byte_addressable_store", ++ "CapsOverride[AMDGPUDeviceInfo::ByteStores]", ++ "true", ++ "Enable byte addressable stores">; ++def FeatureBarrierDetect : SubtargetFeature<"barrier_detect", ++ "CapsOverride[AMDGPUDeviceInfo::BarrierDetect]", ++ "true", ++ "Enable duplicate barrier detection(HD5XXX or later).">; ++def FeatureImages : SubtargetFeature<"images", ++ "CapsOverride[AMDGPUDeviceInfo::Images]", ++ "true", ++ "Enable image functions">; ++def FeatureMultiUAV : SubtargetFeature<"multi_uav", ++ "CapsOverride[AMDGPUDeviceInfo::MultiUAV]", ++ "true", ++ "Generate multiple UAV code(HD5XXX family or later)">; ++def FeatureMacroDB : SubtargetFeature<"macrodb", ++ "CapsOverride[AMDGPUDeviceInfo::MacroDB]", ++ "true", ++ "Use internal macrodb, instead of macrodb in driver">; ++def FeatureNoAlias : SubtargetFeature<"noalias", ++ "CapsOverride[AMDGPUDeviceInfo::NoAlias]", ++ "true", ++ "assert that all kernel argument pointers are not aliased">; ++def FeatureNoInline : SubtargetFeature<"no-inline", ++ "CapsOverride[AMDGPUDeviceInfo::NoInline]", ++ "true", ++ "specify whether to not inline functions">; ++ ++def Feature64BitPtr : SubtargetFeature<"64BitPtr", ++ "Is64bit", ++ "false", ++ "Specify if 64bit addressing should be used.">; ++ ++def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr", ++ "Is32on64bit", ++ "false", ++ "Specify if 64bit sized pointers with 32bit addressing should be used.">; ++def FeatureDebug : SubtargetFeature<"debug", ++ "CapsOverride[AMDGPUDeviceInfo::Debug]", ++ "true", ++ "Debug mode is enabled, so disable hardware accelerated address spaces.">; ++def FeatureDumpCode : SubtargetFeature <"DumpCode", ++ "DumpCode", ++ "true", ++ "Dump MachineInstrs in the 
CodeEmitter">; ++ ++def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", ++ "R600ALUInst", ++ "false", ++ "Older version of ALU instructions encoding.">; ++ ++ ++//===----------------------------------------------------------------------===// ++// Register File, Calling Conv, Instruction Descriptions ++//===----------------------------------------------------------------------===// ++ ++ ++include "AMDILRegisterInfo.td" ++include "AMDILInstrInfo.td" ++ +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILCFGStructurizer.cpp llvm-r600/lib/Target/R600/AMDILCFGStructurizer.cpp +--- llvm-3.2.src/lib/Target/R600/AMDILCFGStructurizer.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILCFGStructurizer.cpp 2013-01-25 19:43:57.436716388 +0100 +@@ -0,0 +1,3045 @@ ++//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++/// \file ++//==-----------------------------------------------------------------------===// ++ ++#define DEBUGME 0 ++#define DEBUG_TYPE "structcfg" ++ ++#include "AMDGPUInstrInfo.h" ++#include "AMDIL.h" ++#include "llvm/ADT/SCCIterator.h" ++#include "llvm/ADT/SmallVector.h" ++#include "llvm/ADT/Statistic.h" ++#include "llvm/Analysis/DominatorInternals.h" ++#include "llvm/Analysis/Dominators.h" ++#include "llvm/CodeGen/MachinePostDominators.h" ++#include "llvm/CodeGen/MachineDominators.h" ++#include "llvm/CodeGen/MachineFunction.h" ++#include "llvm/CodeGen/MachineFunctionAnalysis.h" ++#include "llvm/CodeGen/MachineFunctionPass.h" ++#include "llvm/CodeGen/MachineInstrBuilder.h" ++#include "llvm/CodeGen/MachineJumpTableInfo.h" ++#include "llvm/CodeGen/MachineLoopInfo.h" ++#include "llvm/CodeGen/MachineRegisterInfo.h" ++#include "llvm/Target/TargetInstrInfo.h" ++ ++using namespace llvm; ++ ++// TODO: move-begin. 
++ ++//===----------------------------------------------------------------------===// ++// ++// Statistics for CFGStructurizer. ++// ++//===----------------------------------------------------------------------===// ++ ++STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern " ++ "matched"); ++STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " ++ "matched"); ++STATISTIC(numLoopbreakPatternMatch, "CFGStructurizer number of loop-break " ++ "pattern matched"); ++STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue " ++ "pattern matched"); ++STATISTIC(numLoopPatternMatch, "CFGStructurizer number of loop pattern " ++ "matched"); ++STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); ++STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); ++ ++//===----------------------------------------------------------------------===// ++// ++// Miscellaneous utility for CFGStructurizer. ++// ++//===----------------------------------------------------------------------===// ++namespace llvmCFGStruct { ++#define SHOWNEWINSTR(i) \ ++ if (DEBUGME) errs() << "New instr: " << *i << "\n" ++ ++#define SHOWNEWBLK(b, msg) \ ++if (DEBUGME) { \ ++ errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ ++ errs() << "\n"; \ ++} ++ ++#define SHOWBLK_DETAIL(b, msg) \ ++if (DEBUGME) { \ ++ if (b) { \ ++ errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ ++ b->print(errs()); \ ++ errs() << "\n"; \ ++ } \ ++} ++ ++#define INVALIDSCCNUM -1 ++#define INVALIDREGNUM 0 ++ ++template ++void PrintLoopinfo(const LoopinfoT &LoopInfo, llvm::raw_ostream &OS) { ++ for (typename LoopinfoT::iterator iter = LoopInfo.begin(), ++ iterEnd = LoopInfo.end(); ++ iter != iterEnd; ++iter) { ++ (*iter)->print(OS, 0); ++ } ++} ++ ++template ++void ReverseVector(SmallVector &Src) { ++ size_t sz = Src.size(); ++ for (size_t i = 0; i < sz/2; ++i) { ++ NodeT *t = Src[i]; ++ Src[i] = Src[sz - i - 1]; ++ Src[sz - i - 1] = t; 
++ } ++} ++ ++} //end namespace llvmCFGStruct ++ ++//===----------------------------------------------------------------------===// ++// ++// supporting data structure for CFGStructurizer ++// ++//===----------------------------------------------------------------------===// ++ ++namespace llvmCFGStruct { ++template ++struct CFGStructTraits { ++}; ++ ++template ++class BlockInformation { ++public: ++ bool isRetired; ++ int sccNum; ++ //SmallVector succInstr; ++ //Instructions defining the corresponding successor. ++ BlockInformation() : isRetired(false), sccNum(INVALIDSCCNUM) {} ++}; ++ ++template ++class LandInformation { ++public: ++ BlockT *landBlk; ++ std::set breakInitRegs; //Registers that need to "reg = 0", before ++ //WHILELOOP(thisloop) init before entering ++ //thisloop. ++ std::set contInitRegs; //Registers that need to "reg = 0", after ++ //WHILELOOP(thisloop) init after entering ++ //thisloop. ++ std::set endbranchInitRegs; //Init before entering this loop, at loop ++ //land block, branch cond on this reg. ++ std::set breakOnRegs; //registers that need to "if (reg) break ++ //endif" after ENDLOOP(thisloop) break ++ //outerLoopOf(thisLoop). ++ std::set contOnRegs; //registers that need to "if (reg) continue ++ //endif" after ENDLOOP(thisloop) continue on ++ //outerLoopOf(thisLoop). ++ LandInformation() : landBlk(NULL) {} ++}; ++ ++} //end of namespace llvmCFGStruct ++ ++//===----------------------------------------------------------------------===// ++// ++// CFGStructurizer ++// ++//===----------------------------------------------------------------------===// ++ ++namespace llvmCFGStruct { ++// bixia TODO: port it to BasicBlock, not just MachineBasicBlock. 
++template ++class CFGStructurizer { ++public: ++ typedef enum { ++ Not_SinglePath = 0, ++ SinglePath_InPath = 1, ++ SinglePath_NotInPath = 2 ++ } PathToKind; ++ ++public: ++ typedef typename PassT::InstructionType InstrT; ++ typedef typename PassT::FunctionType FuncT; ++ typedef typename PassT::DominatortreeType DomTreeT; ++ typedef typename PassT::PostDominatortreeType PostDomTreeT; ++ typedef typename PassT::DomTreeNodeType DomTreeNodeT; ++ typedef typename PassT::LoopinfoType LoopInfoT; ++ ++ typedef GraphTraits FuncGTraits; ++ //typedef FuncGTraits::nodes_iterator BlockIterator; ++ typedef typename FuncT::iterator BlockIterator; ++ ++ typedef typename FuncGTraits::NodeType BlockT; ++ typedef GraphTraits BlockGTraits; ++ typedef GraphTraits > InvBlockGTraits; ++ //typedef BlockGTraits::succ_iterator InstructionIterator; ++ typedef typename BlockT::iterator InstrIterator; ++ ++ typedef CFGStructTraits CFGTraits; ++ typedef BlockInformation BlockInfo; ++ typedef std::map BlockInfoMap; ++ ++ typedef int RegiT; ++ typedef typename PassT::LoopType LoopT; ++ typedef LandInformation LoopLandInfo; ++ typedef std::map LoopLandInfoMap; ++ //landing info for loop break ++ typedef SmallVector BlockTSmallerVector; ++ ++public: ++ CFGStructurizer(); ++ ~CFGStructurizer(); ++ ++ /// Perform the CFG structurization ++ bool run(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri); ++ ++ /// Perform the CFG preparation ++ bool prepare(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri); ++ ++private: ++ void reversePredicateSetter(typename BlockT::iterator); ++ void orderBlocks(); ++ void printOrderedBlocks(llvm::raw_ostream &OS); ++ int patternMatch(BlockT *CurBlock); ++ int patternMatchGroup(BlockT *CurBlock); ++ ++ int serialPatternMatch(BlockT *CurBlock); ++ int ifPatternMatch(BlockT *CurBlock); ++ int switchPatternMatch(BlockT *CurBlock); ++ int loopendPatternMatch(BlockT *CurBlock); ++ int loopPatternMatch(BlockT *CurBlock); ++ ++ int loopbreakPatternMatch(LoopT 
*LoopRep, BlockT *LoopHeader); ++ int loopcontPatternMatch(LoopT *LoopRep, BlockT *LoopHeader); ++ //int loopWithoutBreak(BlockT *); ++ ++ void handleLoopbreak (BlockT *ExitingBlock, LoopT *ExitingLoop, ++ BlockT *ExitBlock, LoopT *exitLoop, BlockT *landBlock); ++ void handleLoopcontBlock(BlockT *ContingBlock, LoopT *contingLoop, ++ BlockT *ContBlock, LoopT *contLoop); ++ bool isSameloopDetachedContbreak(BlockT *Src1Block, BlockT *Src2Block); ++ int handleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock, ++ BlockT *FalseBlock); ++ int handleJumpintoIfImp(BlockT *HeadBlock, BlockT *TrueBlock, ++ BlockT *FalseBlock); ++ int improveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock, ++ BlockT *FalseBlock, BlockT **LandBlockPtr); ++ void showImproveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock, ++ BlockT *FalseBlock, BlockT *LandBlock, ++ bool Detail = false); ++ PathToKind singlePathTo(BlockT *SrcBlock, BlockT *DstBlock, ++ bool AllowSideEntry = true); ++ BlockT *singlePathEnd(BlockT *srcBlock, BlockT *DstBlock, ++ bool AllowSideEntry = true); ++ int cloneOnSideEntryTo(BlockT *PreBlock, BlockT *SrcBlock, BlockT *DstBlock); ++ void mergeSerialBlock(BlockT *DstBlock, BlockT *srcBlock); ++ ++ void mergeIfthenelseBlock(InstrT *BranchInstr, BlockT *CurBlock, ++ BlockT *TrueBlock, BlockT *FalseBlock, ++ BlockT *LandBlock); ++ void mergeLooplandBlock(BlockT *DstBlock, LoopLandInfo *LoopLand); ++ void mergeLoopbreakBlock(BlockT *ExitingBlock, BlockT *ExitBlock, ++ BlockT *ExitLandBlock, RegiT SetReg); ++ void settleLoopcontBlock(BlockT *ContingBlock, BlockT *ContBlock, ++ RegiT SetReg); ++ BlockT *relocateLoopcontBlock(LoopT *ParentLoopRep, LoopT *LoopRep, ++ std::set &ExitBlockSet, ++ BlockT *ExitLandBlk); ++ BlockT *addLoopEndbranchBlock(LoopT *LoopRep, ++ BlockTSmallerVector &ExitingBlocks, ++ BlockTSmallerVector &ExitBlocks); ++ BlockT *normalizeInfiniteLoopExit(LoopT *LoopRep); ++ void removeUnconditionalBranch(BlockT *SrcBlock); ++ void 
removeRedundantConditionalBranch(BlockT *SrcBlock); ++ void addDummyExitBlock(SmallVector &RetBlocks); ++ ++ void removeSuccessor(BlockT *SrcBlock); ++ BlockT *cloneBlockForPredecessor(BlockT *CurBlock, BlockT *PredBlock); ++ BlockT *exitingBlock2ExitBlock (LoopT *LoopRep, BlockT *exitingBlock); ++ ++ void migrateInstruction(BlockT *SrcBlock, BlockT *DstBlock, ++ InstrIterator InsertPos); ++ ++ void recordSccnum(BlockT *SrcBlock, int SCCNum); ++ int getSCCNum(BlockT *srcBlk); ++ ++ void retireBlock(BlockT *DstBlock, BlockT *SrcBlock); ++ bool isRetiredBlock(BlockT *SrcBlock); ++ bool isActiveLoophead(BlockT *CurBlock); ++ bool needMigrateBlock(BlockT *Block); ++ ++ BlockT *recordLoopLandBlock(LoopT *LoopRep, BlockT *LandBlock, ++ BlockTSmallerVector &exitBlocks, ++ std::set &ExitBlockSet); ++ void setLoopLandBlock(LoopT *LoopRep, BlockT *Block = NULL); ++ BlockT *getLoopLandBlock(LoopT *LoopRep); ++ LoopLandInfo *getLoopLandInfo(LoopT *LoopRep); ++ ++ void addLoopBreakOnReg(LoopT *LoopRep, RegiT RegNum); ++ void addLoopContOnReg(LoopT *LoopRep, RegiT RegNum); ++ void addLoopBreakInitReg(LoopT *LoopRep, RegiT RegNum); ++ void addLoopContInitReg(LoopT *LoopRep, RegiT RegNum); ++ void addLoopEndbranchInitReg(LoopT *LoopRep, RegiT RegNum); ++ ++ bool hasBackEdge(BlockT *curBlock); ++ unsigned getLoopDepth (LoopT *LoopRep); ++ int countActiveBlock( ++ typename SmallVector::const_iterator IterStart, ++ typename SmallVector::const_iterator IterEnd); ++ BlockT *findNearestCommonPostDom(std::set&); ++ BlockT *findNearestCommonPostDom(BlockT *Block1, BlockT *Block2); ++ ++private: ++ DomTreeT *domTree; ++ PostDomTreeT *postDomTree; ++ LoopInfoT *loopInfo; ++ PassT *passRep; ++ FuncT *funcRep; ++ ++ BlockInfoMap blockInfoMap; ++ LoopLandInfoMap loopLandInfoMap; ++ SmallVector orderedBlks; ++ const AMDGPURegisterInfo *TRI; ++ ++}; //template class CFGStructurizer ++ ++template CFGStructurizer::CFGStructurizer() ++ : domTree(NULL), postDomTree(NULL), loopInfo(NULL) { ++} ++ 
++template CFGStructurizer::~CFGStructurizer() { ++ for (typename BlockInfoMap::iterator I = blockInfoMap.begin(), ++ E = blockInfoMap.end(); I != E; ++I) { ++ delete I->second; ++ } ++} ++ ++template ++bool CFGStructurizer::prepare(FuncT &func, PassT &pass, ++ const AMDGPURegisterInfo * tri) { ++ passRep = &pass; ++ funcRep = &func; ++ TRI = tri; ++ ++ bool changed = false; ++ ++ //FIXME: if not reducible flow graph, make it so ??? ++ ++ if (DEBUGME) { ++ errs() << "AMDGPUCFGStructurizer::prepare\n"; ++ } ++ ++ loopInfo = CFGTraits::getLoopInfo(pass); ++ if (DEBUGME) { ++ errs() << "LoopInfo:\n"; ++ PrintLoopinfo(*loopInfo, errs()); ++ } ++ ++ orderBlocks(); ++ if (DEBUGME) { ++ errs() << "Ordered blocks:\n"; ++ printOrderedBlocks(errs()); ++ } ++ ++ SmallVector retBlks; ++ ++ for (typename LoopInfoT::iterator iter = loopInfo->begin(), ++ iterEnd = loopInfo->end(); ++ iter != iterEnd; ++iter) { ++ LoopT* loopRep = (*iter); ++ BlockTSmallerVector exitingBlks; ++ loopRep->getExitingBlocks(exitingBlks); ++ ++ if (exitingBlks.size() == 0) { ++ BlockT* dummyExitBlk = normalizeInfiniteLoopExit(loopRep); ++ if (dummyExitBlk != NULL) ++ retBlks.push_back(dummyExitBlk); ++ } ++ } ++ ++ // Remove unconditional branch instr. ++ // Add dummy exit block iff there are multiple returns. 
++ ++ for (typename SmallVector::const_iterator ++ iterBlk = orderedBlks.begin(), iterEndBlk = orderedBlks.end(); ++ iterBlk != iterEndBlk; ++ ++iterBlk) { ++ BlockT *curBlk = *iterBlk; ++ removeUnconditionalBranch(curBlk); ++ removeRedundantConditionalBranch(curBlk); ++ if (CFGTraits::isReturnBlock(curBlk)) { ++ retBlks.push_back(curBlk); ++ } ++ assert(curBlk->succ_size() <= 2); ++ } //for ++ ++ if (retBlks.size() >= 2) { ++ addDummyExitBlock(retBlks); ++ changed = true; ++ } ++ ++ return changed; ++} //CFGStructurizer::prepare ++ ++template ++bool CFGStructurizer::run(FuncT &func, PassT &pass, ++ const AMDGPURegisterInfo * tri) { ++ passRep = &pass; ++ funcRep = &func; ++ TRI = tri; ++ ++ //Assume reducible CFG... ++ if (DEBUGME) { ++ errs() << "AMDGPUCFGStructurizer::run\n"; ++ func.viewCFG(); ++ } ++ ++ domTree = CFGTraits::getDominatorTree(pass); ++ if (DEBUGME) { ++ domTree->print(errs(), (const llvm::Module*)0); ++ } ++ ++ postDomTree = CFGTraits::getPostDominatorTree(pass); ++ if (DEBUGME) { ++ postDomTree->print(errs()); ++ } ++ ++ loopInfo = CFGTraits::getLoopInfo(pass); ++ if (DEBUGME) { ++ errs() << "LoopInfo:\n"; ++ PrintLoopinfo(*loopInfo, errs()); ++ } ++ ++ orderBlocks(); ++#ifdef STRESSTEST ++ //Use the worse block ordering to test the algorithm. 
++ ReverseVector(orderedBlks); ++#endif ++ ++ if (DEBUGME) { ++ errs() << "Ordered blocks:\n"; ++ printOrderedBlocks(errs()); ++ } ++ int numIter = 0; ++ bool finish = false; ++ BlockT *curBlk; ++ bool makeProgress = false; ++ int numRemainedBlk = countActiveBlock(orderedBlks.begin(), ++ orderedBlks.end()); ++ ++ do { ++ ++numIter; ++ if (DEBUGME) { ++ errs() << "numIter = " << numIter ++ << ", numRemaintedBlk = " << numRemainedBlk << "\n"; ++ } ++ ++ typename SmallVector::const_iterator ++ iterBlk = orderedBlks.begin(); ++ typename SmallVector::const_iterator ++ iterBlkEnd = orderedBlks.end(); ++ ++ typename SmallVector::const_iterator ++ sccBeginIter = iterBlk; ++ BlockT *sccBeginBlk = NULL; ++ int sccNumBlk = 0; // The number of active blocks, init to a ++ // maximum possible number. ++ int sccNumIter; // Number of iteration in this SCC. ++ ++ while (iterBlk != iterBlkEnd) { ++ curBlk = *iterBlk; ++ ++ if (sccBeginBlk == NULL) { ++ sccBeginIter = iterBlk; ++ sccBeginBlk = curBlk; ++ sccNumIter = 0; ++ sccNumBlk = numRemainedBlk; // Init to maximum possible number. ++ if (DEBUGME) { ++ errs() << "start processing SCC" << getSCCNum(sccBeginBlk); ++ errs() << "\n"; ++ } ++ } ++ ++ if (!isRetiredBlock(curBlk)) { ++ patternMatch(curBlk); ++ } ++ ++ ++iterBlk; ++ ++ bool contNextScc = true; ++ if (iterBlk == iterBlkEnd ++ || getSCCNum(sccBeginBlk) != getSCCNum(*iterBlk)) { ++ // Just finish one scc. 
++ ++sccNumIter; ++ int sccRemainedNumBlk = countActiveBlock(sccBeginIter, iterBlk); ++ if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= sccNumBlk) { ++ if (DEBUGME) { ++ errs() << "Can't reduce SCC " << getSCCNum(curBlk) ++ << ", sccNumIter = " << sccNumIter; ++ errs() << "doesn't make any progress\n"; ++ } ++ contNextScc = true; ++ } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < sccNumBlk) { ++ sccNumBlk = sccRemainedNumBlk; ++ iterBlk = sccBeginIter; ++ contNextScc = false; ++ if (DEBUGME) { ++ errs() << "repeat processing SCC" << getSCCNum(curBlk) ++ << "sccNumIter = " << sccNumIter << "\n"; ++ func.viewCFG(); ++ } ++ } else { ++ // Finish the current scc. ++ contNextScc = true; ++ } ++ } else { ++ // Continue on next component in the current scc. ++ contNextScc = false; ++ } ++ ++ if (contNextScc) { ++ sccBeginBlk = NULL; ++ } ++ } //while, "one iteration" over the function. ++ ++ BlockT *entryBlk = FuncGTraits::nodes_begin(&func); ++ if (entryBlk->succ_size() == 0) { ++ finish = true; ++ if (DEBUGME) { ++ errs() << "Reduce to one block\n"; ++ } ++ } else { ++ int newnumRemainedBlk ++ = countActiveBlock(orderedBlks.begin(), orderedBlks.end()); ++ // consider cloned blocks ?? ++ if (newnumRemainedBlk == 1 || newnumRemainedBlk < numRemainedBlk) { ++ makeProgress = true; ++ numRemainedBlk = newnumRemainedBlk; ++ } else { ++ makeProgress = false; ++ if (DEBUGME) { ++ errs() << "No progress\n"; ++ } ++ } ++ } ++ } while (!finish && makeProgress); ++ ++ // Misc wrap up to maintain the consistency of the Function representation. ++ CFGTraits::wrapup(FuncGTraits::nodes_begin(&func)); ++ ++ // Detach retired Block, release memory. 
++ for (typename BlockInfoMap::iterator iterMap = blockInfoMap.begin(), ++ iterEndMap = blockInfoMap.end(); iterMap != iterEndMap; ++iterMap) { ++ if ((*iterMap).second && (*iterMap).second->isRetired) { ++ assert(((*iterMap).first)->getNumber() != -1); ++ if (DEBUGME) { ++ errs() << "Erase BB" << ((*iterMap).first)->getNumber() << "\n"; ++ } ++ (*iterMap).first->eraseFromParent(); //Remove from the parent Function. ++ } ++ delete (*iterMap).second; ++ } ++ blockInfoMap.clear(); ++ ++ // clear loopLandInfoMap ++ for (typename LoopLandInfoMap::iterator iterMap = loopLandInfoMap.begin(), ++ iterEndMap = loopLandInfoMap.end(); iterMap != iterEndMap; ++iterMap) { ++ delete (*iterMap).second; ++ } ++ loopLandInfoMap.clear(); ++ ++ if (DEBUGME) { ++ func.viewCFG(); ++ } ++ ++ if (!finish) { ++ assert(!"IRREDUCIBL_CF"); ++ } ++ ++ return true; ++} //CFGStructurizer::run ++ ++/// Print the ordered Blocks. ++/// ++template ++void CFGStructurizer::printOrderedBlocks(llvm::raw_ostream &os) { ++ size_t i = 0; ++ for (typename SmallVector::const_iterator ++ iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end(); ++ iterBlk != iterBlkEnd; ++ ++iterBlk, ++i) { ++ os << "BB" << (*iterBlk)->getNumber(); ++ os << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")"; ++ if (i != 0 && i % 10 == 0) { ++ os << "\n"; ++ } else { ++ os << " "; ++ } ++ } ++} //printOrderedBlocks ++ ++/// Compute the reversed DFS post order of Blocks ++/// ++template void CFGStructurizer::orderBlocks() { ++ int sccNum = 0; ++ BlockT *bb; ++ for (scc_iterator sccIter = scc_begin(funcRep), ++ sccEnd = scc_end(funcRep); sccIter != sccEnd; ++sccIter, ++sccNum) { ++ std::vector &sccNext = *sccIter; ++ for (typename std::vector::const_iterator ++ blockIter = sccNext.begin(), blockEnd = sccNext.end(); ++ blockIter != blockEnd; ++blockIter) { ++ bb = *blockIter; ++ orderedBlks.push_back(bb); ++ recordSccnum(bb, sccNum); ++ } ++ } ++ ++ //walk through all the block in func to check for unreachable 
++ for (BlockIterator blockIter1 = FuncGTraits::nodes_begin(funcRep), ++ blockEnd1 = FuncGTraits::nodes_end(funcRep); ++ blockIter1 != blockEnd1; ++blockIter1) { ++ BlockT *bb = &(*blockIter1); ++ sccNum = getSCCNum(bb); ++ if (sccNum == INVALIDSCCNUM) { ++ errs() << "unreachable block BB" << bb->getNumber() << "\n"; ++ } ++ } ++} //orderBlocks ++ ++template int CFGStructurizer::patternMatch(BlockT *curBlk) { ++ int numMatch = 0; ++ int curMatch; ++ ++ if (DEBUGME) { ++ errs() << "Begin patternMatch BB" << curBlk->getNumber() << "\n"; ++ } ++ ++ while ((curMatch = patternMatchGroup(curBlk)) > 0) { ++ numMatch += curMatch; ++ } ++ ++ if (DEBUGME) { ++ errs() << "End patternMatch BB" << curBlk->getNumber() ++ << ", numMatch = " << numMatch << "\n"; ++ } ++ ++ return numMatch; ++} //patternMatch ++ ++template ++int CFGStructurizer::patternMatchGroup(BlockT *curBlk) { ++ int numMatch = 0; ++ numMatch += serialPatternMatch(curBlk); ++ numMatch += ifPatternMatch(curBlk); ++ numMatch += loopendPatternMatch(curBlk); ++ numMatch += loopPatternMatch(curBlk); ++ return numMatch; ++}//patternMatchGroup ++ ++template ++int CFGStructurizer::serialPatternMatch(BlockT *curBlk) { ++ if (curBlk->succ_size() != 1) { ++ return 0; ++ } ++ ++ BlockT *childBlk = *curBlk->succ_begin(); ++ if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) { ++ return 0; ++ } ++ ++ mergeSerialBlock(curBlk, childBlk); ++ ++numSerialPatternMatch; ++ return 1; ++} //serialPatternMatch ++ ++template ++int CFGStructurizer::ifPatternMatch(BlockT *curBlk) { ++ //two edges ++ if (curBlk->succ_size() != 2) { ++ return 0; ++ } ++ ++ if (hasBackEdge(curBlk)) { ++ return 0; ++ } ++ ++ InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(curBlk); ++ if (branchInstr == NULL) { ++ return 0; ++ } ++ ++ assert(CFGTraits::isCondBranch(branchInstr)); ++ ++ BlockT *trueBlk = CFGTraits::getTrueBranch(branchInstr); ++ BlockT *falseBlk = CFGTraits::getFalseBranch(curBlk, branchInstr); ++ BlockT *landBlk; ++ int 
cloned = 0; ++ ++ // TODO: Simplify ++ if (trueBlk->succ_size() == 1 && falseBlk->succ_size() == 1 ++ && *trueBlk->succ_begin() == *falseBlk->succ_begin()) { ++ landBlk = *trueBlk->succ_begin(); ++ } else if (trueBlk->succ_size() == 0 && falseBlk->succ_size() == 0) { ++ landBlk = NULL; ++ } else if (trueBlk->succ_size() == 1 && *trueBlk->succ_begin() == falseBlk) { ++ landBlk = falseBlk; ++ falseBlk = NULL; ++ } else if (falseBlk->succ_size() == 1 ++ && *falseBlk->succ_begin() == trueBlk) { ++ landBlk = trueBlk; ++ trueBlk = NULL; ++ } else if (falseBlk->succ_size() == 1 ++ && isSameloopDetachedContbreak(trueBlk, falseBlk)) { ++ landBlk = *falseBlk->succ_begin(); ++ } else if (trueBlk->succ_size() == 1 ++ && isSameloopDetachedContbreak(falseBlk, trueBlk)) { ++ landBlk = *trueBlk->succ_begin(); ++ } else { ++ return handleJumpintoIf(curBlk, trueBlk, falseBlk); ++ } ++ ++ // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the ++ // new BB created for landBlk==NULL may introduce new challenge to the ++ // reduction process. 
++ if (landBlk != NULL && ++ ((trueBlk && trueBlk->pred_size() > 1) ++ || (falseBlk && falseBlk->pred_size() > 1))) { ++ cloned += improveSimpleJumpintoIf(curBlk, trueBlk, falseBlk, &landBlk); ++ } ++ ++ if (trueBlk && trueBlk->pred_size() > 1) { ++ trueBlk = cloneBlockForPredecessor(trueBlk, curBlk); ++ ++cloned; ++ } ++ ++ if (falseBlk && falseBlk->pred_size() > 1) { ++ falseBlk = cloneBlockForPredecessor(falseBlk, curBlk); ++ ++cloned; ++ } ++ ++ mergeIfthenelseBlock(branchInstr, curBlk, trueBlk, falseBlk, landBlk); ++ ++ ++numIfPatternMatch; ++ ++ numClonedBlock += cloned; ++ ++ return 1 + cloned; ++} //ifPatternMatch ++ ++template ++int CFGStructurizer::switchPatternMatch(BlockT *curBlk) { ++ return 0; ++} //switchPatternMatch ++ ++template ++int CFGStructurizer::loopendPatternMatch(BlockT *curBlk) { ++ LoopT *loopRep = loopInfo->getLoopFor(curBlk); ++ typename std::vector nestedLoops; ++ while (loopRep) { ++ nestedLoops.push_back(loopRep); ++ loopRep = loopRep->getParentLoop(); ++ } ++ ++ if (nestedLoops.size() == 0) { ++ return 0; ++ } ++ ++ // Process nested loop outside->inside, so "continue" to a outside loop won't ++ // be mistaken as "break" of the current loop. 
++ int num = 0; ++ for (typename std::vector::reverse_iterator ++ iter = nestedLoops.rbegin(), iterEnd = nestedLoops.rend(); ++ iter != iterEnd; ++iter) { ++ loopRep = *iter; ++ ++ if (getLoopLandBlock(loopRep) != NULL) { ++ continue; ++ } ++ ++ BlockT *loopHeader = loopRep->getHeader(); ++ ++ int numBreak = loopbreakPatternMatch(loopRep, loopHeader); ++ ++ if (numBreak == -1) { ++ break; ++ } ++ ++ int numCont = loopcontPatternMatch(loopRep, loopHeader); ++ num += numBreak + numCont; ++ } ++ ++ return num; ++} //loopendPatternMatch ++ ++template ++int CFGStructurizer::loopPatternMatch(BlockT *curBlk) { ++ if (curBlk->succ_size() != 0) { ++ return 0; ++ } ++ ++ int numLoop = 0; ++ LoopT *loopRep = loopInfo->getLoopFor(curBlk); ++ while (loopRep && loopRep->getHeader() == curBlk) { ++ LoopLandInfo *loopLand = getLoopLandInfo(loopRep); ++ if (loopLand) { ++ BlockT *landBlk = loopLand->landBlk; ++ assert(landBlk); ++ if (!isRetiredBlock(landBlk)) { ++ mergeLooplandBlock(curBlk, loopLand); ++ ++numLoop; ++ } ++ } ++ loopRep = loopRep->getParentLoop(); ++ } ++ ++ numLoopPatternMatch += numLoop; ++ ++ return numLoop; ++} //loopPatternMatch ++ ++template ++int CFGStructurizer::loopbreakPatternMatch(LoopT *loopRep, ++ BlockT *loopHeader) { ++ BlockTSmallerVector exitingBlks; ++ loopRep->getExitingBlocks(exitingBlks); ++ ++ if (DEBUGME) { ++ errs() << "Loop has " << exitingBlks.size() << " exiting blocks\n"; ++ } ++ ++ if (exitingBlks.size() == 0) { ++ setLoopLandBlock(loopRep); ++ return 0; ++ } ++ ++ // Compute the corresponding exitBlks and exit block set. 
++ BlockTSmallerVector exitBlks; ++ std::set exitBlkSet; ++ for (typename BlockTSmallerVector::const_iterator iter = exitingBlks.begin(), ++ iterEnd = exitingBlks.end(); iter != iterEnd; ++iter) { ++ BlockT *exitingBlk = *iter; ++ BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk); ++ exitBlks.push_back(exitBlk); ++ exitBlkSet.insert(exitBlk); //non-duplicate insert ++ } ++ ++ assert(exitBlkSet.size() > 0); ++ assert(exitBlks.size() == exitingBlks.size()); ++ ++ if (DEBUGME) { ++ errs() << "Loop has " << exitBlkSet.size() << " exit blocks\n"; ++ } ++ ++ // Find exitLandBlk. ++ BlockT *exitLandBlk = NULL; ++ int numCloned = 0; ++ int numSerial = 0; ++ ++ if (exitBlkSet.size() == 1) { ++ exitLandBlk = *exitBlkSet.begin(); ++ } else { ++ exitLandBlk = findNearestCommonPostDom(exitBlkSet); ++ ++ if (exitLandBlk == NULL) { ++ return -1; ++ } ++ ++ bool allInPath = true; ++ bool allNotInPath = true; ++ for (typename std::set::const_iterator ++ iter = exitBlkSet.begin(), ++ iterEnd = exitBlkSet.end(); ++ iter != iterEnd; ++iter) { ++ BlockT *exitBlk = *iter; ++ ++ PathToKind pathKind = singlePathTo(exitBlk, exitLandBlk, true); ++ if (DEBUGME) { ++ errs() << "BB" << exitBlk->getNumber() ++ << " to BB" << exitLandBlk->getNumber() << " PathToKind=" ++ << pathKind << "\n"; ++ } ++ ++ allInPath = allInPath && (pathKind == SinglePath_InPath); ++ allNotInPath = allNotInPath && (pathKind == SinglePath_NotInPath); ++ ++ if (!allInPath && !allNotInPath) { ++ if (DEBUGME) { ++ errs() << "singlePath check fail\n"; ++ } ++ return -1; ++ } ++ } // check all exit blocks ++ ++ if (allNotInPath) { ++ ++ // TODO: Simplify, maybe separate function? 
++ LoopT *parentLoopRep = loopRep->getParentLoop(); ++ BlockT *parentLoopHeader = NULL; ++ if (parentLoopRep) ++ parentLoopHeader = parentLoopRep->getHeader(); ++ ++ if (exitLandBlk == parentLoopHeader && ++ (exitLandBlk = relocateLoopcontBlock(parentLoopRep, ++ loopRep, ++ exitBlkSet, ++ exitLandBlk)) != NULL) { ++ if (DEBUGME) { ++ errs() << "relocateLoopcontBlock success\n"; ++ } ++ } else if ((exitLandBlk = addLoopEndbranchBlock(loopRep, ++ exitingBlks, ++ exitBlks)) != NULL) { ++ if (DEBUGME) { ++ errs() << "insertEndbranchBlock success\n"; ++ } ++ } else { ++ if (DEBUGME) { ++ errs() << "loop exit fail\n"; ++ } ++ return -1; ++ } ++ } ++ ++ // Handle side entry to exit path. ++ exitBlks.clear(); ++ exitBlkSet.clear(); ++ for (typename BlockTSmallerVector::iterator iterExiting = ++ exitingBlks.begin(), ++ iterExitingEnd = exitingBlks.end(); ++ iterExiting != iterExitingEnd; ++iterExiting) { ++ BlockT *exitingBlk = *iterExiting; ++ BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk); ++ BlockT *newExitBlk = exitBlk; ++ ++ if (exitBlk != exitLandBlk && exitBlk->pred_size() > 1) { ++ newExitBlk = cloneBlockForPredecessor(exitBlk, exitingBlk); ++ ++numCloned; ++ } ++ ++ numCloned += cloneOnSideEntryTo(exitingBlk, newExitBlk, exitLandBlk); ++ ++ exitBlks.push_back(newExitBlk); ++ exitBlkSet.insert(newExitBlk); ++ } ++ ++ for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(), ++ iterExitEnd = exitBlks.end(); ++ iterExit != iterExitEnd; ++iterExit) { ++ BlockT *exitBlk = *iterExit; ++ numSerial += serialPatternMatch(exitBlk); ++ } ++ ++ for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(), ++ iterExitEnd = exitBlks.end(); ++ iterExit != iterExitEnd; ++iterExit) { ++ BlockT *exitBlk = *iterExit; ++ if (exitBlk->pred_size() > 1) { ++ if (exitBlk != exitLandBlk) { ++ return -1; ++ } ++ } else { ++ if (exitBlk != exitLandBlk && ++ (exitBlk->succ_size() != 1 || ++ *exitBlk->succ_begin() != exitLandBlk)) { ++ return -1; ++ } 
++ } ++ } ++ } // else ++ ++ exitLandBlk = recordLoopLandBlock(loopRep, exitLandBlk, exitBlks, exitBlkSet); ++ ++ // Fold break into the breaking block. Leverage across level breaks. ++ assert(exitingBlks.size() == exitBlks.size()); ++ for (typename BlockTSmallerVector::const_iterator iterExit = exitBlks.begin(), ++ iterExiting = exitingBlks.begin(), iterExitEnd = exitBlks.end(); ++ iterExit != iterExitEnd; ++iterExit, ++iterExiting) { ++ BlockT *exitBlk = *iterExit; ++ BlockT *exitingBlk = *iterExiting; ++ assert(exitBlk->pred_size() == 1 || exitBlk == exitLandBlk); ++ LoopT *exitingLoop = loopInfo->getLoopFor(exitingBlk); ++ handleLoopbreak(exitingBlk, exitingLoop, exitBlk, loopRep, exitLandBlk); ++ } ++ ++ int numBreak = static_cast(exitingBlks.size()); ++ numLoopbreakPatternMatch += numBreak; ++ numClonedBlock += numCloned; ++ return numBreak + numSerial + numCloned; ++} //loopbreakPatternMatch ++ ++template ++int CFGStructurizer::loopcontPatternMatch(LoopT *loopRep, ++ BlockT *loopHeader) { ++ int numCont = 0; ++ SmallVector contBlk; ++ for (typename InvBlockGTraits::ChildIteratorType iter = ++ InvBlockGTraits::child_begin(loopHeader), ++ iterEnd = InvBlockGTraits::child_end(loopHeader); ++ iter != iterEnd; ++iter) { ++ BlockT *curBlk = *iter; ++ if (loopRep->contains(curBlk)) { ++ handleLoopcontBlock(curBlk, loopInfo->getLoopFor(curBlk), ++ loopHeader, loopRep); ++ contBlk.push_back(curBlk); ++ ++numCont; ++ } ++ } ++ ++ for (typename SmallVector::iterator ++ iter = contBlk.begin(), iterEnd = contBlk.end(); ++ iter != iterEnd; ++iter) { ++ (*iter)->removeSuccessor(loopHeader); ++ } ++ ++ numLoopcontPatternMatch += numCont; ++ ++ return numCont; ++} //loopcontPatternMatch ++ ++ ++template ++bool CFGStructurizer::isSameloopDetachedContbreak(BlockT *src1Blk, ++ BlockT *src2Blk) { ++ // return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in the ++ // same loop with LoopLandInfo without explicitly keeping track of ++ // loopContBlks and 
loopBreakBlks, this is a method to get the information. ++ // ++ if (src1Blk->succ_size() == 0) { ++ LoopT *loopRep = loopInfo->getLoopFor(src1Blk); ++ if (loopRep != NULL && loopRep == loopInfo->getLoopFor(src2Blk)) { ++ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; ++ if (theEntry != NULL) { ++ if (DEBUGME) { ++ errs() << "isLoopContBreakBlock yes src1 = BB" ++ << src1Blk->getNumber() ++ << " src2 = BB" << src2Blk->getNumber() << "\n"; ++ } ++ return true; ++ } ++ } ++ } ++ return false; ++} //isSameloopDetachedContbreak ++ ++template ++int CFGStructurizer::handleJumpintoIf(BlockT *headBlk, ++ BlockT *trueBlk, ++ BlockT *falseBlk) { ++ int num = handleJumpintoIfImp(headBlk, trueBlk, falseBlk); ++ if (num == 0) { ++ if (DEBUGME) { ++ errs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n"; ++ } ++ num = handleJumpintoIfImp(headBlk, falseBlk, trueBlk); ++ } ++ return num; ++} ++ ++template ++int CFGStructurizer::handleJumpintoIfImp(BlockT *headBlk, ++ BlockT *trueBlk, ++ BlockT *falseBlk) { ++ int num = 0; ++ BlockT *downBlk; ++ ++ //trueBlk could be the common post dominator ++ downBlk = trueBlk; ++ ++ if (DEBUGME) { ++ errs() << "handleJumpintoIfImp head = BB" << headBlk->getNumber() ++ << " true = BB" << trueBlk->getNumber() ++ << ", numSucc=" << trueBlk->succ_size() ++ << " false = BB" << falseBlk->getNumber() << "\n"; ++ } ++ ++ while (downBlk) { ++ if (DEBUGME) { ++ errs() << "check down = BB" << downBlk->getNumber(); ++ } ++ ++ if (singlePathTo(falseBlk, downBlk) == SinglePath_InPath) { ++ if (DEBUGME) { ++ errs() << " working\n"; ++ } ++ ++ num += cloneOnSideEntryTo(headBlk, trueBlk, downBlk); ++ num += cloneOnSideEntryTo(headBlk, falseBlk, downBlk); ++ ++ numClonedBlock += num; ++ num += serialPatternMatch(*headBlk->succ_begin()); ++ num += serialPatternMatch(*(++headBlk->succ_begin())); ++ num += ifPatternMatch(headBlk); ++ assert(num > 0); ++ ++ break; ++ } ++ if (DEBUGME) { ++ errs() << " not working\n"; ++ } ++ downBlk = 
(downBlk->succ_size() == 1) ? (*downBlk->succ_begin()) : NULL; ++ } // walk down the postDomTree ++ ++ return num; ++} //handleJumpintoIf ++ ++template ++void CFGStructurizer::showImproveSimpleJumpintoIf(BlockT *headBlk, ++ BlockT *trueBlk, ++ BlockT *falseBlk, ++ BlockT *landBlk, ++ bool detail) { ++ errs() << "head = BB" << headBlk->getNumber() ++ << " size = " << headBlk->size(); ++ if (detail) { ++ errs() << "\n"; ++ headBlk->print(errs()); ++ errs() << "\n"; ++ } ++ ++ if (trueBlk) { ++ errs() << ", true = BB" << trueBlk->getNumber() << " size = " ++ << trueBlk->size() << " numPred = " << trueBlk->pred_size(); ++ if (detail) { ++ errs() << "\n"; ++ trueBlk->print(errs()); ++ errs() << "\n"; ++ } ++ } ++ if (falseBlk) { ++ errs() << ", false = BB" << falseBlk->getNumber() << " size = " ++ << falseBlk->size() << " numPred = " << falseBlk->pred_size(); ++ if (detail) { ++ errs() << "\n"; ++ falseBlk->print(errs()); ++ errs() << "\n"; ++ } ++ } ++ if (landBlk) { ++ errs() << ", land = BB" << landBlk->getNumber() << " size = " ++ << landBlk->size() << " numPred = " << landBlk->pred_size(); ++ if (detail) { ++ errs() << "\n"; ++ landBlk->print(errs()); ++ errs() << "\n"; ++ } ++ } ++ ++ errs() << "\n"; ++} //showImproveSimpleJumpintoIf ++ ++template ++int CFGStructurizer::improveSimpleJumpintoIf(BlockT *headBlk, ++ BlockT *trueBlk, ++ BlockT *falseBlk, ++ BlockT **plandBlk) { ++ bool migrateTrue = false; ++ bool migrateFalse = false; ++ ++ BlockT *landBlk = *plandBlk; ++ ++ assert((trueBlk == NULL || trueBlk->succ_size() <= 1) ++ && (falseBlk == NULL || falseBlk->succ_size() <= 1)); ++ ++ if (trueBlk == falseBlk) { ++ return 0; ++ } ++ ++ migrateTrue = needMigrateBlock(trueBlk); ++ migrateFalse = needMigrateBlock(falseBlk); ++ ++ if (!migrateTrue && !migrateFalse) { ++ return 0; ++ } ++ ++ // If we need to migrate either trueBlk and falseBlk, migrate the rest that ++ // have more than one predecessors. 
without doing this, its predecessor ++ // rather than headBlk will have undefined value in initReg. ++ if (!migrateTrue && trueBlk && trueBlk->pred_size() > 1) { ++ migrateTrue = true; ++ } ++ if (!migrateFalse && falseBlk && falseBlk->pred_size() > 1) { ++ migrateFalse = true; ++ } ++ ++ if (DEBUGME) { ++ errs() << "before improveSimpleJumpintoIf: "; ++ showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0); ++ } ++ ++ // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk ++ // ++ // new: headBlk => if () {initReg = 1; org trueBlk branch} else ++ // {initReg = 0; org falseBlk branch } ++ // => landBlk => if (initReg) {org trueBlk} else {org falseBlk} ++ // => org landBlk ++ // if landBlk->pred_size() > 2, put the about if-else inside ++ // if (initReg !=2) {...} ++ // ++ // add initReg = initVal to headBlk ++ ++ const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); ++ unsigned initReg = ++ funcRep->getRegInfo().createVirtualRegister(I32RC); ++ if (!migrateTrue || !migrateFalse) { ++ int initVal = migrateTrue ? 
0 : 1; ++ CFGTraits::insertAssignInstrBefore(headBlk, passRep, initReg, initVal); ++ } ++ ++ int numNewBlk = 0; ++ ++ if (landBlk == NULL) { ++ landBlk = funcRep->CreateMachineBasicBlock(); ++ funcRep->push_back(landBlk); //insert to function ++ ++ if (trueBlk) { ++ trueBlk->addSuccessor(landBlk); ++ } else { ++ headBlk->addSuccessor(landBlk); ++ } ++ ++ if (falseBlk) { ++ falseBlk->addSuccessor(landBlk); ++ } else { ++ headBlk->addSuccessor(landBlk); ++ } ++ ++ numNewBlk ++; ++ } ++ ++ bool landBlkHasOtherPred = (landBlk->pred_size() > 2); ++ ++ //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL" ++ typename BlockT::iterator insertPos = ++ CFGTraits::getInstrPos ++ (landBlk, CFGTraits::insertInstrBefore(landBlk, AMDGPU::ENDIF, passRep)); ++ ++ if (landBlkHasOtherPred) { ++ unsigned immReg = ++ funcRep->getRegInfo().createVirtualRegister(I32RC); ++ CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 2); ++ unsigned cmpResReg = ++ funcRep->getRegInfo().createVirtualRegister(I32RC); ++ ++ CFGTraits::insertCompareInstrBefore(landBlk, insertPos, passRep, cmpResReg, ++ initReg, immReg); ++ CFGTraits::insertCondBranchBefore(landBlk, insertPos, ++ AMDGPU::IF_PREDICATE_SET, passRep, ++ cmpResReg, DebugLoc()); ++ } ++ ++ CFGTraits::insertCondBranchBefore(landBlk, insertPos, AMDGPU::IF_PREDICATE_SET, ++ passRep, initReg, DebugLoc()); ++ ++ if (migrateTrue) { ++ migrateInstruction(trueBlk, landBlk, insertPos); ++ // need to uncondionally insert the assignment to ensure a path from its ++ // predecessor rather than headBlk has valid value in initReg if ++ // (initVal != 1). 
++ CFGTraits::insertAssignInstrBefore(trueBlk, passRep, initReg, 1); ++ } ++ CFGTraits::insertInstrBefore(insertPos, AMDGPU::ELSE, passRep); ++ ++ if (migrateFalse) { ++ migrateInstruction(falseBlk, landBlk, insertPos); ++ // need to uncondionally insert the assignment to ensure a path from its ++ // predecessor rather than headBlk has valid value in initReg if ++ // (initVal != 0) ++ CFGTraits::insertAssignInstrBefore(falseBlk, passRep, initReg, 0); ++ } ++ ++ if (landBlkHasOtherPred) { ++ // add endif ++ CFGTraits::insertInstrBefore(insertPos, AMDGPU::ENDIF, passRep); ++ ++ // put initReg = 2 to other predecessors of landBlk ++ for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(), ++ predIterEnd = landBlk->pred_end(); predIter != predIterEnd; ++ ++predIter) { ++ BlockT *curBlk = *predIter; ++ if (curBlk != trueBlk && curBlk != falseBlk) { ++ CFGTraits::insertAssignInstrBefore(curBlk, passRep, initReg, 2); ++ } ++ } //for ++ } ++ if (DEBUGME) { ++ errs() << "result from improveSimpleJumpintoIf: "; ++ showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0); ++ } ++ ++ // update landBlk ++ *plandBlk = landBlk; ++ ++ return numNewBlk; ++} //improveSimpleJumpintoIf ++ ++template ++void CFGStructurizer::handleLoopbreak(BlockT *exitingBlk, ++ LoopT *exitingLoop, ++ BlockT *exitBlk, ++ LoopT *exitLoop, ++ BlockT *landBlk) { ++ if (DEBUGME) { ++ errs() << "Trying to break loop-depth = " << getLoopDepth(exitLoop) ++ << " from loop-depth = " << getLoopDepth(exitingLoop) << "\n"; ++ } ++ const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); ++ ++ RegiT initReg = INVALIDREGNUM; ++ if (exitingLoop != exitLoop) { ++ initReg = static_cast ++ (funcRep->getRegInfo().createVirtualRegister(I32RC)); ++ assert(initReg != INVALIDREGNUM); ++ addLoopBreakInitReg(exitLoop, initReg); ++ while (exitingLoop != exitLoop && exitingLoop) { ++ addLoopBreakOnReg(exitingLoop, initReg); ++ exitingLoop = exitingLoop->getParentLoop(); ++ } ++ 
assert(exitingLoop == exitLoop); ++ } ++ ++ mergeLoopbreakBlock(exitingBlk, exitBlk, landBlk, initReg); ++ ++} //handleLoopbreak ++ ++template ++void CFGStructurizer::handleLoopcontBlock(BlockT *contingBlk, ++ LoopT *contingLoop, ++ BlockT *contBlk, ++ LoopT *contLoop) { ++ if (DEBUGME) { ++ errs() << "loopcontPattern cont = BB" << contingBlk->getNumber() ++ << " header = BB" << contBlk->getNumber() << "\n"; ++ ++ errs() << "Trying to continue loop-depth = " ++ << getLoopDepth(contLoop) ++ << " from loop-depth = " << getLoopDepth(contingLoop) << "\n"; ++ } ++ ++ RegiT initReg = INVALIDREGNUM; ++ const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); ++ if (contingLoop != contLoop) { ++ initReg = static_cast ++ (funcRep->getRegInfo().createVirtualRegister(I32RC)); ++ assert(initReg != INVALIDREGNUM); ++ addLoopContInitReg(contLoop, initReg); ++ while (contingLoop && contingLoop->getParentLoop() != contLoop) { ++ addLoopBreakOnReg(contingLoop, initReg); //not addLoopContOnReg ++ contingLoop = contingLoop->getParentLoop(); ++ } ++ assert(contingLoop && contingLoop->getParentLoop() == contLoop); ++ addLoopContOnReg(contingLoop, initReg); ++ } ++ ++ settleLoopcontBlock(contingBlk, contBlk, initReg); ++} //handleLoopcontBlock ++ ++template ++void CFGStructurizer::mergeSerialBlock(BlockT *dstBlk, BlockT *srcBlk) { ++ if (DEBUGME) { ++ errs() << "serialPattern BB" << dstBlk->getNumber() ++ << " <= BB" << srcBlk->getNumber() << "\n"; ++ } ++ dstBlk->splice(dstBlk->end(), srcBlk, srcBlk->begin(), srcBlk->end()); ++ ++ dstBlk->removeSuccessor(srcBlk); ++ CFGTraits::cloneSuccessorList(dstBlk, srcBlk); ++ ++ removeSuccessor(srcBlk); ++ retireBlock(dstBlk, srcBlk); ++} //mergeSerialBlock ++ ++template ++void CFGStructurizer::mergeIfthenelseBlock(InstrT *branchInstr, ++ BlockT *curBlk, ++ BlockT *trueBlk, ++ BlockT *falseBlk, ++ BlockT *landBlk) { ++ if (DEBUGME) { ++ errs() << "ifPattern BB" << curBlk->getNumber(); ++ errs() << "{ "; ++ if (trueBlk) { ++ 
errs() << "BB" << trueBlk->getNumber(); ++ } ++ errs() << " } else "; ++ errs() << "{ "; ++ if (falseBlk) { ++ errs() << "BB" << falseBlk->getNumber(); ++ } ++ errs() << " }\n "; ++ errs() << "landBlock: "; ++ if (landBlk == NULL) { ++ errs() << "NULL"; ++ } else { ++ errs() << "BB" << landBlk->getNumber(); ++ } ++ errs() << "\n"; ++ } ++ ++ int oldOpcode = branchInstr->getOpcode(); ++ DebugLoc branchDL = branchInstr->getDebugLoc(); ++ ++// transform to ++// if cond ++// trueBlk ++// else ++// falseBlk ++// endif ++// landBlk ++ ++ typename BlockT::iterator branchInstrPos = ++ CFGTraits::getInstrPos(curBlk, branchInstr); ++ CFGTraits::insertCondBranchBefore(branchInstrPos, ++ CFGTraits::getBranchNzeroOpcode(oldOpcode), ++ passRep, ++ branchDL); ++ ++ if (trueBlk) { ++ curBlk->splice(branchInstrPos, trueBlk, trueBlk->begin(), trueBlk->end()); ++ curBlk->removeSuccessor(trueBlk); ++ if (landBlk && trueBlk->succ_size()!=0) { ++ trueBlk->removeSuccessor(landBlk); ++ } ++ retireBlock(curBlk, trueBlk); ++ } ++ CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ELSE, passRep); ++ ++ if (falseBlk) { ++ curBlk->splice(branchInstrPos, falseBlk, falseBlk->begin(), ++ falseBlk->end()); ++ curBlk->removeSuccessor(falseBlk); ++ if (landBlk && falseBlk->succ_size() != 0) { ++ falseBlk->removeSuccessor(landBlk); ++ } ++ retireBlock(curBlk, falseBlk); ++ } ++ CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ENDIF, passRep); ++ ++ branchInstr->eraseFromParent(); ++ ++ if (landBlk && trueBlk && falseBlk) { ++ curBlk->addSuccessor(landBlk); ++ } ++ ++} //mergeIfthenelseBlock ++ ++template ++void CFGStructurizer::mergeLooplandBlock(BlockT *dstBlk, ++ LoopLandInfo *loopLand) { ++ BlockT *landBlk = loopLand->landBlk; ++ ++ if (DEBUGME) { ++ errs() << "loopPattern header = BB" << dstBlk->getNumber() ++ << " land = BB" << landBlk->getNumber() << "\n"; ++ } ++ ++ // Loop contInitRegs are init at the beginning of the loop. 
++ for (typename std::set::const_iterator iter = ++ loopLand->contInitRegs.begin(), ++ iterEnd = loopLand->contInitRegs.end(); iter != iterEnd; ++iter) { ++ CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0); ++ } ++ ++ /* we last inserterd the DebugLoc in the ++ * BREAK_LOGICALZ_i32 or AMDGPU::BREAK_LOGICALNZ statement in the current dstBlk. ++ * search for the DebugLoc in the that statement. ++ * if not found, we have to insert the empty/default DebugLoc */ ++ InstrT *loopBreakInstr = CFGTraits::getLoopBreakInstr(dstBlk); ++ DebugLoc DLBreak = (loopBreakInstr) ? loopBreakInstr->getDebugLoc() : DebugLoc(); ++ ++ CFGTraits::insertInstrBefore(dstBlk, AMDGPU::WHILELOOP, passRep, DLBreak); ++ // Loop breakInitRegs are init before entering the loop. ++ for (typename std::set::const_iterator iter = ++ loopLand->breakInitRegs.begin(), ++ iterEnd = loopLand->breakInitRegs.end(); iter != iterEnd; ++iter) { ++ CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0); ++ } ++ // Loop endbranchInitRegs are init before entering the loop. ++ for (typename std::set::const_iterator iter = ++ loopLand->endbranchInitRegs.begin(), ++ iterEnd = loopLand->endbranchInitRegs.end(); iter != iterEnd; ++iter) { ++ CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0); ++ } ++ ++ /* we last inserterd the DebugLoc in the continue statement in the current dstBlk ++ * search for the DebugLoc in the continue statement. ++ * if not found, we have to insert the empty/default DebugLoc */ ++ InstrT *continueInstr = CFGTraits::getContinueInstr(dstBlk); ++ DebugLoc DLContinue = (continueInstr) ? continueInstr->getDebugLoc() : DebugLoc(); ++ ++ CFGTraits::insertInstrEnd(dstBlk, AMDGPU::ENDLOOP, passRep, DLContinue); ++ // Loop breakOnRegs are check after the ENDLOOP: break the loop outside this ++ // loop. 
++ for (typename std::set::const_iterator iter = ++ loopLand->breakOnRegs.begin(), ++ iterEnd = loopLand->breakOnRegs.end(); iter != iterEnd; ++iter) { ++ CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::PREDICATED_BREAK, passRep, ++ *iter); ++ } ++ ++ // Loop contOnRegs are check after the ENDLOOP: cont the loop outside this ++ // loop. ++ for (std::set::const_iterator iter = loopLand->contOnRegs.begin(), ++ iterEnd = loopLand->contOnRegs.end(); iter != iterEnd; ++iter) { ++ CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::CONTINUE_LOGICALNZ_i32, ++ passRep, *iter); ++ } ++ ++ dstBlk->splice(dstBlk->end(), landBlk, landBlk->begin(), landBlk->end()); ++ ++ for (typename BlockT::succ_iterator iter = landBlk->succ_begin(), ++ iterEnd = landBlk->succ_end(); iter != iterEnd; ++iter) { ++ dstBlk->addSuccessor(*iter); // *iter's predecessor is also taken care of. ++ } ++ ++ removeSuccessor(landBlk); ++ retireBlock(dstBlk, landBlk); ++} //mergeLooplandBlock ++ ++template ++void CFGStructurizer::reversePredicateSetter(typename BlockT::iterator I) { ++ while (I--) { ++ if (I->getOpcode() == AMDGPU::PRED_X) { ++ switch (static_cast(I)->getOperand(2).getImm()) { ++ case OPCODE_IS_ZERO_INT: ++ static_cast(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO_INT); ++ return; ++ case OPCODE_IS_NOT_ZERO_INT: ++ static_cast(I)->getOperand(2).setImm(OPCODE_IS_ZERO_INT); ++ return; ++ case OPCODE_IS_ZERO: ++ static_cast(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO); ++ return; ++ case OPCODE_IS_NOT_ZERO: ++ static_cast(I)->getOperand(2).setImm(OPCODE_IS_ZERO); ++ return; ++ default: ++ assert(0 && "PRED_X Opcode invalid!"); ++ } ++ } ++ } ++} ++ ++template ++void CFGStructurizer::mergeLoopbreakBlock(BlockT *exitingBlk, ++ BlockT *exitBlk, ++ BlockT *exitLandBlk, ++ RegiT setReg) { ++ if (DEBUGME) { ++ errs() << "loopbreakPattern exiting = BB" << exitingBlk->getNumber() ++ << " exit = BB" << exitBlk->getNumber() ++ << " land = BB" << exitLandBlk->getNumber() << "\n"; ++ } ++ ++ InstrT *branchInstr 
= CFGTraits::getLoopendBlockBranchInstr(exitingBlk); ++ assert(branchInstr && CFGTraits::isCondBranch(branchInstr)); ++ ++ DebugLoc DL = branchInstr->getDebugLoc(); ++ ++ BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr); ++ ++ // transform exitingBlk to ++ // if ( ) { ++ // exitBlk (if exitBlk != exitLandBlk) ++ // setReg = 1 ++ // break ++ // }endif ++ // successor = {orgSuccessor(exitingBlk) - exitBlk} ++ ++ typename BlockT::iterator branchInstrPos = ++ CFGTraits::getInstrPos(exitingBlk, branchInstr); ++ ++ if (exitBlk == exitLandBlk && setReg == INVALIDREGNUM) { ++ //break_logical ++ ++ if (trueBranch != exitBlk) { ++ reversePredicateSetter(branchInstrPos); ++ } ++ CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL); ++ } else { ++ if (trueBranch != exitBlk) { ++ reversePredicateSetter(branchInstr); ++ } ++ CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL); ++ if (exitBlk != exitLandBlk) { ++ //splice is insert-before ... ++ exitingBlk->splice(branchInstrPos, exitBlk, exitBlk->begin(), ++ exitBlk->end()); ++ } ++ if (setReg != INVALIDREGNUM) { ++ CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1); ++ } ++ CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::BREAK, passRep); ++ } //if_logical ++ ++ //now branchInst can be erase safely ++ branchInstr->eraseFromParent(); ++ ++ //now take care of successors, retire blocks ++ exitingBlk->removeSuccessor(exitBlk); ++ if (exitBlk != exitLandBlk) { ++ //splice is insert-before ... 
++ exitBlk->removeSuccessor(exitLandBlk); ++ retireBlock(exitingBlk, exitBlk); ++ } ++ ++} //mergeLoopbreakBlock ++ ++template ++void CFGStructurizer::settleLoopcontBlock(BlockT *contingBlk, ++ BlockT *contBlk, ++ RegiT setReg) { ++ if (DEBUGME) { ++ errs() << "settleLoopcontBlock conting = BB" ++ << contingBlk->getNumber() ++ << ", cont = BB" << contBlk->getNumber() << "\n"; ++ } ++ ++ InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(contingBlk); ++ if (branchInstr) { ++ assert(CFGTraits::isCondBranch(branchInstr)); ++ typename BlockT::iterator branchInstrPos = ++ CFGTraits::getInstrPos(contingBlk, branchInstr); ++ BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr); ++ int oldOpcode = branchInstr->getOpcode(); ++ DebugLoc DL = branchInstr->getDebugLoc(); ++ ++ // transform contingBlk to ++ // if () { ++ // move instr after branchInstr ++ // continue ++ // or ++ // setReg = 1 ++ // break ++ // }endif ++ // successor = {orgSuccessor(contingBlk) - loopHeader} ++ ++ bool useContinueLogical = ++ (setReg == INVALIDREGNUM && (&*contingBlk->rbegin()) == branchInstr); ++ ++ if (useContinueLogical == false) { ++ int branchOpcode = ++ trueBranch == contBlk ? CFGTraits::getBranchNzeroOpcode(oldOpcode) ++ : CFGTraits::getBranchZeroOpcode(oldOpcode); ++ ++ CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL); ++ ++ if (setReg != INVALIDREGNUM) { ++ CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1); ++ // insertEnd to ensure phi-moves, if exist, go before the continue-instr. ++ CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, DL); ++ } else { ++ // insertEnd to ensure phi-moves, if exist, go before the continue-instr. ++ CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, DL); ++ } ++ ++ CFGTraits::insertInstrEnd(contingBlk, AMDGPU::ENDIF, passRep, DL); ++ } else { ++ int branchOpcode = ++ trueBranch == contBlk ? 
CFGTraits::getContinueNzeroOpcode(oldOpcode) ++ : CFGTraits::getContinueZeroOpcode(oldOpcode); ++ ++ CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL); ++ } ++ ++ branchInstr->eraseFromParent(); ++ } else { ++ // if we've arrived here then we've already erased the branch instruction ++ // travel back up the basic block to see the last reference of our debug location ++ // we've just inserted that reference here so it should be representative ++ if (setReg != INVALIDREGNUM) { ++ CFGTraits::insertAssignInstrBefore(contingBlk, passRep, setReg, 1); ++ // insertEnd to ensure phi-moves, if exist, go before the continue-instr. ++ CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, CFGTraits::getLastDebugLocInBB(contingBlk)); ++ } else { ++ // insertEnd to ensure phi-moves, if exist, go before the continue-instr. ++ CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, CFGTraits::getLastDebugLocInBB(contingBlk)); ++ } ++ } //else ++ ++} //settleLoopcontBlock ++ ++// BBs in exitBlkSet are determined as in break-path for loopRep, ++// before we can put code for BBs as inside loop-body for loopRep ++// check whether those BBs are determined as cont-BB for parentLoopRep ++// earlier. 
++// If so, generate a new BB newBlk ++// (1) set newBlk common successor of BBs in exitBlkSet ++// (2) change the continue-instr in BBs in exitBlkSet to break-instr ++// (3) generate continue-instr in newBlk ++// ++template ++typename CFGStructurizer::BlockT * ++CFGStructurizer::relocateLoopcontBlock(LoopT *parentLoopRep, ++ LoopT *loopRep, ++ std::set &exitBlkSet, ++ BlockT *exitLandBlk) { ++ std::set endBlkSet; ++ ++ ++ ++ for (typename std::set::const_iterator iter = exitBlkSet.begin(), ++ iterEnd = exitBlkSet.end(); ++ iter != iterEnd; ++iter) { ++ BlockT *exitBlk = *iter; ++ BlockT *endBlk = singlePathEnd(exitBlk, exitLandBlk); ++ ++ if (endBlk == NULL || CFGTraits::getContinueInstr(endBlk) == NULL) ++ return NULL; ++ ++ endBlkSet.insert(endBlk); ++ } ++ ++ BlockT *newBlk = funcRep->CreateMachineBasicBlock(); ++ funcRep->push_back(newBlk); //insert to function ++ CFGTraits::insertInstrEnd(newBlk, AMDGPU::CONTINUE, passRep); ++ SHOWNEWBLK(newBlk, "New continue block: "); ++ ++ for (typename std::set::const_iterator iter = endBlkSet.begin(), ++ iterEnd = endBlkSet.end(); ++ iter != iterEnd; ++iter) { ++ BlockT *endBlk = *iter; ++ InstrT *contInstr = CFGTraits::getContinueInstr(endBlk); ++ if (contInstr) { ++ contInstr->eraseFromParent(); ++ } ++ endBlk->addSuccessor(newBlk); ++ if (DEBUGME) { ++ errs() << "Add new continue Block to BB" ++ << endBlk->getNumber() << " successors\n"; ++ } ++ } ++ ++ return newBlk; ++} //relocateLoopcontBlock ++ ++ ++// LoopEndbranchBlock is a BB created by the CFGStructurizer to use as ++// LoopLandBlock. This BB branch on the loop endBranchInit register to the ++// pathes corresponding to the loop exiting branches. 
++ ++template ++typename CFGStructurizer::BlockT * ++CFGStructurizer::addLoopEndbranchBlock(LoopT *loopRep, ++ BlockTSmallerVector &exitingBlks, ++ BlockTSmallerVector &exitBlks) { ++ const AMDGPUInstrInfo *tii = ++ static_cast(passRep->getTargetInstrInfo()); ++ const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); ++ ++ RegiT endBranchReg = static_cast ++ (funcRep->getRegInfo().createVirtualRegister(I32RC)); ++ assert(endBranchReg >= 0); ++ ++ // reg = 0 before entering the loop ++ addLoopEndbranchInitReg(loopRep, endBranchReg); ++ ++ uint32_t numBlks = static_cast(exitingBlks.size()); ++ assert(numBlks >=2 && numBlks == exitBlks.size()); ++ ++ BlockT *preExitingBlk = exitingBlks[0]; ++ BlockT *preExitBlk = exitBlks[0]; ++ BlockT *preBranchBlk = funcRep->CreateMachineBasicBlock(); ++ funcRep->push_back(preBranchBlk); //insert to function ++ SHOWNEWBLK(preBranchBlk, "New loopEndbranch block: "); ++ ++ BlockT *newLandBlk = preBranchBlk; ++ ++ CFGTraits::replaceInstrUseOfBlockWith(preExitingBlk, preExitBlk, ++ newLandBlk); ++ preExitingBlk->removeSuccessor(preExitBlk); ++ preExitingBlk->addSuccessor(newLandBlk); ++ ++ //it is redundant to add reg = 0 to exitingBlks[0] ++ ++ // For 1..n th exiting path (the last iteration handles two pathes) create the ++ // branch to the previous path and the current path. ++ for (uint32_t i = 1; i < numBlks; ++i) { ++ BlockT *curExitingBlk = exitingBlks[i]; ++ BlockT *curExitBlk = exitBlks[i]; ++ BlockT *curBranchBlk; ++ ++ if (i == numBlks - 1) { ++ curBranchBlk = curExitBlk; ++ } else { ++ curBranchBlk = funcRep->CreateMachineBasicBlock(); ++ funcRep->push_back(curBranchBlk); //insert to function ++ SHOWNEWBLK(curBranchBlk, "New loopEndbranch block: "); ++ } ++ ++ // Add reg = i to exitingBlks[i]. ++ CFGTraits::insertAssignInstrBefore(curExitingBlk, passRep, ++ endBranchReg, i); ++ ++ // Remove the edge (exitingBlks[i] exitBlks[i]) add new edge ++ // (exitingBlks[i], newLandBlk). 
++ CFGTraits::replaceInstrUseOfBlockWith(curExitingBlk, curExitBlk, ++ newLandBlk); ++ curExitingBlk->removeSuccessor(curExitBlk); ++ curExitingBlk->addSuccessor(newLandBlk); ++ ++ // add to preBranchBlk the branch instruction: ++ // if (endBranchReg == preVal) ++ // preExitBlk ++ // else ++ // curBranchBlk ++ // ++ // preValReg = i - 1 ++ ++ DebugLoc DL; ++ RegiT preValReg = static_cast ++ (funcRep->getRegInfo().createVirtualRegister(I32RC)); ++ ++ preBranchBlk->insert(preBranchBlk->begin(), ++ tii->getMovImmInstr(preBranchBlk->getParent(), preValReg, ++ i - 1)); ++ ++ // condResReg = (endBranchReg == preValReg) ++ RegiT condResReg = static_cast ++ (funcRep->getRegInfo().createVirtualRegister(I32RC)); ++ BuildMI(preBranchBlk, DL, tii->get(tii->getIEQOpcode()), condResReg) ++ .addReg(endBranchReg).addReg(preValReg); ++ ++ BuildMI(preBranchBlk, DL, tii->get(AMDGPU::BRANCH_COND_i32)) ++ .addMBB(preExitBlk).addReg(condResReg); ++ ++ preBranchBlk->addSuccessor(preExitBlk); ++ preBranchBlk->addSuccessor(curBranchBlk); ++ ++ // Update preExitingBlk, preExitBlk, preBranchBlk. ++ preExitingBlk = curExitingBlk; ++ preExitBlk = curExitBlk; ++ preBranchBlk = curBranchBlk; ++ ++ } //end for 1 .. 
n blocks ++ ++ return newLandBlk; ++} //addLoopEndbranchBlock ++ ++template ++typename CFGStructurizer::PathToKind ++CFGStructurizer::singlePathTo(BlockT *srcBlk, BlockT *dstBlk, ++ bool allowSideEntry) { ++ assert(dstBlk); ++ ++ if (srcBlk == dstBlk) { ++ return SinglePath_InPath; ++ } ++ ++ while (srcBlk && srcBlk->succ_size() == 1) { ++ srcBlk = *srcBlk->succ_begin(); ++ if (srcBlk == dstBlk) { ++ return SinglePath_InPath; ++ } ++ ++ if (!allowSideEntry && srcBlk->pred_size() > 1) { ++ return Not_SinglePath; ++ } ++ } ++ ++ if (srcBlk && srcBlk->succ_size()==0) { ++ return SinglePath_NotInPath; ++ } ++ ++ return Not_SinglePath; ++} //singlePathTo ++ ++// If there is a single path from srcBlk to dstBlk, return the last block before ++// dstBlk If there is a single path from srcBlk->end without dstBlk, return the ++// last block in the path Otherwise, return NULL ++template ++typename CFGStructurizer::BlockT * ++CFGStructurizer::singlePathEnd(BlockT *srcBlk, BlockT *dstBlk, ++ bool allowSideEntry) { ++ assert(dstBlk); ++ ++ if (srcBlk == dstBlk) { ++ return srcBlk; ++ } ++ ++ if (srcBlk->succ_size() == 0) { ++ return srcBlk; ++ } ++ ++ while (srcBlk && srcBlk->succ_size() == 1) { ++ BlockT *preBlk = srcBlk; ++ ++ srcBlk = *srcBlk->succ_begin(); ++ if (srcBlk == NULL) { ++ return preBlk; ++ } ++ ++ if (!allowSideEntry && srcBlk->pred_size() > 1) { ++ return NULL; ++ } ++ } ++ ++ if (srcBlk && srcBlk->succ_size()==0) { ++ return srcBlk; ++ } ++ ++ return NULL; ++ ++} //singlePathEnd ++ ++template ++int CFGStructurizer::cloneOnSideEntryTo(BlockT *preBlk, BlockT *srcBlk, ++ BlockT *dstBlk) { ++ int cloned = 0; ++ assert(preBlk->isSuccessor(srcBlk)); ++ while (srcBlk && srcBlk != dstBlk) { ++ assert(srcBlk->succ_size() == 1); ++ if (srcBlk->pred_size() > 1) { ++ srcBlk = cloneBlockForPredecessor(srcBlk, preBlk); ++ ++cloned; ++ } ++ ++ preBlk = srcBlk; ++ srcBlk = *srcBlk->succ_begin(); ++ } ++ ++ return cloned; ++} //cloneOnSideEntryTo ++ ++template ++typename 
CFGStructurizer::BlockT * ++CFGStructurizer::cloneBlockForPredecessor(BlockT *curBlk, ++ BlockT *predBlk) { ++ assert(predBlk->isSuccessor(curBlk) && ++ "succBlk is not a prececessor of curBlk"); ++ ++ BlockT *cloneBlk = CFGTraits::clone(curBlk); //clone instructions ++ CFGTraits::replaceInstrUseOfBlockWith(predBlk, curBlk, cloneBlk); ++ //srcBlk, oldBlk, newBlk ++ ++ predBlk->removeSuccessor(curBlk); ++ predBlk->addSuccessor(cloneBlk); ++ ++ // add all successor to cloneBlk ++ CFGTraits::cloneSuccessorList(cloneBlk, curBlk); ++ ++ numClonedInstr += curBlk->size(); ++ ++ if (DEBUGME) { ++ errs() << "Cloned block: " << "BB" ++ << curBlk->getNumber() << "size " << curBlk->size() << "\n"; ++ } ++ ++ SHOWNEWBLK(cloneBlk, "result of Cloned block: "); ++ ++ return cloneBlk; ++} //cloneBlockForPredecessor ++ ++template ++typename CFGStructurizer::BlockT * ++CFGStructurizer::exitingBlock2ExitBlock(LoopT *loopRep, ++ BlockT *exitingBlk) { ++ BlockT *exitBlk = NULL; ++ ++ for (typename BlockT::succ_iterator iterSucc = exitingBlk->succ_begin(), ++ iterSuccEnd = exitingBlk->succ_end(); ++ iterSucc != iterSuccEnd; ++iterSucc) { ++ BlockT *curBlk = *iterSucc; ++ if (!loopRep->contains(curBlk)) { ++ assert(exitBlk == NULL); ++ exitBlk = curBlk; ++ } ++ } ++ ++ assert(exitBlk != NULL); ++ ++ return exitBlk; ++} //exitingBlock2ExitBlock ++ ++template ++void CFGStructurizer::migrateInstruction(BlockT *srcBlk, ++ BlockT *dstBlk, ++ InstrIterator insertPos) { ++ InstrIterator spliceEnd; ++ //look for the input branchinstr, not the AMDGPU branchinstr ++ InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk); ++ if (branchInstr == NULL) { ++ if (DEBUGME) { ++ errs() << "migrateInstruction don't see branch instr\n" ; ++ } ++ spliceEnd = srcBlk->end(); ++ } else { ++ if (DEBUGME) { ++ errs() << "migrateInstruction see branch instr\n" ; ++ branchInstr->dump(); ++ } ++ spliceEnd = CFGTraits::getInstrPos(srcBlk, branchInstr); ++ } ++ if (DEBUGME) { ++ errs() << 
"migrateInstruction before splice dstSize = " << dstBlk->size() ++ << "srcSize = " << srcBlk->size() << "\n"; ++ } ++ ++ //splice insert before insertPos ++ dstBlk->splice(insertPos, srcBlk, srcBlk->begin(), spliceEnd); ++ ++ if (DEBUGME) { ++ errs() << "migrateInstruction after splice dstSize = " << dstBlk->size() ++ << "srcSize = " << srcBlk->size() << "\n"; ++ } ++} //migrateInstruction ++ ++// normalizeInfiniteLoopExit change ++// B1: ++// uncond_br LoopHeader ++// ++// to ++// B1: ++// cond_br 1 LoopHeader dummyExit ++// and return the newly added dummy exit block ++// ++template ++typename CFGStructurizer::BlockT * ++CFGStructurizer::normalizeInfiniteLoopExit(LoopT* LoopRep) { ++ BlockT *loopHeader; ++ BlockT *loopLatch; ++ loopHeader = LoopRep->getHeader(); ++ loopLatch = LoopRep->getLoopLatch(); ++ BlockT *dummyExitBlk = NULL; ++ const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); ++ if (loopHeader!=NULL && loopLatch!=NULL) { ++ InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(loopLatch); ++ if (branchInstr!=NULL && CFGTraits::isUncondBranch(branchInstr)) { ++ dummyExitBlk = funcRep->CreateMachineBasicBlock(); ++ funcRep->push_back(dummyExitBlk); //insert to function ++ SHOWNEWBLK(dummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); ++ ++ if (DEBUGME) errs() << "Old branch instr: " << *branchInstr << "\n"; ++ ++ typename BlockT::iterator insertPos = ++ CFGTraits::getInstrPos(loopLatch, branchInstr); ++ unsigned immReg = ++ funcRep->getRegInfo().createVirtualRegister(I32RC); ++ CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 1); ++ InstrT *newInstr = ++ CFGTraits::insertInstrBefore(insertPos, AMDGPU::BRANCH_COND_i32, passRep); ++ MachineInstrBuilder(newInstr).addMBB(loopHeader).addReg(immReg, false); ++ ++ SHOWNEWINSTR(newInstr); ++ ++ branchInstr->eraseFromParent(); ++ loopLatch->addSuccessor(dummyExitBlk); ++ } ++ } ++ ++ return dummyExitBlk; ++} //normalizeInfiniteLoopExit ++ ++template ++void 
CFGStructurizer::removeUnconditionalBranch(BlockT *srcBlk) { ++ InstrT *branchInstr; ++ ++ // I saw two unconditional branch in one basic block in example ++ // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. ++ while ((branchInstr = CFGTraits::getLoopendBlockBranchInstr(srcBlk)) ++ && CFGTraits::isUncondBranch(branchInstr)) { ++ if (DEBUGME) { ++ errs() << "Removing unconditional branch instruction" ; ++ branchInstr->dump(); ++ } ++ branchInstr->eraseFromParent(); ++ } ++} //removeUnconditionalBranch ++ ++template ++void CFGStructurizer::removeRedundantConditionalBranch(BlockT *srcBlk) { ++ if (srcBlk->succ_size() == 2) { ++ BlockT *blk1 = *srcBlk->succ_begin(); ++ BlockT *blk2 = *(++srcBlk->succ_begin()); ++ ++ if (blk1 == blk2) { ++ InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk); ++ assert(branchInstr && CFGTraits::isCondBranch(branchInstr)); ++ if (DEBUGME) { ++ errs() << "Removing unneeded conditional branch instruction" ; ++ branchInstr->dump(); ++ } ++ branchInstr->eraseFromParent(); ++ SHOWNEWBLK(blk1, "Removing redundant successor"); ++ srcBlk->removeSuccessor(blk1); ++ } ++ } ++} //removeRedundantConditionalBranch ++ ++template ++void CFGStructurizer::addDummyExitBlock(SmallVector &retBlks) { ++ BlockT *dummyExitBlk = funcRep->CreateMachineBasicBlock(); ++ funcRep->push_back(dummyExitBlk); //insert to function ++ CFGTraits::insertInstrEnd(dummyExitBlk, AMDGPU::RETURN, passRep); ++ ++ for (typename SmallVector::iterator iter = ++ retBlks.begin(), ++ iterEnd = retBlks.end(); iter != iterEnd; ++iter) { ++ BlockT *curBlk = *iter; ++ InstrT *curInstr = CFGTraits::getReturnInstr(curBlk); ++ if (curInstr) { ++ curInstr->eraseFromParent(); ++ } ++ curBlk->addSuccessor(dummyExitBlk); ++ if (DEBUGME) { ++ errs() << "Add dummyExitBlock to BB" << curBlk->getNumber() ++ << " successors\n"; ++ } ++ } //for ++ ++ SHOWNEWBLK(dummyExitBlk, "DummyExitBlock: "); ++} //addDummyExitBlock ++ ++template ++void 
CFGStructurizer::removeSuccessor(BlockT *srcBlk) { ++ while (srcBlk->succ_size()) { ++ srcBlk->removeSuccessor(*srcBlk->succ_begin()); ++ } ++} ++ ++template ++void CFGStructurizer::recordSccnum(BlockT *srcBlk, int sccNum) { ++ BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk]; ++ ++ if (srcBlkInfo == NULL) { ++ srcBlkInfo = new BlockInfo(); ++ } ++ ++ srcBlkInfo->sccNum = sccNum; ++} ++ ++template ++int CFGStructurizer::getSCCNum(BlockT *srcBlk) { ++ BlockInfo *srcBlkInfo = blockInfoMap[srcBlk]; ++ return srcBlkInfo ? srcBlkInfo->sccNum : INVALIDSCCNUM; ++} ++ ++template ++void CFGStructurizer::retireBlock(BlockT *dstBlk, BlockT *srcBlk) { ++ if (DEBUGME) { ++ errs() << "Retiring BB" << srcBlk->getNumber() << "\n"; ++ } ++ ++ BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk]; ++ ++ if (srcBlkInfo == NULL) { ++ srcBlkInfo = new BlockInfo(); ++ } ++ ++ srcBlkInfo->isRetired = true; ++ assert(srcBlk->succ_size() == 0 && srcBlk->pred_size() == 0 ++ && "can't retire block yet"); ++} ++ ++template ++bool CFGStructurizer::isRetiredBlock(BlockT *srcBlk) { ++ BlockInfo *srcBlkInfo = blockInfoMap[srcBlk]; ++ return (srcBlkInfo && srcBlkInfo->isRetired); ++} ++ ++template ++bool CFGStructurizer::isActiveLoophead(BlockT *curBlk) { ++ LoopT *loopRep = loopInfo->getLoopFor(curBlk); ++ while (loopRep && loopRep->getHeader() == curBlk) { ++ LoopLandInfo *loopLand = getLoopLandInfo(loopRep); ++ ++ if(loopLand == NULL) ++ return true; ++ ++ BlockT *landBlk = loopLand->landBlk; ++ assert(landBlk); ++ if (!isRetiredBlock(landBlk)) { ++ return true; ++ } ++ ++ loopRep = loopRep->getParentLoop(); ++ } ++ ++ return false; ++} //isActiveLoophead ++ ++template ++bool CFGStructurizer::needMigrateBlock(BlockT *blk) { ++ const unsigned blockSizeThreshold = 30; ++ const unsigned cloneInstrThreshold = 100; ++ ++ bool multiplePreds = blk && (blk->pred_size() > 1); ++ ++ if(!multiplePreds) ++ return false; ++ ++ unsigned blkSize = blk->size(); ++ return ((blkSize > blockSizeThreshold) ++ && (blkSize * 
(blk->pred_size() - 1) > cloneInstrThreshold)); ++} //needMigrateBlock ++ ++template ++typename CFGStructurizer::BlockT * ++CFGStructurizer::recordLoopLandBlock(LoopT *loopRep, BlockT *landBlk, ++ BlockTSmallerVector &exitBlks, ++ std::set &exitBlkSet) { ++ SmallVector inpathBlks; //in exit path blocks ++ ++ for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(), ++ predIterEnd = landBlk->pred_end(); ++ predIter != predIterEnd; ++predIter) { ++ BlockT *curBlk = *predIter; ++ if (loopRep->contains(curBlk) || exitBlkSet.count(curBlk)) { ++ inpathBlks.push_back(curBlk); ++ } ++ } //for ++ ++ //if landBlk has predecessors that are not in the given loop, ++ //create a new block ++ BlockT *newLandBlk = landBlk; ++ if (inpathBlks.size() != landBlk->pred_size()) { ++ newLandBlk = funcRep->CreateMachineBasicBlock(); ++ funcRep->push_back(newLandBlk); //insert to function ++ newLandBlk->addSuccessor(landBlk); ++ for (typename SmallVector::iterator iter = ++ inpathBlks.begin(), ++ iterEnd = inpathBlks.end(); iter != iterEnd; ++iter) { ++ BlockT *curBlk = *iter; ++ CFGTraits::replaceInstrUseOfBlockWith(curBlk, landBlk, newLandBlk); ++ //srcBlk, oldBlk, newBlk ++ curBlk->removeSuccessor(landBlk); ++ curBlk->addSuccessor(newLandBlk); ++ } ++ for (size_t i = 0, tot = exitBlks.size(); i < tot; ++i) { ++ if (exitBlks[i] == landBlk) { ++ exitBlks[i] = newLandBlk; ++ } ++ } ++ SHOWNEWBLK(newLandBlk, "NewLandingBlock: "); ++ } ++ ++ setLoopLandBlock(loopRep, newLandBlk); ++ ++ return newLandBlk; ++} // recordLoopbreakLand ++ ++template ++void CFGStructurizer::setLoopLandBlock(LoopT *loopRep, BlockT *blk) { ++ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; ++ ++ if (theEntry == NULL) { ++ theEntry = new LoopLandInfo(); ++ } ++ assert(theEntry->landBlk == NULL); ++ ++ if (blk == NULL) { ++ blk = funcRep->CreateMachineBasicBlock(); ++ funcRep->push_back(blk); //insert to function ++ SHOWNEWBLK(blk, "DummyLandingBlock for loop without break: "); ++ } ++ ++ 
theEntry->landBlk = blk; ++ ++ if (DEBUGME) { ++ errs() << "setLoopLandBlock loop-header = BB" ++ << loopRep->getHeader()->getNumber() ++ << " landing-block = BB" << blk->getNumber() << "\n"; ++ } ++} // setLoopLandBlock ++ ++template ++void CFGStructurizer::addLoopBreakOnReg(LoopT *loopRep, RegiT regNum) { ++ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; ++ ++ if (theEntry == NULL) { ++ theEntry = new LoopLandInfo(); ++ } ++ ++ theEntry->breakOnRegs.insert(regNum); ++ ++ if (DEBUGME) { ++ errs() << "addLoopBreakOnReg loop-header = BB" ++ << loopRep->getHeader()->getNumber() ++ << " regNum = " << regNum << "\n"; ++ } ++} // addLoopBreakOnReg ++ ++template ++void CFGStructurizer::addLoopContOnReg(LoopT *loopRep, RegiT regNum) { ++ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; ++ ++ if (theEntry == NULL) { ++ theEntry = new LoopLandInfo(); ++ } ++ theEntry->contOnRegs.insert(regNum); ++ ++ if (DEBUGME) { ++ errs() << "addLoopContOnReg loop-header = BB" ++ << loopRep->getHeader()->getNumber() ++ << " regNum = " << regNum << "\n"; ++ } ++} // addLoopContOnReg ++ ++template ++void CFGStructurizer::addLoopBreakInitReg(LoopT *loopRep, RegiT regNum) { ++ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; ++ ++ if (theEntry == NULL) { ++ theEntry = new LoopLandInfo(); ++ } ++ theEntry->breakInitRegs.insert(regNum); ++ ++ if (DEBUGME) { ++ errs() << "addLoopBreakInitReg loop-header = BB" ++ << loopRep->getHeader()->getNumber() ++ << " regNum = " << regNum << "\n"; ++ } ++} // addLoopBreakInitReg ++ ++template ++void CFGStructurizer::addLoopContInitReg(LoopT *loopRep, RegiT regNum) { ++ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; ++ ++ if (theEntry == NULL) { ++ theEntry = new LoopLandInfo(); ++ } ++ theEntry->contInitRegs.insert(regNum); ++ ++ if (DEBUGME) { ++ errs() << "addLoopContInitReg loop-header = BB" ++ << loopRep->getHeader()->getNumber() ++ << " regNum = " << regNum << "\n"; ++ } ++} // addLoopContInitReg ++ ++template ++void 
CFGStructurizer::addLoopEndbranchInitReg(LoopT *loopRep, ++ RegiT regNum) { ++ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; ++ ++ if (theEntry == NULL) { ++ theEntry = new LoopLandInfo(); ++ } ++ theEntry->endbranchInitRegs.insert(regNum); ++ ++ if (DEBUGME) { ++ errs() << "addLoopEndbranchInitReg loop-header = BB" ++ << loopRep->getHeader()->getNumber() ++ << " regNum = " << regNum << "\n"; ++ } ++} // addLoopEndbranchInitReg ++ ++template ++typename CFGStructurizer::LoopLandInfo * ++CFGStructurizer::getLoopLandInfo(LoopT *loopRep) { ++ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; ++ ++ return theEntry; ++} // getLoopLandInfo ++ ++template ++typename CFGStructurizer::BlockT * ++CFGStructurizer::getLoopLandBlock(LoopT *loopRep) { ++ LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; ++ ++ return theEntry ? theEntry->landBlk : NULL; ++} // getLoopLandBlock ++ ++ ++template ++bool CFGStructurizer::hasBackEdge(BlockT *curBlk) { ++ LoopT *loopRep = loopInfo->getLoopFor(curBlk); ++ if (loopRep == NULL) ++ return false; ++ ++ BlockT *loopHeader = loopRep->getHeader(); ++ ++ return curBlk->isSuccessor(loopHeader); ++ ++} //hasBackEdge ++ ++template ++unsigned CFGStructurizer::getLoopDepth(LoopT *loopRep) { ++ return loopRep ? loopRep->getLoopDepth() : 0; ++} //getLoopDepth ++ ++template ++int CFGStructurizer::countActiveBlock ++(typename SmallVector::const_iterator iterStart, ++ typename SmallVector::const_iterator iterEnd) { ++ int count = 0; ++ while (iterStart != iterEnd) { ++ if (!isRetiredBlock(*iterStart)) { ++ ++count; ++ } ++ ++iterStart; ++ } ++ ++ return count; ++} //countActiveBlock ++ ++// This is work around solution for findNearestCommonDominator not avaiable to ++// post dom a proper fix should go to Dominators.h. 
++ ++template ++typename CFGStructurizer::BlockT* ++CFGStructurizer::findNearestCommonPostDom(BlockT *blk1, BlockT *blk2) { ++ ++ if (postDomTree->dominates(blk1, blk2)) { ++ return blk1; ++ } ++ if (postDomTree->dominates(blk2, blk1)) { ++ return blk2; ++ } ++ ++ DomTreeNodeT *node1 = postDomTree->getNode(blk1); ++ DomTreeNodeT *node2 = postDomTree->getNode(blk2); ++ ++ // Handle newly cloned node. ++ if (node1 == NULL && blk1->succ_size() == 1) { ++ return findNearestCommonPostDom(*blk1->succ_begin(), blk2); ++ } ++ if (node2 == NULL && blk2->succ_size() == 1) { ++ return findNearestCommonPostDom(blk1, *blk2->succ_begin()); ++ } ++ ++ if (node1 == NULL || node2 == NULL) { ++ return NULL; ++ } ++ ++ node1 = node1->getIDom(); ++ while (node1) { ++ if (postDomTree->dominates(node1, node2)) { ++ return node1->getBlock(); ++ } ++ node1 = node1->getIDom(); ++ } ++ ++ return NULL; ++} ++ ++template ++typename CFGStructurizer::BlockT * ++CFGStructurizer::findNearestCommonPostDom ++(typename std::set &blks) { ++ BlockT *commonDom; ++ typename std::set::const_iterator iter = blks.begin(); ++ typename std::set::const_iterator iterEnd = blks.end(); ++ for (commonDom = *iter; iter != iterEnd && commonDom != NULL; ++iter) { ++ BlockT *curBlk = *iter; ++ if (curBlk != commonDom) { ++ commonDom = findNearestCommonPostDom(curBlk, commonDom); ++ } ++ } ++ ++ if (DEBUGME) { ++ errs() << "Common post dominator for exit blocks is "; ++ if (commonDom) { ++ errs() << "BB" << commonDom->getNumber() << "\n"; ++ } else { ++ errs() << "NULL\n"; ++ } ++ } ++ ++ return commonDom; ++} //findNearestCommonPostDom ++ ++} //end namespace llvm ++ ++//todo: move-end ++ ++ ++//===----------------------------------------------------------------------===// ++// ++// CFGStructurizer for AMDGPU ++// ++//===----------------------------------------------------------------------===// ++ ++ ++using namespace llvmCFGStruct; ++ ++namespace llvm { ++class AMDGPUCFGStructurizer : public MachineFunctionPass { 
++public: ++ typedef MachineInstr InstructionType; ++ typedef MachineFunction FunctionType; ++ typedef MachineBasicBlock BlockType; ++ typedef MachineLoopInfo LoopinfoType; ++ typedef MachineDominatorTree DominatortreeType; ++ typedef MachinePostDominatorTree PostDominatortreeType; ++ typedef MachineDomTreeNode DomTreeNodeType; ++ typedef MachineLoop LoopType; ++ ++protected: ++ TargetMachine &TM; ++ const TargetInstrInfo *TII; ++ const AMDGPURegisterInfo *TRI; ++ ++public: ++ AMDGPUCFGStructurizer(char &pid, TargetMachine &tm); ++ const TargetInstrInfo *getTargetInstrInfo() const; ++ ++private: ++ ++}; ++ ++} //end of namespace llvm ++AMDGPUCFGStructurizer::AMDGPUCFGStructurizer(char &pid, TargetMachine &tm) ++: MachineFunctionPass(pid), TM(tm), TII(tm.getInstrInfo()), ++ TRI(static_cast(tm.getRegisterInfo())) { ++} ++ ++const TargetInstrInfo *AMDGPUCFGStructurizer::getTargetInstrInfo() const { ++ return TII; ++} ++//===----------------------------------------------------------------------===// ++// ++// CFGPrepare ++// ++//===----------------------------------------------------------------------===// ++ ++ ++using namespace llvmCFGStruct; ++ ++namespace llvm { ++class AMDGPUCFGPrepare : public AMDGPUCFGStructurizer { ++public: ++ static char ID; ++ ++public: ++ AMDGPUCFGPrepare(TargetMachine &tm); ++ ++ virtual const char *getPassName() const; ++ virtual void getAnalysisUsage(AnalysisUsage &AU) const; ++ ++ bool runOnMachineFunction(MachineFunction &F); ++ ++private: ++ ++}; ++ ++char AMDGPUCFGPrepare::ID = 0; ++} //end of namespace llvm ++ ++AMDGPUCFGPrepare::AMDGPUCFGPrepare(TargetMachine &tm) ++ : AMDGPUCFGStructurizer(ID, tm ) { ++} ++const char *AMDGPUCFGPrepare::getPassName() const { ++ return "AMD IL Control Flow Graph Preparation Pass"; ++} ++ ++void AMDGPUCFGPrepare::getAnalysisUsage(AnalysisUsage &AU) const { ++ AU.addPreserved(); ++ AU.addRequired(); ++ AU.addRequired(); ++ AU.addRequired(); ++ AU.addRequired(); ++} ++ 
++//===----------------------------------------------------------------------===// ++// ++// CFGPerform ++// ++//===----------------------------------------------------------------------===// ++ ++ ++using namespace llvmCFGStruct; ++ ++namespace llvm { ++class AMDGPUCFGPerform : public AMDGPUCFGStructurizer { ++public: ++ static char ID; ++ ++public: ++ AMDGPUCFGPerform(TargetMachine &tm); ++ virtual const char *getPassName() const; ++ virtual void getAnalysisUsage(AnalysisUsage &AU) const; ++ bool runOnMachineFunction(MachineFunction &F); ++ ++private: ++ ++}; ++ ++char AMDGPUCFGPerform::ID = 0; ++} //end of namespace llvm ++ ++ AMDGPUCFGPerform::AMDGPUCFGPerform(TargetMachine &tm) ++: AMDGPUCFGStructurizer(ID, tm) { ++} ++ ++const char *AMDGPUCFGPerform::getPassName() const { ++ return "AMD IL Control Flow Graph structurizer Pass"; ++} ++ ++void AMDGPUCFGPerform::getAnalysisUsage(AnalysisUsage &AU) const { ++ AU.addPreserved(); ++ AU.addRequired(); ++ AU.addRequired(); ++ AU.addRequired(); ++ AU.addRequired(); ++} ++ ++//===----------------------------------------------------------------------===// ++// ++// CFGStructTraits ++// ++//===----------------------------------------------------------------------===// ++ ++namespace llvmCFGStruct { ++// this class is tailor to the AMDGPU backend ++template<> ++struct CFGStructTraits { ++ typedef int RegiT; ++ ++ static int getBranchNzeroOpcode(int oldOpcode) { ++ switch(oldOpcode) { ++ case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; ++ case AMDGPU::BRANCH_COND_i32: ++ case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; ++ default: ++ assert(0 && "internal error"); ++ } ++ return -1; ++ } ++ ++ static int getBranchZeroOpcode(int oldOpcode) { ++ switch(oldOpcode) { ++ case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; ++ case AMDGPU::BRANCH_COND_i32: ++ case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32; ++ default: ++ assert(0 && "internal error"); ++ } ++ return -1; ++ } ++ ++ static int 
getContinueNzeroOpcode(int oldOpcode) { ++ switch(oldOpcode) { ++ case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; ++ default: ++ assert(0 && "internal error"); ++ }; ++ return -1; ++ } ++ ++ static int getContinueZeroOpcode(int oldOpcode) { ++ switch(oldOpcode) { ++ case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32; ++ default: ++ assert(0 && "internal error"); ++ } ++ return -1; ++ } ++ ++ static MachineBasicBlock *getTrueBranch(MachineInstr *instr) { ++ return instr->getOperand(0).getMBB(); ++ } ++ ++ static void setTrueBranch(MachineInstr *instr, MachineBasicBlock *blk) { ++ instr->getOperand(0).setMBB(blk); ++ } ++ ++ static MachineBasicBlock * ++ getFalseBranch(MachineBasicBlock *blk, MachineInstr *instr) { ++ assert(blk->succ_size() == 2); ++ MachineBasicBlock *trueBranch = getTrueBranch(instr); ++ MachineBasicBlock::succ_iterator iter = blk->succ_begin(); ++ MachineBasicBlock::succ_iterator iterNext = iter; ++ ++iterNext; ++ ++ return (*iter == trueBranch) ? *iterNext : *iter; ++ } ++ ++ static bool isCondBranch(MachineInstr *instr) { ++ switch (instr->getOpcode()) { ++ case AMDGPU::JUMP: ++ return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() != 0; ++ case AMDGPU::BRANCH_COND_i32: ++ case AMDGPU::BRANCH_COND_f32: ++ break; ++ default: ++ return false; ++ } ++ return true; ++ } ++ ++ static bool isUncondBranch(MachineInstr *instr) { ++ switch (instr->getOpcode()) { ++ case AMDGPU::JUMP: ++ return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() == 0; ++ case AMDGPU::BRANCH: ++ return true; ++ default: ++ return false; ++ } ++ return true; ++ } ++ ++ static DebugLoc getLastDebugLocInBB(MachineBasicBlock *blk) { ++ //get DebugLoc from the first MachineBasicBlock instruction with debug info ++ DebugLoc DL; ++ for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end(); ++iter) { ++ MachineInstr *instr = &(*iter); ++ if (instr->getDebugLoc().isUnknown() == false) { ++ DL = instr->getDebugLoc(); ++ } ++ } ++ 
return DL; ++ } ++ ++ static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *blk) { ++ MachineBasicBlock::reverse_iterator iter = blk->rbegin(); ++ MachineInstr *instr = &*iter; ++ if (instr && (isCondBranch(instr) || isUncondBranch(instr))) { ++ return instr; ++ } ++ return NULL; ++ } ++ ++ // The correct naming for this is getPossibleLoopendBlockBranchInstr. ++ // ++ // BB with backward-edge could have move instructions after the branch ++ // instruction. Such move instruction "belong to" the loop backward-edge. ++ // ++ static MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *blk) { ++ const AMDGPUInstrInfo * TII = static_cast( ++ blk->getParent()->getTarget().getInstrInfo()); ++ ++ for (MachineBasicBlock::reverse_iterator iter = blk->rbegin(), ++ iterEnd = blk->rend(); iter != iterEnd; ++iter) { ++ // FIXME: Simplify ++ MachineInstr *instr = &*iter; ++ if (instr) { ++ if (isCondBranch(instr) || isUncondBranch(instr)) { ++ return instr; ++ } else if (!TII->isMov(instr->getOpcode())) { ++ break; ++ } ++ } ++ } ++ return NULL; ++ } ++ ++ static MachineInstr *getReturnInstr(MachineBasicBlock *blk) { ++ MachineBasicBlock::reverse_iterator iter = blk->rbegin(); ++ if (iter != blk->rend()) { ++ MachineInstr *instr = &(*iter); ++ if (instr->getOpcode() == AMDGPU::RETURN) { ++ return instr; ++ } ++ } ++ return NULL; ++ } ++ ++ static MachineInstr *getContinueInstr(MachineBasicBlock *blk) { ++ MachineBasicBlock::reverse_iterator iter = blk->rbegin(); ++ if (iter != blk->rend()) { ++ MachineInstr *instr = &(*iter); ++ if (instr->getOpcode() == AMDGPU::CONTINUE) { ++ return instr; ++ } ++ } ++ return NULL; ++ } ++ ++ static MachineInstr *getLoopBreakInstr(MachineBasicBlock *blk) { ++ for (MachineBasicBlock::iterator iter = blk->begin(); (iter != blk->end()); ++iter) { ++ MachineInstr *instr = &(*iter); ++ if (instr->getOpcode() == AMDGPU::PREDICATED_BREAK) { ++ return instr; ++ } ++ } ++ return NULL; ++ } ++ ++ static bool 
isReturnBlock(MachineBasicBlock *blk) {
++    MachineInstr *instr = getReturnInstr(blk);
++    bool isReturn = (blk->succ_size() == 0); // a block with no successors counts as a return block
++    if (instr) {
++      assert(isReturn);
++    } else if (isReturn) {
++      if (DEBUGME) {
++        errs() << "BB" << blk->getNumber()
++               <<" is return block without RETURN instr\n";
++      }
++    }
++
++    return  isReturn;
++  }
++
++  static MachineBasicBlock::iterator
++  getInstrPos(MachineBasicBlock *blk, MachineInstr *instr) {
++    assert(instr->getParent() == blk && "instruction doesn't belong to block");
++    MachineBasicBlock::iterator iter = blk->begin();
++    MachineBasicBlock::iterator iterEnd = blk->end();
++    while (iter != iterEnd && &(*iter) != instr) { // test for end() first so end() is never dereferenced
++      ++iter;
++    }
++
++    assert(iter != iterEnd);
++    return iter;
++  }//getInstrPos
++
++  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
++                                         AMDGPUCFGStructurizer *passRep) {
++    return insertInstrBefore(blk,newOpcode,passRep,DebugLoc());
++  } //insertInstrBefore
++
++  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
++                                         AMDGPUCFGStructurizer *passRep, DebugLoc DL) {
++    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
++    MachineInstr *newInstr =
++      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
++
++    MachineBasicBlock::iterator res;
++    if (blk->begin() != blk->end()) {
++      blk->insert(blk->begin(), newInstr); // new instruction becomes the first one in blk
++    } else {
++      blk->push_back(newInstr);
++    }
++
++    SHOWNEWINSTR(newInstr);
++
++    return newInstr;
++  } //insertInstrBefore
++
++  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
++                             AMDGPUCFGStructurizer *passRep) {
++    insertInstrEnd(blk,newOpcode,passRep,DebugLoc());
++  } //insertInstrEnd
++
++  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
++                             AMDGPUCFGStructurizer *passRep, DebugLoc DL) {
++    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
++    MachineInstr *newInstr = blk->getParent()
++      ->CreateMachineInstr(tii->get(newOpcode), DL);
++
++    blk->push_back(newInstr);
++    
//assume the instruction doesn't take any reg operand ... ++ ++ SHOWNEWINSTR(newInstr); ++ } //insertInstrEnd ++ ++ static MachineInstr *insertInstrBefore(MachineBasicBlock::iterator instrPos, ++ int newOpcode, ++ AMDGPUCFGStructurizer *passRep) { ++ MachineInstr *oldInstr = &(*instrPos); ++ const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); ++ MachineBasicBlock *blk = oldInstr->getParent(); ++ MachineInstr *newInstr = ++ blk->getParent()->CreateMachineInstr(tii->get(newOpcode), ++ DebugLoc()); ++ ++ blk->insert(instrPos, newInstr); ++ //assume the instruction doesn't take any reg operand ... ++ ++ SHOWNEWINSTR(newInstr); ++ return newInstr; ++ } //insertInstrBefore ++ ++ static void insertCondBranchBefore(MachineBasicBlock::iterator instrPos, ++ int newOpcode, ++ AMDGPUCFGStructurizer *passRep, ++ DebugLoc DL) { ++ MachineInstr *oldInstr = &(*instrPos); ++ const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); ++ MachineBasicBlock *blk = oldInstr->getParent(); ++ MachineInstr *newInstr = ++ blk->getParent()->CreateMachineInstr(tii->get(newOpcode), ++ DL); ++ ++ blk->insert(instrPos, newInstr); ++ MachineInstrBuilder(newInstr).addReg(oldInstr->getOperand(1).getReg(), ++ false); ++ ++ SHOWNEWINSTR(newInstr); ++ //erase later oldInstr->eraseFromParent(); ++ } //insertCondBranchBefore ++ ++ static void insertCondBranchBefore(MachineBasicBlock *blk, ++ MachineBasicBlock::iterator insertPos, ++ int newOpcode, ++ AMDGPUCFGStructurizer *passRep, ++ RegiT regNum, ++ DebugLoc DL) { ++ const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); ++ ++ MachineInstr *newInstr = ++ blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL); ++ ++ //insert before ++ blk->insert(insertPos, newInstr); ++ MachineInstrBuilder(newInstr).addReg(regNum, false); ++ ++ SHOWNEWINSTR(newInstr); ++ } //insertCondBranchBefore ++ ++ static void insertCondBranchEnd(MachineBasicBlock *blk, ++ int newOpcode, ++ AMDGPUCFGStructurizer *passRep, ++ RegiT regNum) { ++ const 
TargetInstrInfo *tii = passRep->getTargetInstrInfo(); ++ MachineInstr *newInstr = ++ blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DebugLoc()); ++ ++ blk->push_back(newInstr); ++ MachineInstrBuilder(newInstr).addReg(regNum, false); ++ ++ SHOWNEWINSTR(newInstr); ++ } //insertCondBranchEnd ++ ++ ++ static void insertAssignInstrBefore(MachineBasicBlock::iterator instrPos, ++ AMDGPUCFGStructurizer *passRep, ++ RegiT regNum, int regVal) { ++ MachineInstr *oldInstr = &(*instrPos); ++ const AMDGPUInstrInfo *tii = ++ static_cast(passRep->getTargetInstrInfo()); ++ MachineBasicBlock *blk = oldInstr->getParent(); ++ MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum, ++ regVal); ++ blk->insert(instrPos, newInstr); ++ ++ SHOWNEWINSTR(newInstr); ++ } //insertAssignInstrBefore ++ ++ static void insertAssignInstrBefore(MachineBasicBlock *blk, ++ AMDGPUCFGStructurizer *passRep, ++ RegiT regNum, int regVal) { ++ const AMDGPUInstrInfo *tii = ++ static_cast(passRep->getTargetInstrInfo()); ++ ++ MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum, ++ regVal); ++ if (blk->begin() != blk->end()) { ++ blk->insert(blk->begin(), newInstr); ++ } else { ++ blk->push_back(newInstr); ++ } ++ ++ SHOWNEWINSTR(newInstr); ++ ++ } //insertInstrBefore ++ ++ static void insertCompareInstrBefore(MachineBasicBlock *blk, ++ MachineBasicBlock::iterator instrPos, ++ AMDGPUCFGStructurizer *passRep, ++ RegiT dstReg, RegiT src1Reg, ++ RegiT src2Reg) { ++ const AMDGPUInstrInfo *tii = ++ static_cast(passRep->getTargetInstrInfo()); ++ MachineInstr *newInstr = ++ blk->getParent()->CreateMachineInstr(tii->get(tii->getIEQOpcode()), DebugLoc()); ++ ++ MachineInstrBuilder(newInstr).addReg(dstReg, RegState::Define); //set target ++ MachineInstrBuilder(newInstr).addReg(src1Reg); //set src value ++ MachineInstrBuilder(newInstr).addReg(src2Reg); //set src value ++ ++ blk->insert(instrPos, newInstr); ++ SHOWNEWINSTR(newInstr); ++ ++ } //insertCompareInstrBefore ++ ++ 
static void cloneSuccessorList(MachineBasicBlock *dstBlk, ++ MachineBasicBlock *srcBlk) { ++ for (MachineBasicBlock::succ_iterator iter = srcBlk->succ_begin(), ++ iterEnd = srcBlk->succ_end(); iter != iterEnd; ++iter) { ++ dstBlk->addSuccessor(*iter); // *iter's predecessor is also taken care of ++ } ++ } //cloneSuccessorList ++ ++ static MachineBasicBlock *clone(MachineBasicBlock *srcBlk) { ++ MachineFunction *func = srcBlk->getParent(); ++ MachineBasicBlock *newBlk = func->CreateMachineBasicBlock(); ++ func->push_back(newBlk); //insert to function ++ for (MachineBasicBlock::iterator iter = srcBlk->begin(), ++ iterEnd = srcBlk->end(); ++ iter != iterEnd; ++iter) { ++ MachineInstr *instr = func->CloneMachineInstr(iter); ++ newBlk->push_back(instr); ++ } ++ return newBlk; ++ } ++ ++ //MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose because ++ //the AMDGPU instruction is not recognized as terminator fix this and retire ++ //this routine ++ static void replaceInstrUseOfBlockWith(MachineBasicBlock *srcBlk, ++ MachineBasicBlock *oldBlk, ++ MachineBasicBlock *newBlk) { ++ MachineInstr *branchInstr = getLoopendBlockBranchInstr(srcBlk); ++ if (branchInstr && isCondBranch(branchInstr) && ++ getTrueBranch(branchInstr) == oldBlk) { ++ setTrueBranch(branchInstr, newBlk); ++ } ++ } ++ ++ static void wrapup(MachineBasicBlock *entryBlk) { ++ assert((!entryBlk->getParent()->getJumpTableInfo() ++ || entryBlk->getParent()->getJumpTableInfo()->isEmpty()) ++ && "found a jump table"); ++ ++ //collect continue right before endloop ++ SmallVector contInstr; ++ MachineBasicBlock::iterator pre = entryBlk->begin(); ++ MachineBasicBlock::iterator iterEnd = entryBlk->end(); ++ MachineBasicBlock::iterator iter = pre; ++ while (iter != iterEnd) { ++ if (pre->getOpcode() == AMDGPU::CONTINUE ++ && iter->getOpcode() == AMDGPU::ENDLOOP) { ++ contInstr.push_back(pre); ++ } ++ pre = iter; ++ ++iter; ++ } //end while ++ ++ //delete continue right before endloop ++ for (unsigned i = 
0; i < contInstr.size(); ++i) { ++ contInstr[i]->eraseFromParent(); ++ } ++ ++ // TODO to fix up jump table so later phase won't be confused. if ++ // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but ++ // there isn't such an interface yet. alternatively, replace all the other ++ // blocks in the jump table with the entryBlk //} ++ ++ } //wrapup ++ ++ static MachineDominatorTree *getDominatorTree(AMDGPUCFGStructurizer &pass) { ++ return &pass.getAnalysis(); ++ } ++ ++ static MachinePostDominatorTree* ++ getPostDominatorTree(AMDGPUCFGStructurizer &pass) { ++ return &pass.getAnalysis(); ++ } ++ ++ static MachineLoopInfo *getLoopInfo(AMDGPUCFGStructurizer &pass) { ++ return &pass.getAnalysis(); ++ } ++}; // template class CFGStructTraits ++} //end of namespace llvm ++ ++// createAMDGPUCFGPreparationPass- Returns a pass ++FunctionPass *llvm::createAMDGPUCFGPreparationPass(TargetMachine &tm ++ ) { ++ return new AMDGPUCFGPrepare(tm ); ++} ++ ++bool AMDGPUCFGPrepare::runOnMachineFunction(MachineFunction &func) { ++ return llvmCFGStruct::CFGStructurizer().prepare(func, ++ *this, ++ TRI); ++} ++ ++// createAMDGPUCFGStructurizerPass- Returns a pass ++FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm ++ ) { ++ return new AMDGPUCFGPerform(tm ); ++} ++ ++bool AMDGPUCFGPerform::runOnMachineFunction(MachineFunction &func) { ++ return llvmCFGStruct::CFGStructurizer().run(func, ++ *this, ++ TRI); ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDevice.cpp llvm-r600/lib/Target/R600/AMDILDevice.cpp +--- llvm-3.2.src/lib/Target/R600/AMDILDevice.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILDevice.cpp 2013-01-25 19:43:57.440049721 +0100 +@@ -0,0 +1,124 @@ ++//===-- AMDILDevice.cpp - Base class for AMDIL Devices --------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++/// \file ++//==-----------------------------------------------------------------------===// ++#include "AMDILDevice.h" ++#include "AMDGPUSubtarget.h" ++ ++using namespace llvm; ++// Default implementation for all of the classes. ++AMDGPUDevice::AMDGPUDevice(AMDGPUSubtarget *ST) : mSTM(ST) { ++ mHWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities); ++ mSWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities); ++ setCaps(); ++ DeviceFlag = OCL_DEVICE_ALL; ++} ++ ++AMDGPUDevice::~AMDGPUDevice() { ++ mHWBits.clear(); ++ mSWBits.clear(); ++} ++ ++size_t AMDGPUDevice::getMaxGDSSize() const { ++ return 0; ++} ++ ++uint32_t ++AMDGPUDevice::getDeviceFlag() const { ++ return DeviceFlag; ++} ++ ++size_t AMDGPUDevice::getMaxNumCBs() const { ++ if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) { ++ return HW_MAX_NUM_CB; ++ } ++ ++ return 0; ++} ++ ++size_t AMDGPUDevice::getMaxCBSize() const { ++ if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) { ++ return MAX_CB_SIZE; ++ } ++ ++ return 0; ++} ++ ++size_t AMDGPUDevice::getMaxScratchSize() const { ++ return 65536; ++} ++ ++uint32_t AMDGPUDevice::getStackAlignment() const { ++ return 16; ++} ++ ++void AMDGPUDevice::setCaps() { ++ mSWBits.set(AMDGPUDeviceInfo::HalfOps); ++ mSWBits.set(AMDGPUDeviceInfo::ByteOps); ++ mSWBits.set(AMDGPUDeviceInfo::ShortOps); ++ mSWBits.set(AMDGPUDeviceInfo::HW64BitDivMod); ++ if (mSTM->isOverride(AMDGPUDeviceInfo::NoInline)) { ++ mSWBits.set(AMDGPUDeviceInfo::NoInline); ++ } ++ if (mSTM->isOverride(AMDGPUDeviceInfo::MacroDB)) { ++ mSWBits.set(AMDGPUDeviceInfo::MacroDB); ++ } ++ if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) { ++ mSWBits.set(AMDGPUDeviceInfo::ConstantMem); ++ } else { ++ mHWBits.set(AMDGPUDeviceInfo::ConstantMem); ++ } ++ if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) { ++ mSWBits.set(AMDGPUDeviceInfo::PrivateMem); ++ } else { ++ mHWBits.set(AMDGPUDeviceInfo::PrivateMem); ++ } ++ if (mSTM->isOverride(AMDGPUDeviceInfo::BarrierDetect)) { ++ 
mSWBits.set(AMDGPUDeviceInfo::BarrierDetect); ++ } ++ mSWBits.set(AMDGPUDeviceInfo::ByteLDSOps); ++ mSWBits.set(AMDGPUDeviceInfo::LongOps); ++} ++ ++AMDGPUDeviceInfo::ExecutionMode ++AMDGPUDevice::getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const { ++ if (mHWBits[Caps]) { ++ assert(!mSWBits[Caps] && "Cannot set both SW and HW caps"); ++ return AMDGPUDeviceInfo::Hardware; ++ } ++ ++ if (mSWBits[Caps]) { ++ assert(!mHWBits[Caps] && "Cannot set both SW and HW caps"); ++ return AMDGPUDeviceInfo::Software; ++ } ++ ++ return AMDGPUDeviceInfo::Unsupported; ++ ++} ++ ++bool AMDGPUDevice::isSupported(AMDGPUDeviceInfo::Caps Mode) const { ++ return getExecutionMode(Mode) != AMDGPUDeviceInfo::Unsupported; ++} ++ ++bool AMDGPUDevice::usesHardware(AMDGPUDeviceInfo::Caps Mode) const { ++ return getExecutionMode(Mode) == AMDGPUDeviceInfo::Hardware; ++} ++ ++bool AMDGPUDevice::usesSoftware(AMDGPUDeviceInfo::Caps Mode) const { ++ return getExecutionMode(Mode) == AMDGPUDeviceInfo::Software; ++} ++ ++std::string ++AMDGPUDevice::getDataLayout() const { ++ return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16" ++ "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32" ++ "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64" ++ "-v96:128:128-v128:128:128-v192:256:256-v256:256:256" ++ "-v512:512:512-v1024:1024:1024-v2048:2048:2048" ++ "-n8:16:32:64"); ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDevice.h llvm-r600/lib/Target/R600/AMDILDevice.h +--- llvm-3.2.src/lib/Target/R600/AMDILDevice.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILDevice.h 2013-01-25 19:43:57.440049721 +0100 +@@ -0,0 +1,117 @@ ++//===---- AMDILDevice.h - Define Device Data for AMDGPU -----*- C++ -*------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//==-----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Interface for the subtarget data classes. ++// ++/// This file will define the interface that each generation needs to ++/// implement in order to correctly answer queries on the capabilities of the ++/// specific hardware. ++//===----------------------------------------------------------------------===// ++#ifndef AMDILDEVICEIMPL_H ++#define AMDILDEVICEIMPL_H ++#include "AMDIL.h" ++#include "llvm/ADT/BitVector.h" ++ ++namespace llvm { ++ class AMDGPUSubtarget; ++ class MCStreamer; ++//===----------------------------------------------------------------------===// ++// Interface for data that is specific to a single device ++//===----------------------------------------------------------------------===// ++class AMDGPUDevice { ++public: ++ AMDGPUDevice(AMDGPUSubtarget *ST); ++ virtual ~AMDGPUDevice(); ++ ++ // Enum values for the various memory types. ++ enum { ++ RAW_UAV_ID = 0, ++ ARENA_UAV_ID = 1, ++ LDS_ID = 2, ++ GDS_ID = 3, ++ SCRATCH_ID = 4, ++ CONSTANT_ID = 5, ++ GLOBAL_ID = 6, ++ MAX_IDS = 7 ++ } IO_TYPE_IDS; ++ ++ /// \returns The max LDS size that the hardware supports. Size is in ++ /// bytes. ++ virtual size_t getMaxLDSSize() const = 0; ++ ++ /// \returns The max GDS size that the hardware supports if the GDS is ++ /// supported by the hardware. Size is in bytes. ++ virtual size_t getMaxGDSSize() const; ++ ++ /// \returns The max number of hardware constant address spaces that ++ /// are supported by this device. ++ virtual size_t getMaxNumCBs() const; ++ ++ /// \returns The max number of bytes a single hardware constant buffer ++ /// can support. Size is in bytes. ++ virtual size_t getMaxCBSize() const; ++ ++ /// \returns The max number of bytes allowed by the hardware scratch ++ /// buffer. Size is in bytes. ++ virtual size_t getMaxScratchSize() const; ++ ++ /// \brief Get the flag that corresponds to the device. 
++ virtual uint32_t getDeviceFlag() const; ++ ++ /// \returns The number of work-items that exist in a single hardware ++ /// wavefront. ++ virtual size_t getWavefrontSize() const = 0; ++ ++ /// \brief Get the generational name of this specific device. ++ virtual uint32_t getGeneration() const = 0; ++ ++ /// \brief Get the stack alignment of this specific device. ++ virtual uint32_t getStackAlignment() const; ++ ++ /// \brief Get the resource ID for this specific device. ++ virtual uint32_t getResourceID(uint32_t DeviceID) const = 0; ++ ++ /// \brief Get the max number of UAV's for this device. ++ virtual uint32_t getMaxNumUAVs() const = 0; ++ ++ ++ // API utilizing more detailed capabilities of each family of ++ // cards. If a capability is supported, then either usesHardware or ++ // usesSoftware returned true. If usesHardware returned true, then ++ // usesSoftware must return false for the same capability. Hardware ++ // execution means that the feature is done natively by the hardware ++ // and is not emulated by the softare. Software execution means ++ // that the feature could be done in the hardware, but there is ++ // software that emulates it with possibly using the hardware for ++ // support since the hardware does not fully comply with OpenCL ++ // specs. 
++ ++ bool isSupported(AMDGPUDeviceInfo::Caps Mode) const; ++ bool usesHardware(AMDGPUDeviceInfo::Caps Mode) const; ++ bool usesSoftware(AMDGPUDeviceInfo::Caps Mode) const; ++ virtual std::string getDataLayout() const; ++ static const unsigned int MAX_LDS_SIZE_700 = 16384; ++ static const unsigned int MAX_LDS_SIZE_800 = 32768; ++ static const unsigned int WavefrontSize = 64; ++ static const unsigned int HalfWavefrontSize = 32; ++ static const unsigned int QuarterWavefrontSize = 16; ++protected: ++ virtual void setCaps(); ++ llvm::BitVector mHWBits; ++ llvm::BitVector mSWBits; ++ AMDGPUSubtarget *mSTM; ++ uint32_t DeviceFlag; ++private: ++ AMDGPUDeviceInfo::ExecutionMode ++ getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const; ++}; ++ ++} // namespace llvm ++#endif // AMDILDEVICEIMPL_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDeviceInfo.cpp llvm-r600/lib/Target/R600/AMDILDeviceInfo.cpp +--- llvm-3.2.src/lib/Target/R600/AMDILDeviceInfo.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILDeviceInfo.cpp 2013-01-25 19:43:57.440049721 +0100 +@@ -0,0 +1,94 @@ ++//===-- AMDILDeviceInfo.cpp - AMDILDeviceInfo class -----------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//==-----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Function that creates DeviceInfo from a device name and other information. 
++// ++//==-----------------------------------------------------------------------===// ++#include "AMDILDevices.h" ++#include "AMDGPUSubtarget.h" ++ ++using namespace llvm; ++namespace llvm { ++namespace AMDGPUDeviceInfo { ++ ++AMDGPUDevice* getDeviceFromName(const std::string &deviceName, ++ AMDGPUSubtarget *ptr, ++ bool is64bit, bool is64on32bit) { ++ if (deviceName.c_str()[2] == '7') { ++ switch (deviceName.c_str()[3]) { ++ case '1': ++ return new AMDGPU710Device(ptr); ++ case '7': ++ return new AMDGPU770Device(ptr); ++ default: ++ return new AMDGPU7XXDevice(ptr); ++ } ++ } else if (deviceName == "cypress") { ++#if DEBUG ++ assert(!is64bit && "This device does not support 64bit pointers!"); ++ assert(!is64on32bit && "This device does not support 64bit" ++ " on 32bit pointers!"); ++#endif ++ return new AMDGPUCypressDevice(ptr); ++ } else if (deviceName == "juniper") { ++#if DEBUG ++ assert(!is64bit && "This device does not support 64bit pointers!"); ++ assert(!is64on32bit && "This device does not support 64bit" ++ " on 32bit pointers!"); ++#endif ++ return new AMDGPUEvergreenDevice(ptr); ++ } else if (deviceName == "redwood") { ++#if DEBUG ++ assert(!is64bit && "This device does not support 64bit pointers!"); ++ assert(!is64on32bit && "This device does not support 64bit" ++ " on 32bit pointers!"); ++#endif ++ return new AMDGPURedwoodDevice(ptr); ++ } else if (deviceName == "cedar") { ++#if DEBUG ++ assert(!is64bit && "This device does not support 64bit pointers!"); ++ assert(!is64on32bit && "This device does not support 64bit" ++ " on 32bit pointers!"); ++#endif ++ return new AMDGPUCedarDevice(ptr); ++ } else if (deviceName == "barts" || deviceName == "turks") { ++#if DEBUG ++ assert(!is64bit && "This device does not support 64bit pointers!"); ++ assert(!is64on32bit && "This device does not support 64bit" ++ " on 32bit pointers!"); ++#endif ++ return new AMDGPUNIDevice(ptr); ++ } else if (deviceName == "cayman") { ++#if DEBUG ++ assert(!is64bit && "This device 
does not support 64bit pointers!"); ++ assert(!is64on32bit && "This device does not support 64bit" ++ " on 32bit pointers!"); ++#endif ++ return new AMDGPUCaymanDevice(ptr); ++ } else if (deviceName == "caicos") { ++#if DEBUG ++ assert(!is64bit && "This device does not support 64bit pointers!"); ++ assert(!is64on32bit && "This device does not support 64bit" ++ " on 32bit pointers!"); ++#endif ++ return new AMDGPUNIDevice(ptr); ++ } else if (deviceName == "SI") { ++ return new AMDGPUSIDevice(ptr); ++ } else { ++#if DEBUG ++ assert(!is64bit && "This device does not support 64bit pointers!"); ++ assert(!is64on32bit && "This device does not support 64bit" ++ " on 32bit pointers!"); ++#endif ++ return new AMDGPU7XXDevice(ptr); ++ } ++} ++} // End namespace AMDGPUDeviceInfo ++} // End namespace llvm +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDeviceInfo.h llvm-r600/lib/Target/R600/AMDILDeviceInfo.h +--- llvm-3.2.src/lib/Target/R600/AMDILDeviceInfo.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILDeviceInfo.h 2013-01-25 19:43:57.440049721 +0100 +@@ -0,0 +1,88 @@ ++//===-- AMDILDeviceInfo.h - Constants for describing devices --------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++/// \file ++//==-----------------------------------------------------------------------===// ++#ifndef AMDILDEVICEINFO_H ++#define AMDILDEVICEINFO_H ++ ++ ++#include ++ ++namespace llvm { ++ class AMDGPUDevice; ++ class AMDGPUSubtarget; ++ namespace AMDGPUDeviceInfo { ++ /// Each Capabilities can be executed using a hardware instruction, ++ /// emulated with a sequence of software instructions, or not ++ /// supported at all. ++ enum ExecutionMode { ++ Unsupported = 0, ///< Unsupported feature on the card(Default value) ++ /// This is the execution mode that is set if the feature is emulated in ++ /// software. 
++ Software, ++ /// This execution mode is set if the feature exists natively in hardware ++ Hardware ++ }; ++ ++ enum Caps { ++ HalfOps = 0x1, ///< Half float is supported or not. ++ DoubleOps = 0x2, ///< Double is supported or not. ++ ByteOps = 0x3, ///< Byte(char) is support or not. ++ ShortOps = 0x4, ///< Short is supported or not. ++ LongOps = 0x5, ///< Long is supported or not. ++ Images = 0x6, ///< Images are supported or not. ++ ByteStores = 0x7, ///< ByteStores available(!HD4XXX). ++ ConstantMem = 0x8, ///< Constant/CB memory. ++ LocalMem = 0x9, ///< Local/LDS memory. ++ PrivateMem = 0xA, ///< Scratch/Private/Stack memory. ++ RegionMem = 0xB, ///< OCL GDS Memory Extension. ++ FMA = 0xC, ///< Use HW FMA or SW FMA. ++ ArenaSegment = 0xD, ///< Use for Arena UAV per pointer 12-1023. ++ MultiUAV = 0xE, ///< Use for UAV per Pointer 0-7. ++ Reserved0 = 0xF, ///< ReservedFlag ++ NoAlias = 0x10, ///< Cached loads. ++ Signed24BitOps = 0x11, ///< Peephole Optimization. ++ /// Debug mode implies that no hardware features or optimizations ++ /// are performned and that all memory access go through a single ++ /// uav(Arena on HD5XXX/HD6XXX and Raw on HD4XXX). ++ Debug = 0x12, ++ CachedMem = 0x13, ///< Cached mem is available or not. ++ BarrierDetect = 0x14, ///< Detect duplicate barriers. ++ Reserved1 = 0x15, ///< Reserved flag ++ ByteLDSOps = 0x16, ///< Flag to specify if byte LDS ops are available. ++ ArenaVectors = 0x17, ///< Flag to specify if vector loads from arena work. ++ TmrReg = 0x18, ///< Flag to specify if Tmr register is supported. ++ NoInline = 0x19, ///< Flag to specify that no inlining should occur. ++ MacroDB = 0x1A, ///< Flag to specify that backend handles macrodb. ++ HW64BitDivMod = 0x1B, ///< Flag for backend to generate 64bit div/mod. ++ ArenaUAV = 0x1C, ///< Flag to specify that arena uav is supported. ++ PrivateUAV = 0x1D, ///< Flag to specify that private memory uses uav's. 
++ /// If more capabilities are required, then ++ /// this number needs to be increased. ++ /// All capabilities must come before this ++ /// number. ++ MaxNumberCapabilities = 0x20 ++ }; ++ /// These have to be in order with the older generations ++ /// having the lower number enumerations. ++ enum Generation { ++ HD4XXX = 0, ///< 7XX based devices. ++ HD5XXX, ///< Evergreen based devices. ++ HD6XXX, ///< NI/Evergreen+ based devices. ++ HD7XXX, ///< Southern Islands based devices. ++ HDTEST, ///< Experimental feature testing device. ++ HDNUMGEN ++ }; ++ ++ ++ AMDGPUDevice* ++ getDeviceFromName(const std::string &name, AMDGPUSubtarget *ptr, ++ bool is64bit = false, bool is64on32bit = false); ++ } // namespace AMDILDeviceInfo ++} // namespace llvm ++#endif // AMDILDEVICEINFO_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDevices.h llvm-r600/lib/Target/R600/AMDILDevices.h +--- llvm-3.2.src/lib/Target/R600/AMDILDevices.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILDevices.h 2013-01-25 19:43:57.440049721 +0100 +@@ -0,0 +1,19 @@ ++//===-- AMDILDevices.h - Consolidate AMDIL Device headers -----------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++/// \file ++//==-----------------------------------------------------------------------===// ++#ifndef AMDIL_DEVICES_H ++#define AMDIL_DEVICES_H ++// Include all of the device specific header files ++#include "AMDIL7XXDevice.h" ++#include "AMDILDevice.h" ++#include "AMDILEvergreenDevice.h" ++#include "AMDILNIDevice.h" ++#include "AMDILSIDevice.h" ++ ++#endif // AMDIL_DEVICES_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILEvergreenDevice.cpp llvm-r600/lib/Target/R600/AMDILEvergreenDevice.cpp +--- llvm-3.2.src/lib/Target/R600/AMDILEvergreenDevice.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILEvergreenDevice.cpp 2013-01-25 19:43:57.440049721 +0100 +@@ -0,0 +1,169 @@ ++//===-- AMDILEvergreenDevice.cpp - Device Info for Evergreen --------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++/// \file ++//==-----------------------------------------------------------------------===// ++#include "AMDILEvergreenDevice.h" ++ ++using namespace llvm; ++ ++AMDGPUEvergreenDevice::AMDGPUEvergreenDevice(AMDGPUSubtarget *ST) ++: AMDGPUDevice(ST) { ++ setCaps(); ++ std::string name = ST->getDeviceName(); ++ if (name == "cedar") { ++ DeviceFlag = OCL_DEVICE_CEDAR; ++ } else if (name == "redwood") { ++ DeviceFlag = OCL_DEVICE_REDWOOD; ++ } else if (name == "cypress") { ++ DeviceFlag = OCL_DEVICE_CYPRESS; ++ } else { ++ DeviceFlag = OCL_DEVICE_JUNIPER; ++ } ++} ++ ++AMDGPUEvergreenDevice::~AMDGPUEvergreenDevice() { ++} ++ ++size_t AMDGPUEvergreenDevice::getMaxLDSSize() const { ++ if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { ++ return MAX_LDS_SIZE_800; ++ } else { ++ return 0; ++ } ++} ++size_t AMDGPUEvergreenDevice::getMaxGDSSize() const { ++ if (usesHardware(AMDGPUDeviceInfo::RegionMem)) { ++ return MAX_LDS_SIZE_800; ++ } else { ++ return 0; ++ } ++} ++uint32_t AMDGPUEvergreenDevice::getMaxNumUAVs() const { 
++ return 12; ++} ++ ++uint32_t AMDGPUEvergreenDevice::getResourceID(uint32_t id) const { ++ switch(id) { ++ default: ++ assert(0 && "ID type passed in is unknown!"); ++ break; ++ case CONSTANT_ID: ++ case RAW_UAV_ID: ++ return GLOBAL_RETURN_RAW_UAV_ID; ++ case GLOBAL_ID: ++ case ARENA_UAV_ID: ++ return DEFAULT_ARENA_UAV_ID; ++ case LDS_ID: ++ if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { ++ return DEFAULT_LDS_ID; ++ } else { ++ return DEFAULT_ARENA_UAV_ID; ++ } ++ case GDS_ID: ++ if (usesHardware(AMDGPUDeviceInfo::RegionMem)) { ++ return DEFAULT_GDS_ID; ++ } else { ++ return DEFAULT_ARENA_UAV_ID; ++ } ++ case SCRATCH_ID: ++ if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) { ++ return DEFAULT_SCRATCH_ID; ++ } else { ++ return DEFAULT_ARENA_UAV_ID; ++ } ++ }; ++ return 0; ++} ++ ++size_t AMDGPUEvergreenDevice::getWavefrontSize() const { ++ return AMDGPUDevice::WavefrontSize; ++} ++ ++uint32_t AMDGPUEvergreenDevice::getGeneration() const { ++ return AMDGPUDeviceInfo::HD5XXX; ++} ++ ++void AMDGPUEvergreenDevice::setCaps() { ++ mSWBits.set(AMDGPUDeviceInfo::ArenaSegment); ++ mHWBits.set(AMDGPUDeviceInfo::ArenaUAV); ++ mHWBits.set(AMDGPUDeviceInfo::HW64BitDivMod); ++ mSWBits.reset(AMDGPUDeviceInfo::HW64BitDivMod); ++ mSWBits.set(AMDGPUDeviceInfo::Signed24BitOps); ++ if (mSTM->isOverride(AMDGPUDeviceInfo::ByteStores)) { ++ mHWBits.set(AMDGPUDeviceInfo::ByteStores); ++ } ++ if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) { ++ mSWBits.set(AMDGPUDeviceInfo::LocalMem); ++ mSWBits.set(AMDGPUDeviceInfo::RegionMem); ++ } else { ++ mHWBits.set(AMDGPUDeviceInfo::LocalMem); ++ mHWBits.set(AMDGPUDeviceInfo::RegionMem); ++ } ++ mHWBits.set(AMDGPUDeviceInfo::Images); ++ if (mSTM->isOverride(AMDGPUDeviceInfo::NoAlias)) { ++ mHWBits.set(AMDGPUDeviceInfo::NoAlias); ++ } ++ mHWBits.set(AMDGPUDeviceInfo::CachedMem); ++ if (mSTM->isOverride(AMDGPUDeviceInfo::MultiUAV)) { ++ mHWBits.set(AMDGPUDeviceInfo::MultiUAV); ++ } ++ mHWBits.set(AMDGPUDeviceInfo::ByteLDSOps); ++ 
mSWBits.reset(AMDGPUDeviceInfo::ByteLDSOps); ++ mHWBits.set(AMDGPUDeviceInfo::ArenaVectors); ++ mHWBits.set(AMDGPUDeviceInfo::LongOps); ++ mSWBits.reset(AMDGPUDeviceInfo::LongOps); ++ mHWBits.set(AMDGPUDeviceInfo::TmrReg); ++} ++ ++AMDGPUCypressDevice::AMDGPUCypressDevice(AMDGPUSubtarget *ST) ++ : AMDGPUEvergreenDevice(ST) { ++ setCaps(); ++} ++ ++AMDGPUCypressDevice::~AMDGPUCypressDevice() { ++} ++ ++void AMDGPUCypressDevice::setCaps() { ++ if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) { ++ mHWBits.set(AMDGPUDeviceInfo::DoubleOps); ++ mHWBits.set(AMDGPUDeviceInfo::FMA); ++ } ++} ++ ++ ++AMDGPUCedarDevice::AMDGPUCedarDevice(AMDGPUSubtarget *ST) ++ : AMDGPUEvergreenDevice(ST) { ++ setCaps(); ++} ++ ++AMDGPUCedarDevice::~AMDGPUCedarDevice() { ++} ++ ++void AMDGPUCedarDevice::setCaps() { ++ mSWBits.set(AMDGPUDeviceInfo::FMA); ++} ++ ++size_t AMDGPUCedarDevice::getWavefrontSize() const { ++ return AMDGPUDevice::QuarterWavefrontSize; ++} ++ ++AMDGPURedwoodDevice::AMDGPURedwoodDevice(AMDGPUSubtarget *ST) ++ : AMDGPUEvergreenDevice(ST) { ++ setCaps(); ++} ++ ++AMDGPURedwoodDevice::~AMDGPURedwoodDevice() { ++} ++ ++void AMDGPURedwoodDevice::setCaps() { ++ mSWBits.set(AMDGPUDeviceInfo::FMA); ++} ++ ++size_t AMDGPURedwoodDevice::getWavefrontSize() const { ++ return AMDGPUDevice::HalfWavefrontSize; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILEvergreenDevice.h llvm-r600/lib/Target/R600/AMDILEvergreenDevice.h +--- llvm-3.2.src/lib/Target/R600/AMDILEvergreenDevice.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILEvergreenDevice.h 2013-01-25 19:43:57.440049721 +0100 +@@ -0,0 +1,93 @@ ++//==- AMDILEvergreenDevice.h - Define Evergreen Device for AMDIL -*- C++ -*--=// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//==-----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Interface for the subtarget data classes. ++/// ++/// This file will define the interface that each generation needs to ++/// implement in order to correctly answer queries on the capabilities of the ++/// specific hardware. ++//===----------------------------------------------------------------------===// ++#ifndef AMDILEVERGREENDEVICE_H ++#define AMDILEVERGREENDEVICE_H ++#include "AMDILDevice.h" ++#include "AMDGPUSubtarget.h" ++ ++namespace llvm { ++ class AMDGPUSubtarget; ++//===----------------------------------------------------------------------===// ++// Evergreen generation of devices and their respective sub classes ++//===----------------------------------------------------------------------===// ++ ++ ++/// \brief The AMDGPUEvergreenDevice is the base device class for all of the Evergreen ++/// series of cards. ++/// ++/// This class contains information required to differentiate ++/// the Evergreen device from the generic AMDGPUDevice. This device represents ++/// the capabilities of the 'Juniper' cards, also known as the HD57XX. ++class AMDGPUEvergreenDevice : public AMDGPUDevice { ++public: ++ AMDGPUEvergreenDevice(AMDGPUSubtarget *ST); ++ virtual ~AMDGPUEvergreenDevice(); ++ virtual size_t getMaxLDSSize() const; ++ virtual size_t getMaxGDSSize() const; ++ virtual size_t getWavefrontSize() const; ++ virtual uint32_t getGeneration() const; ++ virtual uint32_t getMaxNumUAVs() const; ++ virtual uint32_t getResourceID(uint32_t) const; ++protected: ++ virtual void setCaps(); ++}; ++ ++/// The AMDGPUCypressDevice is similar to the AMDGPUEvergreenDevice, except it has ++/// support for double precision operations. This device is used to represent ++/// both the Cypress and Hemlock cards, which are commercially known as HD58XX ++/// and HD59XX cards.
++class AMDGPUCypressDevice : public AMDGPUEvergreenDevice { ++public: ++ AMDGPUCypressDevice(AMDGPUSubtarget *ST); ++ virtual ~AMDGPUCypressDevice(); ++private: ++ virtual void setCaps(); ++}; ++ ++ ++/// \brief The AMDGPUCedarDevice is the class that represents all of the 'Cedar' based ++/// devices. ++/// ++/// This class differs from the base AMDGPUEvergreenDevice in that the ++/// device is a ~quarter of the 'Juniper'. These are commercially known as the ++/// HD54XX and HD53XX series of cards. ++class AMDGPUCedarDevice : public AMDGPUEvergreenDevice { ++public: ++ AMDGPUCedarDevice(AMDGPUSubtarget *ST); ++ virtual ~AMDGPUCedarDevice(); ++ virtual size_t getWavefrontSize() const; ++private: ++ virtual void setCaps(); ++}; ++ ++/// \brief The AMDGPURedwoodDevice is the class that represents all of the 'Redwood' based ++/// devices. ++/// ++/// This class differs from the base class, in that these devices are ++/// considered about half of a 'Juniper' device. These are commercially known as ++/// the HD55XX and HD56XX series of cards. ++class AMDGPURedwoodDevice : public AMDGPUEvergreenDevice { ++public: ++ AMDGPURedwoodDevice(AMDGPUSubtarget *ST); ++ virtual ~AMDGPURedwoodDevice(); ++ virtual size_t getWavefrontSize() const; ++private: ++ virtual void setCaps(); ++}; ++ ++} // namespace llvm ++#endif // AMDILEVERGREENDEVICE_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILFrameLowering.cpp llvm-r600/lib/Target/R600/AMDILFrameLowering.cpp +--- llvm-3.2.src/lib/Target/R600/AMDILFrameLowering.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILFrameLowering.cpp 2013-01-25 19:43:57.440049721 +0100 +@@ -0,0 +1,47 @@ ++//===----------------------- AMDILFrameLowering.cpp -----------------*- C++ -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details.
++// ++//==-----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Interface to describe a layout of a stack frame on a AMDGPU target ++/// machine. ++// ++//===----------------------------------------------------------------------===// ++#include "AMDILFrameLowering.h" ++#include "llvm/CodeGen/MachineFrameInfo.h" ++ ++using namespace llvm; ++AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, ++ int LAO, unsigned TransAl) ++ : TargetFrameLowering(D, StackAl, LAO, TransAl) { ++} ++ ++AMDGPUFrameLowering::~AMDGPUFrameLowering() { ++} ++ ++int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, ++ int FI) const { ++ const MachineFrameInfo *MFI = MF.getFrameInfo(); ++ return MFI->getObjectOffset(FI); ++} ++ ++const TargetFrameLowering::SpillSlot * ++AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { ++ NumEntries = 0; ++ return 0; ++} ++void ++AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const { ++} ++void ++AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { ++} ++bool ++AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { ++ return false; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILFrameLowering.h llvm-r600/lib/Target/R600/AMDILFrameLowering.h +--- llvm-3.2.src/lib/Target/R600/AMDILFrameLowering.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILFrameLowering.h 2013-01-25 19:43:57.443383054 +0100 +@@ -0,0 +1,40 @@ ++//===--------------------- AMDILFrameLowering.h -----------------*- C++ -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Interface to describe a layout of a stack frame on a AMDIL target ++/// machine. 
++// ++//===----------------------------------------------------------------------===// ++#ifndef AMDILFRAME_LOWERING_H ++#define AMDILFRAME_LOWERING_H ++ ++#include "llvm/CodeGen/MachineFunction.h" ++#include "llvm/Target/TargetFrameLowering.h" ++ ++namespace llvm { ++ ++/// \brief Information about the stack frame layout on the AMDGPU targets. ++/// ++/// It holds the direction of the stack growth, the known stack alignment on ++/// entry to each function, and the offset to the locals area. ++/// See TargetFrameInfo for more comments. ++class AMDGPUFrameLowering : public TargetFrameLowering { ++public: ++ AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, ++ unsigned TransAl = 1); ++ virtual ~AMDGPUFrameLowering(); ++ virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const; ++ virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const; ++ virtual void emitPrologue(MachineFunction &MF) const; ++ virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; ++ virtual bool hasFP(const MachineFunction &MF) const; ++}; ++} // namespace llvm ++#endif // AMDILFRAME_LOWERING_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDIL.h llvm-r600/lib/Target/R600/AMDIL.h +--- llvm-3.2.src/lib/Target/R600/AMDIL.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDIL.h 2013-01-25 19:43:57.433383055 +0100 +@@ -0,0 +1,122 @@ ++//===-- AMDIL.h - Top-level interface for AMDIL representation --*- C++ -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//==-----------------------------------------------------------------------===// ++// ++/// This file contains the entry points for global functions defined in the LLVM ++/// AMDGPU back-end. 
++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef AMDIL_H ++#define AMDIL_H ++ ++#include "llvm/CodeGen/MachineFunction.h" ++#include "llvm/Target/TargetMachine.h" ++ ++#define ARENA_SEGMENT_RESERVED_UAVS 12 ++#define DEFAULT_ARENA_UAV_ID 8 ++#define DEFAULT_RAW_UAV_ID 7 ++#define GLOBAL_RETURN_RAW_UAV_ID 11 ++#define HW_MAX_NUM_CB 8 ++#define MAX_NUM_UNIQUE_UAVS 8 ++#define OPENCL_MAX_NUM_ATOMIC_COUNTERS 8 ++#define OPENCL_MAX_READ_IMAGES 128 ++#define OPENCL_MAX_WRITE_IMAGES 8 ++#define OPENCL_MAX_SAMPLERS 16 ++ ++// The next two values can never be zero, as zero is the ID that is ++// used to assert against. ++#define DEFAULT_LDS_ID 1 ++#define DEFAULT_GDS_ID 1 ++#define DEFAULT_SCRATCH_ID 1 ++#define DEFAULT_VEC_SLOTS 8 ++ ++#define OCL_DEVICE_RV710 0x0001 ++#define OCL_DEVICE_RV730 0x0002 ++#define OCL_DEVICE_RV770 0x0004 ++#define OCL_DEVICE_CEDAR 0x0008 ++#define OCL_DEVICE_REDWOOD 0x0010 ++#define OCL_DEVICE_JUNIPER 0x0020 ++#define OCL_DEVICE_CYPRESS 0x0040 ++#define OCL_DEVICE_CAICOS 0x0080 ++#define OCL_DEVICE_TURKS 0x0100 ++#define OCL_DEVICE_BARTS 0x0200 ++#define OCL_DEVICE_CAYMAN 0x0400 ++#define OCL_DEVICE_ALL 0x3FFF ++ ++/// The number of function ID's that are reserved for ++/// internal compiler usage. ++const unsigned int RESERVED_FUNCS = 1024; ++ ++namespace llvm { ++class AMDGPUInstrPrinter; ++class FunctionPass; ++class MCAsmInfo; ++class raw_ostream; ++class Target; ++class TargetMachine; ++ ++// Instruction selection passes. ++FunctionPass* ++ createAMDGPUISelDag(TargetMachine &TM); ++FunctionPass* ++ createAMDGPUPeepholeOpt(TargetMachine &TM); ++ ++// Pre emit passes. 
++FunctionPass* ++ createAMDGPUCFGPreparationPass(TargetMachine &TM); ++FunctionPass* ++ createAMDGPUCFGStructurizerPass(TargetMachine &TM); ++ ++extern Target TheAMDGPUTarget; ++} // end namespace llvm; ++ ++// Include device information enumerations ++#include "AMDILDeviceInfo.h" ++ ++namespace llvm { ++/// OpenCL uses address spaces to differentiate between ++/// various memory regions on the hardware. On the CPU ++/// all of the address spaces point to the same memory, ++/// however on the GPU, each address space points to ++/// a separate piece of memory that is unique from other ++/// memory locations. ++namespace AMDGPUAS { ++enum AddressSpaces { ++ PRIVATE_ADDRESS = 0, ///< Address space for private memory. ++ GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). ++ CONSTANT_ADDRESS = 2, ///< Address space for constant memory ++ LOCAL_ADDRESS = 3, ///< Address space for local memory. ++ REGION_ADDRESS = 4, ///< Address space for region memory. ++ ADDRESS_NONE = 5, ///< Address space for unknown memory.
++ PARAM_D_ADDRESS = 6, ///< Address space for direct addressable parameter memory (CONST0) ++ PARAM_I_ADDRESS = 7, ///< Address space for indirect addressable parameter memory (VTX1) ++ USER_SGPR_ADDRESS = 8, ///< Address space for USER_SGPRS on SI ++ CONSTANT_BUFFER_0 = 9, ++ CONSTANT_BUFFER_1 = 10, ++ CONSTANT_BUFFER_2 = 11, ++ CONSTANT_BUFFER_3 = 12, ++ CONSTANT_BUFFER_4 = 13, ++ CONSTANT_BUFFER_5 = 14, ++ CONSTANT_BUFFER_6 = 15, ++ CONSTANT_BUFFER_7 = 16, ++ CONSTANT_BUFFER_8 = 17, ++ CONSTANT_BUFFER_9 = 18, ++ CONSTANT_BUFFER_10 = 19, ++ CONSTANT_BUFFER_11 = 20, ++ CONSTANT_BUFFER_12 = 21, ++ CONSTANT_BUFFER_13 = 22, ++ CONSTANT_BUFFER_14 = 23, ++ CONSTANT_BUFFER_15 = 24, ++ LAST_ADDRESS = 25 ++}; ++ ++} // namespace AMDGPUAS ++ ++} // end namespace llvm ++#endif // AMDIL_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILInstrInfo.td llvm-r600/lib/Target/R600/AMDILInstrInfo.td +--- llvm-3.2.src/lib/Target/R600/AMDILInstrInfo.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILInstrInfo.td 2013-01-25 19:43:57.443383054 +0100 +@@ -0,0 +1,208 @@ ++//===------------ AMDILInstrInfo.td - AMDIL Target ------*-tablegen-*------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//==-----------------------------------------------------------------------===// ++// ++// This file describes the AMDIL instructions in TableGen format.
++// ++//===----------------------------------------------------------------------===// ++// AMDIL Instruction Predicate Definitions ++// Predicate that is set to true if the hardware supports double precision ++// divide ++def HasHWDDiv : Predicate<"Subtarget.device()" ++ "->getGeneration() > AMDGPUDeviceInfo::HD4XXX && " ++ "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">; ++ ++// Predicate that is set to true if the hardware supports double, but not double ++// precision divide in hardware ++def HasSWDDiv : Predicate<"Subtarget.device()" ++ "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&" ++ "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">; ++ ++// Predicate that is set to true if the hardware support 24bit signed ++// math ops. Otherwise a software expansion to 32bit math ops is used instead. ++def HasHWSign24Bit : Predicate<"Subtarget.device()" ++ "->getGeneration() > AMDGPUDeviceInfo::HD5XXX">; ++ ++// Predicate that is set to true if 64bit operations are supported or not ++def HasHW64Bit : Predicate<"Subtarget.device()" ++ "->usesHardware(AMDGPUDeviceInfo::LongOps)">; ++def HasSW64Bit : Predicate<"Subtarget.device()" ++ "->usesSoftware(AMDGPUDeviceInfo::LongOps)">; ++ ++// Predicate that is set to true if the timer register is supported ++def HasTmrRegister : Predicate<"Subtarget.device()" ++ "->isSupported(AMDGPUDeviceInfo::TmrReg)">; ++// Predicate that is true if we are at least evergreen series ++def HasDeviceIDInst : Predicate<"Subtarget.device()" ++ "->getGeneration() >= AMDGPUDeviceInfo::HD5XXX">; ++ ++// Predicate that is true if we have region address space. ++def hasRegionAS : Predicate<"Subtarget.device()" ++ "->usesHardware(AMDGPUDeviceInfo::RegionMem)">; ++ ++// Predicate that is false if we don't have region address space. 
++def noRegionAS : Predicate<"!Subtarget.device()" ++ "->isSupported(AMDGPUDeviceInfo::RegionMem)">; ++ ++ ++// Predicate that is set to true if 64bit Mul is supported in the IL or not ++def HasHW64Mul : Predicate<"Subtarget.calVersion()" ++ ">= CAL_VERSION_SC_139" ++ "&& Subtarget.device()" ++ "->getGeneration() >=" ++ "AMDGPUDeviceInfo::HD5XXX">; ++def HasSW64Mul : Predicate<"Subtarget.calVersion()" ++ "< CAL_VERSION_SC_139">; ++// Predicate that is set to true if 64bit Div/Mod is supported in the IL or not ++def HasHW64DivMod : Predicate<"Subtarget.device()" ++ "->usesHardware(AMDGPUDeviceInfo::HW64BitDivMod)">; ++def HasSW64DivMod : Predicate<"Subtarget.device()" ++ "->usesSoftware(AMDGPUDeviceInfo::HW64BitDivMod)">; ++ ++// Predicate that is set to true if 64bit pointer are used. ++def Has64BitPtr : Predicate<"Subtarget.is64bit()">; ++def Has32BitPtr : Predicate<"!Subtarget.is64bit()">; ++//===--------------------------------------------------------------------===// ++// Custom Operands ++//===--------------------------------------------------------------------===// ++def brtarget : Operand; ++ ++//===--------------------------------------------------------------------===// ++// Custom Selection DAG Type Profiles ++//===--------------------------------------------------------------------===// ++//===----------------------------------------------------------------------===// ++// Generic Profile Types ++//===----------------------------------------------------------------------===// ++ ++def SDTIL_GenBinaryOp : SDTypeProfile<1, 2, [ ++ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2> ++ ]>; ++def SDTIL_GenTernaryOp : SDTypeProfile<1, 3, [ ++ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3> ++ ]>; ++def SDTIL_GenVecBuild : SDTypeProfile<1, 1, [ ++ SDTCisEltOfVec<1, 0> ++ ]>; ++ ++//===----------------------------------------------------------------------===// ++// Flow Control Profile Types 
++//===----------------------------------------------------------------------===// ++// Branch instruction where second and third are basic blocks ++def SDTIL_BRCond : SDTypeProfile<0, 2, [ ++ SDTCisVT<0, OtherVT> ++ ]>; ++ ++//===--------------------------------------------------------------------===// ++// Custom Selection DAG Nodes ++//===--------------------------------------------------------------------===// ++//===----------------------------------------------------------------------===// ++// Flow Control DAG Nodes ++//===----------------------------------------------------------------------===// ++def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>; ++ ++//===----------------------------------------------------------------------===// ++// Call/Return DAG Nodes ++//===----------------------------------------------------------------------===// ++def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, ++ [SDNPHasChain, SDNPOptInGlue]>; ++ ++//===--------------------------------------------------------------------===// ++// Instructions ++//===--------------------------------------------------------------------===// ++// Floating point math functions ++def IL_div_inf : SDNode<"AMDGPUISD::DIV_INF", SDTIL_GenBinaryOp>; ++def IL_mad : SDNode<"AMDGPUISD::MAD", SDTIL_GenTernaryOp>; ++ ++//===----------------------------------------------------------------------===// ++// Integer functions ++//===----------------------------------------------------------------------===// ++def IL_umul : SDNode<"AMDGPUISD::UMUL" , SDTIntBinOp, ++ [SDNPCommutative, SDNPAssociative]>; ++ ++//===--------------------------------------------------------------------===// ++// Custom Pattern DAG Nodes ++//===--------------------------------------------------------------------===// ++def global_store : PatFrag<(ops node:$val, node:$ptr), ++ (store node:$val, node:$ptr), [{ ++ return isGlobalStore(dyn_cast(N)); ++}]>; ++ 
++//===----------------------------------------------------------------------===// ++// Load pattern fragments ++//===----------------------------------------------------------------------===// ++// Global address space loads ++def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ ++ return isGlobalLoad(dyn_cast(N)); ++}]>; ++// Constant address space loads ++def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ ++ return isConstantLoad(dyn_cast(N), -1); ++}]>; ++ ++//===----------------------------------------------------------------------===// ++// Complex addressing mode patterns ++//===----------------------------------------------------------------------===// ++def ADDR : ComplexPattern; ++def ADDRF : ComplexPattern; ++def ADDR64 : ComplexPattern; ++def ADDR64F : ComplexPattern; ++ ++//===----------------------------------------------------------------------===// ++// Instruction format classes ++//===----------------------------------------------------------------------===// ++class ILFormat pattern> ++: Instruction { ++ ++ let Namespace = "AMDGPU"; ++ dag OutOperandList = outs; ++ dag InOperandList = ins; ++ let Pattern = pattern; ++ let AsmString = !strconcat(asmstr, "\n"); ++ let isPseudo = 1; ++ let Itinerary = NullALU; ++ bit hasIEEEFlag = 0; ++ bit hasZeroOpFlag = 0; ++ let mayLoad = 0; ++ let mayStore = 0; ++ let hasSideEffects = 0; ++} ++ ++//===--------------------------------------------------------------------===// ++// Multiclass Instruction formats ++//===--------------------------------------------------------------------===// ++// Multiclass that handles branch instructions ++multiclass BranchConditional { ++ def _i32 : ILFormat<(outs), ++ (ins brtarget:$target, GPRI32:$src0), ++ "; i32 Pseudo branch instruction", ++ [(Op bb:$target, GPRI32:$src0)]>; ++ def _f32 : ILFormat<(outs), ++ (ins brtarget:$target, GPRF32:$src0), ++ "; f32 Pseudo branch instruction", ++ [(Op bb:$target, GPRF32:$src0)]>; ++} ++ ++// Only scalar 
types should generate flow control ++multiclass BranchInstr { ++ def _i32 : ILFormat<(outs), (ins GPRI32:$src), ++ !strconcat(name, " $src"), []>; ++ def _f32 : ILFormat<(outs), (ins GPRF32:$src), ++ !strconcat(name, " $src"), []>; ++} ++// Only scalar types should generate flow control ++multiclass BranchInstr2 { ++ def _i32 : ILFormat<(outs), (ins GPRI32:$src0, GPRI32:$src1), ++ !strconcat(name, " $src0, $src1"), []>; ++ def _f32 : ILFormat<(outs), (ins GPRF32:$src0, GPRF32:$src1), ++ !strconcat(name, " $src0, $src1"), []>; ++} ++ ++//===--------------------------------------------------------------------===// ++// Intrinsics support ++//===--------------------------------------------------------------------===// ++include "AMDILIntrinsics.td" +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILIntrinsicInfo.cpp llvm-r600/lib/Target/R600/AMDILIntrinsicInfo.cpp +--- llvm-3.2.src/lib/Target/R600/AMDILIntrinsicInfo.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILIntrinsicInfo.cpp 2013-01-25 19:43:57.446716388 +0100 +@@ -0,0 +1,79 @@ ++//===- AMDILIntrinsicInfo.cpp - AMDGPU Intrinsic Information ------*- C++ -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//==-----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief AMDGPU Implementation of the IntrinsicInfo class. 
++// ++//===-----------------------------------------------------------------------===// ++ ++#include "AMDILIntrinsicInfo.h" ++#include "AMDIL.h" ++#include "AMDGPUSubtarget.h" ++#include "llvm/DerivedTypes.h" ++#include "llvm/Intrinsics.h" ++#include "llvm/Module.h" ++ ++using namespace llvm; ++ ++#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN ++#include "AMDGPUGenIntrinsics.inc" ++#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN ++ ++AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm) ++ : TargetIntrinsicInfo() { ++} ++ ++std::string ++AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys, ++ unsigned int numTys) const { ++ static const char* const names[] = { ++#define GET_INTRINSIC_NAME_TABLE ++#include "AMDGPUGenIntrinsics.inc" ++#undef GET_INTRINSIC_NAME_TABLE ++ }; ++ ++ if (IntrID < Intrinsic::num_intrinsics) { ++ return std::string(); // Not a target intrinsic: empty name (returning 0 would construct a std::string from a null pointer). ++ } ++ assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics ++ && "Invalid intrinsic ID"); ++ ++ std::string Result(names[IntrID - Intrinsic::num_intrinsics]); ++ return Result; ++} ++ ++unsigned int ++AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const { ++#define GET_FUNCTION_RECOGNIZER ++#include "AMDGPUGenIntrinsics.inc" ++#undef GET_FUNCTION_RECOGNIZER ++ AMDGPUIntrinsic::ID IntrinsicID ++ = (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; ++ IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name); ++ ++ if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) { ++ return IntrinsicID; ++ } ++ return 0; ++} ++ ++bool ++AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { ++ // Overload Table ++#define GET_INTRINSIC_OVERLOAD_TABLE ++#include "AMDGPUGenIntrinsics.inc" ++#undef GET_INTRINSIC_OVERLOAD_TABLE ++} ++ ++Function* ++AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, ++ Type **Tys, ++ unsigned numTys) const { ++ assert(!"Not implemented"); return 0; // Explicit null result so NDEBUG builds do not fall off the end of a non-void function. ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILIntrinsicInfo.h llvm-r600/lib/Target/R600/AMDILIntrinsicInfo.h +---
llvm-3.2.src/lib/Target/R600/AMDILIntrinsicInfo.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILIntrinsicInfo.h 2013-01-25 19:43:57.446716388 +0100 +@@ -0,0 +1,49 @@ ++//===- AMDILIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//==-----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. ++// ++//===-----------------------------------------------------------------------===// ++#ifndef AMDIL_INTRINSICS_H ++#define AMDIL_INTRINSICS_H ++ ++#include "llvm/Intrinsics.h" ++#include "llvm/Target/TargetIntrinsicInfo.h" ++ ++namespace llvm { ++class TargetMachine; ++ ++namespace AMDGPUIntrinsic { ++enum ID { ++ last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, ++#define GET_INTRINSIC_ENUM_VALUES ++#include "AMDGPUGenIntrinsics.inc" ++#undef GET_INTRINSIC_ENUM_VALUES ++ , num_AMDGPU_intrinsics ++}; ++ ++} // end namespace AMDGPUIntrinsic ++ ++class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { ++public: ++ AMDGPUIntrinsicInfo(TargetMachine *tm); ++ std::string getName(unsigned int IntrId, Type **Tys = 0, ++ unsigned int numTys = 0) const; ++ unsigned int lookupName(const char *Name, unsigned int Len) const; ++ bool isOverloaded(unsigned int IID) const; ++ Function *getDeclaration(Module *M, unsigned int ID, ++ Type **Tys = 0, ++ unsigned int numTys = 0) const; ++}; ++ ++} // end namespace llvm ++ ++#endif // AMDIL_INTRINSICS_H ++ +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILIntrinsics.td llvm-r600/lib/Target/R600/AMDILIntrinsics.td +--- llvm-3.2.src/lib/Target/R600/AMDILIntrinsics.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILIntrinsics.td 2013-01-25 19:43:57.446716388 +0100 +@@ -0,0 +1,242 
@@ ++//===- AMDILIntrinsics.td - Defines AMDIL Intrinscs -*- tablegen -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//==-----------------------------------------------------------------------===// ++// ++// This file defines all of the amdil-specific intrinsics ++// ++//===---------------------------------------------------------------===// ++//===--------------------------------------------------------------------===// ++// Intrinsic classes ++// Generic versions of the above classes but for Target specific intrinsics ++// instead of SDNode patterns. ++//===--------------------------------------------------------------------===// ++let TargetPrefix = "AMDIL", isTarget = 1 in { ++ class VoidIntLong : ++ Intrinsic<[llvm_i64_ty], [], []>; ++ class VoidIntInt : ++ Intrinsic<[llvm_i32_ty], [], []>; ++ class VoidIntBool : ++ Intrinsic<[llvm_i32_ty], [], []>; ++ class UnaryIntInt : ++ Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; ++ class UnaryIntFloat : ++ Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; ++ class ConvertIntFTOI : ++ Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>; ++ class ConvertIntITOF : ++ Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty], [IntrNoMem]>; ++ class UnaryIntNoRetInt : ++ Intrinsic<[], [llvm_anyint_ty], []>; ++ class UnaryIntNoRetFloat : ++ Intrinsic<[], [llvm_anyfloat_ty], []>; ++ class BinaryIntInt : ++ Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; ++ class BinaryIntFloat : ++ Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; ++ class BinaryIntNoRetInt : ++ Intrinsic<[], [llvm_anyint_ty, LLVMMatchType<0>], []>; ++ class BinaryIntNoRetFloat : ++ Intrinsic<[], [llvm_anyfloat_ty, LLVMMatchType<0>], []>; ++ class TernaryIntInt : ++ Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, ++ LLVMMatchType<0>, 
LLVMMatchType<0>], [IntrNoMem]>; ++ class TernaryIntFloat : ++ Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, ++ LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; ++ class QuaternaryIntInt : ++ Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, ++ LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; ++ class UnaryAtomicInt : ++ Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; ++ class BinaryAtomicInt : ++ Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; ++ class TernaryAtomicInt : ++ Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; ++ class UnaryAtomicIntNoRet : ++ Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; ++ class BinaryAtomicIntNoRet : ++ Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; ++ class TernaryAtomicIntNoRet : ++ Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; ++} ++ ++let TargetPrefix = "AMDIL", isTarget = 1 in { ++ def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt; ++ ++ def int_AMDIL_bit_extract_i32 : GCCBuiltin<"__amdil_ibit_extract">, ++ TernaryIntInt; ++ def int_AMDIL_bit_extract_u32 : GCCBuiltin<"__amdil_ubit_extract">, ++ TernaryIntInt; ++ def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">, ++ UnaryIntInt; ++ def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">, ++ UnaryIntInt; ++ def int_AMDIL_bit_find_first_lo : GCCBuiltin<"__amdil_ffb_lo">, ++ UnaryIntInt; ++ def int_AMDIL_bit_find_first_hi : GCCBuiltin<"__amdil_ffb_hi">, ++ UnaryIntInt; ++ def int_AMDIL_bit_find_first_sgn : GCCBuiltin<"__amdil_ffb_signed">, ++ UnaryIntInt; ++ def int_AMDIL_media_bitalign : GCCBuiltin<"__amdil_bitalign">, ++ TernaryIntInt; ++ def int_AMDIL_media_bytealign : GCCBuiltin<"__amdil_bytealign">, ++ TernaryIntInt; ++ def int_AMDIL_bit_insert_u32 : GCCBuiltin<"__amdil_ubit_insert">, ++ QuaternaryIntInt; ++ def int_AMDIL_bfi 
: GCCBuiltin<"__amdil_bfi">, ++ TernaryIntInt; ++ def int_AMDIL_bfm : GCCBuiltin<"__amdil_bfm">, ++ BinaryIntInt; ++ def int_AMDIL_mad_i32 : GCCBuiltin<"__amdil_imad">, ++ TernaryIntInt; ++ def int_AMDIL_mad_u32 : GCCBuiltin<"__amdil_umad">, ++ TernaryIntInt; ++ def int_AMDIL_mad : GCCBuiltin<"__amdil_mad">, ++ TernaryIntFloat; ++ def int_AMDIL_mulhi_i32 : GCCBuiltin<"__amdil_imul_high">, ++ BinaryIntInt; ++ def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">, ++ BinaryIntInt; ++ def int_AMDIL_mul24_i32 : GCCBuiltin<"__amdil_imul24">, ++ BinaryIntInt; ++ def int_AMDIL_mul24_u32 : GCCBuiltin<"__amdil_umul24">, ++ BinaryIntInt; ++ def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">, ++ BinaryIntInt; ++ def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">, ++ BinaryIntInt; ++ def int_AMDIL_mad24_i32 : GCCBuiltin<"__amdil_imad24">, ++ TernaryIntInt; ++ def int_AMDIL_mad24_u32 : GCCBuiltin<"__amdil_umad24">, ++ TernaryIntInt; ++ def int_AMDIL_carry_i32 : GCCBuiltin<"__amdil_carry">, ++ BinaryIntInt; ++ def int_AMDIL_borrow_i32 : GCCBuiltin<"__amdil_borrow">, ++ BinaryIntInt; ++ def int_AMDIL_min_i32 : GCCBuiltin<"__amdil_imin">, ++ BinaryIntInt; ++ def int_AMDIL_min_u32 : GCCBuiltin<"__amdil_umin">, ++ BinaryIntInt; ++ def int_AMDIL_min : GCCBuiltin<"__amdil_min">, ++ BinaryIntFloat; ++ def int_AMDIL_max_i32 : GCCBuiltin<"__amdil_imax">, ++ BinaryIntInt; ++ def int_AMDIL_max_u32 : GCCBuiltin<"__amdil_umax">, ++ BinaryIntInt; ++ def int_AMDIL_max : GCCBuiltin<"__amdil_max">, ++ BinaryIntFloat; ++ def int_AMDIL_media_lerp_u4 : GCCBuiltin<"__amdil_u4lerp">, ++ TernaryIntInt; ++ def int_AMDIL_media_sad : GCCBuiltin<"__amdil_sad">, ++ TernaryIntInt; ++ def int_AMDIL_media_sad_hi : GCCBuiltin<"__amdil_sadhi">, ++ TernaryIntInt; ++ def int_AMDIL_fraction : GCCBuiltin<"__amdil_fraction">, ++ UnaryIntFloat; ++ def int_AMDIL_clamp : GCCBuiltin<"__amdil_clamp">, ++ TernaryIntFloat; ++ def int_AMDIL_pireduce : GCCBuiltin<"__amdil_pireduce">, ++ 
UnaryIntFloat; ++ def int_AMDIL_round_nearest : GCCBuiltin<"__amdil_round_nearest">, ++ UnaryIntFloat; ++ def int_AMDIL_round_neginf : GCCBuiltin<"__amdil_round_neginf">, ++ UnaryIntFloat; ++ def int_AMDIL_round_zero : GCCBuiltin<"__amdil_round_zero">, ++ UnaryIntFloat; ++ def int_AMDIL_acos : GCCBuiltin<"__amdil_acos">, ++ UnaryIntFloat; ++ def int_AMDIL_atan : GCCBuiltin<"__amdil_atan">, ++ UnaryIntFloat; ++ def int_AMDIL_asin : GCCBuiltin<"__amdil_asin">, ++ UnaryIntFloat; ++ def int_AMDIL_cos : GCCBuiltin<"__amdil_cos">, ++ UnaryIntFloat; ++ def int_AMDIL_cos_vec : GCCBuiltin<"__amdil_cos_vec">, ++ UnaryIntFloat; ++ def int_AMDIL_tan : GCCBuiltin<"__amdil_tan">, ++ UnaryIntFloat; ++ def int_AMDIL_sin : GCCBuiltin<"__amdil_sin">, ++ UnaryIntFloat; ++ def int_AMDIL_sin_vec : GCCBuiltin<"__amdil_sin_vec">, ++ UnaryIntFloat; ++ def int_AMDIL_pow : GCCBuiltin<"__amdil_pow">, BinaryIntFloat; ++ def int_AMDIL_div : GCCBuiltin<"__amdil_div">, BinaryIntFloat; ++ def int_AMDIL_udiv : GCCBuiltin<"__amdil_udiv">, BinaryIntInt; ++ def int_AMDIL_sqrt: GCCBuiltin<"__amdil_sqrt">, ++ UnaryIntFloat; ++ def int_AMDIL_sqrt_vec: GCCBuiltin<"__amdil_sqrt_vec">, ++ UnaryIntFloat; ++ def int_AMDIL_exp : GCCBuiltin<"__amdil_exp">, ++ UnaryIntFloat; ++ def int_AMDIL_exp_vec : GCCBuiltin<"__amdil_exp_vec">, ++ UnaryIntFloat; ++ def int_AMDIL_exn : GCCBuiltin<"__amdil_exn">, ++ UnaryIntFloat; ++ def int_AMDIL_log_vec : GCCBuiltin<"__amdil_log_vec">, ++ UnaryIntFloat; ++ def int_AMDIL_ln : GCCBuiltin<"__amdil_ln">, ++ UnaryIntFloat; ++ def int_AMDIL_sign: GCCBuiltin<"__amdil_sign">, ++ UnaryIntFloat; ++ def int_AMDIL_fma: GCCBuiltin<"__amdil_fma">, ++ TernaryIntFloat; ++ def int_AMDIL_rsq : GCCBuiltin<"__amdil_rsq">, ++ UnaryIntFloat; ++ def int_AMDIL_rsq_vec : GCCBuiltin<"__amdil_rsq_vec">, ++ UnaryIntFloat; ++ def int_AMDIL_length : GCCBuiltin<"__amdil_length">, ++ UnaryIntFloat; ++ def int_AMDIL_lerp : GCCBuiltin<"__amdil_lerp">, ++ TernaryIntFloat; ++ def int_AMDIL_media_sad4 : 
GCCBuiltin<"__amdil_sad4">, ++ Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty, ++ llvm_v4i32_ty, llvm_i32_ty], []>; ++ ++ def int_AMDIL_frexp_f64 : GCCBuiltin<"__amdil_frexp">, ++ Intrinsic<[llvm_v2i64_ty], [llvm_double_ty], []>; ++ def int_AMDIL_ldexp : GCCBuiltin<"__amdil_ldexp">, ++ Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyint_ty], []>; ++ def int_AMDIL_drcp : GCCBuiltin<"__amdil_rcp">, ++ Intrinsic<[llvm_double_ty], [llvm_double_ty], []>; ++ def int_AMDIL_convert_f16_f32 : GCCBuiltin<"__amdil_half_to_float">, ++ ConvertIntITOF; ++ def int_AMDIL_convert_f32_f16 : GCCBuiltin<"__amdil_float_to_half">, ++ ConvertIntFTOI; ++ def int_AMDIL_convert_f32_i32_rpi : GCCBuiltin<"__amdil_float_to_int_rpi">, ++ ConvertIntFTOI; ++ def int_AMDIL_convert_f32_i32_flr : GCCBuiltin<"__amdil_float_to_int_flr">, ++ ConvertIntFTOI; ++ def int_AMDIL_convert_f32_f16_near : GCCBuiltin<"__amdil_float_to_half_near">, ++ ConvertIntFTOI; ++ def int_AMDIL_convert_f32_f16_neg_inf : GCCBuiltin<"__amdil_float_to_half_neg_inf">, ++ ConvertIntFTOI; ++ def int_AMDIL_convert_f32_f16_plus_inf : GCCBuiltin<"__amdil_float_to_half_plus_inf">, ++ ConvertIntFTOI; ++ def int_AMDIL_media_convert_f2v4u8 : GCCBuiltin<"__amdil_f_2_u4">, ++ Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], []>; ++ def int_AMDIL_media_unpack_byte_0 : GCCBuiltin<"__amdil_unpack_0">, ++ ConvertIntITOF; ++ def int_AMDIL_media_unpack_byte_1 : GCCBuiltin<"__amdil_unpack_1">, ++ ConvertIntITOF; ++ def int_AMDIL_media_unpack_byte_2 : GCCBuiltin<"__amdil_unpack_2">, ++ ConvertIntITOF; ++ def int_AMDIL_media_unpack_byte_3 : GCCBuiltin<"__amdil_unpack_3">, ++ ConvertIntITOF; ++ def int_AMDIL_dp2_add : GCCBuiltin<"__amdil_dp2_add">, ++ Intrinsic<[llvm_float_ty], [llvm_v2f32_ty, ++ llvm_v2f32_ty, llvm_float_ty], []>; ++ def int_AMDIL_dp2 : GCCBuiltin<"__amdil_dp2">, ++ Intrinsic<[llvm_float_ty], [llvm_v2f32_ty, ++ llvm_v2f32_ty], []>; ++ def int_AMDIL_dp3 : GCCBuiltin<"__amdil_dp3">, ++ Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, ++ 
llvm_v4f32_ty], []>; ++ def int_AMDIL_dp4 : GCCBuiltin<"__amdil_dp4">, ++ Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, ++ llvm_v4f32_ty], []>; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILISelDAGToDAG.cpp llvm-r600/lib/Target/R600/AMDILISelDAGToDAG.cpp +--- llvm-3.2.src/lib/Target/R600/AMDILISelDAGToDAG.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILISelDAGToDAG.cpp 2013-01-25 19:43:57.443383054 +0100 +@@ -0,0 +1,567 @@ ++//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//==-----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Defines an instruction selector for the AMDGPU target. ++// ++//===----------------------------------------------------------------------===// ++#include "AMDGPUInstrInfo.h" ++#include "AMDGPUISelLowering.h" // For AMDGPUISD ++#include "AMDGPURegisterInfo.h" ++#include "AMDILDevices.h" ++#include "R600InstrInfo.h" ++#include "llvm/ADT/ValueMap.h" ++#include "llvm/CodeGen/PseudoSourceValue.h" ++#include "llvm/CodeGen/SelectionDAGISel.h" ++#include "llvm/Support/Compiler.h" ++#include "llvm/CodeGen/SelectionDAG.h" ++#include ++#include ++ ++using namespace llvm; ++ ++//===----------------------------------------------------------------------===// ++// Instruction Selector Implementation ++//===----------------------------------------------------------------------===// ++ ++namespace { ++/// AMDGPU specific code to select AMDGPU machine instructions for ++/// SelectionDAG operations. ++class AMDGPUDAGToDAGISel : public SelectionDAGISel { ++ // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can ++ // make the right decision when generating code for different targets. 
++ const AMDGPUSubtarget &Subtarget; ++public: ++ AMDGPUDAGToDAGISel(TargetMachine &TM); ++ virtual ~AMDGPUDAGToDAGISel(); ++ ++ SDNode *Select(SDNode *N); ++ virtual const char *getPassName() const; ++ ++private: ++ inline SDValue getSmallIPtrImm(unsigned Imm); ++ bool FoldOperands(unsigned, const R600InstrInfo *, std::vector &); ++ ++ // Complex pattern selectors ++ bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); ++ bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); ++ bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); ++ ++ static bool checkType(const Value *ptr, unsigned int addrspace); ++ static const Value *getBasePointerValue(const Value *V); ++ ++ static bool isGlobalStore(const StoreSDNode *N); ++ static bool isPrivateStore(const StoreSDNode *N); ++ static bool isLocalStore(const StoreSDNode *N); ++ static bool isRegionStore(const StoreSDNode *N); ++ ++ static bool isCPLoad(const LoadSDNode *N); ++ static bool isConstantLoad(const LoadSDNode *N, int cbID); ++ static bool isGlobalLoad(const LoadSDNode *N); ++ static bool isParamLoad(const LoadSDNode *N); ++ static bool isPrivateLoad(const LoadSDNode *N); ++ static bool isLocalLoad(const LoadSDNode *N); ++ static bool isRegionLoad(const LoadSDNode *N); ++ ++ bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); ++ bool SelectGlobalValueVariableOffset(SDValue Addr, ++ SDValue &BaseReg, SDValue& Offset); ++ bool SelectADDR8BitOffset(SDValue Addr, SDValue& Base, SDValue& Offset); ++ bool SelectADDRReg(SDValue Addr, SDValue& Base, SDValue& Offset); ++ bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); ++ ++ // Include the pieces autogenerated from the target description. ++#include "AMDGPUGenDAGISel.inc" ++}; ++} // end anonymous namespace ++ ++/// \brief This pass converts a legalized DAG into a AMDGPU-specific ++// DAG, ready for instruction scheduling. 
++FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM ++ ) { ++ return new AMDGPUDAGToDAGISel(TM); ++} ++ ++AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM ++ ) ++ : SelectionDAGISel(TM), Subtarget(TM.getSubtarget()) { ++} ++ ++AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() { ++} ++ ++SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) { ++ return CurDAG->getTargetConstant(Imm, MVT::i32); ++} ++ ++bool AMDGPUDAGToDAGISel::SelectADDRParam( ++ SDValue Addr, SDValue& R1, SDValue& R2) { ++ ++ if (Addr.getOpcode() == ISD::FrameIndex) { ++ if (FrameIndexSDNode *FIN = dyn_cast(Addr)) { ++ R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); ++ R2 = CurDAG->getTargetConstant(0, MVT::i32); ++ } else { ++ R1 = Addr; ++ R2 = CurDAG->getTargetConstant(0, MVT::i32); ++ } ++ } else if (Addr.getOpcode() == ISD::ADD) { ++ R1 = Addr.getOperand(0); ++ R2 = Addr.getOperand(1); ++ } else { ++ R1 = Addr; ++ R2 = CurDAG->getTargetConstant(0, MVT::i32); ++ } ++ return true; ++} ++ ++bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) { ++ if (Addr.getOpcode() == ISD::TargetExternalSymbol || ++ Addr.getOpcode() == ISD::TargetGlobalAddress) { ++ return false; ++ } ++ return SelectADDRParam(Addr, R1, R2); ++} ++ ++ ++bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) { ++ if (Addr.getOpcode() == ISD::TargetExternalSymbol || ++ Addr.getOpcode() == ISD::TargetGlobalAddress) { ++ return false; ++ } ++ ++ if (Addr.getOpcode() == ISD::FrameIndex) { ++ if (FrameIndexSDNode *FIN = dyn_cast(Addr)) { ++ R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64); ++ R2 = CurDAG->getTargetConstant(0, MVT::i64); ++ } else { ++ R1 = Addr; ++ R2 = CurDAG->getTargetConstant(0, MVT::i64); ++ } ++ } else if (Addr.getOpcode() == ISD::ADD) { ++ R1 = Addr.getOperand(0); ++ R2 = Addr.getOperand(1); ++ } else { ++ R1 = Addr; ++ R2 = CurDAG->getTargetConstant(0, MVT::i64); ++ } ++ return true; ++} ++ ++SDNode 
*AMDGPUDAGToDAGISel::Select(SDNode *N) { ++ unsigned int Opc = N->getOpcode(); ++ if (N->isMachineOpcode()) { ++ return NULL; // Already selected. ++ } ++ switch (Opc) { ++ default: break; ++ case ISD::FrameIndex: { ++ if (FrameIndexSDNode *FIN = dyn_cast(N)) { ++ unsigned int FI = FIN->getIndex(); ++ EVT OpVT = N->getValueType(0); ++ unsigned int NewOpc = AMDGPU::COPY; ++ SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i32); ++ return CurDAG->SelectNodeTo(N, NewOpc, OpVT, TFI); ++ } ++ break; ++ } ++ case ISD::ConstantFP: ++ case ISD::Constant: { ++ const AMDGPUSubtarget &ST = TM.getSubtarget(); ++ // XXX: Custom immediate lowering not implemented yet. Instead we use ++ // pseudo instructions defined in SIInstructions.td ++ if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { ++ break; ++ } ++ const R600InstrInfo *TII = static_cast(TM.getInstrInfo()); ++ ++ uint64_t ImmValue = 0; ++ unsigned ImmReg = AMDGPU::ALU_LITERAL_X; ++ ++ if (N->getOpcode() == ISD::ConstantFP) { ++ // XXX: 64-bit Immediates not supported yet ++ assert(N->getValueType(0) != MVT::f64); ++ ++ ConstantFPSDNode *C = dyn_cast(N); ++ APFloat Value = C->getValueAPF(); ++ float FloatValue = Value.convertToFloat(); ++ if (FloatValue == 0.0) { ++ ImmReg = AMDGPU::ZERO; ++ } else if (FloatValue == 0.5) { ++ ImmReg = AMDGPU::HALF; ++ } else if (FloatValue == 1.0) { ++ ImmReg = AMDGPU::ONE; ++ } else { ++ ImmValue = Value.bitcastToAPInt().getZExtValue(); ++ } ++ } else { ++ // XXX: 64-bit Immediates not supported yet ++ assert(N->getValueType(0) != MVT::i64); ++ ++ ConstantSDNode *C = dyn_cast(N); ++ if (C->getZExtValue() == 0) { ++ ImmReg = AMDGPU::ZERO; ++ } else if (C->getZExtValue() == 1) { ++ ImmReg = AMDGPU::ONE_INT; ++ } else { ++ ImmValue = C->getZExtValue(); ++ } ++ } ++ ++ for (SDNode::use_iterator Use = N->use_begin(), Next = llvm::next(Use); ++ Use != SDNode::use_end(); Use = Next) { ++ Next = llvm::next(Use); ++ std::vector Ops; ++ for (unsigned i = 0; i < Use->getNumOperands(); 
++i) { ++ Ops.push_back(Use->getOperand(i)); ++ } ++ ++ if (!Use->isMachineOpcode()) { ++ if (ImmReg == AMDGPU::ALU_LITERAL_X) { ++ // We can only use literal constants (e.g. AMDGPU::ZERO, ++ // AMDGPU::ONE, etc) in machine opcodes. ++ continue; ++ } ++ } else { ++ if (!TII->isALUInstr(Use->getMachineOpcode())) { ++ continue; ++ } ++ ++ int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(), R600Operands::IMM); ++ assert(ImmIdx != -1); ++ ++ // subtract one from ImmIdx, because the DST operand is usually index ++ // 0 for MachineInstrs, but we have no DST in the Ops vector. ++ ImmIdx--; ++ ++ // Check that we aren't already using an immediate. ++ // XXX: It's possible for an instruction to have more than one ++ // immediate operand, but this is not supported yet. ++ if (ImmReg == AMDGPU::ALU_LITERAL_X) { ++ ConstantSDNode *C = dyn_cast(Use->getOperand(ImmIdx)); ++ assert(C); ++ ++ if (C->getZExtValue() != 0) { ++ // This instruction is already using an immediate. ++ continue; ++ } ++ ++ // Set the immediate value ++ Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32); ++ } ++ } ++ // Set the immediate register ++ Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32); ++ ++ CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands()); ++ } ++ break; ++ } ++ } ++ SDNode *Result = SelectCode(N); ++ ++ // Fold operands of selected node ++ ++ const AMDGPUSubtarget &ST = TM.getSubtarget(); ++ if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { ++ const R600InstrInfo *TII = ++ static_cast(TM.getInstrInfo()); ++ if (Result && TII->isALUInstr(Result->getMachineOpcode())) { ++ bool IsModified = false; ++ do { ++ std::vector Ops; ++ for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end(); ++ I != E; ++I) ++ Ops.push_back(*I); ++ IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops); ++ if (IsModified) { ++ Result = CurDAG->MorphNodeTo(Result, Result->getOpcode(), ++ Result->getVTList(), Ops.data(), Ops.size()); ++ } 
++ } while (IsModified); ++ } ++ } ++ ++ return Result; ++} ++ ++bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode, ++ const R600InstrInfo *TII, std::vector &Ops) { ++ int OperandIdx[] = { ++ TII->getOperandIdx(Opcode, R600Operands::SRC0), ++ TII->getOperandIdx(Opcode, R600Operands::SRC1), ++ TII->getOperandIdx(Opcode, R600Operands::SRC2) ++ }; ++ int SelIdx[] = { ++ TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL), ++ TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL), ++ TII->getOperandIdx(Opcode, R600Operands::SRC2_SEL) ++ }; ++ for (unsigned i = 0; i < 3; i++) { ++ if (OperandIdx[i] < 0) ++ return false; ++ SDValue Operand = Ops[OperandIdx[i] - 1]; ++ switch (Operand.getOpcode()) { ++ case AMDGPUISD::CONST_ADDRESS: { ++ SDValue CstOffset; ++ if (!Operand.getValueType().isVector() && ++ SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) { ++ Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32); ++ Ops[SelIdx[i] - 1] = CstOffset; ++ return true; ++ } ++ } ++ break; ++ default: ++ break; ++ } ++ } ++ return false; ++} ++ ++bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) { ++ if (!ptr) { ++ return false; ++ } ++ Type *ptrType = ptr->getType(); ++ return dyn_cast(ptrType)->getAddressSpace() == addrspace; ++} ++ ++const Value * AMDGPUDAGToDAGISel::getBasePointerValue(const Value *V) { ++ if (!V) { ++ return NULL; ++ } ++ const Value *ret = NULL; ++ ValueMap ValueBitMap; ++ std::queue > ValueQueue; ++ ValueQueue.push(V); ++ while (!ValueQueue.empty()) { ++ V = ValueQueue.front(); ++ if (ValueBitMap.find(V) == ValueBitMap.end()) { ++ ValueBitMap[V] = true; ++ if (dyn_cast(V) && dyn_cast(V->getType())) { ++ ret = V; ++ break; ++ } else if (dyn_cast(V)) { ++ ret = V; ++ break; ++ } else if (dyn_cast(V)) { ++ const ConstantExpr *CE = dyn_cast(V); ++ if (CE) { ++ ValueQueue.push(CE->getOperand(0)); ++ } ++ } else if (const AllocaInst *AI = dyn_cast(V)) { ++ ret = AI; ++ break; ++ } else if (const 
Instruction *I = dyn_cast(V)) { ++ uint32_t numOps = I->getNumOperands(); ++ for (uint32_t x = 0; x < numOps; ++x) { ++ ValueQueue.push(I->getOperand(x)); ++ } ++ } else { ++ assert(!"Found a Value that we didn't know how to handle!"); ++ } ++ } ++ ValueQueue.pop(); ++ } ++ return ret; ++} ++ ++bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) { ++ return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS); ++} ++ ++bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) { ++ return (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS) ++ && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS) ++ && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS)); ++} ++ ++bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { ++ return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS); ++} ++ ++bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { ++ return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS); ++} ++ ++bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) { ++ if (checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)) { ++ return true; ++ } ++ MachineMemOperand *MMO = N->getMemOperand(); ++ const Value *V = MMO->getValue(); ++ const Value *BV = getBasePointerValue(V); ++ if (MMO ++ && MMO->getValue() ++ && ((V && dyn_cast(V)) ++ || (BV && dyn_cast( ++ getBasePointerValue(MMO->getValue()))))) { ++ return checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS); ++ } else { ++ return false; ++ } ++} ++ ++bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) { ++ return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS); ++} ++ ++bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) { ++ return checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS); ++} ++ ++bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) { ++ return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS); ++} ++ ++bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) { ++ return checkType(N->getSrcValue(), 
AMDGPUAS::REGION_ADDRESS); ++} ++ ++bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) { ++ MachineMemOperand *MMO = N->getMemOperand(); ++ if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) { ++ if (MMO) { ++ const Value *V = MMO->getValue(); ++ const PseudoSourceValue *PSV = dyn_cast(V); ++ if (PSV && PSV == PseudoSourceValue::getConstantPool()) { ++ return true; ++ } ++ } ++ } ++ return false; ++} ++ ++bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) { ++ if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) { ++ // Check to make sure we are not a constant pool load or a constant load ++ // that is marked as a private load ++ if (isCPLoad(N) || isConstantLoad(N, -1)) { ++ return false; ++ } ++ } ++ if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS) ++ && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS) ++ && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS) ++ && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS) ++ && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS) ++ && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS)) { ++ return true; ++ } ++ return false; ++} ++ ++const char *AMDGPUDAGToDAGISel::getPassName() const { ++ return "AMDGPU DAG->DAG Pattern Instruction Selection"; ++} ++ ++#ifdef DEBUGTMP ++#undef INT64_C ++#endif ++#undef DEBUGTMP ++ ++///==== AMDGPU Functions ====/// ++ ++bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, ++ SDValue& IntPtr) { ++ if (ConstantSDNode *Cst = dyn_cast(Addr)) { ++ IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true); ++ return true; ++ } ++ return false; ++} ++ ++bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, ++ SDValue& BaseReg, SDValue &Offset) { ++ if (!dyn_cast(Addr)) { ++ BaseReg = Addr; ++ Offset = CurDAG->getIntPtrConstant(0, true); ++ return true; ++ } ++ return false; ++} ++ ++bool AMDGPUDAGToDAGISel::SelectADDR8BitOffset(SDValue Addr, SDValue& Base, ++ SDValue& Offset) { ++ if 
(Addr.getOpcode() == ISD::TargetExternalSymbol || ++ Addr.getOpcode() == ISD::TargetGlobalAddress) { ++ return false; ++ } ++ ++ ++ if (Addr.getOpcode() == ISD::ADD) { ++ bool Match = false; ++ ++ // Find the base ptr and the offset ++ for (unsigned i = 0; i < Addr.getNumOperands(); i++) { ++ SDValue Arg = Addr.getOperand(i); ++ ConstantSDNode * OffsetNode = dyn_cast(Arg); ++ // This arg isn't a constant so it must be the base PTR. ++ if (!OffsetNode) { ++ Base = Addr.getOperand(i); ++ continue; ++ } ++ // Check if the constant argument fits in 8-bits. The offset is in bytes ++ // so we need to convert it to dwords. ++ if (isUInt<8>(OffsetNode->getZExtValue() >> 2)) { ++ Match = true; ++ Offset = CurDAG->getTargetConstant(OffsetNode->getZExtValue() >> 2, ++ MVT::i32); ++ } ++ } ++ return Match; ++ } ++ ++ // Default case, no offset ++ Base = Addr; ++ Offset = CurDAG->getTargetConstant(0, MVT::i32); ++ return true; ++} ++ ++bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, ++ SDValue &Offset) { ++ ConstantSDNode * IMMOffset; ++ ++ if (Addr.getOpcode() == ISD::ADD ++ && (IMMOffset = dyn_cast(Addr.getOperand(1))) ++ && isInt<16>(IMMOffset->getZExtValue())) { ++ ++ Base = Addr.getOperand(0); ++ Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32); ++ return true; ++ // If the pointer address is constant, we can move it to the offset field. 
++ } else if ((IMMOffset = dyn_cast(Addr)) ++ && isInt<16>(IMMOffset->getZExtValue())) { ++ Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), ++ CurDAG->getEntryNode().getDebugLoc(), ++ AMDGPU::ZERO, MVT::i32); ++ Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32); ++ return true; ++ } ++ ++ // Default case, no offset ++ Base = Addr; ++ Offset = CurDAG->getTargetConstant(0, MVT::i32); ++ return true; ++} ++ ++bool AMDGPUDAGToDAGISel::SelectADDRReg(SDValue Addr, SDValue& Base, ++ SDValue& Offset) { ++ if (Addr.getOpcode() == ISD::TargetExternalSymbol || ++ Addr.getOpcode() == ISD::TargetGlobalAddress || ++ Addr.getOpcode() != ISD::ADD) { ++ return false; ++ } ++ ++ Base = Addr.getOperand(0); ++ Offset = Addr.getOperand(1); ++ ++ return true; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILISelLowering.cpp llvm-r600/lib/Target/R600/AMDILISelLowering.cpp +--- llvm-3.2.src/lib/Target/R600/AMDILISelLowering.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILISelLowering.cpp 2013-01-25 19:43:57.443383054 +0100 +@@ -0,0 +1,651 @@ ++//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//==-----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief TargetLowering functions borrowed from AMDIL. 
++// ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPUISelLowering.h" ++#include "AMDGPURegisterInfo.h" ++#include "AMDILDevices.h" ++#include "AMDILIntrinsicInfo.h" ++#include "AMDGPUSubtarget.h" ++#include "llvm/CallingConv.h" ++#include "llvm/CodeGen/MachineFrameInfo.h" ++#include "llvm/CodeGen/MachineRegisterInfo.h" ++#include "llvm/CodeGen/PseudoSourceValue.h" ++#include "llvm/CodeGen/SelectionDAG.h" ++#include "llvm/CodeGen/SelectionDAGNodes.h" ++#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" ++#include "llvm/DerivedTypes.h" ++#include "llvm/Instructions.h" ++#include "llvm/Intrinsics.h" ++#include "llvm/Support/raw_ostream.h" ++#include "llvm/Target/TargetInstrInfo.h" ++#include "llvm/Target/TargetOptions.h" ++ ++using namespace llvm; ++//===----------------------------------------------------------------------===// ++// Calling Convention Implementation ++//===----------------------------------------------------------------------===// ++#include "AMDGPUGenCallingConv.inc" ++ ++//===----------------------------------------------------------------------===// ++// TargetLowering Implementation Help Functions End ++//===----------------------------------------------------------------------===// ++ ++//===----------------------------------------------------------------------===// ++// TargetLowering Class Implementation Begins ++//===----------------------------------------------------------------------===// ++void AMDGPUTargetLowering::InitAMDILLowering() { ++ int types[] = { ++ (int)MVT::i8, ++ (int)MVT::i16, ++ (int)MVT::i32, ++ (int)MVT::f32, ++ (int)MVT::f64, ++ (int)MVT::i64, ++ (int)MVT::v2i8, ++ (int)MVT::v4i8, ++ (int)MVT::v2i16, ++ (int)MVT::v4i16, ++ (int)MVT::v4f32, ++ (int)MVT::v4i32, ++ (int)MVT::v2f32, ++ (int)MVT::v2i32, ++ (int)MVT::v2f64, ++ (int)MVT::v2i64 ++ }; ++ ++ int IntTypes[] = { ++ (int)MVT::i8, ++ (int)MVT::i16, ++ (int)MVT::i32, ++ (int)MVT::i64 ++ }; ++ ++ int 
FloatTypes[] = { ++ (int)MVT::f32, ++ (int)MVT::f64 ++ }; ++ ++ int VectorTypes[] = { ++ (int)MVT::v2i8, ++ (int)MVT::v4i8, ++ (int)MVT::v2i16, ++ (int)MVT::v4i16, ++ (int)MVT::v4f32, ++ (int)MVT::v4i32, ++ (int)MVT::v2f32, ++ (int)MVT::v2i32, ++ (int)MVT::v2f64, ++ (int)MVT::v2i64 ++ }; ++ size_t NumTypes = sizeof(types) / sizeof(*types); ++ size_t NumFloatTypes = sizeof(FloatTypes) / sizeof(*FloatTypes); ++ size_t NumIntTypes = sizeof(IntTypes) / sizeof(*IntTypes); ++ size_t NumVectorTypes = sizeof(VectorTypes) / sizeof(*VectorTypes); ++ ++ const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget(); ++ // These are the current register classes that are ++ // supported ++ ++ for (unsigned int x = 0; x < NumTypes; ++x) { ++ MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x]; ++ ++ //FIXME: SIGN_EXTEND_INREG is not meaningful for floating point types ++ // We cannot sextinreg, expand to shifts ++ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom); ++ setOperationAction(ISD::SUBE, VT, Expand); ++ setOperationAction(ISD::SUBC, VT, Expand); ++ setOperationAction(ISD::ADDE, VT, Expand); ++ setOperationAction(ISD::ADDC, VT, Expand); ++ setOperationAction(ISD::BRCOND, VT, Custom); ++ setOperationAction(ISD::BR_JT, VT, Expand); ++ setOperationAction(ISD::BRIND, VT, Expand); ++ // TODO: Implement custom UREM/SREM routines ++ setOperationAction(ISD::SREM, VT, Expand); ++ setOperationAction(ISD::SMUL_LOHI, VT, Expand); ++ setOperationAction(ISD::UMUL_LOHI, VT, Expand); ++ if (VT != MVT::i64 && VT != MVT::v2i64) { ++ setOperationAction(ISD::SDIV, VT, Custom); ++ } ++ } ++ for (unsigned int x = 0; x < NumFloatTypes; ++x) { ++ MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x]; ++ ++ // IL does not have these operations for floating point types ++ setOperationAction(ISD::FP_ROUND_INREG, VT, Expand); ++ setOperationAction(ISD::SETOLT, VT, Expand); ++ setOperationAction(ISD::SETOGE, VT, Expand); ++ setOperationAction(ISD::SETOGT, VT, Expand); ++ 
setOperationAction(ISD::SETOLE, VT, Expand); ++ setOperationAction(ISD::SETULT, VT, Expand); ++ setOperationAction(ISD::SETUGE, VT, Expand); ++ setOperationAction(ISD::SETUGT, VT, Expand); ++ setOperationAction(ISD::SETULE, VT, Expand); ++ } ++ ++ for (unsigned int x = 0; x < NumIntTypes; ++x) { ++ MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x]; ++ ++ // GPU also does not have divrem function for signed or unsigned ++ setOperationAction(ISD::SDIVREM, VT, Expand); ++ ++ // GPU does not have [S|U]MUL_LOHI functions as a single instruction ++ setOperationAction(ISD::SMUL_LOHI, VT, Expand); ++ setOperationAction(ISD::UMUL_LOHI, VT, Expand); ++ ++ // GPU doesn't have a rotl, rotr, or byteswap instruction ++ setOperationAction(ISD::ROTR, VT, Expand); ++ setOperationAction(ISD::BSWAP, VT, Expand); ++ ++ // GPU doesn't have any counting operators ++ setOperationAction(ISD::CTPOP, VT, Expand); ++ setOperationAction(ISD::CTTZ, VT, Expand); ++ setOperationAction(ISD::CTLZ, VT, Expand); ++ } ++ ++ for (unsigned int ii = 0; ii < NumVectorTypes; ++ii) { ++ MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii]; ++ ++ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); ++ setOperationAction(ISD::SDIVREM, VT, Expand); ++ setOperationAction(ISD::SMUL_LOHI, VT, Expand); ++ // setOperationAction(ISD::VSETCC, VT, Expand); ++ setOperationAction(ISD::SELECT_CC, VT, Expand); ++ ++ } ++ if (STM.device()->isSupported(AMDGPUDeviceInfo::LongOps)) { ++ setOperationAction(ISD::MULHU, MVT::i64, Expand); ++ setOperationAction(ISD::MULHU, MVT::v2i64, Expand); ++ setOperationAction(ISD::MULHS, MVT::i64, Expand); ++ setOperationAction(ISD::MULHS, MVT::v2i64, Expand); ++ setOperationAction(ISD::ADD, MVT::v2i64, Expand); ++ setOperationAction(ISD::SREM, MVT::v2i64, Expand); ++ setOperationAction(ISD::Constant , MVT::i64 , Legal); ++ setOperationAction(ISD::SDIV, MVT::v2i64, Expand); ++ setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand); ++ 
setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand); ++ setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand); ++ setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand); ++ } ++ if (STM.device()->isSupported(AMDGPUDeviceInfo::DoubleOps)) { ++ // we support loading/storing v2f64 but not operations on the type ++ setOperationAction(ISD::FADD, MVT::v2f64, Expand); ++ setOperationAction(ISD::FSUB, MVT::v2f64, Expand); ++ setOperationAction(ISD::FMUL, MVT::v2f64, Expand); ++ setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand); ++ setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); ++ setOperationAction(ISD::ConstantFP , MVT::f64 , Legal); ++ // We want to expand vector conversions into their scalar ++ // counterparts. ++ setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand); ++ setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand); ++ setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand); ++ setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand); ++ setOperationAction(ISD::FABS, MVT::f64, Expand); ++ setOperationAction(ISD::FABS, MVT::v2f64, Expand); ++ } ++ // TODO: Fix the UDIV24 algorithm so it works for these ++ // types correctly. This needs vector comparisons ++ // for this to work correctly. ++ setOperationAction(ISD::UDIV, MVT::v2i8, Expand); ++ setOperationAction(ISD::UDIV, MVT::v4i8, Expand); ++ setOperationAction(ISD::UDIV, MVT::v2i16, Expand); ++ setOperationAction(ISD::UDIV, MVT::v4i16, Expand); ++ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom); ++ setOperationAction(ISD::SUBC, MVT::Other, Expand); ++ setOperationAction(ISD::ADDE, MVT::Other, Expand); ++ setOperationAction(ISD::ADDC, MVT::Other, Expand); ++ setOperationAction(ISD::BRCOND, MVT::Other, Custom); ++ setOperationAction(ISD::BR_JT, MVT::Other, Expand); ++ setOperationAction(ISD::BRIND, MVT::Other, Expand); ++ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); ++ ++ ++ // Use the default implementation. 
++ setOperationAction(ISD::ConstantFP , MVT::f32 , Legal); ++ setOperationAction(ISD::Constant , MVT::i32 , Legal); ++ ++ setSchedulingPreference(Sched::RegPressure); ++ setPow2DivIsCheap(false); ++ setSelectIsExpensive(true); ++ setJumpIsExpensive(true); ++ ++ maxStoresPerMemcpy = 4096; ++ maxStoresPerMemmove = 4096; ++ maxStoresPerMemset = 4096; ++ ++} ++ ++bool ++AMDGPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, ++ const CallInst &I, unsigned Intrinsic) const { ++ return false; ++} ++ ++// The backend supports 32 and 64 bit floating point immediates ++bool ++AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { ++ if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32 ++ || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) { ++ return true; ++ } else { ++ return false; ++ } ++} ++ ++bool ++AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { ++ if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32 ++ || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) { ++ return false; ++ } else { ++ return true; ++ } ++} ++ ++ ++// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to ++// be zero. Op is expected to be a target specific node. Used by DAG ++// combiner. 
++ ++void ++AMDGPUTargetLowering::computeMaskedBitsForTargetNode( ++ const SDValue Op, ++ APInt &KnownZero, ++ APInt &KnownOne, ++ const SelectionDAG &DAG, ++ unsigned Depth) const { ++ APInt KnownZero2; ++ APInt KnownOne2; ++ KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything ++ switch (Op.getOpcode()) { ++ default: break; ++ case ISD::SELECT_CC: ++ DAG.ComputeMaskedBits( ++ Op.getOperand(3), ++ KnownZero, ++ KnownOne, ++ Depth + 1 ++ ); ++ DAG.ComputeMaskedBits( ++ Op.getOperand(2), ++ KnownZero2, ++ KnownOne2, Depth + 1 ++ ); ++ assert((KnownZero & KnownOne) == 0 ++ && "Bits known to be one AND zero?"); ++ assert((KnownZero2 & KnownOne2) == 0 ++ && "Bits known to be one AND zero?"); ++ // Only known if known in both the true and false values (ops #2 and #3) ++ KnownOne &= KnownOne2; ++ KnownZero &= KnownZero2; ++ break; ++ }; ++} ++ ++//===----------------------------------------------------------------------===// ++// Other Lowering Hooks ++//===----------------------------------------------------------------------===// ++ ++SDValue ++AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { ++ EVT OVT = Op.getValueType(); ++ SDValue DST; ++ if (OVT.getScalarType() == MVT::i64) { ++ DST = LowerSDIV64(Op, DAG); ++ } else if (OVT.getScalarType() == MVT::i32) { ++ DST = LowerSDIV32(Op, DAG); ++ } else if (OVT.getScalarType() == MVT::i16 ++ || OVT.getScalarType() == MVT::i8) { ++ DST = LowerSDIV24(Op, DAG); ++ } else { ++ DST = SDValue(Op.getNode(), 0); ++ } ++ return DST; ++} ++ ++SDValue ++AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const { ++ EVT OVT = Op.getValueType(); ++ SDValue DST; ++ if (OVT.getScalarType() == MVT::i64) { ++ DST = LowerSREM64(Op, DAG); ++ } else if (OVT.getScalarType() == MVT::i32) { ++ DST = LowerSREM32(Op, DAG); ++ } else if (OVT.getScalarType() == MVT::i16) { ++ DST = LowerSREM16(Op, DAG); ++ } else if (OVT.getScalarType() == MVT::i8) { ++ DST = LowerSREM8(Op, DAG); ++ } else { ++ DST = SDValue(Op.getNode(), 0);
++ } ++ return DST; ++} ++ ++SDValue ++AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { ++ SDValue Data = Op.getOperand(0); ++ VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1)); ++ DebugLoc DL = Op.getDebugLoc(); ++ EVT DVT = Data.getValueType(); ++ EVT BVT = BaseType->getVT(); ++ unsigned baseBits = BVT.getScalarType().getSizeInBits(); ++ unsigned srcBits = DVT.isSimple() ? DVT.getScalarType().getSizeInBits() : 1; ++ unsigned shiftBits = srcBits - baseBits; ++ if (srcBits < 32) { ++ // If the op is less than 32 bits, then it needs to extend to 32bits ++ // so it can properly keep the upper bits valid. ++ EVT IVT = genIntType(32, DVT.isVector() ? DVT.getVectorNumElements() : 1); ++ Data = DAG.getNode(ISD::ZERO_EXTEND, DL, IVT, Data); ++ shiftBits = 32 - baseBits; ++ DVT = IVT; ++ } ++ SDValue Shift = DAG.getConstant(shiftBits, DVT); ++ // Shift left by 'Shift' bits. ++ Data = DAG.getNode(ISD::SHL, DL, DVT, Data, Shift); ++ // Signed shift Right by 'Shift' bits. ++ Data = DAG.getNode(ISD::SRA, DL, DVT, Data, Shift); ++ if (srcBits < 32) { ++ // Once the sign extension is done, the op needs to be converted to ++ // its original type. ++ Data = DAG.getSExtOrTrunc(Data, DL, Op.getOperand(0).getValueType()); ++ } ++ return Data; ++} ++EVT ++AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const { ++ int iSize = (size * numEle); ++ int vEle = (iSize >> ((size == 64) ?
6 : 5)); ++ if (!vEle) { ++ vEle = 1; ++ } ++ if (size == 64) { ++ if (vEle == 1) { ++ return EVT(MVT::i64); ++ } else { ++ return EVT(MVT::getVectorVT(MVT::i64, vEle)); ++ } ++ } else { ++ if (vEle == 1) { ++ return EVT(MVT::i32); ++ } else { ++ return EVT(MVT::getVectorVT(MVT::i32, vEle)); ++ } ++ } ++} ++ ++SDValue ++AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { ++ SDValue Chain = Op.getOperand(0); ++ SDValue Cond = Op.getOperand(1); ++ SDValue Jump = Op.getOperand(2); ++ SDValue Result; ++ Result = DAG.getNode( ++ AMDGPUISD::BRANCH_COND, ++ Op.getDebugLoc(), ++ Op.getValueType(), ++ Chain, Jump, Cond); ++ return Result; ++} ++ ++SDValue ++AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const { ++ DebugLoc DL = Op.getDebugLoc(); ++ EVT OVT = Op.getValueType(); ++ SDValue LHS = Op.getOperand(0); ++ SDValue RHS = Op.getOperand(1); ++ MVT INTTY; ++ MVT FLTTY; ++ if (!OVT.isVector()) { ++ INTTY = MVT::i32; ++ FLTTY = MVT::f32; ++ } else if (OVT.getVectorNumElements() == 2) { ++ INTTY = MVT::v2i32; ++ FLTTY = MVT::v2f32; ++ } else if (OVT.getVectorNumElements() == 4) { ++ INTTY = MVT::v4i32; ++ FLTTY = MVT::v4f32; ++ } ++ unsigned bitsize = OVT.getScalarType().getSizeInBits(); ++ // char|short jq = ia ^ ib; ++ SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS); ++ ++ // jq = jq >> (bitsize - 2) ++ jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT)); ++ ++ // jq = jq | 0x1 ++ jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT)); ++ ++ // jq = (int)jq ++ jq = DAG.getSExtOrTrunc(jq, DL, INTTY); ++ ++ // int ia = (int)LHS; ++ SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY); ++ ++ // int ib, (int)RHS; ++ SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY); ++ ++ // float fa = (float)ia; ++ SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia); ++ ++ // float fb = (float)ib; ++ SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib); ++ ++ // float fq = native_divide(fa, fb); ++ SDValue fq = 
DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb); ++ ++ // fq = trunc(fq); ++ fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq); ++ ++ // float fqneg = -fq; ++ SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq); ++ ++ // float fr = mad(fqneg, fb, fa); ++ SDValue fr = DAG.getNode(AMDGPUISD::MAD, DL, FLTTY, fqneg, fb, fa); ++ ++ // int iq = (int)fq; ++ SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); ++ ++ // fr = fabs(fr); ++ fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr); ++ ++ // fb = fabs(fb); ++ fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb); ++ ++ // int cv = fr >= fb; ++ SDValue cv; ++ if (INTTY == MVT::i32) { ++ cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); ++ } else { ++ cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); ++ } ++ // jq = (cv ? jq : 0); ++ jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq, ++ DAG.getConstant(0, OVT)); ++ // dst = iq + jq; ++ iq = DAG.getSExtOrTrunc(iq, DL, OVT); ++ iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq); ++ return iq; ++} ++ ++SDValue ++AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const { ++ DebugLoc DL = Op.getDebugLoc(); ++ EVT OVT = Op.getValueType(); ++ SDValue LHS = Op.getOperand(0); ++ SDValue RHS = Op.getOperand(1); ++ // The LowerSDIV32 function generates equivalent to the following IL. 
++ // mov r0, LHS ++ // mov r1, RHS ++ // ilt r10, r0, 0 ++ // ilt r11, r1, 0 ++ // iadd r0, r0, r10 ++ // iadd r1, r1, r11 ++ // ixor r0, r0, r10 ++ // ixor r1, r1, r11 ++ // udiv r0, r0, r1 ++ // ixor r10, r10, r11 ++ // iadd r0, r0, r10 ++ // ixor DST, r0, r10 ++ ++ // mov r0, LHS ++ SDValue r0 = LHS; ++ ++ // mov r1, RHS ++ SDValue r1 = RHS; ++ ++ // ilt r10, r0, 0 ++ SDValue r10 = DAG.getSelectCC(DL, ++ r0, DAG.getConstant(0, OVT), ++ DAG.getConstant(-1, MVT::i32), ++ DAG.getConstant(0, MVT::i32), ++ ISD::SETLT); ++ ++ // ilt r11, r1, 0 ++ SDValue r11 = DAG.getSelectCC(DL, ++ r1, DAG.getConstant(0, OVT), ++ DAG.getConstant(-1, MVT::i32), ++ DAG.getConstant(0, MVT::i32), ++ ISD::SETLT); ++ ++ // iadd r0, r0, r10 ++ r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); ++ ++ // iadd r1, r1, r11 ++ r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); ++ ++ // ixor r0, r0, r10 ++ r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); ++ ++ // ixor r1, r1, r11 ++ r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); ++ ++ // udiv r0, r0, r1 ++ r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1); ++ ++ // ixor r10, r10, r11 ++ r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11); ++ ++ // iadd r0, r0, r10 ++ r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); ++ ++ // ixor DST, r0, r10 ++ SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); ++ return DST; ++} ++ ++SDValue ++AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const { ++ return SDValue(Op.getNode(), 0); ++} ++ ++SDValue ++AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const { ++ DebugLoc DL = Op.getDebugLoc(); ++ EVT OVT = Op.getValueType(); ++ MVT INTTY = MVT::i32; ++ if (OVT == MVT::v2i8) { ++ INTTY = MVT::v2i32; ++ } else if (OVT == MVT::v4i8) { ++ INTTY = MVT::v4i32; ++ } ++ SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY); ++ SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY); ++ LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS); ++ LHS = DAG.getSExtOrTrunc(LHS, DL, OVT); ++ return LHS; ++} 
++ ++SDValue ++AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const { ++ DebugLoc DL = Op.getDebugLoc(); ++ EVT OVT = Op.getValueType(); ++ MVT INTTY = MVT::i32; ++ if (OVT == MVT::v2i16) { ++ INTTY = MVT::v2i32; ++ } else if (OVT == MVT::v4i16) { ++ INTTY = MVT::v4i32; ++ } ++ SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY); ++ SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY); ++ LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS); ++ LHS = DAG.getSExtOrTrunc(LHS, DL, OVT); ++ return LHS; ++} ++ ++SDValue ++AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const { ++ DebugLoc DL = Op.getDebugLoc(); ++ EVT OVT = Op.getValueType(); ++ SDValue LHS = Op.getOperand(0); ++ SDValue RHS = Op.getOperand(1); ++ // The LowerSREM32 function generates equivalent to the following IL. ++ // mov r0, LHS ++ // mov r1, RHS ++ // ilt r10, r0, 0 ++ // ilt r11, r1, 0 ++ // iadd r0, r0, r10 ++ // iadd r1, r1, r11 ++ // ixor r0, r0, r10 ++ // ixor r1, r1, r11 ++ // udiv r20, r0, r1 ++ // umul r20, r20, r1 ++ // sub r0, r0, r20 ++ // iadd r0, r0, r10 ++ // ixor DST, r0, r10 ++ ++ // mov r0, LHS ++ SDValue r0 = LHS; ++ ++ // mov r1, RHS ++ SDValue r1 = RHS; ++ ++ // ilt r10, r0, 0 ++ SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT); ++ ++ // ilt r11, r1, 0 ++ SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT); ++ ++ // iadd r0, r0, r10 ++ r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); ++ ++ // iadd r1, r1, r11 ++ r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); ++ ++ // ixor r0, r0, r10 ++ r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); ++ ++ // ixor r1, r1, r11 ++ r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); ++ ++ // udiv r20, r0, r1 (quotient of the absolute values; the remainder is r0 - r20 * r1) ++ SDValue r20 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1); ++ ++ // umul r20, r20, r1 ++ r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1); ++ ++ // sub r0, r0, r20 ++ r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20); ++ ++ // iadd r0, r0, r10 ++ r0 =
DAG.getNode(ISD::ADD, DL, OVT, r0, r10); ++ ++ // ixor DST, r0, r10 ++ SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); ++ return DST; ++} ++ ++SDValue ++AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const { ++ return SDValue(Op.getNode(), 0); ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILNIDevice.cpp llvm-r600/lib/Target/R600/AMDILNIDevice.cpp +--- llvm-3.2.src/lib/Target/R600/AMDILNIDevice.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILNIDevice.cpp 2013-01-25 19:43:57.446716388 +0100 +@@ -0,0 +1,65 @@ ++//===-- AMDILNIDevice.cpp - Device Info for Northern Islands devices ------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++/// \file ++//==-----------------------------------------------------------------------===// ++#include "AMDILNIDevice.h" ++#include "AMDILEvergreenDevice.h" ++#include "AMDGPUSubtarget.h" ++ ++using namespace llvm; ++ ++AMDGPUNIDevice::AMDGPUNIDevice(AMDGPUSubtarget *ST) ++ : AMDGPUEvergreenDevice(ST) { ++ std::string name = ST->getDeviceName(); ++ if (name == "caicos") { ++ DeviceFlag = OCL_DEVICE_CAICOS; ++ } else if (name == "turks") { ++ DeviceFlag = OCL_DEVICE_TURKS; ++ } else if (name == "cayman") { ++ DeviceFlag = OCL_DEVICE_CAYMAN; ++ } else { ++ DeviceFlag = OCL_DEVICE_BARTS; ++ } ++} ++AMDGPUNIDevice::~AMDGPUNIDevice() { ++} ++ ++size_t ++AMDGPUNIDevice::getMaxLDSSize() const { ++ if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { ++ return MAX_LDS_SIZE_900; ++ } else { ++ return 0; ++ } ++} ++ ++uint32_t ++AMDGPUNIDevice::getGeneration() const { ++ return AMDGPUDeviceInfo::HD6XXX; ++} ++ ++ ++AMDGPUCaymanDevice::AMDGPUCaymanDevice(AMDGPUSubtarget *ST) ++ : AMDGPUNIDevice(ST) { ++ setCaps(); ++} ++ ++AMDGPUCaymanDevice::~AMDGPUCaymanDevice() { ++} ++ ++void ++AMDGPUCaymanDevice::setCaps() { ++ if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) 
{ ++ mHWBits.set(AMDGPUDeviceInfo::DoubleOps); ++ mHWBits.set(AMDGPUDeviceInfo::FMA); ++ } ++ mHWBits.set(AMDGPUDeviceInfo::Signed24BitOps); ++ mSWBits.reset(AMDGPUDeviceInfo::Signed24BitOps); ++ mSWBits.set(AMDGPUDeviceInfo::ArenaSegment); ++} ++ +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILNIDevice.h llvm-r600/lib/Target/R600/AMDILNIDevice.h +--- llvm-3.2.src/lib/Target/R600/AMDILNIDevice.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILNIDevice.h 2013-01-25 19:43:57.446716388 +0100 +@@ -0,0 +1,57 @@ ++//===------- AMDILNIDevice.h - Define NI Device for AMDIL -*- C++ -*------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//==-----------------------------------------------------------------------===// ++/// \file ++/// \brief Interface for the subtarget data classes. ++/// ++/// This file will define the interface that each generation needs to ++/// implement in order to correctly answer queries on the capabilities of the ++/// specific hardware. ++//===---------------------------------------------------------------------===// ++#ifndef AMDILNIDEVICE_H ++#define AMDILNIDEVICE_H ++#include "AMDILEvergreenDevice.h" ++#include "AMDGPUSubtarget.h" ++ ++namespace llvm { ++ ++class AMDGPUSubtarget; ++//===---------------------------------------------------------------------===// ++// NI generation of devices and their respective sub classes ++//===---------------------------------------------------------------------===// ++ ++/// \brief The AMDGPUNIDevice is the base class for all Northern Island series of ++/// cards. ++/// ++/// It is very similiar to the AMDGPUEvergreenDevice, with the major ++/// exception being differences in wavefront size and hardware capabilities. 
The ++/// NI devices are all 64 wide wavefronts and also add support for signed 24 bit ++/// integer operations ++class AMDGPUNIDevice : public AMDGPUEvergreenDevice { ++public: ++ AMDGPUNIDevice(AMDGPUSubtarget*); ++ virtual ~AMDGPUNIDevice(); ++ virtual size_t getMaxLDSSize() const; ++ virtual uint32_t getGeneration() const; ++}; ++ ++/// Just as the AMDGPUCypressDevice is the double capable version of the ++/// AMDGPUEvergreenDevice, the AMDGPUCaymanDevice is the double capable version ++/// of the AMDGPUNIDevice. The other major difference is that the Cayman Device ++/// has 4 wide ALU's, whereas the rest of the NI family is a 5 wide. ++class AMDGPUCaymanDevice: public AMDGPUNIDevice { ++public: ++ AMDGPUCaymanDevice(AMDGPUSubtarget*); ++ virtual ~AMDGPUCaymanDevice(); ++private: ++ virtual void setCaps(); ++}; ++ ++static const unsigned int MAX_LDS_SIZE_900 = AMDGPUDevice::MAX_LDS_SIZE_800; ++} // namespace llvm ++#endif // AMDILNIDEVICE_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILPeepholeOptimizer.cpp llvm-r600/lib/Target/R600/AMDILPeepholeOptimizer.cpp +--- llvm-3.2.src/lib/Target/R600/AMDILPeepholeOptimizer.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILPeepholeOptimizer.cpp 2013-01-25 19:43:57.450049721 +0100 +@@ -0,0 +1,1256 @@ ++//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++/// \file ++//==-----------------------------------------------------------------------===// ++ ++#define DEBUG_TYPE "PeepholeOpt" ++#ifdef DEBUG ++#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) ++#else ++#define DEBUGME 0 ++#endif ++ ++#include "AMDILDevices.h" ++#include "AMDGPUInstrInfo.h" ++#include "llvm/ADT/Statistic.h" ++#include "llvm/ADT/StringExtras.h" ++#include "llvm/ADT/StringRef.h" ++#include "llvm/ADT/Twine.h" ++#include "llvm/Constants.h" ++#include "llvm/CodeGen/MachineFunction.h" ++#include "llvm/CodeGen/MachineFunctionAnalysis.h" ++#include "llvm/Function.h" ++#include "llvm/Instructions.h" ++#include "llvm/Module.h" ++#include "llvm/Support/Debug.h" ++#include "llvm/Support/MathExtras.h" ++ ++#include ++ ++#if 0 ++STATISTIC(PointerAssignments, "Number of dynamic pointer " ++ "assigments discovered"); ++STATISTIC(PointerSubtract, "Number of pointer subtractions discovered"); ++#endif ++ ++using namespace llvm; ++// The Peephole optimization pass is used to do simple last minute optimizations ++// that are required for correct code or to remove redundant functions ++namespace { ++ ++class OpaqueType; ++ ++class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass { ++public: ++ TargetMachine &TM; ++ static char ID; ++ AMDGPUPeepholeOpt(TargetMachine &tm); ++ ~AMDGPUPeepholeOpt(); ++ const char *getPassName() const; ++ bool runOnFunction(Function &F); ++ bool doInitialization(Module &M); ++ bool doFinalization(Module &M); ++ void getAnalysisUsage(AnalysisUsage &AU) const; ++protected: ++private: ++ // Function to initiate all of the instruction level optimizations. ++ bool instLevelOptimizations(BasicBlock::iterator *inst); ++ // Quick check to see if we need to dump all of the pointers into the ++ // arena. If this is correct, then we set all pointers to exist in arena. This ++ // is a workaround for aliasing of pointers in a struct/union. 
++ bool dumpAllIntoArena(Function &F); ++ // Because I don't want to invalidate any pointers while in the ++ // safeNestedForEachFunction. I push atomic conversions to a vector and handle ++ // it later. This function does the conversions if required. ++ void doAtomicConversionIfNeeded(Function &F); ++ // Because __amdil_is_constant cannot be properly evaluated if ++ // optimizations are disabled, the call's are placed in a vector ++ // and evaluated after the __amdil_image* functions are evaluated ++ // which should allow the __amdil_is_constant function to be ++ // evaluated correctly. ++ void doIsConstCallConversionIfNeeded(); ++ bool mChanged; ++ bool mDebug; ++ bool mConvertAtomics; ++ CodeGenOpt::Level optLevel; ++ // Run a series of tests to see if we can optimize a CALL instruction. ++ bool optimizeCallInst(BasicBlock::iterator *bbb); ++ // A peephole optimization to optimize bit extract sequences. ++ bool optimizeBitExtract(Instruction *inst); ++ // A peephole optimization to optimize bit insert sequences. ++ bool optimizeBitInsert(Instruction *inst); ++ bool setupBitInsert(Instruction *base, ++ Instruction *&src, ++ Constant *&mask, ++ Constant *&shift); ++ // Expand the bit field insert instruction on versions of OpenCL that ++ // don't support it. ++ bool expandBFI(CallInst *CI); ++ // Expand the bit field mask instruction on version of OpenCL that ++ // don't support it. ++ bool expandBFM(CallInst *CI); ++ // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in ++ // this case we need to expand them. These functions check for 24bit functions ++ // and then expand. ++ bool isSigned24BitOps(CallInst *CI); ++ void expandSigned24BitOps(CallInst *CI); ++ // One optimization that can occur is that if the required workgroup size is ++ // specified then the result of get_local_size is known at compile time and ++ // can be returned accordingly. 
++ bool isRWGLocalOpt(CallInst *CI); ++ // On northern island cards, the division is slightly less accurate than on ++ // previous generations, so we need to utilize a more accurate division. So we ++ // can translate the accurate divide to a normal divide on all other cards. ++ bool convertAccurateDivide(CallInst *CI); ++ void expandAccurateDivide(CallInst *CI); ++ // If the alignment is set incorrectly, it can produce really inefficient ++ // code. This checks for this scenario and fixes it if possible. ++ bool correctMisalignedMemOp(Instruction *inst); ++ ++ // If we are in no opt mode, then we need to make sure that ++ // local samplers are properly propagated as constant propagation ++ // doesn't occur and we need to know the value of kernel defined ++ // samplers at compile time. ++ bool propagateSamplerInst(CallInst *CI); ++ ++ // Helper functions ++ ++ // Group of functions that recursively calculate the size of a structure based ++ // on its sub-types. ++ size_t getTypeSize(Type * const T, bool dereferencePtr = false); ++ size_t getTypeSize(StructType * const ST, bool dereferencePtr = false); ++ size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false); ++ size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false); ++ size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false); ++ size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false); ++ size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false); ++ size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false); ++ ++ LLVMContext *mCTX; ++ Function *mF; ++ const AMDGPUSubtarget *mSTM; ++ SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs; ++ SmallVector<CallInst *, 16> isConstVec; ++}; // class AMDGPUPeepholeOpt ++ char AMDGPUPeepholeOpt::ID = 0; ++ ++// A template function that has two levels of looping before calling the ++// function with a pointer to the current iterator.
++template<class InputIterator, class SecondIterator, class Function> ++Function safeNestedForEach(InputIterator First, InputIterator Last, ++ SecondIterator S, Function F) { ++ for ( ; First != Last; ++First) { ++ SecondIterator sf, sl; ++ for (sf = First->begin(), sl = First->end(); ++ sf != sl; ) { ++ if (!F(&sf)) { ++ ++sf; ++ } ++ } ++ } ++ return F; ++} ++ ++} // anonymous namespace ++ ++namespace llvm { ++ FunctionPass * ++ createAMDGPUPeepholeOpt(TargetMachine &tm) { ++ return new AMDGPUPeepholeOpt(tm); ++ } ++} // llvm namespace ++ ++AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm) ++ : FunctionPass(ID), TM(tm) { ++ mDebug = DEBUGME; ++ optLevel = TM.getOptLevel(); ++ ++} ++ ++AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() { ++} ++ ++const char * ++AMDGPUPeepholeOpt::getPassName() const { ++ return "AMDGPU PeepHole Optimization Pass"; ++} ++ ++bool ++containsPointerType(Type *Ty) { ++ if (!Ty) { ++ return false; ++ } ++ switch(Ty->getTypeID()) { ++ default: ++ return false; ++ case Type::StructTyID: { ++ const StructType *ST = dyn_cast<StructType>(Ty); ++ for (StructType::element_iterator stb = ST->element_begin(), ++ ste = ST->element_end(); stb != ste; ++stb) { ++ if (!containsPointerType(*stb)) { ++ continue; ++ } ++ return true; ++ } ++ break; ++ } ++ case Type::VectorTyID: ++ case Type::ArrayTyID: ++ return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType()); ++ case Type::PointerTyID: ++ return true; ++ }; ++ return false; ++} ++ ++bool ++AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) { ++ bool dumpAll = false; ++ for (Function::const_arg_iterator cab = F.arg_begin(), ++ cae = F.arg_end(); cab != cae; ++cab) { ++ const Argument *arg = cab; ++ const PointerType *PT = dyn_cast<PointerType>(arg->getType()); ++ if (!PT) { ++ continue; ++ } ++ Type *DereferencedType = PT->getElementType(); ++ if (!dyn_cast<StructType>(DereferencedType) ++ ) { ++ continue; ++ } ++ if (!containsPointerType(DereferencedType)) { ++ continue; ++ } ++ // FIXME: Because a pointer inside of a struct/union may be aliased to ++ // another pointer we need to take the
conservative approach and place all ++ // pointers into the arena until more advanced detection is implemented. ++ dumpAll = true; ++ } ++ return dumpAll; ++} ++void ++AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() { ++ if (isConstVec.empty()) { ++ return; ++ } ++ for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) { ++ CallInst *CI = isConstVec[x]; ++ Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); ++ Type *aType = Type::getInt32Ty(*mCTX); ++ Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) ++ : ConstantInt::get(aType, 0); ++ CI->replaceAllUsesWith(Val); ++ CI->eraseFromParent(); ++ } ++ isConstVec.clear(); ++} ++void ++AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) { ++ // Don't do anything if we don't have any atomic operations. ++ if (atomicFuncs.empty()) { ++ return; ++ } ++ // Change the function name for the atomic if it is required ++ uint32_t size = atomicFuncs.size(); ++ for (uint32_t x = 0; x < size; ++x) { ++ atomicFuncs[x].first->setOperand( ++ atomicFuncs[x].first->getNumOperands()-1, ++ atomicFuncs[x].second); ++ ++ } ++ mChanged = true; ++ if (mConvertAtomics) { ++ return; ++ } ++} ++ ++bool ++AMDGPUPeepholeOpt::runOnFunction(Function &MF) { ++ mChanged = false; ++ mF = &MF; ++ mSTM = &TM.getSubtarget<AMDGPUSubtarget>(); ++ if (mDebug) { ++ MF.dump(); ++ } ++ mCTX = &MF.getType()->getContext(); ++ mConvertAtomics = true; ++ safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(), ++ std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations), ++ this)); ++ ++ doAtomicConversionIfNeeded(MF); ++ doIsConstCallConversionIfNeeded(); ++ ++ if (mDebug) { ++ MF.dump(); ++ } ++ return mChanged; ++} ++ ++bool ++AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) { ++ Instruction *inst = (*bbb); ++ CallInst *CI = dyn_cast<CallInst>(inst); ++ if (!CI) { ++ return false; ++ } ++ if (isSigned24BitOps(CI)) { ++ expandSigned24BitOps(CI); ++ ++(*bbb); ++ CI->eraseFromParent(); ++ return true; ++ } ++ if (propagateSamplerInst(CI)) { ++ return
false; ++ } ++ if (expandBFI(CI) || expandBFM(CI)) { ++ ++(*bbb); ++ CI->eraseFromParent(); ++ return true; ++ } ++ if (convertAccurateDivide(CI)) { ++ expandAccurateDivide(CI); ++ ++(*bbb); ++ CI->eraseFromParent(); ++ return true; ++ } ++ ++ StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName(); ++ if (calleeName.startswith("__amdil_is_constant")) { ++ // If we do not have optimizations, then this ++ // cannot be properly evaluated, so we add the ++ // call instruction to a vector and process ++ // them at the end of processing after the ++ // samplers have been correctly handled. ++ if (optLevel == CodeGenOpt::None) { ++ isConstVec.push_back(CI); ++ return false; ++ } else { ++ Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); ++ Type *aType = Type::getInt32Ty(*mCTX); ++ Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) ++ : ConstantInt::get(aType, 0); ++ CI->replaceAllUsesWith(Val); ++ ++(*bbb); ++ CI->eraseFromParent(); ++ return true; ++ } ++ } ++ ++ if (calleeName.equals("__amdil_is_asic_id_i32")) { ++ ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0)); ++ Type *aType = Type::getInt32Ty(*mCTX); ++ Value *Val = CV; ++ if (Val) { ++ Val = ConstantInt::get(aType, ++ mSTM->device()->getDeviceFlag() & CV->getZExtValue()); ++ } else { ++ Val = ConstantInt::get(aType, 0); ++ } ++ CI->replaceAllUsesWith(Val); ++ ++(*bbb); ++ CI->eraseFromParent(); ++ return true; ++ } ++ Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1)); ++ if (!F) { ++ return false; ++ } ++ if (F->getName().startswith("__atom") && !CI->getNumUses() ++ && F->getName().find("_xchg") == StringRef::npos) { ++ std::string buffer(F->getName().str() + "_noret"); ++ F = dyn_cast<Function>( ++ F->getParent()->getOrInsertFunction(buffer, F->getFunctionType())); ++ atomicFuncs.push_back(std::make_pair (CI, F)); ++ } ++ ++ if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment) ++ && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) { ++ return false; ++ } ++ if
(!mConvertAtomics) { ++ return false; ++ } ++ StringRef name = F->getName(); ++ if (name.startswith("__atom") && name.find("_g") != StringRef::npos) { ++ mConvertAtomics = false; ++ } ++ return false; ++} ++ ++bool ++AMDGPUPeepholeOpt::setupBitInsert(Instruction *base, ++ Instruction *&src, ++ Constant *&mask, ++ Constant *&shift) { ++ if (!base) { ++ if (mDebug) { ++ dbgs() << "Null pointer passed into function.\n"; ++ } ++ return false; ++ } ++ bool andOp = false; ++ if (base->getOpcode() == Instruction::Shl) { ++ shift = dyn_cast(base->getOperand(1)); ++ } else if (base->getOpcode() == Instruction::And) { ++ mask = dyn_cast(base->getOperand(1)); ++ andOp = true; ++ } else { ++ if (mDebug) { ++ dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n"; ++ } ++ // If the base is neither a Shl or a And, we don't fit any of the patterns above. ++ return false; ++ } ++ src = dyn_cast(base->getOperand(0)); ++ if (!src) { ++ if (mDebug) { ++ dbgs() << "Failed setup since the base operand is not an instruction!\n"; ++ } ++ return false; ++ } ++ // If we find an 'and' operation, then we don't need to ++ // find the next operation as we already know the ++ // bits that are valid at this point. ++ if (andOp) { ++ return true; ++ } ++ if (src->getOpcode() == Instruction::Shl && !shift) { ++ shift = dyn_cast(src->getOperand(1)); ++ src = dyn_cast(src->getOperand(0)); ++ } else if (src->getOpcode() == Instruction::And && !mask) { ++ mask = dyn_cast(src->getOperand(1)); ++ } ++ if (!mask && !shift) { ++ if (mDebug) { ++ dbgs() << "Failed setup since both mask and shift are NULL!\n"; ++ } ++ // Did not find a constant mask or a shift. 
++ return false; ++ } ++ return true; ++} ++bool ++AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) { ++ if (!inst) { ++ return false; ++ } ++ if (!inst->isBinaryOp()) { ++ return false; ++ } ++ if (inst->getOpcode() != Instruction::Or) { ++ return false; ++ } ++ if (optLevel == CodeGenOpt::None) { ++ return false; ++ } ++ // We want to do an optimization on a sequence of ops that in the end equals a ++ // single ISA instruction. ++ // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F) ++ // Some simplified versions of this pattern are as follows: ++ // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0 ++ // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E ++ // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B ++ // (A & B) | (D << F) when (1 << F) >= B ++ // (A << C) | (D & E) when (1 << C) >= E ++ if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { ++ // The HD4XXX hardware doesn't support the ubit_insert instruction. ++ return false; ++ } ++ Type *aType = inst->getType(); ++ bool isVector = aType->isVectorTy(); ++ int numEle = 1; ++ // This optimization only works on 32bit integers. ++ if (aType->getScalarType() ++ != Type::getInt32Ty(inst->getContext())) { ++ return false; ++ } ++ if (isVector) { ++ const VectorType *VT = dyn_cast(aType); ++ numEle = VT->getNumElements(); ++ // We currently cannot support more than 4 elements in a intrinsic and we ++ // cannot support Vec3 types. ++ if (numEle > 4 || numEle == 3) { ++ return false; ++ } ++ } ++ // TODO: Handle vectors. ++ if (isVector) { ++ if (mDebug) { ++ dbgs() << "!!! 
Vectors are not supported yet!\n"; ++ } ++ return false; ++ } ++ Instruction *LHSSrc = NULL, *RHSSrc = NULL; ++ Constant *LHSMask = NULL, *RHSMask = NULL; ++ Constant *LHSShift = NULL, *RHSShift = NULL; ++ Instruction *LHS = dyn_cast(inst->getOperand(0)); ++ Instruction *RHS = dyn_cast(inst->getOperand(1)); ++ if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) { ++ if (mDebug) { ++ dbgs() << "Found an OR Operation that failed setup!\n"; ++ inst->dump(); ++ if (LHS) { LHS->dump(); } ++ if (LHSSrc) { LHSSrc->dump(); } ++ if (LHSMask) { LHSMask->dump(); } ++ if (LHSShift) { LHSShift->dump(); } ++ } ++ // There was an issue with the setup for BitInsert. ++ return false; ++ } ++ if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) { ++ if (mDebug) { ++ dbgs() << "Found an OR Operation that failed setup!\n"; ++ inst->dump(); ++ if (RHS) { RHS->dump(); } ++ if (RHSSrc) { RHSSrc->dump(); } ++ if (RHSMask) { RHSMask->dump(); } ++ if (RHSShift) { RHSShift->dump(); } ++ } ++ // There was an issue with the setup for BitInsert. 
++ return false; ++ } ++ if (mDebug) { ++ dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n"; ++ dbgs() << "Op: "; inst->dump(); ++ dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; } ++ dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; } ++ dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; } ++ dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; } ++ dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; } ++ dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; } ++ dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; } ++ dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; } ++ } ++ Constant *offset = NULL; ++ Constant *width = NULL; ++ uint32_t lhsMaskVal = 0, rhsMaskVal = 0; ++ uint32_t lhsShiftVal = 0, rhsShiftVal = 0; ++ uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0; ++ uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0; ++ lhsMaskVal = (LHSMask ++ ? dyn_cast(LHSMask)->getZExtValue() : 0); ++ rhsMaskVal = (RHSMask ++ ? dyn_cast(RHSMask)->getZExtValue() : 0); ++ lhsShiftVal = (LHSShift ++ ? dyn_cast(LHSShift)->getZExtValue() : 0); ++ rhsShiftVal = (RHSShift ++ ? dyn_cast(RHSShift)->getZExtValue() : 0); ++ lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal; ++ rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal; ++ lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal; ++ rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal; ++ // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks). ++ if (mDebug) { ++ dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")"); ++ dbgs() << (LHSShift ? " << C)" : ")") << " | ((D" ; ++ dbgs() << (RHSMask ? 
" & E)" : ")"); ++ dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n"); ++ dbgs() << "A = LHSSrc\t\tD = RHSSrc \n"; ++ dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n"; ++ dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n"; ++ dbgs() << "width(B) = " << lhsMaskWidth; ++ dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n"; ++ dbgs() << "offset(B) = " << lhsMaskOffset; ++ dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n"; ++ dbgs() << "Constraints: \n"; ++ dbgs() << "\t(1) B ^ E == 0\n"; ++ dbgs() << "\t(2-LHS) B is a mask\n"; ++ dbgs() << "\t(2-LHS) E is a mask\n"; ++ dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n"; ++ dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n"; ++ } ++ if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) { ++ if (mDebug) { ++ dbgs() << lhsMaskVal << " ^ " << rhsMaskVal; ++ dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n"; ++ dbgs() << "Failed constraint 1!\n"; ++ } ++ return false; ++ } ++ if (mDebug) { ++ dbgs() << "LHS = " << lhsMaskOffset << ""; ++ dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = "; ++ dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)); ++ dbgs() << "\nRHS = " << rhsMaskOffset << ""; ++ dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = "; ++ dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)); ++ dbgs() << "\n"; ++ } ++ if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) { ++ offset = ConstantInt::get(aType, lhsMaskOffset, false); ++ width = ConstantInt::get(aType, lhsMaskWidth, false); ++ RHSSrc = RHS; ++ if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) { ++ if (mDebug) { ++ dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n"; ++ dbgs() << "Failed constraint 2!\n"; ++ } ++ return false; ++ } ++ if (!LHSShift) { ++ LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, ++ "MaskShr", LHS); ++ } else if (lhsShiftVal != lhsMaskOffset) { ++ LHSSrc = 
BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, ++ "MaskShr", LHS); ++ } ++ if (mDebug) { ++ dbgs() << "Optimizing LHS!\n"; ++ } ++ } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) { ++ offset = ConstantInt::get(aType, rhsMaskOffset, false); ++ width = ConstantInt::get(aType, rhsMaskWidth, false); ++ LHSSrc = RHSSrc; ++ RHSSrc = LHS; ++ if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) { ++ if (mDebug) { ++ dbgs() << "Non-Mask: " << rhsMaskVal << "\n"; ++ dbgs() << "Failed constraint 2!\n"; ++ } ++ return false; ++ } ++ if (!RHSShift) { ++ LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, ++ "MaskShr", RHS); ++ } else if (rhsShiftVal != rhsMaskOffset) { ++ LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, ++ "MaskShr", RHS); ++ } ++ if (mDebug) { ++ dbgs() << "Optimizing RHS!\n"; ++ } ++ } else { ++ if (mDebug) { ++ dbgs() << "Failed constraint 3!\n"; ++ } ++ return false; ++ } ++ if (mDebug) { ++ dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; } ++ dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; } ++ dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; } ++ dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; } ++ } ++ if (!offset || !width) { ++ if (mDebug) { ++ dbgs() << "Either width or offset are NULL, failed detection!\n"; ++ } ++ return false; ++ } ++ // Lets create the function signature. 
++ std::vector callTypes; ++ callTypes.push_back(aType); ++ callTypes.push_back(aType); ++ callTypes.push_back(aType); ++ callTypes.push_back(aType); ++ FunctionType *funcType = FunctionType::get(aType, callTypes, false); ++ std::string name = "__amdil_ubit_insert"; ++ if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; } ++ Function *Func = ++ dyn_cast(inst->getParent()->getParent()->getParent()-> ++ getOrInsertFunction(llvm::StringRef(name), funcType)); ++ Value *Operands[4] = { ++ width, ++ offset, ++ LHSSrc, ++ RHSSrc ++ }; ++ CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt"); ++ if (mDebug) { ++ dbgs() << "Old Inst: "; ++ inst->dump(); ++ dbgs() << "New Inst: "; ++ CI->dump(); ++ dbgs() << "\n\n"; ++ } ++ CI->insertBefore(inst); ++ inst->replaceAllUsesWith(CI); ++ return true; ++} ++ ++bool ++AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) { ++ if (!inst) { ++ return false; ++ } ++ if (!inst->isBinaryOp()) { ++ return false; ++ } ++ if (inst->getOpcode() != Instruction::And) { ++ return false; ++ } ++ if (optLevel == CodeGenOpt::None) { ++ return false; ++ } ++ // We want to do some simple optimizations on Shift right/And patterns. The ++ // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a ++ // value smaller than 32 and C is a mask. If C is a constant value, then the ++ // following transformation can occur. For signed integers, it turns into the ++ // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned ++ // integers, it turns into the function call dst = ++ // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract ++ // can be found in Section 7.9 of the ATI IL spec of the stream SDK for ++ // Evergreen hardware. ++ if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { ++ // This does not work on HD4XXX hardware. 
++ return false; ++ } ++ Type *aType = inst->getType(); ++ bool isVector = aType->isVectorTy(); ++ ++ // XXX Support vector types ++ if (isVector) { ++ return false; ++ } ++ int numEle = 1; ++ // This only works on 32bit integers ++ if (aType->getScalarType() ++ != Type::getInt32Ty(inst->getContext())) { ++ return false; ++ } ++ if (isVector) { ++ const VectorType *VT = dyn_cast(aType); ++ numEle = VT->getNumElements(); ++ // We currently cannot support more than 4 elements in a intrinsic and we ++ // cannot support Vec3 types. ++ if (numEle > 4 || numEle == 3) { ++ return false; ++ } ++ } ++ BinaryOperator *ShiftInst = dyn_cast(inst->getOperand(0)); ++ // If the first operand is not a shift instruction, then we can return as it ++ // doesn't match this pattern. ++ if (!ShiftInst || !ShiftInst->isShift()) { ++ return false; ++ } ++ // If we are a shift left, then we need don't match this pattern. ++ if (ShiftInst->getOpcode() == Instruction::Shl) { ++ return false; ++ } ++ bool isSigned = ShiftInst->isArithmeticShift(); ++ Constant *AndMask = dyn_cast(inst->getOperand(1)); ++ Constant *ShrVal = dyn_cast(ShiftInst->getOperand(1)); ++ // Lets make sure that the shift value and the and mask are constant integers. 
++ if (!AndMask || !ShrVal) { ++ return false; ++ } ++ Constant *newMaskConst; ++ Constant *shiftValConst; ++ if (isVector) { ++ // Handle the vector case ++ std::vector maskVals; ++ std::vector shiftVals; ++ ConstantVector *AndMaskVec = dyn_cast(AndMask); ++ ConstantVector *ShrValVec = dyn_cast(ShrVal); ++ Type *scalarType = AndMaskVec->getType()->getScalarType(); ++ assert(AndMaskVec->getNumOperands() == ++ ShrValVec->getNumOperands() && "cannot have a " ++ "combination where the number of elements to a " ++ "shift and an and are different!"); ++ for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) { ++ ConstantInt *AndCI = dyn_cast(AndMaskVec->getOperand(x)); ++ ConstantInt *ShiftIC = dyn_cast(ShrValVec->getOperand(x)); ++ if (!AndCI || !ShiftIC) { ++ return false; ++ } ++ uint32_t maskVal = (uint32_t)AndCI->getZExtValue(); ++ if (!isMask_32(maskVal)) { ++ return false; ++ } ++ maskVal = (uint32_t)CountTrailingOnes_32(maskVal); ++ uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue(); ++ // If the mask or shiftval is greater than the bitcount, then break out. ++ if (maskVal >= 32 || shiftVal >= 32) { ++ return false; ++ } ++ // If the mask val is greater than the the number of original bits left ++ // then this optimization is invalid. ++ if (maskVal > (32 - shiftVal)) { ++ return false; ++ } ++ maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned)); ++ shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned)); ++ } ++ newMaskConst = ConstantVector::get(maskVals); ++ shiftValConst = ConstantVector::get(shiftVals); ++ } else { ++ // Handle the scalar case ++ uint32_t maskVal = (uint32_t)dyn_cast(AndMask)->getZExtValue(); ++ // This must be a mask value where all lower bits are set to 1 and then any ++ // bit higher is set to 0. 
++ if (!isMask_32(maskVal)) { ++ return false; ++ } ++ maskVal = (uint32_t)CountTrailingOnes_32(maskVal); ++ // Count the number of bits set in the mask, this is the width of the ++ // resulting bit set that is extracted from the source value. ++ uint32_t shiftVal = (uint32_t)dyn_cast(ShrVal)->getZExtValue(); ++ // If the mask or shift val is greater than the bitcount, then break out. ++ if (maskVal >= 32 || shiftVal >= 32) { ++ return false; ++ } ++ // If the mask val is greater than the the number of original bits left then ++ // this optimization is invalid. ++ if (maskVal > (32 - shiftVal)) { ++ return false; ++ } ++ newMaskConst = ConstantInt::get(aType, maskVal, isSigned); ++ shiftValConst = ConstantInt::get(aType, shiftVal, isSigned); ++ } ++ // Lets create the function signature. ++ std::vector callTypes; ++ callTypes.push_back(aType); ++ callTypes.push_back(aType); ++ callTypes.push_back(aType); ++ FunctionType *funcType = FunctionType::get(aType, callTypes, false); ++ std::string name = "llvm.AMDGPU.bit.extract.u32"; ++ if (isVector) { ++ name += ".v" + itostr(numEle) + "i32"; ++ } else { ++ name += "."; ++ } ++ // Lets create the function. 
++ Function *Func = ++ dyn_cast(inst->getParent()->getParent()->getParent()-> ++ getOrInsertFunction(llvm::StringRef(name), funcType)); ++ Value *Operands[3] = { ++ ShiftInst->getOperand(0), ++ shiftValConst, ++ newMaskConst ++ }; ++ // Lets create the Call with the operands ++ CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt"); ++ CI->setDoesNotAccessMemory(); ++ CI->insertBefore(inst); ++ inst->replaceAllUsesWith(CI); ++ return true; ++} ++ ++bool ++AMDGPUPeepholeOpt::expandBFI(CallInst *CI) { ++ if (!CI) { ++ return false; ++ } ++ Value *LHS = CI->getOperand(CI->getNumOperands() - 1); ++ if (!LHS->getName().startswith("__amdil_bfi")) { ++ return false; ++ } ++ Type* type = CI->getOperand(0)->getType(); ++ Constant *negOneConst = NULL; ++ if (type->isVectorTy()) { ++ std::vector negOneVals; ++ negOneConst = ConstantInt::get(CI->getContext(), ++ APInt(32, StringRef("-1"), 10)); ++ for (size_t x = 0, ++ y = dyn_cast(type)->getNumElements(); x < y; ++x) { ++ negOneVals.push_back(negOneConst); ++ } ++ negOneConst = ConstantVector::get(negOneVals); ++ } else { ++ negOneConst = ConstantInt::get(CI->getContext(), ++ APInt(32, StringRef("-1"), 10)); ++ } ++ // __amdil_bfi => (A & B) | (~A & C) ++ BinaryOperator *lhs = ++ BinaryOperator::Create(Instruction::And, CI->getOperand(0), ++ CI->getOperand(1), "bfi_and", CI); ++ BinaryOperator *rhs = ++ BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst, ++ "bfi_not", CI); ++ rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2), ++ "bfi_and", CI); ++ lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI); ++ CI->replaceAllUsesWith(lhs); ++ return true; ++} ++ ++bool ++AMDGPUPeepholeOpt::expandBFM(CallInst *CI) { ++ if (!CI) { ++ return false; ++ } ++ Value *LHS = CI->getOperand(CI->getNumOperands() - 1); ++ if (!LHS->getName().startswith("__amdil_bfm")) { ++ return false; ++ } ++ // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f) ++ Constant 
*newMaskConst = NULL; ++ Constant *newShiftConst = NULL; ++ Type* type = CI->getOperand(0)->getType(); ++ if (type->isVectorTy()) { ++ std::vector newMaskVals, newShiftVals; ++ newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); ++ newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); ++ for (size_t x = 0, ++ y = dyn_cast(type)->getNumElements(); x < y; ++x) { ++ newMaskVals.push_back(newMaskConst); ++ newShiftVals.push_back(newShiftConst); ++ } ++ newMaskConst = ConstantVector::get(newMaskVals); ++ newShiftConst = ConstantVector::get(newShiftVals); ++ } else { ++ newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); ++ newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); ++ } ++ BinaryOperator *lhs = ++ BinaryOperator::Create(Instruction::And, CI->getOperand(0), ++ newMaskConst, "bfm_mask", CI); ++ lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst, ++ lhs, "bfm_shl", CI); ++ lhs = BinaryOperator::Create(Instruction::Sub, lhs, ++ newShiftConst, "bfm_sub", CI); ++ BinaryOperator *rhs = ++ BinaryOperator::Create(Instruction::And, CI->getOperand(1), ++ newMaskConst, "bfm_mask", CI); ++ lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI); ++ CI->replaceAllUsesWith(lhs); ++ return true; ++} ++ ++bool ++AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) { ++ Instruction *inst = (*bbb); ++ if (optimizeCallInst(bbb)) { ++ return true; ++ } ++ if (optimizeBitExtract(inst)) { ++ return false; ++ } ++ if (optimizeBitInsert(inst)) { ++ return false; ++ } ++ if (correctMisalignedMemOp(inst)) { ++ return false; ++ } ++ return false; ++} ++bool ++AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) { ++ LoadInst *linst = dyn_cast(inst); ++ StoreInst *sinst = dyn_cast(inst); ++ unsigned alignment; ++ Type* Ty = inst->getType(); ++ if (linst) { ++ alignment = linst->getAlignment(); ++ Ty = inst->getType(); ++ } else if (sinst) { ++ alignment = sinst->getAlignment(); ++ Ty = 
sinst->getValueOperand()->getType(); ++ } else { ++ return false; ++ } ++ unsigned size = getTypeSize(Ty); ++ if (size == alignment || size < alignment) { ++ return false; ++ } ++ if (!Ty->isStructTy()) { ++ return false; ++ } ++ if (alignment < 4) { ++ if (linst) { ++ linst->setAlignment(0); ++ return true; ++ } else if (sinst) { ++ sinst->setAlignment(0); ++ return true; ++ } ++ } ++ return false; ++} ++bool ++AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) { ++ if (!CI) { ++ return false; ++ } ++ Value *LHS = CI->getOperand(CI->getNumOperands() - 1); ++ std::string namePrefix = LHS->getName().substr(0, 14); ++ if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24" ++ && namePrefix != "__amdil__imul24_high") { ++ return false; ++ } ++ if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) { ++ return false; ++ } ++ return true; ++} ++ ++void ++AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) { ++ assert(isSigned24BitOps(CI) && "Must be a " ++ "signed 24 bit operation to call this function!"); ++ Value *LHS = CI->getOperand(CI->getNumOperands()-1); ++ // On 7XX and 8XX we do not have signed 24bit, so we need to ++ // expand it to the following: ++ // imul24 turns into 32bit imul ++ // imad24 turns into 32bit imad ++ // imul24_high turns into 32bit imulhigh ++ if (LHS->getName().substr(0, 14) == "__amdil_imad24") { ++ Type *aType = CI->getOperand(0)->getType(); ++ bool isVector = aType->isVectorTy(); ++ int numEle = isVector ? 
dyn_cast(aType)->getNumElements() : 1; ++ std::vector callTypes; ++ callTypes.push_back(CI->getOperand(0)->getType()); ++ callTypes.push_back(CI->getOperand(1)->getType()); ++ callTypes.push_back(CI->getOperand(2)->getType()); ++ FunctionType *funcType = ++ FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); ++ std::string name = "__amdil_imad"; ++ if (isVector) { ++ name += "_v" + itostr(numEle) + "i32"; ++ } else { ++ name += "_i32"; ++ } ++ Function *Func = dyn_cast( ++ CI->getParent()->getParent()->getParent()-> ++ getOrInsertFunction(llvm::StringRef(name), funcType)); ++ Value *Operands[3] = { ++ CI->getOperand(0), ++ CI->getOperand(1), ++ CI->getOperand(2) ++ }; ++ CallInst *nCI = CallInst::Create(Func, Operands, "imad24"); ++ nCI->insertBefore(CI); ++ CI->replaceAllUsesWith(nCI); ++ } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") { ++ BinaryOperator *mulOp = ++ BinaryOperator::Create(Instruction::Mul, CI->getOperand(0), ++ CI->getOperand(1), "imul24", CI); ++ CI->replaceAllUsesWith(mulOp); ++ } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") { ++ Type *aType = CI->getOperand(0)->getType(); ++ ++ bool isVector = aType->isVectorTy(); ++ int numEle = isVector ? 
dyn_cast(aType)->getNumElements() : 1; ++ std::vector callTypes; ++ callTypes.push_back(CI->getOperand(0)->getType()); ++ callTypes.push_back(CI->getOperand(1)->getType()); ++ FunctionType *funcType = ++ FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); ++ std::string name = "__amdil_imul_high"; ++ if (isVector) { ++ name += "_v" + itostr(numEle) + "i32"; ++ } else { ++ name += "_i32"; ++ } ++ Function *Func = dyn_cast( ++ CI->getParent()->getParent()->getParent()-> ++ getOrInsertFunction(llvm::StringRef(name), funcType)); ++ Value *Operands[2] = { ++ CI->getOperand(0), ++ CI->getOperand(1) ++ }; ++ CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high"); ++ nCI->insertBefore(CI); ++ CI->replaceAllUsesWith(nCI); ++ } ++} ++ ++bool ++AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) { ++ return (CI != NULL ++ && CI->getOperand(CI->getNumOperands() - 1)->getName() ++ == "__amdil_get_local_size_int"); ++} ++ ++bool ++AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) { ++ if (!CI) { ++ return false; ++ } ++ if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX ++ && (mSTM->getDeviceName() == "cayman")) { ++ return false; ++ } ++ return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20) ++ == "__amdil_improved_div"; ++} ++ ++void ++AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) { ++ assert(convertAccurateDivide(CI) ++ && "expanding accurate divide can only happen if it is expandable!"); ++ BinaryOperator *divOp = ++ BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0), ++ CI->getOperand(1), "fdiv32", CI); ++ CI->replaceAllUsesWith(divOp); ++} ++ ++bool ++AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) { ++ if (optLevel != CodeGenOpt::None) { ++ return false; ++ } ++ ++ if (!CI) { ++ return false; ++ } ++ ++ unsigned funcNameIdx = 0; ++ funcNameIdx = CI->getNumOperands() - 1; ++ StringRef calleeName = CI->getOperand(funcNameIdx)->getName(); ++ if (calleeName != "__amdil_image2d_read_norm" ++ && 
calleeName != "__amdil_image2d_read_unnorm" ++ && calleeName != "__amdil_image3d_read_norm" ++ && calleeName != "__amdil_image3d_read_unnorm") { ++ return false; ++ } ++ ++ unsigned samplerIdx = 2; ++ samplerIdx = 1; ++ Value *sampler = CI->getOperand(samplerIdx); ++ LoadInst *lInst = dyn_cast(sampler); ++ if (!lInst) { ++ return false; ++ } ++ ++ if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { ++ return false; ++ } ++ ++ GlobalVariable *gv = dyn_cast(lInst->getPointerOperand()); ++ // If we are loading from what is not a global value, then we ++ // fail and return. ++ if (!gv) { ++ return false; ++ } ++ ++ // If we don't have an initializer or we have an initializer and ++ // the initializer is not a 32bit integer, we fail. ++ if (!gv->hasInitializer() ++ || !gv->getInitializer()->getType()->isIntegerTy(32)) { ++ return false; ++ } ++ ++ // Now that we have the global variable initializer, lets replace ++ // all uses of the load instruction with the samplerVal and ++ // reparse the __amdil_is_constant() function. 
++ Constant *samplerVal = gv->getInitializer(); ++ lInst->replaceAllUsesWith(samplerVal); ++ return true; ++} ++ ++bool ++AMDGPUPeepholeOpt::doInitialization(Module &M) { ++ return false; ++} ++ ++bool ++AMDGPUPeepholeOpt::doFinalization(Module &M) { ++ return false; ++} ++ ++void ++AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const { ++ AU.addRequired(); ++ FunctionPass::getAnalysisUsage(AU); ++ AU.setPreservesAll(); ++} ++ ++size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) { ++ size_t size = 0; ++ if (!T) { ++ return size; ++ } ++ switch (T->getTypeID()) { ++ case Type::X86_FP80TyID: ++ case Type::FP128TyID: ++ case Type::PPC_FP128TyID: ++ case Type::LabelTyID: ++ assert(0 && "These types are not supported by this backend"); ++ default: ++ case Type::FloatTyID: ++ case Type::DoubleTyID: ++ size = T->getPrimitiveSizeInBits() >> 3; ++ break; ++ case Type::PointerTyID: ++ size = getTypeSize(dyn_cast(T), dereferencePtr); ++ break; ++ case Type::IntegerTyID: ++ size = getTypeSize(dyn_cast(T), dereferencePtr); ++ break; ++ case Type::StructTyID: ++ size = getTypeSize(dyn_cast(T), dereferencePtr); ++ break; ++ case Type::ArrayTyID: ++ size = getTypeSize(dyn_cast(T), dereferencePtr); ++ break; ++ case Type::FunctionTyID: ++ size = getTypeSize(dyn_cast(T), dereferencePtr); ++ break; ++ case Type::VectorTyID: ++ size = getTypeSize(dyn_cast(T), dereferencePtr); ++ break; ++ }; ++ return size; ++} ++ ++size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST, ++ bool dereferencePtr) { ++ size_t size = 0; ++ if (!ST) { ++ return size; ++ } ++ Type *curType; ++ StructType::element_iterator eib; ++ StructType::element_iterator eie; ++ for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) { ++ curType = *eib; ++ size += getTypeSize(curType, dereferencePtr); ++ } ++ return size; ++} ++ ++size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT, ++ bool dereferencePtr) { ++ return IT ? 
(IT->getBitWidth() >> 3) : 0; ++} ++ ++size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT, ++ bool dereferencePtr) { ++ assert(0 && "Should not be able to calculate the size of an function type"); ++ return 0; ++} ++ ++size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT, ++ bool dereferencePtr) { ++ return (size_t)(AT ? (getTypeSize(AT->getElementType(), ++ dereferencePtr) * AT->getNumElements()) ++ : 0); ++} ++ ++size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT, ++ bool dereferencePtr) { ++ return VT ? (VT->getBitWidth() >> 3) : 0; ++} ++ ++size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT, ++ bool dereferencePtr) { ++ if (!PT) { ++ return 0; ++ } ++ Type *CT = PT->getElementType(); ++ if (CT->getTypeID() == Type::StructTyID && ++ PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { ++ return getTypeSize(dyn_cast(CT)); ++ } else if (dereferencePtr) { ++ size_t size = 0; ++ for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) { ++ size += getTypeSize(PT->getContainedType(x), dereferencePtr); ++ } ++ return size; ++ } else { ++ return 4; ++ } ++} ++ ++size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT, ++ bool dereferencePtr) { ++ //assert(0 && "Should not be able to calculate the size of an opaque type"); ++ return 4; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILRegisterInfo.td llvm-r600/lib/Target/R600/AMDILRegisterInfo.td +--- llvm-3.2.src/lib/Target/R600/AMDILRegisterInfo.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILRegisterInfo.td 2013-01-25 19:43:57.450049721 +0100 +@@ -0,0 +1,107 @@ ++//===- AMDILRegisterInfo.td - AMDIL Register defs ----------*- tablegen -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//==-----------------------------------------------------------------------===// ++// ++// Declarations that describe the AMDIL register file ++// ++//===----------------------------------------------------------------------===// ++ ++class AMDILReg num, string n> : Register { ++ field bits<16> Value; ++ let Value = num; ++ let Namespace = "AMDGPU"; ++} ++ ++// We will start with 8 registers for each class before expanding to more ++// Since the swizzle is added based on the register class, we can leave it ++// off here and just specify different registers for different register classes ++def R1 : AMDILReg<1, "r1">, DwarfRegNum<[1]>; ++def R2 : AMDILReg<2, "r2">, DwarfRegNum<[2]>; ++def R3 : AMDILReg<3, "r3">, DwarfRegNum<[3]>; ++def R4 : AMDILReg<4, "r4">, DwarfRegNum<[4]>; ++def R5 : AMDILReg<5, "r5">, DwarfRegNum<[5]>; ++def R6 : AMDILReg<6, "r6">, DwarfRegNum<[6]>; ++def R7 : AMDILReg<7, "r7">, DwarfRegNum<[7]>; ++def R8 : AMDILReg<8, "r8">, DwarfRegNum<[8]>; ++def R9 : AMDILReg<9, "r9">, DwarfRegNum<[9]>; ++def R10 : AMDILReg<10, "r10">, DwarfRegNum<[10]>; ++def R11 : AMDILReg<11, "r11">, DwarfRegNum<[11]>; ++def R12 : AMDILReg<12, "r12">, DwarfRegNum<[12]>; ++def R13 : AMDILReg<13, "r13">, DwarfRegNum<[13]>; ++def R14 : AMDILReg<14, "r14">, DwarfRegNum<[14]>; ++def R15 : AMDILReg<15, "r15">, DwarfRegNum<[15]>; ++def R16 : AMDILReg<16, "r16">, DwarfRegNum<[16]>; ++def R17 : AMDILReg<17, "r17">, DwarfRegNum<[17]>; ++def R18 : AMDILReg<18, "r18">, DwarfRegNum<[18]>; ++def R19 : AMDILReg<19, "r19">, DwarfRegNum<[19]>; ++def R20 : AMDILReg<20, "r20">, DwarfRegNum<[20]>; ++ ++// All registers between 1000 and 1024 are reserved and cannot be used ++// unless commented in this section ++// r1021-r1025 are used to dynamically calculate the local/group/thread/region/region_local ID's ++// r1020 is used to hold the frame index for local arrays ++// r1019 is used to hold the dynamic stack allocation pointer ++// r1018 is used as a temporary register for 
handwritten code ++// r1017 is used as a temporary register for handwritten code ++// r1016 is used as a temporary register for load/store code ++// r1015 is used as a temporary register for data segment offset ++// r1014 is used as a temporary register for store code ++// r1013 is used as the section data pointer register ++// r1012-r1010 and r1001-r1008 are used for temporary I/O registers ++// r1009 is used as the frame pointer register ++// r999 is used as the mem register. ++// r998 is used as the return address register. ++//def R1025 : AMDILReg<1025, "r1025">, DwarfRegNum<[1025]>; ++//def R1024 : AMDILReg<1024, "r1024">, DwarfRegNum<[1024]>; ++//def R1023 : AMDILReg<1023, "r1023">, DwarfRegNum<[1023]>; ++//def R1022 : AMDILReg<1022, "r1022">, DwarfRegNum<[1022]>; ++//def R1021 : AMDILReg<1021, "r1021">, DwarfRegNum<[1021]>; ++//def R1020 : AMDILReg<1020, "r1020">, DwarfRegNum<[1020]>; ++def SP : AMDILReg<1019, "r1019">, DwarfRegNum<[1019]>; ++def T1 : AMDILReg<1018, "r1018">, DwarfRegNum<[1018]>; ++def T2 : AMDILReg<1017, "r1017">, DwarfRegNum<[1017]>; ++def T3 : AMDILReg<1016, "r1016">, DwarfRegNum<[1016]>; ++def T4 : AMDILReg<1015, "r1015">, DwarfRegNum<[1015]>; ++def T5 : AMDILReg<1014, "r1014">, DwarfRegNum<[1014]>; ++def SDP : AMDILReg<1013, "r1013">, DwarfRegNum<[1013]>; ++def R1012: AMDILReg<1012, "r1012">, DwarfRegNum<[1012]>; ++def R1011: AMDILReg<1011, "r1011">, DwarfRegNum<[1011]>; ++def R1010: AMDILReg<1010, "r1010">, DwarfRegNum<[1010]>; ++def DFP : AMDILReg<1009, "r1009">, DwarfRegNum<[1009]>; ++def R1008: AMDILReg<1008, "r1008">, DwarfRegNum<[1008]>; ++def R1007: AMDILReg<1007, "r1007">, DwarfRegNum<[1007]>; ++def R1006: AMDILReg<1006, "r1006">, DwarfRegNum<[1006]>; ++def R1005: AMDILReg<1005, "r1005">, DwarfRegNum<[1005]>; ++def R1004: AMDILReg<1004, "r1004">, DwarfRegNum<[1004]>; ++def R1003: AMDILReg<1003, "r1003">, DwarfRegNum<[1003]>; ++def R1002: AMDILReg<1002, "r1002">, DwarfRegNum<[1002]>; ++def R1001: AMDILReg<1001, "r1001">, 
DwarfRegNum<[1001]>; ++def MEM : AMDILReg<999, "mem">, DwarfRegNum<[999]>; ++def RA : AMDILReg<998, "r998">, DwarfRegNum<[998]>; ++def FP : AMDILReg<997, "r997">, DwarfRegNum<[997]>; ++def GPRI16 : RegisterClass<"AMDGPU", [i16], 16, ++ (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { ++ let AltOrders = [(add (sequence "R%u", 1, 20))]; ++ let AltOrderSelect = [{ ++ return 1; ++ }]; ++ } ++def GPRI32 : RegisterClass<"AMDGPU", [i32], 32, ++ (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { ++ let AltOrders = [(add (sequence "R%u", 1, 20))]; ++ let AltOrderSelect = [{ ++ return 1; ++ }]; ++ } ++def GPRF32 : RegisterClass<"AMDGPU", [f32], 32, ++ (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { ++ let AltOrders = [(add (sequence "R%u", 1, 20))]; ++ let AltOrderSelect = [{ ++ return 1; ++ }]; ++ } +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILSIDevice.cpp llvm-r600/lib/Target/R600/AMDILSIDevice.cpp +--- llvm-3.2.src/lib/Target/R600/AMDILSIDevice.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILSIDevice.cpp 2013-01-25 19:43:57.450049721 +0100 +@@ -0,0 +1,45 @@ ++//===-- AMDILSIDevice.cpp - Device Info for Southern Islands GPUs ---------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++/// \file ++//==-----------------------------------------------------------------------===// ++#include "AMDILSIDevice.h" ++#include "AMDILEvergreenDevice.h" ++#include "AMDILNIDevice.h" ++#include "AMDGPUSubtarget.h" ++ ++using namespace llvm; ++ ++AMDGPUSIDevice::AMDGPUSIDevice(AMDGPUSubtarget *ST) ++ : AMDGPUEvergreenDevice(ST) { ++} ++AMDGPUSIDevice::~AMDGPUSIDevice() { ++} ++ ++size_t ++AMDGPUSIDevice::getMaxLDSSize() const { ++ if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { ++ return MAX_LDS_SIZE_900; ++ } else { ++ return 0; ++ } ++} ++ ++uint32_t ++AMDGPUSIDevice::getGeneration() const { ++ return AMDGPUDeviceInfo::HD7XXX; ++} ++ ++std::string ++AMDGPUSIDevice::getDataLayout() const { ++ return std::string("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16" ++ "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32" ++ "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64" ++ "-v96:128:128-v128:128:128-v192:256:256-v256:256:256" ++ "-v512:512:512-v1024:1024:1024-v2048:2048:2048" ++ "-n8:16:32:64"); ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILSIDevice.h llvm-r600/lib/Target/R600/AMDILSIDevice.h +--- llvm-3.2.src/lib/Target/R600/AMDILSIDevice.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/AMDILSIDevice.h 2013-01-25 19:43:57.450049721 +0100 +@@ -0,0 +1,39 @@ ++//===------- AMDILSIDevice.h - Define SI Device for AMDIL -*- C++ -*------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//==-----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Interface for the subtarget data classes. ++/// ++/// This file will define the interface that each generation needs to ++/// implement in order to correctly answer queries on the capabilities of the ++/// specific hardware. 
++//===---------------------------------------------------------------------===// ++#ifndef AMDILSIDEVICE_H ++#define AMDILSIDEVICE_H ++#include "AMDILEvergreenDevice.h" ++ ++namespace llvm { ++class AMDGPUSubtarget; ++//===---------------------------------------------------------------------===// ++// SI generation of devices and their respective sub classes ++//===---------------------------------------------------------------------===// ++ ++/// \brief The AMDGPUSIDevice is the base class for all Southern Island series ++/// of cards. ++class AMDGPUSIDevice : public AMDGPUEvergreenDevice { ++public: ++ AMDGPUSIDevice(AMDGPUSubtarget*); ++ virtual ~AMDGPUSIDevice(); ++ virtual size_t getMaxLDSSize() const; ++ virtual uint32_t getGeneration() const; ++ virtual std::string getDataLayout() const; ++}; ++ ++} // namespace llvm ++#endif // AMDILSIDEVICE_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/CMakeLists.txt llvm-r600/lib/Target/R600/CMakeLists.txt +--- llvm-3.2.src/lib/Target/R600/CMakeLists.txt 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/CMakeLists.txt 2013-01-25 19:43:57.453383054 +0100 +@@ -0,0 +1,55 @@ ++set(LLVM_TARGET_DEFINITIONS AMDGPU.td) ++ ++tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info) ++tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info) ++tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel) ++tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv) ++tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) ++tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic) ++tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter -mc-emitter) ++tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer) ++tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) ++add_public_tablegen_target(AMDGPUCommonTableGen) ++ ++add_llvm_target(AMDGPUCodeGen ++ AMDIL7XXDevice.cpp ++ AMDILCFGStructurizer.cpp ++ AMDILDevice.cpp ++ AMDILDeviceInfo.cpp ++ AMDILEvergreenDevice.cpp ++ AMDILFrameLowering.cpp ++ AMDILIntrinsicInfo.cpp ++ 
AMDILISelDAGToDAG.cpp ++ AMDILISelLowering.cpp ++ AMDILNIDevice.cpp ++ AMDILPeepholeOptimizer.cpp ++ AMDILSIDevice.cpp ++ AMDGPUAsmPrinter.cpp ++ AMDGPUMCInstLower.cpp ++ AMDGPUSubtarget.cpp ++ AMDGPUTargetMachine.cpp ++ AMDGPUISelLowering.cpp ++ AMDGPUConvertToISA.cpp ++ AMDGPUInstrInfo.cpp ++ AMDGPURegisterInfo.cpp ++ R600ExpandSpecialInstrs.cpp ++ R600InstrInfo.cpp ++ R600ISelLowering.cpp ++ R600LowerConstCopy.cpp ++ R600MachineFunctionInfo.cpp ++ R600RegisterInfo.cpp ++ SIAssignInterpRegs.cpp ++ SIInstrInfo.cpp ++ SIISelLowering.cpp ++ SILowerLiteralConstants.cpp ++ SILowerControlFlow.cpp ++ SIMachineFunctionInfo.cpp ++ SIRegisterInfo.cpp ++ SIFixSGPRLiveness.cpp ++ ) ++ ++add_dependencies(LLVMR600CodeGen intrinsics_gen) ++ ++add_subdirectory(InstPrinter) ++add_subdirectory(TargetInfo) ++add_subdirectory(MCTargetDesc) +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp llvm-r600/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +--- llvm-3.2.src/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp 2013-01-25 19:43:57.456716387 +0100 +@@ -0,0 +1,156 @@ ++//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++// \file ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPUInstPrinter.h" ++#include "MCTargetDesc/AMDGPUMCTargetDesc.h" ++#include "llvm/MC/MCInst.h" ++ ++using namespace llvm; ++ ++void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, ++ StringRef Annot) { ++ printInstruction(MI, OS); ++ ++ printAnnotation(OS, Annot); ++} ++ ++void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, ++ raw_ostream &O) { ++ ++ const MCOperand &Op = MI->getOperand(OpNo); ++ if (Op.isReg()) { ++ switch (Op.getReg()) { ++ // This is the default predicate state, so we don't need to print it. ++ case AMDGPU::PRED_SEL_OFF: break; ++ default: O << getRegisterName(Op.getReg()); break; ++ } ++ } else if (Op.isImm()) { ++ O << Op.getImm(); ++ } else if (Op.isFPImm()) { ++ O << Op.getFPImm(); ++ } else { ++ assert(!"unknown operand type in printOperand"); ++ } ++} ++ ++void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, ++ raw_ostream &O) { ++ printOperand(MI, OpNo, O); ++ O << ", "; ++ printOperand(MI, OpNo + 1, O); ++} ++ ++void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, ++ raw_ostream &O, StringRef Asm) { ++ const MCOperand &Op = MI->getOperand(OpNo); ++ assert(Op.isImm()); ++ if (Op.getImm() == 1) { ++ O << Asm; ++ } ++} ++ ++void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, ++ raw_ostream &O) { ++ printIfSet(MI, OpNo, O, "|"); ++} ++ ++void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, ++ raw_ostream &O) { ++ printIfSet(MI, OpNo, O, "_SAT"); ++} ++ ++void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, ++ raw_ostream &O) { ++ union Literal { ++ float f; ++ int32_t i; ++ } L; ++ ++ L.i = MI->getOperand(OpNo).getImm(); ++ O << L.i << "(" << L.f << ")"; ++} ++ ++void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, ++ raw_ostream &O) { ++ printIfSet(MI, OpNo, O, " *"); ++} ++ ++void 
AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, ++ raw_ostream &O) { ++ printIfSet(MI, OpNo, O, "-"); ++} ++ ++void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, ++ raw_ostream &O) { ++ switch (MI->getOperand(OpNo).getImm()) { ++ default: break; ++ case 1: ++ O << " * 2.0"; ++ break; ++ case 2: ++ O << " * 4.0"; ++ break; ++ case 3: ++ O << " / 2.0"; ++ break; ++ } ++} ++ ++void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, ++ raw_ostream &O) { ++ const MCOperand &Op = MI->getOperand(OpNo); ++ if (Op.getImm() != 0) { ++ O << " + " << Op.getImm(); ++ } ++} ++ ++void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, ++ raw_ostream &O) { ++ printIfSet(MI, OpNo, O, "ExecMask,"); ++} ++ ++void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo, ++ raw_ostream &O) { ++ printIfSet(MI, OpNo, O, "Pred,"); ++} ++ ++void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo, ++ raw_ostream &O) { ++ const MCOperand &Op = MI->getOperand(OpNo); ++ if (Op.getImm() == 0) { ++ O << " (MASKED)"; ++ } ++} ++ ++void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, ++ raw_ostream &O) { ++ const char * chans = "XYZW"; ++ int sel = MI->getOperand(OpNo).getImm(); ++ ++ int chan = sel & 3; ++ sel >>= 2; ++ ++ if (sel >= 512) { ++ sel -= 512; ++ int cb = sel >> 12; ++ sel &= 4095; ++ O << cb << "[" << sel << "]"; ++ } else if (sel >= 448) { ++ sel -= 448; ++ O << sel; ++ } else if (sel >= 0){ ++ O << sel; ++ } ++ ++ if (sel >= 0) ++ O << "." 
<< chans[chan]; ++} ++ ++#include "AMDGPUGenAsmWriter.inc" +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h llvm-r600/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h +--- llvm-3.2.src/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h 2013-01-25 19:43:57.456716387 +0100 +@@ -0,0 +1,53 @@ ++//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++//===----------------------------------------------------------------------===// ++ ++#ifndef AMDGPUINSTPRINTER_H ++#define AMDGPUINSTPRINTER_H ++ ++#include "llvm/ADT/StringRef.h" ++#include "llvm/MC/MCInstPrinter.h" ++#include "llvm/Support/raw_ostream.h" ++ ++namespace llvm { ++ ++class AMDGPUInstPrinter : public MCInstPrinter { ++public: ++ AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, ++ const MCRegisterInfo &MRI) ++ : MCInstPrinter(MAI, MII, MRI) {} ++ ++ //Autogenerated by tblgen ++ void printInstruction(const MCInst *MI, raw_ostream &O); ++ static const char *getRegisterName(unsigned RegNo); ++ ++ virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); ++ ++private: ++ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); ++ void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); ++ void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm); ++ void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); ++ void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O); ++ void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); ++ void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O); ++ 
void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O); ++ void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O); ++ void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O); ++ void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O); ++ void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O); ++ void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O); ++ void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); ++}; ++ ++} // End namespace llvm ++ ++#endif // AMDGPUINSTRPRINTER_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/InstPrinter/CMakeLists.txt llvm-r600/lib/Target/R600/InstPrinter/CMakeLists.txt +--- llvm-3.2.src/lib/Target/R600/InstPrinter/CMakeLists.txt 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/InstPrinter/CMakeLists.txt 2013-01-25 19:43:57.456716387 +0100 +@@ -0,0 +1,7 @@ ++include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) ++ ++add_llvm_library(LLVMR600AsmPrinter ++ AMDGPUInstPrinter.cpp ++ ) ++ ++add_dependencies(LLVMR600AsmPrinter R600CommonTableGen) +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/InstPrinter/LLVMBuild.txt llvm-r600/lib/Target/R600/InstPrinter/LLVMBuild.txt +--- llvm-3.2.src/lib/Target/R600/InstPrinter/LLVMBuild.txt 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/InstPrinter/LLVMBuild.txt 2013-01-25 19:43:57.456716387 +0100 +@@ -0,0 +1,24 @@ ++;===- ./lib/Target/R600/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===; ++; ++; The LLVM Compiler Infrastructure ++; ++; This file is distributed under the University of Illinois Open Source ++; License. See LICENSE.TXT for details. ++; ++;===------------------------------------------------------------------------===; ++; ++; This is an LLVMBuild description file for the components in this subdirectory. 
++; ++; For more information on the LLVMBuild system, please see: ++; ++; http://llvm.org/docs/LLVMBuild.html ++; ++;===------------------------------------------------------------------------===; ++ ++[component_0] ++type = Library ++name = R600AsmPrinter ++parent = R600 ++required_libraries = MC Support ++add_to_library_groups = R600 ++ +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/InstPrinter/Makefile llvm-r600/lib/Target/R600/InstPrinter/Makefile +--- llvm-3.2.src/lib/Target/R600/InstPrinter/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/InstPrinter/Makefile 2013-01-25 19:43:57.456716387 +0100 +@@ -0,0 +1,15 @@ ++#===- lib/Target/R600/AsmPrinter/Makefile ------------------*- Makefile -*-===## ++# ++# The LLVM Compiler Infrastructure ++# ++# This file is distributed under the University of Illinois Open Source ++# License. See LICENSE.TXT for details. ++# ++##===----------------------------------------------------------------------===## ++LEVEL = ../../../.. ++LIBRARYNAME = LLVMR600AsmPrinter ++ ++# Hack: we need to include 'main' x86 target directory to grab private headers ++CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. ++ ++include $(LEVEL)/Makefile.common +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/LLVMBuild.txt llvm-r600/lib/Target/R600/LLVMBuild.txt +--- llvm-3.2.src/lib/Target/R600/LLVMBuild.txt 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/LLVMBuild.txt 2013-01-25 19:43:57.456716387 +0100 +@@ -0,0 +1,32 @@ ++;===- ./lib/Target/AMDIL/LLVMBuild.txt -------------------------*- Conf -*--===; ++; ++; The LLVM Compiler Infrastructure ++; ++; This file is distributed under the University of Illinois Open Source ++; License. See LICENSE.TXT for details. ++; ++;===------------------------------------------------------------------------===; ++; ++; This is an LLVMBuild description file for the components in this subdirectory. 
++; ++; For more information on the LLVMBuild system, please see: ++; ++; http://llvm.org/docs/LLVMBuild.html ++; ++;===------------------------------------------------------------------------===; ++ ++[common] ++subdirectories = InstPrinter MCTargetDesc TargetInfo ++ ++[component_0] ++type = TargetGroup ++name = R600 ++parent = Target ++has_asmprinter = 1 ++ ++[component_1] ++type = Library ++name = R600CodeGen ++parent = R600 ++required_libraries = AsmPrinter CodeGen Core SelectionDAG Support Target MC R600AsmPrinter R600Desc R600Info ++add_to_library_groups = R600 +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/Makefile llvm-r600/lib/Target/R600/Makefile +--- llvm-3.2.src/lib/Target/R600/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/Makefile 2013-01-25 19:43:57.460049721 +0100 +@@ -0,0 +1,23 @@ ++##===- lib/Target/R600/Makefile ---------------------------*- Makefile -*-===## ++# ++# The LLVM Compiler Infrastructure ++# ++# This file is distributed under the University of Illinois Open Source ++# License. See LICENSE.TXT for details. ++# ++##===----------------------------------------------------------------------===## ++ ++LEVEL = ../../.. ++LIBRARYNAME = LLVMR600CodeGen ++TARGET = AMDGPU ++ ++# Make sure that tblgen is run, first thing. 
++BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \ ++ AMDGPUGenDAGISel.inc AMDGPUGenSubtargetInfo.inc \ ++ AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \ ++ AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \ ++ AMDGPUGenAsmWriter.inc ++ ++DIRS = InstPrinter TargetInfo MCTargetDesc ++ ++include $(LEVEL)/Makefile.common +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp +--- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp 2013-01-25 19:43:57.456716387 +0100 +@@ -0,0 +1,90 @@ ++//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++/// \file ++//===----------------------------------------------------------------------===// ++ ++#include "MCTargetDesc/AMDGPUMCTargetDesc.h" ++#include "llvm/ADT/StringRef.h" ++#include "llvm/MC/MCAsmBackend.h" ++#include "llvm/MC/MCAssembler.h" ++#include "llvm/MC/MCObjectWriter.h" ++#include "llvm/MC/MCValue.h" ++#include "llvm/Support/TargetRegistry.h" ++ ++using namespace llvm; ++ ++namespace { ++ ++class AMDGPUMCObjectWriter : public MCObjectWriter { ++public: ++ AMDGPUMCObjectWriter(raw_ostream &OS) : MCObjectWriter(OS, true) { } ++ virtual void ExecutePostLayoutBinding(MCAssembler &Asm, ++ const MCAsmLayout &Layout) { ++ //XXX: Implement if necessary. 
++ } ++ virtual void RecordRelocation(const MCAssembler &Asm, ++ const MCAsmLayout &Layout, ++ const MCFragment *Fragment, ++ const MCFixup &Fixup, ++ MCValue Target, uint64_t &FixedValue) { ++ assert(!"Not implemented"); ++ } ++ ++ virtual void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout); ++ ++}; ++ ++class AMDGPUAsmBackend : public MCAsmBackend { ++public: ++ AMDGPUAsmBackend(const Target &T) ++ : MCAsmBackend() {} ++ ++ virtual AMDGPUMCObjectWriter *createObjectWriter(raw_ostream &OS) const; ++ virtual unsigned getNumFixupKinds() const { return 0; }; ++ virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, ++ uint64_t Value) const; ++ virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, ++ const MCInstFragment *DF, ++ const MCAsmLayout &Layout) const { ++ return false; ++ } ++ virtual void relaxInstruction(const MCInst &Inst, MCInst &Res) const { ++ assert(!"Not implemented"); ++ } ++ virtual bool mayNeedRelaxation(const MCInst &Inst) const { return false; } ++ virtual bool writeNopData(uint64_t Count, MCObjectWriter *OW) const { ++ return true; ++ } ++}; ++ ++} //End anonymous namespace ++ ++void AMDGPUMCObjectWriter::WriteObject(MCAssembler &Asm, ++ const MCAsmLayout &Layout) { ++ for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) { ++ Asm.writeSectionData(I, Layout); ++ } ++} ++ ++MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT, ++ StringRef CPU) { ++ return new AMDGPUAsmBackend(T); ++} ++ ++AMDGPUMCObjectWriter * AMDGPUAsmBackend::createObjectWriter( ++ raw_ostream &OS) const { ++ return new AMDGPUMCObjectWriter(OS); ++} ++ ++void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, ++ unsigned DataSize, uint64_t Value) const { ++ ++ uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); ++ assert(Fixup.getKind() == FK_PCRel_4); ++ *Dst = (Value - 4) / 4; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp 
llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp +--- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp 2013-01-25 19:43:57.456716387 +0100 +@@ -0,0 +1,85 @@ ++//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++/// \file ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPUMCAsmInfo.h" ++ ++using namespace llvm; ++AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() { ++ HasSingleParameterDotFile = false; ++ WeakDefDirective = 0; ++ //===------------------------------------------------------------------===// ++ HasSubsectionsViaSymbols = true; ++ HasMachoZeroFillDirective = false; ++ HasMachoTBSSDirective = false; ++ HasStaticCtorDtorReferenceInStaticMode = false; ++ LinkerRequiresNonEmptyDwarfLines = true; ++ MaxInstLength = 16; ++ PCSymbol = "$"; ++ SeparatorString = "\n"; ++ CommentColumn = 40; ++ CommentString = ";"; ++ LabelSuffix = ":"; ++ GlobalPrefix = "@"; ++ PrivateGlobalPrefix = ";."; ++ LinkerPrivateGlobalPrefix = "!"; ++ InlineAsmStart = ";#ASMSTART"; ++ InlineAsmEnd = ";#ASMEND"; ++ AssemblerDialect = 0; ++ AllowQuotesInName = false; ++ AllowNameToStartWithDigit = false; ++ AllowPeriodsInName = false; ++ ++ //===--- Data Emission Directives -------------------------------------===// ++ ZeroDirective = ".zero"; ++ AsciiDirective = ".ascii\t"; ++ AscizDirective = ".asciz\t"; ++ Data8bitsDirective = ".byte\t"; ++ Data16bitsDirective = ".short\t"; ++ Data32bitsDirective = ".long\t"; ++ Data64bitsDirective = ".quad\t"; ++ GPRel32Directive = 0; ++ SunStyleELFSectionSwitchSyntax = true; ++ UsesELFSectionDirectiveForBSS = true; ++ HasMicrosoftFastStdCallMangling = 
false; ++ ++ //===--- Alignment Information ----------------------------------------===// ++ AlignDirective = ".align\t"; ++ AlignmentIsInBytes = true; ++ TextAlignFillValue = 0; ++ ++ //===--- Global Variable Emission Directives --------------------------===// ++ GlobalDirective = ".global"; ++ ExternDirective = ".extern"; ++ HasSetDirective = false; ++ HasAggressiveSymbolFolding = true; ++ COMMDirectiveAlignmentIsInBytes = false; ++ HasDotTypeDotSizeDirective = false; ++ HasNoDeadStrip = true; ++ HasSymbolResolver = false; ++ WeakRefDirective = ".weakref\t"; ++ LinkOnceDirective = 0; ++ //===--- Dwarf Emission Directives -----------------------------------===// ++ HasLEB128 = true; ++ SupportsDebugInformation = true; ++ ExceptionsType = ExceptionHandling::None; ++ DwarfUsesInlineInfoSection = false; ++ DwarfSectionOffsetDirective = ".offset"; ++ ++} ++ ++const char* ++AMDGPUMCAsmInfo::getDataASDirective(unsigned int Size, unsigned int AS) const { ++ return 0; ++} ++ ++const MCSection* ++AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const { ++ return 0; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h +--- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h 2013-01-25 19:43:57.456716387 +0100 +@@ -0,0 +1,30 @@ ++//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface ----------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef AMDGPUMCASMINFO_H ++#define AMDGPUMCASMINFO_H ++ ++#include "llvm/MC/MCAsmInfo.h" ++namespace llvm { ++ ++class Target; ++class StringRef; ++ ++class AMDGPUMCAsmInfo : public MCAsmInfo { ++public: ++ explicit AMDGPUMCAsmInfo(const Target &T, StringRef &TT); ++ const char* getDataASDirective(unsigned int Size, unsigned int AS) const; ++ const MCSection* getNonexecutableStackSection(MCContext &CTX) const; ++}; ++} // namespace llvm ++#endif // AMDGPUMCASMINFO_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h +--- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h 2013-01-25 19:43:57.456716387 +0100 +@@ -0,0 +1,60 @@ ++//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief CodeEmitter interface for R600 and SI codegen. 
++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef AMDGPUCODEEMITTER_H ++#define AMDGPUCODEEMITTER_H ++ ++#include "llvm/MC/MCCodeEmitter.h" ++#include "llvm/Support/raw_ostream.h" ++ ++namespace llvm { ++ ++class MCInst; ++class MCOperand; ++ ++class AMDGPUMCCodeEmitter : public MCCodeEmitter { ++public: ++ ++ uint64_t getBinaryCodeForInstr(const MCInst &MI, ++ SmallVectorImpl<MCFixup> &Fixups) const; ++ ++ virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, ++ SmallVectorImpl<MCFixup> &Fixups) const { ++ return 0; ++ } ++ ++ virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo, ++ SmallVectorImpl<MCFixup> &Fixups) const { ++ return 0; ++ } ++ virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo, ++ SmallVectorImpl<MCFixup> &Fixups) const { ++ return 0; ++ } ++ virtual uint64_t VOPPostEncode(const MCInst &MI, uint64_t Value) const { ++ return Value; ++ } ++ virtual uint64_t i32LiteralEncode(const MCInst &MI, unsigned OpNo, ++ SmallVectorImpl<MCFixup> &Fixups) const { ++ return 0; ++ } ++ virtual uint32_t SMRDmemriEncode(const MCInst &MI, unsigned OpNo, ++ SmallVectorImpl<MCFixup> &Fixups) const { ++ return 0; ++ } ++}; ++ ++} // End namespace llvm ++ ++#endif // AMDGPUCODEEMITTER_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp +--- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp 2013-01-25 19:43:57.460049721 +0100 +@@ -0,0 +1,113 @@ ++//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief This file provides AMDGPU specific target descriptions. ++// ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPUMCTargetDesc.h" ++#include "AMDGPUMCAsmInfo.h" ++#include "InstPrinter/AMDGPUInstPrinter.h" ++#include "llvm/MC/MachineLocation.h" ++#include "llvm/MC/MCCodeGenInfo.h" ++#include "llvm/MC/MCInstrInfo.h" ++#include "llvm/MC/MCRegisterInfo.h" ++#include "llvm/MC/MCStreamer.h" ++#include "llvm/MC/MCSubtargetInfo.h" ++#include "llvm/Support/ErrorHandling.h" ++#include "llvm/Support/TargetRegistry.h" ++ ++#define GET_INSTRINFO_MC_DESC ++#include "AMDGPUGenInstrInfo.inc" ++ ++#define GET_SUBTARGETINFO_MC_DESC ++#include "AMDGPUGenSubtargetInfo.inc" ++ ++#define GET_REGINFO_MC_DESC ++#include "AMDGPUGenRegisterInfo.inc" ++ ++using namespace llvm; ++ ++static MCInstrInfo *createAMDGPUMCInstrInfo() { ++ MCInstrInfo *X = new MCInstrInfo(); ++ InitAMDGPUMCInstrInfo(X); ++ return X; ++} ++ ++static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) { ++ MCRegisterInfo *X = new MCRegisterInfo(); ++ InitAMDGPUMCRegisterInfo(X, 0); ++ return X; ++} ++ ++static MCSubtargetInfo *createAMDGPUMCSubtargetInfo(StringRef TT, StringRef CPU, ++ StringRef FS) { ++ MCSubtargetInfo * X = new MCSubtargetInfo(); ++ InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS); ++ return X; ++} ++ ++static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM, ++ CodeModel::Model CM, ++ CodeGenOpt::Level OL) { ++ MCCodeGenInfo *X = new MCCodeGenInfo(); ++ X->InitMCCodeGenInfo(RM, CM, OL); ++ return X; ++} ++ ++static MCInstPrinter *createAMDGPUMCInstPrinter(const Target &T, ++ unsigned SyntaxVariant, ++ const MCAsmInfo &MAI, ++ const MCInstrInfo &MII, ++ const MCRegisterInfo &MRI, ++ const MCSubtargetInfo &STI) { ++ return new AMDGPUInstPrinter(MAI, MII, MRI); ++} ++ ++static MCCodeEmitter 
*createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII, ++ const MCRegisterInfo &MRI, ++ const MCSubtargetInfo &STI, ++ MCContext &Ctx) { ++ if (STI.getFeatureBits() & AMDGPU::Feature64BitPtr) { ++ return createSIMCCodeEmitter(MCII, MRI, STI, Ctx); ++ } else { ++ return createR600MCCodeEmitter(MCII, MRI, STI, Ctx); ++ } ++} ++ ++static MCStreamer *createMCStreamer(const Target &T, StringRef TT, ++ MCContext &Ctx, MCAsmBackend &MAB, ++ raw_ostream &_OS, ++ MCCodeEmitter *_Emitter, ++ bool RelaxAll, ++ bool NoExecStack) { ++ return createPureStreamer(Ctx, MAB, _OS, _Emitter); ++} ++ ++extern "C" void LLVMInitializeR600TargetMC() { ++ ++ RegisterMCAsmInfo<AMDGPUMCAsmInfo> Y(TheAMDGPUTarget); ++ ++ TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDGPUMCCodeGenInfo); ++ ++ TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDGPUMCInstrInfo); ++ ++ TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDGPUMCRegisterInfo); ++ ++ TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDGPUMCSubtargetInfo); ++ ++ TargetRegistry::RegisterMCInstPrinter(TheAMDGPUTarget, createAMDGPUMCInstPrinter); ++ ++ TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, createAMDGPUMCCodeEmitter); ++ ++ TargetRegistry::RegisterMCAsmBackend(TheAMDGPUTarget, createAMDGPUAsmBackend); ++ ++ TargetRegistry::RegisterMCObjectStreamer(TheAMDGPUTarget, createMCStreamer); ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h +--- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h 2013-01-25 19:43:57.460049721 +0100 +@@ -0,0 +1,55 @@ ++//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Provides AMDGPU specific target descriptions. ++// ++//===----------------------------------------------------------------------===// ++// ++ ++#ifndef AMDGPUMCTARGETDESC_H ++#define AMDGPUMCTARGETDESC_H ++ ++#include "llvm/ADT/StringRef.h" ++ ++namespace llvm { ++class MCAsmBackend; ++class MCCodeEmitter; ++class MCContext; ++class MCInstrInfo; ++class MCRegisterInfo; ++class MCSubtargetInfo; ++class Target; ++ ++extern Target TheAMDGPUTarget; ++ ++MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, ++ const MCRegisterInfo &MRI, ++ const MCSubtargetInfo &STI, ++ MCContext &Ctx); ++ ++MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, ++ const MCRegisterInfo &MRI, ++ const MCSubtargetInfo &STI, ++ MCContext &Ctx); ++ ++MCAsmBackend *createAMDGPUAsmBackend(const Target &T, StringRef TT, ++ StringRef CPU); ++} // End llvm namespace ++ ++#define GET_REGINFO_ENUM ++#include "AMDGPUGenRegisterInfo.inc" ++ ++#define GET_INSTRINFO_ENUM ++#include "AMDGPUGenInstrInfo.inc" ++ ++#define GET_SUBTARGETINFO_ENUM ++#include "AMDGPUGenSubtargetInfo.inc" ++ ++#endif // AMDGPUMCTARGETDESC_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/CMakeLists.txt llvm-r600/lib/Target/R600/MCTargetDesc/CMakeLists.txt +--- llvm-3.2.src/lib/Target/R600/MCTargetDesc/CMakeLists.txt 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/MCTargetDesc/CMakeLists.txt 2013-01-25 19:43:57.460049721 +0100 +@@ -0,0 +1,10 @@ ++ ++add_llvm_library(LLVMR600Desc ++ AMDGPUAsmBackend.cpp ++ AMDGPUMCTargetDesc.cpp ++ AMDGPUMCAsmInfo.cpp ++ R600MCCodeEmitter.cpp ++ SIMCCodeEmitter.cpp ++ ) ++ ++add_dependencies(LLVMR600Desc AMDGPUCommonTableGen) +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/LLVMBuild.txt llvm-r600/lib/Target/R600/MCTargetDesc/LLVMBuild.txt +--- llvm-3.2.src/lib/Target/R600/MCTargetDesc/LLVMBuild.txt 1970-01-01 
01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/MCTargetDesc/LLVMBuild.txt 2013-01-25 19:43:57.460049721 +0100 +@@ -0,0 +1,23 @@ ++;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; ++; ++; The LLVM Compiler Infrastructure ++; ++; This file is distributed under the University of Illinois Open Source ++; License. See LICENSE.TXT for details. ++; ++;===------------------------------------------------------------------------===; ++; ++; This is an LLVMBuild description file for the components in this subdirectory. ++; ++; For more information on the LLVMBuild system, please see: ++; ++; http://llvm.org/docs/LLVMBuild.html ++; ++;===------------------------------------------------------------------------===; ++ ++[component_0] ++type = Library ++name = R600Desc ++parent = R600 ++required_libraries = R600AsmPrinter R600Info MC ++add_to_library_groups = R600 +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/Makefile llvm-r600/lib/Target/R600/MCTargetDesc/Makefile +--- llvm-3.2.src/lib/Target/R600/MCTargetDesc/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/MCTargetDesc/Makefile 2013-01-25 19:43:57.460049721 +0100 +@@ -0,0 +1,16 @@ ++##===- lib/Target/AMDGPU/TargetDesc/Makefile ----------------*- Makefile -*-===## ++# ++# The LLVM Compiler Infrastructure ++# ++# This file is distributed under the University of Illinois Open Source ++# License. See LICENSE.TXT for details. ++# ++##===----------------------------------------------------------------------===## ++ ++LEVEL = ../../../.. ++LIBRARYNAME = LLVMR600Desc ++ ++# Hack: we need to include 'main' target directory to grab private headers ++CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
++ ++include $(LEVEL)/Makefile.common +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp llvm-r600/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +--- llvm-3.2.src/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp 2013-01-25 19:43:57.460049721 +0100 +@@ -0,0 +1,580 @@ ++//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// ++/// This code emitter outputs bytecode that is understood by the r600g driver ++/// in the Mesa [1] project. The bytecode is very similar to the hardware's ISA, ++/// but it still needs to be run through a finalizer in order to be executed ++/// by the GPU. 
++/// ++/// [1] http://www.mesa3d.org/ ++// ++//===----------------------------------------------------------------------===// ++ ++#include "R600Defines.h" ++#include "MCTargetDesc/AMDGPUMCTargetDesc.h" ++#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" ++#include "llvm/MC/MCCodeEmitter.h" ++#include "llvm/MC/MCContext.h" ++#include "llvm/MC/MCInst.h" ++#include "llvm/MC/MCInstrInfo.h" ++#include "llvm/MC/MCRegisterInfo.h" ++#include "llvm/MC/MCSubtargetInfo.h" ++#include "llvm/Support/raw_ostream.h" ++ ++#include ++ ++#define SRC_BYTE_COUNT 11 ++#define DST_BYTE_COUNT 5 ++ ++using namespace llvm; ++ ++namespace { ++ ++class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { ++ R600MCCodeEmitter(const R600MCCodeEmitter &); // DO NOT IMPLEMENT ++ void operator=(const R600MCCodeEmitter &); // DO NOT IMPLEMENT ++ const MCInstrInfo &MCII; ++ const MCRegisterInfo &MRI; ++ const MCSubtargetInfo &STI; ++ MCContext &Ctx; ++ ++public: ++ ++ R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, ++ const MCSubtargetInfo &sti, MCContext &ctx) ++ : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { } ++ ++ /// \brief Encode the instruction and write it to the OS. ++ virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS, ++ SmallVectorImpl &Fixups) const; ++ ++ /// \returns the encoding for an MCOperand. 
++ virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, ++ SmallVectorImpl &Fixups) const; ++private: ++ ++ void EmitALUInstr(const MCInst &MI, SmallVectorImpl &Fixups, ++ raw_ostream &OS) const; ++ void EmitSrc(const MCInst &MI, unsigned OpIdx, raw_ostream &OS) const; ++ void EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, unsigned SelOpIdx, ++ raw_ostream &OS) const; ++ void EmitDst(const MCInst &MI, raw_ostream &OS) const; ++ void EmitTexInstr(const MCInst &MI, SmallVectorImpl &Fixups, ++ raw_ostream &OS) const; ++ void EmitFCInstr(const MCInst &MI, raw_ostream &OS) const; ++ ++ void EmitNullBytes(unsigned int byteCount, raw_ostream &OS) const; ++ ++ void EmitByte(unsigned int byte, raw_ostream &OS) const; ++ ++ void EmitTwoBytes(uint32_t bytes, raw_ostream &OS) const; ++ ++ void Emit(uint32_t value, raw_ostream &OS) const; ++ void Emit(uint64_t value, raw_ostream &OS) const; ++ ++ unsigned getHWRegChan(unsigned reg) const; ++ unsigned getHWReg(unsigned regNo) const; ++ ++ bool isFCOp(unsigned opcode) const; ++ bool isTexOp(unsigned opcode) const; ++ bool isFlagSet(const MCInst &MI, unsigned Operand, unsigned Flag) const; ++ ++}; ++ ++} // End anonymous namespace ++ ++enum RegElement { ++ ELEMENT_X = 0, ++ ELEMENT_Y, ++ ELEMENT_Z, ++ ELEMENT_W ++}; ++ ++enum InstrTypes { ++ INSTR_ALU = 0, ++ INSTR_TEX, ++ INSTR_FC, ++ INSTR_NATIVE, ++ INSTR_VTX, ++ INSTR_EXPORT ++}; ++ ++enum FCInstr { ++ FC_IF_PREDICATE = 0, ++ FC_ELSE, ++ FC_ENDIF, ++ FC_BGNLOOP, ++ FC_ENDLOOP, ++ FC_BREAK_PREDICATE, ++ FC_CONTINUE ++}; ++ ++enum TextureTypes { ++ TEXTURE_1D = 1, ++ TEXTURE_2D, ++ TEXTURE_3D, ++ TEXTURE_CUBE, ++ TEXTURE_RECT, ++ TEXTURE_SHADOW1D, ++ TEXTURE_SHADOW2D, ++ TEXTURE_SHADOWRECT, ++ TEXTURE_1D_ARRAY, ++ TEXTURE_2D_ARRAY, ++ TEXTURE_SHADOW1D_ARRAY, ++ TEXTURE_SHADOW2D_ARRAY ++}; ++ ++MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, ++ const MCRegisterInfo &MRI, ++ const MCSubtargetInfo &STI, ++ MCContext &Ctx) { ++ return new 
R600MCCodeEmitter(MCII, MRI, STI, Ctx); ++} ++ ++void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, ++ SmallVectorImpl &Fixups) const { ++ if (isTexOp(MI.getOpcode())) { ++ EmitTexInstr(MI, Fixups, OS); ++ } else if (isFCOp(MI.getOpcode())){ ++ EmitFCInstr(MI, OS); ++ } else if (MI.getOpcode() == AMDGPU::RETURN || ++ MI.getOpcode() == AMDGPU::BUNDLE || ++ MI.getOpcode() == AMDGPU::KILL) { ++ return; ++ } else { ++ switch(MI.getOpcode()) { ++ case AMDGPU::RAT_WRITE_CACHELESS_32_eg: ++ case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { ++ uint64_t inst = getBinaryCodeForInstr(MI, Fixups); ++ EmitByte(INSTR_NATIVE, OS); ++ Emit(inst, OS); ++ break; ++ } ++ case AMDGPU::CONSTANT_LOAD_eg: ++ case AMDGPU::VTX_READ_PARAM_8_eg: ++ case AMDGPU::VTX_READ_PARAM_16_eg: ++ case AMDGPU::VTX_READ_PARAM_32_eg: ++ case AMDGPU::VTX_READ_GLOBAL_8_eg: ++ case AMDGPU::VTX_READ_GLOBAL_32_eg: ++ case AMDGPU::VTX_READ_GLOBAL_128_eg: ++ case AMDGPU::TEX_VTX_CONSTBUF: { ++ uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups); ++ uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset ++ ++ EmitByte(INSTR_VTX, OS); ++ Emit(InstWord01, OS); ++ Emit(InstWord2, OS); ++ break; ++ } ++ case AMDGPU::EG_ExportSwz: ++ case AMDGPU::R600_ExportSwz: ++ case AMDGPU::EG_ExportBuf: ++ case AMDGPU::R600_ExportBuf: { ++ uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); ++ EmitByte(INSTR_EXPORT, OS); ++ Emit(Inst, OS); ++ break; ++ } ++ ++ default: ++ EmitALUInstr(MI, Fixups, OS); ++ break; ++ } ++ } ++} ++ ++void R600MCCodeEmitter::EmitALUInstr(const MCInst &MI, ++ SmallVectorImpl &Fixups, ++ raw_ostream &OS) const { ++ const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode()); ++ ++ // Emit instruction type ++ EmitByte(INSTR_ALU, OS); ++ ++ uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups); ++ ++ //older alu have different encoding for instructions with one or two src ++ //parameters. 
++ if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) && ++ !(MCDesc.TSFlags & R600_InstFlag::OP3)) { ++ uint64_t ISAOpCode = InstWord01 & (0x3FFULL << 39); ++ InstWord01 &= ~(0x3FFULL << 39); ++ InstWord01 |= ISAOpCode << 1; ++ } ++ ++ unsigned SrcNum = MCDesc.TSFlags & R600_InstFlag::OP3 ? 3 : ++ MCDesc.TSFlags & R600_InstFlag::OP2 ? 2 : 1; ++ ++ EmitByte(SrcNum, OS); ++ ++ const unsigned SrcOps[3][2] = { ++ {R600Operands::SRC0, R600Operands::SRC0_SEL}, ++ {R600Operands::SRC1, R600Operands::SRC1_SEL}, ++ {R600Operands::SRC2, R600Operands::SRC2_SEL} ++ }; ++ ++ for (unsigned SrcIdx = 0; SrcIdx < SrcNum; ++SrcIdx) { ++ unsigned RegOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][0]]; ++ unsigned SelOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][1]]; ++ EmitSrcISA(MI, RegOpIdx, SelOpIdx, OS); ++ } ++ ++ Emit(InstWord01, OS); ++ return; ++} ++ ++void R600MCCodeEmitter::EmitSrc(const MCInst &MI, unsigned OpIdx, ++ raw_ostream &OS) const { ++ const MCOperand &MO = MI.getOperand(OpIdx); ++ union { ++ float f; ++ uint32_t i; ++ } Value; ++ Value.i = 0; ++ // Emit the source select (2 bytes). For GPRs, this is the register index. ++ // For other potential instruction operands, (e.g. constant registers) the ++ // value of the source select is defined in the r600isa docs. ++ if (MO.isReg()) { ++ unsigned reg = MO.getReg(); ++ EmitTwoBytes(getHWReg(reg), OS); ++ if (reg == AMDGPU::ALU_LITERAL_X) { ++ unsigned ImmOpIndex = MI.getNumOperands() - 1; ++ MCOperand ImmOp = MI.getOperand(ImmOpIndex); ++ if (ImmOp.isFPImm()) { ++ Value.f = ImmOp.getFPImm(); ++ } else { ++ assert(ImmOp.isImm()); ++ Value.i = ImmOp.getImm(); ++ } ++ } ++ } else { ++ // XXX: Handle other operand types. 
++ EmitTwoBytes(0, OS); ++ } ++ ++ // Emit the source channel (1 byte) ++ if (MO.isReg()) { ++ EmitByte(getHWRegChan(MO.getReg()), OS); ++ } else { ++ EmitByte(0, OS); ++ } ++ ++ // XXX: Emit isNegated (1 byte) ++ if ((!(isFlagSet(MI, OpIdx, MO_FLAG_ABS))) ++ && (isFlagSet(MI, OpIdx, MO_FLAG_NEG) || ++ (MO.isReg() && ++ (MO.getReg() == AMDGPU::NEG_ONE || MO.getReg() == AMDGPU::NEG_HALF)))){ ++ EmitByte(1, OS); ++ } else { ++ EmitByte(0, OS); ++ } ++ ++ // Emit isAbsolute (1 byte) ++ if (isFlagSet(MI, OpIdx, MO_FLAG_ABS)) { ++ EmitByte(1, OS); ++ } else { ++ EmitByte(0, OS); ++ } ++ ++ // XXX: Emit relative addressing mode (1 byte) ++ EmitByte(0, OS); ++ ++ // Emit kc_bank, This will be adjusted later by r600_asm ++ EmitByte(0, OS); ++ ++ // Emit the literal value, if applicable (4 bytes). ++ Emit(Value.i, OS); ++ ++} ++ ++void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, ++ unsigned SelOpIdx, raw_ostream &OS) const { ++ const MCOperand &RegMO = MI.getOperand(RegOpIdx); ++ const MCOperand &SelMO = MI.getOperand(SelOpIdx); ++ ++ union { ++ float f; ++ uint32_t i; ++ } InlineConstant; ++ InlineConstant.i = 0; ++ // Emit source type (1 byte) and source select (4 bytes). For GPRs type is 0 ++ // and select is 0 (GPR index is encoded in the instr encoding. For constants ++ // type is 1 and select is the original const select passed from the driver. ++ unsigned Reg = RegMO.getReg(); ++ if (Reg == AMDGPU::ALU_CONST) { ++ EmitByte(1, OS); ++ uint32_t Sel = SelMO.getImm(); ++ Emit(Sel, OS); ++ } else { ++ EmitByte(0, OS); ++ Emit((uint32_t)0, OS); ++ } ++ ++ if (Reg == AMDGPU::ALU_LITERAL_X) { ++ unsigned ImmOpIndex = MI.getNumOperands() - 1; ++ MCOperand ImmOp = MI.getOperand(ImmOpIndex); ++ if (ImmOp.isFPImm()) { ++ InlineConstant.f = ImmOp.getFPImm(); ++ } else { ++ assert(ImmOp.isImm()); ++ InlineConstant.i = ImmOp.getImm(); ++ } ++ } ++ ++ // Emit the literal value, if applicable (4 bytes). 
++ Emit(InlineConstant.i, OS); ++} ++ ++void R600MCCodeEmitter::EmitTexInstr(const MCInst &MI, ++ SmallVectorImpl &Fixups, ++ raw_ostream &OS) const { ++ ++ unsigned Opcode = MI.getOpcode(); ++ bool hasOffsets = (Opcode == AMDGPU::TEX_LD); ++ unsigned OpOffset = hasOffsets ? 3 : 0; ++ int64_t Resource = MI.getOperand(OpOffset + 2).getImm(); ++ int64_t Sampler = MI.getOperand(OpOffset + 3).getImm(); ++ int64_t TextureType = MI.getOperand(OpOffset + 4).getImm(); ++ unsigned srcSelect[4] = {0, 1, 2, 3}; ++ ++ // Emit instruction type ++ EmitByte(1, OS); ++ ++ // Emit instruction ++ EmitByte(getBinaryCodeForInstr(MI, Fixups), OS); ++ ++ // Emit resource id ++ EmitByte(Resource, OS); ++ ++ // Emit source register ++ EmitByte(getHWReg(MI.getOperand(1).getReg()), OS); ++ ++ // XXX: Emit src isRelativeAddress ++ EmitByte(0, OS); ++ ++ // Emit destination register ++ EmitByte(getHWReg(MI.getOperand(0).getReg()), OS); ++ ++ // XXX: Emit dst isRelativeAddress ++ EmitByte(0, OS); ++ ++ // XXX: Emit dst select ++ EmitByte(0, OS); // X ++ EmitByte(1, OS); // Y ++ EmitByte(2, OS); // Z ++ EmitByte(3, OS); // W ++ ++ // XXX: Emit lod bias ++ EmitByte(0, OS); ++ ++ // XXX: Emit coord types ++ unsigned coordType[4] = {1, 1, 1, 1}; ++ ++ if (TextureType == TEXTURE_RECT ++ || TextureType == TEXTURE_SHADOWRECT) { ++ coordType[ELEMENT_X] = 0; ++ coordType[ELEMENT_Y] = 0; ++ } ++ ++ if (TextureType == TEXTURE_1D_ARRAY ++ || TextureType == TEXTURE_SHADOW1D_ARRAY) { ++ if (Opcode == AMDGPU::TEX_SAMPLE_C_L || Opcode == AMDGPU::TEX_SAMPLE_C_LB) { ++ coordType[ELEMENT_Y] = 0; ++ } else { ++ coordType[ELEMENT_Z] = 0; ++ srcSelect[ELEMENT_Z] = ELEMENT_Y; ++ } ++ } else if (TextureType == TEXTURE_2D_ARRAY ++ || TextureType == TEXTURE_SHADOW2D_ARRAY) { ++ coordType[ELEMENT_Z] = 0; ++ } ++ ++ for (unsigned i = 0; i < 4; i++) { ++ EmitByte(coordType[i], OS); ++ } ++ ++ // XXX: Emit offsets ++ if (hasOffsets) ++ for (unsigned i = 2; i < 5; i++) ++ EmitByte(MI.getOperand(i).getImm()<<1, OS); ++ else 
++ EmitNullBytes(3, OS); ++ ++ // Emit sampler id ++ EmitByte(Sampler, OS); ++ ++ // XXX:Emit source select ++ if ((TextureType == TEXTURE_SHADOW1D ++ || TextureType == TEXTURE_SHADOW2D ++ || TextureType == TEXTURE_SHADOWRECT ++ || TextureType == TEXTURE_SHADOW1D_ARRAY) ++ && Opcode != AMDGPU::TEX_SAMPLE_C_L ++ && Opcode != AMDGPU::TEX_SAMPLE_C_LB) { ++ srcSelect[ELEMENT_W] = ELEMENT_Z; ++ } ++ ++ for (unsigned i = 0; i < 4; i++) { ++ EmitByte(srcSelect[i], OS); ++ } ++} ++ ++void R600MCCodeEmitter::EmitFCInstr(const MCInst &MI, raw_ostream &OS) const { ++ ++ // Emit instruction type ++ EmitByte(INSTR_FC, OS); ++ ++ // Emit SRC ++ unsigned NumOperands = MI.getNumOperands(); ++ if (NumOperands > 0) { ++ assert(NumOperands == 1); ++ EmitSrc(MI, 0, OS); ++ } else { ++ EmitNullBytes(SRC_BYTE_COUNT, OS); ++ } ++ ++ // Emit FC Instruction ++ enum FCInstr instr; ++ switch (MI.getOpcode()) { ++ case AMDGPU::PREDICATED_BREAK: ++ instr = FC_BREAK_PREDICATE; ++ break; ++ case AMDGPU::CONTINUE: ++ instr = FC_CONTINUE; ++ break; ++ case AMDGPU::IF_PREDICATE_SET: ++ instr = FC_IF_PREDICATE; ++ break; ++ case AMDGPU::ELSE: ++ instr = FC_ELSE; ++ break; ++ case AMDGPU::ENDIF: ++ instr = FC_ENDIF; ++ break; ++ case AMDGPU::ENDLOOP: ++ instr = FC_ENDLOOP; ++ break; ++ case AMDGPU::WHILELOOP: ++ instr = FC_BGNLOOP; ++ break; ++ default: ++ abort(); ++ break; ++ } ++ EmitByte(instr, OS); ++} ++ ++void R600MCCodeEmitter::EmitNullBytes(unsigned int ByteCount, ++ raw_ostream &OS) const { ++ ++ for (unsigned int i = 0; i < ByteCount; i++) { ++ EmitByte(0, OS); ++ } ++} ++ ++void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const { ++ OS.write((uint8_t) Byte & 0xff); ++} ++ ++void R600MCCodeEmitter::EmitTwoBytes(unsigned int Bytes, ++ raw_ostream &OS) const { ++ OS.write((uint8_t) (Bytes & 0xff)); ++ OS.write((uint8_t) ((Bytes >> 8) & 0xff)); ++} ++ ++void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const { ++ for (unsigned i = 0; i < 4; i++) { ++ 
OS.write((uint8_t) ((Value >> (8 * i)) & 0xff)); ++ } ++} ++ ++void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const { ++ for (unsigned i = 0; i < 8; i++) { ++ EmitByte((Value >> (8 * i)) & 0xff, OS); ++ } ++} ++ ++unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const { ++ return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT; ++} ++ ++unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const { ++ return MRI.getEncodingValue(RegNo) & HW_REG_MASK; ++} ++ ++uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, ++ const MCOperand &MO, ++ SmallVectorImpl &Fixup) const { ++ if (MO.isReg()) { ++ if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) { ++ return MRI.getEncodingValue(MO.getReg()); ++ } else { ++ return getHWReg(MO.getReg()); ++ } ++ } else if (MO.isImm()) { ++ return MO.getImm(); ++ } else { ++ assert(0); ++ return 0; ++ } ++} ++ ++//===----------------------------------------------------------------------===// ++// Encoding helper functions ++//===----------------------------------------------------------------------===// ++ ++bool R600MCCodeEmitter::isFCOp(unsigned opcode) const { ++ switch(opcode) { ++ default: return false; ++ case AMDGPU::PREDICATED_BREAK: ++ case AMDGPU::CONTINUE: ++ case AMDGPU::IF_PREDICATE_SET: ++ case AMDGPU::ELSE: ++ case AMDGPU::ENDIF: ++ case AMDGPU::ENDLOOP: ++ case AMDGPU::WHILELOOP: ++ return true; ++ } ++} ++ ++bool R600MCCodeEmitter::isTexOp(unsigned opcode) const { ++ switch(opcode) { ++ default: return false; ++ case AMDGPU::TEX_LD: ++ case AMDGPU::TEX_GET_TEXTURE_RESINFO: ++ case AMDGPU::TEX_SAMPLE: ++ case AMDGPU::TEX_SAMPLE_C: ++ case AMDGPU::TEX_SAMPLE_L: ++ case AMDGPU::TEX_SAMPLE_C_L: ++ case AMDGPU::TEX_SAMPLE_LB: ++ case AMDGPU::TEX_SAMPLE_C_LB: ++ case AMDGPU::TEX_SAMPLE_G: ++ case AMDGPU::TEX_SAMPLE_C_G: ++ case AMDGPU::TEX_GET_GRADIENTS_H: ++ case AMDGPU::TEX_GET_GRADIENTS_V: ++ case AMDGPU::TEX_SET_GRADIENTS_H: ++ case AMDGPU::TEX_SET_GRADIENTS_V: ++ return true; ++ } ++} ++ 
++bool R600MCCodeEmitter::isFlagSet(const MCInst &MI, unsigned Operand, ++ unsigned Flag) const { ++ const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode()); ++ unsigned FlagIndex = GET_FLAG_OPERAND_IDX(MCDesc.TSFlags); ++ if (FlagIndex == 0) { ++ return false; ++ } ++ assert(MI.getOperand(FlagIndex).isImm()); ++ return !!((MI.getOperand(FlagIndex).getImm() >> ++ (NUM_MO_FLAGS * Operand)) & Flag); ++} ++ ++#include "AMDGPUGenMCCodeEmitter.inc" +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp llvm-r600/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp +--- llvm-3.2.src/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp 2013-01-25 19:43:57.460049721 +0100 +@@ -0,0 +1,298 @@ ++//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief The SI code emitter produces machine code that can be executed ++/// directly on the GPU device. ++// ++//===----------------------------------------------------------------------===// ++ ++#include "MCTargetDesc/AMDGPUMCTargetDesc.h" ++#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" ++#include "llvm/MC/MCCodeEmitter.h" ++#include "llvm/MC/MCContext.h" ++#include "llvm/MC/MCInst.h" ++#include "llvm/MC/MCInstrInfo.h" ++#include "llvm/MC/MCRegisterInfo.h" ++#include "llvm/MC/MCSubtargetInfo.h" ++#include "llvm/MC/MCFixup.h" ++#include "llvm/Support/raw_ostream.h" ++ ++#define VGPR_BIT(src_idx) (1ULL << (9 * src_idx - 1)) ++#define SI_INSTR_FLAGS_ENCODING_MASK 0xf ++ ++// These must be kept in sync with SIInstructions.td and also the ++// InstrEncodingInfo array in SIInstrInfo.cpp. 
++// ++// NOTE: This enum is only used to identify the encoding type within LLVM, ++// the actual encoding type that is part of the instruction format is different ++namespace SIInstrEncodingType { ++ enum Encoding { ++ EXP = 0, ++ LDS = 1, ++ MIMG = 2, ++ MTBUF = 3, ++ MUBUF = 4, ++ SMRD = 5, ++ SOP1 = 6, ++ SOP2 = 7, ++ SOPC = 8, ++ SOPK = 9, ++ SOPP = 10, ++ VINTRP = 11, ++ VOP1 = 12, ++ VOP2 = 13, ++ VOP3 = 14, ++ VOPC = 15 ++ }; ++} ++ ++using namespace llvm; ++ ++namespace { ++class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { ++ SIMCCodeEmitter(const SIMCCodeEmitter &); // DO NOT IMPLEMENT ++ void operator=(const SIMCCodeEmitter &); // DO NOT IMPLEMENT ++ const MCInstrInfo &MCII; ++ const MCRegisterInfo &MRI; ++ const MCSubtargetInfo &STI; ++ MCContext &Ctx; ++ ++public: ++ SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, ++ const MCSubtargetInfo &sti, MCContext &ctx) ++ : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { } ++ ++ ~SIMCCodeEmitter() { } ++ ++ /// \brief Encode the instruction and write it to the OS. ++ virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS, ++ SmallVectorImpl &Fixups) const; ++ ++ /// \returns the encoding for an MCOperand. ++ virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, ++ SmallVectorImpl &Fixups) const; ++ ++public: ++ ++ /// \brief Encode a sequence of registers with the correct alignment. 
++ unsigned GPRAlign(const MCInst &MI, unsigned OpNo, unsigned shift) const; ++ ++ /// \brief Encoding for when 2 consecutive registers are used ++ virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo, ++ SmallVectorImpl &Fixup) const; ++ ++ /// \brief Encoding for when 4 consecutive registers are used ++ virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo, ++ SmallVectorImpl &Fixup) const; ++ ++ /// \brief Encoding for SMRD indexed loads ++ virtual uint32_t SMRDmemriEncode(const MCInst &MI, unsigned OpNo, ++ SmallVectorImpl &Fixup) const; ++ ++ /// \brief Post-Encoder method for VOP instructions ++ virtual uint64_t VOPPostEncode(const MCInst &MI, uint64_t Value) const; ++ ++private: ++ ++ /// \returns the SIInstrEncodingType for this instruction. ++ unsigned getEncodingType(const MCInst &MI) const; ++ ++ /// \brief Get the size in bytes of this instruction's encoding. ++ unsigned getEncodingBytes(const MCInst &MI) const; ++ ++ /// \returns the hardware encoding for a register ++ unsigned getRegBinaryCode(unsigned reg) const; ++ ++ /// \brief Generated function that returns the hardware encoding for ++ /// a register ++ unsigned getHWRegNum(unsigned reg) const; ++ ++}; ++ ++} // End anonymous namespace ++ ++MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, ++ const MCRegisterInfo &MRI, ++ const MCSubtargetInfo &STI, ++ MCContext &Ctx) { ++ return new SIMCCodeEmitter(MCII, MRI, STI, Ctx); ++} ++ ++void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, ++ SmallVectorImpl &Fixups) const { ++ uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups); ++ unsigned bytes = getEncodingBytes(MI); ++ for (unsigned i = 0; i < bytes; i++) { ++ OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff)); ++ } ++} ++ ++uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, ++ const MCOperand &MO, ++ SmallVectorImpl &Fixups) const { ++ if (MO.isReg()) { ++ return getRegBinaryCode(MO.getReg()); ++ } else if (MO.isImm()) { 
++ return MO.getImm(); ++ } else if (MO.isFPImm()) { ++ // XXX: Not all instructions can use inline literals ++ // XXX: We should make sure this is a 32-bit constant ++ union { ++ float F; ++ uint32_t I; ++ } Imm; ++ Imm.F = MO.getFPImm(); ++ return Imm.I; ++ } else if (MO.isExpr()) { ++ const MCExpr *Expr = MO.getExpr(); ++ MCFixupKind Kind = MCFixupKind(FK_PCRel_4); ++ Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); ++ return 0; ++ } else{ ++ llvm_unreachable("Encoding of this operand type is not supported yet."); ++ } ++ return 0; ++} ++ ++//===----------------------------------------------------------------------===// ++// Custom Operand Encodings ++//===----------------------------------------------------------------------===// ++ ++unsigned SIMCCodeEmitter::GPRAlign(const MCInst &MI, unsigned OpNo, ++ unsigned shift) const { ++ unsigned regCode = getRegBinaryCode(MI.getOperand(OpNo).getReg()); ++ return regCode >> shift; ++ return 0; ++} ++unsigned SIMCCodeEmitter::GPR2AlignEncode(const MCInst &MI, ++ unsigned OpNo , ++ SmallVectorImpl &Fixup) const { ++ return GPRAlign(MI, OpNo, 1); ++} ++ ++unsigned SIMCCodeEmitter::GPR4AlignEncode(const MCInst &MI, ++ unsigned OpNo, ++ SmallVectorImpl &Fixup) const { ++ return GPRAlign(MI, OpNo, 2); ++} ++ ++#define SMRD_OFFSET_MASK 0xff ++#define SMRD_IMM_SHIFT 8 ++#define SMRD_SBASE_MASK 0x3f ++#define SMRD_SBASE_SHIFT 9 ++/// This function is responsible for encoding the offset ++/// and the base ptr for SMRD instructions. It should return a bit string in ++/// this format: ++/// ++/// OFFSET = bits{7-0} ++/// IMM = bits{8} ++/// SBASE = bits{14-9} ++/// ++uint32_t SIMCCodeEmitter::SMRDmemriEncode(const MCInst &MI, unsigned OpNo, ++ SmallVectorImpl &Fixup) const { ++ uint32_t Encoding; ++ ++ const MCOperand &OffsetOp = MI.getOperand(OpNo + 1); ++ ++ //XXX: Use this function for SMRD loads with register offsets ++ assert(OffsetOp.isImm()); ++ ++ Encoding = ++ (getMachineOpValue(MI, OffsetOp, Fixup) & 
SMRD_OFFSET_MASK) ++ | (1 << SMRD_IMM_SHIFT) //XXX If the Offset is a register we shouldn't set this bit ++ | ((GPR2AlignEncode(MI, OpNo, Fixup) & SMRD_SBASE_MASK) << SMRD_SBASE_SHIFT) ++ ; ++ ++ return Encoding; ++} ++ ++//===----------------------------------------------------------------------===// ++// Post Encoder Callbacks ++//===----------------------------------------------------------------------===// ++ ++uint64_t SIMCCodeEmitter::VOPPostEncode(const MCInst &MI, uint64_t Value) const{ ++ unsigned encodingType = getEncodingType(MI); ++ unsigned numSrcOps; ++ unsigned vgprBitOffset; ++ ++ if (encodingType == SIInstrEncodingType::VOP3) { ++ numSrcOps = 3; ++ vgprBitOffset = 32; ++ } else { ++ numSrcOps = 1; ++ vgprBitOffset = 0; ++ } ++ ++ // Add one to skip over the destination reg operand. ++ for (unsigned opIdx = 1; opIdx < numSrcOps + 1; opIdx++) { ++ const MCOperand &MO = MI.getOperand(opIdx); ++ if (MO.isReg()) { ++ unsigned reg = MI.getOperand(opIdx).getReg(); ++ if (AMDGPUMCRegisterClasses[AMDGPU::VReg_32RegClassID].contains(reg) || ++ AMDGPUMCRegisterClasses[AMDGPU::VReg_64RegClassID].contains(reg)) { ++ Value |= (VGPR_BIT(opIdx)) << vgprBitOffset; ++ } ++ } else if (MO.isFPImm()) { ++ union { ++ float f; ++ uint32_t i; ++ } Imm; ++ // XXX: Not all instructions can use inline literals ++ // XXX: We should make sure this is a 32-bit constant ++ Imm.f = MO.getFPImm(); ++ Value |= ((uint64_t)Imm.i) << 32; ++ } ++ } ++ return Value; ++} ++ ++//===----------------------------------------------------------------------===// ++// Encoding helper functions ++//===----------------------------------------------------------------------===// ++ ++unsigned SIMCCodeEmitter::getEncodingType(const MCInst &MI) const { ++ return MCII.get(MI.getOpcode()).TSFlags & SI_INSTR_FLAGS_ENCODING_MASK; ++} ++ ++unsigned SIMCCodeEmitter::getEncodingBytes(const MCInst &MI) const { ++ ++ // These instructions aren't real instructions with an encoding type, so ++ // we need to 
manually specify their size. ++ switch (MI.getOpcode()) { ++ default: break; ++ case AMDGPU::SI_LOAD_LITERAL_I32: ++ case AMDGPU::SI_LOAD_LITERAL_F32: ++ return 4; ++ } ++ ++ unsigned encoding_type = getEncodingType(MI); ++ switch (encoding_type) { ++ case SIInstrEncodingType::EXP: ++ case SIInstrEncodingType::LDS: ++ case SIInstrEncodingType::MUBUF: ++ case SIInstrEncodingType::MTBUF: ++ case SIInstrEncodingType::MIMG: ++ case SIInstrEncodingType::VOP3: ++ return 8; ++ default: ++ return 4; ++ } ++} ++ ++ ++unsigned SIMCCodeEmitter::getRegBinaryCode(unsigned reg) const { ++ switch (reg) { ++ case AMDGPU::M0: return 124; ++ case AMDGPU::SREG_LIT_0: return 128; ++ case AMDGPU::SI_LITERAL_CONSTANT: return 255; ++ default: return MRI.getEncodingValue(reg); ++ } ++} ++ +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/Processors.td llvm-r600/lib/Target/R600/Processors.td +--- llvm-3.2.src/lib/Target/R600/Processors.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/Processors.td 2013-01-25 19:43:57.460049721 +0100 +@@ -0,0 +1,29 @@ ++//===-- Processors.td - TODO: Add brief description -------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// AMDIL processors supported. 
++// ++//===----------------------------------------------------------------------===// ++ ++class Proc Features> ++: Processor; ++def : Proc<"r600", R600_EG_Itin, [FeatureR600ALUInst]>; ++def : Proc<"rv710", R600_EG_Itin, []>; ++def : Proc<"rv730", R600_EG_Itin, []>; ++def : Proc<"rv770", R600_EG_Itin, [FeatureFP64]>; ++def : Proc<"cedar", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; ++def : Proc<"redwood", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; ++def : Proc<"juniper", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; ++def : Proc<"cypress", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>; ++def : Proc<"barts", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; ++def : Proc<"turks", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; ++def : Proc<"caicos", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; ++def : Proc<"cayman", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>; ++def : Proc<"SI", SI_Itin, [Feature64BitPtr]>; ++ +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600Defines.h llvm-r600/lib/Target/R600/R600Defines.h +--- llvm-3.2.src/lib/Target/R600/R600Defines.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/R600Defines.h 2013-01-25 19:43:57.460049721 +0100 +@@ -0,0 +1,94 @@ ++//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++/// \file ++//===----------------------------------------------------------------------===// ++ ++#ifndef R600DEFINES_H_ ++#define R600DEFINES_H_ ++ ++#include "llvm/MC/MCRegisterInfo.h" ++ ++// Operand Flags ++#define MO_FLAG_CLAMP (1 << 0) ++#define MO_FLAG_NEG (1 << 1) ++#define MO_FLAG_ABS (1 << 2) ++#define MO_FLAG_MASK (1 << 3) ++#define MO_FLAG_PUSH (1 << 4) ++#define MO_FLAG_NOT_LAST (1 << 5) ++#define MO_FLAG_LAST (1 << 6) ++#define NUM_MO_FLAGS 7 ++ ++/// \brief Helper for getting the operand index for the instruction flags ++/// operand. ++#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3) ++ ++namespace R600_InstFlag { ++ enum TIF { ++ TRANS_ONLY = (1 << 0), ++ TEX = (1 << 1), ++ REDUCTION = (1 << 2), ++ FC = (1 << 3), ++ TRIG = (1 << 4), ++ OP3 = (1 << 5), ++ VECTOR = (1 << 6), ++ //FlagOperand bits 7, 8 ++ NATIVE_OPERANDS = (1 << 9), ++ OP1 = (1 << 10), ++ OP2 = (1 << 11) ++ }; ++} ++ ++#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS) ++ ++/// \brief Defines for extracting register information from register encoding ++#define HW_REG_MASK 0x1ff ++#define HW_CHAN_SHIFT 9 ++ ++namespace R600Operands { ++ enum Ops { ++ DST, ++ UPDATE_EXEC_MASK, ++ UPDATE_PREDICATE, ++ WRITE, ++ OMOD, ++ DST_REL, ++ CLAMP, ++ SRC0, ++ SRC0_NEG, ++ SRC0_REL, ++ SRC0_ABS, ++ SRC0_SEL, ++ SRC1, ++ SRC1_NEG, ++ SRC1_REL, ++ SRC1_ABS, ++ SRC1_SEL, ++ SRC2, ++ SRC2_NEG, ++ SRC2_REL, ++ SRC2_SEL, ++ LAST, ++ PRED_SEL, ++ IMM, ++ COUNT ++ }; ++ ++ const static int ALUOpTable[3][R600Operands::COUNT] = { ++// W C S S S S S S S S S S S ++// R O D L S R R R R S R R R R S R R R L P ++// D U I M R A R C C C C R C C C C R C C C A R I ++// S E U T O E M C 0 0 0 0 C 1 1 1 1 C 2 2 2 S E M ++// T M P E D L P 0 N R A S 1 N R A S 2 N R S T D M ++ {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,11,12}, ++ {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,13,14,15,16,-1,-1,-1,-1,17,18,19}, ++ {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 
9,-1,10,11,12,13,14,15,16,17} ++ }; ++ ++} ++ ++#endif // R600DEFINES_H_ +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600ExpandSpecialInstrs.cpp llvm-r600/lib/Target/R600/R600ExpandSpecialInstrs.cpp +--- llvm-3.2.src/lib/Target/R600/R600ExpandSpecialInstrs.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/R600ExpandSpecialInstrs.cpp 2013-01-25 19:43:57.463383054 +0100 +@@ -0,0 +1,333 @@ ++//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// Vector, Reduction, and Cube instructions need to fill the entire instruction ++/// group to work correctly. This pass expands these individual instructions ++/// into several instructions that will completely fill the instruction group. 
++// ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPU.h" ++#include "R600Defines.h" ++#include "R600InstrInfo.h" ++#include "R600RegisterInfo.h" ++#include "R600MachineFunctionInfo.h" ++#include "llvm/CodeGen/MachineFunctionPass.h" ++#include "llvm/CodeGen/MachineInstrBuilder.h" ++#include "llvm/CodeGen/MachineRegisterInfo.h" ++ ++using namespace llvm; ++ ++namespace { ++ ++class R600ExpandSpecialInstrsPass : public MachineFunctionPass { ++ ++private: ++ static char ID; ++ const R600InstrInfo *TII; ++ ++ bool ExpandInputPerspective(MachineInstr& MI); ++ bool ExpandInputConstant(MachineInstr& MI); ++ ++public: ++ R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID), ++ TII (static_cast(tm.getInstrInfo())) { } ++ ++ virtual bool runOnMachineFunction(MachineFunction &MF); ++ ++ const char *getPassName() const { ++ return "R600 Expand special instructions pass"; ++ } ++}; ++ ++} // End anonymous namespace ++ ++char R600ExpandSpecialInstrsPass::ID = 0; ++ ++FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) { ++ return new R600ExpandSpecialInstrsPass(TM); ++} ++ ++bool R600ExpandSpecialInstrsPass::ExpandInputPerspective(MachineInstr &MI) { ++ const R600RegisterInfo &TRI = TII->getRegisterInfo(); ++ if (MI.getOpcode() != AMDGPU::input_perspective) ++ return false; ++ ++ MachineBasicBlock::iterator I = &MI; ++ unsigned DstReg = MI.getOperand(0).getReg(); ++ R600MachineFunctionInfo *MFI = MI.getParent()->getParent() ++ ->getInfo(); ++ unsigned IJIndexBase; ++ ++ // In Evergreen ISA doc section 8.3.2 : ++ // We need to interpolate XY and ZW in two different instruction groups. ++ // An INTERP_* must occupy all 4 slots of an instruction group. 
++ // Output of INTERP_XY is written in X,Y slots ++ // Output of INTERP_ZW is written in Z,W slots ++ // ++ // Thus interpolation requires the following sequences : ++ // ++ // AnyGPR.x = INTERP_ZW; (Write Masked Out) ++ // AnyGPR.y = INTERP_ZW; (Write Masked Out) ++ // DstGPR.z = INTERP_ZW; ++ // DstGPR.w = INTERP_ZW; (End of first IG) ++ // DstGPR.x = INTERP_XY; ++ // DstGPR.y = INTERP_XY; ++ // AnyGPR.z = INTERP_XY; (Write Masked Out) ++ // AnyGPR.w = INTERP_XY; (Write Masked Out) (End of second IG) ++ // ++ switch (MI.getOperand(1).getImm()) { ++ case 0: ++ IJIndexBase = MFI->GetIJPerspectiveIndex(); ++ break; ++ case 1: ++ IJIndexBase = MFI->GetIJLinearIndex(); ++ break; ++ default: ++ assert(0 && "Unknow ij index"); ++ } ++ ++ for (unsigned i = 0; i < 8; i++) { ++ unsigned IJIndex = AMDGPU::R600_TReg32RegClass.getRegister( ++ 2 * IJIndexBase + ((i + 1) % 2)); ++ unsigned ReadReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( ++ MI.getOperand(2).getImm()); ++ ++ ++ unsigned Sel = AMDGPU::sel_x; ++ switch (i % 4) { ++ case 0:Sel = AMDGPU::sel_x;break; ++ case 1:Sel = AMDGPU::sel_y;break; ++ case 2:Sel = AMDGPU::sel_z;break; ++ case 3:Sel = AMDGPU::sel_w;break; ++ default:break; ++ } ++ ++ unsigned Res = TRI.getSubReg(DstReg, Sel); ++ ++ unsigned Opcode = (i < 4)?AMDGPU::INTERP_ZW:AMDGPU::INTERP_XY; ++ ++ MachineBasicBlock &MBB = *(MI.getParent()); ++ MachineInstr *NewMI = ++ TII->buildDefaultInstruction(MBB, I, Opcode, Res, IJIndex, ReadReg); ++ ++ if (!(i> 1 && i < 6)) { ++ TII->addFlag(NewMI, 0, MO_FLAG_MASK); ++ } ++ ++ if (i % 4 != 3) ++ TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); ++ } ++ ++ MI.eraseFromParent(); ++ ++ return true; ++} ++ ++bool R600ExpandSpecialInstrsPass::ExpandInputConstant(MachineInstr &MI) { ++ const R600RegisterInfo &TRI = TII->getRegisterInfo(); ++ if (MI.getOpcode() != AMDGPU::input_constant) ++ return false; ++ ++ MachineBasicBlock::iterator I = &MI; ++ unsigned DstReg = MI.getOperand(0).getReg(); ++ ++ for (unsigned i = 0; i < 4; 
i++) { ++ unsigned ReadReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( ++ MI.getOperand(1).getImm()); ++ ++ unsigned Sel = AMDGPU::sel_x; ++ switch (i % 4) { ++ case 0:Sel = AMDGPU::sel_x;break; ++ case 1:Sel = AMDGPU::sel_y;break; ++ case 2:Sel = AMDGPU::sel_z;break; ++ case 3:Sel = AMDGPU::sel_w;break; ++ default:break; ++ } ++ ++ unsigned Res = TRI.getSubReg(DstReg, Sel); ++ ++ MachineBasicBlock &MBB = *(MI.getParent()); ++ MachineInstr *NewMI = TII->buildDefaultInstruction( ++ MBB, I, AMDGPU::INTERP_LOAD_P0, Res, ReadReg); ++ ++ if (i % 4 != 3) ++ TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); ++ } ++ ++ MI.eraseFromParent(); ++ ++ return true; ++} ++ ++bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { ++ ++ const R600RegisterInfo &TRI = TII->getRegisterInfo(); ++ ++ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); ++ BB != BB_E; ++BB) { ++ MachineBasicBlock &MBB = *BB; ++ MachineBasicBlock::iterator I = MBB.begin(); ++ while (I != MBB.end()) { ++ MachineInstr &MI = *I; ++ I = llvm::next(I); ++ ++ switch (MI.getOpcode()) { ++ default: break; ++ // Expand PRED_X to one of the PRED_SET instructions. ++ case AMDGPU::PRED_X: { ++ uint64_t Flags = MI.getOperand(3).getImm(); ++ // The native opcode used by PRED_X is stored as an immediate in the ++ // third operand. 
++ MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, ++ MI.getOperand(2).getImm(), // opcode ++ MI.getOperand(0).getReg(), // dst ++ MI.getOperand(1).getReg(), // src0 ++ AMDGPU::ZERO); // src1 ++ TII->addFlag(PredSet, 0, MO_FLAG_MASK); ++ if (Flags & MO_FLAG_PUSH) { ++ TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1); ++ } else { ++ TII->setImmOperand(PredSet, R600Operands::UPDATE_PREDICATE, 1); ++ } ++ MI.eraseFromParent(); ++ continue; ++ } ++ case AMDGPU::BREAK: ++ MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, ++ AMDGPU::PRED_SETE_INT, ++ AMDGPU::PREDICATE_BIT, ++ AMDGPU::ZERO, ++ AMDGPU::ZERO); ++ TII->addFlag(PredSet, 0, MO_FLAG_MASK); ++ TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1); ++ ++ BuildMI(MBB, I, MBB.findDebugLoc(I), ++ TII->get(AMDGPU::PREDICATED_BREAK)) ++ .addReg(AMDGPU::PREDICATE_BIT); ++ MI.eraseFromParent(); ++ continue; ++ } ++ ++ if (ExpandInputPerspective(MI)) ++ continue; ++ if (ExpandInputConstant(MI)) ++ continue; ++ ++ bool IsReduction = TII->isReductionOp(MI.getOpcode()); ++ bool IsVector = TII->isVector(MI); ++ bool IsCube = TII->isCubeOp(MI.getOpcode()); ++ if (!IsReduction && !IsVector && !IsCube) { ++ continue; ++ } ++ ++ // Expand the instruction ++ // ++ // Reduction instructions: ++ // T0_X = DP4 T1_XYZW, T2_XYZW ++ // becomes: ++ // TO_X = DP4 T1_X, T2_X ++ // TO_Y (write masked) = DP4 T1_Y, T2_Y ++ // TO_Z (write masked) = DP4 T1_Z, T2_Z ++ // TO_W (write masked) = DP4 T1_W, T2_W ++ // ++ // Vector instructions: ++ // T0_X = MULLO_INT T1_X, T2_X ++ // becomes: ++ // T0_X = MULLO_INT T1_X, T2_X ++ // T0_Y (write masked) = MULLO_INT T1_X, T2_X ++ // T0_Z (write masked) = MULLO_INT T1_X, T2_X ++ // T0_W (write masked) = MULLO_INT T1_X, T2_X ++ // ++ // Cube instructions: ++ // T0_XYZW = CUBE T1_XYZW ++ // becomes: ++ // TO_X = CUBE T1_Z, T1_Y ++ // T0_Y = CUBE T1_Z, T1_X ++ // T0_Z = CUBE T1_X, T1_Z ++ // T0_W = CUBE T1_Y, T1_Z ++ for (unsigned Chan = 0; Chan < 4; Chan++) { 
++ unsigned DstReg = MI.getOperand( ++ TII->getOperandIdx(MI, R600Operands::DST)).getReg(); ++ unsigned Src0 = MI.getOperand( ++ TII->getOperandIdx(MI, R600Operands::SRC0)).getReg(); ++ unsigned Src1 = 0; ++ ++ // Determine the correct source registers ++ if (!IsCube) { ++ int Src1Idx = TII->getOperandIdx(MI, R600Operands::SRC1); ++ if (Src1Idx != -1) { ++ Src1 = MI.getOperand(Src1Idx).getReg(); ++ } ++ } ++ if (IsReduction) { ++ unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); ++ Src0 = TRI.getSubReg(Src0, SubRegIndex); ++ Src1 = TRI.getSubReg(Src1, SubRegIndex); ++ } else if (IsCube) { ++ static const int CubeSrcSwz[] = {2, 2, 0, 1}; ++ unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]); ++ unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]); ++ Src1 = TRI.getSubReg(Src0, SubRegIndex1); ++ Src0 = TRI.getSubReg(Src0, SubRegIndex0); ++ } ++ ++ // Determine the correct destination registers; ++ bool Mask = false; ++ bool NotLast = true; ++ if (IsCube) { ++ unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); ++ DstReg = TRI.getSubReg(DstReg, SubRegIndex); ++ } else { ++ // Mask the write if the original instruction does not write to ++ // the current Channel. 
++ Mask = (Chan != TRI.getHWRegChan(DstReg)); ++ unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; ++ DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); ++ } ++ ++ // Set the IsLast bit ++ NotLast = (Chan != 3 ); ++ ++ // Add the new instruction ++ unsigned Opcode = MI.getOpcode(); ++ switch (Opcode) { ++ case AMDGPU::CUBE_r600_pseudo: ++ Opcode = AMDGPU::CUBE_r600_real; ++ break; ++ case AMDGPU::CUBE_eg_pseudo: ++ Opcode = AMDGPU::CUBE_eg_real; ++ break; ++ case AMDGPU::DOT4_r600_pseudo: ++ Opcode = AMDGPU::DOT4_r600_real; ++ break; ++ case AMDGPU::DOT4_eg_pseudo: ++ Opcode = AMDGPU::DOT4_eg_real; ++ break; ++ default: ++ break; ++ } ++ ++ MachineInstr *NewMI = ++ TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1); ++ ++ NewMI->setIsInsideBundle(Chan != 0); ++ if (Mask) { ++ TII->addFlag(NewMI, 0, MO_FLAG_MASK); ++ } ++ if (NotLast) { ++ TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); ++ } ++ } ++ MI.eraseFromParent(); ++ } ++ } ++ return false; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600InstrInfo.cpp llvm-r600/lib/Target/R600/R600InstrInfo.cpp +--- llvm-3.2.src/lib/Target/R600/R600InstrInfo.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/R600InstrInfo.cpp 2013-01-25 19:43:57.466716387 +0100 +@@ -0,0 +1,655 @@ ++//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief R600 Implementation of TargetInstrInfo. 
++// ++//===----------------------------------------------------------------------===// ++ ++#include "R600InstrInfo.h" ++#include "AMDGPUTargetMachine.h" ++#include "AMDGPUSubtarget.h" ++#include "R600Defines.h" ++#include "R600RegisterInfo.h" ++#include "llvm/CodeGen/MachineInstrBuilder.h" ++ ++#define GET_INSTRINFO_CTOR ++#include "AMDGPUGenDFAPacketizer.inc" ++ ++using namespace llvm; ++ ++R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm) ++ : AMDGPUInstrInfo(tm), ++ RI(tm, *this) ++ { } ++ ++const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { ++ return RI; ++} ++ ++bool R600InstrInfo::isTrig(const MachineInstr &MI) const { ++ return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG; ++} ++ ++bool R600InstrInfo::isVector(const MachineInstr &MI) const { ++ return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; ++} ++ ++void ++R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MI, DebugLoc DL, ++ unsigned DestReg, unsigned SrcReg, ++ bool KillSrc) const { ++ if (AMDGPU::R600_Reg128RegClass.contains(DestReg) ++ && AMDGPU::R600_Reg128RegClass.contains(SrcReg)) { ++ for (unsigned I = 0; I < 4; I++) { ++ unsigned SubRegIndex = RI.getSubRegFromChannel(I); ++ buildDefaultInstruction(MBB, MI, AMDGPU::MOV, ++ RI.getSubReg(DestReg, SubRegIndex), ++ RI.getSubReg(SrcReg, SubRegIndex)) ++ .addReg(DestReg, ++ RegState::Define | RegState::Implicit); ++ } ++ } else { ++ ++ // We can't copy vec4 registers ++ assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg) ++ && !AMDGPU::R600_Reg128RegClass.contains(SrcReg)); ++ ++ MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV, ++ DestReg, SrcReg); ++ NewMI->getOperand(getOperandIdx(*NewMI, R600Operands::SRC0)) ++ .setIsKill(KillSrc); ++ } ++} ++ ++MachineInstr * R600InstrInfo::getMovImmInstr(MachineFunction *MF, ++ unsigned DstReg, int64_t Imm) const { ++ MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::MOV), DebugLoc()); ++ 
MachineInstrBuilder(MI).addReg(DstReg, RegState::Define); ++ MachineInstrBuilder(MI).addReg(AMDGPU::ALU_LITERAL_X); ++ MachineInstrBuilder(MI).addImm(Imm); ++ MachineInstrBuilder(MI).addReg(0); // PREDICATE_BIT ++ ++ return MI; ++} ++ ++unsigned R600InstrInfo::getIEQOpcode() const { ++ return AMDGPU::SETE_INT; ++} ++ ++bool R600InstrInfo::isMov(unsigned Opcode) const { ++ ++ ++ switch(Opcode) { ++ default: return false; ++ case AMDGPU::MOV: ++ case AMDGPU::MOV_IMM_F32: ++ case AMDGPU::MOV_IMM_I32: ++ return true; ++ } ++} ++ ++// Some instructions act as place holders to emulate operations that the GPU ++// hardware does automatically. This function can be used to check if ++// an opcode falls into this category. ++bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const { ++ switch (Opcode) { ++ default: return false; ++ case AMDGPU::RETURN: ++ case AMDGPU::RESERVE_REG: ++ return true; ++ } ++} ++ ++bool R600InstrInfo::isReductionOp(unsigned Opcode) const { ++ switch(Opcode) { ++ default: return false; ++ case AMDGPU::DOT4_r600_pseudo: ++ case AMDGPU::DOT4_eg_pseudo: ++ return true; ++ } ++} ++ ++bool R600InstrInfo::isCubeOp(unsigned Opcode) const { ++ switch(Opcode) { ++ default: return false; ++ case AMDGPU::CUBE_r600_pseudo: ++ case AMDGPU::CUBE_r600_real: ++ case AMDGPU::CUBE_eg_pseudo: ++ case AMDGPU::CUBE_eg_real: ++ return true; ++ } ++} ++ ++bool R600InstrInfo::isALUInstr(unsigned Opcode) const { ++ unsigned TargetFlags = get(Opcode).TSFlags; ++ ++ return ((TargetFlags & R600_InstFlag::OP1) | ++ (TargetFlags & R600_InstFlag::OP2) | ++ (TargetFlags & R600_InstFlag::OP3)); ++} ++ ++DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM, ++ const ScheduleDAG *DAG) const { ++ const InstrItineraryData *II = TM->getInstrItineraryData(); ++ return TM->getSubtarget().createDFAPacketizer(II); ++} ++ ++static bool ++isPredicateSetter(unsigned Opcode) { ++ switch (Opcode) { ++ case AMDGPU::PRED_X: ++ return true; ++ default: ++ return 
false; ++ } ++} ++ ++static MachineInstr * ++findFirstPredicateSetterFrom(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator I) { ++ while (I != MBB.begin()) { ++ --I; ++ MachineInstr *MI = I; ++ if (isPredicateSetter(MI->getOpcode())) ++ return MI; ++ } ++ ++ return NULL; ++} ++ ++bool ++R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, ++ MachineBasicBlock *&TBB, ++ MachineBasicBlock *&FBB, ++ SmallVectorImpl &Cond, ++ bool AllowModify) const { ++ // Most of the following comes from the ARM implementation of AnalyzeBranch ++ ++ // If the block has no terminators, it just falls into the block after it. ++ MachineBasicBlock::iterator I = MBB.end(); ++ if (I == MBB.begin()) ++ return false; ++ --I; ++ while (I->isDebugValue()) { ++ if (I == MBB.begin()) ++ return false; ++ --I; ++ } ++ if (static_cast(I)->getOpcode() != AMDGPU::JUMP) { ++ return false; ++ } ++ ++ // Get the last instruction in the block. ++ MachineInstr *LastInst = I; ++ ++ // If there is only one terminator instruction, process it. ++ unsigned LastOpc = LastInst->getOpcode(); ++ if (I == MBB.begin() || ++ static_cast(--I)->getOpcode() != AMDGPU::JUMP) { ++ if (LastOpc == AMDGPU::JUMP) { ++ if(!isPredicated(LastInst)) { ++ TBB = LastInst->getOperand(0).getMBB(); ++ return false; ++ } else { ++ MachineInstr *predSet = I; ++ while (!isPredicateSetter(predSet->getOpcode())) { ++ predSet = --I; ++ } ++ TBB = LastInst->getOperand(0).getMBB(); ++ Cond.push_back(predSet->getOperand(1)); ++ Cond.push_back(predSet->getOperand(2)); ++ Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); ++ return false; ++ } ++ } ++ return true; // Can't handle indirect branch. ++ } ++ ++ // Get the instruction before it if it is a terminator. ++ MachineInstr *SecondLastInst = I; ++ unsigned SecondLastOpc = SecondLastInst->getOpcode(); ++ ++ // If the block ends with a B and a Bcc, handle it. 
++ if (SecondLastOpc == AMDGPU::JUMP && ++ isPredicated(SecondLastInst) && ++ LastOpc == AMDGPU::JUMP && ++ !isPredicated(LastInst)) { ++ MachineInstr *predSet = --I; ++ while (!isPredicateSetter(predSet->getOpcode())) { ++ predSet = --I; ++ } ++ TBB = SecondLastInst->getOperand(0).getMBB(); ++ FBB = LastInst->getOperand(0).getMBB(); ++ Cond.push_back(predSet->getOperand(1)); ++ Cond.push_back(predSet->getOperand(2)); ++ Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); ++ return false; ++ } ++ ++ // Otherwise, can't handle this. ++ return true; ++} ++ ++int R600InstrInfo::getBranchInstr(const MachineOperand &op) const { ++ const MachineInstr *MI = op.getParent(); ++ ++ switch (MI->getDesc().OpInfo->RegClass) { ++ default: // FIXME: fallthrough?? ++ case AMDGPU::GPRI32RegClassID: return AMDGPU::BRANCH_COND_i32; ++ case AMDGPU::GPRF32RegClassID: return AMDGPU::BRANCH_COND_f32; ++ }; ++} ++ ++unsigned ++R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, ++ MachineBasicBlock *TBB, ++ MachineBasicBlock *FBB, ++ const SmallVectorImpl &Cond, ++ DebugLoc DL) const { ++ assert(TBB && "InsertBranch must not be told to insert a fallthrough"); ++ ++ if (FBB == 0) { ++ if (Cond.empty()) { ++ BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB).addReg(0); ++ return 1; ++ } else { ++ MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); ++ assert(PredSet && "No previous predicate !"); ++ addFlag(PredSet, 0, MO_FLAG_PUSH); ++ PredSet->getOperand(2).setImm(Cond[1].getImm()); ++ ++ BuildMI(&MBB, DL, get(AMDGPU::JUMP)) ++ .addMBB(TBB) ++ .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); ++ return 1; ++ } ++ } else { ++ MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); ++ assert(PredSet && "No previous predicate !"); ++ addFlag(PredSet, 0, MO_FLAG_PUSH); ++ PredSet->getOperand(2).setImm(Cond[1].getImm()); ++ BuildMI(&MBB, DL, get(AMDGPU::JUMP)) ++ .addMBB(TBB) ++ .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); ++ BuildMI(&MBB, DL, 
get(AMDGPU::JUMP)).addMBB(FBB).addReg(0); ++ return 2; ++ } ++} ++ ++unsigned ++R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { ++ ++ // Note : we leave PRED* instructions there. ++ // They may be needed when predicating instructions. ++ ++ MachineBasicBlock::iterator I = MBB.end(); ++ ++ if (I == MBB.begin()) { ++ return 0; ++ } ++ --I; ++ switch (I->getOpcode()) { ++ default: ++ return 0; ++ case AMDGPU::JUMP: ++ if (isPredicated(I)) { ++ MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); ++ clearFlag(predSet, 0, MO_FLAG_PUSH); ++ } ++ I->eraseFromParent(); ++ break; ++ } ++ I = MBB.end(); ++ ++ if (I == MBB.begin()) { ++ return 1; ++ } ++ --I; ++ switch (I->getOpcode()) { ++ // FIXME: only one case?? ++ default: ++ return 1; ++ case AMDGPU::JUMP: ++ if (isPredicated(I)) { ++ MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); ++ clearFlag(predSet, 0, MO_FLAG_PUSH); ++ } ++ I->eraseFromParent(); ++ break; ++ } ++ return 2; ++} ++ ++bool ++R600InstrInfo::isPredicated(const MachineInstr *MI) const { ++ int idx = MI->findFirstPredOperandIdx(); ++ if (idx < 0) ++ return false; ++ ++ unsigned Reg = MI->getOperand(idx).getReg(); ++ switch (Reg) { ++ default: return false; ++ case AMDGPU::PRED_SEL_ONE: ++ case AMDGPU::PRED_SEL_ZERO: ++ case AMDGPU::PREDICATE_BIT: ++ return true; ++ } ++} ++ ++bool ++R600InstrInfo::isPredicable(MachineInstr *MI) const { ++ // XXX: KILL* instructions can be predicated, but they must be the last ++ // instruction in a clause, so this means any instructions after them cannot ++ // be predicated. Until we have proper support for instruction clauses in the ++ // backend, we will mark KILL* instructions as unpredicable. 
++ ++ if (MI->getOpcode() == AMDGPU::KILLGT) { ++ return false; ++ } else { ++ return AMDGPUInstrInfo::isPredicable(MI); ++ } ++} ++ ++ ++bool ++R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, ++ unsigned NumCyles, ++ unsigned ExtraPredCycles, ++ const BranchProbability &Probability) const{ ++ return true; ++} ++ ++bool ++R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, ++ unsigned NumTCycles, ++ unsigned ExtraTCycles, ++ MachineBasicBlock &FMBB, ++ unsigned NumFCycles, ++ unsigned ExtraFCycles, ++ const BranchProbability &Probability) const { ++ return true; ++} ++ ++bool ++R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, ++ unsigned NumCyles, ++ const BranchProbability &Probability) ++ const { ++ return true; ++} ++ ++bool ++R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, ++ MachineBasicBlock &FMBB) const { ++ return false; ++} ++ ++ ++bool ++R600InstrInfo::ReverseBranchCondition(SmallVectorImpl &Cond) const { ++ MachineOperand &MO = Cond[1]; ++ switch (MO.getImm()) { ++ case OPCODE_IS_ZERO_INT: ++ MO.setImm(OPCODE_IS_NOT_ZERO_INT); ++ break; ++ case OPCODE_IS_NOT_ZERO_INT: ++ MO.setImm(OPCODE_IS_ZERO_INT); ++ break; ++ case OPCODE_IS_ZERO: ++ MO.setImm(OPCODE_IS_NOT_ZERO); ++ break; ++ case OPCODE_IS_NOT_ZERO: ++ MO.setImm(OPCODE_IS_ZERO); ++ break; ++ default: ++ return true; ++ } ++ ++ MachineOperand &MO2 = Cond[2]; ++ switch (MO2.getReg()) { ++ case AMDGPU::PRED_SEL_ZERO: ++ MO2.setReg(AMDGPU::PRED_SEL_ONE); ++ break; ++ case AMDGPU::PRED_SEL_ONE: ++ MO2.setReg(AMDGPU::PRED_SEL_ZERO); ++ break; ++ default: ++ return true; ++ } ++ return false; ++} ++ ++bool ++R600InstrInfo::DefinesPredicate(MachineInstr *MI, ++ std::vector &Pred) const { ++ return isPredicateSetter(MI->getOpcode()); ++} ++ ++ ++bool ++R600InstrInfo::SubsumesPredicate(const SmallVectorImpl &Pred1, ++ const SmallVectorImpl &Pred2) const { ++ return false; ++} ++ ++ ++bool ++R600InstrInfo::PredicateInstruction(MachineInstr *MI, ++ const 
SmallVectorImpl &Pred) const { ++ int PIdx = MI->findFirstPredOperandIdx(); ++ ++ if (PIdx != -1) { ++ MachineOperand &PMO = MI->getOperand(PIdx); ++ PMO.setReg(Pred[2].getReg()); ++ MachineInstrBuilder(MI).addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); ++ return true; ++ } ++ ++ return false; ++} ++ ++unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, ++ const MachineInstr *MI, ++ unsigned *PredCost) const { ++ if (PredCost) ++ *PredCost = 2; ++ return 2; ++} ++ ++MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator I, ++ unsigned Opcode, ++ unsigned DstReg, ++ unsigned Src0Reg, ++ unsigned Src1Reg) const { ++ MachineInstrBuilder MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode), ++ DstReg); // $dst ++ ++ if (Src1Reg) { ++ MIB.addImm(0) // $update_exec_mask ++ .addImm(0); // $update_predicate ++ } ++ MIB.addImm(1) // $write ++ .addImm(0) // $omod ++ .addImm(0) // $dst_rel ++ .addImm(0) // $dst_clamp ++ .addReg(Src0Reg) // $src0 ++ .addImm(0) // $src0_neg ++ .addImm(0) // $src0_rel ++ .addImm(0) // $src0_abs ++ .addImm(-1); // $src0_sel ++ ++ if (Src1Reg) { ++ MIB.addReg(Src1Reg) // $src1 ++ .addImm(0) // $src1_neg ++ .addImm(0) // $src1_rel ++ .addImm(0) // $src1_abs ++ .addImm(-1); // $src1_sel ++ } ++ ++ //XXX: The r600g finalizer expects this to be 1, once we've moved the ++ //scheduling to the backend, we can change the default to 0. 
++ MIB.addImm(1) // $last ++ .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel ++ .addImm(0); // $literal ++ ++ return MIB; ++} ++ ++MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB, ++ MachineBasicBlock::iterator I, ++ unsigned DstReg, ++ uint64_t Imm) const { ++ MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg, ++ AMDGPU::ALU_LITERAL_X); ++ setImmOperand(MovImm, R600Operands::IMM, Imm); ++ return MovImm; ++} ++ ++int R600InstrInfo::getOperandIdx(const MachineInstr &MI, ++ R600Operands::Ops Op) const { ++ return getOperandIdx(MI.getOpcode(), Op); ++} ++ ++int R600InstrInfo::getOperandIdx(unsigned Opcode, ++ R600Operands::Ops Op) const { ++ unsigned TargetFlags = get(Opcode).TSFlags; ++ unsigned OpTableIdx; ++ ++ if (!HAS_NATIVE_OPERANDS(TargetFlags)) { ++ switch (Op) { ++ case R600Operands::DST: return 0; ++ case R600Operands::SRC0: return 1; ++ case R600Operands::SRC1: return 2; ++ case R600Operands::SRC2: return 3; ++ default: ++ assert(!"Unknown operand type for instruction"); ++ return -1; ++ } ++ } ++ ++ if (TargetFlags & R600_InstFlag::OP1) { ++ OpTableIdx = 0; ++ } else if (TargetFlags & R600_InstFlag::OP2) { ++ OpTableIdx = 1; ++ } else { ++ assert((TargetFlags & R600_InstFlag::OP3) && "OP1, OP2, or OP3 not defined " ++ "for this instruction"); ++ OpTableIdx = 2; ++ } ++ ++ return R600Operands::ALUOpTable[OpTableIdx][Op]; ++} ++ ++void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op, ++ int64_t Imm) const { ++ int Idx = getOperandIdx(*MI, Op); ++ assert(Idx != -1 && "Operand not supported for this instruction."); ++ assert(MI->getOperand(Idx).isImm()); ++ MI->getOperand(Idx).setImm(Imm); ++} ++ ++//===----------------------------------------------------------------------===// ++// Instruction flag getters/setters ++//===----------------------------------------------------------------------===// ++ ++bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const { ++ return 
GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0; ++} ++ ++MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx, ++ unsigned Flag) const { ++ unsigned TargetFlags = get(MI->getOpcode()).TSFlags; ++ int FlagIndex = 0; ++ if (Flag != 0) { ++ // If we pass something other than the default value of Flag to this ++ // function, it means we are want to set a flag on an instruction ++ // that uses native encoding. ++ assert(HAS_NATIVE_OPERANDS(TargetFlags)); ++ bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3; ++ switch (Flag) { ++ case MO_FLAG_CLAMP: ++ FlagIndex = getOperandIdx(*MI, R600Operands::CLAMP); ++ break; ++ case MO_FLAG_MASK: ++ FlagIndex = getOperandIdx(*MI, R600Operands::WRITE); ++ break; ++ case MO_FLAG_NOT_LAST: ++ case MO_FLAG_LAST: ++ FlagIndex = getOperandIdx(*MI, R600Operands::LAST); ++ break; ++ case MO_FLAG_NEG: ++ switch (SrcIdx) { ++ case 0: FlagIndex = getOperandIdx(*MI, R600Operands::SRC0_NEG); break; ++ case 1: FlagIndex = getOperandIdx(*MI, R600Operands::SRC1_NEG); break; ++ case 2: FlagIndex = getOperandIdx(*MI, R600Operands::SRC2_NEG); break; ++ } ++ break; ++ ++ case MO_FLAG_ABS: ++ assert(!IsOP3 && "Cannot set absolute value modifier for OP3 " ++ "instructions."); ++ switch (SrcIdx) { ++ case 0: FlagIndex = getOperandIdx(*MI, R600Operands::SRC0_ABS); break; ++ case 1: FlagIndex = getOperandIdx(*MI, R600Operands::SRC1_ABS); break; ++ } ++ break; ++ ++ default: ++ FlagIndex = -1; ++ break; ++ } ++ assert(FlagIndex != -1 && "Flag not supported for this instruction"); ++ } else { ++ FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags); ++ assert(FlagIndex != 0 && ++ "Instruction flags not supported for this instruction"); ++ } ++ ++ MachineOperand &FlagOp = MI->getOperand(FlagIndex); ++ assert(FlagOp.isImm()); ++ return FlagOp; ++} ++ ++void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand, ++ unsigned Flag) const { ++ unsigned TargetFlags = get(MI->getOpcode()).TSFlags; ++ if (Flag == 0) { ++ 
return; ++ } ++ if (HAS_NATIVE_OPERANDS(TargetFlags)) { ++ MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); ++ if (Flag == MO_FLAG_NOT_LAST) { ++ clearFlag(MI, Operand, MO_FLAG_LAST); ++ } else if (Flag == MO_FLAG_MASK) { ++ clearFlag(MI, Operand, Flag); ++ } else { ++ FlagOp.setImm(1); ++ } ++ } else { ++ MachineOperand &FlagOp = getFlagOp(MI, Operand); ++ FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand))); ++ } ++} ++ ++void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand, ++ unsigned Flag) const { ++ unsigned TargetFlags = get(MI->getOpcode()).TSFlags; ++ if (HAS_NATIVE_OPERANDS(TargetFlags)) { ++ MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); ++ FlagOp.setImm(0); ++ } else { ++ MachineOperand &FlagOp = getFlagOp(MI); ++ unsigned InstFlags = FlagOp.getImm(); ++ InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand)); ++ FlagOp.setImm(InstFlags); ++ } ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600InstrInfo.h llvm-r600/lib/Target/R600/R600InstrInfo.h +--- llvm-3.2.src/lib/Target/R600/R600InstrInfo.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/R600InstrInfo.h 2013-01-25 19:43:57.466716387 +0100 +@@ -0,0 +1,169 @@ ++//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Interface definition for R600InstrInfo ++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef R600INSTRUCTIONINFO_H_ ++#define R600INSTRUCTIONINFO_H_ ++ ++#include "AMDIL.h" ++#include "AMDGPUInstrInfo.h" ++#include "R600Defines.h" ++#include "R600RegisterInfo.h" ++ ++#include ++ ++namespace llvm { ++ ++ class AMDGPUTargetMachine; ++ class DFAPacketizer; ++ class ScheduleDAG; ++ class MachineFunction; ++ class MachineInstr; ++ class MachineInstrBuilder; ++ ++ class R600InstrInfo : public AMDGPUInstrInfo { ++ private: ++ const R600RegisterInfo RI; ++ ++ int getBranchInstr(const MachineOperand &op) const; ++ ++ public: ++ explicit R600InstrInfo(AMDGPUTargetMachine &tm); ++ ++ const R600RegisterInfo &getRegisterInfo() const; ++ virtual void copyPhysReg(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MI, DebugLoc DL, ++ unsigned DestReg, unsigned SrcReg, ++ bool KillSrc) const; ++ ++ bool isTrig(const MachineInstr &MI) const; ++ bool isPlaceHolderOpcode(unsigned opcode) const; ++ bool isReductionOp(unsigned opcode) const; ++ bool isCubeOp(unsigned opcode) const; ++ ++ /// \returns true if this \p Opcode represents an ALU instruction. ++ bool isALUInstr(unsigned Opcode) const; ++ ++ /// \breif Vector instructions are instructions that must fill all ++ /// instruction slots within an instruction group. 
++ bool isVector(const MachineInstr &MI) const; ++ ++ virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg, ++ int64_t Imm) const; ++ ++ virtual unsigned getIEQOpcode() const; ++ virtual bool isMov(unsigned Opcode) const; ++ ++ DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM, ++ const ScheduleDAG *DAG) const; ++ ++ bool ReverseBranchCondition(SmallVectorImpl &Cond) const; ++ ++ bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, ++ SmallVectorImpl &Cond, bool AllowModify) const; ++ ++ unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl &Cond, DebugLoc DL) const; ++ ++ unsigned RemoveBranch(MachineBasicBlock &MBB) const; ++ ++ bool isPredicated(const MachineInstr *MI) const; ++ ++ bool isPredicable(MachineInstr *MI) const; ++ ++ bool ++ isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, ++ const BranchProbability &Probability) const; ++ ++ bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, ++ unsigned ExtraPredCycles, ++ const BranchProbability &Probability) const ; ++ ++ bool ++ isProfitableToIfCvt(MachineBasicBlock &TMBB, ++ unsigned NumTCycles, unsigned ExtraTCycles, ++ MachineBasicBlock &FMBB, ++ unsigned NumFCycles, unsigned ExtraFCycles, ++ const BranchProbability &Probability) const; ++ ++ bool DefinesPredicate(MachineInstr *MI, ++ std::vector &Pred) const; ++ ++ bool SubsumesPredicate(const SmallVectorImpl &Pred1, ++ const SmallVectorImpl &Pred2) const; ++ ++ bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, ++ MachineBasicBlock &FMBB) const; ++ ++ bool PredicateInstruction(MachineInstr *MI, ++ const SmallVectorImpl &Pred) const; ++ ++ unsigned int getInstrLatency(const InstrItineraryData *ItinData, ++ const MachineInstr *MI, ++ unsigned *PredCost = 0) const; ++ ++ virtual int getInstrLatency(const InstrItineraryData *ItinData, ++ SDNode *Node) const { return 1;} ++ ++ /// You 
can use this function to avoid manually specifying each instruction ++ /// modifier operand when building a new instruction. ++ /// ++ /// \returns a MachineInstr with all the instruction modifiers initialized ++ /// to their default values. ++ MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator I, ++ unsigned Opcode, ++ unsigned DstReg, ++ unsigned Src0Reg, ++ unsigned Src1Reg = 0) const; ++ ++ MachineInstr *buildMovImm(MachineBasicBlock &BB, ++ MachineBasicBlock::iterator I, ++ unsigned DstReg, ++ uint64_t Imm) const; ++ ++ /// \brief Get the index of Op in the MachineInstr. ++ /// ++ /// \returns -1 if the Instruction does not contain the specified \p Op. ++ int getOperandIdx(const MachineInstr &MI, R600Operands::Ops Op) const; ++ ++ /// \brief Get the index of \p Op for the given Opcode. ++ /// ++ /// \returns -1 if the Instruction does not contain the specified \p Op. ++ int getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const; ++ ++ /// \brief Helper function for setting instruction flag values. ++ void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) const; ++ ++ /// \returns true if this instruction has an operand for storing target flags. ++ bool hasFlagOperand(const MachineInstr &MI) const; ++ ++ ///\brief Add one of the MO_FLAG* flags to the specified \p Operand. ++ void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; ++ ++ ///\brief Determine if the specified \p Flag is set on this \p Operand. ++ bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const; ++ ++ /// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2) ++ /// \param Flag The flag being set. ++ /// ++ /// \returns the operand containing the flags for this instruction. ++ MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0, ++ unsigned Flag = 0) const; ++ ++ /// \brief Clear the specified flag on the instruction. 
++ void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; ++}; ++ ++} // End llvm namespace ++ ++#endif // R600INSTRINFO_H_ +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600Instructions.td llvm-r600/lib/Target/R600/R600Instructions.td +--- llvm-3.2.src/lib/Target/R600/R600Instructions.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/R600Instructions.td 2013-01-25 19:43:57.466716387 +0100 +@@ -0,0 +1,1843 @@ ++//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// R600 Tablegen instruction definitions ++// ++//===----------------------------------------------------------------------===// ++ ++include "R600Intrinsics.td" ++ ++class InstR600 inst, dag outs, dag ins, string asm, list pattern, ++ InstrItinClass itin> ++ : AMDGPUInst { ++ ++ field bits<64> Inst; ++ bit Trig = 0; ++ bit Op3 = 0; ++ bit isVector = 0; ++ bits<2> FlagOperandIdx = 0; ++ bit Op1 = 0; ++ bit Op2 = 0; ++ bit HasNativeOperands = 0; ++ ++ bits<11> op_code = inst; ++ //let Inst = inst; ++ let Namespace = "AMDGPU"; ++ let OutOperandList = outs; ++ let InOperandList = ins; ++ let AsmString = asm; ++ let Pattern = pattern; ++ let Itinerary = itin; ++ ++ let TSFlags{4} = Trig; ++ let TSFlags{5} = Op3; ++ ++ // Vector instructions are instructions that must fill all slots in an ++ // instruction group ++ let TSFlags{6} = isVector; ++ let TSFlags{8-7} = FlagOperandIdx; ++ let TSFlags{9} = HasNativeOperands; ++ let TSFlags{10} = Op1; ++ let TSFlags{11} = Op2; ++} ++ ++class InstR600ISA pattern> : ++ AMDGPUInst { ++ field bits<64> Inst; ++ ++ let Namespace = "AMDGPU"; ++} ++ ++def MEMxi : Operand { ++ let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index); ++ let PrintMethod = 
"printMemOperand"; ++} ++ ++def MEMrr : Operand { ++ let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index); ++} ++ ++// Operands for non-registers ++ ++class InstFlag ++ : OperandWithDefaultOps { ++ let PrintMethod = PM; ++} ++ ++// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers ++def SEL : OperandWithDefaultOps { ++ let PrintMethod = "printSel"; ++} ++ ++def LITERAL : InstFlag<"printLiteral">; ++ ++def WRITE : InstFlag <"printWrite", 1>; ++def OMOD : InstFlag <"printOMOD">; ++def REL : InstFlag <"printRel">; ++def CLAMP : InstFlag <"printClamp">; ++def NEG : InstFlag <"printNeg">; ++def ABS : InstFlag <"printAbs">; ++def UEM : InstFlag <"printUpdateExecMask">; ++def UP : InstFlag <"printUpdatePred">; ++ ++// XXX: The r600g finalizer in Mesa expects last to be one in most cases. ++// Once we start using the packetizer in this backend we should have this ++// default to 0. ++def LAST : InstFlag<"printLast", 1>; ++ ++def ADDRParam : ComplexPattern; ++def ADDRDWord : ComplexPattern; ++def ADDRVTX_READ : ComplexPattern; ++def ADDRGA_CONST_OFFSET : ComplexPattern; ++def ADDRGA_VAR_OFFSET : ComplexPattern; ++ ++class R600ALU_Word0 { ++ field bits<32> Word0; ++ ++ bits<11> src0; ++ bits<1> src0_neg; ++ bits<1> src0_rel; ++ bits<11> src1; ++ bits<1> src1_rel; ++ bits<1> src1_neg; ++ bits<3> index_mode = 0; ++ bits<2> pred_sel; ++ bits<1> last; ++ ++ bits<9> src0_sel = src0{8-0}; ++ bits<2> src0_chan = src0{10-9}; ++ bits<9> src1_sel = src1{8-0}; ++ bits<2> src1_chan = src1{10-9}; ++ ++ let Word0{8-0} = src0_sel; ++ let Word0{9} = src0_rel; ++ let Word0{11-10} = src0_chan; ++ let Word0{12} = src0_neg; ++ let Word0{21-13} = src1_sel; ++ let Word0{22} = src1_rel; ++ let Word0{24-23} = src1_chan; ++ let Word0{25} = src1_neg; ++ let Word0{28-26} = index_mode; ++ let Word0{30-29} = pred_sel; ++ let Word0{31} = last; ++} ++ ++class R600ALU_Word1 { ++ field bits<32> Word1; ++ ++ bits<11> dst; ++ bits<3> bank_swizzle = 0; ++ bits<1> dst_rel; ++ bits<1> 
clamp; ++ ++ bits<7> dst_sel = dst{6-0}; ++ bits<2> dst_chan = dst{10-9}; ++ ++ let Word1{20-18} = bank_swizzle; ++ let Word1{27-21} = dst_sel; ++ let Word1{28} = dst_rel; ++ let Word1{30-29} = dst_chan; ++ let Word1{31} = clamp; ++} ++ ++class R600ALU_Word1_OP2 alu_inst> : R600ALU_Word1{ ++ ++ bits<1> src0_abs; ++ bits<1> src1_abs; ++ bits<1> update_exec_mask; ++ bits<1> update_pred; ++ bits<1> write; ++ bits<2> omod; ++ ++ let Word1{0} = src0_abs; ++ let Word1{1} = src1_abs; ++ let Word1{2} = update_exec_mask; ++ let Word1{3} = update_pred; ++ let Word1{4} = write; ++ let Word1{6-5} = omod; ++ let Word1{17-7} = alu_inst; ++} ++ ++class R600ALU_Word1_OP3 alu_inst> : R600ALU_Word1{ ++ ++ bits<11> src2; ++ bits<1> src2_rel; ++ bits<1> src2_neg; ++ ++ bits<9> src2_sel = src2{8-0}; ++ bits<2> src2_chan = src2{10-9}; ++ ++ let Word1{8-0} = src2_sel; ++ let Word1{9} = src2_rel; ++ let Word1{11-10} = src2_chan; ++ let Word1{12} = src2_neg; ++ let Word1{17-13} = alu_inst; ++} ++ ++class VTX_WORD0 { ++ field bits<32> Word0; ++ bits<7> SRC_GPR; ++ bits<5> VC_INST; ++ bits<2> FETCH_TYPE; ++ bits<1> FETCH_WHOLE_QUAD; ++ bits<8> BUFFER_ID; ++ bits<1> SRC_REL; ++ bits<2> SRC_SEL_X; ++ bits<6> MEGA_FETCH_COUNT; ++ ++ let Word0{4-0} = VC_INST; ++ let Word0{6-5} = FETCH_TYPE; ++ let Word0{7} = FETCH_WHOLE_QUAD; ++ let Word0{15-8} = BUFFER_ID; ++ let Word0{22-16} = SRC_GPR; ++ let Word0{23} = SRC_REL; ++ let Word0{25-24} = SRC_SEL_X; ++ let Word0{31-26} = MEGA_FETCH_COUNT; ++} ++ ++class VTX_WORD1_GPR { ++ field bits<32> Word1; ++ bits<7> DST_GPR; ++ bits<1> DST_REL; ++ bits<3> DST_SEL_X; ++ bits<3> DST_SEL_Y; ++ bits<3> DST_SEL_Z; ++ bits<3> DST_SEL_W; ++ bits<1> USE_CONST_FIELDS; ++ bits<6> DATA_FORMAT; ++ bits<2> NUM_FORMAT_ALL; ++ bits<1> FORMAT_COMP_ALL; ++ bits<1> SRF_MODE_ALL; ++ ++ let Word1{6-0} = DST_GPR; ++ let Word1{7} = DST_REL; ++ let Word1{8} = 0; // Reserved ++ let Word1{11-9} = DST_SEL_X; ++ let Word1{14-12} = DST_SEL_Y; ++ let Word1{17-15} = DST_SEL_Z; ++ let 
Word1{20-18} = DST_SEL_W; ++ let Word1{21} = USE_CONST_FIELDS; ++ let Word1{27-22} = DATA_FORMAT; ++ let Word1{29-28} = NUM_FORMAT_ALL; ++ let Word1{30} = FORMAT_COMP_ALL; ++ let Word1{31} = SRF_MODE_ALL; ++} ++ ++/* ++XXX: R600 subtarget uses a slightly different encoding than the other ++subtargets. We currently handle this in R600MCCodeEmitter, but we may ++want to use these instruction classes in the future. ++ ++class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 { ++ ++ bits<1> fog_merge; ++ bits<10> alu_inst; ++ ++ let Inst{37} = fog_merge; ++ let Inst{39-38} = omod; ++ let Inst{49-40} = alu_inst; ++} ++ ++class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 { ++ ++ bits<11> alu_inst; ++ ++ let Inst{38-37} = omod; ++ let Inst{49-39} = alu_inst; ++} ++*/ ++ ++def R600_Pred : PredicateOperand; ++ ++ ++let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { ++ ++// Class for instructions with only one source register. ++// If you add new ins to this instruction, make sure they are listed before ++// $literal, because the backend currently assumes that the last operand is ++// a literal. Also be sure to update the enum R600Op1OperandIndex::ROI in ++// R600Defines.h, R600InstrInfo::buildDefaultInstruction(), ++// and R600InstrInfo::getOperandIdx(). 
++class R600_1OP inst, string opName, list pattern, ++ InstrItinClass itin = AnyALU> : ++ InstR600 <0, ++ (outs R600_Reg32:$dst), ++ (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, ++ R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, ++ LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), ++ !strconcat(opName, ++ "$clamp $dst$write$dst_rel$omod, " ++ "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, " ++ "$literal $pred_sel$last"), ++ pattern, ++ itin>, ++ R600ALU_Word0, ++ R600ALU_Word1_OP2 { ++ ++ let src1 = 0; ++ let src1_rel = 0; ++ let src1_neg = 0; ++ let src1_abs = 0; ++ let update_exec_mask = 0; ++ let update_pred = 0; ++ let HasNativeOperands = 1; ++ let Op1 = 1; ++ let DisableEncoding = "$literal"; ++ ++ let Inst{31-0} = Word0; ++ let Inst{63-32} = Word1; ++} ++ ++class R600_1OP_Helper inst, string opName, SDPatternOperator node, ++ InstrItinClass itin = AnyALU> : ++ R600_1OP ; ++ ++// If you add or change the operands for R600_2OP instructions, you must ++// also update the R600Op2OperandIndex::ROI enum in R600Defines.h, ++// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx(). ++class R600_2OP inst, string opName, list pattern, ++ InstrItinClass itin = AnyALU> : ++ InstR600 , ++ R600ALU_Word0, ++ R600ALU_Word1_OP2 { ++ ++ let HasNativeOperands = 1; ++ let Op2 = 1; ++ let DisableEncoding = "$literal"; ++ ++ let Inst{31-0} = Word0; ++ let Inst{63-32} = Word1; ++} ++ ++class R600_2OP_Helper inst, string opName, SDPatternOperator node, ++ InstrItinClass itim = AnyALU> : ++ R600_2OP ; ++ ++// If you add or change the operands for R600_3OP instructions, you must ++// also update the R600Op3OperandIndex::ROI enum in R600Defines.h, ++// R600InstrInfo::buildDefaultInstruction(), and ++// R600InstrInfo::getOperandIdx(). 
++class R600_3OP inst, string opName, list pattern, ++ InstrItinClass itin = AnyALU> : ++ InstR600 <0, ++ (outs R600_Reg32:$dst), ++ (ins REL:$dst_rel, CLAMP:$clamp, ++ R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel, ++ R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel, ++ R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel, ++ LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), ++ !strconcat(opName, "$clamp $dst$dst_rel, " ++ "$src0_neg$src0$src0_sel$src0_rel, " ++ "$src1_neg$src1$src1_sel$src1_rel, " ++ "$src2_neg$src2$src2_sel$src2_rel, " ++ "$literal $pred_sel$last"), ++ pattern, ++ itin>, ++ R600ALU_Word0, ++ R600ALU_Word1_OP3{ ++ ++ let HasNativeOperands = 1; ++ let DisableEncoding = "$literal"; ++ let Op3 = 1; ++ ++ let Inst{31-0} = Word0; ++ let Inst{63-32} = Word1; ++} ++ ++class R600_REDUCTION inst, dag ins, string asm, list pattern, ++ InstrItinClass itin = VecALU> : ++ InstR600 ; ++ ++class R600_TEX inst, string opName, list pattern, ++ InstrItinClass itin = AnyALU> : ++ InstR600 { ++ let Inst {10-0} = inst; ++ } ++ ++} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 ++ ++def TEX_SHADOW : PatLeaf< ++ (imm), ++ [{uint32_t TType = (uint32_t)N->getZExtValue(); ++ return (TType >= 6 && TType <= 8) || (TType >= 11 && TType <= 13); ++ }] ++>; ++ ++def TEX_RECT : PatLeaf< ++ (imm), ++ [{uint32_t TType = (uint32_t)N->getZExtValue(); ++ return TType == 5; ++ }] ++>; ++ ++class EG_CF_RAT cf_inst, bits <6> rat_inst, bits<4> rat_id, dag outs, ++ dag ins, string asm, list pattern> : ++ InstR600ISA { ++ bits<7> RW_GPR; ++ bits<7> INDEX_GPR; ++ ++ bits<2> RIM; ++ bits<2> TYPE; ++ bits<1> RW_REL; ++ bits<2> ELEM_SIZE; ++ ++ bits<12> ARRAY_SIZE; ++ bits<4> COMP_MASK; ++ bits<4> BURST_COUNT; ++ bits<1> VPM; ++ bits<1> eop; ++ bits<1> MARK; ++ bits<1> BARRIER; ++ ++ // CF_ALLOC_EXPORT_WORD0_RAT ++ let Inst{3-0} = rat_id; ++ let Inst{9-4} = rat_inst; ++ let Inst{10} = 0; // Reserved ++ let Inst{12-11} = RIM; ++ let Inst{14-13} = 
TYPE; ++ let Inst{21-15} = RW_GPR; ++ let Inst{22} = RW_REL; ++ let Inst{29-23} = INDEX_GPR; ++ let Inst{31-30} = ELEM_SIZE; ++ ++ // CF_ALLOC_EXPORT_WORD1_BUF ++ let Inst{43-32} = ARRAY_SIZE; ++ let Inst{47-44} = COMP_MASK; ++ let Inst{51-48} = BURST_COUNT; ++ let Inst{52} = VPM; ++ let Inst{53} = eop; ++ let Inst{61-54} = cf_inst; ++ let Inst{62} = MARK; ++ let Inst{63} = BARRIER; ++} ++ ++class LoadParamFrag : PatFrag < ++ (ops node:$ptr), (load_type node:$ptr), ++ [{ return isParamLoad(dyn_cast(N)); }] ++>; ++ ++def load_param : LoadParamFrag; ++def load_param_zexti8 : LoadParamFrag; ++def load_param_zexti16 : LoadParamFrag; ++ ++def isR600 : Predicate<"Subtarget.device()" ++ "->getGeneration() == AMDGPUDeviceInfo::HD4XXX">; ++def isR700 : Predicate<"Subtarget.device()" ++ "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&" ++ "Subtarget.device()->getDeviceFlag()" ++ ">= OCL_DEVICE_RV710">; ++def isEG : Predicate< ++ "Subtarget.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX && " ++ "Subtarget.device()->getGeneration() < AMDGPUDeviceInfo::HD7XXX && " ++ "Subtarget.device()->getDeviceFlag() != OCL_DEVICE_CAYMAN">; ++ ++def isCayman : Predicate<"Subtarget.device()" ++ "->getDeviceFlag() == OCL_DEVICE_CAYMAN">; ++def isEGorCayman : Predicate<"Subtarget.device()" ++ "->getGeneration() == AMDGPUDeviceInfo::HD5XXX" ++ "|| Subtarget.device()->getGeneration() ==" ++ "AMDGPUDeviceInfo::HD6XXX">; ++ ++def isR600toCayman : Predicate< ++ "Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX">; ++ ++//===----------------------------------------------------------------------===// ++// R600 SDNodes ++//===----------------------------------------------------------------------===// ++ ++def INTERP: SDNode<"AMDGPUISD::INTERP", ++ SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisInt<1>, SDTCisInt<2>]> ++ >; ++ ++def INTERP_P0: SDNode<"AMDGPUISD::INTERP_P0", ++ SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisInt<1>]> ++ >; ++ ++def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", 
++ SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, ++ [SDNPMayLoad] ++>; ++ ++//===----------------------------------------------------------------------===// ++// Interpolation Instructions ++//===----------------------------------------------------------------------===// ++ ++let usesCustomInserter = 1 in { ++def input_perspective : AMDGPUShaderInst < ++ (outs R600_Reg128:$dst), ++ (ins i32imm:$src0, i32imm:$src1), ++ "input_perspective $src0 $src1 : dst", ++ [(set R600_Reg128:$dst, (INTERP (i32 imm:$src0), (i32 imm:$src1)))]>; ++} // End usesCustomInserter = 1 ++ ++def input_constant : AMDGPUShaderInst < ++ (outs R600_Reg128:$dst), ++ (ins i32imm:$src), ++ "input_constant $src : dst", ++ [(set R600_Reg128:$dst, (INTERP_P0 (i32 imm:$src)))]>; ++ ++ ++ ++def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> { ++ let bank_swizzle = 5; ++} ++ ++def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> { ++ let bank_swizzle = 5; ++} ++ ++def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>; ++ ++//===----------------------------------------------------------------------===// ++// Export Instructions ++//===----------------------------------------------------------------------===// ++ ++def ExportType : SDTypeProfile<0, 5, [SDTCisFP<0>, SDTCisInt<1>]>; ++ ++def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType, ++ [SDNPHasChain, SDNPSideEffect]>; ++ ++class ExportWord0 { ++ field bits<32> Word0; ++ ++ bits<13> arraybase; ++ bits<2> type; ++ bits<7> gpr; ++ bits<2> elem_size; ++ ++ let Word0{12-0} = arraybase; ++ let Word0{14-13} = type; ++ let Word0{21-15} = gpr; ++ let Word0{22} = 0; // RW_REL ++ let Word0{29-23} = 0; // INDEX_GPR ++ let Word0{31-30} = elem_size; ++} ++ ++class ExportSwzWord1 { ++ field bits<32> Word1; ++ ++ bits<3> sw_x; ++ bits<3> sw_y; ++ bits<3> sw_z; ++ bits<3> sw_w; ++ bits<1> eop; ++ bits<8> inst; ++ ++ let Word1{2-0} = sw_x; ++ let Word1{5-3} = sw_y; ++ let Word1{8-6} = sw_z; ++ let Word1{11-9} = sw_w; ++} ++ ++class ExportBufWord1 { ++ field 
bits<32> Word1; ++ ++ bits<12> arraySize; ++ bits<4> compMask; ++ bits<1> eop; ++ bits<8> inst; ++ ++ let Word1{11-0} = arraySize; ++ let Word1{15-12} = compMask; ++} ++ ++multiclass ExportPattern cf_inst> { ++ def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg), ++ (ExportInst ++ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sel_x), ++ 0, 61, 0, 7, 7, 7, cf_inst, 0) ++ >; ++ ++ def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg), ++ (ExportInst ++ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sel_x), ++ 0, 61, 7, 0, 7, 7, cf_inst, 0) ++ >; ++ ++ def : Pat<(int_R600_store_pixel_dummy), ++ (ExportInst ++ (v4f32 (IMPLICIT_DEF)), 0, 0, 7, 7, 7, 7, cf_inst, 0) ++ >; ++ ++ def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 0), ++ (i32 imm:$type), (i32 imm:$arraybase), (i32 imm)), ++ (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase, ++ 0, 1, 2, 3, cf_inst, 0) ++ >; ++ def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 1), ++ (i32 imm:$type), (i32 imm:$arraybase), (i32 imm)), ++ (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase, ++ 0, 1, 2, 3, cf_inst, 0) ++ >; ++ ++ def : Pat<(int_R600_store_swizzle (v4f32 R600_Reg128:$src), imm:$arraybase, ++ imm:$type), ++ (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase, ++ 0, 1, 2, 3, cf_inst, 0) ++ >; ++} ++ ++multiclass SteamOutputExportPattern buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> { ++// Stream0 ++ def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), ++ (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)), ++ (ExportInst R600_Reg128:$src, 0, imm:$arraybase, ++ 4095, imm:$mask, buf0inst, 0)>; ++// Stream1 ++ def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), ++ (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)), ++ (ExportInst R600_Reg128:$src, 0, imm:$arraybase, ++ 4095, imm:$mask, buf1inst, 0)>; ++// Stream2 ++ def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), ++ (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)), ++ (ExportInst 
R600_Reg128:$src, 0, imm:$arraybase, ++ 4095, imm:$mask, buf2inst, 0)>; ++// Stream3 ++ def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), ++ (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)), ++ (ExportInst R600_Reg128:$src, 0, imm:$arraybase, ++ 4095, imm:$mask, buf3inst, 0)>; ++} ++ ++let isTerminator = 1, usesCustomInserter = 1 in { ++ ++class ExportSwzInst : InstR600ISA<( ++ outs), ++ (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, ++ i32imm:$sw_x, i32imm:$sw_y, i32imm:$sw_z, i32imm:$sw_w, i32imm:$inst, ++ i32imm:$eop), ++ !strconcat("EXPORT", " $gpr"), ++ []>, ExportWord0, ExportSwzWord1 { ++ let elem_size = 3; ++ let Inst{31-0} = Word0; ++ let Inst{63-32} = Word1; ++} ++ ++} // End isTerminator = 1, usesCustomInserter = 1 ++ ++class ExportBufInst : InstR600ISA<( ++ outs), ++ (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, ++ i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop), ++ !strconcat("EXPORT", " $gpr"), ++ []>, ExportWord0, ExportBufWord1 { ++ let elem_size = 0; ++ let Inst{31-0} = Word0; ++ let Inst{63-32} = Word1; ++} ++ ++let Predicates = [isR600toCayman] in { ++ ++//===----------------------------------------------------------------------===// ++// Common Instructions R600, R700, Evergreen, Cayman ++//===----------------------------------------------------------------------===// ++ ++def ADD : R600_2OP_Helper <0x0, "ADD", fadd>; ++// Non-IEEE MUL: 0 * anything = 0 ++def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>; ++def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>; ++def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax>; ++def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>; ++ ++// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td, ++// so some of the instruction names don't match the asm string. ++// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics. 
++def SETE : R600_2OP < ++ 0x08, "SETE", ++ [(set R600_Reg32:$dst, ++ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, ++ COND_EQ))] ++>; ++ ++def SGT : R600_2OP < ++ 0x09, "SETGT", ++ [(set R600_Reg32:$dst, ++ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, ++ COND_GT))] ++>; ++ ++def SGE : R600_2OP < ++ 0xA, "SETGE", ++ [(set R600_Reg32:$dst, ++ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, ++ COND_GE))] ++>; ++ ++def SNE : R600_2OP < ++ 0xB, "SETNE", ++ [(set R600_Reg32:$dst, ++ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, ++ COND_NE))] ++>; ++ ++def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; ++def TRUNC : R600_1OP_Helper <0x11, "TRUNC", int_AMDGPU_trunc>; ++def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>; ++def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>; ++def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>; ++ ++def MOV : R600_1OP <0x19, "MOV", []>; ++ ++let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { ++ ++class MOV_IMM : AMDGPUInst < ++ (outs R600_Reg32:$dst), ++ (ins immType:$imm), ++ "", ++ [] ++>; ++ ++} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 ++ ++def MOV_IMM_I32 : MOV_IMM; ++def : Pat < ++ (imm:$val), ++ (MOV_IMM_I32 imm:$val) ++>; ++ ++def MOV_IMM_F32 : MOV_IMM; ++def : Pat < ++ (fpimm:$val), ++ (MOV_IMM_F32 fpimm:$val) ++>; ++ ++def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>; ++def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>; ++def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>; ++def PRED_SETNE : R600_2OP <0x23, "PRED_SETNE", []>; ++ ++let hasSideEffects = 1 in { ++ ++def KILLGT : R600_2OP <0x2D, "KILLGT", []>; ++ ++} // end hasSideEffects ++ ++def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>; ++def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>; ++def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>; ++def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>; ++def ADD_INT : R600_2OP_Helper <0x34, 
"ADD_INT", add>; ++def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>; ++def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", AMDGPUsmax>; ++def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", AMDGPUsmin>; ++def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", AMDGPUumax>; ++def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", AMDGPUumin>; ++ ++def SETE_INT : R600_2OP < ++ 0x3A, "SETE_INT", ++ [(set (i32 R600_Reg32:$dst), ++ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETEQ))] ++>; ++ ++def SETGT_INT : R600_2OP < ++ 0x3B, "SETGT_INT", ++ [(set (i32 R600_Reg32:$dst), ++ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGT))] ++>; ++ ++def SETGE_INT : R600_2OP < ++ 0x3C, "SETGE_INT", ++ [(set (i32 R600_Reg32:$dst), ++ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGE))] ++>; ++ ++def SETNE_INT : R600_2OP < ++ 0x3D, "SETNE_INT", ++ [(set (i32 R600_Reg32:$dst), ++ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETNE))] ++>; ++ ++def SETGT_UINT : R600_2OP < ++ 0x3E, "SETGT_UINT", ++ [(set (i32 R600_Reg32:$dst), ++ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGT))] ++>; ++ ++def SETGE_UINT : R600_2OP < ++ 0x3F, "SETGE_UINT", ++ [(set (i32 R600_Reg32:$dst), ++ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGE))] ++>; ++ ++def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>; ++def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGT_INT", []>; ++def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>; ++def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>; ++ ++def CNDE_INT : R600_3OP < ++ 0x1C, "CNDE_INT", ++ [(set (i32 R600_Reg32:$dst), ++ (selectcc (i32 R600_Reg32:$src0), 0, ++ (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2), ++ COND_EQ))] ++>; ++ ++def CNDGE_INT : R600_3OP < ++ 0x1E, "CNDGE_INT", ++ [(set (i32 R600_Reg32:$dst), ++ (selectcc (i32 R600_Reg32:$src0), 0, ++ (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2), ++ COND_GE))] ++>; ++ ++def CNDGT_INT : R600_3OP < ++ 0x1D, "CNDGT_INT", ++ 
[(set (i32 R600_Reg32:$dst), ++ (selectcc (i32 R600_Reg32:$src0), 0, ++ (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2), ++ COND_GT))] ++>; ++ ++//===----------------------------------------------------------------------===// ++// Texture instructions ++//===----------------------------------------------------------------------===// ++ ++def TEX_LD : R600_TEX < ++ 0x03, "TEX_LD", ++ [(set R600_Reg128:$dst, (int_AMDGPU_txf R600_Reg128:$src0, imm:$src1, imm:$src2, imm:$src3, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] ++> { ++let AsmString = "TEX_LD $dst, $src0, $src1, $src2, $src3, $resourceId, $samplerId, $textureTarget"; ++let InOperandList = (ins R600_Reg128:$src0, i32imm:$src1, i32imm:$src2, i32imm:$src3, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget); ++} ++ ++def TEX_GET_TEXTURE_RESINFO : R600_TEX < ++ 0x04, "TEX_GET_TEXTURE_RESINFO", ++ [(set R600_Reg128:$dst, (int_AMDGPU_txq R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] ++>; ++ ++def TEX_GET_GRADIENTS_H : R600_TEX < ++ 0x07, "TEX_GET_GRADIENTS_H", ++ [(set R600_Reg128:$dst, (int_AMDGPU_ddx R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] ++>; ++ ++def TEX_GET_GRADIENTS_V : R600_TEX < ++ 0x08, "TEX_GET_GRADIENTS_V", ++ [(set R600_Reg128:$dst, (int_AMDGPU_ddy R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] ++>; ++ ++def TEX_SET_GRADIENTS_H : R600_TEX < ++ 0x0B, "TEX_SET_GRADIENTS_H", ++ [] ++>; ++ ++def TEX_SET_GRADIENTS_V : R600_TEX < ++ 0x0C, "TEX_SET_GRADIENTS_V", ++ [] ++>; ++ ++def TEX_SAMPLE : R600_TEX < ++ 0x10, "TEX_SAMPLE", ++ [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] ++>; ++ ++def TEX_SAMPLE_C : R600_TEX < ++ 0x18, "TEX_SAMPLE_C", ++ [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))] ++>; ++ ++def TEX_SAMPLE_L : R600_TEX < ++ 0x11, "TEX_SAMPLE_L", ++ [(set 
R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] ++>; ++ ++def TEX_SAMPLE_C_L : R600_TEX < ++ 0x19, "TEX_SAMPLE_C_L", ++ [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))] ++>; ++ ++def TEX_SAMPLE_LB : R600_TEX < ++ 0x12, "TEX_SAMPLE_LB", ++ [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0,imm:$resourceId, imm:$samplerId, imm:$textureTarget))] ++>; ++ ++def TEX_SAMPLE_C_LB : R600_TEX < ++ 0x1A, "TEX_SAMPLE_C_LB", ++ [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))] ++>; ++ ++def TEX_SAMPLE_G : R600_TEX < ++ 0x14, "TEX_SAMPLE_G", ++ [] ++>; ++ ++def TEX_SAMPLE_C_G : R600_TEX < ++ 0x1C, "TEX_SAMPLE_C_G", ++ [] ++>; ++ ++//===----------------------------------------------------------------------===// ++// Helper classes for common instructions ++//===----------------------------------------------------------------------===// ++ ++class MUL_LIT_Common inst> : R600_3OP < ++ inst, "MUL_LIT", ++ [] ++>; ++ ++class MULADD_Common inst> : R600_3OP < ++ inst, "MULADD", ++ [(set (f32 R600_Reg32:$dst), ++ (IL_mad R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2))] ++>; ++ ++class CNDE_Common inst> : R600_3OP < ++ inst, "CNDE", ++ [(set R600_Reg32:$dst, ++ (selectcc (f32 R600_Reg32:$src0), FP_ZERO, ++ (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2), ++ COND_EQ))] ++>; ++ ++class CNDGT_Common inst> : R600_3OP < ++ inst, "CNDGT", ++ [(set R600_Reg32:$dst, ++ (selectcc (f32 R600_Reg32:$src0), FP_ZERO, ++ (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2), ++ COND_GT))] ++>; ++ ++class CNDGE_Common inst> : R600_3OP < ++ inst, "CNDGE", ++ [(set R600_Reg32:$dst, ++ (selectcc (f32 R600_Reg32:$src0), FP_ZERO, ++ (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2), ++ COND_GE))] ++>; ++ ++multiclass DOT4_Common inst> { ++ ++ def _pseudo : R600_REDUCTION ; ++ ++ def _real : R600_2OP ; ++} 
++ ++let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { ++multiclass CUBE_Common inst> { ++ ++ def _pseudo : InstR600 < ++ inst, ++ (outs R600_Reg128:$dst), ++ (ins R600_Reg128:$src), ++ "CUBE $dst $src", ++ [(set R600_Reg128:$dst, (int_AMDGPU_cube R600_Reg128:$src))], ++ VecALU ++ > { ++ let isPseudo = 1; ++ } ++ ++ def _real : R600_2OP ; ++} ++} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 ++ ++class EXP_IEEE_Common inst> : R600_1OP_Helper < ++ inst, "EXP_IEEE", fexp2 ++>; ++ ++class FLT_TO_INT_Common inst> : R600_1OP_Helper < ++ inst, "FLT_TO_INT", fp_to_sint ++>; ++ ++class INT_TO_FLT_Common inst> : R600_1OP_Helper < ++ inst, "INT_TO_FLT", sint_to_fp ++>; ++ ++class FLT_TO_UINT_Common inst> : R600_1OP_Helper < ++ inst, "FLT_TO_UINT", fp_to_uint ++>; ++ ++class UINT_TO_FLT_Common inst> : R600_1OP_Helper < ++ inst, "UINT_TO_FLT", uint_to_fp ++>; ++ ++class LOG_CLAMPED_Common inst> : R600_1OP < ++ inst, "LOG_CLAMPED", [] ++>; ++ ++class LOG_IEEE_Common inst> : R600_1OP_Helper < ++ inst, "LOG_IEEE", flog2 ++>; ++ ++class LSHL_Common inst> : R600_2OP_Helper ; ++class LSHR_Common inst> : R600_2OP_Helper ; ++class ASHR_Common inst> : R600_2OP_Helper ; ++class MULHI_INT_Common inst> : R600_2OP_Helper < ++ inst, "MULHI_INT", mulhs ++>; ++class MULHI_UINT_Common inst> : R600_2OP_Helper < ++ inst, "MULHI", mulhu ++>; ++class MULLO_INT_Common inst> : R600_2OP_Helper < ++ inst, "MULLO_INT", mul ++>; ++class MULLO_UINT_Common inst> : R600_2OP ; ++ ++class RECIP_CLAMPED_Common inst> : R600_1OP < ++ inst, "RECIP_CLAMPED", [] ++>; ++ ++class RECIP_IEEE_Common inst> : R600_1OP < ++ inst, "RECIP_IEEE", [(set R600_Reg32:$dst, (fdiv FP_ONE, R600_Reg32:$src0))] ++>; ++ ++class RECIP_UINT_Common inst> : R600_1OP_Helper < ++ inst, "RECIP_UINT", AMDGPUurecip ++>; ++ ++class RECIPSQRT_CLAMPED_Common inst> : R600_1OP_Helper < ++ inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq ++>; ++ ++class RECIPSQRT_IEEE_Common inst> : R600_1OP < ++ inst, "RECIPSQRT_IEEE", [] ++>; ++ ++class 
SIN_Common inst> : R600_1OP < ++ inst, "SIN", []>{ ++ let Trig = 1; ++} ++ ++class COS_Common inst> : R600_1OP < ++ inst, "COS", []> { ++ let Trig = 1; ++} ++ ++//===----------------------------------------------------------------------===// ++// Helper patterns for complex intrinsics ++//===----------------------------------------------------------------------===// ++ ++multiclass DIV_Common { ++def : Pat< ++ (int_AMDGPU_div R600_Reg32:$src0, R600_Reg32:$src1), ++ (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) ++>; ++ ++def : Pat< ++ (fdiv R600_Reg32:$src0, R600_Reg32:$src1), ++ (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) ++>; ++} ++ ++class TGSI_LIT_Z_Common : Pat < ++ (int_TGSI_lit_z R600_Reg32:$src_x, R600_Reg32:$src_y, R600_Reg32:$src_w), ++ (exp_ieee (mul_lit (log_clamped (MAX R600_Reg32:$src_y, (f32 ZERO))), R600_Reg32:$src_w, R600_Reg32:$src_x)) ++>; ++ ++//===----------------------------------------------------------------------===// ++// R600 / R700 Instructions ++//===----------------------------------------------------------------------===// ++ ++let Predicates = [isR600] in { ++ ++ def MUL_LIT_r600 : MUL_LIT_Common<0x0C>; ++ def MULADD_r600 : MULADD_Common<0x10>; ++ def CNDE_r600 : CNDE_Common<0x18>; ++ def CNDGT_r600 : CNDGT_Common<0x19>; ++ def CNDGE_r600 : CNDGE_Common<0x1A>; ++ defm DOT4_r600 : DOT4_Common<0x50>; ++ defm CUBE_r600 : CUBE_Common<0x52>; ++ def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>; ++ def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>; ++ def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>; ++ def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>; ++ def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>; ++ def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>; ++ def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>; ++ def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>; ++ def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>; ++ def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>; ++ def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>; ++ def SIN_r600 
: SIN_Common<0x6E>; ++ def COS_r600 : COS_Common<0x6F>; ++ def ASHR_r600 : ASHR_Common<0x70>; ++ def LSHR_r600 : LSHR_Common<0x71>; ++ def LSHL_r600 : LSHL_Common<0x72>; ++ def MULLO_INT_r600 : MULLO_INT_Common<0x73>; ++ def MULHI_INT_r600 : MULHI_INT_Common<0x74>; ++ def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>; ++ def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>; ++ def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>; ++ ++ defm DIV_r600 : DIV_Common; ++ def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common; ++ ++ def : Pat<(fsqrt R600_Reg32:$src), ++ (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_r600 R600_Reg32:$src))>; ++ ++ def R600_ExportSwz : ExportSwzInst { ++ let Word1{20-17} = 1; // BURST_COUNT ++ let Word1{21} = eop; ++ let Word1{22} = 1; // VALID_PIXEL_MODE ++ let Word1{30-23} = inst; ++ let Word1{31} = 1; // BARRIER ++ } ++ defm : ExportPattern; ++ ++ def R600_ExportBuf : ExportBufInst { ++ let Word1{20-17} = 1; // BURST_COUNT ++ let Word1{21} = eop; ++ let Word1{22} = 1; // VALID_PIXEL_MODE ++ let Word1{30-23} = inst; ++ let Word1{31} = 1; // BARRIER ++ } ++ defm : SteamOutputExportPattern; ++} ++ ++// Helper pattern for normalizing inputs to trigonometric instructions for R700+ ++// cards.
++class COS_PAT : Pat< ++ (fcos R600_Reg32:$src), ++ (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) ++>; ++ ++class SIN_PAT : Pat< ++ (fsin R600_Reg32:$src), ++ (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) ++>; ++ ++//===----------------------------------------------------------------------===// ++// R700 Only instructions ++//===----------------------------------------------------------------------===// ++ ++let Predicates = [isR700] in { ++ def SIN_r700 : SIN_Common<0x6E>; ++ def COS_r700 : COS_Common<0x6F>; ++ ++ // R700 normalizes inputs to SIN/COS the same as EG ++ def : SIN_PAT ; ++ def : COS_PAT ; ++} ++ ++//===----------------------------------------------------------------------===// ++// Evergreen Only instructions ++//===----------------------------------------------------------------------===// ++ ++let Predicates = [isEG] in { ++ ++def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>; ++defm DIV_eg : DIV_Common; ++ ++def MULLO_INT_eg : MULLO_INT_Common<0x8F>; ++def MULHI_INT_eg : MULHI_INT_Common<0x90>; ++def MULLO_UINT_eg : MULLO_UINT_Common<0x91>; ++def MULHI_UINT_eg : MULHI_UINT_Common<0x92>; ++def RECIP_UINT_eg : RECIP_UINT_Common<0x94>; ++def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>; ++def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; ++def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; ++def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; ++def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; ++def SIN_eg : SIN_Common<0x8D>; ++def COS_eg : COS_Common<0x8E>; ++ ++def : SIN_PAT ; ++def : COS_PAT ; ++def : Pat<(fsqrt R600_Reg32:$src), ++ (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_eg R600_Reg32:$src))>; ++} // End Predicates = [isEG] ++ ++//===----------------------------------------------------------------------===// ++// Evergreen / Cayman Instructions ++//===----------------------------------------------------------------------===// ++ ++let Predicates = [isEGorCayman] in { ++ ++ // BFE_UINT - bit_extract, an optimization for mask and 
shift ++ // Src0 = Input ++ // Src1 = Offset ++ // Src2 = Width ++ // ++ // bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width) ++ // ++ // Example Usage: ++ // (Offset, Width) ++ // ++ // (0, 8) = (Input << 24) >> 24 = (Input & 0xff) >> 0 ++ // (8, 8) = (Input << 16) >> 24 = (Input & 0xffff) >> 8 ++ // (16,8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16 ++ // (24,8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24 ++ def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT", ++ [(set R600_Reg32:$dst, (int_AMDIL_bit_extract_u32 R600_Reg32:$src0, ++ R600_Reg32:$src1, ++ R600_Reg32:$src2))], ++ VecALU ++ >; ++ ++ def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", ++ [(set R600_Reg32:$dst, (AMDGPUbitalign R600_Reg32:$src0, R600_Reg32:$src1, ++ R600_Reg32:$src2))], ++ VecALU ++ >; ++ ++ def MULADD_eg : MULADD_Common<0x14>; ++ def ASHR_eg : ASHR_Common<0x15>; ++ def LSHR_eg : LSHR_Common<0x16>; ++ def LSHL_eg : LSHL_Common<0x17>; ++ def CNDE_eg : CNDE_Common<0x19>; ++ def CNDGT_eg : CNDGT_Common<0x1A>; ++ def CNDGE_eg : CNDGE_Common<0x1B>; ++ def MUL_LIT_eg : MUL_LIT_Common<0x1F>; ++ def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>; ++ defm DOT4_eg : DOT4_Common<0xBE>; ++ defm CUBE_eg : CUBE_Common<0xC0>; ++ ++ def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common; ++ ++ def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> { ++ let Pattern = []; ++ } ++ ++ def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>; ++ ++ def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> { ++ let Pattern = []; ++ } ++ ++ def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; ++ ++ // TRUNC is used for the FLT_TO_INT instructions to work around a ++ // perceived problem where the rounding modes are applied differently ++ // depending on the instruction and the slot they are in. 
++ // See: ++ // https://bugs.freedesktop.org/show_bug.cgi?id=50232 ++ // Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c ++ // ++ // XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes, ++ // which do not need to be truncated since the fp values are 0.0f or 1.0f. ++ // We should look into handling these cases separately. ++ def : Pat<(fp_to_sint R600_Reg32:$src0), ++ (FLT_TO_INT_eg (TRUNC R600_Reg32:$src0))>; ++ ++ def : Pat<(fp_to_uint R600_Reg32:$src0), ++ (FLT_TO_UINT_eg (TRUNC R600_Reg32:$src0))>; ++ ++ def EG_ExportSwz : ExportSwzInst { ++ let Word1{19-16} = 1; // BURST_COUNT ++ let Word1{20} = 1; // VALID_PIXEL_MODE ++ let Word1{21} = eop; ++ let Word1{29-22} = inst; ++ let Word1{30} = 0; // MARK ++ let Word1{31} = 1; // BARRIER ++ } ++ defm : ExportPattern; ++ ++ def EG_ExportBuf : ExportBufInst { ++ let Word1{19-16} = 1; // BURST_COUNT ++ let Word1{20} = 1; // VALID_PIXEL_MODE ++ let Word1{21} = eop; ++ let Word1{29-22} = inst; ++ let Word1{30} = 0; // MARK ++ let Word1{31} = 1; // BARRIER ++ } ++ defm : SteamOutputExportPattern; ++ ++//===----------------------------------------------------------------------===// ++// Memory read/write instructions ++//===----------------------------------------------------------------------===// ++let usesCustomInserter = 1 in { ++ ++class RAT_WRITE_CACHELESS_eg comp_mask, string name, ++ list pattern> ++ : EG_CF_RAT <0x57, 0x2, 0, (outs), ins, ++ !strconcat(name, " $rw_gpr, $index_gpr, $eop"), pattern> { ++ let RIM = 0; ++ // XXX: Have a separate instruction for non-indexed writes. 
++ let TYPE = 1; ++ let RW_REL = 0; ++ let ELEM_SIZE = 0; ++ ++ let ARRAY_SIZE = 0; ++ let COMP_MASK = comp_mask; ++ let BURST_COUNT = 0; ++ let VPM = 0; ++ let MARK = 0; ++ let BARRIER = 1; ++} ++ ++} // End usesCustomInserter = 1 ++ ++// 32-bit store ++def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg < ++ (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), ++ 0x1, "RAT_WRITE_CACHELESS_32_eg", ++ [(global_store (i32 R600_TReg32_X:$rw_gpr), R600_TReg32_X:$index_gpr)] ++>; ++ ++//128-bit store ++def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg < ++ (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), ++ 0xf, "RAT_WRITE_CACHELESS_128", ++ [(global_store (v4i32 R600_Reg128:$rw_gpr), R600_TReg32_X:$index_gpr)] ++>; ++ ++class VTX_READ_eg buffer_id, dag outs, list pattern> ++ : InstR600ISA , ++ VTX_WORD1_GPR, VTX_WORD0 { ++ ++ // Static fields ++ let VC_INST = 0; ++ let FETCH_TYPE = 2; ++ let FETCH_WHOLE_QUAD = 0; ++ let BUFFER_ID = buffer_id; ++ let SRC_REL = 0; ++ // XXX: We can infer this field based on the SRC_GPR. This would allow us ++ // to store vertex addresses in any channel, not just X. ++ let SRC_SEL_X = 0; ++ let DST_REL = 0; ++ // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL, ++ // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored, ++ // however, based on my testing if USE_CONST_FIELDS is set, then all ++ // these fields need to be set to 0. 
++ let USE_CONST_FIELDS = 0; ++ let NUM_FORMAT_ALL = 1; ++ let FORMAT_COMP_ALL = 0; ++ let SRF_MODE_ALL = 0; ++ ++ let Inst{31-0} = Word0; ++ let Inst{63-32} = Word1; ++ // LLVM can only encode 64-bit instructions, so these fields are manually ++ // encoded in R600CodeEmitter ++ // ++ // bits<16> OFFSET; ++ // bits<2> ENDIAN_SWAP = 0; ++ // bits<1> CONST_BUF_NO_STRIDE = 0; ++ // bits<1> MEGA_FETCH = 0; ++ // bits<1> ALT_CONST = 0; ++ // bits<2> BUFFER_INDEX_MODE = 0; ++ ++ ++ ++ // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding ++ // is done in R600CodeEmitter ++ // ++ // Inst{79-64} = OFFSET; ++ // Inst{81-80} = ENDIAN_SWAP; ++ // Inst{82} = CONST_BUF_NO_STRIDE; ++ // Inst{83} = MEGA_FETCH; ++ // Inst{84} = ALT_CONST; ++ // Inst{86-85} = BUFFER_INDEX_MODE; ++ // Inst{95-86} = 0; Reserved ++ ++ // VTX_WORD3 (Padding) ++ // ++ // Inst{127-96} = 0; ++} ++ ++class VTX_READ_8_eg buffer_id, list pattern> ++ : VTX_READ_eg <"VTX_READ_8", buffer_id, (outs R600_TReg32_X:$dst), ++ pattern> { ++ ++ let MEGA_FETCH_COUNT = 1; ++ let DST_SEL_X = 0; ++ let DST_SEL_Y = 7; // Masked ++ let DST_SEL_Z = 7; // Masked ++ let DST_SEL_W = 7; // Masked ++ let DATA_FORMAT = 1; // FMT_8 ++} ++ ++class VTX_READ_16_eg buffer_id, list pattern> ++ : VTX_READ_eg <"VTX_READ_16", buffer_id, (outs R600_TReg32_X:$dst), ++ pattern> { ++ let MEGA_FETCH_COUNT = 2; ++ let DST_SEL_X = 0; ++ let DST_SEL_Y = 7; // Masked ++ let DST_SEL_Z = 7; // Masked ++ let DST_SEL_W = 7; // Masked ++ let DATA_FORMAT = 5; // FMT_16 ++ ++} ++ ++class VTX_READ_32_eg buffer_id, list pattern> ++ : VTX_READ_eg <"VTX_READ_32", buffer_id, (outs R600_TReg32_X:$dst), ++ pattern> { ++ ++ let MEGA_FETCH_COUNT = 4; ++ let DST_SEL_X = 0; ++ let DST_SEL_Y = 7; // Masked ++ let DST_SEL_Z = 7; // Masked ++ let DST_SEL_W = 7; // Masked ++ let DATA_FORMAT = 0xD; // COLOR_32 ++ ++ // This is not really necessary, but there were some GPU hangs that appeared ++ // to be caused by ALU instructions in the next 
instruction group that wrote ++ // to the $ptr registers of the VTX_READ. ++ // e.g. ++ // %T3_X = VTX_READ_PARAM_32_eg %T2_X, 24 ++ // %T2_X = MOV %ZERO ++ //Adding this constraint prevents this from happening. ++ let Constraints = "$ptr.ptr = $dst"; ++} ++ ++class VTX_READ_128_eg buffer_id, list pattern> ++ : VTX_READ_eg <"VTX_READ_128", buffer_id, (outs R600_Reg128:$dst), ++ pattern> { ++ ++ let MEGA_FETCH_COUNT = 16; ++ let DST_SEL_X = 0; ++ let DST_SEL_Y = 1; ++ let DST_SEL_Z = 2; ++ let DST_SEL_W = 3; ++ let DATA_FORMAT = 0x22; // COLOR_32_32_32_32 ++ ++ // XXX: Need to force VTX_READ_128 instructions to write to the same register ++ // that holds its buffer address to avoid potential hangs. We can't use ++ // the same constraint as VTX_READ_32_eg, because the $ptr.ptr and $dst ++ // registers are different sizes. ++} ++ ++//===----------------------------------------------------------------------===// ++// VTX Read from parameter memory space ++//===----------------------------------------------------------------------===// ++ ++def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0, ++ [(set (i32 R600_TReg32_X:$dst), (load_param_zexti8 ADDRVTX_READ:$ptr))] ++>; ++ ++def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0, ++ [(set (i32 R600_TReg32_X:$dst), (load_param_zexti16 ADDRVTX_READ:$ptr))] ++>; ++ ++def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0, ++ [(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))] ++>; ++ ++//===----------------------------------------------------------------------===// ++// VTX Read from global memory space ++//===----------------------------------------------------------------------===// ++ ++// 8-bit reads ++def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1, ++ [(set (i32 R600_TReg32_X:$dst), (zextloadi8_global ADDRVTX_READ:$ptr))] ++>; ++ ++// 32-bit reads ++def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1, ++ [(set (i32 R600_TReg32_X:$dst), (global_load ADDRVTX_READ:$ptr))] ++>; ++ ++// 128-bit reads ++def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg 
<1, ++ [(set (v4i32 R600_Reg128:$dst), (global_load ADDRVTX_READ:$ptr))] ++>; ++ ++//===----------------------------------------------------------------------===// ++// Constant Loads ++// XXX: We are currently storing all constants in the global address space. ++//===----------------------------------------------------------------------===// ++ ++def CONSTANT_LOAD_eg : VTX_READ_32_eg <1, ++ [(set (i32 R600_TReg32_X:$dst), (constant_load ADDRVTX_READ:$ptr))] ++>; ++ ++} ++ ++let Predicates = [isCayman] in { ++ ++let isVector = 1 in { ++ ++def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>; ++ ++def MULLO_INT_cm : MULLO_INT_Common<0x8F>; ++def MULHI_INT_cm : MULHI_INT_Common<0x90>; ++def MULLO_UINT_cm : MULLO_UINT_Common<0x91>; ++def MULHI_UINT_cm : MULHI_UINT_Common<0x92>; ++def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>; ++def EXP_IEEE_cm : EXP_IEEE_Common<0x81>; ++def LOG_IEEE_ : LOG_IEEE_Common<0x83>; ++def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>; ++def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>; ++def SIN_cm : SIN_Common<0x8D>; ++def COS_cm : COS_Common<0x8E>; ++} // End isVector = 1 ++ ++def : SIN_PAT ; ++def : COS_PAT ; ++ ++defm DIV_cm : DIV_Common; ++ ++// RECIP_UINT emulation for Cayman ++def : Pat < ++ (AMDGPUurecip R600_Reg32:$src0), ++ (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg R600_Reg32:$src0)), ++ (MOV_IMM_I32 0x4f800000))) ++>; ++ ++ ++def : Pat<(fsqrt R600_Reg32:$src), ++ (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm R600_Reg32:$src))>; ++ ++} // End isCayman ++ ++//===----------------------------------------------------------------------===// ++// Branch Instructions ++//===----------------------------------------------------------------------===// ++ ++ ++def IF_PREDICATE_SET : ILFormat<(outs), (ins GPRI32:$src), ++ "IF_PREDICATE_SET $src", []>; ++ ++def PREDICATED_BREAK : ILFormat<(outs), (ins GPRI32:$src), ++ "PREDICATED_BREAK $src", []>; ++ 
++//===----------------------------------------------------------------------===// ++// Pseudo instructions ++//===----------------------------------------------------------------------===// ++ ++let isPseudo = 1 in { ++ ++def PRED_X : InstR600 < ++ 0, (outs R600_Predicate_Bit:$dst), ++ (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags), ++ "", [], NullALU> { ++ let FlagOperandIdx = 3; ++} ++ ++let isTerminator = 1, isBranch = 1, isBarrier = 1 in { ++ ++def JUMP : InstR600 <0x10, ++ (outs), ++ (ins brtarget:$target, R600_Pred:$p), ++ "JUMP $target ($p)", ++ [], AnyALU ++ >; ++ ++} // End isTerminator = 1, isBranch = 1, isBarrier = 1 ++ ++let usesCustomInserter = 1 in { ++ ++let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { ++ ++def MASK_WRITE : AMDGPUShaderInst < ++ (outs), ++ (ins R600_Reg32:$src), ++ "MASK_WRITE $src", ++ [] ++>; ++ ++} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1 ++ ++ ++def RESERVE_REG : AMDGPUShaderInst < ++ (outs), ++ (ins i32imm:$src), ++ "RESERVE_REG $src", ++ [(int_AMDGPU_reserve_reg imm:$src)] ++>; ++def TXD: AMDGPUShaderInst < ++ (outs R600_Reg128:$dst), ++ (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), ++ "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", ++ [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] ++>; ++ ++def TXD_SHADOW: AMDGPUShaderInst < ++ (outs R600_Reg128:$dst), ++ (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), ++ "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", ++ [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))] ++>; ++ ++} // End isPseudo = 1 ++} // End usesCustomInserter = 1 ++ ++def 
CLAMP_R600 : CLAMP ; ++def FABS_R600 : FABS; ++def FNEG_R600 : FNEG; ++ ++//===---------------------------------------------------------------------===// ++// Return instruction ++//===---------------------------------------------------------------------===// ++let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in { ++ def RETURN : ILFormat<(outs), (ins variable_ops), ++ "RETURN", [(IL_retflag)]>; ++} ++ ++ ++//===----------------------------------------------------------------------===// ++// Constant Buffer Addressing Support ++//===----------------------------------------------------------------------===// ++ ++let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { ++def CONST_COPY : Instruction { ++ let OutOperandList = (outs R600_Reg32:$dst); ++ let InOperandList = (ins i32imm:$src); ++ let Pattern = [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))]; ++ let AsmString = "CONST_COPY"; ++ let neverHasSideEffects = 1; ++ let isAsCheapAsAMove = 1; ++ let Itinerary = NullALU; ++} ++} // end isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" ++ ++def TEX_VTX_CONSTBUF : ++ InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr), "VTX_READ_eg $dst, $ptr", ++ [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr))]>, ++ VTX_WORD1_GPR, VTX_WORD0 { ++ ++ let VC_INST = 0; ++ let FETCH_TYPE = 2; ++ let FETCH_WHOLE_QUAD = 0; ++ let BUFFER_ID = 0; ++ let SRC_REL = 0; ++ let SRC_SEL_X = 0; ++ let DST_REL = 0; ++ let USE_CONST_FIELDS = 0; ++ let NUM_FORMAT_ALL = 2; ++ let FORMAT_COMP_ALL = 1; ++ let SRF_MODE_ALL = 1; ++ let MEGA_FETCH_COUNT = 16; ++ let DST_SEL_X = 0; ++ let DST_SEL_Y = 1; ++ let DST_SEL_Z = 2; ++ let DST_SEL_W = 3; ++ let DATA_FORMAT = 35; ++ ++ let Inst{31-0} = Word0; ++ let Inst{63-32} = Word1; ++ ++// LLVM can only encode 64-bit instructions, so these fields are manually ++// encoded in R600CodeEmitter ++// ++// bits<16> OFFSET; ++// bits<2> ENDIAN_SWAP = 0; ++// bits<1> CONST_BUF_NO_STRIDE = 0; ++// bits<1> 
MEGA_FETCH = 0; ++// bits<1> ALT_CONST = 0; ++// bits<2> BUFFER_INDEX_MODE = 0; ++ ++ ++ ++// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding ++// is done in R600CodeEmitter ++// ++// Inst{79-64} = OFFSET; ++// Inst{81-80} = ENDIAN_SWAP; ++// Inst{82} = CONST_BUF_NO_STRIDE; ++// Inst{83} = MEGA_FETCH; ++// Inst{84} = ALT_CONST; ++// Inst{86-85} = BUFFER_INDEX_MODE; ++// Inst{95-86} = 0; Reserved ++ ++// VTX_WORD3 (Padding) ++// ++// Inst{127-96} = 0; ++} ++ ++ ++//===--------------------------------------------------------------------===// ++// Instructions support ++//===--------------------------------------------------------------------===// ++//===---------------------------------------------------------------------===// ++// Custom Inserter for Branches and returns, this eventually will be a ++// separate pass ++//===---------------------------------------------------------------------===// ++let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { ++ def BRANCH : ILFormat<(outs), (ins brtarget:$target), ++ "; Pseudo unconditional branch instruction", ++ [(br bb:$target)]>; ++ defm BRANCH_COND : BranchConditional; ++} ++ ++//===---------------------------------------------------------------------===// ++// Flow and Program control Instructions ++//===---------------------------------------------------------------------===// ++let isTerminator=1 in { ++ def SWITCH : ILFormat< (outs), (ins GPRI32:$src), ++ !strconcat("SWITCH", " $src"), []>; ++ def CASE : ILFormat< (outs), (ins GPRI32:$src), ++ !strconcat("CASE", " $src"), []>; ++ def BREAK : ILFormat< (outs), (ins), ++ "BREAK", []>; ++ def CONTINUE : ILFormat< (outs), (ins), ++ "CONTINUE", []>; ++ def DEFAULT : ILFormat< (outs), (ins), ++ "DEFAULT", []>; ++ def ELSE : ILFormat< (outs), (ins), ++ "ELSE", []>; ++ def ENDSWITCH : ILFormat< (outs), (ins), ++ "ENDSWITCH", []>; ++ def ENDMAIN : ILFormat< (outs), (ins), ++ "ENDMAIN", []>; ++ def END : ILFormat< (outs),
(ins), ++ "END", []>; ++ def ENDFUNC : ILFormat< (outs), (ins), ++ "ENDFUNC", []>; ++ def ENDIF : ILFormat< (outs), (ins), ++ "ENDIF", []>; ++ def WHILELOOP : ILFormat< (outs), (ins), ++ "WHILE", []>; ++ def ENDLOOP : ILFormat< (outs), (ins), ++ "ENDLOOP", []>; ++ def FUNC : ILFormat< (outs), (ins), ++ "FUNC", []>; ++ def RETDYN : ILFormat< (outs), (ins), ++ "RET_DYN", []>; ++ // This opcode has custom swizzle pattern encoded in Swizzle Encoder ++ defm IF_LOGICALNZ : BranchInstr<"IF_LOGICALNZ">; ++ // This opcode has custom swizzle pattern encoded in Swizzle Encoder ++ defm IF_LOGICALZ : BranchInstr<"IF_LOGICALZ">; ++ // This opcode has custom swizzle pattern encoded in Swizzle Encoder ++ defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">; ++ // This opcode has custom swizzle pattern encoded in Swizzle Encoder ++ defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">; ++ // This opcode has custom swizzle pattern encoded in Swizzle Encoder ++ defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">; ++ // This opcode has custom swizzle pattern encoded in Swizzle Encoder ++ defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">; ++ defm IFC : BranchInstr2<"IFC">; ++ defm BREAKC : BranchInstr2<"BREAKC">; ++ defm CONTINUEC : BranchInstr2<"CONTINUEC">; ++} ++ ++//===----------------------------------------------------------------------===// ++// ISel Patterns ++//===----------------------------------------------------------------------===// ++ ++//CNDGE_INT extra pattern ++def : Pat < ++ (selectcc (i32 R600_Reg32:$src0), -1, (i32 R600_Reg32:$src1), ++ (i32 R600_Reg32:$src2), COND_GT), ++ (CNDGE_INT R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2) ++>; ++ ++// KIL Patterns ++def KILP : Pat < ++ (int_AMDGPU_kilp), ++ (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO))) ++>; ++ ++def KIL : Pat < ++ (int_AMDGPU_kill R600_Reg32:$src0), ++ (MASK_WRITE (KILLGT (f32 ZERO), (f32 R600_Reg32:$src0))) ++>; ++ ++// SGT Reverse args ++def : Pat < ++ (selectcc (f32 
R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LT), ++ (SGT R600_Reg32:$src1, R600_Reg32:$src0) ++>; ++ ++// SGE Reverse args ++def : Pat < ++ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LE), ++ (SGE R600_Reg32:$src1, R600_Reg32:$src0) ++>; ++ ++// SETGT_INT reverse args ++def : Pat < ++ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLT), ++ (SETGT_INT R600_Reg32:$src1, R600_Reg32:$src0) ++>; ++ ++// SETGE_INT reverse args ++def : Pat < ++ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLE), ++ (SETGE_INT R600_Reg32:$src1, R600_Reg32:$src0) ++>; ++ ++// SETGT_UINT reverse args ++def : Pat < ++ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULT), ++ (SETGT_UINT R600_Reg32:$src1, R600_Reg32:$src0) ++>; ++ ++// SETGE_UINT reverse args ++def : Pat < ++ (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULE), ++ (SETGE_UINT R600_Reg32:$src1, R600_Reg32:$src0) ++>; ++ ++// The next two patterns are special cases for handling 'true if ordered' and ++// 'true if unordered' conditionals. 
The assumption here is that the behavior of ++// SETE and SNE conforms to the Direct3D 10 rules for floating point values ++// described here: ++// http://msdn.microsoft.com/en-us/library/windows/desktop/cc308050.aspx#alpha_32_bit ++// We assume that SETE returns false when one of the operands is NAN and ++// SNE returns true when one of the operands is NAN ++ ++//SETE - 'true if ordered' ++def : Pat < ++ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETO), ++ (SETE R600_Reg32:$src0, R600_Reg32:$src1) ++>; ++ ++//SNE - 'true if unordered' ++def : Pat < ++ (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETUO), ++ (SNE R600_Reg32:$src0, R600_Reg32:$src1) ++>; ++ ++def : Extract_Element ; ++def : Extract_Element ; ++def : Extract_Element ; ++def : Extract_Element ; ++ ++def : Insert_Element ; ++def : Insert_Element ; ++def : Insert_Element ; ++def : Insert_Element ; ++ ++def : Extract_Element ; ++def : Extract_Element ; ++def : Extract_Element ; ++def : Extract_Element ; ++ ++def : Insert_Element ; ++def : Insert_Element ; ++def : Insert_Element ; ++def : Insert_Element ; ++ ++def : Vector_Build ; ++def : Vector_Build ; ++ ++// bitconvert patterns ++ ++def : BitConvert ; ++def : BitConvert ; ++def : BitConvert ; ++def : BitConvert ; ++ ++// DWORDADDR pattern ++def : DwordAddrPat ; ++ ++} // End isR600toCayman Predicate +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600Intrinsics.td llvm-r600/lib/Target/R600/R600Intrinsics.td +--- llvm-3.2.src/lib/Target/R600/R600Intrinsics.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/R600Intrinsics.td 2013-01-25 19:43:57.466716387 +0100 +@@ -0,0 +1,34 @@ ++//===-- R600Intrinsics.td - R600 Intrinsic defs --------*- tablegen -*-----===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details.
++// ++//===----------------------------------------------------------------------===// ++// ++// R600 Intrinsic Definitions ++// ++//===----------------------------------------------------------------------===// ++ ++let TargetPrefix = "R600", isTarget = 1 in { ++ def int_R600_load_input : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; ++ def int_R600_load_input_perspective : ++ Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>; ++ def int_R600_load_input_constant : ++ Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>; ++ def int_R600_load_input_linear : ++ Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>; ++ def int_R600_store_swizzle : ++ Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; ++ def int_R600_store_stream_output : ++ Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; ++ def int_R600_store_pixel_color : ++ Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; ++ def int_R600_store_pixel_depth : ++ Intrinsic<[], [llvm_float_ty], []>; ++ def int_R600_store_pixel_stencil : ++ Intrinsic<[], [llvm_float_ty], []>; ++ def int_R600_store_pixel_dummy : ++ Intrinsic<[], [], []>; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600ISelLowering.cpp llvm-r600/lib/Target/R600/R600ISelLowering.cpp +--- llvm-3.2.src/lib/Target/R600/R600ISelLowering.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/R600ISelLowering.cpp 2013-01-25 19:43:57.463383054 +0100 +@@ -0,0 +1,997 @@ ++//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Custom DAG lowering for R600 ++// ++//===----------------------------------------------------------------------===// ++ ++#include "R600ISelLowering.h" ++#include "R600Defines.h" ++#include "R600InstrInfo.h" ++#include "R600MachineFunctionInfo.h" ++#include "llvm/Argument.h" ++#include "llvm/Function.h" ++#include "llvm/CodeGen/MachineInstrBuilder.h" ++#include "llvm/CodeGen/MachineRegisterInfo.h" ++#include "llvm/CodeGen/SelectionDAG.h" ++ ++using namespace llvm; ++ ++R600TargetLowering::R600TargetLowering(TargetMachine &TM) : ++ AMDGPUTargetLowering(TM), ++ TII(static_cast(TM.getInstrInfo())) { ++ setOperationAction(ISD::MUL, MVT::i64, Expand); ++ addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); ++ addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); ++ addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); ++ addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); ++ computeRegisterProperties(); ++ ++ setOperationAction(ISD::FADD, MVT::v4f32, Expand); ++ setOperationAction(ISD::FMUL, MVT::v4f32, Expand); ++ setOperationAction(ISD::FDIV, MVT::v4f32, Expand); ++ setOperationAction(ISD::FSUB, MVT::v4f32, Expand); ++ ++ setOperationAction(ISD::ADD, MVT::v4i32, Expand); ++ setOperationAction(ISD::AND, MVT::v4i32, Expand); ++ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand); ++ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand); ++ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand); ++ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand); ++ setOperationAction(ISD::UDIV, MVT::v4i32, Expand); ++ setOperationAction(ISD::UREM, MVT::v4i32, Expand); ++ setOperationAction(ISD::SETCC, MVT::v4i32, Expand); ++ ++ setOperationAction(ISD::BR_CC, MVT::i32, Custom); ++ setOperationAction(ISD::BR_CC, MVT::f32, Custom); ++ ++ setOperationAction(ISD::FSUB, MVT::f32, Expand); ++ ++ setOperationAction(ISD::INTRINSIC_VOID, 
MVT::Other, Custom); ++ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); ++ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); ++ setOperationAction(ISD::FPOW, MVT::f32, Custom); ++ ++ setOperationAction(ISD::ROTL, MVT::i32, Custom); ++ ++ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); ++ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); ++ ++ setOperationAction(ISD::SETCC, MVT::i32, Custom); ++ setOperationAction(ISD::SETCC, MVT::f32, Custom); ++ setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); ++ ++ setOperationAction(ISD::SELECT, MVT::i32, Custom); ++ setOperationAction(ISD::SELECT, MVT::f32, Custom); ++ ++ setOperationAction(ISD::STORE, MVT::i32, Custom); ++ setOperationAction(ISD::STORE, MVT::v4i32, Custom); ++ ++ setOperationAction(ISD::LOAD, MVT::i32, Custom); ++ setOperationAction(ISD::LOAD, MVT::v4i32, Custom); ++ setTargetDAGCombine(ISD::FP_ROUND); ++ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); ++ ++ setSchedulingPreference(Sched::VLIW); ++} ++ ++MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( ++ MachineInstr * MI, MachineBasicBlock * BB) const { ++ MachineFunction * MF = BB->getParent(); ++ MachineRegisterInfo &MRI = MF->getRegInfo(); ++ MachineBasicBlock::iterator I = *MI; ++ ++ switch (MI->getOpcode()) { ++ default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); ++ case AMDGPU::SHADER_TYPE: break; ++ case AMDGPU::CLAMP_R600: { ++ MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, ++ AMDGPU::MOV, ++ MI->getOperand(0).getReg(), ++ MI->getOperand(1).getReg()); ++ TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); ++ break; ++ } ++ ++ case AMDGPU::FABS_R600: { ++ MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, ++ AMDGPU::MOV, ++ MI->getOperand(0).getReg(), ++ MI->getOperand(1).getReg()); ++ TII->addFlag(NewMI, 0, MO_FLAG_ABS); ++ break; ++ } ++ ++ case AMDGPU::FNEG_R600: { ++ MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, ++ AMDGPU::MOV, ++ 
MI->getOperand(0).getReg(), ++ MI->getOperand(1).getReg()); ++ TII->addFlag(NewMI, 0, MO_FLAG_NEG); ++ break; ++ } ++ ++ case AMDGPU::MASK_WRITE: { ++ unsigned maskedRegister = MI->getOperand(0).getReg(); ++ assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); ++ MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); ++ TII->addFlag(defInstr, 0, MO_FLAG_MASK); ++ break; ++ } ++ ++ case AMDGPU::MOV_IMM_F32: ++ TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), ++ MI->getOperand(1).getFPImm()->getValueAPF() ++ .bitcastToAPInt().getZExtValue()); ++ break; ++ case AMDGPU::MOV_IMM_I32: ++ TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), ++ MI->getOperand(1).getImm()); ++ break; ++ ++ ++ case AMDGPU::RAT_WRITE_CACHELESS_32_eg: ++ case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { ++ unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; ++ ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) ++ .addOperand(MI->getOperand(0)) ++ .addOperand(MI->getOperand(1)) ++ .addImm(EOP); // Set End of program bit ++ break; ++ } ++ ++ case AMDGPU::RESERVE_REG: { ++ R600MachineFunctionInfo * MFI = MF->getInfo(); ++ int64_t ReservedIndex = MI->getOperand(0).getImm(); ++ unsigned ReservedReg = ++ AMDGPU::R600_TReg32RegClass.getRegister(ReservedIndex); ++ MFI->ReservedRegs.push_back(ReservedReg); ++ unsigned SuperReg = ++ AMDGPU::R600_Reg128RegClass.getRegister(ReservedIndex / 4); ++ MFI->ReservedRegs.push_back(SuperReg); ++ break; ++ } ++ ++ case AMDGPU::TXD: { ++ unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); ++ unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); ++ ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) ++ .addOperand(MI->getOperand(3)) ++ .addOperand(MI->getOperand(4)) ++ .addOperand(MI->getOperand(5)) ++ .addOperand(MI->getOperand(6)); ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) ++ .addOperand(MI->getOperand(2)) ++ 
.addOperand(MI->getOperand(4)) ++ .addOperand(MI->getOperand(5)) ++ .addOperand(MI->getOperand(6)); ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) ++ .addOperand(MI->getOperand(0)) ++ .addOperand(MI->getOperand(1)) ++ .addOperand(MI->getOperand(4)) ++ .addOperand(MI->getOperand(5)) ++ .addOperand(MI->getOperand(6)) ++ .addReg(T0, RegState::Implicit) ++ .addReg(T1, RegState::Implicit); ++ break; ++ } ++ ++ case AMDGPU::TXD_SHADOW: { ++ unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); ++ unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); ++ ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) ++ .addOperand(MI->getOperand(3)) ++ .addOperand(MI->getOperand(4)) ++ .addOperand(MI->getOperand(5)) ++ .addOperand(MI->getOperand(6)); ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) ++ .addOperand(MI->getOperand(2)) ++ .addOperand(MI->getOperand(4)) ++ .addOperand(MI->getOperand(5)) ++ .addOperand(MI->getOperand(6)); ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) ++ .addOperand(MI->getOperand(0)) ++ .addOperand(MI->getOperand(1)) ++ .addOperand(MI->getOperand(4)) ++ .addOperand(MI->getOperand(5)) ++ .addOperand(MI->getOperand(6)) ++ .addReg(T0, RegState::Implicit) ++ .addReg(T1, RegState::Implicit); ++ break; ++ } ++ ++ case AMDGPU::BRANCH: ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) ++ .addOperand(MI->getOperand(0)) ++ .addReg(0); ++ break; ++ ++ case AMDGPU::BRANCH_COND_f32: { ++ MachineInstr *NewMI = ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), ++ AMDGPU::PREDICATE_BIT) ++ .addOperand(MI->getOperand(1)) ++ .addImm(OPCODE_IS_NOT_ZERO) ++ .addImm(0); // Flags ++ TII->addFlag(NewMI, 0, MO_FLAG_PUSH); ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) ++ .addOperand(MI->getOperand(0)) ++ .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); ++ break; ++ } ++ ++ case 
AMDGPU::BRANCH_COND_i32: { ++ MachineInstr *NewMI = ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), ++ AMDGPU::PREDICATE_BIT) ++ .addOperand(MI->getOperand(1)) ++ .addImm(OPCODE_IS_NOT_ZERO_INT) ++ .addImm(0); // Flags ++ TII->addFlag(NewMI, 0, MO_FLAG_PUSH); ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) ++ .addOperand(MI->getOperand(0)) ++ .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); ++ break; ++ } ++ ++ case AMDGPU::input_perspective: { ++ R600MachineFunctionInfo *MFI = MF->getInfo(); ++ ++ // XXX Be more fine about register reservation ++ for (unsigned i = 0; i < 4; i ++) { ++ unsigned ReservedReg = AMDGPU::R600_TReg32RegClass.getRegister(i); ++ MFI->ReservedRegs.push_back(ReservedReg); ++ } ++ ++ switch (MI->getOperand(1).getImm()) { ++ case 0:// Perspective ++ MFI->HasPerspectiveInterpolation = true; ++ break; ++ case 1:// Linear ++ MFI->HasLinearInterpolation = true; ++ break; ++ default: ++ assert(0 && "Unknow ij index"); ++ } ++ ++ return BB; ++ } ++ ++ case AMDGPU::EG_ExportSwz: ++ case AMDGPU::R600_ExportSwz: { ++ // Instruction is left unmodified if its not the last one of its type ++ bool isLastInstructionOfItsType = true; ++ unsigned InstExportType = MI->getOperand(1).getImm(); ++ for (MachineBasicBlock::iterator NextExportInst = llvm::next(I), ++ EndBlock = BB->end(); NextExportInst != EndBlock; ++ NextExportInst = llvm::next(NextExportInst)) { ++ if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || ++ NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { ++ unsigned CurrentInstExportType = NextExportInst->getOperand(1) ++ .getImm(); ++ if (CurrentInstExportType == InstExportType) { ++ isLastInstructionOfItsType = false; ++ break; ++ } ++ } ++ } ++ bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0; ++ if (!EOP && !isLastInstructionOfItsType) ++ return BB; ++ unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 
84 : 40; ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) ++ .addOperand(MI->getOperand(0)) ++ .addOperand(MI->getOperand(1)) ++ .addOperand(MI->getOperand(2)) ++ .addOperand(MI->getOperand(3)) ++ .addOperand(MI->getOperand(4)) ++ .addOperand(MI->getOperand(5)) ++ .addOperand(MI->getOperand(6)) ++ .addImm(CfInst) ++ .addImm(EOP); ++ break; ++ } ++ } ++ ++ MI->eraseFromParent(); ++ return BB; ++} ++ ++//===----------------------------------------------------------------------===// ++// Custom DAG Lowering Operations ++//===----------------------------------------------------------------------===// ++ ++using namespace llvm::Intrinsic; ++using namespace llvm::AMDGPUIntrinsic; ++ ++static SDValue ++InsertScalarToRegisterExport(SelectionDAG &DAG, DebugLoc DL, SDNode **ExportMap, ++ unsigned Slot, unsigned Channel, unsigned Inst, unsigned Type, ++ SDValue Scalar, SDValue Chain) { ++ if (!ExportMap[Slot]) { ++ SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, ++ DL, MVT::v4f32, ++ DAG.getUNDEF(MVT::v4f32), ++ Scalar, ++ DAG.getConstant(Channel, MVT::i32)); ++ ++ unsigned Mask = 1 << Channel; ++ ++ const SDValue Ops[] = {Chain, Vector, DAG.getConstant(Inst, MVT::i32), ++ DAG.getConstant(Type, MVT::i32), DAG.getConstant(Slot, MVT::i32), ++ DAG.getConstant(Mask, MVT::i32)}; ++ ++ SDValue Res = DAG.getNode( ++ AMDGPUISD::EXPORT, ++ DL, ++ MVT::Other, ++ Ops, 6); ++ ExportMap[Slot] = Res.getNode(); ++ return Res; ++ } ++ ++ SDNode *ExportInstruction = (SDNode *) ExportMap[Slot] ; ++ SDValue PreviousVector = ExportInstruction->getOperand(1); ++ SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, ++ DL, MVT::v4f32, ++ PreviousVector, ++ Scalar, ++ DAG.getConstant(Channel, MVT::i32)); ++ ++ unsigned Mask = dyn_cast(ExportInstruction->getOperand(5)) ++ ->getZExtValue(); ++ Mask |= (1 << Channel); ++ ++ const SDValue Ops[] = {ExportInstruction->getOperand(0), Vector, ++ DAG.getConstant(Inst, MVT::i32), ++ DAG.getConstant(Type, MVT::i32), ++ 
DAG.getConstant(Slot, MVT::i32), ++ DAG.getConstant(Mask, MVT::i32)}; ++ ++ DAG.UpdateNodeOperands(ExportInstruction, ++ Ops, 6); ++ ++ return Chain; ++ ++} ++ ++SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { ++ switch (Op.getOpcode()) { ++ default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); ++ case ISD::BR_CC: return LowerBR_CC(Op, DAG); ++ case ISD::ROTL: return LowerROTL(Op, DAG); ++ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); ++ case ISD::SELECT: return LowerSELECT(Op, DAG); ++ case ISD::SETCC: return LowerSETCC(Op, DAG); ++ case ISD::STORE: return LowerSTORE(Op, DAG); ++ case ISD::LOAD: return LowerLOAD(Op, DAG); ++ case ISD::FPOW: return LowerFPOW(Op, DAG); ++ case ISD::INTRINSIC_VOID: { ++ SDValue Chain = Op.getOperand(0); ++ unsigned IntrinsicID = ++ cast(Op.getOperand(1))->getZExtValue(); ++ switch (IntrinsicID) { ++ case AMDGPUIntrinsic::AMDGPU_store_output: { ++ MachineFunction &MF = DAG.getMachineFunction(); ++ MachineRegisterInfo &MRI = MF.getRegInfo(); ++ int64_t RegIndex = cast(Op.getOperand(3))->getZExtValue(); ++ unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); ++ if (!MRI.isLiveOut(Reg)) { ++ MRI.addLiveOut(Reg); ++ } ++ return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2)); ++ } ++ case AMDGPUIntrinsic::R600_store_pixel_color: { ++ MachineFunction &MF = DAG.getMachineFunction(); ++ R600MachineFunctionInfo *MFI = MF.getInfo(); ++ int64_t RegIndex = cast(Op.getOperand(3))->getZExtValue(); ++ ++ SDNode **OutputsMap = MFI->Outputs; ++ return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap, ++ RegIndex / 4, RegIndex % 4, 0, 0, Op.getOperand(2), ++ Chain); ++ ++ } ++ ++ // default for switch(IntrinsicID) ++ default: break; ++ } ++ // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) ++ break; ++ } ++ case ISD::INTRINSIC_WO_CHAIN: { ++ unsigned IntrinsicID = ++ cast(Op.getOperand(0))->getZExtValue(); ++ EVT VT = Op.getValueType(); ++ 
DebugLoc DL = Op.getDebugLoc(); ++ switch(IntrinsicID) { ++ default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); ++ case AMDGPUIntrinsic::R600_load_input: { ++ int64_t RegIndex = cast(Op.getOperand(1))->getZExtValue(); ++ unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); ++ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT); ++ } ++ case AMDGPUIntrinsic::R600_load_input_perspective: { ++ int slot = cast(Op.getOperand(1))->getZExtValue(); ++ if (slot < 0) ++ return DAG.getUNDEF(MVT::f32); ++ SDValue FullVector = DAG.getNode( ++ AMDGPUISD::INTERP, ++ DL, MVT::v4f32, ++ DAG.getConstant(0, MVT::i32), DAG.getConstant(slot / 4 , MVT::i32)); ++ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, ++ DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32)); ++ } ++ case AMDGPUIntrinsic::R600_load_input_linear: { ++ int slot = cast(Op.getOperand(1))->getZExtValue(); ++ if (slot < 0) ++ return DAG.getUNDEF(MVT::f32); ++ SDValue FullVector = DAG.getNode( ++ AMDGPUISD::INTERP, ++ DL, MVT::v4f32, ++ DAG.getConstant(1, MVT::i32), DAG.getConstant(slot / 4 , MVT::i32)); ++ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, ++ DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32)); ++ } ++ case AMDGPUIntrinsic::R600_load_input_constant: { ++ int slot = cast(Op.getOperand(1))->getZExtValue(); ++ if (slot < 0) ++ return DAG.getUNDEF(MVT::f32); ++ SDValue FullVector = DAG.getNode( ++ AMDGPUISD::INTERP_P0, ++ DL, MVT::v4f32, ++ DAG.getConstant(slot / 4 , MVT::i32)); ++ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, ++ DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32)); ++ } ++ ++ case r600_read_ngroups_x: ++ return LowerImplicitParameter(DAG, VT, DL, 0); ++ case r600_read_ngroups_y: ++ return LowerImplicitParameter(DAG, VT, DL, 1); ++ case r600_read_ngroups_z: ++ return LowerImplicitParameter(DAG, VT, DL, 2); ++ case r600_read_global_size_x: ++ return LowerImplicitParameter(DAG, VT, DL, 3); ++ case r600_read_global_size_y: ++ return 
LowerImplicitParameter(DAG, VT, DL, 4); ++ case r600_read_global_size_z: ++ return LowerImplicitParameter(DAG, VT, DL, 5); ++ case r600_read_local_size_x: ++ return LowerImplicitParameter(DAG, VT, DL, 6); ++ case r600_read_local_size_y: ++ return LowerImplicitParameter(DAG, VT, DL, 7); ++ case r600_read_local_size_z: ++ return LowerImplicitParameter(DAG, VT, DL, 8); ++ ++ case r600_read_tgid_x: ++ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, ++ AMDGPU::T1_X, VT); ++ case r600_read_tgid_y: ++ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, ++ AMDGPU::T1_Y, VT); ++ case r600_read_tgid_z: ++ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, ++ AMDGPU::T1_Z, VT); ++ case r600_read_tidig_x: ++ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, ++ AMDGPU::T0_X, VT); ++ case r600_read_tidig_y: ++ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, ++ AMDGPU::T0_Y, VT); ++ case r600_read_tidig_z: ++ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, ++ AMDGPU::T0_Z, VT); ++ } ++ // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) ++ break; ++ } ++ } // end switch(Op.getOpcode()) ++ return SDValue(); ++} ++ ++void R600TargetLowering::ReplaceNodeResults(SDNode *N, ++ SmallVectorImpl &Results, ++ SelectionDAG &DAG) const { ++ switch (N->getOpcode()) { ++ default: return; ++ case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); ++ return; ++ case ISD::LOAD: { ++ SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); ++ Results.push_back(SDValue(Node, 0)); ++ Results.push_back(SDValue(Node, 1)); ++ // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode ++ // function ++ DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); ++ return; ++ } ++ } ++} ++ ++SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { ++ return DAG.getNode( ++ ISD::SETCC, ++ Op.getDebugLoc(), ++ MVT::i1, ++ Op, 
DAG.getConstantFP(0.0f, MVT::f32), ++ DAG.getCondCode(ISD::SETNE) ++ ); ++} ++ ++SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { ++ SDValue Chain = Op.getOperand(0); ++ SDValue CC = Op.getOperand(1); ++ SDValue LHS = Op.getOperand(2); ++ SDValue RHS = Op.getOperand(3); ++ SDValue JumpT = Op.getOperand(4); ++ SDValue CmpValue; ++ SDValue Result; ++ ++ if (LHS.getValueType() == MVT::i32) { ++ CmpValue = DAG.getNode( ++ ISD::SELECT_CC, ++ Op.getDebugLoc(), ++ MVT::i32, ++ LHS, RHS, ++ DAG.getConstant(-1, MVT::i32), ++ DAG.getConstant(0, MVT::i32), ++ CC); ++ } else if (LHS.getValueType() == MVT::f32) { ++ CmpValue = DAG.getNode( ++ ISD::SELECT_CC, ++ Op.getDebugLoc(), ++ MVT::f32, ++ LHS, RHS, ++ DAG.getConstantFP(1.0f, MVT::f32), ++ DAG.getConstantFP(0.0f, MVT::f32), ++ CC); ++ } else { ++ assert(0 && "Not valid type for br_cc"); ++ } ++ Result = DAG.getNode( ++ AMDGPUISD::BRANCH_COND, ++ CmpValue.getDebugLoc(), ++ MVT::Other, Chain, ++ JumpT, CmpValue); ++ return Result; ++} ++ ++SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, ++ DebugLoc DL, ++ unsigned DwordOffset) const { ++ unsigned ByteOffset = DwordOffset * 4; ++ PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), ++ AMDGPUAS::PARAM_I_ADDRESS); ++ ++ // We shouldn't be using an offset wider than 16-bits for implicit parameters. 
++ assert(isInt<16>(ByteOffset)); ++ ++ return DAG.getLoad(VT, DL, DAG.getEntryNode(), ++ DAG.getConstant(ByteOffset, MVT::i32), // PTR ++ MachinePointerInfo(ConstantPointerNull::get(PtrType)), ++ false, false, false, 0); ++} ++ ++SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const { ++ DebugLoc DL = Op.getDebugLoc(); ++ EVT VT = Op.getValueType(); ++ ++ return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT, ++ Op.getOperand(0), ++ Op.getOperand(0), ++ DAG.getNode(ISD::SUB, DL, VT, ++ DAG.getConstant(32, MVT::i32), ++ Op.getOperand(1))); ++} ++ ++bool R600TargetLowering::isZero(SDValue Op) const { ++ if(ConstantSDNode *Cst = dyn_cast(Op)) { ++ return Cst->isNullValue(); ++ } else if(ConstantFPSDNode *CstFP = dyn_cast(Op)){ ++ return CstFP->isZero(); ++ } else { ++ return false; ++ } ++} ++ ++SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { ++ DebugLoc DL = Op.getDebugLoc(); ++ EVT VT = Op.getValueType(); ++ ++ SDValue LHS = Op.getOperand(0); ++ SDValue RHS = Op.getOperand(1); ++ SDValue True = Op.getOperand(2); ++ SDValue False = Op.getOperand(3); ++ SDValue CC = Op.getOperand(4); ++ SDValue Temp; ++ ++ // LHS and RHS are guaranteed to be the same value type ++ EVT CompareVT = LHS.getValueType(); ++ ++ // Check if we can lower this to a native operation. ++ ++ // Try to lower to a CND* instruction: ++ // CND* instructions requires RHS to be zero. Some SELECT_CC nodes that ++ // can be lowered to CND* instructions can also be lowered to SET* ++ // instructions. CND* instructions are cheaper, because they dont't ++ // require additional instructions to convert their result to the correct ++ // value type, so this check should be first. ++ if (isZero(LHS) || isZero(RHS)) { ++ SDValue Cond = (isZero(LHS) ? RHS : LHS); ++ SDValue Zero = (isZero(LHS) ? LHS : RHS); ++ ISD::CondCode CCOpcode = cast(CC)->get(); ++ if (CompareVT != VT) { ++ // Bitcast True / False to the correct types. 
This will end up being ++ // a nop, but it allows us to define only a single pattern in the ++ // .TD files for each CND* instruction rather than having to have ++ // one pattern for integer True/False and one for fp True/False ++ True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True); ++ False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False); ++ } ++ if (isZero(LHS)) { ++ CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode); ++ } ++ ++ switch (CCOpcode) { ++ case ISD::SETONE: ++ case ISD::SETUNE: ++ case ISD::SETNE: ++ case ISD::SETULE: ++ case ISD::SETULT: ++ case ISD::SETOLE: ++ case ISD::SETOLT: ++ case ISD::SETLE: ++ case ISD::SETLT: ++ CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); ++ Temp = True; ++ True = False; ++ False = Temp; ++ break; ++ default: ++ break; ++ } ++ SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, ++ Cond, Zero, ++ True, False, ++ DAG.getCondCode(CCOpcode)); ++ return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode); ++ } ++ ++ // Try to lower to a SET* instruction: ++ // We need all the operands of SELECT_CC to have the same value type, so if ++ // necessary we need to change True and False to be the same type as LHS and ++ // RHS, and then convert the result of the select_cc back to the correct type. ++ ++ // Move hardware True/False values to the correct operand. ++ if (isHWTrueValue(False) && isHWFalseValue(True)) { ++ ISD::CondCode CCOpcode = cast(CC)->get(); ++ std::swap(False, True); ++ CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32)); ++ } ++ ++ if (isHWTrueValue(True) && isHWFalseValue(False)) { ++ if (CompareVT != VT) { ++ if (VT == MVT::f32 && CompareVT == MVT::i32) { ++ SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, ++ LHS, RHS, ++ DAG.getConstant(-1, MVT::i32), ++ DAG.getConstant(0, MVT::i32), ++ CC); ++ // Convert integer values of true (-1) and false (0) to fp values of ++ // true (1.0f) and false (0.0f). 
++ SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean, ++ DAG.getConstant(1, MVT::i32)); ++ return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB); ++ } else if (VT == MVT::i32 && CompareVT == MVT::f32) { ++ SDValue BoolAsFlt = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, ++ LHS, RHS, ++ DAG.getConstantFP(1.0f, MVT::f32), ++ DAG.getConstantFP(0.0f, MVT::f32), ++ CC); ++ // Convert fp values of true (1.0f) and false (0.0f) to integer values ++ // of true (-1) and false (0). ++ SDValue Neg = DAG.getNode(ISD::FNEG, DL, MVT::f32, BoolAsFlt); ++ return DAG.getNode(ISD::FP_TO_SINT, DL, VT, Neg); ++ } else { ++ // I don't think there will be any other type pairings. ++ assert(!"Unhandled operand type parings in SELECT_CC"); ++ } ++ } else { ++ // This SELECT_CC is already legal. ++ return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC); ++ } ++ } ++ ++ // Possible Min/Max pattern ++ SDValue MinMax = LowerMinMax(Op, DAG); ++ if (MinMax.getNode()) { ++ return MinMax; ++ } ++ ++ // If we make it this for it means we have no native instructions to handle ++ // this SELECT_CC, so we must lower it. ++ SDValue HWTrue, HWFalse; ++ ++ if (CompareVT == MVT::f32) { ++ HWTrue = DAG.getConstantFP(1.0f, CompareVT); ++ HWFalse = DAG.getConstantFP(0.0f, CompareVT); ++ } else if (CompareVT == MVT::i32) { ++ HWTrue = DAG.getConstant(-1, CompareVT); ++ HWFalse = DAG.getConstant(0, CompareVT); ++ } ++ else { ++ assert(!"Unhandled value type in LowerSELECT_CC"); ++ } ++ ++ // Lower this unsupported SELECT_CC into a combination of two supported ++ // SELECT_CC operations. 
++ SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC); ++ ++ return DAG.getNode(ISD::SELECT_CC, DL, VT, ++ Cond, HWFalse, ++ True, False, ++ DAG.getCondCode(ISD::SETNE)); ++} ++ ++SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { ++ return DAG.getNode(ISD::SELECT_CC, ++ Op.getDebugLoc(), ++ Op.getValueType(), ++ Op.getOperand(0), ++ DAG.getConstant(0, MVT::i32), ++ Op.getOperand(1), ++ Op.getOperand(2), ++ DAG.getCondCode(ISD::SETNE)); ++} ++ ++SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { ++ SDValue Cond; ++ SDValue LHS = Op.getOperand(0); ++ SDValue RHS = Op.getOperand(1); ++ SDValue CC = Op.getOperand(2); ++ DebugLoc DL = Op.getDebugLoc(); ++ assert(Op.getValueType() == MVT::i32); ++ if (LHS.getValueType() == MVT::i32) { ++ Cond = DAG.getNode( ++ ISD::SELECT_CC, ++ Op.getDebugLoc(), ++ MVT::i32, ++ LHS, RHS, ++ DAG.getConstant(-1, MVT::i32), ++ DAG.getConstant(0, MVT::i32), ++ CC); ++ } else if (LHS.getValueType() == MVT::f32) { ++ Cond = DAG.getNode( ++ ISD::SELECT_CC, ++ Op.getDebugLoc(), ++ MVT::f32, ++ LHS, RHS, ++ DAG.getConstantFP(1.0f, MVT::f32), ++ DAG.getConstantFP(0.0f, MVT::f32), ++ CC); ++ Cond = DAG.getNode( ++ ISD::FP_TO_SINT, ++ DL, ++ MVT::i32, ++ Cond); ++ } else { ++ assert(0 && "Not valid type for set_cc"); ++ } ++ Cond = DAG.getNode( ++ ISD::AND, ++ DL, ++ MVT::i32, ++ DAG.getConstant(1, MVT::i32), ++ Cond); ++ return Cond; ++} ++ ++SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { ++ DebugLoc DL = Op.getDebugLoc(); ++ StoreSDNode *StoreNode = cast(Op); ++ SDValue Chain = Op.getOperand(0); ++ SDValue Value = Op.getOperand(1); ++ SDValue Ptr = Op.getOperand(2); ++ ++ if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && ++ Ptr->getOpcode() != AMDGPUISD::DWORDADDR) { ++ // Convert pointer from byte address to dword address. 
++ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), ++ DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), ++ Ptr, DAG.getConstant(2, MVT::i32))); ++ ++ if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { ++ assert(!"Truncated and indexed stores not supported yet"); ++ } else { ++ Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); ++ } ++ return Chain; ++ } ++ return SDValue(); ++} ++ ++// return (512 + (kc_bank << 12) ++static int ++ConstantAddressBlock(unsigned AddressSpace) { ++ switch (AddressSpace) { ++ case AMDGPUAS::CONSTANT_BUFFER_0: ++ return 512; ++ case AMDGPUAS::CONSTANT_BUFFER_1: ++ return 512 + 4096; ++ case AMDGPUAS::CONSTANT_BUFFER_2: ++ return 512 + 4096 * 2; ++ case AMDGPUAS::CONSTANT_BUFFER_3: ++ return 512 + 4096 * 3; ++ case AMDGPUAS::CONSTANT_BUFFER_4: ++ return 512 + 4096 * 4; ++ case AMDGPUAS::CONSTANT_BUFFER_5: ++ return 512 + 4096 * 5; ++ case AMDGPUAS::CONSTANT_BUFFER_6: ++ return 512 + 4096 * 6; ++ case AMDGPUAS::CONSTANT_BUFFER_7: ++ return 512 + 4096 * 7; ++ case AMDGPUAS::CONSTANT_BUFFER_8: ++ return 512 + 4096 * 8; ++ case AMDGPUAS::CONSTANT_BUFFER_9: ++ return 512 + 4096 * 9; ++ case AMDGPUAS::CONSTANT_BUFFER_10: ++ return 512 + 4096 * 10; ++ case AMDGPUAS::CONSTANT_BUFFER_11: ++ return 512 + 4096 * 11; ++ case AMDGPUAS::CONSTANT_BUFFER_12: ++ return 512 + 4096 * 12; ++ case AMDGPUAS::CONSTANT_BUFFER_13: ++ return 512 + 4096 * 13; ++ case AMDGPUAS::CONSTANT_BUFFER_14: ++ return 512 + 4096 * 14; ++ case AMDGPUAS::CONSTANT_BUFFER_15: ++ return 512 + 4096 * 15; ++ default: ++ return -1; ++ } ++} ++ ++SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const ++{ ++ EVT VT = Op.getValueType(); ++ DebugLoc DL = Op.getDebugLoc(); ++ LoadSDNode *LoadNode = cast(Op); ++ SDValue Chain = Op.getOperand(0); ++ SDValue Ptr = Op.getOperand(1); ++ SDValue LoweredLoad; ++ ++ int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); ++ if (ConstantBlock > -1) { ++ SDValue 
Result; ++ if (dyn_cast(LoadNode->getSrcValue()) || ++ dyn_cast(LoadNode->getSrcValue())) { ++ SDValue Slots[4]; ++ for (unsigned i = 0; i < 4; i++) { ++ // We want Const position encoded with the following formula : ++ // (((512 + (kc_bank << 12) + const_index) << 2) + chan) ++ // const_index is Ptr computed by llvm using an alignment of 16. ++ // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and ++ // then div by 4 at the ISel step ++ SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, ++ DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32)); ++ Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); ++ } ++ Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4); ++ } else { ++ // non constant ptr cant be folded, keeps it as a v4f32 load ++ Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, ++ DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)) ++ ); ++ } ++ ++ if (!VT.isVector()) { ++ Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, ++ DAG.getConstant(0, MVT::i32)); ++ } ++ ++ SDValue MergedValues[2] = { ++ Result, ++ Chain ++ }; ++ return DAG.getMergeValues(MergedValues, 2, DL); ++ } ++ ++ return SDValue(); ++} ++ ++SDValue R600TargetLowering::LowerFPOW(SDValue Op, ++ SelectionDAG &DAG) const { ++ DebugLoc DL = Op.getDebugLoc(); ++ EVT VT = Op.getValueType(); ++ SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0)); ++ SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase); ++ return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase); ++} ++ ++/// XXX Only kernel functions are supported, so we can assume for now that ++/// every function is a kernel function, but in the future we should use ++/// separate calling conventions for kernel and non-kernel functions. 
++SDValue R600TargetLowering::LowerFormalArguments( ++ SDValue Chain, ++ CallingConv::ID CallConv, ++ bool isVarArg, ++ const SmallVectorImpl &Ins, ++ DebugLoc DL, SelectionDAG &DAG, ++ SmallVectorImpl &InVals) const { ++ unsigned ParamOffsetBytes = 36; ++ Function::const_arg_iterator FuncArg = ++ DAG.getMachineFunction().getFunction()->arg_begin(); ++ for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) { ++ EVT VT = Ins[i].VT; ++ Type *ArgType = FuncArg->getType(); ++ unsigned ArgSizeInBits = ArgType->isPointerTy() ? ++ 32 : ArgType->getPrimitiveSizeInBits(); ++ unsigned ArgBytes = ArgSizeInBits >> 3; ++ EVT ArgVT; ++ if (ArgSizeInBits < VT.getSizeInBits()) { ++ assert(!ArgType->isFloatTy() && ++ "Extending floating point arguments not supported yet"); ++ ArgVT = MVT::getIntegerVT(ArgSizeInBits); ++ } else { ++ ArgVT = VT; ++ } ++ PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), ++ AMDGPUAS::PARAM_I_ADDRESS); ++ SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(), ++ DAG.getConstant(ParamOffsetBytes, MVT::i32), ++ MachinePointerInfo(new Argument(PtrTy)), ++ ArgVT, false, false, ArgBytes); ++ InVals.push_back(Arg); ++ ParamOffsetBytes += ArgBytes; ++ } ++ return Chain; ++} ++ ++EVT R600TargetLowering::getSetCCResultType(EVT VT) const { ++ if (!VT.isVector()) return MVT::i32; ++ return VT.changeVectorElementTypeToInteger(); ++} ++ ++//===----------------------------------------------------------------------===// ++// Custom DAG Optimizations ++//===----------------------------------------------------------------------===// ++ ++SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, ++ DAGCombinerInfo &DCI) const { ++ SelectionDAG &DAG = DCI.DAG; ++ ++ switch (N->getOpcode()) { ++ // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) ++ case ISD::FP_ROUND: { ++ SDValue Arg = N->getOperand(0); ++ if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) { ++ return DAG.getNode(ISD::UINT_TO_FP, 
N->getDebugLoc(), N->getValueType(0), ++ Arg.getOperand(0)); ++ } ++ break; ++ } ++ // Extract_vec (Build_vector) generated by custom lowering ++ // also needs to be customly combined ++ case ISD::EXTRACT_VECTOR_ELT: { ++ SDValue Arg = N->getOperand(0); ++ if (Arg.getOpcode() == ISD::BUILD_VECTOR) { ++ if (ConstantSDNode *Const = dyn_cast(N->getOperand(1))) { ++ unsigned Element = Const->getZExtValue(); ++ return Arg->getOperand(Element); ++ } ++ } ++ } ++ } ++ return SDValue(); ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600ISelLowering.h llvm-r600/lib/Target/R600/R600ISelLowering.h +--- llvm-3.2.src/lib/Target/R600/R600ISelLowering.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/R600ISelLowering.h 2013-01-25 19:43:57.463383054 +0100 +@@ -0,0 +1,73 @@ ++//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief R600 DAG Lowering interface definition ++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef R600ISELLOWERING_H ++#define R600ISELLOWERING_H ++ ++#include "AMDGPUISelLowering.h" ++ ++namespace llvm { ++ ++class R600InstrInfo; ++ ++class R600TargetLowering : public AMDGPUTargetLowering { ++public: ++ R600TargetLowering(TargetMachine &TM); ++ virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, ++ MachineBasicBlock * BB) const; ++ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; ++ virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; ++ void ReplaceNodeResults(SDNode * N, ++ SmallVectorImpl &Results, ++ SelectionDAG &DAG) const; ++ virtual SDValue LowerFormalArguments( ++ SDValue Chain, ++ CallingConv::ID CallConv, ++ bool isVarArg, ++ const SmallVectorImpl &Ins, ++ DebugLoc DL, SelectionDAG &DAG, ++ SmallVectorImpl &InVals) const; ++ virtual EVT getSetCCResultType(EVT VT) const; ++private: ++ const R600InstrInfo * TII; ++ ++ /// Each OpenCL kernel has nine implicit parameters that are stored in the ++ /// first nine dwords of a Vertex Buffer. These implicit parameters are ++ /// lowered to load instructions which retreive the values from the Vertex ++ /// Buffer. 
++ SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, ++ DebugLoc DL, unsigned DwordOffset) const; ++ ++ void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, ++ MachineRegisterInfo & MRI, unsigned dword_offset) const; ++ ++ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; ++ ++ /// \brief Lower ROTL opcode to BITALIGN ++ SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const; ++ ++ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerFPOW(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; ++ ++ bool isZero(SDValue Op) const; ++}; ++ ++} // End namespace llvm; ++ ++#endif // R600ISELLOWERING_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600LowerConstCopy.cpp llvm-r600/lib/Target/R600/R600LowerConstCopy.cpp +--- llvm-3.2.src/lib/Target/R600/R600LowerConstCopy.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/R600LowerConstCopy.cpp 2013-01-25 19:43:57.466716387 +0100 +@@ -0,0 +1,74 @@ ++//===-- R600LowerConstCopy.cpp - Propagate ConstCopy / lower them to MOV---===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// This pass is intended to handle remaining ConstCopy pseudo MachineInstr. ++/// ISel will fold each Const Buffer read inside scalar ALU. However it cannot ++/// fold them inside vector instruction, like DOT4 or Cube ; ISel emits ++/// ConstCopy instead. 
This pass (executed after ExpandingSpecialInstr) will try ++/// to fold them if possible or replace them by MOV otherwise. ++/// TODO : Implement the folding part, using Copy Propagation algorithm. ++// ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPU.h" ++#include "llvm/CodeGen/MachineFunction.h" ++#include "llvm/CodeGen/MachineFunctionPass.h" ++#include "R600InstrInfo.h" ++#include "llvm/GlobalValue.h" ++#include "llvm/CodeGen/MachineInstrBuilder.h" ++ ++namespace llvm { ++ ++class R600LowerConstCopy : public MachineFunctionPass { ++private: ++ static char ID; ++ const R600InstrInfo *TII; ++public: ++ R600LowerConstCopy(TargetMachine &tm); ++ virtual bool runOnMachineFunction(MachineFunction &MF); ++ ++ const char *getPassName() const { return "R600 Eliminate Symbolic Operand"; } ++}; ++ ++char R600LowerConstCopy::ID = 0; ++ ++ ++R600LowerConstCopy::R600LowerConstCopy(TargetMachine &tm) : ++ MachineFunctionPass(ID), ++ TII (static_cast(tm.getInstrInfo())) ++{ ++} ++ ++bool R600LowerConstCopy::runOnMachineFunction(MachineFunction &MF) { ++ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); ++ BB != BB_E; ++BB) { ++ MachineBasicBlock &MBB = *BB; ++ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); ++ I != E;) { ++ MachineInstr &MI = *I; ++ I = llvm::next(I); ++ if (MI.getOpcode() != AMDGPU::CONST_COPY) ++ continue; ++ MachineInstr *NewMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::MOV, ++ MI.getOperand(0).getReg(), AMDGPU::ALU_CONST); ++ NewMI->getOperand(9).setImm(MI.getOperand(1).getImm()); ++ MI.eraseFromParent(); ++ } ++ } ++ return false; ++} ++ ++FunctionPass *createR600LowerConstCopy(TargetMachine &tm) { ++ return new R600LowerConstCopy(tm); ++} ++ ++} ++ ++ +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600MachineFunctionInfo.cpp llvm-r600/lib/Target/R600/R600MachineFunctionInfo.cpp +--- llvm-3.2.src/lib/Target/R600/R600MachineFunctionInfo.cpp 1970-01-01 
01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/R600MachineFunctionInfo.cpp 2013-01-25 19:43:57.470049720 +0100 +@@ -0,0 +1,33 @@ ++//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++/// \file ++//===----------------------------------------------------------------------===// ++ ++#include "R600MachineFunctionInfo.h" ++ ++using namespace llvm; ++ ++R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF) ++ : MachineFunctionInfo(), ++ HasLinearInterpolation(false), ++ HasPerspectiveInterpolation(false) { ++ memset(Outputs, 0, sizeof(Outputs)); ++ } ++ ++unsigned R600MachineFunctionInfo::GetIJPerspectiveIndex() const { ++ assert(HasPerspectiveInterpolation); ++ return 0; ++} ++ ++unsigned R600MachineFunctionInfo::GetIJLinearIndex() const { ++ assert(HasLinearInterpolation); ++ if (HasPerspectiveInterpolation) ++ return 1; ++ else ++ return 0; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600MachineFunctionInfo.h llvm-r600/lib/Target/R600/R600MachineFunctionInfo.h +--- llvm-3.2.src/lib/Target/R600/R600MachineFunctionInfo.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/R600MachineFunctionInfo.h 2013-01-25 19:43:57.470049720 +0100 +@@ -0,0 +1,38 @@ ++//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++//===----------------------------------------------------------------------===// ++ ++#ifndef R600MACHINEFUNCTIONINFO_H ++#define R600MACHINEFUNCTIONINFO_H ++ ++#include "llvm/CodeGen/MachineFunction.h" ++#include "llvm/CodeGen/SelectionDAG.h" ++#include ++ ++namespace llvm { ++ ++class R600MachineFunctionInfo : public MachineFunctionInfo { ++ ++public: ++ R600MachineFunctionInfo(const MachineFunction &MF); ++ std::vector ReservedRegs; ++ SDNode *Outputs[16]; ++ bool HasLinearInterpolation; ++ bool HasPerspectiveInterpolation; ++ ++ unsigned GetIJLinearIndex() const; ++ unsigned GetIJPerspectiveIndex() const; ++ ++}; ++ ++} // End llvm namespace ++ ++#endif //R600MACHINEFUNCTIONINFO_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600RegisterInfo.cpp llvm-r600/lib/Target/R600/R600RegisterInfo.cpp +--- llvm-3.2.src/lib/Target/R600/R600RegisterInfo.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/R600RegisterInfo.cpp 2013-01-25 19:43:57.470049720 +0100 +@@ -0,0 +1,85 @@ ++//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief R600 implementation of the TargetRegisterInfo class. 
++// ++//===----------------------------------------------------------------------===// ++ ++#include "R600RegisterInfo.h" ++#include "AMDGPUTargetMachine.h" ++#include "R600Defines.h" ++#include "R600MachineFunctionInfo.h" ++ ++using namespace llvm; ++ ++R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm, ++ const TargetInstrInfo &tii) ++: AMDGPURegisterInfo(tm, tii), ++ TM(tm), ++ TII(tii) ++ { } ++ ++BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { ++ BitVector Reserved(getNumRegs()); ++ const R600MachineFunctionInfo * MFI = MF.getInfo(); ++ ++ Reserved.set(AMDGPU::ZERO); ++ Reserved.set(AMDGPU::HALF); ++ Reserved.set(AMDGPU::ONE); ++ Reserved.set(AMDGPU::ONE_INT); ++ Reserved.set(AMDGPU::NEG_HALF); ++ Reserved.set(AMDGPU::NEG_ONE); ++ Reserved.set(AMDGPU::PV_X); ++ Reserved.set(AMDGPU::ALU_LITERAL_X); ++ Reserved.set(AMDGPU::ALU_CONST); ++ Reserved.set(AMDGPU::PREDICATE_BIT); ++ Reserved.set(AMDGPU::PRED_SEL_OFF); ++ Reserved.set(AMDGPU::PRED_SEL_ZERO); ++ Reserved.set(AMDGPU::PRED_SEL_ONE); ++ ++ for (std::vector::const_iterator I = MFI->ReservedRegs.begin(), ++ E = MFI->ReservedRegs.end(); I != E; ++I) { ++ Reserved.set(*I); ++ } ++ ++ return Reserved; ++} ++ ++const TargetRegisterClass * ++R600RegisterInfo::getISARegClass(const TargetRegisterClass * rc) const { ++ switch (rc->getID()) { ++ case AMDGPU::GPRF32RegClassID: ++ case AMDGPU::GPRI32RegClassID: ++ return &AMDGPU::R600_Reg32RegClass; ++ default: return rc; ++ } ++} ++ ++unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { ++ return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; ++} ++ ++const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass( ++ MVT VT) const { ++ switch(VT.SimpleTy) { ++ default: ++ case MVT::i32: return &AMDGPU::R600_TReg32RegClass; ++ } ++} ++ ++unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) const { ++ switch (Channel) { ++ default: assert(!"Invalid channel index"); return 0; ++ case 0: return 
AMDGPU::sel_x; ++ case 1: return AMDGPU::sel_y; ++ case 2: return AMDGPU::sel_z; ++ case 3: return AMDGPU::sel_w; ++ } ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600RegisterInfo.h llvm-r600/lib/Target/R600/R600RegisterInfo.h +--- llvm-3.2.src/lib/Target/R600/R600RegisterInfo.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/R600RegisterInfo.h 2013-01-25 19:43:57.470049720 +0100 +@@ -0,0 +1,55 @@ ++//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Interface definition for R600RegisterInfo ++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef R600REGISTERINFO_H_ ++#define R600REGISTERINFO_H_ ++ ++#include "AMDGPUTargetMachine.h" ++#include "AMDGPURegisterInfo.h" ++ ++namespace llvm { ++ ++class R600TargetMachine; ++class TargetInstrInfo; ++ ++struct R600RegisterInfo : public AMDGPURegisterInfo { ++ AMDGPUTargetMachine &TM; ++ const TargetInstrInfo &TII; ++ ++ R600RegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii); ++ ++ virtual BitVector getReservedRegs(const MachineFunction &MF) const; ++ ++ /// \param RC is an AMDIL reg class. ++ /// ++ /// \returns the R600 reg class that is equivalent to \p RC. ++ virtual const TargetRegisterClass *getISARegClass( ++ const TargetRegisterClass *RC) const; ++ ++ /// \brief get the HW encoding for a register's channel. ++ unsigned getHWRegChan(unsigned reg) const; ++ ++ /// \brief get the register class of the specified type to use in the ++ /// CFGStructurizer ++ virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const; ++ ++ /// \returns the sub reg enum value for the given \p Channel ++ /// (e.g. 
getSubRegFromChannel(0) -> AMDGPU::sel_x) ++ unsigned getSubRegFromChannel(unsigned Channel) const; ++ ++}; ++ ++} // End namespace llvm ++ ++#endif // R600REGISTERINFO_H_ +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600RegisterInfo.td llvm-r600/lib/Target/R600/R600RegisterInfo.td +--- llvm-3.2.src/lib/Target/R600/R600RegisterInfo.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/R600RegisterInfo.td 2013-01-25 19:43:57.470049720 +0100 +@@ -0,0 +1,101 @@ ++ ++class R600Reg encoding> : Register { ++ let Namespace = "AMDGPU"; ++ let HWEncoding = encoding; ++} ++ ++class R600RegWithChan sel, string chan> : ++ Register { ++ ++ field bits<2> chan_encoding = !if(!eq(chan, "X"), 0, ++ !if(!eq(chan, "Y"), 1, ++ !if(!eq(chan, "Z"), 2, ++ !if(!eq(chan, "W"), 3, 0)))); ++ let HWEncoding{8-0} = sel; ++ let HWEncoding{10-9} = chan_encoding; ++ let Namespace = "AMDGPU"; ++} ++ ++class R600Reg_128 subregs, bits<16> encoding> : ++ RegisterWithSubRegs { ++ let Namespace = "AMDGPU"; ++ let SubRegIndices = [sel_x, sel_y, sel_z, sel_w]; ++ let HWEncoding = encoding; ++} ++ ++foreach Index = 0-127 in { ++ foreach Chan = [ "X", "Y", "Z", "W" ] in { ++ // 32-bit Temporary Registers ++ def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>; ++ } ++ // 128-bit Temporary Registers ++ def T#Index#_XYZW : R600Reg_128 <"T"#Index#".XYZW", ++ [!cast("T"#Index#"_X"), ++ !cast("T"#Index#"_Y"), ++ !cast("T"#Index#"_Z"), ++ !cast("T"#Index#"_W")], ++ Index>; ++} ++ ++// Array Base Register holding input in FS ++foreach Index = 448-464 in { ++ def ArrayBase#Index : R600Reg<"ARRAY_BASE", Index>; ++} ++ ++ ++// Special Registers ++ ++def ZERO : R600Reg<"0.0", 248>; ++def ONE : R600Reg<"1.0", 249>; ++def NEG_ONE : R600Reg<"-1.0", 249>; ++def ONE_INT : R600Reg<"1", 250>; ++def HALF : R600Reg<"0.5", 252>; ++def NEG_HALF : R600Reg<"-0.5", 252>; ++def ALU_LITERAL_X : R600Reg<"literal.x", 253>; ++def PV_X : R600Reg<"pv.x", 254>; ++def PREDICATE_BIT :
R600Reg<"PredicateBit", 0>; ++def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; ++def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>; ++def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>; ++ ++def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, ++ (add (sequence "ArrayBase%u", 448, 464))>; ++// special registers for ALU src operands ++// const buffer reference, SRCx_SEL contains index ++def ALU_CONST : R600Reg<"CBuf", 0>; ++// interpolation param reference, SRCx_SEL contains index ++def ALU_PARAM : R600Reg<"Param", 0>; ++ ++def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32, ++ (add (sequence "T%u_X", 0, 127))>; ++ ++def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32, ++ (add (sequence "T%u_Y", 0, 127))>; ++ ++def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32, ++ (add (sequence "T%u_Z", 0, 127))>; ++ ++def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32, ++ (add (sequence "T%u_W", 0, 127))>; ++ ++def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, ++ (interleave R600_TReg32_X, R600_TReg32_Y, ++ R600_TReg32_Z, R600_TReg32_W)>; ++ ++def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add ++ R600_TReg32, ++ R600_ArrayBase, ++ ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, ++ ALU_CONST, ALU_PARAM ++ )>; ++ ++def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add ++ PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>; ++ ++def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add ++ PREDICATE_BIT)>; ++ ++def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, ++ (add (sequence "T%u_XYZW", 0, 127))> { ++ let CopyCost = -1; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600Schedule.td llvm-r600/lib/Target/R600/R600Schedule.td +--- llvm-3.2.src/lib/Target/R600/R600Schedule.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/R600Schedule.td 2013-01-25 19:43:57.470049720 +0100 +@@ -0,0 +1,36 @@ ++//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen 
-*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// R600 has a VLIW architecture. On pre-cayman cards there are 5 instruction ++// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For cayman cards, the TRANS ++// slot has been removed. ++// ++//===----------------------------------------------------------------------===// ++ ++ ++def ALU_X : FuncUnit; ++def ALU_Y : FuncUnit; ++def ALU_Z : FuncUnit; ++def ALU_W : FuncUnit; ++def TRANS : FuncUnit; ++ ++def AnyALU : InstrItinClass; ++def VecALU : InstrItinClass; ++def TransALU : InstrItinClass; ++ ++def R600_EG_Itin : ProcessorItineraries < ++ [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL], ++ [], ++ [ ++ InstrItinData]>, ++ InstrItinData]>, ++ InstrItinData]>, ++ InstrItinData]> ++ ] ++>; +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIAnnotateControlFlow.cpp llvm-r600/lib/Target/R600/SIAnnotateControlFlow.cpp +--- llvm-3.2.src/lib/Target/R600/SIAnnotateControlFlow.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SIAnnotateControlFlow.cpp 2013-01-25 19:43:57.470049720 +0100 +@@ -0,0 +1,330 @@ ++//===-- SIAnnotateControlFlow.cpp - ------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// Annotates the control flow with hardware specific intrinsics. 
++// ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPU.h" ++ ++#include "llvm/Pass.h" ++#include "llvm/Module.h" ++#include "llvm/Analysis/Dominators.h" ++#include "llvm/Transforms/Utils/BasicBlockUtils.h" ++#include "llvm/ADT/DepthFirstIterator.h" ++#include "llvm/Transforms/Utils/SSAUpdater.h" ++ ++using namespace llvm; ++ ++namespace { ++ ++// Complex types used in this pass ++typedef std::pair StackEntry; ++typedef SmallVector StackVector; ++ ++// Intrinsic names the control flow is annotated with ++static const char *IfIntrinsic = "llvm.SI.if"; ++static const char *ElseIntrinsic = "llvm.SI.else"; ++static const char *BreakIntrinsic = "llvm.SI.break"; ++static const char *IfBreakIntrinsic = "llvm.SI.if.break"; ++static const char *ElseBreakIntrinsic = "llvm.SI.else.break"; ++static const char *LoopIntrinsic = "llvm.SI.loop"; ++static const char *EndCfIntrinsic = "llvm.SI.end.cf"; ++ ++class SIAnnotateControlFlow : public FunctionPass { ++ ++ static char ID; ++ ++ Type *Boolean; ++ Type *Void; ++ Type *Int64; ++ Type *ReturnStruct; ++ ++ ConstantInt *BoolTrue; ++ ConstantInt *BoolFalse; ++ UndefValue *BoolUndef; ++ Constant *Int64Zero; ++ ++ Constant *If; ++ Constant *Else; ++ Constant *Break; ++ Constant *IfBreak; ++ Constant *ElseBreak; ++ Constant *Loop; ++ Constant *EndCf; ++ ++ DominatorTree *DT; ++ StackVector Stack; ++ SSAUpdater PhiInserter; ++ ++ bool isTopOfStack(BasicBlock *BB); ++ ++ Value *popSaved(); ++ ++ void push(BasicBlock *BB, Value *Saved); ++ ++ bool isElse(PHINode *Phi); ++ ++ void eraseIfUnused(PHINode *Phi); ++ ++ void openIf(BranchInst *Term); ++ ++ void insertElse(BranchInst *Term); ++ ++ void handleLoopCondition(Value *Cond); ++ ++ void handleLoop(BranchInst *Term); ++ ++ void closeControlFlow(BasicBlock *BB); ++ ++public: ++ SIAnnotateControlFlow(): ++ FunctionPass(ID) { } ++ ++ virtual bool doInitialization(Module &M); ++ ++ virtual bool runOnFunction(Function &F); ++ ++ virtual 
const char *getPassName() const { ++ return "SI annotate control flow"; ++ } ++ ++ virtual void getAnalysisUsage(AnalysisUsage &AU) const { ++ AU.addRequired(); ++ AU.addPreserved(); ++ FunctionPass::getAnalysisUsage(AU); ++ } ++ ++}; ++ ++} // end anonymous namespace ++ ++char SIAnnotateControlFlow::ID = 0; ++ ++/// \brief Initialize all the types and constants used in the pass ++bool SIAnnotateControlFlow::doInitialization(Module &M) { ++ LLVMContext &Context = M.getContext(); ++ ++ Void = Type::getVoidTy(Context); ++ Boolean = Type::getInt1Ty(Context); ++ Int64 = Type::getInt64Ty(Context); ++ ReturnStruct = StructType::get(Boolean, Int64, (Type *)0); ++ ++ BoolTrue = ConstantInt::getTrue(Context); ++ BoolFalse = ConstantInt::getFalse(Context); ++ BoolUndef = UndefValue::get(Boolean); ++ Int64Zero = ConstantInt::get(Int64, 0); ++ ++ If = M.getOrInsertFunction( ++ IfIntrinsic, ReturnStruct, Boolean, (Type *)0); ++ ++ Else = M.getOrInsertFunction( ++ ElseIntrinsic, ReturnStruct, Int64, (Type *)0); ++ ++ Break = M.getOrInsertFunction( ++ BreakIntrinsic, Int64, Int64, (Type *)0); ++ ++ IfBreak = M.getOrInsertFunction( ++ IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)0); ++ ++ ElseBreak = M.getOrInsertFunction( ++ ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)0); ++ ++ Loop = M.getOrInsertFunction( ++ LoopIntrinsic, Boolean, Int64, (Type *)0); ++ ++ EndCf = M.getOrInsertFunction( ++ EndCfIntrinsic, Void, Int64, (Type *)0); ++ ++ return false; ++} ++ ++/// \brief Is BB the last block saved on the stack ? 
++bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) { ++ return Stack.back().first == BB; ++} ++ ++/// \brief Pop the last saved value from the control flow stack ++Value *SIAnnotateControlFlow::popSaved() { ++ return Stack.pop_back_val().second; ++} ++ ++/// \brief Push a BB and saved value to the control flow stack ++void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) { ++ Stack.push_back(std::make_pair(BB, Saved)); ++} ++ ++/// \brief Can the condition represented by this PHI node be treated like ++/// an "Else" block? ++bool SIAnnotateControlFlow::isElse(PHINode *Phi) { ++ BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock(); ++ for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { ++ if (Phi->getIncomingBlock(i) == IDom) { ++ ++ if (Phi->getIncomingValue(i) != BoolTrue) ++ return false; ++ ++ } else { ++ if (Phi->getIncomingValue(i) != BoolFalse) ++ return false; ++ ++ } ++ } ++ return true; ++} ++ ++/// \brief Erase "Phi" if it is not used any more ++void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { ++ if (!Phi->hasNUsesOrMore(1)) ++ Phi->eraseFromParent(); ++} ++ ++/// \brief Open a new "If" block ++void SIAnnotateControlFlow::openIf(BranchInst *Term) { ++ Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); ++ Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); ++ push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); ++} ++ ++/// \brief Close the last "If" block and open a new "Else" block ++void SIAnnotateControlFlow::insertElse(BranchInst *Term) { ++ Value *Ret = CallInst::Create(Else, popSaved(), "", Term); ++ Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); ++ push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); ++} ++ ++/// \brief Recursively handle the condition leading to a loop ++void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) { ++ if (PHINode *Phi = dyn_cast(Cond)) { ++ ++ // Handle all non constant
incoming values first ++ for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { ++ Value *Incoming = Phi->getIncomingValue(i); ++ if (isa(Incoming)) ++ continue; ++ ++ Phi->setIncomingValue(i, BoolFalse); ++ handleLoopCondition(Incoming); ++ } ++ ++ BasicBlock *Parent = Phi->getParent(); ++ BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); ++ ++ for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { ++ ++ Value *Incoming = Phi->getIncomingValue(i); ++ if (Incoming != BoolTrue) ++ continue; ++ ++ BasicBlock *From = Phi->getIncomingBlock(i); ++ if (From == IDom) { ++ CallInst *OldEnd = dyn_cast(Parent->getFirstInsertionPt()); ++ if (OldEnd && OldEnd->getCalledFunction() == EndCf) { ++ Value *Args[] = { ++ OldEnd->getArgOperand(0), ++ PhiInserter.GetValueAtEndOfBlock(Parent) ++ }; ++ Value *Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); ++ PhiInserter.AddAvailableValue(Parent, Ret); ++ continue; ++ } ++ } ++ ++ TerminatorInst *Insert = From->getTerminator(); ++ Value *Arg = PhiInserter.GetValueAtEndOfBlock(From); ++ Value *Ret = CallInst::Create(Break, Arg, "", Insert); ++ PhiInserter.AddAvailableValue(From, Ret); ++ } ++ eraseIfUnused(Phi); ++ ++ } else if (Instruction *Inst = dyn_cast(Cond)) { ++ BasicBlock *Parent = Inst->getParent(); ++ TerminatorInst *Insert = Parent->getTerminator(); ++ Value *Args[] = { Cond, PhiInserter.GetValueAtEndOfBlock(Parent) }; ++ Value *Ret = CallInst::Create(IfBreak, Args, "", Insert); ++ PhiInserter.AddAvailableValue(Parent, Ret); ++ ++ } else { ++ assert(0 && "Unhandled loop condition!"); ++ } ++} ++ ++/// \brief Handle a back edge (loop) ++void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { ++ BasicBlock *Target = Term->getSuccessor(1); ++ PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front()); ++ ++ PhiInserter.Initialize(Int64, ""); ++ PhiInserter.AddAvailableValue(Target, Broken); ++ ++ Value *Cond = Term->getCondition(); ++ Term->setCondition(BoolTrue); ++ 
handleLoopCondition(Cond); ++ ++ BasicBlock *BB = Term->getParent(); ++ Value *Arg = PhiInserter.GetValueAtEndOfBlock(BB); ++ for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target); ++ PI != PE; ++PI) { ++ ++ Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI); ++ } ++ ++ Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); ++ push(Term->getSuccessor(0), Arg); ++} ++ ++/// \brief Close the last opened control flow ++void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { ++ CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); ++} ++ ++/// \brief Annotate the control flow with intrinsics so the backend can ++/// recognize if/then/else and loops. ++bool SIAnnotateControlFlow::runOnFunction(Function &F) { ++ DT = &getAnalysis(); ++ ++ for (df_iterator I = df_begin(&F.getEntryBlock()), ++ E = df_end(&F.getEntryBlock()); I != E; ++I) { ++ ++ BranchInst *Term = dyn_cast((*I)->getTerminator()); ++ ++ if (!Term || Term->isUnconditional()) { ++ if (isTopOfStack(*I)) ++ closeControlFlow(*I); ++ continue; ++ } ++ ++ if (I.nodeVisited(Term->getSuccessor(1))) { ++ if (isTopOfStack(*I)) ++ closeControlFlow(*I); ++ handleLoop(Term); ++ continue; ++ } ++ ++ if (isTopOfStack(*I)) { ++ PHINode *Phi = dyn_cast(Term->getCondition()); ++ if (Phi && Phi->getParent() == *I && isElse(Phi)) { ++ insertElse(Term); ++ eraseIfUnused(Phi); ++ continue; ++ } ++ closeControlFlow(*I); ++ } ++ openIf(Term); ++ } ++ ++ assert(Stack.empty()); ++ return true; ++} ++ ++/// \brief Create the annotation pass ++FunctionPass *llvm::createSIAnnotateControlFlowPass() { ++ return new SIAnnotateControlFlow(); ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIAssignInterpRegs.cpp llvm-r600/lib/Target/R600/SIAssignInterpRegs.cpp +--- llvm-3.2.src/lib/Target/R600/SIAssignInterpRegs.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SIAssignInterpRegs.cpp 2013-01-25 19:43:57.470049720 +0100 +@@ -0,0 +1,152 @@ ++//===-- SIAssignInterpRegs.cpp - 
Assign interpolation registers -----------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief This pass maps the pseudo interpolation registers to the correct physical ++/// registers. ++// ++/// Prior to executing a fragment shader, the GPU loads interpolation ++/// parameters into physical registers. The specific physical register that each ++/// interpolation parameter ends up in depends on the type of the interpolation ++/// parameter as well as how many interpolation parameters are used by the ++/// shader. ++// ++//===----------------------------------------------------------------------===// ++ ++ ++ ++#include "AMDGPU.h" ++#include "AMDIL.h" ++#include "SIMachineFunctionInfo.h" ++#include "llvm/CodeGen/MachineFunctionPass.h" ++#include "llvm/CodeGen/MachineInstrBuilder.h" ++#include "llvm/CodeGen/MachineRegisterInfo.h" ++ ++using namespace llvm; ++ ++namespace { ++ ++class SIAssignInterpRegsPass : public MachineFunctionPass { ++ ++private: ++ static char ID; ++ TargetMachine &TM; ++ ++ void addLiveIn(MachineFunction * MF, MachineRegisterInfo & MRI, ++ unsigned physReg, unsigned virtReg); ++ ++public: ++ SIAssignInterpRegsPass(TargetMachine &tm) : ++ MachineFunctionPass(ID), TM(tm) { } ++ ++ virtual bool runOnMachineFunction(MachineFunction &MF); ++ ++ const char *getPassName() const { return "SI Assign intrpolation registers"; } ++}; ++ ++} // End anonymous namespace ++ ++char SIAssignInterpRegsPass::ID = 0; ++ ++#define INTERP_VALUES 16 ++#define REQUIRED_VALUE_MAX_INDEX 7 ++ ++struct InterpInfo { ++ bool Enabled; ++ unsigned Regs[3]; ++ unsigned RegCount; ++}; ++ ++ ++FunctionPass *llvm::createSIAssignInterpRegsPass(TargetMachine &tm) { ++ return new SIAssignInterpRegsPass(tm); ++} ++ ++bool 
SIAssignInterpRegsPass::runOnMachineFunction(MachineFunction &MF) { ++ ++ struct InterpInfo InterpUse[INTERP_VALUES] = { ++ {false, {AMDGPU::PERSP_SAMPLE_I, AMDGPU::PERSP_SAMPLE_J}, 2}, ++ {false, {AMDGPU::PERSP_CENTER_I, AMDGPU::PERSP_CENTER_J}, 2}, ++ {false, {AMDGPU::PERSP_CENTROID_I, AMDGPU::PERSP_CENTROID_J}, 2}, ++ {false, {AMDGPU::PERSP_I_W, AMDGPU::PERSP_J_W, AMDGPU::PERSP_1_W}, 3}, ++ {false, {AMDGPU::LINEAR_SAMPLE_I, AMDGPU::LINEAR_SAMPLE_J}, 2}, ++ {false, {AMDGPU::LINEAR_CENTER_I, AMDGPU::LINEAR_CENTER_J}, 2}, ++ {false, {AMDGPU::LINEAR_CENTROID_I, AMDGPU::LINEAR_CENTROID_J}, 2}, ++ {false, {AMDGPU::LINE_STIPPLE_TEX_COORD}, 1}, ++ {false, {AMDGPU::POS_X_FLOAT}, 1}, ++ {false, {AMDGPU::POS_Y_FLOAT}, 1}, ++ {false, {AMDGPU::POS_Z_FLOAT}, 1}, ++ {false, {AMDGPU::POS_W_FLOAT}, 1}, ++ {false, {AMDGPU::FRONT_FACE}, 1}, ++ {false, {AMDGPU::ANCILLARY}, 1}, ++ {false, {AMDGPU::SAMPLE_COVERAGE}, 1}, ++ {false, {AMDGPU::POS_FIXED_PT}, 1} ++ }; ++ ++ SIMachineFunctionInfo * MFI = MF.getInfo(); ++ // This pass is only needed for pixel shaders. ++ if (MFI->ShaderType != ShaderType::PIXEL) { ++ return false; ++ } ++ MachineRegisterInfo &MRI = MF.getRegInfo(); ++ bool ForceEnable = true; ++ ++ // First pass, mark the interpolation values that are used. ++ for (unsigned InterpIdx = 0; InterpIdx < INTERP_VALUES; InterpIdx++) { ++ for (unsigned RegIdx = 0; RegIdx < InterpUse[InterpIdx].RegCount; ++ RegIdx++) { ++ InterpUse[InterpIdx].Enabled = InterpUse[InterpIdx].Enabled || ++ !MRI.use_empty(InterpUse[InterpIdx].Regs[RegIdx]); ++ if (InterpUse[InterpIdx].Enabled && ++ InterpIdx <= REQUIRED_VALUE_MAX_INDEX) { ++ ForceEnable = false; ++ } ++ } ++ } ++ ++ // At least one interpolation mode must be enabled or else the GPU will hang. ++ if (ForceEnable) { ++ InterpUse[0].Enabled = true; ++ } ++ ++ unsigned UsedVgprs = 0; ++ ++ // Second pass, replace with VGPRs. 
++ for (unsigned InterpIdx = 0; InterpIdx < INTERP_VALUES; InterpIdx++) { ++ if (!InterpUse[InterpIdx].Enabled) { ++ continue; ++ } ++ MFI->SPIPSInputAddr |= (1 << InterpIdx); ++ ++ for (unsigned RegIdx = 0; RegIdx < InterpUse[InterpIdx].RegCount; ++ RegIdx++, UsedVgprs++) { ++ unsigned NewReg = AMDGPU::VReg_32RegClass.getRegister(UsedVgprs); ++ unsigned VirtReg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); ++ MRI.replaceRegWith(InterpUse[InterpIdx].Regs[RegIdx], VirtReg); ++ addLiveIn(&MF, MRI, NewReg, VirtReg); ++ } ++ } ++ ++ return false; ++} ++ ++void SIAssignInterpRegsPass::addLiveIn(MachineFunction * MF, ++ MachineRegisterInfo & MRI, ++ unsigned physReg, unsigned virtReg) { ++ const TargetInstrInfo * TII = TM.getInstrInfo(); ++ if (!MRI.isLiveIn(physReg)) { ++ MRI.addLiveIn(physReg, virtReg); ++ MF->front().addLiveIn(physReg); ++ BuildMI(MF->front(), MF->front().begin(), DebugLoc(), ++ TII->get(TargetOpcode::COPY), virtReg) ++ .addReg(physReg); ++ } else { ++ MRI.replaceRegWith(virtReg, MRI.getLiveInVirtReg(physReg)); ++ } ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInsertWaits.cpp llvm-r600/lib/Target/R600/SIInsertWaits.cpp +--- llvm-3.2.src/lib/Target/R600/SIInsertWaits.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SIInsertWaits.cpp 2013-01-25 19:43:57.473383054 +0100 +@@ -0,0 +1,353 @@ ++//===-- SIInsertWaits.cpp - Insert wait instructions ----------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Insert wait instructions for memory reads and writes. ++/// ++/// Memory reads and writes are issued asynchronously, so we need to insert ++/// S_WAITCNT instructions when we want to access any of their results or ++/// overwrite any register that's used asynchronously.
++// ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPU.h" ++#include "SIInstrInfo.h" ++#include "SIMachineFunctionInfo.h" ++#include "llvm/CodeGen/MachineFunction.h" ++#include "llvm/CodeGen/MachineFunctionPass.h" ++#include "llvm/CodeGen/MachineInstrBuilder.h" ++#include "llvm/CodeGen/MachineRegisterInfo.h" ++ ++using namespace llvm; ++ ++namespace { ++ ++/// \brief One variable for each of the hardware counters ++typedef union { ++ struct { ++ unsigned VM; ++ unsigned EXP; ++ unsigned LGKM; ++ } Named; ++ unsigned Array[3]; ++ ++} Counters; ++ ++typedef Counters RegCounters[512]; ++typedef std::pair RegInterval; ++ ++class SIInsertWaits : public MachineFunctionPass { ++ ++private: ++ static char ID; ++ const SIInstrInfo *TII; ++ const SIRegisterInfo &TRI; ++ const MachineRegisterInfo *MRI; ++ ++ /// \brief Constant hardware limits ++ static const Counters WaitCounts; ++ ++ /// \brief Constant zero value ++ static const Counters ZeroCounts; ++ ++ /// \brief Counter values we have already waited on. ++ Counters WaitedOn; ++ ++ /// \brief Counter values for last instruction issued. ++ Counters LastIssued; ++ ++ /// \brief Registers used by async instructions. ++ RegCounters UsedRegs; ++ ++ /// \brief Registers defined by async instructions. ++ RegCounters DefinedRegs; ++ ++ /// \brief Different export instruction types seen since last wait. ++ unsigned ExpInstrTypesSeen; ++ ++ /// \brief Get increment/decrement amount for this instruction. ++ Counters getHwCounts(MachineInstr &MI); ++ ++ /// \brief Is operand relevant for async execution? ++ bool isOpRelevant(MachineOperand &Op); ++ ++ /// \brief Get register interval an operand affects. 
++ RegInterval getRegInterval(MachineOperand &Op); ++ ++ /// \brief Handle an instruction's async components ++ void pushInstruction(MachineInstr &MI); ++ ++ /// \brief Insert the actual wait instruction ++ bool insertWait(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator I, ++ const Counters &Counts); ++ ++ /// \brief Resolve all operand dependencies to counter requirements ++ Counters handleOperands(MachineInstr &MI); ++ ++public: ++ SIInsertWaits(TargetMachine &tm) : ++ MachineFunctionPass(ID), ++ TII(static_cast(tm.getInstrInfo())), ++ TRI(TII->getRegisterInfo()) { } ++ ++ virtual bool runOnMachineFunction(MachineFunction &MF); ++ ++ const char *getPassName() const { ++ return "SI insert wait instructions"; ++ } ++ ++}; ++ ++} // End anonymous namespace ++ ++char SIInsertWaits::ID = 0; ++ ++const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } }; ++const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; ++ ++FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { ++ return new SIInsertWaits(tm); ++} ++ ++Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { ++ ++ uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; ++ Counters Result; ++ ++ Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); ++ ++ // Only consider stores or EXP for EXP_CNT. NOTE(review): the condition below tests !mayStore(), which looks inverted relative to this comment - confirm. ++ Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT && ++ (MI.getOpcode() == AMDGPU::EXP || !MI.getDesc().mayStore())); ++ ++ // LGKM may use larger values ++ if (TSFlags & SIInstrFlags::LGKM_CNT) { ++ ++ MachineOperand &Op = MI.getOperand(0); ++ assert(Op.isReg() && "First LGKM operand must be a register!"); ++ ++ unsigned Reg = Op.getReg(); ++ unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize(); ++ Result.Named.LGKM = Size > 4 ?
2 : 1; ++ ++ } else { ++ Result.Named.LGKM = 0; ++ } ++ ++ return Result; ++} ++ ++bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { ++ ++ // Constants are always irrelevant ++ if (!Op.isReg()) ++ return false; ++ ++ // Defines are always relevant ++ if (Op.isDef()) ++ return true; ++ ++ // For exports all registers are relevant ++ MachineInstr &MI = *Op.getParent(); ++ if (MI.getOpcode() == AMDGPU::EXP) ++ return true; ++ ++ // For stores the stored value is also relevant ++ if (!MI.getDesc().mayStore()) ++ return false; ++ ++ for (MachineInstr::mop_iterator I = MI.operands_begin(), ++ E = MI.operands_end(); I != E; ++I) { ++ ++ if (I->isReg() && I->isUse()) ++ return Op.isIdenticalTo(*I); ++ } ++ ++ return false; ++} ++ ++RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { ++ ++ if (!Op.isReg()) ++ return std::make_pair(0, 0); ++ ++ unsigned Reg = Op.getReg(); ++ unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize(); ++ ++ assert(Size >= 4); ++ ++ RegInterval Result; ++ Result.first = TRI.getEncodingValue(Reg); ++ Result.second = Result.first + Size / 4; ++ ++ return Result; ++} ++ ++void SIInsertWaits::pushInstruction(MachineInstr &MI) { ++ ++ // Get the hardware counter increments and sum them up ++ Counters Increment = getHwCounts(MI); ++ unsigned Sum = 0; ++ ++ for (unsigned i = 0; i < 3; ++i) { ++ LastIssued.Array[i] += Increment.Array[i]; ++ Sum += Increment.Array[i]; ++ } ++ ++ // If we don't increase anything then that's it ++ if (Sum == 0) ++ return; ++ ++ // Remember which export instructions we have seen ++ if (Increment.Named.EXP) { ++ ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 
1 : 2; ++ } ++ ++ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { ++ ++ MachineOperand &Op = MI.getOperand(i); ++ if (!isOpRelevant(Op)) ++ continue; ++ ++ RegInterval Interval = getRegInterval(Op); ++ for (unsigned j = Interval.first; j < Interval.second; ++j) { ++ ++ // Remember which registers we define ++ if (Op.isDef()) ++ DefinedRegs[j] = LastIssued; ++ ++ // and which one we are using ++ if (Op.isUse()) ++ UsedRegs[j] = LastIssued; ++ } ++ } ++} ++ ++bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator I, ++ const Counters &Required) { ++ ++ // End of program? No need to wait on anything ++ if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) ++ return false; ++ ++ // Figure out if the async instructions execute in order ++ bool Ordered[3]; ++ ++ // VM_CNT is always ordered ++ Ordered[0] = true; ++ ++ // EXP_CNT is unordered if we have both EXP & VM-writes ++ Ordered[1] = ExpInstrTypesSeen == 3; ++ ++ // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS ++ Ordered[2] = false; ++ ++ // The values we are going to put into the S_WAITCNT instruction ++ Counters Counts = WaitCounts; ++ ++ // Do we really need to wait? 
++ bool NeedWait = false; ++ ++ for (unsigned i = 0; i < 3; ++i) { ++ ++ if (Required.Array[i] <= WaitedOn.Array[i]) ++ continue; ++ ++ NeedWait = true; ++ ++ if (Ordered[i]) { ++ unsigned Value = LastIssued.Array[i] - Required.Array[i]; ++ ++ // adjust the value to the real hardware posibilities ++ Counts.Array[i] = std::min(Value, WaitCounts.Array[i]); ++ ++ } else ++ Counts.Array[i] = 0; ++ ++ // Remember on what we have waited on ++ WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; ++ } ++ ++ if (!NeedWait) ++ return false; ++ ++ // Reset EXP_CNT instruction types ++ if (Counts.Named.EXP == 0) ++ ExpInstrTypesSeen = 0; ++ ++ // Build the wait instruction ++ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) ++ .addImm((Counts.Named.VM & 0xF) | ++ ((Counts.Named.EXP & 0x7) << 4) | ++ ((Counts.Named.LGKM & 0x7) << 8)); ++ ++ return true; ++} ++ ++/// \brief helper function for handleOperands ++static void increaseCounters(Counters &Dst, const Counters &Src) { ++ ++ for (unsigned i = 0; i < 3; ++i) ++ Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); ++} ++ ++Counters SIInsertWaits::handleOperands(MachineInstr &MI) { ++ ++ Counters Result = ZeroCounts; ++ ++ // For each register affected by this ++ // instruction increase the result sequence ++ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { ++ ++ MachineOperand &Op = MI.getOperand(i); ++ RegInterval Interval = getRegInterval(Op); ++ for (unsigned j = Interval.first; j < Interval.second; ++j) { ++ ++ if (Op.isDef()) ++ increaseCounters(Result, UsedRegs[j]); ++ ++ if (Op.isUse()) ++ increaseCounters(Result, DefinedRegs[j]); ++ } ++ } ++ ++ return Result; ++} ++ ++bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { ++ ++ bool Changes = false; ++ ++ MRI = &MF.getRegInfo(); ++ ++ WaitedOn = ZeroCounts; ++ LastIssued = ZeroCounts; ++ ++ memset(&UsedRegs, 0, sizeof(UsedRegs)); ++ memset(&DefinedRegs, 0, sizeof(DefinedRegs)); ++ ++ for (MachineFunction::iterator BI = MF.begin(), 
BE = MF.end(); ++ BI != BE; ++BI) { ++ ++ MachineBasicBlock &MBB = *BI; ++ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); ++ I != E; ++I) { ++ ++ Changes |= insertWait(MBB, I, handleOperands(*I)); ++ pushInstruction(*I); ++ } ++ ++ // Wait for everything at the end of the MBB ++ Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); ++ } ++ ++ return Changes; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInstrFormats.td llvm-r600/lib/Target/R600/SIInstrFormats.td +--- llvm-3.2.src/lib/Target/R600/SIInstrFormats.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SIInstrFormats.td 2013-01-25 19:43:57.473383054 +0100 +@@ -0,0 +1,146 @@ ++//===-- SIInstrFormats.td - SI Instruction Formats ------------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// SI Instruction format definitions. ++// ++// Instructions with _32 take 32-bit operands. ++// Instructions with _64 take 64-bit operands. ++// ++// VOP_* instructions can use either a 32-bit or 64-bit encoding. The 32-bit ++// encoding is the standard encoding, but instruction that make use of ++// any of the instruction modifiers must use the 64-bit encoding. ++// ++// Instructions with _e32 use the 32-bit encoding. ++// Instructions with _e64 use the 64-bit encoding. 
++// ++//===----------------------------------------------------------------------===// ++ ++class VOP3b_2IN op, string opName, RegisterClass dstClass, ++ RegisterClass src0Class, RegisterClass src1Class, ++ list pattern> ++ : VOP3b ; ++ ++ ++class VOP3_1_32 op, string opName, list pattern> ++ : VOP3b_2IN ; ++ ++class VOP3_32 op, string opName, list pattern> ++ : VOP3 ; ++ ++class VOP3_64 op, string opName, list pattern> ++ : VOP3 ; ++ ++ ++class SOP1_32 op, string opName, list pattern> ++ : SOP1 ; ++ ++class SOP1_64 op, string opName, list pattern> ++ : SOP1 ; ++ ++class SOP2_32 op, string opName, list pattern> ++ : SOP2 ; ++ ++class SOP2_64 op, string opName, list pattern> ++ : SOP2 ; ++ ++class SOP2_VCC op, string opName, list pattern> ++ : SOP2 ; ++ ++class VOP1_Helper op, RegisterClass vrc, RegisterClass arc, ++ string opName, list pattern> : ++ VOP1 < ++ op, (outs vrc:$dst), (ins arc:$src0), opName, pattern ++ >; ++ ++multiclass VOP1_32 op, string opName, list pattern> { ++ def _e32: VOP1_Helper ; ++ def _e64 : VOP3_32 <{1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, ++ opName, [] ++ >; ++} ++ ++multiclass VOP1_64 op, string opName, list pattern> { ++ ++ def _e32 : VOP1_Helper ; ++ ++ def _e64 : VOP3_64 < ++ {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, ++ opName, [] ++ >; ++} ++ ++class VOP2_Helper op, RegisterClass vrc, RegisterClass arc, ++ string opName, list pattern> : ++ VOP2 < ++ op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1), opName, pattern ++ >; ++ ++multiclass VOP2_32 op, string opName, list pattern> { ++ ++ def _e32 : VOP2_Helper ; ++ ++ def _e64 : VOP3_32 <{1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, ++ opName, [] ++ >; ++} ++ ++multiclass VOP2_64 op, string opName, list pattern> { ++ def _e32: VOP2_Helper ; ++ ++ def _e64 : VOP3_64 < ++ {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, ++ opName, [] ++ >; ++} ++ ++class SOPK_32 op, string opName, list pattern> ++ : SOPK ; ++ ++class SOPK_64 op, string opName, list 
pattern> ++ : SOPK ; ++ ++class VOPC_Helper op, RegisterClass vrc, RegisterClass arc, ++ string opName, list pattern> : ++ VOPC < ++ op, (ins arc:$src0, vrc:$src1), opName, pattern ++ >; ++ ++multiclass VOPC_32 op, string opName, list pattern> { ++ ++ def _e32 : VOPC_Helper < ++ {op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, ++ VReg_32, AllReg_32, opName, pattern ++ >; ++ ++ def _e64 : VOP3_1_32 < ++ op, ++ opName, pattern ++ >; ++} ++ ++multiclass VOPC_64 op, string opName, list pattern> { ++ ++ def _e32 : VOPC_Helper ; ++ ++ def _e64 : VOP3_64 < ++ {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, ++ opName, [] ++ >; ++} ++ ++class SOPC_32 op, string opName, list pattern> ++ : SOPC ; ++ ++class SOPC_64 op, string opName, list pattern> ++ : SOPC ; ++ +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInstrInfo.cpp llvm-r600/lib/Target/R600/SIInstrInfo.cpp +--- llvm-3.2.src/lib/Target/R600/SIInstrInfo.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SIInstrInfo.cpp 2013-01-25 19:43:57.473383054 +0100 +@@ -0,0 +1,89 @@ ++//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief SI Implementation of TargetInstrInfo. 
++// ++//===----------------------------------------------------------------------===// ++ ++ ++#include "SIInstrInfo.h" ++#include "AMDGPUTargetMachine.h" ++#include "llvm/CodeGen/MachineInstrBuilder.h" ++#include "llvm/CodeGen/MachineRegisterInfo.h" ++#include "llvm/MC/MCInstrDesc.h" ++ ++#include ++ ++using namespace llvm; ++ ++SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm) ++ : AMDGPUInstrInfo(tm), ++ RI(tm, *this) ++ { } ++ ++const SIRegisterInfo &SIInstrInfo::getRegisterInfo() const { ++ return RI; ++} ++ ++void ++SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MI, DebugLoc DL, ++ unsigned DestReg, unsigned SrcReg, ++ bool KillSrc) const { ++ // If we are trying to copy to or from SCC, there is a bug somewhere else in ++ // the backend. While it may be theoretically possible to do this, it should ++ // never be necessary. ++ assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); ++ ++ if (AMDGPU::SReg_64RegClass.contains(DestReg)) { ++ assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); ++ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) ++ .addReg(SrcReg, getKillRegState(KillSrc)); ++ } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) { ++ assert(AMDGPU::VReg_32RegClass.contains(SrcReg) || ++ AMDGPU::SReg_32RegClass.contains(SrcReg)); ++ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) ++ .addReg(SrcReg, getKillRegState(KillSrc)); ++ } else { ++ assert(AMDGPU::SReg_32RegClass.contains(DestReg)); ++ assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); ++ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) ++ .addReg(SrcReg, getKillRegState(KillSrc)); ++ } ++} ++ ++MachineInstr * SIInstrInfo::getMovImmInstr(MachineFunction *MF, unsigned DstReg, ++ int64_t Imm) const { ++ MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::V_MOV_IMM_I32), DebugLoc()); ++ MachineInstrBuilder(MI).addReg(DstReg, RegState::Define); ++ MachineInstrBuilder(MI).addImm(Imm); ++ ++ return MI; ++ ++} ++ ++bool 
SIInstrInfo::isMov(unsigned Opcode) const { ++ switch(Opcode) { ++ default: return false; ++ case AMDGPU::S_MOV_B32: ++ case AMDGPU::S_MOV_B64: ++ case AMDGPU::V_MOV_B32_e32: ++ case AMDGPU::V_MOV_B32_e64: ++ case AMDGPU::V_MOV_IMM_F32: ++ case AMDGPU::V_MOV_IMM_I32: ++ case AMDGPU::S_MOV_IMM_I32: ++ return true; ++ } ++} ++ ++bool ++SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { ++ return RC != &AMDGPU::EXECRegRegClass; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInstrInfo.h llvm-r600/lib/Target/R600/SIInstrInfo.h +--- llvm-3.2.src/lib/Target/R600/SIInstrInfo.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SIInstrInfo.h 2013-01-25 19:43:57.476716387 +0100 +@@ -0,0 +1,64 @@ ++//===-- SIInstrInfo.h - SI Instruction Info Interface ---------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Interface definition for SIInstrInfo. ++// ++//===----------------------------------------------------------------------===// ++ ++ ++#ifndef SIINSTRINFO_H ++#define SIINSTRINFO_H ++ ++#include "AMDGPUInstrInfo.h" ++#include "SIRegisterInfo.h" ++ ++namespace llvm { ++ ++class SIInstrInfo : public AMDGPUInstrInfo { ++private: ++ const SIRegisterInfo RI; ++ ++public: ++ explicit SIInstrInfo(AMDGPUTargetMachine &tm); ++ ++ const SIRegisterInfo &getRegisterInfo() const; ++ ++ virtual void copyPhysReg(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MI, DebugLoc DL, ++ unsigned DestReg, unsigned SrcReg, ++ bool KillSrc) const; ++ ++ /// \returns the encoding type of this instruction. ++ unsigned getEncodingType(const MachineInstr &MI) const; ++ ++ /// \returns the size of this instructions encoding in number of bytes. 
++ unsigned getEncodingBytes(const MachineInstr &MI) const; ++ ++ virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg, ++ int64_t Imm) const; ++ ++ virtual unsigned getIEQOpcode() const { assert(!"Implement"); return 0;} ++ virtual bool isMov(unsigned Opcode) const; ++ ++ virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const; ++ }; ++ ++} // End namespace llvm ++ ++namespace SIInstrFlags { ++ enum Flags { ++ // First 4 bits are the instruction encoding ++ VM_CNT = 1 << 4, ++ EXP_CNT = 1 << 5, ++ LGKM_CNT = 1 << 6 ++ }; ++} ++ ++#endif //SIINSTRINFO_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInstrInfo.td llvm-r600/lib/Target/R600/SIInstrInfo.td +--- llvm-3.2.src/lib/Target/R600/SIInstrInfo.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SIInstrInfo.td 2013-01-25 19:43:57.476716387 +0100 +@@ -0,0 +1,591 @@ ++//===-- SIInstrInfo.td - SI Instruction Encodings ---------*- tablegen -*--===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++ ++//===----------------------------------------------------------------------===// ++// SI DAG Profiles ++//===----------------------------------------------------------------------===// ++def SDTVCCBinaryOp : SDTypeProfile<1, 2, [ ++ SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2> ++]>; ++ ++//===----------------------------------------------------------------------===// ++// SI DAG Nodes ++//===----------------------------------------------------------------------===// ++ ++// and operation on 64-bit wide vcc ++def SIsreg1_and : SDNode<"SIISD::VCC_AND", SDTVCCBinaryOp, ++ [SDNPCommutative, SDNPAssociative] ++>; ++ ++// Special bitcast node for sharing VCC register between VALU and SALU ++def SIsreg1_bitcast : SDNode<"SIISD::VCC_BITCAST", ++ SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]> ++>; ++ ++// and operation on 64-bit wide vcc ++def SIvcc_and : SDNode<"SIISD::VCC_AND", SDTVCCBinaryOp, ++ [SDNPCommutative, SDNPAssociative] ++>; ++ ++// Special bitcast node for sharing VCC register between VALU and SALU ++def SIvcc_bitcast : SDNode<"SIISD::VCC_BITCAST", ++ SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]> ++>; ++ ++class InstSI pattern> : ++ AMDGPUInst { ++ ++ field bits<4> EncodingType = 0; ++ field bits<1> VM_CNT = 0; ++ field bits<1> EXP_CNT = 0; ++ field bits<1> LGKM_CNT = 0; ++ ++ let TSFlags{3-0} = EncodingType; ++ let TSFlags{4} = VM_CNT; ++ let TSFlags{5} = EXP_CNT; ++ let TSFlags{6} = LGKM_CNT; ++} ++ ++class Enc32 pattern> : ++ InstSI { ++ ++ field bits<32> Inst; ++} ++ ++class Enc64 pattern> : ++ InstSI { ++ ++ field bits<64> Inst; ++} ++ ++class SIOperand : Operand { ++ let EncoderMethod = "encodeOperand"; ++ let MIOperandInfo = opInfo; ++} ++ ++def IMM16bit : ImmLeaf < ++ i16, ++ [{return isInt<16>(Imm);}] ++>; ++ ++def IMM8bit : ImmLeaf < ++ i32, ++ [{return (int32_t)Imm >= 0 && (int32_t)Imm <= 0xff;}] ++>; ++ ++def IMM12bit : ImmLeaf < ++ i16, ++ 
[{return (int16_t)Imm >= 0 && (int16_t)Imm <= 0xfff;}] ++>; ++ ++def IMM32bitIn64bit : ImmLeaf < ++ i64, ++ [{return isInt<32>(Imm);}] ++>; ++ ++class GPR4Align : Operand { ++ let EncoderMethod = "GPR4AlignEncode"; ++ let MIOperandInfo = (ops rc:$reg); ++} ++ ++class GPR2Align : Operand { ++ let EncoderMethod = "GPR2AlignEncode"; ++ let MIOperandInfo = (ops rc:$reg); ++} ++ ++def SMRDmemrr : Operand { ++ let MIOperandInfo = (ops SReg_64, SReg_32); ++ let EncoderMethod = "GPR2AlignEncode"; ++} ++ ++def SMRDmemri : Operand { ++ let MIOperandInfo = (ops SReg_64, i32imm); ++ let EncoderMethod = "SMRDmemriEncode"; ++} ++ ++def ADDR_Reg : ComplexPattern; ++def ADDR_Offset8 : ComplexPattern; ++ ++let Uses = [EXEC] in { ++ ++def EXP : Enc64< ++ (outs), ++ (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, ++ VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3), ++ "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", ++ [] > { ++ ++ bits<4> EN; ++ bits<6> TGT; ++ bits<1> COMPR; ++ bits<1> DONE; ++ bits<1> VM; ++ bits<8> VSRC0; ++ bits<8> VSRC1; ++ bits<8> VSRC2; ++ bits<8> VSRC3; ++ ++ let Inst{3-0} = EN; ++ let Inst{9-4} = TGT; ++ let Inst{10} = COMPR; ++ let Inst{11} = DONE; ++ let Inst{12} = VM; ++ let Inst{31-26} = 0x3e; ++ let Inst{39-32} = VSRC0; ++ let Inst{47-40} = VSRC1; ++ let Inst{55-48} = VSRC2; ++ let Inst{63-56} = VSRC3; ++ let EncodingType = 0; //SIInstrEncodingType::EXP ++ ++ let EXP_CNT = 1; ++} ++ ++class MIMG op, dag outs, dag ins, string asm, list pattern> : ++ Enc64 { ++ ++ bits<8> VDATA; ++ bits<4> DMASK; ++ bits<1> UNORM; ++ bits<1> GLC; ++ bits<1> DA; ++ bits<1> R128; ++ bits<1> TFE; ++ bits<1> LWE; ++ bits<1> SLC; ++ bits<8> VADDR; ++ bits<5> SRSRC; ++ bits<5> SSAMP; ++ ++ let Inst{11-8} = DMASK; ++ let Inst{12} = UNORM; ++ let Inst{13} = GLC; ++ let Inst{14} = DA; ++ let Inst{15} = R128; ++ let Inst{16} = TFE; ++ let Inst{17} = LWE; ++ let Inst{24-18} = op; ++ let Inst{25} = SLC; ++ let Inst{31-26} = 0x3c; ++ 
let Inst{39-32} = VADDR; ++ let Inst{47-40} = VDATA; ++ let Inst{52-48} = SRSRC; ++ let Inst{57-53} = SSAMP; ++ let EncodingType = 2; //SIInstrEncodingType::MIMG ++ ++ let VM_CNT = 1; ++ let EXP_CNT = 1; ++} ++ ++class MTBUF op, dag outs, dag ins, string asm, list pattern> : ++ Enc64 { ++ ++ bits<8> VDATA; ++ bits<12> OFFSET; ++ bits<1> OFFEN; ++ bits<1> IDXEN; ++ bits<1> GLC; ++ bits<1> ADDR64; ++ bits<4> DFMT; ++ bits<3> NFMT; ++ bits<8> VADDR; ++ bits<5> SRSRC; ++ bits<1> SLC; ++ bits<1> TFE; ++ bits<8> SOFFSET; ++ ++ let Inst{11-0} = OFFSET; ++ let Inst{12} = OFFEN; ++ let Inst{13} = IDXEN; ++ let Inst{14} = GLC; ++ let Inst{15} = ADDR64; ++ let Inst{18-16} = op; ++ let Inst{22-19} = DFMT; ++ let Inst{25-23} = NFMT; ++ let Inst{31-26} = 0x3a; //encoding ++ let Inst{39-32} = VADDR; ++ let Inst{47-40} = VDATA; ++ let Inst{52-48} = SRSRC; ++ let Inst{54} = SLC; ++ let Inst{55} = TFE; ++ let Inst{63-56} = SOFFSET; ++ let EncodingType = 3; //SIInstrEncodingType::MTBUF ++ ++ let VM_CNT = 1; ++ let EXP_CNT = 1; ++ ++ let neverHasSideEffects = 1; ++} ++ ++class MUBUF op, dag outs, dag ins, string asm, list pattern> : ++ Enc64 { ++ ++ bits<8> VDATA; ++ bits<12> OFFSET; ++ bits<1> OFFEN; ++ bits<1> IDXEN; ++ bits<1> GLC; ++ bits<1> ADDR64; ++ bits<1> LDS; ++ bits<8> VADDR; ++ bits<5> SRSRC; ++ bits<1> SLC; ++ bits<1> TFE; ++ bits<8> SOFFSET; ++ ++ let Inst{11-0} = OFFSET; ++ let Inst{12} = OFFEN; ++ let Inst{13} = IDXEN; ++ let Inst{14} = GLC; ++ let Inst{15} = ADDR64; ++ let Inst{16} = LDS; ++ let Inst{24-18} = op; ++ let Inst{31-26} = 0x38; //encoding ++ let Inst{39-32} = VADDR; ++ let Inst{47-40} = VDATA; ++ let Inst{52-48} = SRSRC; ++ let Inst{54} = SLC; ++ let Inst{55} = TFE; ++ let Inst{63-56} = SOFFSET; ++ let EncodingType = 4; //SIInstrEncodingType::MUBUF ++ ++ let VM_CNT = 1; ++ let EXP_CNT = 1; ++ ++ let neverHasSideEffects = 1; ++} ++ ++} // End Uses = [EXEC] ++ ++class SMRD op, dag outs, dag ins, string asm, list pattern> : ++ Enc32 { ++ ++ bits<7> SDST; ++ 
bits<15> PTR; ++ bits<8> OFFSET = PTR{7-0}; ++ bits<1> IMM = PTR{8}; ++ bits<6> SBASE = PTR{14-9}; ++ ++ let Inst{7-0} = OFFSET; ++ let Inst{8} = IMM; ++ let Inst{14-9} = SBASE; ++ let Inst{21-15} = SDST; ++ let Inst{26-22} = op; ++ let Inst{31-27} = 0x18; //encoding ++ let EncodingType = 5; //SIInstrEncodingType::SMRD ++ ++ let LGKM_CNT = 1; ++} ++ ++class SOP1 op, dag outs, dag ins, string asm, list pattern> : ++ Enc32 { ++ ++ bits<7> SDST; ++ bits<8> SSRC0; ++ ++ let Inst{7-0} = SSRC0; ++ let Inst{15-8} = op; ++ let Inst{22-16} = SDST; ++ let Inst{31-23} = 0x17d; //encoding; ++ let EncodingType = 6; //SIInstrEncodingType::SOP1 ++ ++ let mayLoad = 0; ++ let mayStore = 0; ++ let hasSideEffects = 0; ++} ++ ++class SOP2 op, dag outs, dag ins, string asm, list pattern> : ++ Enc32 { ++ ++ bits<7> SDST; ++ bits<8> SSRC0; ++ bits<8> SSRC1; ++ ++ let Inst{7-0} = SSRC0; ++ let Inst{15-8} = SSRC1; ++ let Inst{22-16} = SDST; ++ let Inst{29-23} = op; ++ let Inst{31-30} = 0x2; // encoding ++ let EncodingType = 7; // SIInstrEncodingType::SOP2 ++ ++ let mayLoad = 0; ++ let mayStore = 0; ++ let hasSideEffects = 0; ++} ++ ++class SOPC op, dag outs, dag ins, string asm, list pattern> : ++ Enc32 { ++ ++ bits<8> SSRC0; ++ bits<8> SSRC1; ++ ++ let Inst{7-0} = SSRC0; ++ let Inst{15-8} = SSRC1; ++ let Inst{22-16} = op; ++ let Inst{31-23} = 0x17e; ++ let EncodingType = 8; // SIInstrEncodingType::SOPC ++ ++ let DisableEncoding = "$dst"; ++ let mayLoad = 0; ++ let mayStore = 0; ++ let hasSideEffects = 0; ++} ++ ++class SOPK op, dag outs, dag ins, string asm, list pattern> : ++ Enc32 { ++ ++ bits <7> SDST; ++ bits <16> SIMM16; ++ ++ let Inst{15-0} = SIMM16; ++ let Inst{22-16} = SDST; ++ let Inst{27-23} = op; ++ let Inst{31-28} = 0xb; //encoding ++ let EncodingType = 9; // SIInstrEncodingType::SOPK ++ ++ let mayLoad = 0; ++ let mayStore = 0; ++ let hasSideEffects = 0; ++} ++ ++class SOPP op, dag ins, string asm, list pattern> : Enc32 < ++ (outs), ++ ins, ++ asm, ++ pattern > { ++ ++ bits 
<16> SIMM16; ++ ++ let Inst{15-0} = SIMM16; ++ let Inst{22-16} = op; ++ let Inst{31-23} = 0x17f; // encoding ++ let EncodingType = 10; // SIInstrEncodingType::SOPP ++ ++ let mayLoad = 0; ++ let mayStore = 0; ++ let hasSideEffects = 0; ++} ++ ++let Uses = [EXEC] in { ++ ++class VINTRP op, dag outs, dag ins, string asm, list pattern> : ++ Enc32 { ++ ++ bits<8> VDST; ++ bits<8> VSRC; ++ bits<2> ATTRCHAN; ++ bits<6> ATTR; ++ ++ let Inst{7-0} = VSRC; ++ let Inst{9-8} = ATTRCHAN; ++ let Inst{15-10} = ATTR; ++ let Inst{17-16} = op; ++ let Inst{25-18} = VDST; ++ let Inst{31-26} = 0x32; // encoding ++ let EncodingType = 11; // SIInstrEncodingType::VINTRP ++ ++ let neverHasSideEffects = 1; ++ let mayLoad = 1; ++ let mayStore = 0; ++} ++ ++class VOP1 op, dag outs, dag ins, string asm, list pattern> : ++ Enc32 { ++ ++ bits<8> VDST; ++ bits<9> SRC0; ++ ++ let Inst{8-0} = SRC0; ++ let Inst{16-9} = op; ++ let Inst{24-17} = VDST; ++ let Inst{31-25} = 0x3f; //encoding ++ ++ let EncodingType = 12; // SIInstrEncodingType::VOP1 ++ let PostEncoderMethod = "VOPPostEncode"; ++ ++ let mayLoad = 0; ++ let mayStore = 0; ++ let hasSideEffects = 0; ++} ++ ++class VOP2 op, dag outs, dag ins, string asm, list pattern> : ++ Enc32 { ++ ++ bits<8> VDST; ++ bits<9> SRC0; ++ bits<8> VSRC1; ++ ++ let Inst{8-0} = SRC0; ++ let Inst{16-9} = VSRC1; ++ let Inst{24-17} = VDST; ++ let Inst{30-25} = op; ++ let Inst{31} = 0x0; //encoding ++ ++ let EncodingType = 13; // SIInstrEncodingType::VOP2 ++ let PostEncoderMethod = "VOPPostEncode"; ++ ++ let mayLoad = 0; ++ let mayStore = 0; ++ let hasSideEffects = 0; ++} ++ ++class VOP3 op, dag outs, dag ins, string asm, list pattern> : ++ Enc64 { ++ ++ bits<8> VDST; ++ bits<9> SRC0; ++ bits<9> SRC1; ++ bits<9> SRC2; ++ bits<3> ABS; ++ bits<1> CLAMP; ++ bits<2> OMOD; ++ bits<3> NEG; ++ ++ let Inst{7-0} = VDST; ++ let Inst{10-8} = ABS; ++ let Inst{11} = CLAMP; ++ let Inst{25-17} = op; ++ let Inst{31-26} = 0x34; //encoding ++ let Inst{40-32} = SRC0; ++ let Inst{49-41} = 
SRC1; ++ let Inst{58-50} = SRC2; ++ let Inst{60-59} = OMOD; ++ let Inst{63-61} = NEG; ++ ++ let EncodingType = 14; // SIInstrEncodingType::VOP3 ++ let PostEncoderMethod = "VOPPostEncode"; ++ ++ let mayLoad = 0; ++ let mayStore = 0; ++ let hasSideEffects = 0; ++} ++ ++class VOP3b op, dag outs, dag ins, string asm, list pattern> : ++ Enc64 { ++ ++ bits<8> VDST; ++ bits<9> SRC0; ++ bits<9> SRC1; ++ bits<9> SRC2; ++ bits<7> SDST; ++ bits<2> OMOD; ++ bits<3> NEG; ++ ++ let Inst{7-0} = VDST; ++ let Inst{14-8} = SDST; ++ let Inst{25-17} = op; ++ let Inst{31-26} = 0x34; //encoding ++ let Inst{40-32} = SRC0; ++ let Inst{49-41} = SRC1; ++ let Inst{58-50} = SRC2; ++ let Inst{60-59} = OMOD; ++ let Inst{63-61} = NEG; ++ ++ let EncodingType = 14; // SIInstrEncodingType::VOP3 ++ let PostEncoderMethod = "VOPPostEncode"; ++ ++ let mayLoad = 0; ++ let mayStore = 0; ++ let hasSideEffects = 0; ++} ++ ++class VOPC op, dag ins, string asm, list pattern> : ++ Enc32 <(outs VCCReg:$dst), ins, asm, pattern> { ++ ++ bits<9> SRC0; ++ bits<8> VSRC1; ++ ++ let Inst{8-0} = SRC0; ++ let Inst{16-9} = VSRC1; ++ let Inst{24-17} = op; ++ let Inst{31-25} = 0x3e; ++ ++ let EncodingType = 15; //SIInstrEncodingType::VOPC ++ let PostEncoderMethod = "VOPPostEncode"; ++ let DisableEncoding = "$dst"; ++ let mayLoad = 0; ++ let mayStore = 0; ++ let hasSideEffects = 0; ++} ++ ++} // End Uses = [EXEC] ++ ++class MIMG_Load_Helper op, string asm> : MIMG < ++ op, ++ (outs VReg_128:$vdata), ++ (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, ++ i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_128:$vaddr, ++ GPR4Align:$srsrc, GPR4Align:$ssamp), ++ asm, ++ []> { ++ let mayLoad = 1; ++ let mayStore = 0; ++} ++ ++class MUBUF_Load_Helper op, string asm, RegisterClass regClass> : MUBUF < ++ op, ++ (outs regClass:$dst), ++ (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, ++ i1imm:$lds, VReg_32:$vaddr, GPR4Align:$srsrc, i1imm:$slc, ++ i1imm:$tfe, SReg_32:$soffset), ++ asm, ++ []> { 
++ let mayLoad = 1; ++ let mayStore = 0; ++} ++ ++class MTBUF_Load_Helper op, string asm, RegisterClass regClass> : MTBUF < ++ op, ++ (outs regClass:$dst), ++ (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, ++ i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, GPR4Align:$srsrc, ++ i1imm:$slc, i1imm:$tfe, SReg_32:$soffset), ++ asm, ++ []> { ++ let mayLoad = 1; ++ let mayStore = 0; ++} ++ ++class MTBUF_Store_Helper op, string asm, RegisterClass regClass> : MTBUF < ++ op, ++ (outs), ++ (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, ++ i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, ++ GPR4Align:$srsrc, i1imm:$slc, i1imm:$tfe, SReg_32:$soffset), ++ asm, ++ []> { ++ let mayStore = 1; ++ let mayLoad = 0; ++} ++ ++multiclass SMRD_Helper op, string asm, RegisterClass dstClass, ++ ValueType vt> { ++ def _IMM : SMRD < ++ op, ++ (outs dstClass:$dst), ++ (ins SMRDmemri:$src0), ++ asm, ++ [(set (vt dstClass:$dst), (constant_load ADDR_Offset8:$src0))] ++ >; ++ ++ def _SGPR : SMRD < ++ op, ++ (outs dstClass:$dst), ++ (ins SMRDmemrr:$src0), ++ asm, ++ [(set (vt dstClass:$dst), (constant_load ADDR_Reg:$src0))] ++ >; ++} ++ ++multiclass SMRD_32 op, string asm, RegisterClass dstClass> { ++ defm _F32 : SMRD_Helper ; ++ defm _I32 : SMRD_Helper ; ++} ++ ++include "SIInstrFormats.td" ++include "SIInstructions.td" +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInstructions.td llvm-r600/lib/Target/R600/SIInstructions.td +--- llvm-3.2.src/lib/Target/R600/SIInstructions.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SIInstructions.td 2013-01-25 19:43:57.480049720 +0100 +@@ -0,0 +1,1357 @@ ++//===-- SIInstructions.td - SI Instruction Defintions ---------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++// This file was originally auto-generated from a GPU register header file and ++// all the instruction definitions were originally commented out. Instructions ++// that are not yet supported remain commented out. ++//===----------------------------------------------------------------------===// ++ ++def isSI : Predicate<"Subtarget.device()" ++ "->getGeneration() == AMDGPUDeviceInfo::HD7XXX">; ++ ++let Predicates = [isSI] in { ++ ++let neverHasSideEffects = 1 in { ++def S_MOV_B32 : SOP1_32 <0x00000003, "S_MOV_B32", []>; ++def S_MOV_B64 : SOP1_64 <0x00000004, "S_MOV_B64", []>; ++def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>; ++def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>; ++def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", []>; ++def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>; ++def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>; ++def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>; ++def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", []>; ++def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>; ++} // End neverHasSideEffects = 1 ++////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>; ++////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>; ++////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>; ++////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>; ++////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>; ++////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>; ++////def S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>; ++////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>; ++//def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", []>; ++//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>; ++def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>; ++//def S_FLBIT_I32_I64 : SOP1_32 
<0x00000018, "S_FLBIT_I32_I64", []>; ++//def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", []>; ++//def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", []>; ++////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>; ++////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>; ++////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>; ++////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>; ++def S_GETPC_B64 : SOP1_64 <0x0000001f, "S_GETPC_B64", []>; ++def S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>; ++def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>; ++def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>; ++ ++let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC] in { ++ ++def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "S_AND_SAVEEXEC_B64", []>; ++def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "S_OR_SAVEEXEC_B64", []>; ++def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "S_XOR_SAVEEXEC_B64", []>; ++def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "S_ANDN2_SAVEEXEC_B64", []>; ++def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "S_ORN2_SAVEEXEC_B64", []>; ++def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "S_NAND_SAVEEXEC_B64", []>; ++def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "S_NOR_SAVEEXEC_B64", []>; ++def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "S_XNOR_SAVEEXEC_B64", []>; ++ ++} // End hasSideEffects = 1 ++ ++def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "S_QUADMASK_B32", []>; ++def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "S_QUADMASK_B64", []>; ++def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "S_MOVRELS_B32", []>; ++def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "S_MOVRELS_B64", []>; ++def S_MOVRELD_B32 : SOP1_32 <0x00000030, "S_MOVRELD_B32", []>; ++def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>; ++//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "S_CBRANCH_JOIN", []>; ++def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>; ++def S_ABS_I32 : SOP1_32 <0x00000034, 
"S_ABS_I32", []>; ++def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>; ++def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>; ++def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>; ++ ++/* ++This instruction is disabled for now until we can figure out how to teach ++the instruction selector to correctly use the S_CMP* vs V_CMP* ++instructions. ++ ++When this instruction is enabled the code generator sometimes produces this ++invalid sequence: ++ ++SCC = S_CMPK_EQ_I32 SGPR0, imm ++VCC = COPY SCC ++VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 ++ ++def S_CMPK_EQ_I32 : SOPK < ++ 0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1), ++ "S_CMPK_EQ_I32", ++ [(set SCCReg:$dst, (setcc SReg_32:$src0, imm:$src1, SETEQ))] ++>; ++*/ ++ ++def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>; ++def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>; ++def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>; ++def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "S_CMPK_LT_I32", []>; ++def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "S_CMPK_LE_I32", []>; ++def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "S_CMPK_EQ_U32", []>; ++def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "S_CMPK_LG_U32", []>; ++def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>; ++def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>; ++def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>; ++def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>; ++def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>; ++def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>; ++//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>; ++def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>; ++def S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>; ++def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>; ++//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>; ++//def EXP : EXP_ <0x00000000, "EXP", []>; ++ ++defm 
V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32", []>; ++defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", []>; ++def : Pat < ++ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_LT)), ++ (V_CMP_LT_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++>; ++defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", []>; ++def : Pat < ++ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_EQ)), ++ (V_CMP_EQ_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++>; ++defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", []>; ++def : Pat < ++ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_LE)), ++ (V_CMP_LE_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++>; ++defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", []>; ++def : Pat < ++ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_GT)), ++ (V_CMP_GT_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++>; ++defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32", []>; ++def : Pat < ++ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_NE)), ++ (V_CMP_LG_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++>; ++defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", []>; ++def : Pat < ++ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_GE)), ++ (V_CMP_GE_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++>; ++defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", []>; ++defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", []>; ++defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32", []>; ++defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32", []>; ++defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32", []>; ++defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32", []>; ++defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", []>; ++def : Pat < ++ (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_NE)), ++ (V_CMP_NEQ_F32_e64 AllReg_32:$src0, VReg_32:$src1) ++>; ++defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32", []>; ++defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32", []>; ++ ++//Side effect is 
writing to EXEC ++let hasSideEffects = 1 in { ++ ++defm V_CMPX_F_F32 : VOPC_32 <0x00000010, "V_CMPX_F_F32", []>; ++defm V_CMPX_LT_F32 : VOPC_32 <0x00000011, "V_CMPX_LT_F32", []>; ++defm V_CMPX_EQ_F32 : VOPC_32 <0x00000012, "V_CMPX_EQ_F32", []>; ++defm V_CMPX_LE_F32 : VOPC_32 <0x00000013, "V_CMPX_LE_F32", []>; ++defm V_CMPX_GT_F32 : VOPC_32 <0x00000014, "V_CMPX_GT_F32", []>; ++defm V_CMPX_LG_F32 : VOPC_32 <0x00000015, "V_CMPX_LG_F32", []>; ++defm V_CMPX_GE_F32 : VOPC_32 <0x00000016, "V_CMPX_GE_F32", []>; ++defm V_CMPX_O_F32 : VOPC_32 <0x00000017, "V_CMPX_O_F32", []>; ++defm V_CMPX_U_F32 : VOPC_32 <0x00000018, "V_CMPX_U_F32", []>; ++defm V_CMPX_NGE_F32 : VOPC_32 <0x00000019, "V_CMPX_NGE_F32", []>; ++defm V_CMPX_NLG_F32 : VOPC_32 <0x0000001a, "V_CMPX_NLG_F32", []>; ++defm V_CMPX_NGT_F32 : VOPC_32 <0x0000001b, "V_CMPX_NGT_F32", []>; ++defm V_CMPX_NLE_F32 : VOPC_32 <0x0000001c, "V_CMPX_NLE_F32", []>; ++defm V_CMPX_NEQ_F32 : VOPC_32 <0x0000001d, "V_CMPX_NEQ_F32", []>; ++defm V_CMPX_NLT_F32 : VOPC_32 <0x0000001e, "V_CMPX_NLT_F32", []>; ++defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32", []>; ++ ++} // End hasSideEffects = 1 ++ ++defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64", []>; ++defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", []>; ++defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", []>; ++defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", []>; ++defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", []>; ++defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64", []>; ++defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", []>; ++defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64", []>; ++defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64", []>; ++defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64", []>; ++defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64", []>; ++defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64", []>; ++defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64", []>; ++defm 
V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", []>; ++defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64", []>; ++defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64", []>; ++ ++//Side effect is writing to EXEC ++let hasSideEffects = 1 in { ++ ++defm V_CMPX_F_F64 : VOPC_64 <0x00000030, "V_CMPX_F_F64", []>; ++defm V_CMPX_LT_F64 : VOPC_64 <0x00000031, "V_CMPX_LT_F64", []>; ++defm V_CMPX_EQ_F64 : VOPC_64 <0x00000032, "V_CMPX_EQ_F64", []>; ++defm V_CMPX_LE_F64 : VOPC_64 <0x00000033, "V_CMPX_LE_F64", []>; ++defm V_CMPX_GT_F64 : VOPC_64 <0x00000034, "V_CMPX_GT_F64", []>; ++defm V_CMPX_LG_F64 : VOPC_64 <0x00000035, "V_CMPX_LG_F64", []>; ++defm V_CMPX_GE_F64 : VOPC_64 <0x00000036, "V_CMPX_GE_F64", []>; ++defm V_CMPX_O_F64 : VOPC_64 <0x00000037, "V_CMPX_O_F64", []>; ++defm V_CMPX_U_F64 : VOPC_64 <0x00000038, "V_CMPX_U_F64", []>; ++defm V_CMPX_NGE_F64 : VOPC_64 <0x00000039, "V_CMPX_NGE_F64", []>; ++defm V_CMPX_NLG_F64 : VOPC_64 <0x0000003a, "V_CMPX_NLG_F64", []>; ++defm V_CMPX_NGT_F64 : VOPC_64 <0x0000003b, "V_CMPX_NGT_F64", []>; ++defm V_CMPX_NLE_F64 : VOPC_64 <0x0000003c, "V_CMPX_NLE_F64", []>; ++defm V_CMPX_NEQ_F64 : VOPC_64 <0x0000003d, "V_CMPX_NEQ_F64", []>; ++defm V_CMPX_NLT_F64 : VOPC_64 <0x0000003e, "V_CMPX_NLT_F64", []>; ++defm V_CMPX_TRU_F64 : VOPC_64 <0x0000003f, "V_CMPX_TRU_F64", []>; ++ ++} // End hasSideEffects = 1 ++ ++defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32", []>; ++defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32", []>; ++defm V_CMPS_EQ_F32 : VOPC_32 <0x00000042, "V_CMPS_EQ_F32", []>; ++defm V_CMPS_LE_F32 : VOPC_32 <0x00000043, "V_CMPS_LE_F32", []>; ++defm V_CMPS_GT_F32 : VOPC_32 <0x00000044, "V_CMPS_GT_F32", []>; ++defm V_CMPS_LG_F32 : VOPC_32 <0x00000045, "V_CMPS_LG_F32", []>; ++defm V_CMPS_GE_F32 : VOPC_32 <0x00000046, "V_CMPS_GE_F32", []>; ++defm V_CMPS_O_F32 : VOPC_32 <0x00000047, "V_CMPS_O_F32", []>; ++defm V_CMPS_U_F32 : VOPC_32 <0x00000048, "V_CMPS_U_F32", []>; ++defm V_CMPS_NGE_F32 : VOPC_32 <0x00000049, 
"V_CMPS_NGE_F32", []>; ++defm V_CMPS_NLG_F32 : VOPC_32 <0x0000004a, "V_CMPS_NLG_F32", []>; ++defm V_CMPS_NGT_F32 : VOPC_32 <0x0000004b, "V_CMPS_NGT_F32", []>; ++defm V_CMPS_NLE_F32 : VOPC_32 <0x0000004c, "V_CMPS_NLE_F32", []>; ++defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32", []>; ++defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32", []>; ++defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32", []>; ++defm V_CMPSX_F_F32 : VOPC_32 <0x00000050, "V_CMPSX_F_F32", []>; ++defm V_CMPSX_LT_F32 : VOPC_32 <0x00000051, "V_CMPSX_LT_F32", []>; ++defm V_CMPSX_EQ_F32 : VOPC_32 <0x00000052, "V_CMPSX_EQ_F32", []>; ++defm V_CMPSX_LE_F32 : VOPC_32 <0x00000053, "V_CMPSX_LE_F32", []>; ++defm V_CMPSX_GT_F32 : VOPC_32 <0x00000054, "V_CMPSX_GT_F32", []>; ++defm V_CMPSX_LG_F32 : VOPC_32 <0x00000055, "V_CMPSX_LG_F32", []>; ++defm V_CMPSX_GE_F32 : VOPC_32 <0x00000056, "V_CMPSX_GE_F32", []>; ++defm V_CMPSX_O_F32 : VOPC_32 <0x00000057, "V_CMPSX_O_F32", []>; ++defm V_CMPSX_U_F32 : VOPC_32 <0x00000058, "V_CMPSX_U_F32", []>; ++defm V_CMPSX_NGE_F32 : VOPC_32 <0x00000059, "V_CMPSX_NGE_F32", []>; ++defm V_CMPSX_NLG_F32 : VOPC_32 <0x0000005a, "V_CMPSX_NLG_F32", []>; ++defm V_CMPSX_NGT_F32 : VOPC_32 <0x0000005b, "V_CMPSX_NGT_F32", []>; ++defm V_CMPSX_NLE_F32 : VOPC_32 <0x0000005c, "V_CMPSX_NLE_F32", []>; ++defm V_CMPSX_NEQ_F32 : VOPC_32 <0x0000005d, "V_CMPSX_NEQ_F32", []>; ++defm V_CMPSX_NLT_F32 : VOPC_32 <0x0000005e, "V_CMPSX_NLT_F32", []>; ++defm V_CMPSX_TRU_F32 : VOPC_32 <0x0000005f, "V_CMPSX_TRU_F32", []>; ++defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64", []>; ++defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64", []>; ++defm V_CMPS_EQ_F64 : VOPC_64 <0x00000062, "V_CMPS_EQ_F64", []>; ++defm V_CMPS_LE_F64 : VOPC_64 <0x00000063, "V_CMPS_LE_F64", []>; ++defm V_CMPS_GT_F64 : VOPC_64 <0x00000064, "V_CMPS_GT_F64", []>; ++defm V_CMPS_LG_F64 : VOPC_64 <0x00000065, "V_CMPS_LG_F64", []>; ++defm V_CMPS_GE_F64 : VOPC_64 <0x00000066, "V_CMPS_GE_F64", []>; ++defm 
V_CMPS_O_F64 : VOPC_64 <0x00000067, "V_CMPS_O_F64", []>; ++defm V_CMPS_U_F64 : VOPC_64 <0x00000068, "V_CMPS_U_F64", []>; ++defm V_CMPS_NGE_F64 : VOPC_64 <0x00000069, "V_CMPS_NGE_F64", []>; ++defm V_CMPS_NLG_F64 : VOPC_64 <0x0000006a, "V_CMPS_NLG_F64", []>; ++defm V_CMPS_NGT_F64 : VOPC_64 <0x0000006b, "V_CMPS_NGT_F64", []>; ++defm V_CMPS_NLE_F64 : VOPC_64 <0x0000006c, "V_CMPS_NLE_F64", []>; ++defm V_CMPS_NEQ_F64 : VOPC_64 <0x0000006d, "V_CMPS_NEQ_F64", []>; ++defm V_CMPS_NLT_F64 : VOPC_64 <0x0000006e, "V_CMPS_NLT_F64", []>; ++defm V_CMPS_TRU_F64 : VOPC_64 <0x0000006f, "V_CMPS_TRU_F64", []>; ++defm V_CMPSX_F_F64 : VOPC_64 <0x00000070, "V_CMPSX_F_F64", []>; ++defm V_CMPSX_LT_F64 : VOPC_64 <0x00000071, "V_CMPSX_LT_F64", []>; ++defm V_CMPSX_EQ_F64 : VOPC_64 <0x00000072, "V_CMPSX_EQ_F64", []>; ++defm V_CMPSX_LE_F64 : VOPC_64 <0x00000073, "V_CMPSX_LE_F64", []>; ++defm V_CMPSX_GT_F64 : VOPC_64 <0x00000074, "V_CMPSX_GT_F64", []>; ++defm V_CMPSX_LG_F64 : VOPC_64 <0x00000075, "V_CMPSX_LG_F64", []>; ++defm V_CMPSX_GE_F64 : VOPC_64 <0x00000076, "V_CMPSX_GE_F64", []>; ++defm V_CMPSX_O_F64 : VOPC_64 <0x00000077, "V_CMPSX_O_F64", []>; ++defm V_CMPSX_U_F64 : VOPC_64 <0x00000078, "V_CMPSX_U_F64", []>; ++defm V_CMPSX_NGE_F64 : VOPC_64 <0x00000079, "V_CMPSX_NGE_F64", []>; ++defm V_CMPSX_NLG_F64 : VOPC_64 <0x0000007a, "V_CMPSX_NLG_F64", []>; ++defm V_CMPSX_NGT_F64 : VOPC_64 <0x0000007b, "V_CMPSX_NGT_F64", []>; ++defm V_CMPSX_NLE_F64 : VOPC_64 <0x0000007c, "V_CMPSX_NLE_F64", []>; ++defm V_CMPSX_NEQ_F64 : VOPC_64 <0x0000007d, "V_CMPSX_NEQ_F64", []>; ++defm V_CMPSX_NLT_F64 : VOPC_64 <0x0000007e, "V_CMPSX_NLT_F64", []>; ++defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64", []>; ++defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32", []>; ++defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", []>; ++def : Pat < ++ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_LT)), ++ (V_CMP_LT_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++>; ++defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, 
"V_CMP_EQ_I32", []>; ++def : Pat < ++ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_EQ)), ++ (V_CMP_EQ_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++>; ++defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", []>; ++def : Pat < ++ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_LE)), ++ (V_CMP_LE_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++>; ++defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", []>; ++def : Pat < ++ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_GT)), ++ (V_CMP_GT_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++>; ++defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", []>; ++def : Pat < ++ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_NE)), ++ (V_CMP_NE_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++>; ++defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", []>; ++def : Pat < ++ (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_GE)), ++ (V_CMP_GE_I32_e64 AllReg_32:$src0, VReg_32:$src1) ++>; ++defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32", []>; ++ ++let hasSideEffects = 1 in { ++ ++defm V_CMPX_F_I32 : VOPC_32 <0x00000090, "V_CMPX_F_I32", []>; ++defm V_CMPX_LT_I32 : VOPC_32 <0x00000091, "V_CMPX_LT_I32", []>; ++defm V_CMPX_EQ_I32 : VOPC_32 <0x00000092, "V_CMPX_EQ_I32", []>; ++defm V_CMPX_LE_I32 : VOPC_32 <0x00000093, "V_CMPX_LE_I32", []>; ++defm V_CMPX_GT_I32 : VOPC_32 <0x00000094, "V_CMPX_GT_I32", []>; ++defm V_CMPX_NE_I32 : VOPC_32 <0x00000095, "V_CMPX_NE_I32", []>; ++defm V_CMPX_GE_I32 : VOPC_32 <0x00000096, "V_CMPX_GE_I32", []>; ++defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32", []>; ++ ++} // End hasSideEffects ++ ++defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64", []>; ++defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", []>; ++defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64", []>; ++defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64", []>; ++defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64", []>; ++defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", []>; ++defm 
V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", []>; ++defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64", []>; ++ ++let hasSideEffects = 1 in { ++ ++defm V_CMPX_F_I64 : VOPC_64 <0x000000b0, "V_CMPX_F_I64", []>; ++defm V_CMPX_LT_I64 : VOPC_64 <0x000000b1, "V_CMPX_LT_I64", []>; ++defm V_CMPX_EQ_I64 : VOPC_64 <0x000000b2, "V_CMPX_EQ_I64", []>; ++defm V_CMPX_LE_I64 : VOPC_64 <0x000000b3, "V_CMPX_LE_I64", []>; ++defm V_CMPX_GT_I64 : VOPC_64 <0x000000b4, "V_CMPX_GT_I64", []>; ++defm V_CMPX_NE_I64 : VOPC_64 <0x000000b5, "V_CMPX_NE_I64", []>; ++defm V_CMPX_GE_I64 : VOPC_64 <0x000000b6, "V_CMPX_GE_I64", []>; ++defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64", []>; ++ ++} // End hasSideEffects ++ ++defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32", []>; ++defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", []>; ++defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32", []>; ++defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32", []>; ++defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32", []>; ++defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", []>; ++defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", []>; ++defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32", []>; ++ ++let hasSideEffects = 1 in { ++ ++defm V_CMPX_F_U32 : VOPC_32 <0x000000d0, "V_CMPX_F_U32", []>; ++defm V_CMPX_LT_U32 : VOPC_32 <0x000000d1, "V_CMPX_LT_U32", []>; ++defm V_CMPX_EQ_U32 : VOPC_32 <0x000000d2, "V_CMPX_EQ_U32", []>; ++defm V_CMPX_LE_U32 : VOPC_32 <0x000000d3, "V_CMPX_LE_U32", []>; ++defm V_CMPX_GT_U32 : VOPC_32 <0x000000d4, "V_CMPX_GT_U32", []>; ++defm V_CMPX_NE_U32 : VOPC_32 <0x000000d5, "V_CMPX_NE_U32", []>; ++defm V_CMPX_GE_U32 : VOPC_32 <0x000000d6, "V_CMPX_GE_U32", []>; ++defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32", []>; ++ ++} // End hasSideEffects ++ ++defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64", []>; ++defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", []>; ++defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64", 
[]>; ++defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64", []>; ++defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64", []>; ++defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", []>; ++defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", []>; ++defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64", []>; ++defm V_CMPX_F_U64 : VOPC_64 <0x000000f0, "V_CMPX_F_U64", []>; ++defm V_CMPX_LT_U64 : VOPC_64 <0x000000f1, "V_CMPX_LT_U64", []>; ++defm V_CMPX_EQ_U64 : VOPC_64 <0x000000f2, "V_CMPX_EQ_U64", []>; ++defm V_CMPX_LE_U64 : VOPC_64 <0x000000f3, "V_CMPX_LE_U64", []>; ++defm V_CMPX_GT_U64 : VOPC_64 <0x000000f4, "V_CMPX_GT_U64", []>; ++defm V_CMPX_NE_U64 : VOPC_64 <0x000000f5, "V_CMPX_NE_U64", []>; ++defm V_CMPX_GE_U64 : VOPC_64 <0x000000f6, "V_CMPX_GE_U64", []>; ++defm V_CMPX_T_U64 : VOPC_64 <0x000000f7, "V_CMPX_T_U64", []>; ++defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32", []>; ++defm V_CMPX_CLASS_F32 : VOPC_32 <0x00000098, "V_CMPX_CLASS_F32", []>; ++defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64", []>; ++defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64", []>; ++//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>; ++//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>; ++//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>; ++def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>; ++//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>; ++//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>; ++//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>; ++//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>; ++//def BUFFER_LOAD_UBYTE : MUBUF_ <0x00000008, "BUFFER_LOAD_UBYTE", []>; ++//def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>; ++//def BUFFER_LOAD_USHORT : 
MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>; ++//def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>; ++//def BUFFER_LOAD_DWORD : MUBUF_ <0x0000000c, "BUFFER_LOAD_DWORD", []>; ++//def BUFFER_LOAD_DWORDX2 : MUBUF_DWORDX2 <0x0000000d, "BUFFER_LOAD_DWORDX2", []>; ++//def BUFFER_LOAD_DWORDX4 : MUBUF_DWORDX4 <0x0000000e, "BUFFER_LOAD_DWORDX4", []>; ++//def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>; ++//def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>; ++//def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>; ++//def BUFFER_STORE_DWORDX2 : MUBUF_DWORDX2 <0x0000001d, "BUFFER_STORE_DWORDX2", []>; ++//def BUFFER_STORE_DWORDX4 : MUBUF_DWORDX4 <0x0000001e, "BUFFER_STORE_DWORDX4", []>; ++//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>; ++//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>; ++//def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>; ++//def BUFFER_ATOMIC_SUB : MUBUF_ <0x00000033, "BUFFER_ATOMIC_SUB", []>; ++//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "BUFFER_ATOMIC_RSUB", []>; ++//def BUFFER_ATOMIC_SMIN : MUBUF_ <0x00000035, "BUFFER_ATOMIC_SMIN", []>; ++//def BUFFER_ATOMIC_UMIN : MUBUF_ <0x00000036, "BUFFER_ATOMIC_UMIN", []>; ++//def BUFFER_ATOMIC_SMAX : MUBUF_ <0x00000037, "BUFFER_ATOMIC_SMAX", []>; ++//def BUFFER_ATOMIC_UMAX : MUBUF_ <0x00000038, "BUFFER_ATOMIC_UMAX", []>; ++//def BUFFER_ATOMIC_AND : MUBUF_ <0x00000039, "BUFFER_ATOMIC_AND", []>; ++//def BUFFER_ATOMIC_OR : MUBUF_ <0x0000003a, "BUFFER_ATOMIC_OR", []>; ++//def BUFFER_ATOMIC_XOR : MUBUF_ <0x0000003b, "BUFFER_ATOMIC_XOR", []>; ++//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "BUFFER_ATOMIC_INC", []>; ++//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "BUFFER_ATOMIC_DEC", []>; ++//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "BUFFER_ATOMIC_FCMPSWAP", []>; ++//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "BUFFER_ATOMIC_FMIN", []>; ++//def 
BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "BUFFER_ATOMIC_FMAX", []>; ++//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "BUFFER_ATOMIC_SWAP_X2", []>; ++//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "BUFFER_ATOMIC_CMPSWAP_X2", []>; ++//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "BUFFER_ATOMIC_ADD_X2", []>; ++//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "BUFFER_ATOMIC_SUB_X2", []>; ++//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "BUFFER_ATOMIC_RSUB_X2", []>; ++//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "BUFFER_ATOMIC_SMIN_X2", []>; ++//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "BUFFER_ATOMIC_UMIN_X2", []>; ++//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "BUFFER_ATOMIC_SMAX_X2", []>; ++//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "BUFFER_ATOMIC_UMAX_X2", []>; ++//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "BUFFER_ATOMIC_AND_X2", []>; ++//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "BUFFER_ATOMIC_OR_X2", []>; ++//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "BUFFER_ATOMIC_XOR_X2", []>; ++//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "BUFFER_ATOMIC_INC_X2", []>; ++//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "BUFFER_ATOMIC_DEC_X2", []>; ++//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "BUFFER_ATOMIC_FCMPSWAP_X2", []>; ++//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "BUFFER_ATOMIC_FMIN_X2", []>; ++//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>; ++//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>; ++//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>; ++//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>; ++//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>; ++//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>; ++def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", 
VReg_128>; ++//def TBUFFER_STORE_FORMAT_X : MTBUF_ <0x00000004, "TBUFFER_STORE_FORMAT_X", []>; ++//def TBUFFER_STORE_FORMAT_XY : MTBUF_ <0x00000005, "TBUFFER_STORE_FORMAT_XY", []>; ++//def TBUFFER_STORE_FORMAT_XYZ : MTBUF_ <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", []>; ++//def TBUFFER_STORE_FORMAT_XYZW : MTBUF_ <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", []>; ++ ++defm S_LOAD_DWORD : SMRD_32 <0x00000000, "S_LOAD_DWORD", SReg_32>; ++ ++//def S_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000001, "S_LOAD_DWORDX2", []>; ++defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128, v4i32>; ++defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256, v8i32>; ++//def S_LOAD_DWORDX16 : SMRD_DWORDX16 <0x00000004, "S_LOAD_DWORDX16", []>; ++//def S_BUFFER_LOAD_DWORD : SMRD_ <0x00000008, "S_BUFFER_LOAD_DWORD", []>; ++//def S_BUFFER_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000009, "S_BUFFER_LOAD_DWORDX2", []>; ++//def S_BUFFER_LOAD_DWORDX4 : SMRD_DWORDX4 <0x0000000a, "S_BUFFER_LOAD_DWORDX4", []>; ++//def S_BUFFER_LOAD_DWORDX8 : SMRD_DWORDX8 <0x0000000b, "S_BUFFER_LOAD_DWORDX8", []>; ++//def S_BUFFER_LOAD_DWORDX16 : SMRD_DWORDX16 <0x0000000c, "S_BUFFER_LOAD_DWORDX16", []>; ++ ++//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>; ++//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>; ++//def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>; ++//def IMAGE_LOAD_MIP : MIMG_NoPattern_ <"IMAGE_LOAD_MIP", 0x00000001>; ++//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>; ++//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>; ++//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>; ++//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK_SGN", 0x00000005>; ++//def IMAGE_STORE : MIMG_NoPattern_ <"IMAGE_STORE", 0x00000008>; ++//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>; ++//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>; ++//def 
IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>; ++//def IMAGE_GET_RESINFO : MIMG_NoPattern_ <"IMAGE_GET_RESINFO", 0x0000000e>; ++//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>; ++//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>; ++//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>; ++//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_SUB", 0x00000012>; ++//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_RSUB", 0x00000013>; ++//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMIN", 0x00000014>; ++//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMIN", 0x00000015>; ++//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMAX", 0x00000016>; ++//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMAX", 0x00000017>; ++//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"IMAGE_ATOMIC_AND", 0x00000018>; ++//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"IMAGE_ATOMIC_OR", 0x00000019>; ++//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"IMAGE_ATOMIC_XOR", 0x0000001a>; ++//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"IMAGE_ATOMIC_INC", 0x0000001b>; ++//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"IMAGE_ATOMIC_DEC", 0x0000001c>; ++//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>; ++//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>; ++//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>; ++def IMAGE_SAMPLE : MIMG_Load_Helper <0x00000020, "IMAGE_SAMPLE">; ++//def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>; ++def IMAGE_SAMPLE_D : MIMG_Load_Helper <0x00000022, "IMAGE_SAMPLE_D">; ++//def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>; ++def IMAGE_SAMPLE_L : MIMG_Load_Helper <0x00000024, "IMAGE_SAMPLE_L">; ++def IMAGE_SAMPLE_B : MIMG_Load_Helper <0x00000025, "IMAGE_SAMPLE_B">; ++//def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 
0x00000026>; ++//def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>; ++//def IMAGE_SAMPLE_C : MIMG_NoPattern_ <"IMAGE_SAMPLE_C", 0x00000028>; ++//def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>; ++//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D", 0x0000002a>; ++//def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>; ++//def IMAGE_SAMPLE_C_L : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L", 0x0000002c>; ++//def IMAGE_SAMPLE_C_B : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B", 0x0000002d>; ++//def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>; ++//def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>; ++//def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>; ++//def IMAGE_SAMPLE_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL_O", 0x00000031>; ++//def IMAGE_SAMPLE_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_O", 0x00000032>; ++//def IMAGE_SAMPLE_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL_O", 0x00000033>; ++//def IMAGE_SAMPLE_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_L_O", 0x00000034>; ++//def IMAGE_SAMPLE_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_O", 0x00000035>; ++//def IMAGE_SAMPLE_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL_O", 0x00000036>; ++//def IMAGE_SAMPLE_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ_O", 0x00000037>; ++//def IMAGE_SAMPLE_C_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_O", 0x00000038>; ++//def IMAGE_SAMPLE_C_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL_O", 0x00000039>; ++//def IMAGE_SAMPLE_C_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_O", 0x0000003a>; ++//def IMAGE_SAMPLE_C_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL_O", 0x0000003b>; ++//def IMAGE_SAMPLE_C_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L_O", 0x0000003c>; ++//def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>; ++//def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>; ++//def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 
0x0000003f>; ++//def IMAGE_GATHER4 : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4", 0x00000040>; ++//def IMAGE_GATHER4_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL", 0x00000041>; ++//def IMAGE_GATHER4_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L", 0x00000044>; ++//def IMAGE_GATHER4_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B", 0x00000045>; ++//def IMAGE_GATHER4_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL", 0x00000046>; ++//def IMAGE_GATHER4_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ", 0x00000047>; ++//def IMAGE_GATHER4_C : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C", 0x00000048>; ++//def IMAGE_GATHER4_C_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL", 0x00000049>; ++//def IMAGE_GATHER4_C_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L", 0x0000004c>; ++//def IMAGE_GATHER4_C_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B", 0x0000004d>; ++//def IMAGE_GATHER4_C_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL", 0x0000004e>; ++//def IMAGE_GATHER4_C_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ", 0x0000004f>; ++//def IMAGE_GATHER4_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_O", 0x00000050>; ++//def IMAGE_GATHER4_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL_O", 0x00000051>; ++//def IMAGE_GATHER4_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L_O", 0x00000054>; ++//def IMAGE_GATHER4_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_O", 0x00000055>; ++//def IMAGE_GATHER4_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL_O", 0x00000056>; ++//def IMAGE_GATHER4_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ_O", 0x00000057>; ++//def IMAGE_GATHER4_C_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_O", 0x00000058>; ++//def IMAGE_GATHER4_C_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL_O", 0x00000059>; ++//def IMAGE_GATHER4_C_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L_O", 0x0000005c>; ++//def IMAGE_GATHER4_C_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_O", 0x0000005d>; ++//def IMAGE_GATHER4_C_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL_O", 
0x0000005e>; ++//def IMAGE_GATHER4_C_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ_O", 0x0000005f>; ++//def IMAGE_GET_LOD : MIMG_NoPattern_ <"IMAGE_GET_LOD", 0x00000060>; ++//def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>; ++//def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>; ++//def IMAGE_SAMPLE_C_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD", 0x0000006a>; ++//def IMAGE_SAMPLE_C_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL", 0x0000006b>; ++//def IMAGE_SAMPLE_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_O", 0x0000006c>; ++//def IMAGE_SAMPLE_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL_O", 0x0000006d>; ++//def IMAGE_SAMPLE_C_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_O", 0x0000006e>; ++//def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>; ++//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>; ++//def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>; ++//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>; ++ ++let neverHasSideEffects = 1 in { ++defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>; ++} // End neverHasSideEffects ++defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>; ++//defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>; ++//defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>; ++defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32", ++ [(set VReg_32:$dst, (sint_to_fp AllReg_32:$src0))] ++>; ++//defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>; ++//defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>; ++defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32", ++ [(set VReg_32:$dst, (fp_to_sint AllReg_32:$src0))] ++>; ++defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>; ++////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>; ++//defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", []>; ++//defm V_CVT_RPI_I32_F32 : VOP1_32 
<0x0000000c, "V_CVT_RPI_I32_F32", []>; ++//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>; ++//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>; ++//defm V_CVT_F32_F64 : VOP1_32 <0x0000000f, "V_CVT_F32_F64", []>; ++//defm V_CVT_F64_F32 : VOP1_64 <0x00000010, "V_CVT_F64_F32", []>; ++//defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>; ++//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>; ++//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>; ++//defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>; ++//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>; ++//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>; ++defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32", ++ [(set VReg_32:$dst, (AMDGPUfract AllReg_32:$src0))] ++>; ++defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", []>; ++defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", []>; ++defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32", ++ [(set VReg_32:$dst, (frint AllReg_32:$src0))] ++>; ++defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32", ++ [(set VReg_32:$dst, (ffloor AllReg_32:$src0))] ++>; ++defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32", ++ [(set VReg_32:$dst, (fexp2 AllReg_32:$src0))] ++>; ++defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>; ++defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", []>; ++defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>; ++defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>; ++defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", ++ [(set VReg_32:$dst, (fdiv FP_ONE, AllReg_32:$src0))] ++>; ++defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>; ++defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>; ++defm V_RSQ_LEGACY_F32 : VOP1_32 < ++ 0x0000002d, "V_RSQ_LEGACY_F32", ++ [(set VReg_32:$dst, (int_AMDGPU_rsq AllReg_32:$src0))] ++>; ++defm 
V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>; ++defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", []>; ++defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>; ++defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", []>; ++defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>; ++defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32", []>; ++defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64", []>; ++defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32", []>; ++defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32", []>; ++defm V_NOT_B32 : VOP1_32 <0x00000037, "V_NOT_B32", []>; ++defm V_BFREV_B32 : VOP1_32 <0x00000038, "V_BFREV_B32", []>; ++defm V_FFBH_U32 : VOP1_32 <0x00000039, "V_FFBH_U32", []>; ++defm V_FFBL_B32 : VOP1_32 <0x0000003a, "V_FFBL_B32", []>; ++defm V_FFBH_I32 : VOP1_32 <0x0000003b, "V_FFBH_I32", []>; ++//defm V_FREXP_EXP_I32_F64 : VOP1_32 <0x0000003c, "V_FREXP_EXP_I32_F64", []>; ++defm V_FREXP_MANT_F64 : VOP1_64 <0x0000003d, "V_FREXP_MANT_F64", []>; ++defm V_FRACT_F64 : VOP1_64 <0x0000003e, "V_FRACT_F64", []>; ++//defm V_FREXP_EXP_I32_F32 : VOP1_32 <0x0000003f, "V_FREXP_EXP_I32_F32", []>; ++defm V_FREXP_MANT_F32 : VOP1_32 <0x00000040, "V_FREXP_MANT_F32", []>; ++//def V_CLREXCP : VOP1_ <0x00000041, "V_CLREXCP", []>; ++defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>; ++defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>; ++defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>; ++ ++def V_INTERP_P1_F32 : VINTRP < ++ 0x00000000, ++ (outs VReg_32:$dst), ++ (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), ++ "V_INTERP_P1_F32", ++ []> { ++ let DisableEncoding = "$m0"; ++} ++ ++def V_INTERP_P2_F32 : VINTRP < ++ 0x00000001, ++ (outs VReg_32:$dst), ++ (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), ++ "V_INTERP_P2_F32", ++ []> { ++ ++ let Constraints = "$src0 = $dst"; ++ let DisableEncoding = "$src0,$m0"; ++ ++} ++ ++def V_INTERP_MOV_F32 : VINTRP < ++ 0x00000002, ++ 
(outs VReg_32:$dst), ++ (ins i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), ++ "V_INTERP_MOV_F32", ++ []> { ++ let VSRC = 0; ++ let DisableEncoding = "$m0"; ++} ++ ++//def S_NOP : SOPP_ <0x00000000, "S_NOP", []>; ++ ++let isTerminator = 1 in { ++ ++def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM", ++ [(IL_retflag)]> { ++ let SIMM16 = 0; ++ let isBarrier = 1; ++ let hasCtrlDep = 1; ++} ++ ++let isBranch = 1 in { ++def S_BRANCH : SOPP < ++ 0x00000002, (ins brtarget:$target), "S_BRANCH", ++ [(br bb:$target)]> { ++ let isBarrier = 1; ++} ++ ++let DisableEncoding = "$scc" in { ++def S_CBRANCH_SCC0 : SOPP < ++ 0x00000004, (ins brtarget:$target, SCCReg:$scc), ++ "S_CBRANCH_SCC0", [] ++>; ++def S_CBRANCH_SCC1 : SOPP < ++ 0x00000005, (ins brtarget:$target, SCCReg:$scc), ++ "S_CBRANCH_SCC1", ++ [] ++>; ++} // End DisableEncoding = "$scc" ++ ++def S_CBRANCH_VCCZ : SOPP < ++ 0x00000006, (ins brtarget:$target, VCCReg:$vcc), ++ "S_CBRANCH_VCCZ", ++ [] ++>; ++def S_CBRANCH_VCCNZ : SOPP < ++ 0x00000007, (ins brtarget:$target, VCCReg:$vcc), ++ "S_CBRANCH_VCCNZ", ++ [] ++>; ++ ++let DisableEncoding = "$exec" in { ++def S_CBRANCH_EXECZ : SOPP < ++ 0x00000008, (ins brtarget:$target, EXECReg:$exec), ++ "S_CBRANCH_EXECZ", ++ [] ++>; ++def S_CBRANCH_EXECNZ : SOPP < ++ 0x00000009, (ins brtarget:$target, EXECReg:$exec), ++ "S_CBRANCH_EXECNZ", ++ [] ++>; ++} // End DisableEncoding = "$exec" ++ ++ ++} // End isBranch = 1 ++} // End isTerminator = 1 ++ ++//def S_BARRIER : SOPP_ <0x0000000a, "S_BARRIER", []>; ++let hasSideEffects = 1 in { ++def S_WAITCNT : SOPP <0x0000000c, (ins i32imm:$simm16), "S_WAITCNT $simm16", ++ [] ++>; ++} // End hasSideEffects ++//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>; ++//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>; ++//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>; ++//def S_SENDMSG : SOPP_ <0x00000010, "S_SENDMSG", []>; ++//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>; ++//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>; ++//def 
S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>; ++//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>; ++//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>; ++//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>; ++ ++def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst), ++ (ins AllReg_32:$src0, VReg_32:$src1, VCCReg:$vcc), "V_CNDMASK_B32_e32", ++ [] ++>{ ++ let DisableEncoding = "$vcc"; ++} ++ ++def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst), ++ (ins VReg_32:$src0, VReg_32:$src1, SReg_1:$src2, InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), ++ "V_CNDMASK_B32_e64", ++ [(set (i32 VReg_32:$dst), (select SReg_1:$src2, VReg_32:$src1, VReg_32:$src0))] ++>; ++ ++//f32 pattern for V_CNDMASK_B32_e64 ++def : Pat < ++ (f32 (select SReg_1:$src2, VReg_32:$src1, VReg_32:$src0)), ++ (V_CNDMASK_B32_e64 VReg_32:$src0, VReg_32:$src1, SReg_1:$src2) ++>; ++ ++defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>; ++defm V_WRITELANE_B32 : VOP2_32 <0x00000002, "V_WRITELANE_B32", []>; ++ ++defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32", []>; ++def : Pat < ++ (f32 (fadd AllReg_32:$src0, VReg_32:$src1)), ++ (V_ADD_F32_e32 AllReg_32:$src0, VReg_32:$src1) ++>; ++ ++defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32", []>; ++def : Pat < ++ (f32 (fsub AllReg_32:$src0, VReg_32:$src1)), ++ (V_SUB_F32_e32 AllReg_32:$src0, VReg_32:$src1) ++>; ++defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", []>; ++defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>; ++defm V_MUL_LEGACY_F32 : VOP2_32 < ++ 0x00000007, "V_MUL_LEGACY_F32", ++ [(set VReg_32:$dst, (int_AMDGPU_mul AllReg_32:$src0, VReg_32:$src1))] ++>; ++ ++defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", ++ [(set VReg_32:$dst, (fmul AllReg_32:$src0, VReg_32:$src1))] ++>; ++//defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", []>; ++//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>; ++//defm V_MUL_U32_U24 : 
VOP2_32 <0x0000000b, "V_MUL_U32_U24", []>; ++//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>; ++defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32", ++ [(set VReg_32:$dst, (AMDGPUfmin AllReg_32:$src0, VReg_32:$src1))] ++>; ++ ++defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32", ++ [(set VReg_32:$dst, (AMDGPUfmax AllReg_32:$src0, VReg_32:$src1))] ++>; ++defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>; ++defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>; ++defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>; ++defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>; ++defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>; ++defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>; ++defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>; ++defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", []>; ++defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>; ++defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", []>; ++defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>; ++defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", []>; ++defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", ++ [(set VReg_32:$dst, (and AllReg_32:$src0, VReg_32:$src1))] ++>; ++defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", ++ [(set VReg_32:$dst, (or AllReg_32:$src0, VReg_32:$src1))] ++>; ++defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", ++ [(set VReg_32:$dst, (xor AllReg_32:$src0, VReg_32:$src1))] ++>; ++defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", []>; ++defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>; ++defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>; ++defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>; ++//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>; ++//defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>; ++//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>; ++let Defs = [VCC] in { // Carry-out goes to VCC 
++defm V_ADD_I32 : VOP2_32 <0x00000025, "V_ADD_I32", ++ [(set VReg_32:$dst, (add (i32 AllReg_32:$src0), (i32 VReg_32:$src1)))] ++>; ++defm V_SUB_I32 : VOP2_32 <0x00000026, "V_SUB_I32", ++ [(set VReg_32:$dst, (sub (i32 AllReg_32:$src0), (i32 VReg_32:$src1)))] ++>; ++} // End Defs = [VCC] ++defm V_SUBREV_I32 : VOP2_32 <0x00000027, "V_SUBREV_I32", []>; ++defm V_ADDC_U32 : VOP2_32 <0x00000028, "V_ADDC_U32", []>; ++defm V_SUBB_U32 : VOP2_32 <0x00000029, "V_SUBB_U32", []>; ++defm V_SUBBREV_U32 : VOP2_32 <0x0000002a, "V_SUBBREV_U32", []>; ++defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>; ++////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "V_CVT_PKACCUM_U8_F32", []>; ++////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>; ++////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>; ++defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32", ++ [(set VReg_32:$dst, (int_SI_packf16 AllReg_32:$src0, VReg_32:$src1))] ++>; ++////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>; ++////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>; ++def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32", []>; ++def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32", []>; ++def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32", []>; ++def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32", []>; ++def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32", []>; ++def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32", []>; ++def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32", []>; ++def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32", []>; ++def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32", []>; ++def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32", []>; ++def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32", []>; ++def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32", []>; ++////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>; ++////def 
S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>; ++////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>; ++////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>; ++//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>; ++ ++let neverHasSideEffects = 1 in { ++ ++def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>; ++def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>; ++//def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", []>; ++//def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", []>; ++ ++} // End neverHasSideEffects ++def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>; ++def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>; ++def V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>; ++def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>; ++def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>; ++def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>; ++def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>; ++def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>; ++def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>; ++//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>; ++def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>; ++def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>; ++def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; ++////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>; ++////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>; ++////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>; ++////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>; ++////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>; ++////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>; ++////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>; ++////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>; ++////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>; ++//def 
V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>; ++//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>; ++//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>; ++def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; ++////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>; ++def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>; ++def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>; ++def V_LSHL_B64 : VOP3_64 <0x00000161, "V_LSHL_B64", []>; ++def V_LSHR_B64 : VOP3_64 <0x00000162, "V_LSHR_B64", []>; ++def V_ASHR_I64 : VOP3_64 <0x00000163, "V_ASHR_I64", []>; ++def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>; ++def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>; ++def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>; ++def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>; ++def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>; ++def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>; ++def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>; ++def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>; ++def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; ++def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; ++def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; ++def V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>; ++def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>; ++//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>; ++//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>; ++//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>; ++def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>; ++def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>; ++def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>; ++def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", []>; ++def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", []>; ++def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", []>; ++def S_SUBB_U32 : SOP2_32 
<0x00000005, "S_SUBB_U32", []>; ++def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", []>; ++def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", []>; ++def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", []>; ++def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", []>; ++ ++def S_CSELECT_B32 : SOP2 < ++ 0x0000000a, (outs SReg_32:$dst), ++ (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32", ++ [(set (i32 SReg_32:$dst), (select SCCReg:$scc, SReg_32:$src0, SReg_32:$src1))] ++>; ++ ++def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>; ++ ++// f32 pattern for S_CSELECT_B32 ++def : Pat < ++ (f32 (select SCCReg:$scc, SReg_32:$src0, SReg_32:$src1)), ++ (S_CSELECT_B32 SReg_32:$src0, SReg_32:$src1, SCCReg:$scc) ++>; ++ ++def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", []>; ++ ++def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", ++ [(set SReg_64:$dst, (and SReg_64:$src0, SReg_64:$src1))] ++>; ++def S_AND_VCC : SOP2_VCC <0x0000000f, "S_AND_B64", ++ [(set SReg_1:$vcc, (SIvcc_and SReg_64:$src0, SReg_64:$src1))] ++>; ++def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", []>; ++def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", []>; ++def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>; ++def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>; ++def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>; ++def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>; ++def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>; ++def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>; ++def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>; ++def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>; ++def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>; ++def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>; ++def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>; ++def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>; ++def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", []>; ++def S_LSHL_B64 : SOP2_64 <0x0000001f, "S_LSHL_B64", []>; ++def S_LSHR_B32 : SOP2_32 
<0x00000020, "S_LSHR_B32", []>; ++def S_LSHR_B64 : SOP2_64 <0x00000021, "S_LSHR_B64", []>; ++def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", []>; ++def S_ASHR_I64 : SOP2_64 <0x00000023, "S_ASHR_I64", []>; ++def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>; ++def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>; ++def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>; ++def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>; ++def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>; ++def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>; ++def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>; ++//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>; ++def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>; ++// Move an immediate (operand immType, DAG node immNode) into a VReg_32. ++class V_MOV_IMM <Operand immType, SDNode immNode> : InstSI < ++ (outs VReg_32:$dst), ++ (ins immType:$src0), ++ "V_MOV_IMM", ++ [(set VReg_32:$dst, (immNode:$src0))] ++>; ++ ++let isCodeGenOnly = 1, isPseudo = 1 in { ++ ++def V_MOV_IMM_I32 : V_MOV_IMM <i32imm, imm>; ++def V_MOV_IMM_F32 : V_MOV_IMM <f32imm, fpimm>; ++ ++def S_MOV_IMM_I32 : InstSI < ++ (outs SReg_32:$dst), ++ (ins i32imm:$src0), ++ "S_MOV_IMM_I32", ++ [(set SReg_32:$dst, (imm:$src0))] ++>; ++ ++// i64 immediates aren't really supported in hardware, but LLVM will use the i64 ++// type for indices on load and store instructions. The pattern for ++// S_MOV_IMM_I64 will only match i64 immediates that can fit into 32-bits, ++// which the hardware can handle. 
++def S_MOV_IMM_I64 : InstSI < ++ (outs SReg_64:$dst), ++ (ins i64imm:$src0), ++ "S_MOV_IMM_I64 $dst, $src0", ++ [(set SReg_64:$dst, (IMM32bitIn64bit:$src0))] ++>; ++ ++} // End isCodeGenOnly, isPseudo = 1 ++// 32-bit literal word: the entire encoding is the immediate operand. ++class SI_LOAD_LITERAL <Operand ImmType> : ++ Enc32 <(outs), (ins ImmType:$imm), "LOAD_LITERAL $imm", []> { ++ ++ bits<32> imm; ++ let Inst{31-0} = imm; ++} ++ ++def SI_LOAD_LITERAL_I32 : SI_LOAD_LITERAL <i32imm>; ++def SI_LOAD_LITERAL_F32 : SI_LOAD_LITERAL <f32imm>; ++ ++let isCodeGenOnly = 1, isPseudo = 1 in { ++ ++def SET_M0 : InstSI < ++ (outs SReg_32:$dst), ++ (ins i32imm:$src0), ++ "SET_M0", ++ [(set SReg_32:$dst, (int_SI_set_M0 imm:$src0))] ++>; ++ ++def LOAD_CONST : AMDGPUShaderInst < ++ (outs GPRF32:$dst), ++ (ins i32imm:$src), ++ "LOAD_CONST $dst, $src", ++ [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))] ++>; ++ ++let usesCustomInserter = 1 in { ++ ++def SI_V_CNDLT : InstSI < ++ (outs VReg_32:$dst), ++ (ins VReg_32:$src0, VReg_32:$src1, VReg_32:$src2), ++ "SI_V_CNDLT $dst, $src0, $src1, $src2", ++ [(set VReg_32:$dst, (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2))] ++>; ++ ++def SI_INTERP : InstSI < ++ (outs VReg_32:$dst), ++ (ins VReg_32:$i, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, SReg_32:$params), ++ "SI_INTERP $dst, $i, $j, $attr_chan, $attr, $params", ++ [] ++>; ++ ++def SI_INTERP_CONST : InstSI < ++ (outs VReg_32:$dst), ++ (ins i32imm:$attr_chan, i32imm:$attr, SReg_32:$params), ++ "SI_INTERP_CONST $dst, $attr_chan, $attr, $params", ++ [(set VReg_32:$dst, (int_SI_fs_interp_constant imm:$attr_chan, ++ imm:$attr, SReg_32:$params))] ++>; ++ ++def SI_WQM : InstSI < ++ (outs), ++ (ins), ++ "SI_WQM", ++ [(int_SI_wqm)] ++>; ++ ++} // end usesCustomInserter ++ ++// SI Pseudo instructions. These are used by the CFG structurizer pass ++// and should be lowered to ISA instructions prior to codegen. 
++ ++let mayLoad = 1, mayStore = 1, hasSideEffects = 1, ++ Uses = [EXEC], Defs = [EXEC] in { ++ ++let isBranch = 1, isTerminator = 1 in { ++ ++def SI_IF : InstSI < ++ (outs SReg_64:$dst), ++ (ins SReg_1:$vcc, brtarget:$target), ++ "SI_IF", ++ [(set SReg_64:$dst, (int_SI_if SReg_1:$vcc, bb:$target))] ++>; ++ ++def SI_ELSE : InstSI < ++ (outs SReg_64:$dst), ++ (ins SReg_64:$src, brtarget:$target), ++ "SI_ELSE", ++ [(set SReg_64:$dst, (int_SI_else SReg_64:$src, bb:$target))]> { ++ ++ let Constraints = "$src = $dst"; ++} ++ ++def SI_LOOP : InstSI < ++ (outs), ++ (ins SReg_64:$saved, brtarget:$target), ++ "SI_LOOP", ++ [(int_SI_loop SReg_64:$saved, bb:$target)] ++>; ++ ++} // end isBranch = 1, isTerminator = 1 ++// Break pseudos; each one yields an SReg_64 result. ++def SI_BREAK : InstSI < ++ (outs SReg_64:$dst), ++ (ins SReg_64:$src), ++ "SI_BREAK", ++ [(set SReg_64:$dst, (int_SI_break SReg_64:$src))] ++>; ++ ++def SI_IF_BREAK : InstSI < ++ (outs SReg_64:$dst), ++ (ins SReg_1:$vcc, SReg_64:$src), ++ "SI_IF_BREAK", ++ [(set SReg_64:$dst, (int_SI_if_break SReg_1:$vcc, SReg_64:$src))] ++>; ++ ++def SI_ELSE_BREAK : InstSI < ++ (outs SReg_64:$dst), ++ (ins SReg_64:$src0, SReg_64:$src1), ++ "SI_ELSE_BREAK", ++ [(set SReg_64:$dst, (int_SI_else_break SReg_64:$src0, SReg_64:$src1))] ++>; ++ ++def SI_END_CF : InstSI < ++ (outs), ++ (ins SReg_64:$saved), ++ "SI_END_CF", ++ [(int_SI_end_cf SReg_64:$saved)] ++>; ++ ++def SI_KILL : InstSI < ++ (outs), ++ (ins VReg_32:$src), ++ "SI_KILL $src", ++ [(int_AMDGPU_kill VReg_32:$src)] ++>; ++ ++} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1 ++ // Uses = [EXEC], Defs = [EXEC] ++ ++} // end IsCodeGenOnly, isPseudo ++ ++def : Pat < ++ (int_AMDGPU_kilp), ++ (SI_KILL (V_MOV_IMM_I32 0xbf800000)) ++>; ++ ++/* int_SI_vs_load_input */ ++def : Pat< ++ (int_SI_vs_load_input SReg_128:$tlst, IMM12bit:$attr_offset, ++ VReg_32:$buf_idx_vgpr), ++ (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0, ++ VReg_32:$buf_idx_vgpr, SReg_128:$tlst, ++ 0, 0, (i32 SREG_LIT_0)) ++>; ++ ++/* int_SI_export */ 
++def : Pat < ++ (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, ++ VReg_32:$src0,VReg_32:$src1, VReg_32:$src2, VReg_32:$src3), ++ (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm, ++ VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3) ++>; ++ ++/* int_SI_sample */ ++def : Pat < ++ (int_SI_sample imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler, imm), ++ (IMAGE_SAMPLE imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord, ++ SReg_256:$rsrc, SReg_128:$sampler) ++>; ++ ++def : Pat < ++ (int_SI_sample imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler, TEX_RECT), ++ (IMAGE_SAMPLE imm:$writemask, 1, 0, 0, 0, 0, 0, 0, VReg_128:$coord, ++ SReg_256:$rsrc, SReg_128:$sampler) ++>; ++ ++/* int_SI_sample_lod */ ++def : Pat < ++ (int_SI_sample_lod imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler, imm), ++ (IMAGE_SAMPLE_L imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord, ++ SReg_256:$rsrc, SReg_128:$sampler) ++>; ++ ++/* int_SI_sample_bias */ ++def : Pat < ++ (int_SI_sample_bias imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler, imm), ++ (IMAGE_SAMPLE_B imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord, ++ SReg_256:$rsrc, SReg_128:$sampler) ++>; ++ ++def CLAMP_SI : CLAMP; ++def FABS_SI : FABS; ++def FNEG_SI : FNEG; ++ ++def : Extract_Element ; ++def : Extract_Element ; ++def : Extract_Element ; ++def : Extract_Element ; ++ ++def : Insert_Element ; ++def : Insert_Element ; ++def : Insert_Element ; ++def : Insert_Element ; ++ ++def : Vector_Build ; ++def : Vector_Build ; ++ ++def : BitConvert ; ++def : BitConvert ; ++ ++def : BitConvert ; ++def : BitConvert ; ++ ++def : Pat < ++ (i64 (SIsreg1_bitcast SReg_1:$vcc)), ++ (S_MOV_B64 (COPY_TO_REGCLASS SReg_1:$vcc, SReg_64)) ++>; ++ ++def : Pat < ++ (i1 (SIsreg1_bitcast SReg_64:$vcc)), ++ (COPY_TO_REGCLASS SReg_64:$vcc, SReg_1) ++>; ++ ++def : Pat < ++ (i64 (SIvcc_bitcast VCCReg:$vcc)), ++ (S_MOV_B64 (COPY_TO_REGCLASS VCCReg:$vcc, 
SReg_64)) ++>; ++ ++def : Pat < ++ (i1 (SIvcc_bitcast SReg_64:$vcc)), ++ (COPY_TO_REGCLASS SReg_64:$vcc, VCCReg) ++>; ++ ++/********** ====================== **********/ ++/********** Interpolation Patterns **********/ ++/********** ====================== **********/ ++ ++def : Pat < ++ (int_SI_fs_interp_linear_center imm:$attr_chan, imm:$attr, SReg_32:$params), ++ (SI_INTERP (f32 LINEAR_CENTER_I), (f32 LINEAR_CENTER_J), imm:$attr_chan, ++ imm:$attr, SReg_32:$params) ++>; ++ ++def : Pat < ++ (int_SI_fs_interp_linear_centroid imm:$attr_chan, imm:$attr, SReg_32:$params), ++ (SI_INTERP (f32 LINEAR_CENTROID_I), (f32 LINEAR_CENTROID_J), imm:$attr_chan, ++ imm:$attr, SReg_32:$params) ++>; ++ ++def : Pat < ++ (int_SI_fs_interp_persp_center imm:$attr_chan, imm:$attr, SReg_32:$params), ++ (SI_INTERP (f32 PERSP_CENTER_I), (f32 PERSP_CENTER_J), imm:$attr_chan, ++ imm:$attr, SReg_32:$params) ++>; ++ ++def : Pat < ++ (int_SI_fs_interp_persp_centroid imm:$attr_chan, imm:$attr, SReg_32:$params), ++ (SI_INTERP (f32 PERSP_CENTROID_I), (f32 PERSP_CENTROID_J), imm:$attr_chan, ++ imm:$attr, SReg_32:$params) ++>; ++ ++def : Pat < ++ (int_SI_fs_read_face), ++ (f32 FRONT_FACE) ++>; ++ ++def : Pat < ++ (int_SI_fs_read_pos 0), ++ (f32 POS_X_FLOAT) ++>; ++ ++def : Pat < ++ (int_SI_fs_read_pos 1), ++ (f32 POS_Y_FLOAT) ++>; ++ ++def : Pat < ++ (int_SI_fs_read_pos 2), ++ (f32 POS_Z_FLOAT) ++>; ++ ++def : Pat < ++ (int_SI_fs_read_pos 3), ++ (f32 POS_W_FLOAT) ++>; ++ ++/********** ================== **********/ ++/********** Intrinsic Patterns **********/ ++/********** ================== **********/ ++ ++/* llvm.AMDGPU.pow */ ++/* XXX: We are using IEEE MUL, not the 0 * anything = 0 MUL, is this correct? 
*/ ++def : POW_Common ; ++ ++def : Pat < ++ (int_AMDGPU_div AllReg_32:$src0, AllReg_32:$src1), ++ (V_MUL_LEGACY_F32_e32 AllReg_32:$src0, (V_RCP_LEGACY_F32_e32 AllReg_32:$src1)) ++>; ++ ++def : Pat< ++ (fdiv AllReg_32:$src0, AllReg_32:$src1), ++ (V_MUL_F32_e32 AllReg_32:$src0, (V_RCP_F32_e32 AllReg_32:$src1)) ++>; ++ ++def : Pat < ++ (int_AMDGPU_cube VReg_128:$src), ++ (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), ++ (V_CUBETC_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x), ++ (EXTRACT_SUBREG VReg_128:$src, sel_y), ++ (EXTRACT_SUBREG VReg_128:$src, sel_z), ++ 0, 0, 0, 0), sel_x), ++ (V_CUBESC_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x), ++ (EXTRACT_SUBREG VReg_128:$src, sel_y), ++ (EXTRACT_SUBREG VReg_128:$src, sel_z), ++ 0, 0, 0, 0), sel_y), ++ (V_CUBEMA_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x), ++ (EXTRACT_SUBREG VReg_128:$src, sel_y), ++ (EXTRACT_SUBREG VReg_128:$src, sel_z), ++ 0, 0, 0, 0), sel_z), ++ (V_CUBEID_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x), ++ (EXTRACT_SUBREG VReg_128:$src, sel_y), ++ (EXTRACT_SUBREG VReg_128:$src, sel_z), ++ 0, 0, 0, 0), sel_w) ++>; ++ ++/********** ================== **********/ ++/********** VOP3 Patterns **********/ ++/********** ================== **********/ ++ ++def : Pat <(f32 (IL_mad AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2)), ++ (V_MAD_LEGACY_F32 AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2, ++ 0, 0, 0, 0)>; ++ ++} // End isSI predicate +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIIntrinsics.td llvm-r600/lib/Target/R600/SIIntrinsics.td +--- llvm-3.2.src/lib/Target/R600/SIIntrinsics.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SIIntrinsics.td 2013-01-25 19:43:57.480049720 +0100 +@@ -0,0 +1,54 @@ ++//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++// ++// SI Intrinsic Definitions ++// ++//===----------------------------------------------------------------------===// ++ ++ ++let TargetPrefix = "SI", isTarget = 1 in { ++ ++ def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; ++ def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; ++ /* XXX: We may need a separate intrinsic here for loading integer values */ ++ def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_i64_ty, llvm_i32_ty], []>; ++ def int_SI_vs_load_buffer_index : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>; ++ def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v4i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem]> ; ++ def int_SI_wqm : Intrinsic <[], [], []>; ++ ++ class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_v4f32_ty, llvm_v8i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrReadMem]>; ++ ++ def int_SI_sample : Sample; ++ def int_SI_sample_bias : Sample; ++ def int_SI_sample_lod : Sample; ++ ++ /* Interpolation Intrinsics */ ++ ++ def int_SI_set_M0 : Intrinsic <[llvm_i32_ty], [llvm_i32_ty]>; ++ class Interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>; ++ ++ def int_SI_fs_interp_linear_center : Interp; ++ def int_SI_fs_interp_linear_centroid : Interp; ++ def int_SI_fs_interp_persp_center : Interp; ++ def int_SI_fs_interp_persp_centroid : Interp; ++ def int_SI_fs_interp_constant : Interp; ++ ++ def int_SI_fs_read_face : Intrinsic <[llvm_float_ty], [], [IntrNoMem]>; ++ def int_SI_fs_read_pos : Intrinsic <[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; ++ ++ /* Control flow Intrinsics */ ++ ++ def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>; ++ def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>; ++ def 
int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>; ++ def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>; ++ def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>; ++ def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>; ++ def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIISelLowering.cpp llvm-r600/lib/Target/R600/SIISelLowering.cpp +--- llvm-3.2.src/lib/Target/R600/SIISelLowering.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SIISelLowering.cpp 2013-01-25 19:43:57.470049720 +0100 +@@ -0,0 +1,486 @@ ++//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Custom DAG lowering for SI ++// ++//===----------------------------------------------------------------------===// ++ ++#include "SIISelLowering.h" ++#include "AMDIL.h" ++#include "AMDILIntrinsicInfo.h" ++#include "SIInstrInfo.h" ++#include "SIMachineFunctionInfo.h" ++#include "SIRegisterInfo.h" ++#include "llvm/CodeGen/MachineInstrBuilder.h" ++#include "llvm/CodeGen/MachineRegisterInfo.h" ++#include "llvm/CodeGen/SelectionDAG.h" ++ ++using namespace llvm; ++/// \brief Register the SI register classes for each supported value type. ++SITargetLowering::SITargetLowering(TargetMachine &TM) : ++ AMDGPUTargetLowering(TM), ++ TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())) { ++ addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); ++ addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); ++ addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass); ++ addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); ++ addRegisterClass(MVT::i1, &AMDGPU::SCCRegRegClass); ++ addRegisterClass(MVT::i1, &AMDGPU::VCCRegRegClass); ++ ++ addRegisterClass(MVT::v4i32, 
&AMDGPU::SReg_128RegClass); ++ addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); ++ ++ computeRegisterProperties(); ++ ++ setOperationAction(ISD::AND, MVT::i1, Custom); ++ ++ setOperationAction(ISD::ADD, MVT::i64, Legal); ++ setOperationAction(ISD::ADD, MVT::i32, Legal); ++ ++ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); ++ ++ // We need to custom lower loads from the USER_SGPR address space, so we can ++ // add the SGPRs as livein registers. ++ setOperationAction(ISD::LOAD, MVT::i32, Custom); ++ setOperationAction(ISD::LOAD, MVT::i64, Custom); ++ ++ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); ++ setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); ++ ++ setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); ++ setTargetDAGCombine(ISD::SELECT_CC); ++ ++ setTargetDAGCombine(ISD::SETCC); ++} ++ ++MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( ++ MachineInstr * MI, MachineBasicBlock * BB) const { ++ const TargetInstrInfo * TII = getTargetMachine().getInstrInfo(); ++ MachineRegisterInfo & MRI = BB->getParent()->getRegInfo(); ++ MachineBasicBlock::iterator I = MI; ++ ++ switch (MI->getOpcode()) { ++ default: ++ return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); ++ case AMDGPU::BRANCH: return BB; ++ case AMDGPU::CLAMP_SI: ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) ++ .addOperand(MI->getOperand(0)) ++ .addOperand(MI->getOperand(1)) ++ // VSRC1-2 are unused, but we still need to fill all the ++ // operand slots, so we just reuse the VSRC0 operand ++ .addOperand(MI->getOperand(1)) ++ .addOperand(MI->getOperand(1)) ++ .addImm(0) // ABS ++ .addImm(1) // CLAMP ++ .addImm(0) // OMOD ++ .addImm(0); // NEG ++ MI->eraseFromParent(); ++ break; ++ ++ case AMDGPU::FABS_SI: ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) ++ .addOperand(MI->getOperand(0)) ++ .addOperand(MI->getOperand(1)) ++ // VSRC1-2 are unused, but we still need to fill all the ++ // 
operand slots, so we just reuse the VSRC0 operand ++ .addOperand(MI->getOperand(1)) ++ .addOperand(MI->getOperand(1)) ++ .addImm(1) // ABS ++ .addImm(0) // CLAMP ++ .addImm(0) // OMOD ++ .addImm(0); // NEG ++ MI->eraseFromParent(); ++ break; ++ ++ case AMDGPU::FNEG_SI: ++ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) ++ .addOperand(MI->getOperand(0)) ++ .addOperand(MI->getOperand(1)) ++ // VSRC1-2 are unused, but we still need to fill all the ++ // operand slots, so we just reuse the VSRC0 operand ++ .addOperand(MI->getOperand(1)) ++ .addOperand(MI->getOperand(1)) ++ .addImm(0) // ABS ++ .addImm(0) // CLAMP ++ .addImm(0) // OMOD ++ .addImm(1); // NEG ++ MI->eraseFromParent(); ++ break; ++ case AMDGPU::SHADER_TYPE: // Record the shader type, then drop the pseudo ++ BB->getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType = ++ MI->getOperand(0).getImm(); ++ MI->eraseFromParent(); ++ break; ++ ++ case AMDGPU::SI_INTERP: ++ LowerSI_INTERP(MI, *BB, I, MRI); ++ break; ++ case AMDGPU::SI_INTERP_CONST: ++ LowerSI_INTERP_CONST(MI, *BB, I, MRI); ++ break; ++ case AMDGPU::SI_WQM: ++ LowerSI_WQM(MI, *BB, I, MRI); ++ break; ++ case AMDGPU::SI_V_CNDLT: ++ LowerSI_V_CNDLT(MI, *BB, I, MRI); ++ break; ++ } ++ return BB; ++} ++ ++void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB, ++ MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { ++ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC) ++ .addReg(AMDGPU::EXEC); ++ ++ MI->eraseFromParent(); ++} ++ ++void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB, ++ MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { ++ unsigned tmp = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); ++ unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass); ++ MachineOperand dst = MI->getOperand(0); ++ MachineOperand iReg = MI->getOperand(1); ++ MachineOperand jReg = MI->getOperand(2); ++ MachineOperand attr_chan = MI->getOperand(3); ++ MachineOperand attr = MI->getOperand(4); ++ 
MachineOperand params = MI->getOperand(5); ++ ++ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0) ++ .addOperand(params); ++ ++ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P1_F32), tmp) ++ .addOperand(iReg) ++ .addOperand(attr_chan) ++ .addOperand(attr) ++ .addReg(M0); ++ ++ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P2_F32)) ++ .addOperand(dst) ++ .addReg(tmp) ++ .addOperand(jReg) ++ .addOperand(attr_chan) ++ .addOperand(attr) ++ .addReg(M0); ++ ++ MI->eraseFromParent(); ++} ++ ++void SITargetLowering::LowerSI_INTERP_CONST(MachineInstr *MI, ++ MachineBasicBlock &BB, MachineBasicBlock::iterator I, ++ MachineRegisterInfo &MRI) const { ++ MachineOperand dst = MI->getOperand(0); ++ MachineOperand attr_chan = MI->getOperand(1); ++ MachineOperand attr = MI->getOperand(2); ++ MachineOperand params = MI->getOperand(3); ++ unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass); ++ ++ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0) ++ .addOperand(params); ++ ++ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_MOV_F32)) ++ .addOperand(dst) ++ .addOperand(attr_chan) ++ .addOperand(attr) ++ .addReg(M0); ++ ++ MI->eraseFromParent(); ++} ++ ++void SITargetLowering::LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB, ++ MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { ++ unsigned VCC = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); ++ ++ BuildMI(BB, I, BB.findDebugLoc(I), ++ TII->get(AMDGPU::V_CMP_GT_F32_e32), ++ VCC) ++ .addReg(AMDGPU::SREG_LIT_0) ++ .addOperand(MI->getOperand(1)); ++ ++ BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CNDMASK_B32_e32)) ++ .addOperand(MI->getOperand(0)) ++ .addOperand(MI->getOperand(3)) ++ .addOperand(MI->getOperand(2)) ++ .addReg(VCC); ++ ++ MI->eraseFromParent(); ++} ++ ++EVT SITargetLowering::getSetCCResultType(EVT VT) const { ++ return MVT::i1; ++} ++ 
++//===----------------------------------------------------------------------===// ++// Custom DAG Lowering Operations ++//===----------------------------------------------------------------------===// ++ ++SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { ++ switch (Op.getOpcode()) { ++ default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); ++ case ISD::BRCOND: return LowerBRCOND(Op, DAG); ++ case ISD::LOAD: return LowerLOAD(Op, DAG); ++ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); ++ case ISD::AND: return Loweri1ContextSwitch(Op, DAG, ISD::AND); ++ case ISD::INTRINSIC_WO_CHAIN: { ++ unsigned IntrinsicID = ++ cast(Op.getOperand(0))->getZExtValue(); ++ EVT VT = Op.getValueType(); ++ switch (IntrinsicID) { ++ case AMDGPUIntrinsic::SI_vs_load_buffer_index: ++ return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, ++ AMDGPU::VGPR0, VT); ++ default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); ++ } ++ break; ++ } ++ } ++ return SDValue(); ++} ++ ++/// \brief The function is for lowering i1 operations on the ++/// VCC register. ++/// ++/// In the VALU context, VCC is a one bit register, but in the ++/// SALU context the VCC is a 64-bit register (1-bit per thread). Since only ++/// the SALU can perform operations on the VCC register, we need to promote ++/// the operand types from i1 to i64 in order for tablegen to be able to match ++/// this operation to the correct SALU instruction. We do this promotion by ++/// wrapping the operands in a CopyToReg node. 
++/// ++SDValue SITargetLowering::Loweri1ContextSwitch(SDValue Op, ++ SelectionDAG &DAG, ++ unsigned VCCNode) const { ++ DebugLoc DL = Op.getDebugLoc(); ++ ++ SDValue OpNode = DAG.getNode(VCCNode, DL, MVT::i64, ++ DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64, ++ Op.getOperand(0)), ++ DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64, ++ Op.getOperand(1))); ++ ++ return DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i1, OpNode); ++} ++ ++/// \brief Helper function for LowerBRCOND ++static SDNode *findUser(SDValue Value, unsigned Opcode) { ++ ++ SDNode *Parent = Value.getNode(); ++ for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); ++ I != E; ++I) { ++ ++ if (I.getUse().get() != Value) ++ continue; ++ ++ if (I->getOpcode() == Opcode) ++ return *I; ++ } ++ return 0; ++} ++ ++/// This transforms the control flow intrinsics to get the branch destination as ++/// last parameter, also switches branch target with BR if the need arise ++SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, ++ SelectionDAG &DAG) const { ++ ++ DebugLoc DL = BRCOND.getDebugLoc(); ++ ++ SDNode *Intr = BRCOND.getOperand(1).getNode(); ++ SDValue Target = BRCOND.getOperand(2); ++ SDNode *BR = 0; ++ ++ if (Intr->getOpcode() == ISD::SETCC) { ++ // As long as we negate the condition everything is fine ++ SDNode *SetCC = Intr; ++ assert(SetCC->getConstantOperandVal(1) == 1); ++ ++ CondCodeSDNode *CC = cast(SetCC->getOperand(2).getNode()); ++ assert(CC->get() == ISD::SETNE); ++ Intr = SetCC->getOperand(0).getNode(); ++ ++ } else { ++ // Get the target from BR if we don't negate the condition ++ BR = findUser(BRCOND, ISD::BR); ++ Target = BR->getOperand(1); ++ } ++ ++ assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); ++ ++ // Build the result and ++ SmallVector Res; ++ for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i) ++ Res.push_back(Intr->getValueType(i)); ++ ++ // operands of the new intrinsic call ++ SmallVector Ops; ++ Ops.push_back(BRCOND.getOperand(0)); ++ for (unsigned i 
= 1, e = Intr->getNumOperands(); i != e; ++i) ++ Ops.push_back(Intr->getOperand(i)); ++ Ops.push_back(Target); ++ ++ // build the new intrinsic call ++ SDNode *Result = DAG.getNode( ++ Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, ++ DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode(); ++ ++ if (BR) { ++ // Give the branch instruction our target ++ SDValue Ops[] = { ++ BR->getOperand(0), ++ BRCOND.getOperand(2) ++ }; ++ DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2); ++ } ++ ++ SDValue Chain = SDValue(Result, Result->getNumValues() - 1); ++ ++ // Copy the intrinsic results to registers ++ for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { ++ SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); ++ if (!CopyToReg) ++ continue; ++ ++ Chain = DAG.getCopyToReg( ++ Chain, DL, ++ CopyToReg->getOperand(1), ++ SDValue(Result, i - 1), ++ SDValue()); ++ ++ DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); ++ } ++ ++ // Remove the old intrinsic from the chain ++ DAG.ReplaceAllUsesOfValueWith( ++ SDValue(Intr, Intr->getNumValues() - 1), ++ Intr->getOperand(0)); ++ ++ return Chain; ++} ++ ++SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { ++ EVT VT = Op.getValueType(); ++ LoadSDNode *Ptr = dyn_cast(Op); ++ ++ assert(Ptr); ++ ++ unsigned AddrSpace = Ptr->getPointerInfo().getAddrSpace(); ++ ++ // We only need to lower USER_SGPR address space loads ++ if (AddrSpace != AMDGPUAS::USER_SGPR_ADDRESS) { ++ return SDValue(); ++ } ++ ++ // Loads from the USER_SGPR address space can only have constant value ++ // pointers. 
++ ConstantSDNode *BasePtr = dyn_cast(Ptr->getBasePtr()); ++ assert(BasePtr); ++ ++ unsigned TypeDwordWidth = VT.getSizeInBits() / 32; ++ const TargetRegisterClass * dstClass; ++ switch (TypeDwordWidth) { ++ default: ++ assert(!"USER_SGPR value size not implemented"); ++ return SDValue(); ++ case 1: ++ dstClass = &AMDGPU::SReg_32RegClass; ++ break; ++ case 2: ++ dstClass = &AMDGPU::SReg_64RegClass; ++ break; ++ } ++ uint64_t Index = BasePtr->getZExtValue(); ++ assert(Index % TypeDwordWidth == 0 && "USER_SGPR not properly aligned"); ++ unsigned SGPRIndex = Index / TypeDwordWidth; ++ unsigned Reg = dstClass->getRegister(SGPRIndex); ++ ++ DAG.ReplaceAllUsesOfValueWith(Op, CreateLiveInRegister(DAG, dstClass, Reg, ++ VT)); ++ return SDValue(); ++} ++ ++SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { ++ SDValue LHS = Op.getOperand(0); ++ SDValue RHS = Op.getOperand(1); ++ SDValue True = Op.getOperand(2); ++ SDValue False = Op.getOperand(3); ++ SDValue CC = Op.getOperand(4); ++ EVT VT = Op.getValueType(); ++ DebugLoc DL = Op.getDebugLoc(); ++ ++ // Possible Min/Max pattern ++ SDValue MinMax = LowerMinMax(Op, DAG); ++ if (MinMax.getNode()) { ++ return MinMax; ++ } ++ ++ SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC); ++ return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False); ++} ++ ++//===----------------------------------------------------------------------===// ++// Custom DAG optimizations ++//===----------------------------------------------------------------------===// ++ ++SDValue SITargetLowering::PerformDAGCombine(SDNode *N, ++ DAGCombinerInfo &DCI) const { ++ SelectionDAG &DAG = DCI.DAG; ++ DebugLoc DL = N->getDebugLoc(); ++ EVT VT = N->getValueType(0); ++ ++ switch (N->getOpcode()) { ++ default: break; ++ case ISD::SELECT_CC: { ++ N->dump(); ++ ConstantSDNode *True, *False; ++ // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc) ++ if ((True = dyn_cast(N->getOperand(2))) ++ && (False = 
dyn_cast(N->getOperand(3))) ++ && True->isAllOnesValue() ++ && False->isNullValue() ++ && VT == MVT::i1) { ++ return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0), ++ N->getOperand(1), N->getOperand(4)); ++ ++ } ++ break; ++ } ++ case ISD::SETCC: { ++ SDValue Arg0 = N->getOperand(0); ++ SDValue Arg1 = N->getOperand(1); ++ SDValue CC = N->getOperand(2); ++ ConstantSDNode * C = NULL; ++ ISD::CondCode CCOp = dyn_cast(CC)->get(); ++ ++ // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne) ++ if (VT == MVT::i1 ++ && Arg0.getOpcode() == ISD::SIGN_EXTEND ++ && Arg0.getOperand(0).getValueType() == MVT::i1 ++ && (C = dyn_cast(Arg1)) ++ && C->isNullValue() ++ && CCOp == ISD::SETNE) { ++ return SimplifySetCC(VT, Arg0.getOperand(0), ++ DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL); ++ } ++ break; ++ } ++ } ++ return SDValue(); ++} ++ ++#define NODE_NAME_CASE(node) case SIISD::node: return #node; ++ ++const char* SITargetLowering::getTargetNodeName(unsigned Opcode) const { ++ switch (Opcode) { ++ default: return AMDGPUTargetLowering::getTargetNodeName(Opcode); ++ NODE_NAME_CASE(VCC_AND) ++ NODE_NAME_CASE(VCC_BITCAST) ++ } ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIISelLowering.h llvm-r600/lib/Target/R600/SIISelLowering.h +--- llvm-3.2.src/lib/Target/R600/SIISelLowering.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SIISelLowering.h 2013-01-25 19:43:57.473383054 +0100 +@@ -0,0 +1,55 @@ ++//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief SI DAG Lowering interface definition ++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef SIISELLOWERING_H ++#define SIISELLOWERING_H ++ ++#include "AMDGPUISelLowering.h" ++#include "SIInstrInfo.h" ++ ++namespace llvm { ++ ++class SITargetLowering : public AMDGPUTargetLowering { ++ const SIInstrInfo * TII; ++ ++ void LowerMOV_IMM(MachineInstr *MI, MachineBasicBlock &BB, ++ MachineBasicBlock::iterator I, unsigned Opocde) const; ++ void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB, ++ MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; ++ void LowerSI_INTERP_CONST(MachineInstr *MI, MachineBasicBlock &BB, ++ MachineBasicBlock::iterator I, MachineRegisterInfo &MRI) const; ++ void LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB, ++ MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; ++ void LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB, ++ MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; ++ ++ SDValue Loweri1ContextSwitch(SDValue Op, SelectionDAG &DAG, ++ unsigned VCCNode) const; ++ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; ++ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; ++ ++public: ++ SITargetLowering(TargetMachine &tm); ++ virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, ++ MachineBasicBlock * BB) const; ++ virtual EVT getSetCCResultType(EVT VT) const; ++ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; ++ virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; ++ virtual const char* getTargetNodeName(unsigned Opcode) const; ++}; ++ ++} // End namespace llvm ++ ++#endif //SIISELLOWERING_H +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SILowerControlFlow.cpp 
llvm-r600/lib/Target/R600/SILowerControlFlow.cpp +--- llvm-3.2.src/lib/Target/R600/SILowerControlFlow.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SILowerControlFlow.cpp 2013-01-25 19:43:57.480049720 +0100 +@@ -0,0 +1,372 @@ ++//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief This pass lowers the pseudo control flow instructions to real ++/// machine instructions. ++/// ++/// All control flow is handled using predicated instructions and ++/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector ++/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs ++/// by writting to the 64-bit EXEC register (each bit corresponds to a ++/// single vector ALU). Typically, for predicates, a vector ALU will write ++/// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each ++/// Vector ALU) and then the ScalarALU will AND the VCC register with the ++/// EXEC to update the predicates. ++/// ++/// For example: ++/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2 ++/// %SGPR0 = SI_IF %VCC ++/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 ++/// %SGPR0 = SI_ELSE %SGPR0 ++/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 ++/// SI_END_CF %SGPR0 ++/// ++/// becomes: ++/// ++/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask ++/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask ++/// S_CBRANCH_EXECZ label0 // This instruction is an optional ++/// // optimization which allows us to ++/// // branch if all the bits of ++/// // EXEC are zero. 
++/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch ++/// ++/// label0: ++/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC // Restore the exec mask for the Then block ++/// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask ++/// S_BRANCH_EXECZ label1 // Use our branch optimization ++/// // instruction again. ++/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block ++/// label1: ++/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPU.h" ++#include "SIInstrInfo.h" ++#include "SIMachineFunctionInfo.h" ++#include "llvm/CodeGen/MachineFunction.h" ++#include "llvm/CodeGen/MachineFunctionPass.h" ++#include "llvm/CodeGen/MachineInstrBuilder.h" ++#include "llvm/CodeGen/MachineRegisterInfo.h" ++ ++using namespace llvm; ++ ++namespace { ++ ++class SILowerControlFlowPass : public MachineFunctionPass { ++ ++private: ++ static const unsigned SkipThreshold = 12; ++ ++ static char ID; ++ const TargetInstrInfo *TII; ++ ++ bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); ++ ++ void Skip(MachineInstr &From, MachineOperand &To); ++ void SkipIfDead(MachineInstr &MI); ++ ++ void If(MachineInstr &MI); ++ void Else(MachineInstr &MI); ++ void Break(MachineInstr &MI); ++ void IfBreak(MachineInstr &MI); ++ void ElseBreak(MachineInstr &MI); ++ void Loop(MachineInstr &MI); ++ void EndCf(MachineInstr &MI); ++ ++ void Kill(MachineInstr &MI); ++ void Branch(MachineInstr &MI); ++ ++public: ++ SILowerControlFlowPass(TargetMachine &tm) : ++ MachineFunctionPass(ID), TII(tm.getInstrInfo()) { } ++ ++ virtual bool runOnMachineFunction(MachineFunction &MF); ++ ++ const char *getPassName() const { ++ return "SI Lower control flow instructions"; ++ } ++ ++}; ++ ++} // End anonymous namespace ++ ++char SILowerControlFlowPass::ID = 0; ++ ++FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) { ++ return new 
SILowerControlFlowPass(tm); ++} ++ ++bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From, ++ MachineBasicBlock *To) { ++ ++ unsigned NumInstr = 0; ++ ++ for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty(); ++ MBB = *MBB->succ_begin()) { ++ ++ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); ++ NumInstr < SkipThreshold && I != E; ++I) { ++ ++ if (I->isBundle() || !I->isBundled()) ++ if (++NumInstr >= SkipThreshold) ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { ++ ++ if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB())) ++ return; ++ ++ DebugLoc DL = From.getDebugLoc(); ++ BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) ++ .addOperand(To) ++ .addReg(AMDGPU::EXEC); ++} ++ ++void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { ++ ++ MachineBasicBlock &MBB = *MI.getParent(); ++ DebugLoc DL = MI.getDebugLoc(); ++ ++ if (!shouldSkip(&MBB, &MBB.getParent()->back())) ++ return; ++ ++ MachineBasicBlock::iterator Insert = &MI; ++ ++Insert; ++ ++ // If the exec mask is non-zero, skip the next two instructions ++ BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) ++ .addImm(3) ++ .addReg(AMDGPU::EXEC); ++ ++ // Exec mask is zero: Export to NULL target... ++ BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) ++ .addImm(0) ++ .addImm(0x09) // V_008DFC_SQ_EXP_NULL ++ .addImm(0) ++ .addImm(1) ++ .addImm(1) ++ .addReg(AMDGPU::SREG_LIT_0) ++ .addReg(AMDGPU::SREG_LIT_0) ++ .addReg(AMDGPU::SREG_LIT_0) ++ .addReg(AMDGPU::SREG_LIT_0); ++ ++ // ... 
and terminate wavefront ++ BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); ++} ++ ++void SILowerControlFlowPass::If(MachineInstr &MI) { ++ MachineBasicBlock &MBB = *MI.getParent(); ++ DebugLoc DL = MI.getDebugLoc(); ++ unsigned Reg = MI.getOperand(0).getReg(); ++ unsigned Vcc = MI.getOperand(1).getReg(); ++ ++ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg) ++ .addReg(Vcc); ++ ++ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg) ++ .addReg(AMDGPU::EXEC) ++ .addReg(Reg); ++ ++ Skip(MI, MI.getOperand(2)); ++ ++ MI.eraseFromParent(); ++} ++ ++void SILowerControlFlowPass::Else(MachineInstr &MI) { ++ MachineBasicBlock &MBB = *MI.getParent(); ++ DebugLoc DL = MI.getDebugLoc(); ++ unsigned Dst = MI.getOperand(0).getReg(); ++ unsigned Src = MI.getOperand(1).getReg(); ++ ++ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) ++ .addReg(Src); // Saved EXEC ++ ++ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) ++ .addReg(AMDGPU::EXEC) ++ .addReg(Dst); ++ ++ Skip(MI, MI.getOperand(2)); ++ ++ MI.eraseFromParent(); ++} ++ ++void SILowerControlFlowPass::Break(MachineInstr &MI) { ++ MachineBasicBlock &MBB = *MI.getParent(); ++ DebugLoc DL = MI.getDebugLoc(); ++ ++ unsigned Dst = MI.getOperand(0).getReg(); ++ unsigned Src = MI.getOperand(1).getReg(); ++ ++ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) ++ .addReg(AMDGPU::EXEC) ++ .addReg(Src); ++ ++ MI.eraseFromParent(); ++} ++ ++void SILowerControlFlowPass::IfBreak(MachineInstr &MI) { ++ MachineBasicBlock &MBB = *MI.getParent(); ++ DebugLoc DL = MI.getDebugLoc(); ++ ++ unsigned Dst = MI.getOperand(0).getReg(); ++ unsigned Vcc = MI.getOperand(1).getReg(); ++ unsigned Src = MI.getOperand(2).getReg(); ++ ++ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) ++ .addReg(Vcc) ++ .addReg(Src); ++ ++ MI.eraseFromParent(); ++} ++ ++void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) { ++ MachineBasicBlock &MBB = *MI.getParent(); ++ DebugLoc DL = MI.getDebugLoc(); ++ 
++ unsigned Dst = MI.getOperand(0).getReg(); ++ unsigned Saved = MI.getOperand(1).getReg(); ++ unsigned Src = MI.getOperand(2).getReg(); ++ ++ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) ++ .addReg(Saved) ++ .addReg(Src); ++ ++ MI.eraseFromParent(); ++} ++ ++void SILowerControlFlowPass::Loop(MachineInstr &MI) { ++ MachineBasicBlock &MBB = *MI.getParent(); ++ DebugLoc DL = MI.getDebugLoc(); ++ unsigned Src = MI.getOperand(0).getReg(); ++ ++ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC) ++ .addReg(AMDGPU::EXEC) ++ .addReg(Src); ++ ++ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) ++ .addOperand(MI.getOperand(1)) ++ .addReg(AMDGPU::EXEC); ++ ++ MI.eraseFromParent(); ++} ++ ++void SILowerControlFlowPass::EndCf(MachineInstr &MI) { ++ MachineBasicBlock &MBB = *MI.getParent(); ++ DebugLoc DL = MI.getDebugLoc(); ++ unsigned Reg = MI.getOperand(0).getReg(); ++ ++ BuildMI(MBB, MBB.getFirstNonPHI(), DL, ++ TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) ++ .addReg(AMDGPU::EXEC) ++ .addReg(Reg); ++ ++ MI.eraseFromParent(); ++} ++ ++void SILowerControlFlowPass::Branch(MachineInstr &MI) { ++ MachineBasicBlock *Next = MI.getParent()->getNextNode(); ++ MachineBasicBlock *Target = MI.getOperand(0).getMBB(); ++ if (Target == Next) ++ MI.eraseFromParent(); ++ else ++ assert(0); ++} ++ ++void SILowerControlFlowPass::Kill(MachineInstr &MI) { ++ ++ MachineBasicBlock &MBB = *MI.getParent(); ++ DebugLoc DL = MI.getDebugLoc(); ++ ++ // Kill is only allowed in pixel shaders ++ MachineFunction &MF = *MBB.getParent(); ++ SIMachineFunctionInfo *Info = MF.getInfo(); ++ assert(Info->ShaderType == ShaderType::PIXEL); ++ ++ // Clear this pixel from the exec mask if the operand is negative ++ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) ++ .addReg(AMDGPU::SREG_LIT_0) ++ .addOperand(MI.getOperand(0)); ++ ++ MI.eraseFromParent(); ++} ++ ++bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { ++ ++ bool HaveKill = 
false; ++ unsigned Depth = 0; ++ ++ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); ++ BI != BE; ++BI) { ++ ++ MachineBasicBlock &MBB = *BI; ++ for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); ++ I != MBB.end(); I = Next) { ++ ++ Next = llvm::next(I); ++ MachineInstr &MI = *I; ++ switch (MI.getOpcode()) { ++ default: break; ++ case AMDGPU::SI_IF: ++ ++Depth; ++ If(MI); ++ break; ++ ++ case AMDGPU::SI_ELSE: ++ Else(MI); ++ break; ++ ++ case AMDGPU::SI_BREAK: ++ Break(MI); ++ break; ++ ++ case AMDGPU::SI_IF_BREAK: ++ IfBreak(MI); ++ break; ++ ++ case AMDGPU::SI_ELSE_BREAK: ++ ElseBreak(MI); ++ break; ++ ++ case AMDGPU::SI_LOOP: ++ ++Depth; ++ Loop(MI); ++ break; ++ ++ case AMDGPU::SI_END_CF: ++ if (--Depth == 0 && HaveKill) { ++ SkipIfDead(MI); ++ HaveKill = false; ++ } ++ EndCf(MI); ++ break; ++ ++ case AMDGPU::SI_KILL: ++ if (Depth == 0) ++ SkipIfDead(MI); ++ else ++ HaveKill = true; ++ Kill(MI); ++ break; ++ ++ case AMDGPU::S_BRANCH: ++ Branch(MI); ++ break; ++ } ++ } ++ } ++ ++ return true; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SILowerLiteralConstants.cpp llvm-r600/lib/Target/R600/SILowerLiteralConstants.cpp +--- llvm-3.2.src/lib/Target/R600/SILowerLiteralConstants.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SILowerLiteralConstants.cpp 2013-01-25 19:43:57.480049720 +0100 +@@ -0,0 +1,108 @@ ++//===-- SILowerLiteralConstants.cpp - Lower intrs using literal constants--===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief This pass performs the following transformation on instructions with ++/// literal constants: ++/// ++/// %VGPR0 = V_MOV_IMM_I32 1 ++/// ++/// becomes: ++/// ++/// BUNDLE ++/// * %VGPR = V_MOV_B32_32 SI_LITERAL_CONSTANT ++/// * SI_LOAD_LITERAL 1 ++/// ++/// The resulting sequence matches exactly how the hardware handles immediate ++/// operands, so this transformation greatly simplifies the code generator. ++/// ++/// Only the *_MOV_IMM_* support immediate operands at the moment, but when ++/// support for immediate operands is added to other instructions, they ++/// will be lowered here as well. ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPU.h" ++#include "llvm/CodeGen/MachineFunction.h" ++#include "llvm/CodeGen/MachineFunctionPass.h" ++#include "llvm/CodeGen/MachineInstrBuilder.h" ++#include "llvm/CodeGen/MachineInstrBundle.h" ++ ++using namespace llvm; ++ ++namespace { ++ ++class SILowerLiteralConstantsPass : public MachineFunctionPass { ++ ++private: ++ static char ID; ++ const TargetInstrInfo *TII; ++ ++public: ++ SILowerLiteralConstantsPass(TargetMachine &tm) : ++ MachineFunctionPass(ID), TII(tm.getInstrInfo()) { } ++ ++ virtual bool runOnMachineFunction(MachineFunction &MF); ++ ++ const char *getPassName() const { ++ return "SI Lower literal constants pass"; ++ } ++}; ++ ++} // End anonymous namespace ++ ++char SILowerLiteralConstantsPass::ID = 0; ++ ++FunctionPass *llvm::createSILowerLiteralConstantsPass(TargetMachine &tm) { ++ return new SILowerLiteralConstantsPass(tm); ++} ++ ++bool SILowerLiteralConstantsPass::runOnMachineFunction(MachineFunction &MF) { ++ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); ++ BB != BB_E; ++BB) { ++ MachineBasicBlock &MBB = *BB; ++ for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); ++ I != MBB.end(); I = Next) { ++ 
Next = llvm::next(I); ++ MachineInstr &MI = *I; ++ switch (MI.getOpcode()) { ++ default: break; ++ case AMDGPU::S_MOV_IMM_I32: ++ case AMDGPU::S_MOV_IMM_I64: ++ case AMDGPU::V_MOV_IMM_F32: ++ case AMDGPU::V_MOV_IMM_I32: { ++ unsigned MovOpcode; ++ unsigned LoadLiteralOpcode; ++ MachineOperand LiteralOp = MI.getOperand(1); ++ if (AMDGPU::VReg_32RegClass.contains(MI.getOperand(0).getReg())) { ++ MovOpcode = AMDGPU::V_MOV_B32_e32; ++ } else { ++ MovOpcode = AMDGPU::S_MOV_B32; ++ } ++ if (LiteralOp.isImm()) { ++ LoadLiteralOpcode = AMDGPU::SI_LOAD_LITERAL_I32; ++ } else { ++ LoadLiteralOpcode = AMDGPU::SI_LOAD_LITERAL_F32; ++ } ++ MachineInstr *First = ++ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(MovOpcode), ++ MI.getOperand(0).getReg()) ++ .addReg(AMDGPU::SI_LITERAL_CONSTANT); ++ MachineInstr *Last = ++ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(LoadLiteralOpcode)) ++ .addOperand(MI.getOperand(1)); ++ Last->setIsInsideBundle(); ++ llvm::finalizeBundle(MBB, First, Last); ++ MI.eraseFromParent(); ++ break; ++ } ++ } ++ } ++ } ++ return false; ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIMachineFunctionInfo.cpp llvm-r600/lib/Target/R600/SIMachineFunctionInfo.cpp +--- llvm-3.2.src/lib/Target/R600/SIMachineFunctionInfo.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SIMachineFunctionInfo.cpp 2013-01-25 19:43:57.480049720 +0100 +@@ -0,0 +1,20 @@ ++//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++/// \file ++//===----------------------------------------------------------------------===// ++ ++ ++#include "SIMachineFunctionInfo.h" ++ ++using namespace llvm; ++ ++SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ++ : MachineFunctionInfo(), ++ SPIPSInputAddr(0), ++ ShaderType(0) ++ { } +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIMachineFunctionInfo.h llvm-r600/lib/Target/R600/SIMachineFunctionInfo.h +--- llvm-3.2.src/lib/Target/R600/SIMachineFunctionInfo.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SIMachineFunctionInfo.h 2013-01-25 19:43:57.480049720 +0100 +@@ -0,0 +1,34 @@ ++//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++// ++//===----------------------------------------------------------------------===// ++ ++ ++#ifndef SIMACHINEFUNCTIONINFO_H_ ++#define SIMACHINEFUNCTIONINFO_H_ ++ ++#include "llvm/CodeGen/MachineFunction.h" ++ ++namespace llvm { ++ ++/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which ++/// tells the hardware which interpolation parameters to load. 
++class SIMachineFunctionInfo : public MachineFunctionInfo { ++public: ++ SIMachineFunctionInfo(const MachineFunction &MF); ++ unsigned SPIPSInputAddr; ++ unsigned ShaderType; ++}; ++ ++} // End namespace llvm ++ ++ ++#endif //_SIMACHINEFUNCTIONINFO_H_ +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIRegisterInfo.cpp llvm-r600/lib/Target/R600/SIRegisterInfo.cpp +--- llvm-3.2.src/lib/Target/R600/SIRegisterInfo.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SIRegisterInfo.cpp 2013-01-25 19:43:57.480049720 +0100 +@@ -0,0 +1,48 @@ ++//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief SI implementation of the TargetRegisterInfo class. ++// ++//===----------------------------------------------------------------------===// ++ ++ ++#include "SIRegisterInfo.h" ++#include "AMDGPUTargetMachine.h" ++ ++using namespace llvm; ++ ++SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm, ++ const TargetInstrInfo &tii) ++: AMDGPURegisterInfo(tm, tii), ++ TM(tm), ++ TII(tii) ++ { } ++ ++BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { ++ BitVector Reserved(getNumRegs()); ++ return Reserved; ++} ++ ++const TargetRegisterClass * ++SIRegisterInfo::getISARegClass(const TargetRegisterClass * rc) const { ++ switch (rc->getID()) { ++ case AMDGPU::GPRF32RegClassID: ++ return &AMDGPU::VReg_32RegClass; ++ default: return rc; ++ } ++} ++ ++const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( ++ MVT VT) const { ++ switch(VT.SimpleTy) { ++ default: ++ case MVT::i32: return &AMDGPU::VReg_32RegClass; ++ } ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIRegisterInfo.h llvm-r600/lib/Target/R600/SIRegisterInfo.h +--- 
llvm-3.2.src/lib/Target/R600/SIRegisterInfo.h 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SIRegisterInfo.h 2013-01-25 19:43:57.483383054 +0100 +@@ -0,0 +1,47 @@ ++//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// \brief Interface definition for SIRegisterInfo ++// ++//===----------------------------------------------------------------------===// ++ ++ ++#ifndef SIREGISTERINFO_H_ ++#define SIREGISTERINFO_H_ ++ ++#include "AMDGPURegisterInfo.h" ++ ++namespace llvm { ++ ++class AMDGPUTargetMachine; ++class TargetInstrInfo; ++ ++struct SIRegisterInfo : public AMDGPURegisterInfo { ++ AMDGPUTargetMachine &TM; ++ const TargetInstrInfo &TII; ++ ++ SIRegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii); ++ ++ virtual BitVector getReservedRegs(const MachineFunction &MF) const; ++ ++ /// \param RC is an AMDIL reg class. ++ /// ++ /// \returns the SI register class that is equivalent to \p RC. 
++ virtual const TargetRegisterClass * ++ getISARegClass(const TargetRegisterClass *RC) const; ++ ++ /// \brief get the register class of the specified type to use in the ++ /// CFGStructurizer ++ virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const; ++}; ++ ++} // End namespace llvm ++ ++#endif // SIREGISTERINFO_H_ +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIRegisterInfo.td llvm-r600/lib/Target/R600/SIRegisterInfo.td +--- llvm-3.2.src/lib/Target/R600/SIRegisterInfo.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SIRegisterInfo.td 2013-01-25 19:43:57.483383054 +0100 +@@ -0,0 +1,167 @@ ++ ++let Namespace = "AMDGPU" in { ++ def low : SubRegIndex; ++ def high : SubRegIndex; ++ ++ def sub0 : SubRegIndex; ++ def sub1 : SubRegIndex; ++ def sub2 : SubRegIndex; ++ def sub3 : SubRegIndex; ++ def sub4 : SubRegIndex; ++ def sub5 : SubRegIndex; ++ def sub6 : SubRegIndex; ++ def sub7 : SubRegIndex; ++} ++ ++class SIReg encoding = 0> : Register { ++ let Namespace = "AMDGPU"; ++ let HWEncoding = encoding; ++} ++ ++class SI_64 subregs, bits<16> encoding> : RegisterWithSubRegs { ++ let Namespace = "AMDGPU"; ++ let SubRegIndices = [low, high]; ++ let HWEncoding = encoding; ++} ++ ++class SGPR_32 num, string name> : SIReg; ++ ++class VGPR_32 num, string name> : SIReg; ++ ++// Special Registers ++def VCC : SIReg<"VCC", 106>; ++def EXEC_LO : SIReg <"EXEC LO", 126>; ++def EXEC_HI : SIReg <"EXEC HI", 127>; ++def EXEC : SI_64<"EXEC", [EXEC_LO, EXEC_HI], 126>; ++def SCC : SIReg<"SCC", 253>; ++def SREG_LIT_0 : SIReg <"S LIT 0", 128>; ++def SI_LITERAL_CONSTANT : SIReg<"LITERAL CONSTANT", 255>; ++def M0 : SIReg <"M0", 124>; ++ ++//Interpolation registers ++def PERSP_SAMPLE_I : SIReg <"PERSP_SAMPLE_I">; ++def PERSP_SAMPLE_J : SIReg <"PERSP_SAMPLE_J">; ++def PERSP_CENTER_I : SIReg <"PERSP_CENTER_I">; ++def PERSP_CENTER_J : SIReg <"PERSP_CENTER_J">; ++def PERSP_CENTROID_I : SIReg <"PERSP_CENTROID_I">; ++def PERSP_CENTROID_J : SIReg 
<"PERP_CENTROID_J">; ++def PERSP_I_W : SIReg <"PERSP_I_W">; ++def PERSP_J_W : SIReg <"PERSP_J_W">; ++def PERSP_1_W : SIReg <"PERSP_1_W">; ++def LINEAR_SAMPLE_I : SIReg <"LINEAR_SAMPLE_I">; ++def LINEAR_SAMPLE_J : SIReg <"LINEAR_SAMPLE_J">; ++def LINEAR_CENTER_I : SIReg <"LINEAR_CENTER_I">; ++def LINEAR_CENTER_J : SIReg <"LINEAR_CENTER_J">; ++def LINEAR_CENTROID_I : SIReg <"LINEAR_CENTROID_I">; ++def LINEAR_CENTROID_J : SIReg <"LINEAR_CENTROID_J">; ++def LINE_STIPPLE_TEX_COORD : SIReg <"LINE_STIPPLE_TEX_COORD">; ++def POS_X_FLOAT : SIReg <"POS_X_FLOAT">; ++def POS_Y_FLOAT : SIReg <"POS_Y_FLOAT">; ++def POS_Z_FLOAT : SIReg <"POS_Z_FLOAT">; ++def POS_W_FLOAT : SIReg <"POS_W_FLOAT">; ++def FRONT_FACE : SIReg <"FRONT_FACE">; ++def ANCILLARY : SIReg <"ANCILLARY">; ++def SAMPLE_COVERAGE : SIReg <"SAMPLE_COVERAGE">; ++def POS_FIXED_PT : SIReg <"POS_FIXED_PT">; ++ ++// SGPR 32-bit registers ++foreach Index = 0-101 in { ++ def SGPR#Index : SGPR_32 ; ++} ++ ++def SGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32, ++ (add (sequence "SGPR%u", 0, 101))>; ++ ++// SGPR 64-bit registers ++def SGPR_64 : RegisterTuples<[low, high], ++ [(add (decimate SGPR_32, 2)), ++ (add(decimate (rotl SGPR_32, 1), 2))]>; ++ ++// SGPR 128-bit registers ++def SGPR_128 : RegisterTuples<[sel_x, sel_y, sel_z, sel_w], ++ [(add (decimate SGPR_32, 4)), ++ (add (decimate (rotl SGPR_32, 1), 4)), ++ (add (decimate (rotl SGPR_32, 2), 4)), ++ (add (decimate (rotl SGPR_32, 3), 4))]>; ++ ++// SGPR 256-bit registers ++def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], ++ [(add (decimate SGPR_32, 8)), ++ (add (decimate (rotl SGPR_32, 1), 8)), ++ (add (decimate (rotl SGPR_32, 2), 8)), ++ (add (decimate (rotl SGPR_32, 3), 8)), ++ (add (decimate (rotl SGPR_32, 4), 8)), ++ (add (decimate (rotl SGPR_32, 5), 8)), ++ (add (decimate (rotl SGPR_32, 6), 8)), ++ (add (decimate (rotl SGPR_32, 7), 8))]>; ++ ++// VGPR 32-bit registers ++foreach Index = 0-255 in { ++ def VGPR#Index : VGPR_32 ; ++} ++ 
++def VGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32, ++ (add (sequence "VGPR%u", 0, 255))>; ++ ++// VGPR 64-bit registers ++def VGPR_64 : RegisterTuples<[low, high], ++ [(add VGPR_32), ++ (add (rotl VGPR_32, 1))]>; ++ ++// VGPR 128-bit registers ++def VGPR_128 : RegisterTuples<[sel_x, sel_y, sel_z, sel_w], ++ [(add VGPR_32), ++ (add (rotl VGPR_32, 1)), ++ (add (rotl VGPR_32, 2)), ++ (add (rotl VGPR_32, 3))]>; ++ ++// Register class for all scalar registers (SGPRs + Special Registers) ++def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, ++ (add SGPR_32, SREG_LIT_0, M0, EXEC_LO, EXEC_HI) ++>; ++ ++def SReg_64 : RegisterClass<"AMDGPU", [i64], 64, (add SGPR_64, VCC, EXEC)>; ++ ++def SReg_1 : RegisterClass<"AMDGPU", [i1], 1, (add VCC, SGPR_64, EXEC)>; ++ ++def SReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add SGPR_128)>; ++ ++def SReg_256 : RegisterClass<"AMDGPU", [v8i32], 256, (add SGPR_256)>; ++ ++// Register class for all vector registers (VGPRs + Interpolation Registers) ++def VReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, ++ (add VGPR_32, ++ PERSP_SAMPLE_I, PERSP_SAMPLE_J, ++ PERSP_CENTER_I, PERSP_CENTER_J, ++ PERSP_CENTROID_I, PERSP_CENTROID_J, ++ PERSP_I_W, PERSP_J_W, PERSP_1_W, ++ LINEAR_SAMPLE_I, LINEAR_SAMPLE_J, ++ LINEAR_CENTER_I, LINEAR_CENTER_J, ++ LINEAR_CENTROID_I, LINEAR_CENTROID_J, ++ LINE_STIPPLE_TEX_COORD, ++ POS_X_FLOAT, ++ POS_Y_FLOAT, ++ POS_Z_FLOAT, ++ POS_W_FLOAT, ++ FRONT_FACE, ++ ANCILLARY, ++ SAMPLE_COVERAGE, ++ POS_FIXED_PT ++ ) ++>; ++ ++def VReg_64 : RegisterClass<"AMDGPU", [i64], 64, (add VGPR_64)>; ++ ++def VReg_128 : RegisterClass<"AMDGPU", [v4f32], 128, (add VGPR_128)>; ++ ++// AllReg_* - A set of all scalar and vector registers of a given width.
++def AllReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, (add VReg_32, SReg_32)>; ++ ++def AllReg_64 : RegisterClass<"AMDGPU", [f64, i64], 64, (add SReg_64, VReg_64)>; ++ ++// Special register classes for predicates and the M0 register ++def SCCReg : RegisterClass<"AMDGPU", [i1], 1, (add SCC)>; ++def VCCReg : RegisterClass<"AMDGPU", [i1], 1, (add VCC)>; ++def EXECReg : RegisterClass<"AMDGPU", [i1], 1, (add EXEC)>; ++def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>; ++ +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SISchedule.td llvm-r600/lib/Target/R600/SISchedule.td +--- llvm-3.2.src/lib/Target/R600/SISchedule.td 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/SISchedule.td 2013-01-25 19:43:57.483383054 +0100 +@@ -0,0 +1,15 @@ ++//===-- SISchedule.td - SI Scheduling definitions -------------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// TODO: This is just a placeholder for now. ++// ++//===----------------------------------------------------------------------===// ++ ++ ++def SI_Itin : ProcessorItineraries <[], [], []>; +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp llvm-r600/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp +--- llvm-3.2.src/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp 2013-01-25 19:43:57.483383054 +0100 +@@ -0,0 +1,26 @@ ++//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details.
++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++// ++//===----------------------------------------------------------------------===// ++ ++#include "AMDGPU.h" ++#include "llvm/Support/TargetRegistry.h" ++ ++using namespace llvm; ++ ++/// \brief The target for the AMDGPU backend ++Target llvm::TheAMDGPUTarget; ++ ++/// \brief Extern function to initialize the targets for the AMDGPU backend ++extern "C" void LLVMInitializeR600TargetInfo() { ++ RegisterTarget ++ R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX"); ++} +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/TargetInfo/CMakeLists.txt llvm-r600/lib/Target/R600/TargetInfo/CMakeLists.txt +--- llvm-3.2.src/lib/Target/R600/TargetInfo/CMakeLists.txt 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/TargetInfo/CMakeLists.txt 2013-01-25 19:43:57.483383054 +0100 +@@ -0,0 +1,7 @@ ++include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) ++ ++add_llvm_library(LLVMR600Info ++ AMDGPUTargetInfo.cpp ++ ) ++ ++add_dependencies(LLVMR600Info AMDGPUCommonTableGen intrinsics_gen) +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/TargetInfo/LLVMBuild.txt llvm-r600/lib/Target/R600/TargetInfo/LLVMBuild.txt +--- llvm-3.2.src/lib/Target/R600/TargetInfo/LLVMBuild.txt 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/TargetInfo/LLVMBuild.txt 2013-01-25 19:43:57.483383054 +0100 +@@ -0,0 +1,23 @@ ++;===- ./lib/Target/R600/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; ++; ++; The LLVM Compiler Infrastructure ++; ++; This file is distributed under the University of Illinois Open Source ++; License. See LICENSE.TXT for details. ++; ++;===------------------------------------------------------------------------===; ++; ++; This is an LLVMBuild description file for the components in this subdirectory. 
++; ++; For more information on the LLVMBuild system, please see: ++; ++; http://llvm.org/docs/LLVMBuild.html ++; ++;===------------------------------------------------------------------------===; ++ ++[component_0] ++type = Library ++name = R600Info ++parent = R600 ++required_libraries = MC Support ++add_to_library_groups = R600 +diff -Nur -x .git llvm-3.2.src/lib/Target/R600/TargetInfo/Makefile llvm-r600/lib/Target/R600/TargetInfo/Makefile +--- llvm-3.2.src/lib/Target/R600/TargetInfo/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/lib/Target/R600/TargetInfo/Makefile 2013-01-25 19:43:57.483383054 +0100 +@@ -0,0 +1,15 @@ ++##===- lib/Target/AMDGPU/TargetInfo/Makefile ----------------*- Makefile -*-===## ++# ++# The LLVM Compiler Infrastructure ++# ++# This file is distributed under the University of Illinois Open Source ++# License. See LICENSE.TXT for details. ++# ++##===----------------------------------------------------------------------===## ++LEVEL = ../../../.. ++LIBRARYNAME = LLVMR600Info ++ ++# Hack: we need to include 'main' target directory to grab private headers ++CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
++ ++include $(LEVEL)/Makefile.common +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/add.v4i32.ll llvm-r600/test/CodeGen/R600/add.v4i32.ll +--- llvm-3.2.src/test/CodeGen/R600/add.v4i32.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/add.v4i32.ll 2013-01-25 19:43:58.460049700 +0100 +@@ -0,0 +1,15 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ++ %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 ++ %a = load <4 x i32> addrspace(1) * %in ++ %b = load <4 x i32> addrspace(1) * %b_ptr ++ %result = add <4 x i32> %a, %b ++ store <4 x i32> %result, <4 x i32> addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/and.v4i32.ll llvm-r600/test/CodeGen/R600/and.v4i32.ll +--- llvm-3.2.src/test/CodeGen/R600/and.v4i32.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/and.v4i32.ll 2013-01-25 19:43:58.460049700 +0100 +@@ -0,0 +1,15 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ++ %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 ++ %a = load <4 x i32> addrspace(1) * %in ++ %b = load <4 x i32> addrspace(1) * %b_ptr ++ %result = and <4 x i32> %a, %b ++ store <4 x i32> %result, <4 x i32> addrspace(1)* %out ++ ret void ++} +diff 
-Nur -x .git llvm-3.2.src/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll llvm-r600/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll +--- llvm-3.2.src/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll 2013-01-25 19:43:58.460049700 +0100 +@@ -0,0 +1,33 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++; This test is for a bug in ++; DAGCombiner::reduceBuildVecConvertToConvertBuildVec() where ++; the wrong type was being passed to ++; TargetLowering::getOperationAction() when checking the legality of ++; ISD::UINT_TO_FP and ISD::SINT_TO_FP opcodes. ++ ++define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { ++entry: ++ %ptr = getelementptr i32 addrspace(1)* %in, i32 1 ++ %sint = load i32 addrspace(1) * %in ++ %conv = sitofp i32 %sint to float ++ %0 = insertelement <4 x float> undef, float %conv, i32 0 ++ %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer ++ store <4 x float> %splat, <4 x float> addrspace(1)* %out ++ ret void ++} ++ ++;CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { ++entry: ++ %ptr = getelementptr i32 addrspace(1)* %in, i32 1 ++ %uint = load i32 addrspace(1) * %in ++ %conv = uitofp i32 %uint to float ++ %0 = insertelement <4 x float> undef, float %conv, i32 0 ++ %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer ++ store <4 x float> %splat, <4 x float> addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fabs.ll llvm-r600/test/CodeGen/R600/fabs.ll +--- llvm-3.2.src/test/CodeGen/R600/fabs.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/fabs.ll 2013-01-25 19:43:58.460049700 +0100 +@@ -0,0 +1,16 @@ ++;RUN: llc 
< %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: MOV T{{[0-9]+\.[XYZW], \|T[0-9]+\.[XYZW]\|}} ++ ++define void @test() { ++ %r0 = call float @llvm.R600.load.input(i32 0) ++ %r1 = call float @fabs( float %r0) ++ call void @llvm.AMDGPU.store.output(float %r1, i32 0) ++ ret void ++} ++ ++declare float @llvm.R600.load.input(i32) readnone ++ ++declare void @llvm.AMDGPU.store.output(float, i32) ++ ++declare float @fabs(float ) readnone +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fadd.ll llvm-r600/test/CodeGen/R600/fadd.ll +--- llvm-3.2.src/test/CodeGen/R600/fadd.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/fadd.ll 2013-01-25 19:43:58.460049700 +0100 +@@ -0,0 +1,16 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test() { ++ %r0 = call float @llvm.R600.load.input(i32 0) ++ %r1 = call float @llvm.R600.load.input(i32 1) ++ %r2 = fadd float %r0, %r1 ++ call void @llvm.AMDGPU.store.output(float %r2, i32 0) ++ ret void ++} ++ ++declare float @llvm.R600.load.input(i32) readnone ++ ++declare void @llvm.AMDGPU.store.output(float, i32) ++ +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fadd.v4f32.ll llvm-r600/test/CodeGen/R600/fadd.v4f32.ll +--- llvm-3.2.src/test/CodeGen/R600/fadd.v4f32.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/fadd.v4f32.ll 2013-01-25 19:43:58.460049700 +0100 +@@ -0,0 +1,15 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { ++ %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1 ++ %a = load <4 x float> addrspace(1) * %in ++ %b = load <4 x float> addrspace(1) * %b_ptr ++ %result = 
fadd <4 x float> %a, %b ++ store <4 x float> %result, <4 x float> addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fcmp-cnde-int-args.ll llvm-r600/test/CodeGen/R600/fcmp-cnde-int-args.ll +--- llvm-3.2.src/test/CodeGen/R600/fcmp-cnde-int-args.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/fcmp-cnde-int-args.ll 2013-01-25 19:43:58.460049700 +0100 +@@ -0,0 +1,16 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++; This test checks a bug in R600TargetLowering::LowerSELECT_CC where the ++; chance to optimize the fcmp + select instructions to CNDE was missed ++; due to the fact that the operands to fcmp and select had different types ++ ++;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, 0.0, -1}} ++ ++define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { ++entry: ++ %0 = load float addrspace(1)* %in ++ %cmp = fcmp oeq float %0, 0.000000e+00 ++ %value = select i1 %cmp, i32 -1, i32 0 ++ store i32 %value, i32 addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fcmp-cnd.ll llvm-r600/test/CodeGen/R600/fcmp-cnd.ll +--- llvm-3.2.src/test/CodeGen/R600/fcmp-cnd.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/fcmp-cnd.ll 2013-01-25 19:43:58.460049700 +0100 +@@ -0,0 +1,14 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;Not checking arguments 2 and 3 to CNDE, because they may change between ++;registers and literal.x depending on what the optimizer does. 
++;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { ++entry: ++ %0 = load float addrspace(1)* %in ++ %cmp = fcmp oeq float %0, 0.000000e+00 ++ %value = select i1 %cmp, i32 2, i32 3 ++ store i32 %value, i32 addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fcmp.ll llvm-r600/test/CodeGen/R600/fcmp.ll +--- llvm-3.2.src/test/CodeGen/R600/fcmp.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/fcmp.ll 2013-01-25 19:43:58.460049700 +0100 +@@ -0,0 +1,16 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: SETE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: MOV T{{[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} ++;CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { ++entry: ++ %0 = load float addrspace(1)* %in ++ %arrayidx1 = getelementptr inbounds float addrspace(1)* %in, i32 1 ++ %1 = load float addrspace(1)* %arrayidx1 ++ %cmp = fcmp oeq float %0, %1 ++ %sext = sext i1 %cmp to i32 ++ store i32 %sext, i32 addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fdiv.v4f32.ll llvm-r600/test/CodeGen/R600/fdiv.v4f32.ll +--- llvm-3.2.src/test/CodeGen/R600/fdiv.v4f32.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/fdiv.v4f32.ll 2013-01-25 19:43:58.460049700 +0100 +@@ -0,0 +1,19 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} 
++;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { ++ %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1 ++ %a = load <4 x float> addrspace(1) * %in ++ %b = load <4 x float> addrspace(1) * %b_ptr ++ %result = fdiv <4 x float> %a, %b ++ store <4 x float> %result, <4 x float> addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/floor.ll llvm-r600/test/CodeGen/R600/floor.ll +--- llvm-3.2.src/test/CodeGen/R600/floor.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/floor.ll 2013-01-25 19:43:58.463383033 +0100 +@@ -0,0 +1,16 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: FLOOR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test() { ++ %r0 = call float @llvm.R600.load.input(i32 0) ++ %r1 = call float @floor(float %r0) ++ call void @llvm.AMDGPU.store.output(float %r1, i32 0) ++ ret void ++} ++ ++declare float @llvm.R600.load.input(i32) readnone ++ ++declare void @llvm.AMDGPU.store.output(float, i32) ++ ++declare float @floor(float) readonly +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fmax.ll llvm-r600/test/CodeGen/R600/fmax.ll +--- llvm-3.2.src/test/CodeGen/R600/fmax.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/fmax.ll 2013-01-25 19:43:58.463383033 +0100 +@@ -0,0 +1,16 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: MAX T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test() { ++ %r0 = call float @llvm.R600.load.input(i32 0) ++ %r1 = call float @llvm.R600.load.input(i32 1) ++ %r2 = fcmp uge float %r0, %r1 ++ %r3 = select i1 %r2, float %r0, float %r1 ++ call void @llvm.AMDGPU.store.output(float %r3, i32 0) ++ ret void ++} ++ ++declare float @llvm.R600.load.input(i32) readnone ++ ++declare void @llvm.AMDGPU.store.output(float, i32) +diff -Nur -x .git 
llvm-3.2.src/test/CodeGen/R600/fmin.ll llvm-r600/test/CodeGen/R600/fmin.ll +--- llvm-3.2.src/test/CodeGen/R600/fmin.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/fmin.ll 2013-01-25 19:43:58.463383033 +0100 +@@ -0,0 +1,16 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: MIN T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test() { ++ %r0 = call float @llvm.R600.load.input(i32 0) ++ %r1 = call float @llvm.R600.load.input(i32 1) ++ %r2 = fcmp uge float %r0, %r1 ++ %r3 = select i1 %r2, float %r1, float %r0 ++ call void @llvm.AMDGPU.store.output(float %r3, i32 0) ++ ret void ++} ++ ++declare float @llvm.R600.load.input(i32) readnone ++ ++declare void @llvm.AMDGPU.store.output(float, i32) +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fmul.ll llvm-r600/test/CodeGen/R600/fmul.ll +--- llvm-3.2.src/test/CodeGen/R600/fmul.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/fmul.ll 2013-01-25 19:43:58.463383033 +0100 +@@ -0,0 +1,16 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test() { ++ %r0 = call float @llvm.R600.load.input(i32 0) ++ %r1 = call float @llvm.R600.load.input(i32 1) ++ %r2 = fmul float %r0, %r1 ++ call void @llvm.AMDGPU.store.output(float %r2, i32 0) ++ ret void ++} ++ ++declare float @llvm.R600.load.input(i32) readnone ++ ++declare void @llvm.AMDGPU.store.output(float, i32) ++ +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fmul.v4f32.ll llvm-r600/test/CodeGen/R600/fmul.v4f32.ll +--- llvm-3.2.src/test/CodeGen/R600/fmul.v4f32.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/fmul.v4f32.ll 2013-01-25 19:43:58.463383033 +0100 +@@ -0,0 +1,15 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: MUL_IEEE 
T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { ++ %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1 ++ %a = load <4 x float> addrspace(1) * %in ++ %b = load <4 x float> addrspace(1) * %b_ptr ++ %result = fmul <4 x float> %a, %b ++ store <4 x float> %result, <4 x float> addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fsub.ll llvm-r600/test/CodeGen/R600/fsub.ll +--- llvm-3.2.src/test/CodeGen/R600/fsub.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/fsub.ll 2013-01-25 19:43:58.463383033 +0100 +@@ -0,0 +1,17 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++; CHECK: MOV T{{[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} ++; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test() { ++ %r0 = call float @llvm.R600.load.input(i32 0) ++ %r1 = call float @llvm.R600.load.input(i32 1) ++ %r2 = fsub float %r0, %r1 ++ call void @llvm.AMDGPU.store.output(float %r2, i32 0) ++ ret void ++} ++ ++declare float @llvm.R600.load.input(i32) readnone ++ ++declare void @llvm.AMDGPU.store.output(float, i32) ++ +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fsub.v4f32.ll llvm-r600/test/CodeGen/R600/fsub.v4f32.ll +--- llvm-3.2.src/test/CodeGen/R600/fsub.v4f32.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/fsub.v4f32.ll 2013-01-25 19:43:58.463383033 +0100 +@@ -0,0 +1,15 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { ++ %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1 ++ %a = load <4 x float> addrspace(1) * %in ++ 
%b = load <4 x float> addrspace(1) * %b_ptr ++ %result = fsub <4 x float> %a, %b ++ store <4 x float> %result, <4 x float> addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/i8_to_double_to_float.ll llvm-r600/test/CodeGen/R600/i8_to_double_to_float.ll +--- llvm-3.2.src/test/CodeGen/R600/i8_to_double_to_float.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/i8_to_double_to_float.ll 2013-01-25 19:43:58.463383033 +0100 +@@ -0,0 +1,11 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) { ++ %1 = load i8 addrspace(1)* %in ++ %2 = uitofp i8 %1 to double ++ %3 = fptrunc double %2 to float ++ store float %3, float addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/icmp-select-sete-reverse-args.ll llvm-r600/test/CodeGen/R600/icmp-select-sete-reverse-args.ll +--- llvm-3.2.src/test/CodeGen/R600/icmp-select-sete-reverse-args.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/icmp-select-sete-reverse-args.ll 2013-01-25 19:43:58.463383033 +0100 +@@ -0,0 +1,18 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;Test that a select with reversed True/False values is correctly lowered ++;to a SETNE_INT. There should only be one SETNE_INT instruction. 
++ ++;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK_NOT: SETNE_INT ++ ++define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ++entry: ++ %0 = load i32 addrspace(1)* %in ++ %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %in, i32 1 ++ %1 = load i32 addrspace(1)* %arrayidx1 ++ %cmp = icmp eq i32 %0, %1 ++ %value = select i1 %cmp, i32 0, i32 -1 ++ store i32 %value, i32 addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/literals.ll llvm-r600/test/CodeGen/R600/literals.ll +--- llvm-3.2.src/test/CodeGen/R600/literals.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/literals.ll 2013-01-25 19:43:58.463383033 +0100 +@@ -0,0 +1,30 @@ ++; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++; Test using an integer literal constant. ++; Generated ASM should be: ++; ADD_INT REG literal.x, 5 ++; or ++; ADD_INT literal.x REG, 5 ++ ++; CHECK: ADD_INT {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} 5 ++define void @i32_literal(i32 addrspace(1)* %out, i32 %in) { ++entry: ++ %0 = add i32 5, %in ++ store i32 %0, i32 addrspace(1)* %out ++ ret void ++} ++ ++; Test using a float literal constant. ++; Generated ASM should be: ++; ADD REG literal.x, 5.0 ++; or ++; ADD literal.x REG, 5.0 ++ ++; CHECK: ADD {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. 
]*}} {{[0-9]+}}(5.0 ++define void @float_literal(float addrspace(1)* %out, float %in) { ++entry: ++ %0 = fadd float 5.0, %in ++ store float %0, float addrspace(1)* %out ++ ret void ++} ++ +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/lit.local.cfg llvm-r600/test/CodeGen/R600/lit.local.cfg +--- llvm-3.2.src/test/CodeGen/R600/lit.local.cfg 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/lit.local.cfg 2013-01-25 19:43:58.463383033 +0100 +@@ -0,0 +1,13 @@ ++config.suffixes = ['.ll', '.c', '.cpp'] ++ ++def getRoot(config): ++ if not config.parent: ++ return config ++ return getRoot(config.parent) ++ ++root = getRoot(config) ++ ++targets = set(root.targets_to_build.split()) ++if not 'R600' in targets: ++ config.unsupported = True ++ +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/llvm.AMDGPU.mul.ll llvm-r600/test/CodeGen/R600/llvm.AMDGPU.mul.ll +--- llvm-3.2.src/test/CodeGen/R600/llvm.AMDGPU.mul.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/llvm.AMDGPU.mul.ll 2013-01-25 19:43:58.463383033 +0100 +@@ -0,0 +1,17 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test() { ++ %r0 = call float @llvm.R600.load.input(i32 0) ++ %r1 = call float @llvm.R600.load.input(i32 1) ++ %r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1) ++ call void @llvm.AMDGPU.store.output(float %r2, i32 0) ++ ret void ++} ++ ++declare float @llvm.R600.load.input(i32) readnone ++ ++declare void @llvm.AMDGPU.store.output(float, i32) ++ ++declare float @llvm.AMDGPU.mul(float ,float ) readnone +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/llvm.AMDGPU.trunc.ll llvm-r600/test/CodeGen/R600/llvm.AMDGPU.trunc.ll +--- llvm-3.2.src/test/CodeGen/R600/llvm.AMDGPU.trunc.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/llvm.AMDGPU.trunc.ll 2013-01-25 19:43:58.463383033 +0100 +@@ -0,0 +1,16 @@ ++;RUN: llc < %s -march=r600 
-mcpu=redwood | FileCheck %s ++ ++;CHECK: TRUNC T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test() { ++ %r0 = call float @llvm.R600.load.input(i32 0) ++ %r1 = call float @llvm.AMDGPU.trunc( float %r0) ++ call void @llvm.AMDGPU.store.output(float %r1, i32 0) ++ ret void ++} ++ ++declare float @llvm.R600.load.input(i32) readnone ++ ++declare void @llvm.AMDGPU.store.output(float, i32) ++ ++declare float @llvm.AMDGPU.trunc(float ) readnone +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/llvm.cos.ll llvm-r600/test/CodeGen/R600/llvm.cos.ll +--- llvm-3.2.src/test/CodeGen/R600/llvm.cos.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/llvm.cos.ll 2013-01-25 19:43:58.463383033 +0100 +@@ -0,0 +1,16 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: COS T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test() { ++ %r0 = call float @llvm.R600.load.input(i32 0) ++ %r1 = call float @llvm.cos.f32(float %r0) ++ call void @llvm.AMDGPU.store.output(float %r1, i32 0) ++ ret void ++} ++ ++declare float @llvm.cos.f32(float) readnone ++ ++declare float @llvm.R600.load.input(i32) readnone ++ ++declare void @llvm.AMDGPU.store.output(float, i32) +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/llvm.pow.ll llvm-r600/test/CodeGen/R600/llvm.pow.ll +--- llvm-3.2.src/test/CodeGen/R600/llvm.pow.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/llvm.pow.ll 2013-01-25 19:43:58.466716366 +0100 +@@ -0,0 +1,19 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: LOG_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK-NEXT: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++;CHECK-NEXT: EXP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test() { ++ %r0 = call float @llvm.R600.load.input(i32 0) ++ %r1 = call float @llvm.R600.load.input(i32 1) ++ %r2 = call float @llvm.pow.f32( float %r0, float %r1) ++ call void @llvm.AMDGPU.store.output(float %r2, i32 0) ++ ret void ++} 
++ ++declare float @llvm.R600.load.input(i32) readnone ++ ++declare void @llvm.AMDGPU.store.output(float, i32) ++ ++declare float @llvm.pow.f32(float ,float ) readonly +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/llvm.sin.ll llvm-r600/test/CodeGen/R600/llvm.sin.ll +--- llvm-3.2.src/test/CodeGen/R600/llvm.sin.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/llvm.sin.ll 2013-01-25 19:43:58.466716366 +0100 +@@ -0,0 +1,16 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: SIN T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test() { ++ %r0 = call float @llvm.R600.load.input(i32 0) ++ %r1 = call float @llvm.sin.f32( float %r0) ++ call void @llvm.AMDGPU.store.output(float %r1, i32 0) ++ ret void ++} ++ ++declare float @llvm.sin.f32(float) readnone ++ ++declare float @llvm.R600.load.input(i32) readnone ++ ++declare void @llvm.AMDGPU.store.output(float, i32) +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/load.constant_addrspace.f32.ll llvm-r600/test/CodeGen/R600/load.constant_addrspace.f32.ll +--- llvm-3.2.src/test/CodeGen/R600/load.constant_addrspace.f32.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/load.constant_addrspace.f32.ll 2013-01-25 19:43:58.466716366 +0100 +@@ -0,0 +1,9 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: VTX_READ_32 T{{[0-9]+\.X, T[0-9]+\.X}} ++ ++define void @test(float addrspace(1)* %out, float addrspace(2)* %in) { ++ %1 = load float addrspace(2)* %in ++ store float %1, float addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/load.i8.ll llvm-r600/test/CodeGen/R600/load.i8.ll +--- llvm-3.2.src/test/CodeGen/R600/load.i8.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/load.i8.ll 2013-01-25 19:43:58.466716366 +0100 +@@ -0,0 +1,10 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} ++ ++define void @test(i32 addrspace(1)* %out, 
i8 addrspace(1)* %in) { ++ %1 = load i8 addrspace(1)* %in ++ %2 = zext i8 %1 to i32 ++ store i32 %2, i32 addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/reciprocal.ll llvm-r600/test/CodeGen/R600/reciprocal.ll +--- llvm-3.2.src/test/CodeGen/R600/reciprocal.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/reciprocal.ll 2013-01-25 19:43:58.466716366 +0100 +@@ -0,0 +1,16 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test() { ++ %r0 = call float @llvm.R600.load.input(i32 0) ++ %r1 = fdiv float 1.0, %r0 ++ call void @llvm.AMDGPU.store.output(float %r1, i32 0) ++ ret void ++} ++ ++declare float @llvm.R600.load.input(i32) readnone ++ ++declare void @llvm.AMDGPU.store.output(float, i32) ++ ++declare float @llvm.AMDGPU.rcp(float ) readnone +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/sdiv.ll llvm-r600/test/CodeGen/R600/sdiv.ll +--- llvm-3.2.src/test/CodeGen/R600/sdiv.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/sdiv.ll 2013-01-25 19:43:58.466716366 +0100 +@@ -0,0 +1,21 @@ ++; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++; The code generated by sdiv is long and complex and may frequently change. ++; The goal of this test is to make sure the ISel doesn't fail. ++; ++; This program was previously failing to compile when one of the selectcc ++; opcodes generated by the sdiv lowering was being legalized and optimized to: ++; selectcc Remainder -1, 0, -1, SETGT ++; This was fixed by adding an additional pattern in R600Instructions.td to ++; match this pattern with a CNDGE_INT. 
++ ++; CHECK: RETURN ++ ++define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ++ %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1 ++ %num = load i32 addrspace(1) * %in ++ %den = load i32 addrspace(1) * %den_ptr ++ %result = sdiv i32 %num, %den ++ store i32 %result, i32 addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/selectcc_cnde_int.ll llvm-r600/test/CodeGen/R600/selectcc_cnde_int.ll +--- llvm-3.2.src/test/CodeGen/R600/selectcc_cnde_int.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/selectcc_cnde_int.ll 2013-01-25 19:43:58.466716366 +0100 +@@ -0,0 +1,11 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK-NOT: SETE_INT ++;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1, literal.x, 2}} ++define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ++ %1 = load i32 addrspace(1)* %in ++ %2 = icmp eq i32 %1, 0 ++ %3 = select i1 %2, i32 1, i32 2 ++ store i32 %3, i32 addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/selectcc_cnde.ll llvm-r600/test/CodeGen/R600/selectcc_cnde.ll +--- llvm-3.2.src/test/CodeGen/R600/selectcc_cnde.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/selectcc_cnde.ll 2013-01-25 19:43:58.466716366 +0100 +@@ -0,0 +1,11 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK-NOT: SETE ++;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1.0, literal.x, [-0-9]+\(2.0}} ++define void @test(float addrspace(1)* %out, float addrspace(1)* %in) { ++ %1 = load float addrspace(1)* %in ++ %2 = fcmp oeq float %1, 0.0 ++ %3 = select i1 %2, float 1.0, float 2.0 ++ store float %3, float addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/selectcc-icmp-select-float.ll llvm-r600/test/CodeGen/R600/selectcc-icmp-select-float.ll +--- llvm-3.2.src/test/CodeGen/R600/selectcc-icmp-select-float.ll 1970-01-01 01:00:00.000000000 +0100 ++++ 
llvm-r600/test/CodeGen/R600/selectcc-icmp-select-float.ll 2013-01-25 19:43:58.466716366 +0100 +@@ -0,0 +1,15 @@ ++; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++; Note additional optimizations may cause this SGT to be replaced with a ++; CND* instruction. ++; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, -1}} ++; Test a selectcc with i32 LHS/RHS and float True/False ++ ++define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { ++entry: ++ %0 = load i32 addrspace(1)* %in ++ %1 = icmp sge i32 %0, 0 ++ %2 = select i1 %1, float 1.0, float 0.0 ++ store float %2, float addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/setcc.v4i32.ll llvm-r600/test/CodeGen/R600/setcc.v4i32.ll +--- llvm-3.2.src/test/CodeGen/R600/setcc.v4i32.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/setcc.v4i32.ll 2013-01-25 19:43:58.466716366 +0100 +@@ -0,0 +1,12 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++;CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ++ %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 ++ %a = load <4 x i32> addrspace(1) * %in ++ %b = load <4 x i32> addrspace(1) * %b_ptr ++ %result = icmp eq <4 x i32> %a, %b ++ %sext = sext <4 x i1> %result to <4 x i32> ++ store <4 x i32> %sext, <4 x i32> addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/short-args.ll llvm-r600/test/CodeGen/R600/short-args.ll +--- llvm-3.2.src/test/CodeGen/R600/short-args.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/short-args.ll 2013-01-25 19:43:58.466716366 +0100 +@@ -0,0 +1,37 @@ ++; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} ++ ++define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { ++entry: ++ %0 = zext i8 %in to i32 ++ store i32 %0, 
i32 addrspace(1)* %out, align 4 ++ ret void ++} ++ ++; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} ++ ++define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { ++entry: ++ %0 = zext i8 %in to i32 ++ store i32 %0, i32 addrspace(1)* %out, align 4 ++ ret void ++} ++ ++; CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} ++ ++define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { ++entry: ++ %0 = zext i16 %in to i32 ++ store i32 %0, i32 addrspace(1)* %out, align 4 ++ ret void ++} ++ ++; CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} ++ ++define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { ++entry: ++ %0 = zext i16 %in to i32 ++ store i32 %0, i32 addrspace(1)* %out, align 4 ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/store.v4f32.ll llvm-r600/test/CodeGen/R600/store.v4f32.ll +--- llvm-3.2.src/test/CodeGen/R600/store.v4f32.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/store.v4f32.ll 2013-01-25 19:43:58.466716366 +0100 +@@ -0,0 +1,9 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: RAT_WRITE_CACHELESS_128 T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1 ++ ++define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { ++ %1 = load <4 x float> addrspace(1) * %in ++ store <4 x float> %1, <4 x float> addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/store.v4i32.ll llvm-r600/test/CodeGen/R600/store.v4i32.ll +--- llvm-3.2.src/test/CodeGen/R600/store.v4i32.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/store.v4i32.ll 2013-01-25 19:43:58.466716366 +0100 +@@ -0,0 +1,9 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;CHECK: RAT_WRITE_CACHELESS_128 T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1 ++ ++define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ++ %1 = load <4 x i32> addrspace(1) * %in ++ store <4 x i32> %1, <4 x i32> addrspace(1)* 
%out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/udiv.v4i32.ll llvm-r600/test/CodeGen/R600/udiv.v4i32.ll +--- llvm-3.2.src/test/CodeGen/R600/udiv.v4i32.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/udiv.v4i32.ll 2013-01-25 19:43:58.466716366 +0100 +@@ -0,0 +1,15 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;The code generated by udiv is long and complex and may frequently change. ++;The goal of this test is to make sure the ISel doesn't fail when it gets ++;a v4i32 udiv ++;CHECK: RETURN ++ ++define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ++ %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 ++ %a = load <4 x i32> addrspace(1) * %in ++ %b = load <4 x i32> addrspace(1) * %b_ptr ++ %result = udiv <4 x i32> %a, %b ++ store <4 x i32> %result, <4 x i32> addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/urem.v4i32.ll llvm-r600/test/CodeGen/R600/urem.v4i32.ll +--- llvm-3.2.src/test/CodeGen/R600/urem.v4i32.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/urem.v4i32.ll 2013-01-25 19:43:58.470049700 +0100 +@@ -0,0 +1,15 @@ ++;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++;The code generated by urem is long and complex and may frequently change. 
++;The goal of this test is to make sure the ISel doesn't fail when it gets ++;a v4i32 urem ++;CHECK: RETURN ++ ++define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ++ %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 ++ %a = load <4 x i32> addrspace(1) * %in ++ %b = load <4 x i32> addrspace(1) * %b_ptr ++ %result = urem <4 x i32> %a, %b ++ store <4 x i32> %result, <4 x i32> addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/vec4-expand.ll llvm-r600/test/CodeGen/R600/vec4-expand.ll +--- llvm-3.2.src/test/CodeGen/R600/vec4-expand.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/R600/vec4-expand.ll 2013-01-25 19:43:58.470049700 +0100 +@@ -0,0 +1,49 @@ ++; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s ++ ++; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @fp_to_sint(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { ++ %value = load <4 x float> addrspace(1) * %in ++ %result = fptosi <4 x float> %value to <4 x i32> ++ store <4 x i32> %result, <4 x i32> addrspace(1)* %out ++ ret void ++} ++ ++; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @fp_to_uint(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { ++ %value = load <4 x float> addrspace(1) * %in ++ %result = fptoui <4 x float> %value to <4 x i32> ++ store <4 x i32> %result, <4 x i32> addrspace(1)* %out ++ ret void ++} ++ ++; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} 
++; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @sint_to_fp(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ++ %value = load <4 x i32> addrspace(1) * %in ++ %result = sitofp <4 x i32> %value to <4 x float> ++ store <4 x float> %result, <4 x float> addrspace(1)* %out ++ ret void ++} ++ ++; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ++ ++define void @uint_to_fp(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ++ %value = load <4 x i32> addrspace(1) * %in ++ %result = uitofp <4 x i32> %value to <4 x float> ++ store <4 x float> %result, <4 x float> addrspace(1)* %out ++ ret void ++} +diff -Nur -x .git llvm-3.2.src/test/CodeGen/SI/sanity.ll llvm-r600/test/CodeGen/SI/sanity.ll +--- llvm-3.2.src/test/CodeGen/SI/sanity.ll 1970-01-01 01:00:00.000000000 +0100 ++++ llvm-r600/test/CodeGen/SI/sanity.ll 2013-01-25 19:43:58.470049700 +0100 +@@ -0,0 +1,37 @@ ++;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s ++ ++; CHECK: S_ENDPGM ++ ++define void @main() { ++main_body: ++ call void @llvm.AMDGPU.shader.type(i32 1) ++ %0 = load <4 x i32> addrspace(2)* addrspace(8)* inttoptr (i32 6 to <4 x i32> addrspace(2)* addrspace(8)*) ++ %1 = getelementptr <4 x i32> addrspace(2)* %0, i32 0 ++ %2 = load <4 x i32> addrspace(2)* %1 ++ %3 = call i32 @llvm.SI.vs.load.buffer.index() ++ %4 = call <4 x float> @llvm.SI.vs.load.input(<4 x i32> %2, i32 0, i32 %3) ++ %5 = extractelement <4 x float> %4, i32 0 ++ %6 = extractelement <4 x float> %4, i32 1 ++ %7 = extractelement <4 x float> %4, i32 2 ++ %8 = extractelement <4 x float> %4, i32 3 ++ %9 = load <4 x i32> addrspace(2)* addrspace(8)* inttoptr (i32 6 to <4 x i32> addrspace(2)* addrspace(8)*) ++ %10 = getelementptr <4 x i32> addrspace(2)* %9, i32 1 ++ %11 = load <4 x i32> addrspace(2)* %10 ++ %12 = call 
i32 @llvm.SI.vs.load.buffer.index() ++ %13 = call <4 x float> @llvm.SI.vs.load.input(<4 x i32> %11, i32 0, i32 %12) ++ %14 = extractelement <4 x float> %13, i32 0 ++ %15 = extractelement <4 x float> %13, i32 1 ++ %16 = extractelement <4 x float> %13, i32 2 ++ %17 = extractelement <4 x float> %13, i32 3 ++ call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %14, float %15, float %16, float %17) ++ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %5, float %6, float %7, float %8) ++ ret void ++} ++ ++declare void @llvm.AMDGPU.shader.type(i32) ++ ++declare i32 @llvm.SI.vs.load.buffer.index() readnone ++ ++declare <4 x float> @llvm.SI.vs.load.input(<4 x i32>, i32, i32) ++ ++declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) +diff -Nur -x .git llvm-3.2.src/test/CodeGen/X86/cvtv2f32.ll llvm-r600/test/CodeGen/X86/cvtv2f32.ll +--- llvm-3.2.src/test/CodeGen/X86/cvtv2f32.ll 2012-10-24 06:14:18.000000000 +0200 ++++ llvm-r600/test/CodeGen/X86/cvtv2f32.ll 2013-01-25 19:43:58.856716358 +0100 +@@ -1,3 +1,7 @@ ++; A bug fix in the DAGCombiner made this test fail, so marking as xfail ++; until this can be investigated further. 
++; XFAIL: * ++ + ; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s + + define <2 x float> @foo(i32 %x, i32 %y, <2 x float> %v) { diff --git a/llvm-tld.patch b/llvm-tld.patch new file mode 100644 index 0000000..38b54f6 --- /dev/null +++ b/llvm-tld.patch @@ -0,0 +1,79 @@ +--- llvm-3.2.src/tools/clang/lib/Driver/ToolChains.cpp.orig 2012-12-16 16:59:27.000000000 +0100 ++++ llvm-3.2.src/tools/clang/lib/Driver/ToolChains.cpp 2013-01-24 12:42:19.582377854 +0100 +@@ -1062,6 +1062,7 @@ + + static const char *const X86_64LibDirs[] = { "/lib64", "/lib" }; + static const char *const X86_64Triples[] = { ++ "x86_64-tld-linux", + "x86_64-linux-gnu", + "x86_64-unknown-linux-gnu", + "x86_64-pc-linux-gnu", +@@ -1074,6 +1075,7 @@ + }; + static const char *const X86LibDirs[] = { "/lib32", "/lib" }; + static const char *const X86Triples[] = { ++ "i686-tld-linux", + "i686-linux-gnu", + "i686-pc-linux-gnu", + "i486-linux-gnu", +@@ -1830,6 +1834,7 @@ + + enum LinuxDistro { + ArchLinux, ++ TLDLinux, + DebianLenny, + DebianSqueeze, + DebianWheezy, +@@ -1877,6 +1882,10 @@ + return Distro >= UbuntuHardy && Distro <= UbuntuRaring; + } + ++static bool IsTLD(enum LinuxDistro Distro) { ++ return Distro == TLDLinux; ++} ++ + static LinuxDistro DetectLinuxDistro(llvm::Triple::ArchType Arch) { + OwningPtr<llvm::MemoryBuffer> File; + if (!llvm::MemoryBuffer::getFile("/etc/lsb-release", File)) { +@@ -1955,6 +1964,9 @@ + if (!llvm::sys::fs::exists("/etc/arch-release", Exists) && Exists) + return ArchLinux; + ++ if (!llvm::sys::fs::exists("/etc/tld-release", Exists) && Exists) ++ return TLDLinux; ++ + return UnknownDistro; + } + +@@ -2072,7 +2084,7 @@ + + LinuxDistro Distro = DetectLinuxDistro(Arch); + +- if (IsOpenSuse(Distro) || IsUbuntu(Distro)) { ++ if (IsOpenSuse(Distro) || IsUbuntu(Distro) || IsTLD(Distro)) { + ExtraOpts.push_back("-z"); + ExtraOpts.push_back("relro"); + } +@@ -2088,7 +2100,7 @@ + // ABI requires a mapping between the GOT and the symbol table.
+ // Android loader does not support .gnu.hash. + if (!isMipsArch(Arch) && !IsAndroid) { +- if (IsRedhat(Distro) || IsOpenSuse(Distro) || ++ if (IsRedhat(Distro) || IsOpenSuse(Distro) || IsTLD(Distro) || + (IsUbuntu(Distro) && Distro >= UbuntuMaverick)) + ExtraOpts.push_back("--hash-style=gnu"); + +@@ -2097,11 +2109,11 @@ + ExtraOpts.push_back("--hash-style=both"); + } + +- if (IsRedhat(Distro)) ++ if (IsRedhat(Distro) || IsTLD(Distro)) + ExtraOpts.push_back("--no-add-needed"); + + if (Distro == DebianSqueeze || Distro == DebianWheezy || +- IsOpenSuse(Distro) || ++ IsOpenSuse(Distro) || IsTLD(Distro) || + (IsRedhat(Distro) && Distro != RHEL4 && Distro != RHEL5) || + (IsUbuntu(Distro) && Distro >= UbuntuKarmic)) + ExtraOpts.push_back("--build-id"); diff --git a/llvm.spec b/llvm.spec new file mode 100644 index 0000000..e70eec5 --- /dev/null +++ b/llvm.spec @@ -0,0 +1,537 @@ +# +# TODO: +# - fix include search path to support libdir/gcc/platform/version/include. +# current error: /usr/include/wchar.h:39:11: fatal error: 'stdarg.h' file not found# include +# +# Conditional build: +%bcond_without ocaml # ocaml binding +%bcond_with apidocs # The doxygen docs are HUGE, so they are not built by default. 
+%bcond_without man # man pages +%bcond_with tests # run tests + +%ifarch s390 s390x sparc64 +# No ocaml on these arches +%undefine with_ocaml +%endif + +Summary: The Low Level Virtual Machine (An Optimizing Compiler Infrastructure) +Summary(pl.UTF-8): Niskopoziomowa maszyna wirtualna (infrastruktura kompilatora optymalizującego) +Name: llvm +Version: 3.2 +Release: 4 +License: University of Illinois/NCSA Open Source License +Group: Development/Languages +#Source0Download: http://llvm.org/releases/download.html +Source0: http://llvm.org/releases/%{version}/%{name}-%{version}.src.tar.gz +# Source0-md5: 71610289bbc819e3e15fdd562809a2d7 +Source1: http://llvm.org/releases/%{version}/clang-%{version}.src.tar.gz +# Source1-md5: 3896ef4334df08563b05d0848ba80582 +Patch0: %{name}-config.patch +# Data files should be installed with timestamps preserved +Patch1: %{name}-2.6-timestamp.patch +Patch2: %{name}-tld.patch +# R600 target support from git://people.freedesktop.org/~tstellar/llvm +Patch3: %{name}-r600.patch +URL: http://llvm.org/ +BuildRequires: autoconf >= 2.60 +BuildRequires: automake >= 1:1.9.6 +BuildRequires: bash +BuildRequires: bison +BuildRequires: flex +BuildRequires: gcc >= 5:3.4 +# gcc4 might be installed, but not current __cc +%if "%(echo %{cc_version} | cut -d. 
-f1,2)" < "3.4" +BuildRequires: __cc >= 3.4 +%endif +BuildRequires: groff +BuildRequires: libltdl-devel +BuildRequires: libtool >= 2:1.5.22 +BuildRequires: libstdc++-devel >= 5:3.4 +BuildRequires: ocaml-ocamldoc +BuildRequires: perl-base >= 1:5.6 +BuildRequires: perl-tools-pod +BuildRequires: rpm-pythonprov +%{?with_man:BuildRequires: sphinx-pdg} +%if %{with apidocs} +BuildRequires: doxygen +BuildRequires: graphviz +%endif +%if %{with tests} +BuildRequires: dejagnu +BuildRequires: python +BuildRequires: tcl-devel +%endif +Requires: %{name}-libs = %{version}-%{release} +# LLVM is not supported on PPC64 +# http://llvm.org/bugs/show_bug.cgi?id=3729 +ExcludeArch: ppc64 +BuildRoot: %{tmpdir}/%{name}-%{version}-root-%(id -u -n) + +%define _sysconfdir /etc/%{name} + +%define specflags_ppc -fno-var-tracking-assignments + +# strip corrupts: $RPM_BUILD_ROOT/usr/lib64/llvm-gcc/bin/llvm-c++ ... +%define _noautostrip .*/\\(libmud.*\\.a\\|bin/llvm-.*\\|lib.*++\\.a\\) + +%description +LLVM is a compiler infrastructure designed for compile-time, +link-time, runtime, and idle-time optimization of programs from +arbitrary programming languages. LLVM is written in C++ and has been +developed since 2000 at the University of Illinois and Apple. It +currently supports compilation of C and C++ programs using clang +frontend. + +%description -l pl.UTF-8 +LLVM to infrastruktura kompilatora zaprojektowana do optymalizacji +czasu kompilowania, linkowania, działania i bezczynności programów w +dowolnych językach programowania. Jest napisana w C++, rozwijana od +roku 2000 przez Uniwersytet w Illinois i Apple. Aktualnie obsługuje +kompilację programów w C i C++ przy użyciu frontendu clang. + +%package libs +Summary: LLVM shared library +Summary(pl.UTF-8): Biblioteka współdzielona LLVM-a +Group: Libraries +Conflicts: llvm < 3.2 + +%description libs +LLVM shared library. + +%description libs -l pl.UTF-8 +Biblioteka współdzielona LLVM-a. 
+ +%package devel +Summary: Static libraries and header files for LLVM +Summary(pl.UTF-8): Biblioteki statyczne i pliki nagłówkowe dla LLVM-a +Group: Development/Languages +Requires: %{name}-libs = %{version}-%{release} +Requires: libstdc++-devel >= 6:3.4 + +%description devel +This package contains static libraries and header files needed to +develop new native programs that use the LLVM infrastructure. + +%description devel -l pl.UTF-8 +Ten pakiet zawiera biblioteki statyczne oraz pliki nagłówkowe +potrzebne do tworzenia nowych programów natywnych wykorzystujących +infrastrukturę LLVM. + +%package doc +Summary: Documentation for LLVM +Summary(pl.UTF-8): Dokumentacja do LLVM-a +Group: Documentation +# does not require base + +%description doc +Documentation for the LLVM compiler infrastructure. + +%description doc -l pl.UTF-8 +Dokumentacja do infrastruktury kompilatorów LLVM. + +%package apidocs +Summary: API documentation for LLVM +Summary(pl.UTF-8): Dokumentacja API LLVM-a +Group: Development/Languages +Requires: %{name}-doc = %{version}-%{release} + +%description apidocs +API documentation for the LLVM compiler infrastructure. + +%description apidocs -l pl.UTF-8 +Dokumentacja API infrastruktury kompilatorów LLVM. + +%package -n clang +Summary: A C language family frontend for LLVM +Summary(pl.UTF-8): Frontend LLVM-a do języków z rodziny C +License: NCSA +Group: Development/Languages +Requires: %{name} = %{version}-%{release} + +%description -n clang +clang: noun 1. A loud, resonant, metallic sound. 2. The strident call +of a crane or goose. 3. C-language family front-end toolkit. + +The goal of the Clang project is to create a new C, C++, Objective C +and Objective C++ front-end for the LLVM compiler. Its tools are built +as libraries and designed to be loosely-coupled and extendable. + +%description -n clang -l pl.UTF-8 +clang (z angielskiego): 1. głośny, rezonujący, metaliczny dźwięk; 2. +piskliwy odgłos żurawia lub gęsi; 3. 
narzędzia frontendowe dla języków +z rodziny C. + +Celem projektu Clang jest utworzenie nowego frontendu dla kompilatora +LLVM do języków C, C++, Objective C i Objective C++. Narzędzia są +budowane jako biblioteki i zaprojektowane z myślą o swobodnym łączeniu +i rozszerzaniu. + +%package -n clang-analyzer +Summary: A source code analysis framework +Summary(pl.UTF-8): Szkielet do analizy kodu źródłowego +License: NCSA +Group: Development/Languages +Requires: clang = %{version}-%{release} +# not picked up automatically since files are currently not installed +# in standard Python hierarchies yet +Requires: python + +%description -n clang-analyzer +The Clang Static Analyzer consists of both a source code analysis +framework and a standalone tool that finds bugs in C and Objective-C +programs. The standalone tool is invoked from the command-line, and is +intended to run in tandem with a build of a project or code base. + +%description -n clang-analyzer -l pl.UTF-8 +Clang Static Analyzer składa się ze szkieletu do analizy kodu +źródłowego oraz samodzielnego narzędzia znajdującego błędy w +programach w C i C++. Narzędzie jest wywoływane z linii poleceń, z +myślą o uruchamianiu wraz z kompilacją projektu lub kodu. + +%package -n clang-devel +Summary: Header files for Clang +Summary(pl.UTF-8): Pliki nagłówkowe Clanga +Group: Development/Languages +Requires: %{name}-devel = %{version}-%{release} +Requires: clang = %{version}-%{release} + +%description -n clang-devel +This package contains header files for the Clang compiler. + +%description -n clang-devel -l pl.UTF-8 +Ten pakiet zawiera pliki nagłówkowe kompilatora Clang. + +%package -n clang-doc +Summary: Documentation for Clang +Summary(pl.UTF-8): Dokumentacja do Clanga +Group: Documentation +Requires: %{name} = %{version}-%{release} + +%description -n clang-doc +Documentation for the Clang compiler front-end. + +%description -n clang-doc -l pl.UTF-8 +Dokumentacja do frontendu kompilatora Clang.
+ +%package -n clang-apidocs +Summary: API documentation for Clang +Summary(pl.UTF-8): Dokumentacja API Clanga +Group: Development/Languages +Requires: clang-doc = %{version}-%{release} + +%description -n clang-apidocs +API documentation for the Clang compiler. + +%description -n clang-apidocs -l pl.UTF-8 +Dokumentacja API kompilatora Clang. + +%package ocaml +Summary: OCaml binding for LLVM +Summary(pl.UTF-8): Wiązanie OCamla do LLVM-a +Group: Libraries +Requires: %{name} = %{version}-%{release} +%requires_eq ocaml-runtime + +%description ocaml +OCaml binding for LLVM. + +%description ocaml -l pl.UTF-8 +Wiązanie OCamla do LLVM-a. + +%package ocaml-devel +Summary: Development files for LLVM OCaml binding +Summary(pl.UTF-8): Pliki programistyczne wiązania OCamla do LLVM-a +Group: Development/Libraries +Requires: %{name}-devel = %{version}-%{release} +Requires: %{name}-ocaml = %{version}-%{release} + +%description ocaml-devel +The llvm-ocaml-devel package contains libraries and signature files +for developing applications that use llvm-ocaml binding. + +%description ocaml-devel -l pl.UTF-8 +Ten pakiet zawiera biblioteki i pliki sygnatur do tworzenia aplikacji +wykorzystujących wiązanie llvm-ocaml. + +%package ocaml-doc +Summary: Documentation for LLVM's OCaml binding +Summary(pl.UTF-8): Dokumentacja wiązania OCamla do LLVM-a +Group: Documentation +Requires: %{name}-ocaml = %{version}-%{release} + +%description ocaml-doc +HTML documentation for LLVM's OCaml binding. + +%description ocaml-doc -l pl.UTF-8 +Dokumentacja HTML wiązania OCamla do LLVM-a. 
+ +%prep +%setup -q -a1 -n %{name}-%{version}.src +mv clang-*.* tools/clang +%patch0 -p1 +%patch1 -p1 +%patch2 -p1 +%patch3 -p1 + +# configure does not properly specify libdir +%{__sed} -i 's|(PROJ_prefix)/lib|(PROJ_prefix)/%{_lib}|g' Makefile.config.in +%{__sed} -i 's|/lib/|/%{_lib}/|' lib/Support/Unix/Path.inc +# clang resources +%{__sed} -i 's|(PROJ_prefix)/lib/|(PROJ_prefix)/%{_lib}/|g' tools/clang/lib/Headers/Makefile +%{__sed} -i 's|"lib"|"%{_lib}"|' tools/clang/lib/Driver/Driver.cpp + +grep -rl /usr/bin/env tools utils | xargs sed -i -e '1{ + s,^#!.*bin/env python,#!%{__python}, + s,^#!.*bin/env perl,#!%{__perl}, +}' + +install -d obj + +%build +cd autoconf +%{__aclocal} -I m4 +%{__autoconf} -o ../configure configure.ac +cd .. +%{__autoheader} -I autoconf -I autoconf/m4 autoconf/configure.ac + +# Disabling assertions now, rec. by pure and needed for OpenGTL +# TESTFIX no PIC on ix86: http://llvm.org/bugs/show_bug.cgi?id=3801 +# +# bash specific 'test a < b' +cd obj +bash ../%configure \ + --datadir=%{_datadir}/%{name}-%{version} \ + --disable-assertions \ +%ifarch %{ix86} + --disable-pic \ +%endif + --disable-static \ + --enable-bindings=%{?with_ocaml:ocaml}%{!?with_ocaml:none} \ + --enable-debug-runtime \ +%if %{with apidocs} + --enable-doxygen \ +%endif + --enable-experimental-targets=R600 \ + --enable-jit \ + --enable-optimized \ + --enable-shared \ + --with-pic + +%{__make} \ + REQUIRES_RTTI=1 \ + OPTIMIZE_OPTION="%{rpmcflags} %{rpmcppflags}" + +%if %{with tests} +%{__make} check 2>&1 | tee llvm-testlog.txt +%{__make} -C tools/clang test 2>&1 | tee clang-testlog.txt +%endif + +cd .. 
+ +%if %{with man} +%{__make} -C docs -f Makefile.sphinx man +%endif + +%install +rm -rf $RPM_BUILD_ROOT +%{__make} -C obj -j1 install \ + PROJ_docsdir=/moredocs \ + DESTDIR=$RPM_BUILD_ROOT + +# Static analyzer not installed by default: +# http://clang-analyzer.llvm.org/installation#OtherPlatforms +install -d $RPM_BUILD_ROOT%{_libdir}/clang-analyzer +# create launchers +for f in scan-{build,view}; do + ln -s %{_libdir}/clang-analyzer/$f/$f $RPM_BUILD_ROOT%{_bindir}/$f + cp -pr tools/clang/tools/$f $RPM_BUILD_ROOT%{_libdir}/clang-analyzer +done +%{__mv} $RPM_BUILD_ROOT%{_libdir}/clang-analyzer/scan-build/scan-build.1 $RPM_BUILD_ROOT%{_mandir}/man1 +%py_comp $RPM_BUILD_ROOT%{_libdir}/clang-analyzer/scan-view +%py_ocomp $RPM_BUILD_ROOT%{_libdir}/clang-analyzer/scan-view +%py_postclean %{_libdir}/clang-analyzer/scan-view + +%if %{with man} +install -d $RPM_BUILD_ROOT%{_mandir}/man1 +cp -p docs/_build/man/*.1 $RPM_BUILD_ROOT%{_mandir}/man1 +# these tools are not installed +%{__rm} $RPM_BUILD_ROOT%{_mandir}/man1/{FileCheck,llvm-build}.1 +%endif + +# Move documentation back to build directory +rm -rf moredocs +mv $RPM_BUILD_ROOT/moredocs . +%{__rm} -v moredocs/*.tar.gz +%{__rm} -v moredocs/ocamldoc/html/*.tar.gz + +# and separate the apidoc +%if %{with apidocs} +rm -rf apidoc clang-apidoc +mv moredocs/html/doxygen apidoc +cp -a tools/clang/docs/doxygen/html clang-apidoc +%endif + +# And prepare Clang documentation +rm -rf clang-docs +install -d clang-docs +for f in LICENSE.TXT NOTES.txt README.txt; do + ln tools/clang/$f clang-docs +done + +# Get rid of erroneously installed example files. 
+%{__rm} -v $RPM_BUILD_ROOT%{_libdir}/*LLVMHello.* + +# remove documentation makefiles: +# they require the build directory to work +rm -rf moredocs/examples +cp -a examples moredocs/examples +find moredocs/examples -name Makefile -print0 | xargs -0r rm -f + +%clean +rm -rf $RPM_BUILD_ROOT + +%post libs -p /sbin/ldconfig +%postun libs -p /sbin/ldconfig + +%files +%defattr(644,root,root,755) +%doc CREDITS.TXT LICENSE.TXT README.txt %{?with_tests:llvm-testlog.txt} +%attr(755,root,root) %{_bindir}/bugpoint +%attr(755,root,root) %{_bindir}/llc +%attr(755,root,root) %{_bindir}/lli +%attr(755,root,root) %{_bindir}/llvm-ar +%attr(755,root,root) %{_bindir}/llvm-as +%attr(755,root,root) %{_bindir}/llvm-bcanalyzer +%attr(755,root,root) %{_bindir}/llvm-cov +%attr(755,root,root) %{_bindir}/llvm-diff +%attr(755,root,root) %{_bindir}/llvm-dis +%attr(755,root,root) %{_bindir}/llvm-dwarfdump +%attr(755,root,root) %{_bindir}/llvm-extract +%attr(755,root,root) %{_bindir}/llvm-link +%attr(755,root,root) %{_bindir}/llvm-mc +%attr(755,root,root) %{_bindir}/llvm-mcmarkup +%attr(755,root,root) %{_bindir}/llvm-nm +%attr(755,root,root) %{_bindir}/llvm-objdump +%attr(755,root,root) %{_bindir}/llvm-prof +%attr(755,root,root) %{_bindir}/llvm-ranlib +%attr(755,root,root) %{_bindir}/llvm-readobj +%attr(755,root,root) %{_bindir}/llvm-rtdyld +%attr(755,root,root) %{_bindir}/llvm-size +%attr(755,root,root) %{_bindir}/llvm-stress +%attr(755,root,root) %{_bindir}/llvm-tblgen +%attr(755,root,root) %{_bindir}/macho-dump +%attr(755,root,root) %{_bindir}/opt +%{_mandir}/man1/bugpoint.1* +%{_mandir}/man1/lit.1* +%{_mandir}/man1/llc.1* +%{_mandir}/man1/lli.1* +%{_mandir}/man1/llvm-ar.1* +%{_mandir}/man1/llvm-as.1* +%{_mandir}/man1/llvm-bcanalyzer.1* +%{_mandir}/man1/llvm-cov.1* +%{_mandir}/man1/llvm-diff.1* +%{_mandir}/man1/llvm-dis.1* +%{_mandir}/man1/llvm-extract.1* +%{_mandir}/man1/llvm-link.1* +%{_mandir}/man1/llvm-nm.1* +%{_mandir}/man1/llvm-prof.1* +%{_mandir}/man1/llvm-ranlib.1*
+%{_mandir}/man1/llvm-stress.1*
+%{_mandir}/man1/opt.1*
+%{_mandir}/man1/tblgen.1*
+
+%files libs
+%defattr(644,root,root,755)
+%attr(755,root,root) %{_libdir}/libLLVM-%{version}svn.so
+
+%files devel
+%defattr(644,root,root,755)
+%attr(755,root,root) %{_bindir}/llvm-config
+%attr(755,root,root) %{_libdir}/libprofile_rt.so
+%{_libdir}/libLLVM*.a
+%{_libdir}/libprofile_rt.a
+%ifarch %{x8664}
+%attr(755,root,root) %{_libdir}/BugpointPasses.so
+%attr(755,root,root) %{_libdir}/libLTO.so
+%{_libdir}/libLTO.a
+%endif
+%{_includedir}/llvm
+%{_includedir}/llvm-c
+%{_mandir}/man1/llvm-config.1*
+
+%files doc
+%defattr(644,root,root,755)
+%doc moredocs/examples moredocs/html
+
+%if %{with apidocs}
+%files apidocs
+%defattr(644,root,root,755)
+%doc apidoc/*
+%endif
+
+%files -n clang
+%defattr(644,root,root,755)
+%doc clang-docs/{LICENSE.TXT,NOTES.txt,README.txt} %{?with_tests:clang-testlog.txt}
+%attr(755,root,root) %{_bindir}/c-index-test
+%attr(755,root,root) %{_bindir}/clang
+%attr(755,root,root) %{_bindir}/clang++
+%attr(755,root,root) %{_bindir}/clang-check
+%attr(755,root,root) %{_bindir}/clang-tblgen
+%attr(755,root,root) %{_libdir}/libclang.so
+%{_libdir}/clang
+%{_mandir}/man1/clang.1*
+
+%files -n clang-analyzer
+%defattr(644,root,root,755)
+%attr(755,root,root) %{_bindir}/scan-build
+%attr(755,root,root) %{_bindir}/scan-view
+%{_mandir}/man1/scan-build.1*
+%dir %{_libdir}/clang-analyzer
+
+%dir %{_libdir}/clang-analyzer/scan-build
+%{_libdir}/clang-analyzer/scan-build/*.css
+%{_libdir}/clang-analyzer/scan-build/*.js
+%attr(755,root,root) %{_libdir}/clang-analyzer/scan-build/scan-build
+%attr(755,root,root) %{_libdir}/clang-analyzer/scan-build/*-analyzer
+
+%dir %{_libdir}/clang-analyzer/scan-view
+%attr(755,root,root) %{_libdir}/clang-analyzer/scan-view/scan-view
+%{_libdir}/clang-analyzer/scan-view/Resources
+%{_libdir}/clang-analyzer/scan-view/*.py[co]
+
+%files -n clang-devel
+%defattr(644,root,root,755)
+%{_libdir}/libclang*.a
+%{_includedir}/clang
+%{_includedir}/clang-c
+
+%files -n clang-doc
+%defattr(644,root,root,755)
+%doc tools/clang/docs/*.{css,html,png,txt}
+
+%if %{with apidocs}
+%files -n clang-apidocs
+%defattr(644,root,root,755)
+%doc clang-apidoc/*
+%endif
+
+%if %{with ocaml}
+%files ocaml
+%defattr(644,root,root,755)
+%{_libdir}/ocaml/META.llvm
+%{_libdir}/ocaml/llvm*.cma
+%{_libdir}/ocaml/llvm*.cmi
+
+%files ocaml-devel
+%defattr(644,root,root,755)
+%{_libdir}/libllvm*.a
+%{_libdir}/ocaml/libLLVM*.a
+%{_libdir}/ocaml/libllvm*.a
+%{_libdir}/ocaml/llvm*.a
+%{_libdir}/ocaml/llvm*.cmx*
+%{_libdir}/ocaml/llvm*.mli
+
+%files ocaml-doc
+%defattr(644,root,root,755)
+%doc moredocs/ocamldoc/html/*
+%endif