diff -Nur -x .git llvm-3.2.src/autoconf/configure.ac llvm-r600/autoconf/configure.ac --- llvm-3.2.src/autoconf/configure.ac 2012-11-21 17:13:35.000000000 +0100 +++ llvm-r600/autoconf/configure.ac 2013-01-25 19:43:56.096716416 +0100 @@ -751,6 +751,11 @@ if test ${enableval} != "disable" then + if test ${enableval} = "AMDGPU" + then + AC_MSG_ERROR([The AMDGPU target has been renamed to R600, please reconfigure with --enable-experimental-targets=R600]) + enableval="R600" + fi TARGETS_TO_BUILD="$enableval $TARGETS_TO_BUILD" fi diff -Nur -x .git llvm-3.2.src/configure llvm-r600/configure --- llvm-3.2.src/configure 2012-11-21 17:13:35.000000000 +0100 +++ llvm-r600/configure 2013-01-25 19:43:56.173383081 +0100 @@ -5473,6 +5473,13 @@ if test ${enableval} != "disable" then + if test ${enableval} = "AMDGPU" + then + { { echo "$as_me:$LINENO: error: The AMDGPU target has been renamed to R600, please reconfigure with --enable-experimental-targets=R600" >&5 +echo "$as_me: error: The AMDGPU target has been renamed to R600, please reconfigure with --enable-experimental-targets=R600" >&2;} + { (exit 1); exit 1; }; } + enableval="R600" + fi TARGETS_TO_BUILD="$enableval $TARGETS_TO_BUILD" fi @@ -10316,7 +10323,7 @@ lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext < + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>, + GCCBuiltin; + +multiclass R600ReadPreloadRegisterIntrinsic_xyz { + def _x : R600ReadPreloadRegisterIntrinsic; + def _y : R600ReadPreloadRegisterIntrinsic; + def _z : R600ReadPreloadRegisterIntrinsic; +} + +defm int_r600_read_global_size : R600ReadPreloadRegisterIntrinsic_xyz < + "__builtin_r600_read_global_size">; +defm int_r600_read_local_size : R600ReadPreloadRegisterIntrinsic_xyz < + "__builtin_r600_read_local_size">; +defm int_r600_read_ngroups : R600ReadPreloadRegisterIntrinsic_xyz < + "__builtin_r600_read_ngroups">; +defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz < + "__builtin_r600_read_tgid">; +defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz < + "__builtin_r600_read_tidig">; +} // End TargetPrefix = "r600" diff -Nur -x .git llvm-3.2.src/include/llvm/Intrinsics.td llvm-r600/include/llvm/Intrinsics.td --- llvm-3.2.src/include/llvm/Intrinsics.td 2012-10-20 01:00:20.000000000 +0200 +++ llvm-r600/include/llvm/Intrinsics.td 2013-01-25 19:43:56.426716409 +0100 @@ -469,3 +469,4 @@ include "llvm/IntrinsicsHexagon.td" include "llvm/IntrinsicsNVVM.td" include "llvm/IntrinsicsMips.td" +include "llvm/IntrinsicsR600.td" diff -Nur -x .git llvm-3.2.src/lib/CodeGen/SelectionDAG/DAGCombiner.cpp llvm-r600/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- llvm-3.2.src/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 2012-11-26 18:01:12.000000000 +0100 +++ llvm-r600/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 2013-01-25 19:43:56.720049736 +0100 @@ -8514,11 +8514,8 @@ if (Opcode == ISD::DELETED_NODE && (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) { Opcode = Opc; - // If not supported by target, bail out. - if (TLI.getOperationAction(Opcode, VT) != TargetLowering::Legal && - TLI.getOperationAction(Opcode, VT) != TargetLowering::Custom) - return SDValue(); } + if (Opc != Opcode) return SDValue(); @@ -8543,6 +8540,10 @@ assert(SrcVT != MVT::Other && "Cannot determine source type!"); EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars); + + if (!TLI.isOperationLegalOrCustom(Opcode, NVT)) + return SDValue(); + SmallVector Opnds; for (unsigned i = 0; i != NumInScalars; ++i) { SDValue In = N->getOperand(i); diff -Nur -x .git llvm-3.2.src/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp llvm-r600/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp --- llvm-3.2.src/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp 2012-10-24 19:25:11.000000000 +0200 +++ llvm-r600/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp 2013-01-25 19:43:56.733383069 +0100 @@ -731,9 +731,10 @@ return; } case TargetLowering::Promote: { - assert(VT.isVector() && "Unknown legal promote case!"); - Value = DAG.getNode(ISD::BITCAST, dl, - TLI.getTypeToPromoteTo(ISD::STORE, VT), Value); + EVT NVT = TLI.getTypeToPromoteTo(ISD::STORE, VT); + assert(NVT.getSizeInBits() == VT.getSizeInBits() && + "Can only promote stores to same size type"); + Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value); SDValue Result = DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), isVolatile, @@ -889,10 +890,9 @@ break; } case TargetLowering::Promote: { - // Only promote a load of vector type to another. - assert(VT.isVector() && "Cannot promote this load!"); - // Change base type to a different vector type. EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT); + assert(NVT.getSizeInBits() == VT.getSizeInBits() && + "Can only promote loads to same size type"); SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(), diff -Nur -x .git llvm-3.2.src/lib/Target/LLVMBuild.txt llvm-r600/lib/Target/LLVMBuild.txt --- llvm-3.2.src/lib/Target/LLVMBuild.txt 2012-07-16 20:19:46.000000000 +0200 +++ llvm-r600/lib/Target/LLVMBuild.txt 2013-01-25 19:43:57.173383060 +0100 @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC Sparc X86 XCore +subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC R600 Sparc X86 XCore ; This is a special group whose required libraries are extended (by llvm-build) ; with the best execution engine (the native JIT, if available, or the diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUAsmPrinter.cpp llvm-r600/lib/Target/R600/AMDGPUAsmPrinter.cpp --- llvm-3.2.src/lib/Target/R600/AMDGPUAsmPrinter.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUAsmPrinter.cpp 2013-01-25 19:43:57.423383055 +0100 @@ -0,0 +1,138 @@ +//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// The AMDGPUAsmPrinter is used to print both assembly string and also binary +/// code. When passed an MCAsmStreamer it prints assembly and when passed +/// an MCObjectStreamer it outputs binary code. +// +//===----------------------------------------------------------------------===// +// + + +#include "AMDGPUAsmPrinter.h" +#include "AMDGPU.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + + +static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm, + MCStreamer &Streamer) { + return new AMDGPUAsmPrinter(tm, Streamer); +} + +extern "C" void LLVMInitializeR600AsmPrinter() { + TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass); +} + +/// We need to override this function so we can avoid +/// the call to EmitFunctionHeader(), which the MCPureStreamer can't handle. +bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + const AMDGPUSubtarget &STM = TM.getSubtarget(); + if (STM.dumpCode()) { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + MF.dump(); +#endif + } + SetupMachineFunction(MF); + OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); + if (STM.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + EmitProgramInfo(MF); + } + EmitFunctionBody(); + return false; +} + +void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) { + unsigned MaxSGPR = 0; + unsigned MaxVGPR = 0; + bool VCCUsed = false; + const SIRegisterInfo * RI = + static_cast(TM.getRegisterInfo()); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + + unsigned numOperands = MI.getNumOperands(); + for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { + MachineOperand & MO = MI.getOperand(op_idx); + unsigned maxUsed; + unsigned width = 0; + bool isSGPR = false; + unsigned reg; + unsigned hwReg; + if (!MO.isReg()) { + continue; + } + reg = MO.getReg(); + if (reg == AMDGPU::VCC) { + VCCUsed = true; + continue; + } + switch (reg) { + default: break; + case AMDGPU::EXEC: + case AMDGPU::SI_LITERAL_CONSTANT: + case AMDGPU::SREG_LIT_0: + case AMDGPU::M0: + continue; + } + + if (AMDGPU::SReg_32RegClass.contains(reg)) { + isSGPR = true; + width = 1; + } else if (AMDGPU::VReg_32RegClass.contains(reg)) { + isSGPR = false; + width = 1; + } else if (AMDGPU::SReg_64RegClass.contains(reg)) { + isSGPR = true; + width = 2; + } else if (AMDGPU::VReg_64RegClass.contains(reg)) { + isSGPR = false; + width = 2; + } else if (AMDGPU::SReg_128RegClass.contains(reg)) { + isSGPR = true; + width = 4; + } else if (AMDGPU::VReg_128RegClass.contains(reg)) { + isSGPR = false; + width = 4; + } else if (AMDGPU::SReg_256RegClass.contains(reg)) { + isSGPR = true; + width = 8; + } else { + assert(!"Unknown register class"); + } + hwReg = RI->getEncodingValue(reg); + maxUsed = hwReg + width - 1; + if (isSGPR) { + MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; + } else { + MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR; + } + } + } + } + if (VCCUsed) { + MaxSGPR += 2; + } + SIMachineFunctionInfo * MFI = MF.getInfo(); + OutStreamer.EmitIntValue(MaxSGPR + 1, 4); + OutStreamer.EmitIntValue(MaxVGPR + 1, 4); + OutStreamer.EmitIntValue(MFI->SPIPSInputAddr, 4); +} diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUAsmPrinter.h llvm-r600/lib/Target/R600/AMDGPUAsmPrinter.h --- llvm-3.2.src/lib/Target/R600/AMDGPUAsmPrinter.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUAsmPrinter.h 2013-01-25 19:43:57.426716388 +0100 @@ -0,0 +1,44 @@ +//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Assembly printer class. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPU_ASMPRINTER_H +#define AMDGPU_ASMPRINTER_H + +#include "llvm/CodeGen/AsmPrinter.h" + +namespace llvm { + +class AMDGPUAsmPrinter : public AsmPrinter { + +public: + explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : AsmPrinter(TM, Streamer) { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "AMDGPU Assembly Printer"; + } + + /// \brief Emit register usage information so that the GPU driver + /// can correctly setup the GPU state. + void EmitProgramInfo(MachineFunction &MF); + + /// Implemented in AMDGPUMCInstLower.cpp + virtual void EmitInstruction(const MachineInstr *MI); +}; + +} // End anonymous llvm + +#endif //AMDGPU_ASMPRINTER_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUCodeEmitter.h llvm-r600/lib/Target/R600/AMDGPUCodeEmitter.h --- llvm-3.2.src/lib/Target/R600/AMDGPUCodeEmitter.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUCodeEmitter.h 2013-01-25 19:43:57.426716388 +0100 @@ -0,0 +1,49 @@ +//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief CodeEmitter interface for R600 and SI codegen. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUCODEEMITTER_H +#define AMDGPUCODEEMITTER_H + +namespace llvm { + +class AMDGPUCodeEmitter { +public: + uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const; + virtual uint64_t getMachineOpValue(const MachineInstr &MI, + const MachineOperand &MO) const { return 0; } + virtual unsigned GPR4AlignEncode(const MachineInstr &MI, + unsigned OpNo) const { + return 0; + } + virtual unsigned GPR2AlignEncode(const MachineInstr &MI, + unsigned OpNo) const { + return 0; + } + virtual uint64_t VOPPostEncode(const MachineInstr &MI, + uint64_t Value) const { + return Value; + } + virtual uint64_t i32LiteralEncode(const MachineInstr &MI, + unsigned OpNo) const { + return 0; + } + virtual uint32_t SMRDmemriEncode(const MachineInstr &MI, unsigned OpNo) + const { + return 0; + } +}; + +} // End namespace llvm + +#endif // AMDGPUCODEEMITTER_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUConvertToISA.cpp llvm-r600/lib/Target/R600/AMDGPUConvertToISA.cpp --- llvm-3.2.src/lib/Target/R600/AMDGPUConvertToISA.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUConvertToISA.cpp 2013-01-25 19:43:57.426716388 +0100 @@ -0,0 +1,62 @@ +//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This pass lowers AMDIL machine instructions to the appropriate +/// hardware instructions. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +using namespace llvm; + +namespace { + +class AMDGPUConvertToISAPass : public MachineFunctionPass { + +private: + static char ID; + TargetMachine &TM; + +public: + AMDGPUConvertToISAPass(TargetMachine &tm) : + MachineFunctionPass(ID), TM(tm) { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const {return "AMDGPU Convert to ISA";} + +}; + +} // End anonymous namespace + +char AMDGPUConvertToISAPass::ID = 0; + +FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) { + return new AMDGPUConvertToISAPass(tm); +} + +bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF) { + const AMDGPUInstrInfo * TII = + static_cast(TM.getInstrInfo()); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + TII->convertToISA(MI, MF, MBB.findDebugLoc(I)); + } + } + return false; +} diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPU.h llvm-r600/lib/Target/R600/AMDGPU.h --- llvm-3.2.src/lib/Target/R600/AMDGPU.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPU.h 2013-01-25 19:43:57.423383055 +0100 @@ -0,0 +1,51 @@ +//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef AMDGPU_H +#define AMDGPU_H + +#include "AMDGPUTargetMachine.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class FunctionPass; +class AMDGPUTargetMachine; + +// R600 Passes +FunctionPass* createR600KernelParametersPass(const DataLayout *TD); +FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); +FunctionPass *createR600LowerConstCopy(TargetMachine &tm); + +// SI Passes +FunctionPass *createSIAnnotateControlFlowPass(); +FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm); +FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); +FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); +FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm); +FunctionPass *createSIInsertWaits(TargetMachine &tm); + +// Passes common to R600 and SI +Pass *createAMDGPUStructurizeCFGPass(); +FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm); + +} // End namespace llvm + +namespace ShaderType { + enum Type { + PIXEL = 0, + VERTEX = 1, + GEOMETRY = 2, + COMPUTE = 3 + }; +} + +#endif // AMDGPU_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.cpp llvm-r600/lib/Target/R600/AMDGPUInstrInfo.cpp --- llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUInstrInfo.cpp 2013-01-25 19:43:57.426716388 +0100 @@ -0,0 +1,257 @@ +//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Implementation of the TargetInstrInfo class that is common to all +/// AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUInstrInfo.h" +#include "AMDGPURegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "AMDIL.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +#define GET_INSTRINFO_CTOR +#include "AMDGPUGenInstrInfo.inc" + +using namespace llvm; + +AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm) + : AMDGPUGenInstrInfo(0,0), RI(tm, *this), TM(tm) { } + +const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { + return RI; +} + +bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { +// TODO: Implement this function + return false; +} + +unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} + +unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} + +bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const { +// TODO: Implement this function + return false; +} +unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} +unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} +bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const { +// TODO: Implement this function + return false; +} + +MachineInstr * +AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { +// TODO: Implement this function + return NULL; +} +bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter, + MachineBasicBlock &MBB) const { + while (iter != MBB.end()) { + switch (iter->getOpcode()) { + default: + break; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: + case AMDGPU::BRANCH: + return true; + }; + ++iter; + } + return false; +} + +MachineBasicBlock::iterator skipFlowControl(MachineBasicBlock *MBB) { + MachineBasicBlock::iterator tmp = MBB->end(); + if (!MBB->size()) { + return MBB->end(); + } + while (--tmp) { + if (tmp->getOpcode() == AMDGPU::ENDLOOP + || tmp->getOpcode() == AMDGPU::ENDIF + || tmp->getOpcode() == AMDGPU::ELSE) { + if (tmp == MBB->begin()) { + return tmp; + } else { + continue; + } + } else { + return ++tmp; + } + } + return MBB->end(); +} + +void +AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, + int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + assert(!"Not Implemented"); +} + +void +AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + assert(!"Not Implemented"); +} + +MachineInstr * +AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl &Ops, + int FrameIndex) const { +// TODO: Implement this function + return 0; +} +MachineInstr* +AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl &Ops, + MachineInstr *LoadMI) const { + // TODO: Implement this function + return 0; +} +bool +AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl &Ops) const { + // TODO: Implement this function + return false; +} +bool +AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, + bool UnfoldStore, + SmallVectorImpl &NewMIs) const { + // TODO: Implement this function + return false; +} + +bool +AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl &NewNodes) const { + // TODO: Implement this function + return false; +} + +unsigned +AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex) const { + // TODO: Implement this function + return 0; +} + +bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const { + assert(Offset2 > Offset1 + && "Second offset should be larger than first offset!"); + // If we have less than 16 loads in a row, and the offsets are within 16, + // then schedule together. + // TODO: Make the loads schedule near if it fits in a cacheline + return (NumLoads < 16 && (Offset2 - Offset1) < 16); +} + +bool +AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl &Cond) + const { + // TODO: Implement this function + return true; +} +void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + // TODO: Implement this function +} + +bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const { + // TODO: Implement this function + return false; +} +bool +AMDGPUInstrInfo::SubsumesPredicate(const SmallVectorImpl &Pred1, + const SmallVectorImpl &Pred2) + const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector &Pred) const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const { + // TODO: Implement this function + return MI->getDesc().isPredicable(); +} + +bool +AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { + // TODO: Implement this function + return true; +} + +void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF, + DebugLoc DL) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const AMDGPURegisterInfo & RI = getRegisterInfo(); + + for (unsigned i = 0; i < MI.getNumOperands(); i++) { + MachineOperand &MO = MI.getOperand(i); + // Convert dst regclass to one that is supported by the ISA + if (MO.isReg() && MO.isDef()) { + if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + const TargetRegisterClass * oldRegClass = MRI.getRegClass(MO.getReg()); + const TargetRegisterClass * newRegClass = RI.getISARegClass(oldRegClass); + + assert(newRegClass); + + MRI.setRegClass(MO.getReg(), newRegClass); + } + } + } +} diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.h llvm-r600/lib/Target/R600/AMDGPUInstrInfo.h --- llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUInstrInfo.h 2013-01-25 19:43:57.430049721 +0100 @@ -0,0 +1,149 @@ +//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Contains the definition of a TargetInstrInfo class that is common +/// to all AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUINSTRUCTIONINFO_H +#define AMDGPUINSTRUCTIONINFO_H + +#include "AMDGPURegisterInfo.h" +#include "AMDGPUInstrInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +#include + +#define GET_INSTRINFO_HEADER +#define GET_INSTRINFO_ENUM +#include "AMDGPUGenInstrInfo.inc" + +#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT +#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT +#define OPCODE_IS_ZERO AMDGPU::PRED_SETE +#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE + +namespace llvm { + +class AMDGPUTargetMachine; +class MachineFunction; +class MachineInstr; +class MachineInstrBuilder; + +class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { +private: + const AMDGPURegisterInfo RI; + TargetMachine &TM; + bool getNextBranchInstr(MachineBasicBlock::iterator &iter, + MachineBasicBlock &MBB) const; +public: + explicit AMDGPUInstrInfo(TargetMachine &tm); + + virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; + + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, + unsigned &DstReg, unsigned &SubIdx) const; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + bool hasLoadFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const; + unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + bool hasStoreFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const; + + MachineInstr * + convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const; + + + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const = 0; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + +protected: + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl &Ops, + int FrameIndex) const; + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl &Ops, + MachineInstr *LoadMI) const; +public: + bool canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl &Ops) const; + bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, bool UnfoldStore, + SmallVectorImpl &NewMIs) const; + bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl &NewNodes) const; + unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex = 0) const; + bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const; + + bool ReverseBranchCondition(SmallVectorImpl &Cond) const; + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const; + bool isPredicated(const MachineInstr *MI) const; + bool SubsumesPredicate(const SmallVectorImpl &Pred1, + const SmallVectorImpl &Pred2) const; + bool DefinesPredicate(MachineInstr *MI, + std::vector &Pred) const; + bool isPredicable(MachineInstr *MI) const; + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const; + + // Helper functions that check the opcode for status information + bool isLoadInst(llvm::MachineInstr *MI) const; + bool isExtLoadInst(llvm::MachineInstr *MI) const; + bool isSWSExtLoadInst(llvm::MachineInstr *MI) const; + bool isSExtLoadInst(llvm::MachineInstr *MI) const; + bool isZExtLoadInst(llvm::MachineInstr *MI) const; + bool isAExtLoadInst(llvm::MachineInstr *MI) const; + bool isStoreInst(llvm::MachineInstr *MI) const; + bool isTruncStoreInst(llvm::MachineInstr *MI) const; + + virtual MachineInstr* getMovImmInstr(MachineFunction *MF, unsigned DstReg, + int64_t Imm) const = 0; + virtual unsigned getIEQOpcode() const = 0; + virtual bool isMov(unsigned opcode) const = 0; + + /// \brief Convert the AMDIL MachineInstr to a supported ISA + /// MachineInstr + virtual void convertToISA(MachineInstr & MI, MachineFunction &MF, + DebugLoc DL) const; + +}; + +} // End llvm namespace + +#endif // AMDGPUINSTRINFO_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.td llvm-r600/lib/Target/R600/AMDGPUInstrInfo.td --- llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.td 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUInstrInfo.td 2013-01-25 19:43:57.430049721 +0100 @@ -0,0 +1,74 @@ +//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains DAG node defintions for the AMDGPU target. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// AMDGPU DAG Profiles +//===----------------------------------------------------------------------===// + +def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> +]>; + +//===----------------------------------------------------------------------===// +// AMDGPU DAG Nodes +// + +// out = ((a << 32) | b) >> c) +// +// Can be used to optimize rtol: +// rotl(a, b) = bitalign(a, a, 32 - b) +def AMDGPUbitalign : SDNode<"AMDGPUISD::BITALIGN", AMDGPUDTIntTernaryOp>; + +// This argument to this node is a dword address. +def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; + +// out = a - floor(a) +def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; + +// out = max(a, b) a and b are floats +def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = max(a, b) a and b are signed ints +def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = max(a, b) a and b are unsigned ints +def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = min(a, b) a and b are floats +def AMDGPUfmin : SDNode<"AMDGPUISD::FMIN", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = min(a, b) a snd b are signed ints +def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = min(a, b) a and b are unsigned ints +def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// urecip - This operation is a helper for integer division, it returns the +// result of 1 / a as a fractional unsigned integer. +// out = (2^32 / a) + e +// e is rounding error +def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>; + +def fpow : SDNode<"ISD::FPOW", SDTFPBinOp>; diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUInstructions.td llvm-r600/lib/Target/R600/AMDGPUInstructions.td --- llvm-3.2.src/lib/Target/R600/AMDGPUInstructions.td 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUInstructions.td 2013-01-25 19:43:57.430049721 +0100 @@ -0,0 +1,190 @@ +//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains instruction defs that are common to all hw codegen +// targets. +// +//===----------------------------------------------------------------------===// + +class AMDGPUInst pattern> : Instruction { + field bits<16> AMDILOp = 0; + field bits<3> Gen = 0; + + let Namespace = "AMDGPU"; + let OutOperandList = outs; + let InOperandList = ins; + let AsmString = asm; + let Pattern = pattern; + let Itinerary = NullALU; + let TSFlags{42-40} = Gen; + let TSFlags{63-48} = AMDILOp; +} + +class AMDGPUShaderInst pattern> + : AMDGPUInst { + + field bits<32> Inst = 0xffffffff; + +} + +def InstFlag : OperandWithDefaultOps ; + +def COND_EQ : PatLeaf < + (cond), + [{switch(N->get()){{default: return false; + case ISD::SETOEQ: case ISD::SETUEQ: + case ISD::SETEQ: return true;}}}] +>; + +def COND_NE : PatLeaf < + (cond), + [{switch(N->get()){{default: return false; + case ISD::SETONE: case ISD::SETUNE: + case ISD::SETNE: return true;}}}] +>; +def COND_GT : PatLeaf < + (cond), + [{switch(N->get()){{default: return false; + case ISD::SETOGT: case ISD::SETUGT: + case ISD::SETGT: return true;}}}] +>; + +def COND_GE : PatLeaf < + (cond), + [{switch(N->get()){{default: return false; + case ISD::SETOGE: case ISD::SETUGE: + case ISD::SETGE: return true;}}}] +>; + +def COND_LT : PatLeaf < + (cond), + [{switch(N->get()){{default: return false; + case ISD::SETOLT: case ISD::SETULT: + case ISD::SETLT: return true;}}}] +>; + +def COND_LE : PatLeaf < + (cond), + [{switch(N->get()){{default: return false; + case ISD::SETOLE: case ISD::SETULE: + case ISD::SETLE: return true;}}}] +>; + +//===----------------------------------------------------------------------===// +// Load/Store Pattern Fragments +//===----------------------------------------------------------------------===// + +def zextloadi8_global : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{ + return isGlobalLoad(dyn_cast(N)); +}]>; + +class Constants { +int TWO_PI = 0x40c90fdb; +int PI = 0x40490fdb; +int TWO_PI_INV = 0x3e22f983; +} +def CONST : Constants; + +def FP_ZERO : PatLeaf < + (fpimm), + [{return N->getValueAPF().isZero();}] +>; + +def FP_ONE : PatLeaf < + (fpimm), + [{return N->isExactlyValue(1.0);}] +>; + +let isCodeGenOnly = 1, isPseudo = 1, usesCustomInserter = 1 in { + +class CLAMP : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "CLAMP $dst, $src0", + [(set rc:$dst, (int_AMDIL_clamp rc:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] +>; + +class FABS : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "FABS $dst, $src0", + [(set rc:$dst, (fabs rc:$src0))] +>; + +class FNEG : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "FNEG $dst, $src0", + [(set rc:$dst, (fneg rc:$src0))] +>; + +def SHADER_TYPE : AMDGPUShaderInst < + (outs), + (ins i32imm:$type), + "SHADER_TYPE $type", + [(int_AMDGPU_shader_type imm:$type)] +>; + +} // End isCodeGenOnly = 1, isPseudo = 1, hasCustomInserter = 1 + +/* Generic helper patterns for intrinsics */ +/* -------------------------------------- */ + +class POW_Common : Pat < + (fpow rc:$src0, rc:$src1), + (exp_ieee (mul rc:$src1, (log_ieee rc:$src0))) +>; + +/* Other helper patterns */ +/* --------------------- */ + +/* Extract element pattern */ +class Extract_Element : Pat< + (sub_type (vector_extract (vec_type vec_class:$src), sub_idx)), + (EXTRACT_SUBREG vec_class:$src, sub_reg) +>; + +/* Insert element pattern */ +class Insert_Element : Pat < + + (vec_type (vector_insert (vec_type vec_class:$vec), + (elem_type elem_class:$elem), sub_idx)), + (INSERT_SUBREG vec_class:$vec, elem_class:$elem, sub_reg) +>; + +// Vector Build pattern +class Vector_Build : Pat < + (vecType (build_vector (elemType elemClass:$x), (elemType elemClass:$y), + (elemType elemClass:$z), (elemType elemClass:$w))), + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG + (vecType (IMPLICIT_DEF)), elemClass:$x, sel_x), elemClass:$y, sel_y), + elemClass:$z, sel_z), elemClass:$w, sel_w) +>; + +// bitconvert pattern +class BitConvert : Pat < + (dt (bitconvert (st rc:$src0))), + (dt rc:$src0) +>; + +class DwordAddrPat : Pat < + (vt (AMDGPUdwordaddr (vt rc:$addr))), + (vt rc:$addr) +>; + +include "R600Instructions.td" + +include "SIInstrInfo.td" + diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUIntrinsics.td llvm-r600/lib/Target/R600/AMDGPUIntrinsics.td --- llvm-3.2.src/lib/Target/R600/AMDGPUIntrinsics.td 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUIntrinsics.td 2013-01-25 19:43:57.430049721 +0100 @@ -0,0 +1,62 @@ +//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*-----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines intrinsics that are used by all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "AMDGPU", isTarget = 1 in { + + def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; + def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; + + def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; + def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; + def int_AMDGPU_kilp : Intrinsic<[], [], []>; + def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; + + def int_AMDGPU_shader_type : Intrinsic<[], [llvm_i32_ty], []>; +} + +let TargetPrefix = "TGSI", isTarget = 1 in { + + def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>; +} + +include "SIIntrinsics.td" diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUISelLowering.cpp llvm-r600/lib/Target/R600/AMDGPUISelLowering.cpp --- llvm-3.2.src/lib/Target/R600/AMDGPUISelLowering.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUISelLowering.cpp 2013-01-25 19:43:57.426716388 +0100 @@ -0,0 +1,418 @@ +//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This is the parent TargetLowering class for hardware code gen +/// targets. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUISelLowering.h" +#include "AMDILIntrinsicInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" + +using namespace llvm; + +AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : + TargetLowering(TM, new TargetLoweringObjectFileELF()) { + + // Initialize target lowering borrowed from AMDIL + InitAMDILLowering(); + + // We need to custom lower some of the intrinsics + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + + // Library functions. These default to Expand, but we have instructions + // for them. + setOperationAction(ISD::FCEIL, MVT::f32, Legal); + setOperationAction(ISD::FEXP2, MVT::f32, Legal); + setOperationAction(ISD::FPOW, MVT::f32, Legal); + setOperationAction(ISD::FLOG2, MVT::f32, Legal); + setOperationAction(ISD::FABS, MVT::f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::f32, Legal); + setOperationAction(ISD::FRINT, MVT::f32, Legal); + + // Lower floating point store/load to integer store/load to reduce the number + // of patterns in tablegen. + setOperationAction(ISD::STORE, MVT::f32, Promote); + AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); + + setOperationAction(ISD::STORE, MVT::v4f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); + + setOperationAction(ISD::LOAD, MVT::f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); + + setOperationAction(ISD::LOAD, MVT::v4f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + + setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Custom); + setOperationAction(ISD::UREM, MVT::i32, Expand); +} + +//===---------------------------------------------------------------------===// +// TargetLowering Callbacks +//===---------------------------------------------------------------------===// + +SDValue AMDGPUTargetLowering::LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Ins, + DebugLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const { + for (unsigned i = 0, e = Ins.size(); i < e; ++i) { + InVals.push_back(SDValue()); + } + return Chain; +} + +SDValue AMDGPUTargetLowering::LowerReturn( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + DebugLoc DL, SelectionDAG &DAG) const { + return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain); +} + +//===---------------------------------------------------------------------===// +// Target specific lowering +//===---------------------------------------------------------------------===// + +SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) + const { + switch (Op.getOpcode()) { + default: + Op.getNode()->dump(); + assert(0 && "Custom lowering code for this" + "instruction is not implemented yet!"); + break; + // AMDIL DAG lowering + case ISD::SDIV: return LowerSDIV(Op, DAG); + case ISD::SREM: return LowerSREM(Op, DAG); + case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + // AMDGPU DAG lowering + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); + } + return Op; +} + +SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + switch (IntrinsicID) { + default: return Op; + case AMDGPUIntrinsic::AMDIL_abs: + return LowerIntrinsicIABS(Op, DAG); + case AMDGPUIntrinsic::AMDIL_exp: + return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_lrp: + return LowerIntrinsicLRP(Op, DAG); + case AMDGPUIntrinsic::AMDIL_fraction: + return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDIL_mad: + return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + case AMDGPUIntrinsic::AMDIL_max: + return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_imax: + return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umax: + return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDIL_min: + return DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_imin: + return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umin: + return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDIL_round_nearest: + return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); + } +} + +///IABS(a) = SMAX(sub(0, a), a) +SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, + SelectionDAG &DAG) const { + + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), + Op.getOperand(1)); + + return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1)); +} + +/// Linear Interpolation +/// LRP(a, b, c) = muladd(a, b, (1 - a) * c) +SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, + DAG.getConstantFP(1.0f, MVT::f32), + Op.getOperand(1)); + SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, + Op.getOperand(3)); + return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1), + Op.getOperand(2), + OneSubAC); +} + +/// \brief Generate Min/Max node +SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue True = Op.getOperand(2); + SDValue False = Op.getOperand(3); + SDValue CC = Op.getOperand(4); + + if (VT != MVT::f32 || + !((LHS == True && RHS == False) || (LHS == False && RHS == True))) { + return SDValue(); + } + + ISD::CondCode CCOpcode = cast(CC)->get(); + switch (CCOpcode) { + case ISD::SETOEQ: + case ISD::SETONE: + case ISD::SETUNE: + case ISD::SETNE: + case ISD::SETUEQ: + case ISD::SETEQ: + case ISD::SETFALSE: + case ISD::SETFALSE2: + case ISD::SETTRUE: + case ISD::SETTRUE2: + case ISD::SETUO: + case ISD::SETO: + assert(0 && "Operation should already be optimised !"); + case ISD::SETULE: + case ISD::SETULT: + case ISD::SETOLE: + case ISD::SETOLT: + case ISD::SETLE: + case ISD::SETLT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS); + else + return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS); + } + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGE: + case ISD::SETOGE: + case ISD::SETUGT: + case ISD::SETOGT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS); + else + return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS); + } + case ISD::SETCC_INVALID: + assert(0 && "Invalid setcc condcode !"); + } + return Op; +} + + + +SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + SDValue Num = Op.getOperand(0); + SDValue Den = Op.getOperand(1); + + SmallVector Results; + + // RCP = URECIP(Den) = 2^32 / Den + e + // e is rounding error. + SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); + + // RCP_LO = umulo(RCP, Den) */ + SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den); + + // RCP_HI = mulhu (RCP, Den) */ + SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); + + // NEG_RCP_LO = -RCP_LO + SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), + RCP_LO); + + // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) + SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), + NEG_RCP_LO, RCP_LO, + ISD::SETEQ); + // Calculate the rounding error from the URECIP instruction + // E = mulhu(ABS_RCP_LO, RCP) + SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); + + // RCP_A_E = RCP + E + SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); + + // RCP_S_E = RCP - E + SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); + + // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) + SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), + RCP_A_E, RCP_S_E, + ISD::SETEQ); + // Quotient = mulhu(Tmp0, Num) + SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); + + // Num_S_Remainder = Quotient * Den + SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den); + + // Remainder = Num - Num_S_Remainder + SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); + + // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) + SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, + DAG.getConstant(-1, VT), + DAG.getConstant(0, VT), + ISD::SETGE); + // Remainder_GE_Zero = (Remainder >= 0 ? -1 : 0) + SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Remainder, + DAG.getConstant(0, VT), + DAG.getConstant(-1, VT), + DAG.getConstant(0, VT), + ISD::SETGE); + // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero + SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, + Remainder_GE_Zero); + + // Calculate Division result: + + // Quotient_A_One = Quotient + 1 + SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, + DAG.getConstant(1, VT)); + + // Quotient_S_One = Quotient - 1 + SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, + DAG.getConstant(1, VT)); + + // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) + SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), + Quotient, Quotient_A_One, ISD::SETEQ); + + // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) + Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), + Quotient_S_One, Div, ISD::SETEQ); + + // Calculate Rem result: + + // Remainder_S_Den = Remainder - Den + SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); + + // Remainder_A_Den = Remainder + Den + SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); + + // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) + SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), + Remainder, Remainder_S_Den, ISD::SETEQ); + + // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem) + Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), + Remainder_A_Den, Rem, ISD::SETEQ); + SDValue Ops[2]; + Ops[0] = Div; + Ops[1] = Rem; + return DAG.getMergeValues(Ops, 2, DL); +} + +//===----------------------------------------------------------------------===// +// Helper functions +//===----------------------------------------------------------------------===// + +bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast(Op)) { + return CFP->isExactlyValue(1.0); + } + if (ConstantSDNode *C = dyn_cast(Op)) { + return C->isAllOnesValue(); + } + return false; +} + +bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast(Op)) { + return CFP->getValueAPF().isZero(); + } + if (ConstantSDNode *C = dyn_cast(Op)) { + return C->isNullValue(); + } + return false; +} + +SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned VirtualRegister; + if (!MRI.isLiveIn(Reg)) { + VirtualRegister = MRI.createVirtualRegister(RC); + MRI.addLiveIn(Reg, VirtualRegister); + } else { + VirtualRegister = MRI.getLiveInVirtReg(Reg); + } + return DAG.getRegister(VirtualRegister, VT); +} + +#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; + +const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: return 0; + // AMDIL DAG nodes + NODE_NAME_CASE(MAD); + NODE_NAME_CASE(CALL); + NODE_NAME_CASE(UMUL); + NODE_NAME_CASE(DIV_INF); + NODE_NAME_CASE(RET_FLAG); + NODE_NAME_CASE(BRANCH_COND); + + // AMDGPU DAG nodes + NODE_NAME_CASE(DWORDADDR) + NODE_NAME_CASE(FRACT) + NODE_NAME_CASE(FMAX) + NODE_NAME_CASE(SMAX) + NODE_NAME_CASE(UMAX) + NODE_NAME_CASE(FMIN) + NODE_NAME_CASE(SMIN) + NODE_NAME_CASE(UMIN) + NODE_NAME_CASE(URECIP) + NODE_NAME_CASE(INTERP) + NODE_NAME_CASE(INTERP_P0) + NODE_NAME_CASE(EXPORT) + NODE_NAME_CASE(CONST_ADDRESS) + } +} diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUISelLowering.h llvm-r600/lib/Target/R600/AMDGPUISelLowering.h --- llvm-3.2.src/lib/Target/R600/AMDGPUISelLowering.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUISelLowering.h 2013-01-25 19:43:57.426716388 +0100 @@ -0,0 +1,145 @@ +//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition of the TargetLowering class that is common +/// to all AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUISELLOWERING_H +#define AMDGPUISELLOWERING_H + +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class MachineRegisterInfo; + +class AMDGPUTargetLowering : public TargetLowering { +private: + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; + +protected: + + /// \brief Helper function that adds Reg to the LiveIn list of the DAG's + /// MachineFunction. + /// + /// \returns a RegisterSDNode representing Reg. + SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const; + + bool isHWTrueValue(SDValue Op) const; + bool isHWFalseValue(SDValue Op) const; + +public: + AMDGPUTargetLowering(TargetMachine &TM); + + virtual SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Ins, + DebugLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const; + + virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + DebugLoc DL, SelectionDAG &DAG) const; + + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const; + virtual const char* getTargetNodeName(unsigned Opcode) const; + +// Functions defined in AMDILISelLowering.cpp +public: + + /// \brief Determine which of the bits specified in \p Mask are known to be + /// either zero or one and return them in the \p KnownZero and \p KnownOne + /// bitsets. + virtual void computeMaskedBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const; + + virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, unsigned Intrinsic) const; + + /// We want to mark f32/f64 floating point values as legal. + bool isFPImmLegal(const APFloat &Imm, EVT VT) const; + + /// We don't want to shrink f64/f32 constants. + bool ShouldShrinkFPConstant(EVT VT) const; + +private: + void InitAMDILLowering(); + SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM8(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM16(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; +}; + +namespace AMDGPUISD { + +enum { + // AMDIL ISD Opcodes + FIRST_NUMBER = ISD::BUILTIN_OP_END, + MAD, // 32bit Fused Multiply Add instruction + CALL, // Function call based on a single integer + UMUL, // 32bit unsigned multiplication + DIV_INF, // Divide with infinity returned on zero divisor + RET_FLAG, + BRANCH_COND, + // End AMDIL ISD Opcodes + BITALIGN, + DWORDADDR, + FRACT, + FMAX, + SMAX, + UMAX, + FMIN, + SMIN, + UMIN, + URECIP, + INTERP, + INTERP_P0, + EXPORT, + CONST_ADDRESS, + LAST_AMDGPU_ISD_NUMBER +}; + + +} // End namespace AMDGPUISD + +namespace SIISD { + +enum { + SI_FIRST = AMDGPUISD::LAST_AMDGPU_ISD_NUMBER, + VCC_AND, + VCC_BITCAST +}; + +} // End namespace SIISD + +} // End namespace llvm + +#endif // AMDGPUISELLOWERING_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUMCInstLower.cpp llvm-r600/lib/Target/R600/AMDGPUMCInstLower.cpp --- llvm-3.2.src/lib/Target/R600/AMDGPUMCInstLower.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUMCInstLower.cpp 2013-01-25 19:43:57.430049721 +0100 @@ -0,0 +1,83 @@ +//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst. +// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPUMCInstLower.h" +#include "AMDGPUAsmPrinter.h" +#include "R600InstrInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Constants.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx): + Ctx(ctx) +{ } + +void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + + for (unsigned i = 0, e = MI->getNumExplicitOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + + MCOperand MCOp; + switch (MO.getType()) { + default: + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_FPImmediate: { + const APFloat &FloatValue = MO.getFPImm()->getValueAPF(); + assert(&FloatValue.getSemantics() == &APFloat::IEEEsingle && + "Only floating point immediates are supported at the moment."); + MCOp = MCOperand::CreateFPImm(FloatValue.convertToFloat()); + break; + } + case MachineOperand::MO_Immediate: + MCOp = MCOperand::CreateImm(MO.getImm()); + break; + case MachineOperand::MO_Register: + MCOp = MCOperand::CreateReg(MO.getReg()); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( + MO.getMBB()->getSymbol(), Ctx)); + } + OutMI.addOperand(MCOp); + } +} + +void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { + AMDGPUMCInstLower MCInstLowering(OutContext); + + if (MI->isBundle()) { + const MachineBasicBlock *MBB = MI->getParent(); + MachineBasicBlock::const_instr_iterator I = MI; + ++I; + while (I != MBB->end() && I->isInsideBundle()) { + MCInst MCBundleInst; + const MachineInstr *BundledInst = I; + MCInstLowering.lower(BundledInst, MCBundleInst); + OutStreamer.EmitInstruction(MCBundleInst); + ++I; + } + } else { + MCInst TmpInst; + MCInstLowering.lower(MI, TmpInst); + OutStreamer.EmitInstruction(TmpInst); + } +} diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUMCInstLower.h llvm-r600/lib/Target/R600/AMDGPUMCInstLower.h --- llvm-3.2.src/lib/Target/R600/AMDGPUMCInstLower.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUMCInstLower.h 2013-01-25 19:43:57.430049721 +0100 @@ -0,0 +1,34 @@ +//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef AMDGPU_MCINSTLOWER_H +#define AMDGPU_MCINSTLOWER_H + +namespace llvm { + +class MCInst; +class MCContext; +class MachineInstr; + +class AMDGPUMCInstLower { + + MCContext &Ctx; + +public: + AMDGPUMCInstLower(MCContext &ctx); + + /// \brief Lower a MachineInstr to an MCInst + void lower(const MachineInstr *MI, MCInst &OutMI) const; + +}; + +} // End namespace llvm + +#endif //AMDGPU_MCINSTLOWER_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.cpp llvm-r600/lib/Target/R600/AMDGPURegisterInfo.cpp --- llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPURegisterInfo.cpp 2013-01-25 19:43:57.430049721 +0100 @@ -0,0 +1,51 @@ +//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Parent TargetRegisterInfo class common to all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPURegisterInfo.h" +#include "AMDGPUTargetMachine.h" + +using namespace llvm; + +AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm, + const TargetInstrInfo &tii) +: AMDGPUGenRegisterInfo(0), + TM(tm), + TII(tii) + { } + +//===----------------------------------------------------------------------===// +// Function handling callbacks - Functions are a seldom used feature of GPUS, so +// they are not supported at this time. +//===----------------------------------------------------------------------===// + +const uint16_t AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; + +const uint16_t* AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) + const { + return &CalleeSavedReg; +} + +void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, + RegScavenger *RS) const { + assert(!"Subroutines not supported yet"); +} + +unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { + assert(!"Subroutines not supported yet"); + return 0; +} + +#define GET_REGINFO_TARGET_DESC +#include "AMDGPUGenRegisterInfo.inc" diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.h llvm-r600/lib/Target/R600/AMDGPURegisterInfo.h --- llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPURegisterInfo.h 2013-01-25 19:43:57.430049721 +0100 @@ -0,0 +1,63 @@ +//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief TargetRegisterInfo interface that is implemented by all hw codegen +/// targets. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUREGISTERINFO_H +#define AMDGPUREGISTERINFO_H + +#include "llvm/ADT/BitVector.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#define GET_REGINFO_HEADER +#define GET_REGINFO_ENUM +#include "AMDGPUGenRegisterInfo.inc" + +namespace llvm { + +class AMDGPUTargetMachine; +class TargetInstrInfo; + +struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { + TargetMachine &TM; + const TargetInstrInfo &TII; + static const uint16_t CalleeSavedReg; + + AMDGPURegisterInfo(TargetMachine &tm, const TargetInstrInfo &tii); + + virtual BitVector getReservedRegs(const MachineFunction &MF) const { + assert(!"Unimplemented"); return BitVector(); + } + + /// \param RC is an AMDIL reg class. + /// + /// \returns The ISA reg class that is equivalent to \p RC. + virtual const TargetRegisterClass * getISARegClass( + const TargetRegisterClass * RC) const { + assert(!"Unimplemented"); return NULL; + } + + virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { + assert(!"Unimplemented"); return NULL; + } + + const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const; + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + RegScavenger *RS) const; + unsigned getFrameRegister(const MachineFunction &MF) const; + +}; + +} // End namespace llvm + +#endif // AMDIDSAREGISTERINFO_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.td llvm-r600/lib/Target/R600/AMDGPURegisterInfo.td --- llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.td 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPURegisterInfo.td 2013-01-25 19:43:57.433383055 +0100 @@ -0,0 +1,22 @@ +//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Tablegen register definitions common to all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +let Namespace = "AMDGPU" in { + def sel_x : SubRegIndex; + def sel_y : SubRegIndex; + def sel_z : SubRegIndex; + def sel_w : SubRegIndex; +} + +include "R600RegisterInfo.td" +include "SIRegisterInfo.td" diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUStructurizeCFG.cpp llvm-r600/lib/Target/R600/AMDGPUStructurizeCFG.cpp --- llvm-3.2.src/lib/Target/R600/AMDGPUStructurizeCFG.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUStructurizeCFG.cpp 2013-01-25 19:43:57.433383055 +0100 @@ -0,0 +1,714 @@ +//===-- AMDGPUStructurizeCFG.cpp - ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// The pass implemented in this file transforms the programs control flow +/// graph into a form that's suitable for code generation on hardware that +/// implements control flow by execution masking. This currently includes all +/// AMD GPUs but may as well be useful for other types of hardware. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/Module.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/Analysis/RegionIterator.h" +#include "llvm/Analysis/RegionInfo.h" +#include "llvm/Analysis/RegionPass.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" + +using namespace llvm; + +namespace { + +// Definition of the complex types used in this pass. + +typedef std::pair BBValuePair; +typedef ArrayRef BBVecRef; + +typedef SmallVector RNVector; +typedef SmallVector BBVector; +typedef SmallVector BBValueVector; + +typedef DenseMap PhiMap; +typedef DenseMap BBPhiMap; +typedef DenseMap BBPredicates; +typedef DenseMap PredMap; +typedef DenseMap VisitedMap; + +// The name for newly created blocks. + +static const char *FlowBlockName = "Flow"; + +/// @brief Transforms the control flow graph on one single entry/exit region +/// at a time. +/// +/// After the transform all "If"/"Then"/"Else" style control flow looks like +/// this: +/// +/// \verbatim +/// 1 +/// || +/// | | +/// 2 | +/// | / +/// |/ +/// 3 +/// || Where: +/// | | 1 = "If" block, calculates the condition +/// 4 | 2 = "Then" subregion, runs if the condition is true +/// | / 3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow +/// |/ 4 = "Else" optional subregion, runs if the condition is false +/// 5 5 = "End" block, also rejoins the control flow +/// \endverbatim +/// +/// Control flow is expressed as a branch where the true exit goes into the +/// "Then"/"Else" region, while the false exit skips the region +/// The condition for the optional "Else" region is expressed as a PHI node. +/// The incomming values of the PHI node are true for the "If" edge and false +/// for the "Then" edge. +/// +/// Additionally to that even complicated loops look like this: +/// +/// \verbatim +/// 1 +/// || +/// | | +/// 2 ^ Where: +/// | / 1 = "Entry" block +/// |/ 2 = "Loop" optional subregion, with all exits at "Flow" block +/// 3 3 = "Flow" block, with back edge to entry block +/// | +/// \endverbatim +/// +/// The back edge of the "Flow" block is always on the false side of the branch +/// while the true side continues the general flow. So the loop condition +/// consist of a network of PHI nodes where the true incoming values expresses +/// breaks and the false values expresses continue states. +class AMDGPUStructurizeCFG : public RegionPass { + + static char ID; + + Type *Boolean; + ConstantInt *BoolTrue; + ConstantInt *BoolFalse; + UndefValue *BoolUndef; + + Function *Func; + Region *ParentRegion; + + DominatorTree *DT; + + RNVector Order; + VisitedMap Visited; + PredMap Predicates; + BBPhiMap DeletedPhis; + BBVector FlowsInserted; + + BasicBlock *LoopStart; + BasicBlock *LoopEnd; + BBPredicates LoopPred; + + void orderNodes(); + + void buildPredicate(BranchInst *Term, unsigned Idx, + BBPredicates &Pred, bool Invert); + + void analyzeBlock(BasicBlock *BB); + + void analyzeLoop(BasicBlock *BB, unsigned &LoopIdx); + + void collectInfos(); + + bool dominatesPredicates(BasicBlock *A, BasicBlock *B); + + void killTerminator(BasicBlock *BB); + + RegionNode *skipChained(RegionNode *Node); + + void delPhiValues(BasicBlock *From, BasicBlock *To); + + void addPhiValues(BasicBlock *From, BasicBlock *To); + + BasicBlock *getNextFlow(BasicBlock *Prev); + + bool isPredictableTrue(BasicBlock *Prev, BasicBlock *Node); + + BasicBlock *wireFlowBlock(BasicBlock *Prev, RegionNode *Node); + + void createFlow(); + + void insertConditions(); + + void rebuildSSA(); + +public: + AMDGPUStructurizeCFG(): + RegionPass(ID) { + + initializeRegionInfoPass(*PassRegistry::getPassRegistry()); + } + + virtual bool doInitialization(Region *R, RGPassManager &RGM); + + virtual bool runOnRegion(Region *R, RGPassManager &RGM); + + virtual const char *getPassName() const { + return "AMDGPU simplify control flow"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + + AU.addRequired(); + AU.addPreserved(); + RegionPass::getAnalysisUsage(AU); + } + +}; + +} // end anonymous namespace + +char AMDGPUStructurizeCFG::ID = 0; + +/// \brief Initialize the types and constants used in the pass +bool AMDGPUStructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) { + LLVMContext &Context = R->getEntry()->getContext(); + + Boolean = Type::getInt1Ty(Context); + BoolTrue = ConstantInt::getTrue(Context); + BoolFalse = ConstantInt::getFalse(Context); + BoolUndef = UndefValue::get(Boolean); + + return false; +} + +/// \brief Build up the general order of nodes +void AMDGPUStructurizeCFG::orderNodes() { + scc_iterator I = scc_begin(ParentRegion), + E = scc_end(ParentRegion); + for (Order.clear(); I != E; ++I) { + std::vector &Nodes = *I; + Order.append(Nodes.begin(), Nodes.end()); + } +} + +/// \brief Build blocks and loop predicates +void AMDGPUStructurizeCFG::buildPredicate(BranchInst *Term, unsigned Idx, + BBPredicates &Pred, bool Invert) { + Value *True = Invert ? BoolFalse : BoolTrue; + Value *False = Invert ? BoolTrue : BoolFalse; + + RegionInfo *RI = ParentRegion->getRegionInfo(); + BasicBlock *BB = Term->getParent(); + + // Handle the case where multiple regions start at the same block + Region *R = BB != ParentRegion->getEntry() ? + RI->getRegionFor(BB) : ParentRegion; + + if (R == ParentRegion) { + // It's a top level block in our region + Value *Cond = True; + if (Term->isConditional()) { + BasicBlock *Other = Term->getSuccessor(!Idx); + + if (Visited.count(Other)) { + if (!Pred.count(Other)) + Pred[Other] = False; + + if (!Pred.count(BB)) + Pred[BB] = True; + return; + } + Cond = Term->getCondition(); + + if (Idx != Invert) + Cond = BinaryOperator::CreateNot(Cond, "", Term); + } + + Pred[BB] = Cond; + + } else if (ParentRegion->contains(R)) { + // It's a block in a sub region + while(R->getParent() != ParentRegion) + R = R->getParent(); + + Pred[R->getEntry()] = True; + + } else { + // It's a branch from outside into our parent region + Pred[BB] = True; + } +} + +/// \brief Analyze the successors of each block and build up predicates +void AMDGPUStructurizeCFG::analyzeBlock(BasicBlock *BB) { + pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + BBPredicates &Pred = Predicates[BB]; + + for (; PI != PE; ++PI) { + BranchInst *Term = cast((*PI)->getTerminator()); + + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = Term->getSuccessor(i); + if (Succ != BB) + continue; + buildPredicate(Term, i, Pred, false); + } + } +} + +/// \brief Analyze the conditions leading to loop to a previous block +void AMDGPUStructurizeCFG::analyzeLoop(BasicBlock *BB, unsigned &LoopIdx) { + BranchInst *Term = cast(BB->getTerminator()); + + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = Term->getSuccessor(i); + + // Ignore it if it's not a back edge + if (!Visited.count(Succ)) + continue; + + buildPredicate(Term, i, LoopPred, true); + + LoopEnd = BB; + if (Visited[Succ] < LoopIdx) { + LoopIdx = Visited[Succ]; + LoopStart = Succ; + } + } +} + +/// \brief Collect various loop and predicate infos +void AMDGPUStructurizeCFG::collectInfos() { + unsigned Number = 0, LoopIdx = ~0; + + // Reset predicate + Predicates.clear(); + + // and loop infos + LoopStart = LoopEnd = 0; + LoopPred.clear(); + + RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend(); + for (Visited.clear(); OI != OE; Visited[(*OI++)->getEntry()] = ++Number) { + + // Analyze all the conditions leading to a node + analyzeBlock((*OI)->getEntry()); + + if ((*OI)->isSubRegion()) + continue; + + // Find the first/last loop nodes and loop predicates + analyzeLoop((*OI)->getNodeAs(), LoopIdx); + } +} + +/// \brief Does A dominate all the predicates of B ? +bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *A, BasicBlock *B) { + BBPredicates &Preds = Predicates[B]; + for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); + PI != PE; ++PI) { + + if (!DT->dominates(A, PI->first)) + return false; + } + return true; +} + +/// \brief Remove phi values from all successors and the remove the terminator. +void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) { + TerminatorInst *Term = BB->getTerminator(); + if (!Term) + return; + + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); + SI != SE; ++SI) { + + delPhiValues(BB, *SI); + } + + Term->eraseFromParent(); +} + +/// First: Skip forward to the first region node that either isn't a subregion or not +/// dominating it's exit, remove all the skipped nodes from the node order. +/// +/// Second: Handle the first successor directly if the resulting nodes successor +/// predicates are still dominated by the original entry +RegionNode *AMDGPUStructurizeCFG::skipChained(RegionNode *Node) { + BasicBlock *Entry = Node->getEntry(); + + // Skip forward as long as it is just a linear flow + while (true) { + BasicBlock *Entry = Node->getEntry(); + BasicBlock *Exit; + + if (Node->isSubRegion()) { + Exit = Node->getNodeAs()->getExit(); + } else { + TerminatorInst *Term = Entry->getTerminator(); + if (Term->getNumSuccessors() != 1) + break; + Exit = Term->getSuccessor(0); + } + + // It's a back edge, break here so we can insert a loop node + if (!Visited.count(Exit)) + return Node; + + // More than node edges are pointing to exit + if (!DT->dominates(Entry, Exit)) + return Node; + + RegionNode *Next = ParentRegion->getNode(Exit); + RNVector::iterator I = std::find(Order.begin(), Order.end(), Next); + assert(I != Order.end()); + + Visited.erase(Next->getEntry()); + Order.erase(I); + Node = Next; + } + + BasicBlock *BB = Node->getEntry(); + TerminatorInst *Term = BB->getTerminator(); + if (Term->getNumSuccessors() != 2) + return Node; + + // Our node has exactly two succesors, check if we can handle + // any of them directly + BasicBlock *Succ = Term->getSuccessor(0); + if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ)) { + Succ = Term->getSuccessor(1); + if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ)) + return Node; + } else { + BasicBlock *Succ2 = Term->getSuccessor(1); + if (Visited.count(Succ2) && Visited[Succ] > Visited[Succ2] && + dominatesPredicates(Entry, Succ2)) + Succ = Succ2; + } + + RegionNode *Next = ParentRegion->getNode(Succ); + RNVector::iterator E = Order.end(); + RNVector::iterator I = std::find(Order.begin(), E, Next); + assert(I != E); + + killTerminator(BB); + FlowsInserted.push_back(BB); + Visited.erase(Succ); + Order.erase(I); + return ParentRegion->getNode(wireFlowBlock(BB, Next)); +} + +/// \brief Remove all PHI values coming from "From" into "To" and remember +/// them in DeletedPhis +void AMDGPUStructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) { + PhiMap &Map = DeletedPhis[To]; + for (BasicBlock::iterator I = To->begin(), E = To->end(); + I != E && isa(*I);) { + + PHINode &Phi = cast(*I++); + while (Phi.getBasicBlockIndex(From) != -1) { + Value *Deleted = Phi.removeIncomingValue(From, false); + Map[&Phi].push_back(std::make_pair(From, Deleted)); + } + } +} + +/// \brief Add the PHI values back once we knew the new predecessor +void AMDGPUStructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { + if (!DeletedPhis.count(To)) + return; + + PhiMap &Map = DeletedPhis[To]; + SSAUpdater Updater; + + for (PhiMap::iterator I = Map.begin(), E = Map.end(); I != E; ++I) { + + PHINode *Phi = I->first; + Updater.Initialize(Phi->getType(), ""); + BasicBlock *Fallback = To; + bool HaveFallback = false; + + for (BBValueVector::iterator VI = I->second.begin(), VE = I->second.end(); + VI != VE; ++VI) { + + Updater.AddAvailableValue(VI->first, VI->second); + BasicBlock *Dom = DT->findNearestCommonDominator(Fallback, VI->first); + if (Dom == VI->first) + HaveFallback = true; + else if (Dom != Fallback) + HaveFallback = false; + Fallback = Dom; + } + if (!HaveFallback) { + Value *Undef = UndefValue::get(Phi->getType()); + Updater.AddAvailableValue(Fallback, Undef); + } + + Phi->addIncoming(Updater.GetValueAtEndOfBlock(From), From); + } + DeletedPhis.erase(To); +} + +/// \brief Create a new flow node and update dominator tree and region info +BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Prev) { + LLVMContext &Context = Func->getContext(); + BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() : + Order.back()->getEntry(); + BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName, + Func, Insert); + DT->addNewBlock(Flow, Prev); + ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion); + FlowsInserted.push_back(Flow); + return Flow; +} + +/// \brief Can we predict that this node will always be called? +bool AMDGPUStructurizeCFG::isPredictableTrue(BasicBlock *Prev, + BasicBlock *Node) { + BBPredicates &Preds = Predicates[Node]; + bool Dominated = false; + + for (BBPredicates::iterator I = Preds.begin(), E = Preds.end(); + I != E; ++I) { + + if (I->second != BoolTrue) + return false; + + if (!Dominated && DT->dominates(I->first, Prev)) + Dominated = true; + } + return Dominated; +} + +/// \brief Wire up the new control flow by inserting or updating the branch +/// instructions at node exits +BasicBlock *AMDGPUStructurizeCFG::wireFlowBlock(BasicBlock *Prev, + RegionNode *Node) { + BasicBlock *Entry = Node->getEntry(); + + if (LoopStart == Entry) { + LoopStart = Prev; + LoopPred[Prev] = BoolTrue; + } + + // Wire it up temporary, skipChained may recurse into us + BranchInst::Create(Entry, Prev); + DT->changeImmediateDominator(Entry, Prev); + addPhiValues(Prev, Entry); + + Node = skipChained(Node); + + BasicBlock *Next = getNextFlow(Prev); + if (!isPredictableTrue(Prev, Entry)) { + // Let Prev point to entry and next block + Prev->getTerminator()->eraseFromParent(); + BranchInst::Create(Entry, Next, BoolUndef, Prev); + } else { + DT->changeImmediateDominator(Next, Entry); + } + + // Let node exit(s) point to next block + if (Node->isSubRegion()) { + Region *SubRegion = Node->getNodeAs(); + BasicBlock *Exit = SubRegion->getExit(); + + // Find all the edges from the sub region to the exit + BBVector ToDo; + for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { + if (SubRegion->contains(*I)) + ToDo.push_back(*I); + } + + // Modify the edges to point to the new flow block + for (BBVector::iterator I = ToDo.begin(), E = ToDo.end(); I != E; ++I) { + delPhiValues(*I, Exit); + TerminatorInst *Term = (*I)->getTerminator(); + Term->replaceUsesOfWith(Exit, Next); + } + + // Update the region info + SubRegion->replaceExit(Next); + + } else { + BasicBlock *BB = Node->getNodeAs(); + killTerminator(BB); + BranchInst::Create(Next, BB); + + if (BB == LoopEnd) + LoopEnd = 0; + } + + return Next; +} + +/// Destroy node order and visited map, build up flow order instead. +/// After this function control flow looks like it should be, but +/// branches only have undefined conditions. +void AMDGPUStructurizeCFG::createFlow() { + DeletedPhis.clear(); + + BasicBlock *Prev = Order.pop_back_val()->getEntry(); + assert(Prev == ParentRegion->getEntry() && "Incorrect node order!"); + Visited.erase(Prev); + + if (LoopStart == Prev) { + // Loop starts at entry, split entry so that we can predicate it + BasicBlock::iterator Insert = Prev->getFirstInsertionPt(); + BasicBlock *Split = Prev->splitBasicBlock(Insert, FlowBlockName); + DT->addNewBlock(Split, Prev); + ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion); + Predicates[Split] = Predicates[Prev]; + Order.push_back(ParentRegion->getBBNode(Split)); + LoopPred[Prev] = BoolTrue; + + } else if (LoopStart == Order.back()->getEntry()) { + // Loop starts behind entry, split entry so that we can jump to it + Instruction *Term = Prev->getTerminator(); + BasicBlock *Split = Prev->splitBasicBlock(Term, FlowBlockName); + DT->addNewBlock(Split, Prev); + ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion); + Prev = Split; + } + + killTerminator(Prev); + FlowsInserted.clear(); + FlowsInserted.push_back(Prev); + + while (!Order.empty()) { + RegionNode *Node = Order.pop_back_val(); + Visited.erase(Node->getEntry()); + Prev = wireFlowBlock(Prev, Node); + if (LoopStart && !LoopEnd) { + // Create an extra loop end node + LoopEnd = Prev; + Prev = getNextFlow(LoopEnd); + BranchInst::Create(Prev, LoopStart, BoolUndef, LoopEnd); + addPhiValues(LoopEnd, LoopStart); + } + } + + BasicBlock *Exit = ParentRegion->getExit(); + BranchInst::Create(Exit, Prev); + addPhiValues(Prev, Exit); + if (DT->dominates(ParentRegion->getEntry(), Exit)) + DT->changeImmediateDominator(Exit, Prev); + + if (LoopStart && LoopEnd) { + BBVector::iterator FI = std::find(FlowsInserted.begin(), + FlowsInserted.end(), + LoopStart); + for (; *FI != LoopEnd; ++FI) { + addPhiValues(*FI, (*FI)->getTerminator()->getSuccessor(0)); + } + } + + assert(Order.empty()); + assert(Visited.empty()); + assert(DeletedPhis.empty()); +} + +/// \brief Insert the missing branch conditions +void AMDGPUStructurizeCFG::insertConditions() { + SSAUpdater PhiInserter; + + for (BBVector::iterator FI = FlowsInserted.begin(), FE = FlowsInserted.end(); + FI != FE; ++FI) { + + BranchInst *Term = cast((*FI)->getTerminator()); + if (Term->isUnconditional()) + continue; + + PhiInserter.Initialize(Boolean, ""); + PhiInserter.AddAvailableValue(&Func->getEntryBlock(), BoolFalse); + + BasicBlock *Succ = Term->getSuccessor(0); + BBPredicates &Preds = (*FI == LoopEnd) ? LoopPred : Predicates[Succ]; + for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); + PI != PE; ++PI) { + + PhiInserter.AddAvailableValue(PI->first, PI->second); + } + + Term->setCondition(PhiInserter.GetValueAtEndOfBlock(*FI)); + } +} + +/// Handle a rare case where the disintegrated nodes instructions +/// no longer dominate all their uses. Not sure if this is really nessasary +void AMDGPUStructurizeCFG::rebuildSSA() { + SSAUpdater Updater; + for (Region::block_iterator I = ParentRegion->block_begin(), + E = ParentRegion->block_end(); + I != E; ++I) { + + BasicBlock *BB = *I; + for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); + II != IE; ++II) { + + bool Initialized = false; + for (Use *I = &II->use_begin().getUse(), *Next; I; I = Next) { + + Next = I->getNext(); + + Instruction *User = cast(I->getUser()); + if (User->getParent() == BB) { + continue; + + } else if (PHINode *UserPN = dyn_cast(User)) { + if (UserPN->getIncomingBlock(*I) == BB) + continue; + } + + if (DT->dominates(II, User)) + continue; + + if (!Initialized) { + Value *Undef = UndefValue::get(II->getType()); + Updater.Initialize(II->getType(), ""); + Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); + Updater.AddAvailableValue(BB, II); + Initialized = true; + } + Updater.RewriteUseAfterInsertions(*I); + } + } + } +} + +/// \brief Run the transformation for each region found +bool AMDGPUStructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) { + if (R->isTopLevelRegion()) + return false; + + Func = R->getEntry()->getParent(); + ParentRegion = R; + + DT = &getAnalysis(); + + orderNodes(); + collectInfos(); + createFlow(); + insertConditions(); + rebuildSSA(); + + Order.clear(); + Visited.clear(); + Predicates.clear(); + DeletedPhis.clear(); + FlowsInserted.clear(); + + return true; +} + +/// \brief Create the pass +Pass *llvm::createAMDGPUStructurizeCFGPass() { + return new AMDGPUStructurizeCFG(); +} diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUSubtarget.cpp llvm-r600/lib/Target/R600/AMDGPUSubtarget.cpp --- llvm-3.2.src/lib/Target/R600/AMDGPUSubtarget.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUSubtarget.cpp 2013-01-25 19:43:57.433383055 +0100 @@ -0,0 +1,87 @@ +//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Implements the AMDGPU specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUSubtarget.h" + +using namespace llvm; + +#define GET_SUBTARGETINFO_ENUM +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "AMDGPUGenSubtargetInfo.inc" + +AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : + AMDGPUGenSubtargetInfo(TT, CPU, FS), DumpCode(false) { + InstrItins = getInstrItineraryForCPU(CPU); + + memset(CapsOverride, 0, sizeof(*CapsOverride) + * AMDGPUDeviceInfo::MaxNumberCapabilities); + // Default card + StringRef GPU = CPU; + Is64bit = false; + DefaultSize[0] = 64; + DefaultSize[1] = 1; + DefaultSize[2] = 1; + ParseSubtargetFeatures(GPU, FS); + DevName = GPU; + Device = AMDGPUDeviceInfo::getDeviceFromName(DevName, this, Is64bit); +} + +AMDGPUSubtarget::~AMDGPUSubtarget() { + delete Device; +} + +bool +AMDGPUSubtarget::isOverride(AMDGPUDeviceInfo::Caps caps) const { + assert(caps < AMDGPUDeviceInfo::MaxNumberCapabilities && + "Caps index is out of bounds!"); + return CapsOverride[caps]; +} +bool +AMDGPUSubtarget::is64bit() const { + return Is64bit; +} +bool +AMDGPUSubtarget::isTargetELF() const { + return false; +} +size_t +AMDGPUSubtarget::getDefaultSize(uint32_t dim) const { + if (dim > 3) { + return 1; + } else { + return DefaultSize[dim]; + } +} + +std::string +AMDGPUSubtarget::getDataLayout() const { + if (!Device) { + return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16" + "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32" + "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64" + "-v96:128:128-v128:128:128-v192:256:256-v256:256:256" + "-v512:512:512-v1024:1024:1024-v2048:2048:2048-a0:0:64"); + } + return Device->getDataLayout(); +} + +std::string +AMDGPUSubtarget::getDeviceName() const { + return DevName; +} +const AMDGPUDevice * +AMDGPUSubtarget::device() const { + return Device; +} diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUSubtarget.h llvm-r600/lib/Target/R600/AMDGPUSubtarget.h --- llvm-3.2.src/lib/Target/R600/AMDGPUSubtarget.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUSubtarget.h 2013-01-25 19:43:57.433383055 +0100 @@ -0,0 +1,65 @@ +//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUSUBTARGET_H +#define AMDGPUSUBTARGET_H +#include "AMDILDevice.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +#define GET_SUBTARGETINFO_HEADER +#include "AMDGPUGenSubtargetInfo.inc" + +#define MAX_CB_SIZE (1 << 16) + +namespace llvm { + +class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { +private: + bool CapsOverride[AMDGPUDeviceInfo::MaxNumberCapabilities]; + const AMDGPUDevice *Device; + size_t DefaultSize[3]; + std::string DevName; + bool Is64bit; + bool Is32on64bit; + bool DumpCode; + bool R600ALUInst; + + InstrItineraryData InstrItins; + +public: + AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS); + virtual ~AMDGPUSubtarget(); + + const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + virtual void ParseSubtargetFeatures(llvm::StringRef CPU, llvm::StringRef FS); + + bool isOverride(AMDGPUDeviceInfo::Caps) const; + bool is64bit() const; + + // Helper functions to simplify if statements + bool isTargetELF() const; + const AMDGPUDevice* device() const; + std::string getDataLayout() const; + std::string getDeviceName() const; + virtual size_t getDefaultSize(uint32_t dim) const; + bool dumpCode() const { return DumpCode; } + bool r600ALUEncoding() const { return R600ALUInst; } + +}; + +} // End namespace llvm + +#endif // AMDGPUSUBTARGET_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUTargetMachine.cpp llvm-r600/lib/Target/R600/AMDGPUTargetMachine.cpp --- llvm-3.2.src/lib/Target/R600/AMDGPUTargetMachine.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUTargetMachine.cpp 2013-01-25 19:43:57.433383055 +0100 @@ -0,0 +1,148 @@ +//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The AMDGPU target machine contains all of the hardware specific +/// information needed to emit code for R600 and SI GPUs. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "AMDGPU.h" +#include "R600ISelLowering.h" +#include "R600InstrInfo.h" +#include "SIISelLowering.h" +#include "SIInstrInfo.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/PassManager.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Scalar.h" +#include + +using namespace llvm; + +extern "C" void LLVMInitializeR600Target() { + // Register the target + RegisterTargetMachine X(TheAMDGPUTarget); +} + +AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + TargetOptions Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OptLevel +) +: + LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel), + Subtarget(TT, CPU, FS), + Layout(Subtarget.getDataLayout()), + FrameLowering(TargetFrameLowering::StackGrowsUp, + Subtarget.device()->getStackAlignment(), 0), + IntrinsicInfo(this), + InstrItins(&Subtarget.getInstrItineraryData()) { + // TLInfo uses InstrInfo so it must be initialized after. + if (Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + InstrInfo = new R600InstrInfo(*this); + TLInfo = new R600TargetLowering(*this); + } else { + InstrInfo = new SIInstrInfo(*this); + TLInfo = new SITargetLowering(*this); + } +} + +AMDGPUTargetMachine::~AMDGPUTargetMachine() { +} + +namespace { +class AMDGPUPassConfig : public TargetPassConfig { +public: + AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + AMDGPUTargetMachine &getAMDGPUTargetMachine() const { + return getTM(); + } + + virtual bool addPreISel(); + virtual bool addInstSelector(); + virtual bool addPreRegAlloc(); + virtual bool addPostRegAlloc(); + virtual bool addPreSched2(); + virtual bool addPreEmitPass(); +}; +} // End of anonymous namespace + +TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) { + return new AMDGPUPassConfig(this, PM); +} + +bool +AMDGPUPassConfig::addPreISel() { + const AMDGPUSubtarget &ST = TM->getSubtarget(); + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + addPass(createAMDGPUStructurizeCFGPass()); + addPass(createSIAnnotateControlFlowPass()); + } + return false; +} + +bool AMDGPUPassConfig::addInstSelector() { + addPass(createAMDGPUPeepholeOpt(*TM)); + addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); + return false; +} + +bool AMDGPUPassConfig::addPreRegAlloc() { + const AMDGPUSubtarget &ST = TM->getSubtarget(); + + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + addPass(createSIAssignInterpRegsPass(*TM)); + } + addPass(createAMDGPUConvertToISAPass(*TM)); + return false; +} + +bool AMDGPUPassConfig::addPostRegAlloc() { + const AMDGPUSubtarget &ST = TM->getSubtarget(); + + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + addPass(createSIInsertWaits(*TM)); + } + return false; +} + +bool AMDGPUPassConfig::addPreSched2() { + + addPass(&IfConverterID); + return false; +} + +bool AMDGPUPassConfig::addPreEmitPass() { + const AMDGPUSubtarget &ST = TM->getSubtarget(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + addPass(createAMDGPUCFGPreparationPass(*TM)); + addPass(createAMDGPUCFGStructurizerPass(*TM)); + addPass(createR600ExpandSpecialInstrsPass(*TM)); + addPass(createR600LowerConstCopy(*TM)); + addPass(&FinalizeMachineBundlesID); + } else { + addPass(createSILowerLiteralConstantsPass(*TM)); + addPass(createSILowerControlFlowPass(*TM)); + } + + return false; +} + diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUTargetMachine.h llvm-r600/lib/Target/R600/AMDGPUTargetMachine.h --- llvm-3.2.src/lib/Target/R600/AMDGPUTargetMachine.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPUTargetMachine.h 2013-01-25 19:43:57.433383055 +0100 @@ -0,0 +1,70 @@ +//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The AMDGPU TargetMachine interface definition for hw codgen targets. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPU_TARGET_MACHINE_H +#define AMDGPU_TARGET_MACHINE_H + +#include "AMDGPUInstrInfo.h" +#include "AMDGPUSubtarget.h" +#include "AMDILFrameLowering.h" +#include "AMDILIntrinsicInfo.h" +#include "R600ISelLowering.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/DataLayout.h" + +namespace llvm { + +MCAsmInfo* createMCAsmInfo(const Target &T, StringRef TT); + +class AMDGPUTargetMachine : public LLVMTargetMachine { + + AMDGPUSubtarget Subtarget; + const DataLayout Layout; + AMDGPUFrameLowering FrameLowering; + AMDGPUIntrinsicInfo IntrinsicInfo; + const AMDGPUInstrInfo * InstrInfo; + AMDGPUTargetLowering * TLInfo; + const InstrItineraryData* InstrItins; + +public: + AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS, + StringRef CPU, + TargetOptions Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); + ~AMDGPUTargetMachine(); + virtual const AMDGPUFrameLowering* getFrameLowering() const { + return &FrameLowering; + } + virtual const AMDGPUIntrinsicInfo* getIntrinsicInfo() const { + return &IntrinsicInfo; + } + virtual const AMDGPUInstrInfo *getInstrInfo() const {return InstrInfo;} + virtual const AMDGPUSubtarget *getSubtargetImpl() const {return &Subtarget; } + virtual const AMDGPURegisterInfo *getRegisterInfo() const { + return &InstrInfo->getRegisterInfo(); + } + virtual AMDGPUTargetLowering * getTargetLowering() const { + return TLInfo; + } + virtual const InstrItineraryData* getInstrItineraryData() const { + return InstrItins; + } + virtual const DataLayout* getDataLayout() const { return &Layout; } + virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); +}; + +} // End namespace llvm + +#endif // AMDGPU_TARGET_MACHINE_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPU.td llvm-r600/lib/Target/R600/AMDGPU.td --- llvm-3.2.src/lib/Target/R600/AMDGPU.td 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDGPU.td 2013-01-25 19:43:57.423383055 +0100 @@ -0,0 +1,40 @@ +//===-- AMDIL.td - AMDIL Tablegen files --*- tablegen -*-------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// + +// Include AMDIL TD files +include "AMDILBase.td" + + +def AMDGPUInstrInfo : InstrInfo { + let guessInstructionProperties = 1; +} + +//===----------------------------------------------------------------------===// +// Declare the target which we are implementing +//===----------------------------------------------------------------------===// +def AMDGPUAsmWriter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + int Variant = 0; + bit isMCAsmWriter = 1; +} + +def AMDGPU : Target { + // Pull in Instruction Info: + let InstructionSet = AMDGPUInstrInfo; + let AssemblyWriters = [AMDGPUAsmWriter]; +} + +// Include AMDGPU TD files +include "R600Schedule.td" +include "SISchedule.td" +include "Processors.td" +include "AMDGPUInstrInfo.td" +include "AMDGPUIntrinsics.td" +include "AMDGPURegisterInfo.td" +include "AMDGPUInstructions.td" diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDIL7XXDevice.cpp llvm-r600/lib/Target/R600/AMDIL7XXDevice.cpp --- llvm-3.2.src/lib/Target/R600/AMDIL7XXDevice.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDIL7XXDevice.cpp 2013-01-25 19:43:57.433383055 +0100 @@ -0,0 +1,115 @@ +//===-- AMDIL7XXDevice.cpp - Device Info for 7XX GPUs ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// \file +//==-----------------------------------------------------------------------===// +#include "AMDIL7XXDevice.h" +#include "AMDGPUSubtarget.h" +#include "AMDILDevice.h" + +using namespace llvm; + +AMDGPU7XXDevice::AMDGPU7XXDevice(AMDGPUSubtarget *ST) : AMDGPUDevice(ST) { + setCaps(); + std::string name = mSTM->getDeviceName(); + if (name == "rv710") { + DeviceFlag = OCL_DEVICE_RV710; + } else if (name == "rv730") { + DeviceFlag = OCL_DEVICE_RV730; + } else { + DeviceFlag = OCL_DEVICE_RV770; + } +} + +AMDGPU7XXDevice::~AMDGPU7XXDevice() { +} + +void AMDGPU7XXDevice::setCaps() { + mSWBits.set(AMDGPUDeviceInfo::LocalMem); +} + +size_t AMDGPU7XXDevice::getMaxLDSSize() const { + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { + return MAX_LDS_SIZE_700; + } + return 0; +} + +size_t AMDGPU7XXDevice::getWavefrontSize() const { + return AMDGPUDevice::HalfWavefrontSize; +} + +uint32_t AMDGPU7XXDevice::getGeneration() const { + return AMDGPUDeviceInfo::HD4XXX; +} + +uint32_t AMDGPU7XXDevice::getResourceID(uint32_t DeviceID) const { + switch (DeviceID) { + default: + assert(0 && "ID type passed in is unknown!"); + break; + case GLOBAL_ID: + case CONSTANT_ID: + case RAW_UAV_ID: + case ARENA_UAV_ID: + break; + case LDS_ID: + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { + return DEFAULT_LDS_ID; + } + break; + case SCRATCH_ID: + if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) { + return DEFAULT_SCRATCH_ID; + } + break; + case GDS_ID: + assert(0 && "GDS UAV ID is not supported on this chip"); + if (usesHardware(AMDGPUDeviceInfo::RegionMem)) { + return DEFAULT_GDS_ID; + } + break; + }; + + return 0; +} + +uint32_t AMDGPU7XXDevice::getMaxNumUAVs() const { + return 1; +} + +AMDGPU770Device::AMDGPU770Device(AMDGPUSubtarget *ST): AMDGPU7XXDevice(ST) { + setCaps(); +} + +AMDGPU770Device::~AMDGPU770Device() { +} + +void AMDGPU770Device::setCaps() { + if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) { + mSWBits.set(AMDGPUDeviceInfo::FMA); + mHWBits.set(AMDGPUDeviceInfo::DoubleOps); + } + mSWBits.set(AMDGPUDeviceInfo::BarrierDetect); + mHWBits.reset(AMDGPUDeviceInfo::LongOps); + mSWBits.set(AMDGPUDeviceInfo::LongOps); + mSWBits.set(AMDGPUDeviceInfo::LocalMem); +} + +size_t AMDGPU770Device::getWavefrontSize() const { + return AMDGPUDevice::WavefrontSize; +} + +AMDGPU710Device::AMDGPU710Device(AMDGPUSubtarget *ST) : AMDGPU7XXDevice(ST) { +} + +AMDGPU710Device::~AMDGPU710Device() { +} + +size_t AMDGPU710Device::getWavefrontSize() const { + return AMDGPUDevice::QuarterWavefrontSize; +} diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDIL7XXDevice.h llvm-r600/lib/Target/R600/AMDIL7XXDevice.h --- llvm-3.2.src/lib/Target/R600/AMDIL7XXDevice.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDIL7XXDevice.h 2013-01-25 19:43:57.436716388 +0100 @@ -0,0 +1,72 @@ +//==-- AMDIL7XXDevice.h - Define 7XX Device Device for AMDIL ---*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +/// \file +/// \brief Interface for the subtarget data classes. +/// +/// This file will define the interface that each generation needs to +/// implement in order to correctly answer queries on the capabilities of the +/// specific hardware. +//===----------------------------------------------------------------------===// +#ifndef AMDIL7XXDEVICEIMPL_H +#define AMDIL7XXDEVICEIMPL_H +#include "AMDILDevice.h" + +namespace llvm { +class AMDGPUSubtarget; + +//===----------------------------------------------------------------------===// +// 7XX generation of devices and their respective sub classes +//===----------------------------------------------------------------------===// + +/// \brief The AMDGPU7XXDevice class represents the generic 7XX device. +/// +/// All 7XX devices are derived from this class. The AMDGPU7XX device will only +/// support the minimal features that are required to be considered OpenCL 1.0 +/// compliant and nothing more. +class AMDGPU7XXDevice : public AMDGPUDevice { +public: + AMDGPU7XXDevice(AMDGPUSubtarget *ST); + virtual ~AMDGPU7XXDevice(); + virtual size_t getMaxLDSSize() const; + virtual size_t getWavefrontSize() const; + virtual uint32_t getGeneration() const; + virtual uint32_t getResourceID(uint32_t DeviceID) const; + virtual uint32_t getMaxNumUAVs() const; + +protected: + virtual void setCaps(); +}; + +/// \brief The AMDGPU770Device class represents the RV770 chip and it's +/// derivative cards. +/// +/// The difference between this device and the base class is this device device +/// adds support for double precision and has a larger wavefront size. +class AMDGPU770Device : public AMDGPU7XXDevice { +public: + AMDGPU770Device(AMDGPUSubtarget *ST); + virtual ~AMDGPU770Device(); + virtual size_t getWavefrontSize() const; +private: + virtual void setCaps(); +}; + +/// \brief The AMDGPU710Device class derives from the 7XX base class. +/// +/// This class is a smaller derivative, so we need to overload some of the +/// functions in order to correctly specify this information. +class AMDGPU710Device : public AMDGPU7XXDevice { +public: + AMDGPU710Device(AMDGPUSubtarget *ST); + virtual ~AMDGPU710Device(); + virtual size_t getWavefrontSize() const; +}; + +} // namespace llvm +#endif // AMDILDEVICEIMPL_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILBase.td llvm-r600/lib/Target/R600/AMDILBase.td --- llvm-3.2.src/lib/Target/R600/AMDILBase.td 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILBase.td 2013-01-25 19:43:57.436716388 +0100 @@ -0,0 +1,85 @@ +//===- AMDIL.td - AMDIL Target Machine -------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Target-independent interfaces which we are implementing +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +// Dummy Instruction itineraries for pseudo instructions +def ALU_NULL : FuncUnit; +def NullALU : InstrItinClass; + +//===----------------------------------------------------------------------===// +// AMDIL Subtarget features. +//===----------------------------------------------------------------------===// +def FeatureFP64 : SubtargetFeature<"fp64", + "CapsOverride[AMDGPUDeviceInfo::DoubleOps]", + "true", + "Enable 64bit double precision operations">; +def FeatureByteAddress : SubtargetFeature<"byte_addressable_store", + "CapsOverride[AMDGPUDeviceInfo::ByteStores]", + "true", + "Enable byte addressable stores">; +def FeatureBarrierDetect : SubtargetFeature<"barrier_detect", + "CapsOverride[AMDGPUDeviceInfo::BarrierDetect]", + "true", + "Enable duplicate barrier detection(HD5XXX or later).">; +def FeatureImages : SubtargetFeature<"images", + "CapsOverride[AMDGPUDeviceInfo::Images]", + "true", + "Enable image functions">; +def FeatureMultiUAV : SubtargetFeature<"multi_uav", + "CapsOverride[AMDGPUDeviceInfo::MultiUAV]", + "true", + "Generate multiple UAV code(HD5XXX family or later)">; +def FeatureMacroDB : SubtargetFeature<"macrodb", + "CapsOverride[AMDGPUDeviceInfo::MacroDB]", + "true", + "Use internal macrodb, instead of macrodb in driver">; +def FeatureNoAlias : SubtargetFeature<"noalias", + "CapsOverride[AMDGPUDeviceInfo::NoAlias]", + "true", + "assert that all kernel argument pointers are not aliased">; +def FeatureNoInline : SubtargetFeature<"no-inline", + "CapsOverride[AMDGPUDeviceInfo::NoInline]", + "true", + "specify whether to not inline functions">; + +def Feature64BitPtr : SubtargetFeature<"64BitPtr", + "Is64bit", + "false", + "Specify if 64bit addressing should be used.">; + +def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr", + "Is32on64bit", + "false", + "Specify if 64bit sized pointers with 32bit addressing should be used.">; +def FeatureDebug : SubtargetFeature<"debug", + "CapsOverride[AMDGPUDeviceInfo::Debug]", + "true", + "Debug mode is enabled, so disable hardware accelerated address spaces.">; +def FeatureDumpCode : SubtargetFeature <"DumpCode", + "DumpCode", + "true", + "Dump MachineInstrs in the CodeEmitter">; + +def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", + "R600ALUInst", + "false", + "Older version of ALU instructions encoding.">; + + +//===----------------------------------------------------------------------===// +// Register File, Calling Conv, Instruction Descriptions +//===----------------------------------------------------------------------===// + + +include "AMDILRegisterInfo.td" +include "AMDILInstrInfo.td" + diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILCFGStructurizer.cpp llvm-r600/lib/Target/R600/AMDILCFGStructurizer.cpp --- llvm-3.2.src/lib/Target/R600/AMDILCFGStructurizer.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILCFGStructurizer.cpp 2013-01-25 19:43:57.436716388 +0100 @@ -0,0 +1,3045 @@ +//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//==-----------------------------------------------------------------------===// + +#define DEBUGME 0 +#define DEBUG_TYPE "structcfg" + +#include "AMDGPUInstrInfo.h" +#include "AMDIL.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/DominatorInternals.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +// TODO: move-begin. + +//===----------------------------------------------------------------------===// +// +// Statistics for CFGStructurizer. +// +//===----------------------------------------------------------------------===// + +STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern " + "matched"); +STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " + "matched"); +STATISTIC(numLoopbreakPatternMatch, "CFGStructurizer number of loop-break " + "pattern matched"); +STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue " + "pattern matched"); +STATISTIC(numLoopPatternMatch, "CFGStructurizer number of loop pattern " + "matched"); +STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); +STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); + +//===----------------------------------------------------------------------===// +// +// Miscellaneous utility for CFGStructurizer. +// +//===----------------------------------------------------------------------===// +namespace llvmCFGStruct { +#define SHOWNEWINSTR(i) \ + if (DEBUGME) errs() << "New instr: " << *i << "\n" + +#define SHOWNEWBLK(b, msg) \ +if (DEBUGME) { \ + errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + errs() << "\n"; \ +} + +#define SHOWBLK_DETAIL(b, msg) \ +if (DEBUGME) { \ + if (b) { \ + errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + b->print(errs()); \ + errs() << "\n"; \ + } \ +} + +#define INVALIDSCCNUM -1 +#define INVALIDREGNUM 0 + +template +void PrintLoopinfo(const LoopinfoT &LoopInfo, llvm::raw_ostream &OS) { + for (typename LoopinfoT::iterator iter = LoopInfo.begin(), + iterEnd = LoopInfo.end(); + iter != iterEnd; ++iter) { + (*iter)->print(OS, 0); + } +} + +template +void ReverseVector(SmallVector &Src) { + size_t sz = Src.size(); + for (size_t i = 0; i < sz/2; ++i) { + NodeT *t = Src[i]; + Src[i] = Src[sz - i - 1]; + Src[sz - i - 1] = t; + } +} + +} //end namespace llvmCFGStruct + +//===----------------------------------------------------------------------===// +// +// supporting data structure for CFGStructurizer +// +//===----------------------------------------------------------------------===// + +namespace llvmCFGStruct { +template +struct CFGStructTraits { +}; + +template +class BlockInformation { +public: + bool isRetired; + int sccNum; + //SmallVector succInstr; + //Instructions defining the corresponding successor. + BlockInformation() : isRetired(false), sccNum(INVALIDSCCNUM) {} +}; + +template +class LandInformation { +public: + BlockT *landBlk; + std::set breakInitRegs; //Registers that need to "reg = 0", before + //WHILELOOP(thisloop) init before entering + //thisloop. + std::set contInitRegs; //Registers that need to "reg = 0", after + //WHILELOOP(thisloop) init after entering + //thisloop. + std::set endbranchInitRegs; //Init before entering this loop, at loop + //land block, branch cond on this reg. + std::set breakOnRegs; //registers that need to "if (reg) break + //endif" after ENDLOOP(thisloop) break + //outerLoopOf(thisLoop). + std::set contOnRegs; //registers that need to "if (reg) continue + //endif" after ENDLOOP(thisloop) continue on + //outerLoopOf(thisLoop). + LandInformation() : landBlk(NULL) {} +}; + +} //end of namespace llvmCFGStruct + +//===----------------------------------------------------------------------===// +// +// CFGStructurizer +// +//===----------------------------------------------------------------------===// + +namespace llvmCFGStruct { +// bixia TODO: port it to BasicBlock, not just MachineBasicBlock. +template +class CFGStructurizer { +public: + typedef enum { + Not_SinglePath = 0, + SinglePath_InPath = 1, + SinglePath_NotInPath = 2 + } PathToKind; + +public: + typedef typename PassT::InstructionType InstrT; + typedef typename PassT::FunctionType FuncT; + typedef typename PassT::DominatortreeType DomTreeT; + typedef typename PassT::PostDominatortreeType PostDomTreeT; + typedef typename PassT::DomTreeNodeType DomTreeNodeT; + typedef typename PassT::LoopinfoType LoopInfoT; + + typedef GraphTraits FuncGTraits; + //typedef FuncGTraits::nodes_iterator BlockIterator; + typedef typename FuncT::iterator BlockIterator; + + typedef typename FuncGTraits::NodeType BlockT; + typedef GraphTraits BlockGTraits; + typedef GraphTraits > InvBlockGTraits; + //typedef BlockGTraits::succ_iterator InstructionIterator; + typedef typename BlockT::iterator InstrIterator; + + typedef CFGStructTraits CFGTraits; + typedef BlockInformation BlockInfo; + typedef std::map BlockInfoMap; + + typedef int RegiT; + typedef typename PassT::LoopType LoopT; + typedef LandInformation LoopLandInfo; + typedef std::map LoopLandInfoMap; + //landing info for loop break + typedef SmallVector BlockTSmallerVector; + +public: + CFGStructurizer(); + ~CFGStructurizer(); + + /// Perform the CFG structurization + bool run(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri); + + /// Perform the CFG preparation + bool prepare(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri); + +private: + void reversePredicateSetter(typename BlockT::iterator); + void orderBlocks(); + void printOrderedBlocks(llvm::raw_ostream &OS); + int patternMatch(BlockT *CurBlock); + int patternMatchGroup(BlockT *CurBlock); + + int serialPatternMatch(BlockT *CurBlock); + int ifPatternMatch(BlockT *CurBlock); + int switchPatternMatch(BlockT *CurBlock); + int loopendPatternMatch(BlockT *CurBlock); + int loopPatternMatch(BlockT *CurBlock); + + int loopbreakPatternMatch(LoopT *LoopRep, BlockT *LoopHeader); + int loopcontPatternMatch(LoopT *LoopRep, BlockT *LoopHeader); + //int loopWithoutBreak(BlockT *); + + void handleLoopbreak (BlockT *ExitingBlock, LoopT *ExitingLoop, + BlockT *ExitBlock, LoopT *exitLoop, BlockT *landBlock); + void handleLoopcontBlock(BlockT *ContingBlock, LoopT *contingLoop, + BlockT *ContBlock, LoopT *contLoop); + bool isSameloopDetachedContbreak(BlockT *Src1Block, BlockT *Src2Block); + int handleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock, + BlockT *FalseBlock); + int handleJumpintoIfImp(BlockT *HeadBlock, BlockT *TrueBlock, + BlockT *FalseBlock); + int improveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock, + BlockT *FalseBlock, BlockT **LandBlockPtr); + void showImproveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock, + BlockT *FalseBlock, BlockT *LandBlock, + bool Detail = false); + PathToKind singlePathTo(BlockT *SrcBlock, BlockT *DstBlock, + bool AllowSideEntry = true); + BlockT *singlePathEnd(BlockT *srcBlock, BlockT *DstBlock, + bool AllowSideEntry = true); + int cloneOnSideEntryTo(BlockT *PreBlock, BlockT *SrcBlock, BlockT *DstBlock); + void mergeSerialBlock(BlockT *DstBlock, BlockT *srcBlock); + + void mergeIfthenelseBlock(InstrT *BranchInstr, BlockT *CurBlock, + BlockT *TrueBlock, BlockT *FalseBlock, + BlockT *LandBlock); + void mergeLooplandBlock(BlockT *DstBlock, LoopLandInfo *LoopLand); + void mergeLoopbreakBlock(BlockT *ExitingBlock, BlockT *ExitBlock, + BlockT *ExitLandBlock, RegiT SetReg); + void settleLoopcontBlock(BlockT *ContingBlock, BlockT *ContBlock, + RegiT SetReg); + BlockT *relocateLoopcontBlock(LoopT *ParentLoopRep, LoopT *LoopRep, + std::set &ExitBlockSet, + BlockT *ExitLandBlk); + BlockT *addLoopEndbranchBlock(LoopT *LoopRep, + BlockTSmallerVector &ExitingBlocks, + BlockTSmallerVector &ExitBlocks); + BlockT *normalizeInfiniteLoopExit(LoopT *LoopRep); + void removeUnconditionalBranch(BlockT *SrcBlock); + void removeRedundantConditionalBranch(BlockT *SrcBlock); + void addDummyExitBlock(SmallVector &RetBlocks); + + void removeSuccessor(BlockT *SrcBlock); + BlockT *cloneBlockForPredecessor(BlockT *CurBlock, BlockT *PredBlock); + BlockT *exitingBlock2ExitBlock (LoopT *LoopRep, BlockT *exitingBlock); + + void migrateInstruction(BlockT *SrcBlock, BlockT *DstBlock, + InstrIterator InsertPos); + + void recordSccnum(BlockT *SrcBlock, int SCCNum); + int getSCCNum(BlockT *srcBlk); + + void retireBlock(BlockT *DstBlock, BlockT *SrcBlock); + bool isRetiredBlock(BlockT *SrcBlock); + bool isActiveLoophead(BlockT *CurBlock); + bool needMigrateBlock(BlockT *Block); + + BlockT *recordLoopLandBlock(LoopT *LoopRep, BlockT *LandBlock, + BlockTSmallerVector &exitBlocks, + std::set &ExitBlockSet); + void setLoopLandBlock(LoopT *LoopRep, BlockT *Block = NULL); + BlockT *getLoopLandBlock(LoopT *LoopRep); + LoopLandInfo *getLoopLandInfo(LoopT *LoopRep); + + void addLoopBreakOnReg(LoopT *LoopRep, RegiT RegNum); + void addLoopContOnReg(LoopT *LoopRep, RegiT RegNum); + void addLoopBreakInitReg(LoopT *LoopRep, RegiT RegNum); + void addLoopContInitReg(LoopT *LoopRep, RegiT RegNum); + void addLoopEndbranchInitReg(LoopT *LoopRep, RegiT RegNum); + + bool hasBackEdge(BlockT *curBlock); + unsigned getLoopDepth (LoopT *LoopRep); + int countActiveBlock( + typename SmallVector::const_iterator IterStart, + typename SmallVector::const_iterator IterEnd); + BlockT *findNearestCommonPostDom(std::set&); + BlockT *findNearestCommonPostDom(BlockT *Block1, BlockT *Block2); + +private: + DomTreeT *domTree; + PostDomTreeT *postDomTree; + LoopInfoT *loopInfo; + PassT *passRep; + FuncT *funcRep; + + BlockInfoMap blockInfoMap; + LoopLandInfoMap loopLandInfoMap; + SmallVector orderedBlks; + const AMDGPURegisterInfo *TRI; + +}; //template class CFGStructurizer + +template CFGStructurizer::CFGStructurizer() + : domTree(NULL), postDomTree(NULL), loopInfo(NULL) { +} + +template CFGStructurizer::~CFGStructurizer() { + for (typename BlockInfoMap::iterator I = blockInfoMap.begin(), + E = blockInfoMap.end(); I != E; ++I) { + delete I->second; + } +} + +template +bool CFGStructurizer::prepare(FuncT &func, PassT &pass, + const AMDGPURegisterInfo * tri) { + passRep = &pass; + funcRep = &func; + TRI = tri; + + bool changed = false; + + //FIXME: if not reducible flow graph, make it so ??? + + if (DEBUGME) { + errs() << "AMDGPUCFGStructurizer::prepare\n"; + } + + loopInfo = CFGTraits::getLoopInfo(pass); + if (DEBUGME) { + errs() << "LoopInfo:\n"; + PrintLoopinfo(*loopInfo, errs()); + } + + orderBlocks(); + if (DEBUGME) { + errs() << "Ordered blocks:\n"; + printOrderedBlocks(errs()); + } + + SmallVector retBlks; + + for (typename LoopInfoT::iterator iter = loopInfo->begin(), + iterEnd = loopInfo->end(); + iter != iterEnd; ++iter) { + LoopT* loopRep = (*iter); + BlockTSmallerVector exitingBlks; + loopRep->getExitingBlocks(exitingBlks); + + if (exitingBlks.size() == 0) { + BlockT* dummyExitBlk = normalizeInfiniteLoopExit(loopRep); + if (dummyExitBlk != NULL) + retBlks.push_back(dummyExitBlk); + } + } + + // Remove unconditional branch instr. + // Add dummy exit block iff there are multiple returns. + + for (typename SmallVector::const_iterator + iterBlk = orderedBlks.begin(), iterEndBlk = orderedBlks.end(); + iterBlk != iterEndBlk; + ++iterBlk) { + BlockT *curBlk = *iterBlk; + removeUnconditionalBranch(curBlk); + removeRedundantConditionalBranch(curBlk); + if (CFGTraits::isReturnBlock(curBlk)) { + retBlks.push_back(curBlk); + } + assert(curBlk->succ_size() <= 2); + } //for + + if (retBlks.size() >= 2) { + addDummyExitBlock(retBlks); + changed = true; + } + + return changed; +} //CFGStructurizer::prepare + +template +bool CFGStructurizer::run(FuncT &func, PassT &pass, + const AMDGPURegisterInfo * tri) { + passRep = &pass; + funcRep = &func; + TRI = tri; + + //Assume reducible CFG... + if (DEBUGME) { + errs() << "AMDGPUCFGStructurizer::run\n"; + func.viewCFG(); + } + + domTree = CFGTraits::getDominatorTree(pass); + if (DEBUGME) { + domTree->print(errs(), (const llvm::Module*)0); + } + + postDomTree = CFGTraits::getPostDominatorTree(pass); + if (DEBUGME) { + postDomTree->print(errs()); + } + + loopInfo = CFGTraits::getLoopInfo(pass); + if (DEBUGME) { + errs() << "LoopInfo:\n"; + PrintLoopinfo(*loopInfo, errs()); + } + + orderBlocks(); +#ifdef STRESSTEST + //Use the worse block ordering to test the algorithm. + ReverseVector(orderedBlks); +#endif + + if (DEBUGME) { + errs() << "Ordered blocks:\n"; + printOrderedBlocks(errs()); + } + int numIter = 0; + bool finish = false; + BlockT *curBlk; + bool makeProgress = false; + int numRemainedBlk = countActiveBlock(orderedBlks.begin(), + orderedBlks.end()); + + do { + ++numIter; + if (DEBUGME) { + errs() << "numIter = " << numIter + << ", numRemaintedBlk = " << numRemainedBlk << "\n"; + } + + typename SmallVector::const_iterator + iterBlk = orderedBlks.begin(); + typename SmallVector::const_iterator + iterBlkEnd = orderedBlks.end(); + + typename SmallVector::const_iterator + sccBeginIter = iterBlk; + BlockT *sccBeginBlk = NULL; + int sccNumBlk = 0; // The number of active blocks, init to a + // maximum possible number. + int sccNumIter; // Number of iteration in this SCC. + + while (iterBlk != iterBlkEnd) { + curBlk = *iterBlk; + + if (sccBeginBlk == NULL) { + sccBeginIter = iterBlk; + sccBeginBlk = curBlk; + sccNumIter = 0; + sccNumBlk = numRemainedBlk; // Init to maximum possible number. + if (DEBUGME) { + errs() << "start processing SCC" << getSCCNum(sccBeginBlk); + errs() << "\n"; + } + } + + if (!isRetiredBlock(curBlk)) { + patternMatch(curBlk); + } + + ++iterBlk; + + bool contNextScc = true; + if (iterBlk == iterBlkEnd + || getSCCNum(sccBeginBlk) != getSCCNum(*iterBlk)) { + // Just finish one scc. + ++sccNumIter; + int sccRemainedNumBlk = countActiveBlock(sccBeginIter, iterBlk); + if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= sccNumBlk) { + if (DEBUGME) { + errs() << "Can't reduce SCC " << getSCCNum(curBlk) + << ", sccNumIter = " << sccNumIter; + errs() << "doesn't make any progress\n"; + } + contNextScc = true; + } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < sccNumBlk) { + sccNumBlk = sccRemainedNumBlk; + iterBlk = sccBeginIter; + contNextScc = false; + if (DEBUGME) { + errs() << "repeat processing SCC" << getSCCNum(curBlk) + << "sccNumIter = " << sccNumIter << "\n"; + func.viewCFG(); + } + } else { + // Finish the current scc. + contNextScc = true; + } + } else { + // Continue on next component in the current scc. + contNextScc = false; + } + + if (contNextScc) { + sccBeginBlk = NULL; + } + } //while, "one iteration" over the function. + + BlockT *entryBlk = FuncGTraits::nodes_begin(&func); + if (entryBlk->succ_size() == 0) { + finish = true; + if (DEBUGME) { + errs() << "Reduce to one block\n"; + } + } else { + int newnumRemainedBlk + = countActiveBlock(orderedBlks.begin(), orderedBlks.end()); + // consider cloned blocks ?? + if (newnumRemainedBlk == 1 || newnumRemainedBlk < numRemainedBlk) { + makeProgress = true; + numRemainedBlk = newnumRemainedBlk; + } else { + makeProgress = false; + if (DEBUGME) { + errs() << "No progress\n"; + } + } + } + } while (!finish && makeProgress); + + // Misc wrap up to maintain the consistency of the Function representation. + CFGTraits::wrapup(FuncGTraits::nodes_begin(&func)); + + // Detach retired Block, release memory. + for (typename BlockInfoMap::iterator iterMap = blockInfoMap.begin(), + iterEndMap = blockInfoMap.end(); iterMap != iterEndMap; ++iterMap) { + if ((*iterMap).second && (*iterMap).second->isRetired) { + assert(((*iterMap).first)->getNumber() != -1); + if (DEBUGME) { + errs() << "Erase BB" << ((*iterMap).first)->getNumber() << "\n"; + } + (*iterMap).first->eraseFromParent(); //Remove from the parent Function. + } + delete (*iterMap).second; + } + blockInfoMap.clear(); + + // clear loopLandInfoMap + for (typename LoopLandInfoMap::iterator iterMap = loopLandInfoMap.begin(), + iterEndMap = loopLandInfoMap.end(); iterMap != iterEndMap; ++iterMap) { + delete (*iterMap).second; + } + loopLandInfoMap.clear(); + + if (DEBUGME) { + func.viewCFG(); + } + + if (!finish) { + assert(!"IRREDUCIBL_CF"); + } + + return true; +} //CFGStructurizer::run + +/// Print the ordered Blocks. +/// +template +void CFGStructurizer::printOrderedBlocks(llvm::raw_ostream &os) { + size_t i = 0; + for (typename SmallVector::const_iterator + iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end(); + iterBlk != iterBlkEnd; + ++iterBlk, ++i) { + os << "BB" << (*iterBlk)->getNumber(); + os << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")"; + if (i != 0 && i % 10 == 0) { + os << "\n"; + } else { + os << " "; + } + } +} //printOrderedBlocks + +/// Compute the reversed DFS post order of Blocks +/// +template void CFGStructurizer::orderBlocks() { + int sccNum = 0; + BlockT *bb; + for (scc_iterator sccIter = scc_begin(funcRep), + sccEnd = scc_end(funcRep); sccIter != sccEnd; ++sccIter, ++sccNum) { + std::vector &sccNext = *sccIter; + for (typename std::vector::const_iterator + blockIter = sccNext.begin(), blockEnd = sccNext.end(); + blockIter != blockEnd; ++blockIter) { + bb = *blockIter; + orderedBlks.push_back(bb); + recordSccnum(bb, sccNum); + } + } + + //walk through all the block in func to check for unreachable + for (BlockIterator blockIter1 = FuncGTraits::nodes_begin(funcRep), + blockEnd1 = FuncGTraits::nodes_end(funcRep); + blockIter1 != blockEnd1; ++blockIter1) { + BlockT *bb = &(*blockIter1); + sccNum = getSCCNum(bb); + if (sccNum == INVALIDSCCNUM) { + errs() << "unreachable block BB" << bb->getNumber() << "\n"; + } + } +} //orderBlocks + +template int CFGStructurizer::patternMatch(BlockT *curBlk) { + int numMatch = 0; + int curMatch; + + if (DEBUGME) { + errs() << "Begin patternMatch BB" << curBlk->getNumber() << "\n"; + } + + while ((curMatch = patternMatchGroup(curBlk)) > 0) { + numMatch += curMatch; + } + + if (DEBUGME) { + errs() << "End patternMatch BB" << curBlk->getNumber() + << ", numMatch = " << numMatch << "\n"; + } + + return numMatch; +} //patternMatch + +template +int CFGStructurizer::patternMatchGroup(BlockT *curBlk) { + int numMatch = 0; + numMatch += serialPatternMatch(curBlk); + numMatch += ifPatternMatch(curBlk); + numMatch += loopendPatternMatch(curBlk); + numMatch += loopPatternMatch(curBlk); + return numMatch; +}//patternMatchGroup + +template +int CFGStructurizer::serialPatternMatch(BlockT *curBlk) { + if (curBlk->succ_size() != 1) { + return 0; + } + + BlockT *childBlk = *curBlk->succ_begin(); + if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) { + return 0; + } + + mergeSerialBlock(curBlk, childBlk); + ++numSerialPatternMatch; + return 1; +} //serialPatternMatch + +template +int CFGStructurizer::ifPatternMatch(BlockT *curBlk) { + //two edges + if (curBlk->succ_size() != 2) { + return 0; + } + + if (hasBackEdge(curBlk)) { + return 0; + } + + InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(curBlk); + if (branchInstr == NULL) { + return 0; + } + + assert(CFGTraits::isCondBranch(branchInstr)); + + BlockT *trueBlk = CFGTraits::getTrueBranch(branchInstr); + BlockT *falseBlk = CFGTraits::getFalseBranch(curBlk, branchInstr); + BlockT *landBlk; + int cloned = 0; + + // TODO: Simplify + if (trueBlk->succ_size() == 1 && falseBlk->succ_size() == 1 + && *trueBlk->succ_begin() == *falseBlk->succ_begin()) { + landBlk = *trueBlk->succ_begin(); + } else if (trueBlk->succ_size() == 0 && falseBlk->succ_size() == 0) { + landBlk = NULL; + } else if (trueBlk->succ_size() == 1 && *trueBlk->succ_begin() == falseBlk) { + landBlk = falseBlk; + falseBlk = NULL; + } else if (falseBlk->succ_size() == 1 + && *falseBlk->succ_begin() == trueBlk) { + landBlk = trueBlk; + trueBlk = NULL; + } else if (falseBlk->succ_size() == 1 + && isSameloopDetachedContbreak(trueBlk, falseBlk)) { + landBlk = *falseBlk->succ_begin(); + } else if (trueBlk->succ_size() == 1 + && isSameloopDetachedContbreak(falseBlk, trueBlk)) { + landBlk = *trueBlk->succ_begin(); + } else { + return handleJumpintoIf(curBlk, trueBlk, falseBlk); + } + + // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the + // new BB created for landBlk==NULL may introduce new challenge to the + // reduction process. + if (landBlk != NULL && + ((trueBlk && trueBlk->pred_size() > 1) + || (falseBlk && falseBlk->pred_size() > 1))) { + cloned += improveSimpleJumpintoIf(curBlk, trueBlk, falseBlk, &landBlk); + } + + if (trueBlk && trueBlk->pred_size() > 1) { + trueBlk = cloneBlockForPredecessor(trueBlk, curBlk); + ++cloned; + } + + if (falseBlk && falseBlk->pred_size() > 1) { + falseBlk = cloneBlockForPredecessor(falseBlk, curBlk); + ++cloned; + } + + mergeIfthenelseBlock(branchInstr, curBlk, trueBlk, falseBlk, landBlk); + + ++numIfPatternMatch; + + numClonedBlock += cloned; + + return 1 + cloned; +} //ifPatternMatch + +template +int CFGStructurizer::switchPatternMatch(BlockT *curBlk) { + return 0; +} //switchPatternMatch + +template +int CFGStructurizer::loopendPatternMatch(BlockT *curBlk) { + LoopT *loopRep = loopInfo->getLoopFor(curBlk); + typename std::vector nestedLoops; + while (loopRep) { + nestedLoops.push_back(loopRep); + loopRep = loopRep->getParentLoop(); + } + + if (nestedLoops.size() == 0) { + return 0; + } + + // Process nested loop outside->inside, so "continue" to a outside loop won't + // be mistaken as "break" of the current loop. + int num = 0; + for (typename std::vector::reverse_iterator + iter = nestedLoops.rbegin(), iterEnd = nestedLoops.rend(); + iter != iterEnd; ++iter) { + loopRep = *iter; + + if (getLoopLandBlock(loopRep) != NULL) { + continue; + } + + BlockT *loopHeader = loopRep->getHeader(); + + int numBreak = loopbreakPatternMatch(loopRep, loopHeader); + + if (numBreak == -1) { + break; + } + + int numCont = loopcontPatternMatch(loopRep, loopHeader); + num += numBreak + numCont; + } + + return num; +} //loopendPatternMatch + +template +int CFGStructurizer::loopPatternMatch(BlockT *curBlk) { + if (curBlk->succ_size() != 0) { + return 0; + } + + int numLoop = 0; + LoopT *loopRep = loopInfo->getLoopFor(curBlk); + while (loopRep && loopRep->getHeader() == curBlk) { + LoopLandInfo *loopLand = getLoopLandInfo(loopRep); + if (loopLand) { + BlockT *landBlk = loopLand->landBlk; + assert(landBlk); + if (!isRetiredBlock(landBlk)) { + mergeLooplandBlock(curBlk, loopLand); + ++numLoop; + } + } + loopRep = loopRep->getParentLoop(); + } + + numLoopPatternMatch += numLoop; + + return numLoop; +} //loopPatternMatch + +template +int CFGStructurizer::loopbreakPatternMatch(LoopT *loopRep, + BlockT *loopHeader) { + BlockTSmallerVector exitingBlks; + loopRep->getExitingBlocks(exitingBlks); + + if (DEBUGME) { + errs() << "Loop has " << exitingBlks.size() << " exiting blocks\n"; + } + + if (exitingBlks.size() == 0) { + setLoopLandBlock(loopRep); + return 0; + } + + // Compute the corresponding exitBlks and exit block set. + BlockTSmallerVector exitBlks; + std::set exitBlkSet; + for (typename BlockTSmallerVector::const_iterator iter = exitingBlks.begin(), + iterEnd = exitingBlks.end(); iter != iterEnd; ++iter) { + BlockT *exitingBlk = *iter; + BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk); + exitBlks.push_back(exitBlk); + exitBlkSet.insert(exitBlk); //non-duplicate insert + } + + assert(exitBlkSet.size() > 0); + assert(exitBlks.size() == exitingBlks.size()); + + if (DEBUGME) { + errs() << "Loop has " << exitBlkSet.size() << " exit blocks\n"; + } + + // Find exitLandBlk. + BlockT *exitLandBlk = NULL; + int numCloned = 0; + int numSerial = 0; + + if (exitBlkSet.size() == 1) { + exitLandBlk = *exitBlkSet.begin(); + } else { + exitLandBlk = findNearestCommonPostDom(exitBlkSet); + + if (exitLandBlk == NULL) { + return -1; + } + + bool allInPath = true; + bool allNotInPath = true; + for (typename std::set::const_iterator + iter = exitBlkSet.begin(), + iterEnd = exitBlkSet.end(); + iter != iterEnd; ++iter) { + BlockT *exitBlk = *iter; + + PathToKind pathKind = singlePathTo(exitBlk, exitLandBlk, true); + if (DEBUGME) { + errs() << "BB" << exitBlk->getNumber() + << " to BB" << exitLandBlk->getNumber() << " PathToKind=" + << pathKind << "\n"; + } + + allInPath = allInPath && (pathKind == SinglePath_InPath); + allNotInPath = allNotInPath && (pathKind == SinglePath_NotInPath); + + if (!allInPath && !allNotInPath) { + if (DEBUGME) { + errs() << "singlePath check fail\n"; + } + return -1; + } + } // check all exit blocks + + if (allNotInPath) { + + // TODO: Simplify, maybe separate function? + LoopT *parentLoopRep = loopRep->getParentLoop(); + BlockT *parentLoopHeader = NULL; + if (parentLoopRep) + parentLoopHeader = parentLoopRep->getHeader(); + + if (exitLandBlk == parentLoopHeader && + (exitLandBlk = relocateLoopcontBlock(parentLoopRep, + loopRep, + exitBlkSet, + exitLandBlk)) != NULL) { + if (DEBUGME) { + errs() << "relocateLoopcontBlock success\n"; + } + } else if ((exitLandBlk = addLoopEndbranchBlock(loopRep, + exitingBlks, + exitBlks)) != NULL) { + if (DEBUGME) { + errs() << "insertEndbranchBlock success\n"; + } + } else { + if (DEBUGME) { + errs() << "loop exit fail\n"; + } + return -1; + } + } + + // Handle side entry to exit path. + exitBlks.clear(); + exitBlkSet.clear(); + for (typename BlockTSmallerVector::iterator iterExiting = + exitingBlks.begin(), + iterExitingEnd = exitingBlks.end(); + iterExiting != iterExitingEnd; ++iterExiting) { + BlockT *exitingBlk = *iterExiting; + BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk); + BlockT *newExitBlk = exitBlk; + + if (exitBlk != exitLandBlk && exitBlk->pred_size() > 1) { + newExitBlk = cloneBlockForPredecessor(exitBlk, exitingBlk); + ++numCloned; + } + + numCloned += cloneOnSideEntryTo(exitingBlk, newExitBlk, exitLandBlk); + + exitBlks.push_back(newExitBlk); + exitBlkSet.insert(newExitBlk); + } + + for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(), + iterExitEnd = exitBlks.end(); + iterExit != iterExitEnd; ++iterExit) { + BlockT *exitBlk = *iterExit; + numSerial += serialPatternMatch(exitBlk); + } + + for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(), + iterExitEnd = exitBlks.end(); + iterExit != iterExitEnd; ++iterExit) { + BlockT *exitBlk = *iterExit; + if (exitBlk->pred_size() > 1) { + if (exitBlk != exitLandBlk) { + return -1; + } + } else { + if (exitBlk != exitLandBlk && + (exitBlk->succ_size() != 1 || + *exitBlk->succ_begin() != exitLandBlk)) { + return -1; + } + } + } + } // else + + exitLandBlk = recordLoopLandBlock(loopRep, exitLandBlk, exitBlks, exitBlkSet); + + // Fold break into the breaking block. Leverage across level breaks. + assert(exitingBlks.size() == exitBlks.size()); + for (typename BlockTSmallerVector::const_iterator iterExit = exitBlks.begin(), + iterExiting = exitingBlks.begin(), iterExitEnd = exitBlks.end(); + iterExit != iterExitEnd; ++iterExit, ++iterExiting) { + BlockT *exitBlk = *iterExit; + BlockT *exitingBlk = *iterExiting; + assert(exitBlk->pred_size() == 1 || exitBlk == exitLandBlk); + LoopT *exitingLoop = loopInfo->getLoopFor(exitingBlk); + handleLoopbreak(exitingBlk, exitingLoop, exitBlk, loopRep, exitLandBlk); + } + + int numBreak = static_cast(exitingBlks.size()); + numLoopbreakPatternMatch += numBreak; + numClonedBlock += numCloned; + return numBreak + numSerial + numCloned; +} //loopbreakPatternMatch + +template +int CFGStructurizer::loopcontPatternMatch(LoopT *loopRep, + BlockT *loopHeader) { + int numCont = 0; + SmallVector contBlk; + for (typename InvBlockGTraits::ChildIteratorType iter = + InvBlockGTraits::child_begin(loopHeader), + iterEnd = InvBlockGTraits::child_end(loopHeader); + iter != iterEnd; ++iter) { + BlockT *curBlk = *iter; + if (loopRep->contains(curBlk)) { + handleLoopcontBlock(curBlk, loopInfo->getLoopFor(curBlk), + loopHeader, loopRep); + contBlk.push_back(curBlk); + ++numCont; + } + } + + for (typename SmallVector::iterator + iter = contBlk.begin(), iterEnd = contBlk.end(); + iter != iterEnd; ++iter) { + (*iter)->removeSuccessor(loopHeader); + } + + numLoopcontPatternMatch += numCont; + + return numCont; +} //loopcontPatternMatch + + +template +bool CFGStructurizer::isSameloopDetachedContbreak(BlockT *src1Blk, + BlockT *src2Blk) { + // return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in the + // same loop with LoopLandInfo without explicitly keeping track of + // loopContBlks and loopBreakBlks, this is a method to get the information. + // + if (src1Blk->succ_size() == 0) { + LoopT *loopRep = loopInfo->getLoopFor(src1Blk); + if (loopRep != NULL && loopRep == loopInfo->getLoopFor(src2Blk)) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + if (theEntry != NULL) { + if (DEBUGME) { + errs() << "isLoopContBreakBlock yes src1 = BB" + << src1Blk->getNumber() + << " src2 = BB" << src2Blk->getNumber() << "\n"; + } + return true; + } + } + } + return false; +} //isSameloopDetachedContbreak + +template +int CFGStructurizer::handleJumpintoIf(BlockT *headBlk, + BlockT *trueBlk, + BlockT *falseBlk) { + int num = handleJumpintoIfImp(headBlk, trueBlk, falseBlk); + if (num == 0) { + if (DEBUGME) { + errs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n"; + } + num = handleJumpintoIfImp(headBlk, falseBlk, trueBlk); + } + return num; +} + +template +int CFGStructurizer::handleJumpintoIfImp(BlockT *headBlk, + BlockT *trueBlk, + BlockT *falseBlk) { + int num = 0; + BlockT *downBlk; + + //trueBlk could be the common post dominator + downBlk = trueBlk; + + if (DEBUGME) { + errs() << "handleJumpintoIfImp head = BB" << headBlk->getNumber() + << " true = BB" << trueBlk->getNumber() + << ", numSucc=" << trueBlk->succ_size() + << " false = BB" << falseBlk->getNumber() << "\n"; + } + + while (downBlk) { + if (DEBUGME) { + errs() << "check down = BB" << downBlk->getNumber(); + } + + if (singlePathTo(falseBlk, downBlk) == SinglePath_InPath) { + if (DEBUGME) { + errs() << " working\n"; + } + + num += cloneOnSideEntryTo(headBlk, trueBlk, downBlk); + num += cloneOnSideEntryTo(headBlk, falseBlk, downBlk); + + numClonedBlock += num; + num += serialPatternMatch(*headBlk->succ_begin()); + num += serialPatternMatch(*(++headBlk->succ_begin())); + num += ifPatternMatch(headBlk); + assert(num > 0); + + break; + } + if (DEBUGME) { + errs() << " not working\n"; + } + downBlk = (downBlk->succ_size() == 1) ? (*downBlk->succ_begin()) : NULL; + } // walk down the postDomTree + + return num; +} //handleJumpintoIf + +template +void CFGStructurizer::showImproveSimpleJumpintoIf(BlockT *headBlk, + BlockT *trueBlk, + BlockT *falseBlk, + BlockT *landBlk, + bool detail) { + errs() << "head = BB" << headBlk->getNumber() + << " size = " << headBlk->size(); + if (detail) { + errs() << "\n"; + headBlk->print(errs()); + errs() << "\n"; + } + + if (trueBlk) { + errs() << ", true = BB" << trueBlk->getNumber() << " size = " + << trueBlk->size() << " numPred = " << trueBlk->pred_size(); + if (detail) { + errs() << "\n"; + trueBlk->print(errs()); + errs() << "\n"; + } + } + if (falseBlk) { + errs() << ", false = BB" << falseBlk->getNumber() << " size = " + << falseBlk->size() << " numPred = " << falseBlk->pred_size(); + if (detail) { + errs() << "\n"; + falseBlk->print(errs()); + errs() << "\n"; + } + } + if (landBlk) { + errs() << ", land = BB" << landBlk->getNumber() << " size = " + << landBlk->size() << " numPred = " << landBlk->pred_size(); + if (detail) { + errs() << "\n"; + landBlk->print(errs()); + errs() << "\n"; + } + } + + errs() << "\n"; +} //showImproveSimpleJumpintoIf + +template +int CFGStructurizer::improveSimpleJumpintoIf(BlockT *headBlk, + BlockT *trueBlk, + BlockT *falseBlk, + BlockT **plandBlk) { + bool migrateTrue = false; + bool migrateFalse = false; + + BlockT *landBlk = *plandBlk; + + assert((trueBlk == NULL || trueBlk->succ_size() <= 1) + && (falseBlk == NULL || falseBlk->succ_size() <= 1)); + + if (trueBlk == falseBlk) { + return 0; + } + + migrateTrue = needMigrateBlock(trueBlk); + migrateFalse = needMigrateBlock(falseBlk); + + if (!migrateTrue && !migrateFalse) { + return 0; + } + + // If we need to migrate either trueBlk and falseBlk, migrate the rest that + // have more than one predecessors. without doing this, its predecessor + // rather than headBlk will have undefined value in initReg. + if (!migrateTrue && trueBlk && trueBlk->pred_size() > 1) { + migrateTrue = true; + } + if (!migrateFalse && falseBlk && falseBlk->pred_size() > 1) { + migrateFalse = true; + } + + if (DEBUGME) { + errs() << "before improveSimpleJumpintoIf: "; + showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0); + } + + // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk + // + // new: headBlk => if () {initReg = 1; org trueBlk branch} else + // {initReg = 0; org falseBlk branch } + // => landBlk => if (initReg) {org trueBlk} else {org falseBlk} + // => org landBlk + // if landBlk->pred_size() > 2, put the about if-else inside + // if (initReg !=2) {...} + // + // add initReg = initVal to headBlk + + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + unsigned initReg = + funcRep->getRegInfo().createVirtualRegister(I32RC); + if (!migrateTrue || !migrateFalse) { + int initVal = migrateTrue ? 0 : 1; + CFGTraits::insertAssignInstrBefore(headBlk, passRep, initReg, initVal); + } + + int numNewBlk = 0; + + if (landBlk == NULL) { + landBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(landBlk); //insert to function + + if (trueBlk) { + trueBlk->addSuccessor(landBlk); + } else { + headBlk->addSuccessor(landBlk); + } + + if (falseBlk) { + falseBlk->addSuccessor(landBlk); + } else { + headBlk->addSuccessor(landBlk); + } + + numNewBlk ++; + } + + bool landBlkHasOtherPred = (landBlk->pred_size() > 2); + + //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL" + typename BlockT::iterator insertPos = + CFGTraits::getInstrPos + (landBlk, CFGTraits::insertInstrBefore(landBlk, AMDGPU::ENDIF, passRep)); + + if (landBlkHasOtherPred) { + unsigned immReg = + funcRep->getRegInfo().createVirtualRegister(I32RC); + CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 2); + unsigned cmpResReg = + funcRep->getRegInfo().createVirtualRegister(I32RC); + + CFGTraits::insertCompareInstrBefore(landBlk, insertPos, passRep, cmpResReg, + initReg, immReg); + CFGTraits::insertCondBranchBefore(landBlk, insertPos, + AMDGPU::IF_PREDICATE_SET, passRep, + cmpResReg, DebugLoc()); + } + + CFGTraits::insertCondBranchBefore(landBlk, insertPos, AMDGPU::IF_PREDICATE_SET, + passRep, initReg, DebugLoc()); + + if (migrateTrue) { + migrateInstruction(trueBlk, landBlk, insertPos); + // need to uncondionally insert the assignment to ensure a path from its + // predecessor rather than headBlk has valid value in initReg if + // (initVal != 1). + CFGTraits::insertAssignInstrBefore(trueBlk, passRep, initReg, 1); + } + CFGTraits::insertInstrBefore(insertPos, AMDGPU::ELSE, passRep); + + if (migrateFalse) { + migrateInstruction(falseBlk, landBlk, insertPos); + // need to uncondionally insert the assignment to ensure a path from its + // predecessor rather than headBlk has valid value in initReg if + // (initVal != 0) + CFGTraits::insertAssignInstrBefore(falseBlk, passRep, initReg, 0); + } + + if (landBlkHasOtherPred) { + // add endif + CFGTraits::insertInstrBefore(insertPos, AMDGPU::ENDIF, passRep); + + // put initReg = 2 to other predecessors of landBlk + for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(), + predIterEnd = landBlk->pred_end(); predIter != predIterEnd; + ++predIter) { + BlockT *curBlk = *predIter; + if (curBlk != trueBlk && curBlk != falseBlk) { + CFGTraits::insertAssignInstrBefore(curBlk, passRep, initReg, 2); + } + } //for + } + if (DEBUGME) { + errs() << "result from improveSimpleJumpintoIf: "; + showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0); + } + + // update landBlk + *plandBlk = landBlk; + + return numNewBlk; +} //improveSimpleJumpintoIf + +template +void CFGStructurizer::handleLoopbreak(BlockT *exitingBlk, + LoopT *exitingLoop, + BlockT *exitBlk, + LoopT *exitLoop, + BlockT *landBlk) { + if (DEBUGME) { + errs() << "Trying to break loop-depth = " << getLoopDepth(exitLoop) + << " from loop-depth = " << getLoopDepth(exitingLoop) << "\n"; + } + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + + RegiT initReg = INVALIDREGNUM; + if (exitingLoop != exitLoop) { + initReg = static_cast + (funcRep->getRegInfo().createVirtualRegister(I32RC)); + assert(initReg != INVALIDREGNUM); + addLoopBreakInitReg(exitLoop, initReg); + while (exitingLoop != exitLoop && exitingLoop) { + addLoopBreakOnReg(exitingLoop, initReg); + exitingLoop = exitingLoop->getParentLoop(); + } + assert(exitingLoop == exitLoop); + } + + mergeLoopbreakBlock(exitingBlk, exitBlk, landBlk, initReg); + +} //handleLoopbreak + +template +void CFGStructurizer::handleLoopcontBlock(BlockT *contingBlk, + LoopT *contingLoop, + BlockT *contBlk, + LoopT *contLoop) { + if (DEBUGME) { + errs() << "loopcontPattern cont = BB" << contingBlk->getNumber() + << " header = BB" << contBlk->getNumber() << "\n"; + + errs() << "Trying to continue loop-depth = " + << getLoopDepth(contLoop) + << " from loop-depth = " << getLoopDepth(contingLoop) << "\n"; + } + + RegiT initReg = INVALIDREGNUM; + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + if (contingLoop != contLoop) { + initReg = static_cast + (funcRep->getRegInfo().createVirtualRegister(I32RC)); + assert(initReg != INVALIDREGNUM); + addLoopContInitReg(contLoop, initReg); + while (contingLoop && contingLoop->getParentLoop() != contLoop) { + addLoopBreakOnReg(contingLoop, initReg); //not addLoopContOnReg + contingLoop = contingLoop->getParentLoop(); + } + assert(contingLoop && contingLoop->getParentLoop() == contLoop); + addLoopContOnReg(contingLoop, initReg); + } + + settleLoopcontBlock(contingBlk, contBlk, initReg); +} //handleLoopcontBlock + +template +void CFGStructurizer::mergeSerialBlock(BlockT *dstBlk, BlockT *srcBlk) { + if (DEBUGME) { + errs() << "serialPattern BB" << dstBlk->getNumber() + << " <= BB" << srcBlk->getNumber() << "\n"; + } + dstBlk->splice(dstBlk->end(), srcBlk, srcBlk->begin(), srcBlk->end()); + + dstBlk->removeSuccessor(srcBlk); + CFGTraits::cloneSuccessorList(dstBlk, srcBlk); + + removeSuccessor(srcBlk); + retireBlock(dstBlk, srcBlk); +} //mergeSerialBlock + +template +void CFGStructurizer::mergeIfthenelseBlock(InstrT *branchInstr, + BlockT *curBlk, + BlockT *trueBlk, + BlockT *falseBlk, + BlockT *landBlk) { + if (DEBUGME) { + errs() << "ifPattern BB" << curBlk->getNumber(); + errs() << "{ "; + if (trueBlk) { + errs() << "BB" << trueBlk->getNumber(); + } + errs() << " } else "; + errs() << "{ "; + if (falseBlk) { + errs() << "BB" << falseBlk->getNumber(); + } + errs() << " }\n "; + errs() << "landBlock: "; + if (landBlk == NULL) { + errs() << "NULL"; + } else { + errs() << "BB" << landBlk->getNumber(); + } + errs() << "\n"; + } + + int oldOpcode = branchInstr->getOpcode(); + DebugLoc branchDL = branchInstr->getDebugLoc(); + +// transform to +// if cond +// trueBlk +// else +// falseBlk +// endif +// landBlk + + typename BlockT::iterator branchInstrPos = + CFGTraits::getInstrPos(curBlk, branchInstr); + CFGTraits::insertCondBranchBefore(branchInstrPos, + CFGTraits::getBranchNzeroOpcode(oldOpcode), + passRep, + branchDL); + + if (trueBlk) { + curBlk->splice(branchInstrPos, trueBlk, trueBlk->begin(), trueBlk->end()); + curBlk->removeSuccessor(trueBlk); + if (landBlk && trueBlk->succ_size()!=0) { + trueBlk->removeSuccessor(landBlk); + } + retireBlock(curBlk, trueBlk); + } + CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ELSE, passRep); + + if (falseBlk) { + curBlk->splice(branchInstrPos, falseBlk, falseBlk->begin(), + falseBlk->end()); + curBlk->removeSuccessor(falseBlk); + if (landBlk && falseBlk->succ_size() != 0) { + falseBlk->removeSuccessor(landBlk); + } + retireBlock(curBlk, falseBlk); + } + CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ENDIF, passRep); + + branchInstr->eraseFromParent(); + + if (landBlk && trueBlk && falseBlk) { + curBlk->addSuccessor(landBlk); + } + +} //mergeIfthenelseBlock + +template +void CFGStructurizer::mergeLooplandBlock(BlockT *dstBlk, + LoopLandInfo *loopLand) { + BlockT *landBlk = loopLand->landBlk; + + if (DEBUGME) { + errs() << "loopPattern header = BB" << dstBlk->getNumber() + << " land = BB" << landBlk->getNumber() << "\n"; + } + + // Loop contInitRegs are init at the beginning of the loop. + for (typename std::set::const_iterator iter = + loopLand->contInitRegs.begin(), + iterEnd = loopLand->contInitRegs.end(); iter != iterEnd; ++iter) { + CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0); + } + + /* we last inserterd the DebugLoc in the + * BREAK_LOGICALZ_i32 or AMDGPU::BREAK_LOGICALNZ statement in the current dstBlk. + * search for the DebugLoc in the that statement. + * if not found, we have to insert the empty/default DebugLoc */ + InstrT *loopBreakInstr = CFGTraits::getLoopBreakInstr(dstBlk); + DebugLoc DLBreak = (loopBreakInstr) ? loopBreakInstr->getDebugLoc() : DebugLoc(); + + CFGTraits::insertInstrBefore(dstBlk, AMDGPU::WHILELOOP, passRep, DLBreak); + // Loop breakInitRegs are init before entering the loop. + for (typename std::set::const_iterator iter = + loopLand->breakInitRegs.begin(), + iterEnd = loopLand->breakInitRegs.end(); iter != iterEnd; ++iter) { + CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0); + } + // Loop endbranchInitRegs are init before entering the loop. + for (typename std::set::const_iterator iter = + loopLand->endbranchInitRegs.begin(), + iterEnd = loopLand->endbranchInitRegs.end(); iter != iterEnd; ++iter) { + CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0); + } + + /* we last inserterd the DebugLoc in the continue statement in the current dstBlk + * search for the DebugLoc in the continue statement. + * if not found, we have to insert the empty/default DebugLoc */ + InstrT *continueInstr = CFGTraits::getContinueInstr(dstBlk); + DebugLoc DLContinue = (continueInstr) ? continueInstr->getDebugLoc() : DebugLoc(); + + CFGTraits::insertInstrEnd(dstBlk, AMDGPU::ENDLOOP, passRep, DLContinue); + // Loop breakOnRegs are check after the ENDLOOP: break the loop outside this + // loop. + for (typename std::set::const_iterator iter = + loopLand->breakOnRegs.begin(), + iterEnd = loopLand->breakOnRegs.end(); iter != iterEnd; ++iter) { + CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::PREDICATED_BREAK, passRep, + *iter); + } + + // Loop contOnRegs are check after the ENDLOOP: cont the loop outside this + // loop. + for (std::set::const_iterator iter = loopLand->contOnRegs.begin(), + iterEnd = loopLand->contOnRegs.end(); iter != iterEnd; ++iter) { + CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::CONTINUE_LOGICALNZ_i32, + passRep, *iter); + } + + dstBlk->splice(dstBlk->end(), landBlk, landBlk->begin(), landBlk->end()); + + for (typename BlockT::succ_iterator iter = landBlk->succ_begin(), + iterEnd = landBlk->succ_end(); iter != iterEnd; ++iter) { + dstBlk->addSuccessor(*iter); // *iter's predecessor is also taken care of. + } + + removeSuccessor(landBlk); + retireBlock(dstBlk, landBlk); +} //mergeLooplandBlock + +template +void CFGStructurizer::reversePredicateSetter(typename BlockT::iterator I) { + while (I--) { + if (I->getOpcode() == AMDGPU::PRED_X) { + switch (static_cast(I)->getOperand(2).getImm()) { + case OPCODE_IS_ZERO_INT: + static_cast(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO_INT); + return; + case OPCODE_IS_NOT_ZERO_INT: + static_cast(I)->getOperand(2).setImm(OPCODE_IS_ZERO_INT); + return; + case OPCODE_IS_ZERO: + static_cast(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO); + return; + case OPCODE_IS_NOT_ZERO: + static_cast(I)->getOperand(2).setImm(OPCODE_IS_ZERO); + return; + default: + assert(0 && "PRED_X Opcode invalid!"); + } + } + } +} + +template +void CFGStructurizer::mergeLoopbreakBlock(BlockT *exitingBlk, + BlockT *exitBlk, + BlockT *exitLandBlk, + RegiT setReg) { + if (DEBUGME) { + errs() << "loopbreakPattern exiting = BB" << exitingBlk->getNumber() + << " exit = BB" << exitBlk->getNumber() + << " land = BB" << exitLandBlk->getNumber() << "\n"; + } + + InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(exitingBlk); + assert(branchInstr && CFGTraits::isCondBranch(branchInstr)); + + DebugLoc DL = branchInstr->getDebugLoc(); + + BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr); + + // transform exitingBlk to + // if ( ) { + // exitBlk (if exitBlk != exitLandBlk) + // setReg = 1 + // break + // }endif + // successor = {orgSuccessor(exitingBlk) - exitBlk} + + typename BlockT::iterator branchInstrPos = + CFGTraits::getInstrPos(exitingBlk, branchInstr); + + if (exitBlk == exitLandBlk && setReg == INVALIDREGNUM) { + //break_logical + + if (trueBranch != exitBlk) { + reversePredicateSetter(branchInstrPos); + } + CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL); + } else { + if (trueBranch != exitBlk) { + reversePredicateSetter(branchInstr); + } + CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL); + if (exitBlk != exitLandBlk) { + //splice is insert-before ... + exitingBlk->splice(branchInstrPos, exitBlk, exitBlk->begin(), + exitBlk->end()); + } + if (setReg != INVALIDREGNUM) { + CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1); + } + CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::BREAK, passRep); + } //if_logical + + //now branchInst can be erase safely + branchInstr->eraseFromParent(); + + //now take care of successors, retire blocks + exitingBlk->removeSuccessor(exitBlk); + if (exitBlk != exitLandBlk) { + //splice is insert-before ... + exitBlk->removeSuccessor(exitLandBlk); + retireBlock(exitingBlk, exitBlk); + } + +} //mergeLoopbreakBlock + +template +void CFGStructurizer::settleLoopcontBlock(BlockT *contingBlk, + BlockT *contBlk, + RegiT setReg) { + if (DEBUGME) { + errs() << "settleLoopcontBlock conting = BB" + << contingBlk->getNumber() + << ", cont = BB" << contBlk->getNumber() << "\n"; + } + + InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(contingBlk); + if (branchInstr) { + assert(CFGTraits::isCondBranch(branchInstr)); + typename BlockT::iterator branchInstrPos = + CFGTraits::getInstrPos(contingBlk, branchInstr); + BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr); + int oldOpcode = branchInstr->getOpcode(); + DebugLoc DL = branchInstr->getDebugLoc(); + + // transform contingBlk to + // if () { + // move instr after branchInstr + // continue + // or + // setReg = 1 + // break + // }endif + // successor = {orgSuccessor(contingBlk) - loopHeader} + + bool useContinueLogical = + (setReg == INVALIDREGNUM && (&*contingBlk->rbegin()) == branchInstr); + + if (useContinueLogical == false) { + int branchOpcode = + trueBranch == contBlk ? CFGTraits::getBranchNzeroOpcode(oldOpcode) + : CFGTraits::getBranchZeroOpcode(oldOpcode); + + CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL); + + if (setReg != INVALIDREGNUM) { + CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1); + // insertEnd to ensure phi-moves, if exist, go before the continue-instr. + CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, DL); + } else { + // insertEnd to ensure phi-moves, if exist, go before the continue-instr. + CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, DL); + } + + CFGTraits::insertInstrEnd(contingBlk, AMDGPU::ENDIF, passRep, DL); + } else { + int branchOpcode = + trueBranch == contBlk ? CFGTraits::getContinueNzeroOpcode(oldOpcode) + : CFGTraits::getContinueZeroOpcode(oldOpcode); + + CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL); + } + + branchInstr->eraseFromParent(); + } else { + // if we've arrived here then we've already erased the branch instruction + // travel back up the basic block to see the last reference of our debug location + // we've just inserted that reference here so it should be representative + if (setReg != INVALIDREGNUM) { + CFGTraits::insertAssignInstrBefore(contingBlk, passRep, setReg, 1); + // insertEnd to ensure phi-moves, if exist, go before the continue-instr. + CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, CFGTraits::getLastDebugLocInBB(contingBlk)); + } else { + // insertEnd to ensure phi-moves, if exist, go before the continue-instr. + CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, CFGTraits::getLastDebugLocInBB(contingBlk)); + } + } //else + +} //settleLoopcontBlock + +// BBs in exitBlkSet are determined as in break-path for loopRep, +// before we can put code for BBs as inside loop-body for loopRep +// check whether those BBs are determined as cont-BB for parentLoopRep +// earlier. +// If so, generate a new BB newBlk +// (1) set newBlk common successor of BBs in exitBlkSet +// (2) change the continue-instr in BBs in exitBlkSet to break-instr +// (3) generate continue-instr in newBlk +// +template +typename CFGStructurizer::BlockT * +CFGStructurizer::relocateLoopcontBlock(LoopT *parentLoopRep, + LoopT *loopRep, + std::set &exitBlkSet, + BlockT *exitLandBlk) { + std::set endBlkSet; + + + + for (typename std::set::const_iterator iter = exitBlkSet.begin(), + iterEnd = exitBlkSet.end(); + iter != iterEnd; ++iter) { + BlockT *exitBlk = *iter; + BlockT *endBlk = singlePathEnd(exitBlk, exitLandBlk); + + if (endBlk == NULL || CFGTraits::getContinueInstr(endBlk) == NULL) + return NULL; + + endBlkSet.insert(endBlk); + } + + BlockT *newBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(newBlk); //insert to function + CFGTraits::insertInstrEnd(newBlk, AMDGPU::CONTINUE, passRep); + SHOWNEWBLK(newBlk, "New continue block: "); + + for (typename std::set::const_iterator iter = endBlkSet.begin(), + iterEnd = endBlkSet.end(); + iter != iterEnd; ++iter) { + BlockT *endBlk = *iter; + InstrT *contInstr = CFGTraits::getContinueInstr(endBlk); + if (contInstr) { + contInstr->eraseFromParent(); + } + endBlk->addSuccessor(newBlk); + if (DEBUGME) { + errs() << "Add new continue Block to BB" + << endBlk->getNumber() << " successors\n"; + } + } + + return newBlk; +} //relocateLoopcontBlock + + +// LoopEndbranchBlock is a BB created by the CFGStructurizer to use as +// LoopLandBlock. This BB branch on the loop endBranchInit register to the +// pathes corresponding to the loop exiting branches. + +template +typename CFGStructurizer::BlockT * +CFGStructurizer::addLoopEndbranchBlock(LoopT *loopRep, + BlockTSmallerVector &exitingBlks, + BlockTSmallerVector &exitBlks) { + const AMDGPUInstrInfo *tii = + static_cast(passRep->getTargetInstrInfo()); + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + + RegiT endBranchReg = static_cast + (funcRep->getRegInfo().createVirtualRegister(I32RC)); + assert(endBranchReg >= 0); + + // reg = 0 before entering the loop + addLoopEndbranchInitReg(loopRep, endBranchReg); + + uint32_t numBlks = static_cast(exitingBlks.size()); + assert(numBlks >=2 && numBlks == exitBlks.size()); + + BlockT *preExitingBlk = exitingBlks[0]; + BlockT *preExitBlk = exitBlks[0]; + BlockT *preBranchBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(preBranchBlk); //insert to function + SHOWNEWBLK(preBranchBlk, "New loopEndbranch block: "); + + BlockT *newLandBlk = preBranchBlk; + + CFGTraits::replaceInstrUseOfBlockWith(preExitingBlk, preExitBlk, + newLandBlk); + preExitingBlk->removeSuccessor(preExitBlk); + preExitingBlk->addSuccessor(newLandBlk); + + //it is redundant to add reg = 0 to exitingBlks[0] + + // For 1..n th exiting path (the last iteration handles two pathes) create the + // branch to the previous path and the current path. + for (uint32_t i = 1; i < numBlks; ++i) { + BlockT *curExitingBlk = exitingBlks[i]; + BlockT *curExitBlk = exitBlks[i]; + BlockT *curBranchBlk; + + if (i == numBlks - 1) { + curBranchBlk = curExitBlk; + } else { + curBranchBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(curBranchBlk); //insert to function + SHOWNEWBLK(curBranchBlk, "New loopEndbranch block: "); + } + + // Add reg = i to exitingBlks[i]. + CFGTraits::insertAssignInstrBefore(curExitingBlk, passRep, + endBranchReg, i); + + // Remove the edge (exitingBlks[i] exitBlks[i]) add new edge + // (exitingBlks[i], newLandBlk). + CFGTraits::replaceInstrUseOfBlockWith(curExitingBlk, curExitBlk, + newLandBlk); + curExitingBlk->removeSuccessor(curExitBlk); + curExitingBlk->addSuccessor(newLandBlk); + + // add to preBranchBlk the branch instruction: + // if (endBranchReg == preVal) + // preExitBlk + // else + // curBranchBlk + // + // preValReg = i - 1 + + DebugLoc DL; + RegiT preValReg = static_cast + (funcRep->getRegInfo().createVirtualRegister(I32RC)); + + preBranchBlk->insert(preBranchBlk->begin(), + tii->getMovImmInstr(preBranchBlk->getParent(), preValReg, + i - 1)); + + // condResReg = (endBranchReg == preValReg) + RegiT condResReg = static_cast + (funcRep->getRegInfo().createVirtualRegister(I32RC)); + BuildMI(preBranchBlk, DL, tii->get(tii->getIEQOpcode()), condResReg) + .addReg(endBranchReg).addReg(preValReg); + + BuildMI(preBranchBlk, DL, tii->get(AMDGPU::BRANCH_COND_i32)) + .addMBB(preExitBlk).addReg(condResReg); + + preBranchBlk->addSuccessor(preExitBlk); + preBranchBlk->addSuccessor(curBranchBlk); + + // Update preExitingBlk, preExitBlk, preBranchBlk. + preExitingBlk = curExitingBlk; + preExitBlk = curExitBlk; + preBranchBlk = curBranchBlk; + + } //end for 1 .. n blocks + + return newLandBlk; +} //addLoopEndbranchBlock + +template +typename CFGStructurizer::PathToKind +CFGStructurizer::singlePathTo(BlockT *srcBlk, BlockT *dstBlk, + bool allowSideEntry) { + assert(dstBlk); + + if (srcBlk == dstBlk) { + return SinglePath_InPath; + } + + while (srcBlk && srcBlk->succ_size() == 1) { + srcBlk = *srcBlk->succ_begin(); + if (srcBlk == dstBlk) { + return SinglePath_InPath; + } + + if (!allowSideEntry && srcBlk->pred_size() > 1) { + return Not_SinglePath; + } + } + + if (srcBlk && srcBlk->succ_size()==0) { + return SinglePath_NotInPath; + } + + return Not_SinglePath; +} //singlePathTo + +// If there is a single path from srcBlk to dstBlk, return the last block before +// dstBlk If there is a single path from srcBlk->end without dstBlk, return the +// last block in the path Otherwise, return NULL +template +typename CFGStructurizer::BlockT * +CFGStructurizer::singlePathEnd(BlockT *srcBlk, BlockT *dstBlk, + bool allowSideEntry) { + assert(dstBlk); + + if (srcBlk == dstBlk) { + return srcBlk; + } + + if (srcBlk->succ_size() == 0) { + return srcBlk; + } + + while (srcBlk && srcBlk->succ_size() == 1) { + BlockT *preBlk = srcBlk; + + srcBlk = *srcBlk->succ_begin(); + if (srcBlk == NULL) { + return preBlk; + } + + if (!allowSideEntry && srcBlk->pred_size() > 1) { + return NULL; + } + } + + if (srcBlk && srcBlk->succ_size()==0) { + return srcBlk; + } + + return NULL; + +} //singlePathEnd + +template +int CFGStructurizer::cloneOnSideEntryTo(BlockT *preBlk, BlockT *srcBlk, + BlockT *dstBlk) { + int cloned = 0; + assert(preBlk->isSuccessor(srcBlk)); + while (srcBlk && srcBlk != dstBlk) { + assert(srcBlk->succ_size() == 1); + if (srcBlk->pred_size() > 1) { + srcBlk = cloneBlockForPredecessor(srcBlk, preBlk); + ++cloned; + } + + preBlk = srcBlk; + srcBlk = *srcBlk->succ_begin(); + } + + return cloned; +} //cloneOnSideEntryTo + +template +typename CFGStructurizer::BlockT * +CFGStructurizer::cloneBlockForPredecessor(BlockT *curBlk, + BlockT *predBlk) { + assert(predBlk->isSuccessor(curBlk) && + "succBlk is not a prececessor of curBlk"); + + BlockT *cloneBlk = CFGTraits::clone(curBlk); //clone instructions + CFGTraits::replaceInstrUseOfBlockWith(predBlk, curBlk, cloneBlk); + //srcBlk, oldBlk, newBlk + + predBlk->removeSuccessor(curBlk); + predBlk->addSuccessor(cloneBlk); + + // add all successor to cloneBlk + CFGTraits::cloneSuccessorList(cloneBlk, curBlk); + + numClonedInstr += curBlk->size(); + + if (DEBUGME) { + errs() << "Cloned block: " << "BB" + << curBlk->getNumber() << "size " << curBlk->size() << "\n"; + } + + SHOWNEWBLK(cloneBlk, "result of Cloned block: "); + + return cloneBlk; +} //cloneBlockForPredecessor + +template +typename CFGStructurizer::BlockT * +CFGStructurizer::exitingBlock2ExitBlock(LoopT *loopRep, + BlockT *exitingBlk) { + BlockT *exitBlk = NULL; + + for (typename BlockT::succ_iterator iterSucc = exitingBlk->succ_begin(), + iterSuccEnd = exitingBlk->succ_end(); + iterSucc != iterSuccEnd; ++iterSucc) { + BlockT *curBlk = *iterSucc; + if (!loopRep->contains(curBlk)) { + assert(exitBlk == NULL); + exitBlk = curBlk; + } + } + + assert(exitBlk != NULL); + + return exitBlk; +} //exitingBlock2ExitBlock + +template +void CFGStructurizer::migrateInstruction(BlockT *srcBlk, + BlockT *dstBlk, + InstrIterator insertPos) { + InstrIterator spliceEnd; + //look for the input branchinstr, not the AMDGPU branchinstr + InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk); + if (branchInstr == NULL) { + if (DEBUGME) { + errs() << "migrateInstruction don't see branch instr\n" ; + } + spliceEnd = srcBlk->end(); + } else { + if (DEBUGME) { + errs() << "migrateInstruction see branch instr\n" ; + branchInstr->dump(); + } + spliceEnd = CFGTraits::getInstrPos(srcBlk, branchInstr); + } + if (DEBUGME) { + errs() << "migrateInstruction before splice dstSize = " << dstBlk->size() + << "srcSize = " << srcBlk->size() << "\n"; + } + + //splice insert before insertPos + dstBlk->splice(insertPos, srcBlk, srcBlk->begin(), spliceEnd); + + if (DEBUGME) { + errs() << "migrateInstruction after splice dstSize = " << dstBlk->size() + << "srcSize = " << srcBlk->size() << "\n"; + } +} //migrateInstruction + +// normalizeInfiniteLoopExit change +// B1: +// uncond_br LoopHeader +// +// to +// B1: +// cond_br 1 LoopHeader dummyExit +// and return the newly added dummy exit block +// +template +typename CFGStructurizer::BlockT * +CFGStructurizer::normalizeInfiniteLoopExit(LoopT* LoopRep) { + BlockT *loopHeader; + BlockT *loopLatch; + loopHeader = LoopRep->getHeader(); + loopLatch = LoopRep->getLoopLatch(); + BlockT *dummyExitBlk = NULL; + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + if (loopHeader!=NULL && loopLatch!=NULL) { + InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(loopLatch); + if (branchInstr!=NULL && CFGTraits::isUncondBranch(branchInstr)) { + dummyExitBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(dummyExitBlk); //insert to function + SHOWNEWBLK(dummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); + + if (DEBUGME) errs() << "Old branch instr: " << *branchInstr << "\n"; + + typename BlockT::iterator insertPos = + CFGTraits::getInstrPos(loopLatch, branchInstr); + unsigned immReg = + funcRep->getRegInfo().createVirtualRegister(I32RC); + CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 1); + InstrT *newInstr = + CFGTraits::insertInstrBefore(insertPos, AMDGPU::BRANCH_COND_i32, passRep); + MachineInstrBuilder(newInstr).addMBB(loopHeader).addReg(immReg, false); + + SHOWNEWINSTR(newInstr); + + branchInstr->eraseFromParent(); + loopLatch->addSuccessor(dummyExitBlk); + } + } + + return dummyExitBlk; +} //normalizeInfiniteLoopExit + +template +void CFGStructurizer::removeUnconditionalBranch(BlockT *srcBlk) { + InstrT *branchInstr; + + // I saw two unconditional branch in one basic block in example + // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. + while ((branchInstr = CFGTraits::getLoopendBlockBranchInstr(srcBlk)) + && CFGTraits::isUncondBranch(branchInstr)) { + if (DEBUGME) { + errs() << "Removing unconditional branch instruction" ; + branchInstr->dump(); + } + branchInstr->eraseFromParent(); + } +} //removeUnconditionalBranch + +template +void CFGStructurizer::removeRedundantConditionalBranch(BlockT *srcBlk) { + if (srcBlk->succ_size() == 2) { + BlockT *blk1 = *srcBlk->succ_begin(); + BlockT *blk2 = *(++srcBlk->succ_begin()); + + if (blk1 == blk2) { + InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk); + assert(branchInstr && CFGTraits::isCondBranch(branchInstr)); + if (DEBUGME) { + errs() << "Removing unneeded conditional branch instruction" ; + branchInstr->dump(); + } + branchInstr->eraseFromParent(); + SHOWNEWBLK(blk1, "Removing redundant successor"); + srcBlk->removeSuccessor(blk1); + } + } +} //removeRedundantConditionalBranch + +template +void CFGStructurizer::addDummyExitBlock(SmallVector &retBlks) { + BlockT *dummyExitBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(dummyExitBlk); //insert to function + CFGTraits::insertInstrEnd(dummyExitBlk, AMDGPU::RETURN, passRep); + + for (typename SmallVector::iterator iter = + retBlks.begin(), + iterEnd = retBlks.end(); iter != iterEnd; ++iter) { + BlockT *curBlk = *iter; + InstrT *curInstr = CFGTraits::getReturnInstr(curBlk); + if (curInstr) { + curInstr->eraseFromParent(); + } + curBlk->addSuccessor(dummyExitBlk); + if (DEBUGME) { + errs() << "Add dummyExitBlock to BB" << curBlk->getNumber() + << " successors\n"; + } + } //for + + SHOWNEWBLK(dummyExitBlk, "DummyExitBlock: "); +} //addDummyExitBlock + +template +void CFGStructurizer::removeSuccessor(BlockT *srcBlk) { + while (srcBlk->succ_size()) { + srcBlk->removeSuccessor(*srcBlk->succ_begin()); + } +} + +template +void CFGStructurizer::recordSccnum(BlockT *srcBlk, int sccNum) { + BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk]; + + if (srcBlkInfo == NULL) { + srcBlkInfo = new BlockInfo(); + } + + srcBlkInfo->sccNum = sccNum; +} + +template +int CFGStructurizer::getSCCNum(BlockT *srcBlk) { + BlockInfo *srcBlkInfo = blockInfoMap[srcBlk]; + return srcBlkInfo ? srcBlkInfo->sccNum : INVALIDSCCNUM; +} + +template +void CFGStructurizer::retireBlock(BlockT *dstBlk, BlockT *srcBlk) { + if (DEBUGME) { + errs() << "Retiring BB" << srcBlk->getNumber() << "\n"; + } + + BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk]; + + if (srcBlkInfo == NULL) { + srcBlkInfo = new BlockInfo(); + } + + srcBlkInfo->isRetired = true; + assert(srcBlk->succ_size() == 0 && srcBlk->pred_size() == 0 + && "can't retire block yet"); +} + +template +bool CFGStructurizer::isRetiredBlock(BlockT *srcBlk) { + BlockInfo *srcBlkInfo = blockInfoMap[srcBlk]; + return (srcBlkInfo && srcBlkInfo->isRetired); +} + +template +bool CFGStructurizer::isActiveLoophead(BlockT *curBlk) { + LoopT *loopRep = loopInfo->getLoopFor(curBlk); + while (loopRep && loopRep->getHeader() == curBlk) { + LoopLandInfo *loopLand = getLoopLandInfo(loopRep); + + if(loopLand == NULL) + return true; + + BlockT *landBlk = loopLand->landBlk; + assert(landBlk); + if (!isRetiredBlock(landBlk)) { + return true; + } + + loopRep = loopRep->getParentLoop(); + } + + return false; +} //isActiveLoophead + +template +bool CFGStructurizer::needMigrateBlock(BlockT *blk) { + const unsigned blockSizeThreshold = 30; + const unsigned cloneInstrThreshold = 100; + + bool multiplePreds = blk && (blk->pred_size() > 1); + + if(!multiplePreds) + return false; + + unsigned blkSize = blk->size(); + return ((blkSize > blockSizeThreshold) + && (blkSize * (blk->pred_size() - 1) > cloneInstrThreshold)); +} //needMigrateBlock + +template +typename CFGStructurizer::BlockT * +CFGStructurizer::recordLoopLandBlock(LoopT *loopRep, BlockT *landBlk, + BlockTSmallerVector &exitBlks, + std::set &exitBlkSet) { + SmallVector inpathBlks; //in exit path blocks + + for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(), + predIterEnd = landBlk->pred_end(); + predIter != predIterEnd; ++predIter) { + BlockT *curBlk = *predIter; + if (loopRep->contains(curBlk) || exitBlkSet.count(curBlk)) { + inpathBlks.push_back(curBlk); + } + } //for + + //if landBlk has predecessors that are not in the given loop, + //create a new block + BlockT *newLandBlk = landBlk; + if (inpathBlks.size() != landBlk->pred_size()) { + newLandBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(newLandBlk); //insert to function + newLandBlk->addSuccessor(landBlk); + for (typename SmallVector::iterator iter = + inpathBlks.begin(), + iterEnd = inpathBlks.end(); iter != iterEnd; ++iter) { + BlockT *curBlk = *iter; + CFGTraits::replaceInstrUseOfBlockWith(curBlk, landBlk, newLandBlk); + //srcBlk, oldBlk, newBlk + curBlk->removeSuccessor(landBlk); + curBlk->addSuccessor(newLandBlk); + } + for (size_t i = 0, tot = exitBlks.size(); i < tot; ++i) { + if (exitBlks[i] == landBlk) { + exitBlks[i] = newLandBlk; + } + } + SHOWNEWBLK(newLandBlk, "NewLandingBlock: "); + } + + setLoopLandBlock(loopRep, newLandBlk); + + return newLandBlk; +} // recordLoopbreakLand + +template +void CFGStructurizer::setLoopLandBlock(LoopT *loopRep, BlockT *blk) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + assert(theEntry->landBlk == NULL); + + if (blk == NULL) { + blk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(blk); //insert to function + SHOWNEWBLK(blk, "DummyLandingBlock for loop without break: "); + } + + theEntry->landBlk = blk; + + if (DEBUGME) { + errs() << "setLoopLandBlock loop-header = BB" + << loopRep->getHeader()->getNumber() + << " landing-block = BB" << blk->getNumber() << "\n"; + } +} // setLoopLandBlock + +template +void CFGStructurizer::addLoopBreakOnReg(LoopT *loopRep, RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + + theEntry->breakOnRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopBreakOnReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopBreakOnReg + +template +void CFGStructurizer::addLoopContOnReg(LoopT *loopRep, RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + theEntry->contOnRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopContOnReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopContOnReg + +template +void CFGStructurizer::addLoopBreakInitReg(LoopT *loopRep, RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + theEntry->breakInitRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopBreakInitReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopBreakInitReg + +template +void CFGStructurizer::addLoopContInitReg(LoopT *loopRep, RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + theEntry->contInitRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopContInitReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopContInitReg + +template +void CFGStructurizer::addLoopEndbranchInitReg(LoopT *loopRep, + RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + theEntry->endbranchInitRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopEndbranchInitReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopEndbranchInitReg + +template +typename CFGStructurizer::LoopLandInfo * +CFGStructurizer::getLoopLandInfo(LoopT *loopRep) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + return theEntry; +} // getLoopLandInfo + +template +typename CFGStructurizer::BlockT * +CFGStructurizer::getLoopLandBlock(LoopT *loopRep) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + return theEntry ? theEntry->landBlk : NULL; +} // getLoopLandBlock + + +template +bool CFGStructurizer::hasBackEdge(BlockT *curBlk) { + LoopT *loopRep = loopInfo->getLoopFor(curBlk); + if (loopRep == NULL) + return false; + + BlockT *loopHeader = loopRep->getHeader(); + + return curBlk->isSuccessor(loopHeader); + +} //hasBackEdge + +template +unsigned CFGStructurizer::getLoopDepth(LoopT *loopRep) { + return loopRep ? loopRep->getLoopDepth() : 0; +} //getLoopDepth + +template +int CFGStructurizer::countActiveBlock +(typename SmallVector::const_iterator iterStart, + typename SmallVector::const_iterator iterEnd) { + int count = 0; + while (iterStart != iterEnd) { + if (!isRetiredBlock(*iterStart)) { + ++count; + } + ++iterStart; + } + + return count; +} //countActiveBlock + +// This is work around solution for findNearestCommonDominator not avaiable to +// post dom a proper fix should go to Dominators.h. + +template +typename CFGStructurizer::BlockT* +CFGStructurizer::findNearestCommonPostDom(BlockT *blk1, BlockT *blk2) { + + if (postDomTree->dominates(blk1, blk2)) { + return blk1; + } + if (postDomTree->dominates(blk2, blk1)) { + return blk2; + } + + DomTreeNodeT *node1 = postDomTree->getNode(blk1); + DomTreeNodeT *node2 = postDomTree->getNode(blk2); + + // Handle newly cloned node. + if (node1 == NULL && blk1->succ_size() == 1) { + return findNearestCommonPostDom(*blk1->succ_begin(), blk2); + } + if (node2 == NULL && blk2->succ_size() == 1) { + return findNearestCommonPostDom(blk1, *blk2->succ_begin()); + } + + if (node1 == NULL || node2 == NULL) { + return NULL; + } + + node1 = node1->getIDom(); + while (node1) { + if (postDomTree->dominates(node1, node2)) { + return node1->getBlock(); + } + node1 = node1->getIDom(); + } + + return NULL; +} + +template +typename CFGStructurizer::BlockT * +CFGStructurizer::findNearestCommonPostDom +(typename std::set &blks) { + BlockT *commonDom; + typename std::set::const_iterator iter = blks.begin(); + typename std::set::const_iterator iterEnd = blks.end(); + for (commonDom = *iter; iter != iterEnd && commonDom != NULL; ++iter) { + BlockT *curBlk = *iter; + if (curBlk != commonDom) { + commonDom = findNearestCommonPostDom(curBlk, commonDom); + } + } + + if (DEBUGME) { + errs() << "Common post dominator for exit blocks is "; + if (commonDom) { + errs() << "BB" << commonDom->getNumber() << "\n"; + } else { + errs() << "NULL\n"; + } + } + + return commonDom; +} //findNearestCommonPostDom + +} //end namespace llvm + +//todo: move-end + + +//===----------------------------------------------------------------------===// +// +// CFGStructurizer for AMDGPU +// +//===----------------------------------------------------------------------===// + + +using namespace llvmCFGStruct; + +namespace llvm { +class AMDGPUCFGStructurizer : public MachineFunctionPass { +public: + typedef MachineInstr InstructionType; + typedef MachineFunction FunctionType; + typedef MachineBasicBlock BlockType; + typedef MachineLoopInfo LoopinfoType; + typedef MachineDominatorTree DominatortreeType; + typedef MachinePostDominatorTree PostDominatortreeType; + typedef MachineDomTreeNode DomTreeNodeType; + typedef MachineLoop LoopType; + +protected: + TargetMachine &TM; + const TargetInstrInfo *TII; + const AMDGPURegisterInfo *TRI; + +public: + AMDGPUCFGStructurizer(char &pid, TargetMachine &tm); + const TargetInstrInfo *getTargetInstrInfo() const; + +private: + +}; + +} //end of namespace llvm +AMDGPUCFGStructurizer::AMDGPUCFGStructurizer(char &pid, TargetMachine &tm) +: MachineFunctionPass(pid), TM(tm), TII(tm.getInstrInfo()), + TRI(static_cast(tm.getRegisterInfo())) { +} + +const TargetInstrInfo *AMDGPUCFGStructurizer::getTargetInstrInfo() const { + return TII; +} +//===----------------------------------------------------------------------===// +// +// CFGPrepare +// +//===----------------------------------------------------------------------===// + + +using namespace llvmCFGStruct; + +namespace llvm { +class AMDGPUCFGPrepare : public AMDGPUCFGStructurizer { +public: + static char ID; + +public: + AMDGPUCFGPrepare(TargetMachine &tm); + + virtual const char *getPassName() const; + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + + bool runOnMachineFunction(MachineFunction &F); + +private: + +}; + +char AMDGPUCFGPrepare::ID = 0; +} //end of namespace llvm + +AMDGPUCFGPrepare::AMDGPUCFGPrepare(TargetMachine &tm) + : AMDGPUCFGStructurizer(ID, tm ) { +} +const char *AMDGPUCFGPrepare::getPassName() const { + return "AMD IL Control Flow Graph Preparation Pass"; +} + +void AMDGPUCFGPrepare::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); +} + +//===----------------------------------------------------------------------===// +// +// CFGPerform +// +//===----------------------------------------------------------------------===// + + +using namespace llvmCFGStruct; + +namespace llvm { +class AMDGPUCFGPerform : public AMDGPUCFGStructurizer { +public: + static char ID; + +public: + AMDGPUCFGPerform(TargetMachine &tm); + virtual const char *getPassName() const; + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnMachineFunction(MachineFunction &F); + +private: + +}; + +char AMDGPUCFGPerform::ID = 0; +} //end of namespace llvm + + AMDGPUCFGPerform::AMDGPUCFGPerform(TargetMachine &tm) +: AMDGPUCFGStructurizer(ID, tm) { +} + +const char *AMDGPUCFGPerform::getPassName() const { + return "AMD IL Control Flow Graph structurizer Pass"; +} + +void AMDGPUCFGPerform::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); +} + +//===----------------------------------------------------------------------===// +// +// CFGStructTraits +// +//===----------------------------------------------------------------------===// + +namespace llvmCFGStruct { +// this class is tailor to the AMDGPU backend +template<> +struct CFGStructTraits { + typedef int RegiT; + + static int getBranchNzeroOpcode(int oldOpcode) { + switch(oldOpcode) { + case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; + default: + assert(0 && "internal error"); + } + return -1; + } + + static int getBranchZeroOpcode(int oldOpcode) { + switch(oldOpcode) { + case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32; + default: + assert(0 && "internal error"); + } + return -1; + } + + static int getContinueNzeroOpcode(int oldOpcode) { + switch(oldOpcode) { + case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; + default: + assert(0 && "internal error"); + }; + return -1; + } + + static int getContinueZeroOpcode(int oldOpcode) { + switch(oldOpcode) { + case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32; + default: + assert(0 && "internal error"); + } + return -1; + } + + static MachineBasicBlock *getTrueBranch(MachineInstr *instr) { + return instr->getOperand(0).getMBB(); + } + + static void setTrueBranch(MachineInstr *instr, MachineBasicBlock *blk) { + instr->getOperand(0).setMBB(blk); + } + + static MachineBasicBlock * + getFalseBranch(MachineBasicBlock *blk, MachineInstr *instr) { + assert(blk->succ_size() == 2); + MachineBasicBlock *trueBranch = getTrueBranch(instr); + MachineBasicBlock::succ_iterator iter = blk->succ_begin(); + MachineBasicBlock::succ_iterator iterNext = iter; + ++iterNext; + + return (*iter == trueBranch) ? *iterNext : *iter; + } + + static bool isCondBranch(MachineInstr *instr) { + switch (instr->getOpcode()) { + case AMDGPU::JUMP: + return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() != 0; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: + break; + default: + return false; + } + return true; + } + + static bool isUncondBranch(MachineInstr *instr) { + switch (instr->getOpcode()) { + case AMDGPU::JUMP: + return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() == 0; + case AMDGPU::BRANCH: + return true; + default: + return false; + } + return true; + } + + static DebugLoc getLastDebugLocInBB(MachineBasicBlock *blk) { + //get DebugLoc from the first MachineBasicBlock instruction with debug info + DebugLoc DL; + for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end(); ++iter) { + MachineInstr *instr = &(*iter); + if (instr->getDebugLoc().isUnknown() == false) { + DL = instr->getDebugLoc(); + } + } + return DL; + } + + static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *blk) { + MachineBasicBlock::reverse_iterator iter = blk->rbegin(); + MachineInstr *instr = &*iter; + if (instr && (isCondBranch(instr) || isUncondBranch(instr))) { + return instr; + } + return NULL; + } + + // The correct naming for this is getPossibleLoopendBlockBranchInstr. + // + // BB with backward-edge could have move instructions after the branch + // instruction. Such move instruction "belong to" the loop backward-edge. + // + static MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *blk) { + const AMDGPUInstrInfo * TII = static_cast( + blk->getParent()->getTarget().getInstrInfo()); + + for (MachineBasicBlock::reverse_iterator iter = blk->rbegin(), + iterEnd = blk->rend(); iter != iterEnd; ++iter) { + // FIXME: Simplify + MachineInstr *instr = &*iter; + if (instr) { + if (isCondBranch(instr) || isUncondBranch(instr)) { + return instr; + } else if (!TII->isMov(instr->getOpcode())) { + break; + } + } + } + return NULL; + } + + static MachineInstr *getReturnInstr(MachineBasicBlock *blk) { + MachineBasicBlock::reverse_iterator iter = blk->rbegin(); + if (iter != blk->rend()) { + MachineInstr *instr = &(*iter); + if (instr->getOpcode() == AMDGPU::RETURN) { + return instr; + } + } + return NULL; + } + + static MachineInstr *getContinueInstr(MachineBasicBlock *blk) { + MachineBasicBlock::reverse_iterator iter = blk->rbegin(); + if (iter != blk->rend()) { + MachineInstr *instr = &(*iter); + if (instr->getOpcode() == AMDGPU::CONTINUE) { + return instr; + } + } + return NULL; + } + + static MachineInstr *getLoopBreakInstr(MachineBasicBlock *blk) { + for (MachineBasicBlock::iterator iter = blk->begin(); (iter != blk->end()); ++iter) { + MachineInstr *instr = &(*iter); + if (instr->getOpcode() == AMDGPU::PREDICATED_BREAK) { + return instr; + } + } + return NULL; + } + + static bool isReturnBlock(MachineBasicBlock *blk) { + MachineInstr *instr = getReturnInstr(blk); + bool isReturn = (blk->succ_size() == 0); + if (instr) { + assert(isReturn); + } else if (isReturn) { + if (DEBUGME) { + errs() << "BB" << blk->getNumber() + <<" is return block without RETURN instr\n"; + } + } + + return isReturn; + } + + static MachineBasicBlock::iterator + getInstrPos(MachineBasicBlock *blk, MachineInstr *instr) { + assert(instr->getParent() == blk && "instruction doesn't belong to block"); + MachineBasicBlock::iterator iter = blk->begin(); + MachineBasicBlock::iterator iterEnd = blk->end(); + while (&(*iter) != instr && iter != iterEnd) { + ++iter; + } + + assert(iter != iterEnd); + return iter; + }//getInstrPos + + static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode, + AMDGPUCFGStructurizer *passRep) { + return insertInstrBefore(blk,newOpcode,passRep,DebugLoc()); + } //insertInstrBefore + + static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode, + AMDGPUCFGStructurizer *passRep, DebugLoc DL) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineInstr *newInstr = + blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL); + + MachineBasicBlock::iterator res; + if (blk->begin() != blk->end()) { + blk->insert(blk->begin(), newInstr); + } else { + blk->push_back(newInstr); + } + + SHOWNEWINSTR(newInstr); + + return newInstr; + } //insertInstrBefore + + static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode, + AMDGPUCFGStructurizer *passRep) { + insertInstrEnd(blk,newOpcode,passRep,DebugLoc()); + } //insertInstrEnd + + static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode, + AMDGPUCFGStructurizer *passRep, DebugLoc DL) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineInstr *newInstr = blk->getParent() + ->CreateMachineInstr(tii->get(newOpcode), DL); + + blk->push_back(newInstr); + //assume the instruction doesn't take any reg operand ... + + SHOWNEWINSTR(newInstr); + } //insertInstrEnd + + static MachineInstr *insertInstrBefore(MachineBasicBlock::iterator instrPos, + int newOpcode, + AMDGPUCFGStructurizer *passRep) { + MachineInstr *oldInstr = &(*instrPos); + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineBasicBlock *blk = oldInstr->getParent(); + MachineInstr *newInstr = + blk->getParent()->CreateMachineInstr(tii->get(newOpcode), + DebugLoc()); + + blk->insert(instrPos, newInstr); + //assume the instruction doesn't take any reg operand ... + + SHOWNEWINSTR(newInstr); + return newInstr; + } //insertInstrBefore + + static void insertCondBranchBefore(MachineBasicBlock::iterator instrPos, + int newOpcode, + AMDGPUCFGStructurizer *passRep, + DebugLoc DL) { + MachineInstr *oldInstr = &(*instrPos); + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineBasicBlock *blk = oldInstr->getParent(); + MachineInstr *newInstr = + blk->getParent()->CreateMachineInstr(tii->get(newOpcode), + DL); + + blk->insert(instrPos, newInstr); + MachineInstrBuilder(newInstr).addReg(oldInstr->getOperand(1).getReg(), + false); + + SHOWNEWINSTR(newInstr); + //erase later oldInstr->eraseFromParent(); + } //insertCondBranchBefore + + static void insertCondBranchBefore(MachineBasicBlock *blk, + MachineBasicBlock::iterator insertPos, + int newOpcode, + AMDGPUCFGStructurizer *passRep, + RegiT regNum, + DebugLoc DL) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + + MachineInstr *newInstr = + blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL); + + //insert before + blk->insert(insertPos, newInstr); + MachineInstrBuilder(newInstr).addReg(regNum, false); + + SHOWNEWINSTR(newInstr); + } //insertCondBranchBefore + + static void insertCondBranchEnd(MachineBasicBlock *blk, + int newOpcode, + AMDGPUCFGStructurizer *passRep, + RegiT regNum) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineInstr *newInstr = + blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DebugLoc()); + + blk->push_back(newInstr); + MachineInstrBuilder(newInstr).addReg(regNum, false); + + SHOWNEWINSTR(newInstr); + } //insertCondBranchEnd + + + static void insertAssignInstrBefore(MachineBasicBlock::iterator instrPos, + AMDGPUCFGStructurizer *passRep, + RegiT regNum, int regVal) { + MachineInstr *oldInstr = &(*instrPos); + const AMDGPUInstrInfo *tii = + static_cast(passRep->getTargetInstrInfo()); + MachineBasicBlock *blk = oldInstr->getParent(); + MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum, + regVal); + blk->insert(instrPos, newInstr); + + SHOWNEWINSTR(newInstr); + } //insertAssignInstrBefore + + static void insertAssignInstrBefore(MachineBasicBlock *blk, + AMDGPUCFGStructurizer *passRep, + RegiT regNum, int regVal) { + const AMDGPUInstrInfo *tii = + static_cast(passRep->getTargetInstrInfo()); + + MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum, + regVal); + if (blk->begin() != blk->end()) { + blk->insert(blk->begin(), newInstr); + } else { + blk->push_back(newInstr); + } + + SHOWNEWINSTR(newInstr); + + } //insertInstrBefore + + static void insertCompareInstrBefore(MachineBasicBlock *blk, + MachineBasicBlock::iterator instrPos, + AMDGPUCFGStructurizer *passRep, + RegiT dstReg, RegiT src1Reg, + RegiT src2Reg) { + const AMDGPUInstrInfo *tii = + static_cast(passRep->getTargetInstrInfo()); + MachineInstr *newInstr = + blk->getParent()->CreateMachineInstr(tii->get(tii->getIEQOpcode()), DebugLoc()); + + MachineInstrBuilder(newInstr).addReg(dstReg, RegState::Define); //set target + MachineInstrBuilder(newInstr).addReg(src1Reg); //set src value + MachineInstrBuilder(newInstr).addReg(src2Reg); //set src value + + blk->insert(instrPos, newInstr); + SHOWNEWINSTR(newInstr); + + } //insertCompareInstrBefore + + static void cloneSuccessorList(MachineBasicBlock *dstBlk, + MachineBasicBlock *srcBlk) { + for (MachineBasicBlock::succ_iterator iter = srcBlk->succ_begin(), + iterEnd = srcBlk->succ_end(); iter != iterEnd; ++iter) { + dstBlk->addSuccessor(*iter); // *iter's predecessor is also taken care of + } + } //cloneSuccessorList + + static MachineBasicBlock *clone(MachineBasicBlock *srcBlk) { + MachineFunction *func = srcBlk->getParent(); + MachineBasicBlock *newBlk = func->CreateMachineBasicBlock(); + func->push_back(newBlk); //insert to function + for (MachineBasicBlock::iterator iter = srcBlk->begin(), + iterEnd = srcBlk->end(); + iter != iterEnd; ++iter) { + MachineInstr *instr = func->CloneMachineInstr(iter); + newBlk->push_back(instr); + } + return newBlk; + } + + //MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose because + //the AMDGPU instruction is not recognized as terminator fix this and retire + //this routine + static void replaceInstrUseOfBlockWith(MachineBasicBlock *srcBlk, + MachineBasicBlock *oldBlk, + MachineBasicBlock *newBlk) { + MachineInstr *branchInstr = getLoopendBlockBranchInstr(srcBlk); + if (branchInstr && isCondBranch(branchInstr) && + getTrueBranch(branchInstr) == oldBlk) { + setTrueBranch(branchInstr, newBlk); + } + } + + static void wrapup(MachineBasicBlock *entryBlk) { + assert((!entryBlk->getParent()->getJumpTableInfo() + || entryBlk->getParent()->getJumpTableInfo()->isEmpty()) + && "found a jump table"); + + //collect continue right before endloop + SmallVector contInstr; + MachineBasicBlock::iterator pre = entryBlk->begin(); + MachineBasicBlock::iterator iterEnd = entryBlk->end(); + MachineBasicBlock::iterator iter = pre; + while (iter != iterEnd) { + if (pre->getOpcode() == AMDGPU::CONTINUE + && iter->getOpcode() == AMDGPU::ENDLOOP) { + contInstr.push_back(pre); + } + pre = iter; + ++iter; + } //end while + + //delete continue right before endloop + for (unsigned i = 0; i < contInstr.size(); ++i) { + contInstr[i]->eraseFromParent(); + } + + // TODO to fix up jump table so later phase won't be confused. if + // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but + // there isn't such an interface yet. alternatively, replace all the other + // blocks in the jump table with the entryBlk //} + + } //wrapup + + static MachineDominatorTree *getDominatorTree(AMDGPUCFGStructurizer &pass) { + return &pass.getAnalysis(); + } + + static MachinePostDominatorTree* + getPostDominatorTree(AMDGPUCFGStructurizer &pass) { + return &pass.getAnalysis(); + } + + static MachineLoopInfo *getLoopInfo(AMDGPUCFGStructurizer &pass) { + return &pass.getAnalysis(); + } +}; // template class CFGStructTraits +} //end of namespace llvm + +// createAMDGPUCFGPreparationPass- Returns a pass +FunctionPass *llvm::createAMDGPUCFGPreparationPass(TargetMachine &tm + ) { + return new AMDGPUCFGPrepare(tm ); +} + +bool AMDGPUCFGPrepare::runOnMachineFunction(MachineFunction &func) { + return llvmCFGStruct::CFGStructurizer().prepare(func, + *this, + TRI); +} + +// createAMDGPUCFGStructurizerPass- Returns a pass +FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm + ) { + return new AMDGPUCFGPerform(tm ); +} + +bool AMDGPUCFGPerform::runOnMachineFunction(MachineFunction &func) { + return llvmCFGStruct::CFGStructurizer().run(func, + *this, + TRI); +} diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDevice.cpp llvm-r600/lib/Target/R600/AMDILDevice.cpp --- llvm-3.2.src/lib/Target/R600/AMDILDevice.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILDevice.cpp 2013-01-25 19:43:57.440049721 +0100 @@ -0,0 +1,124 @@ +//===-- AMDILDevice.cpp - Base class for AMDIL Devices --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//==-----------------------------------------------------------------------===// +#include "AMDILDevice.h" +#include "AMDGPUSubtarget.h" + +using namespace llvm; +// Default implementation for all of the classes. +AMDGPUDevice::AMDGPUDevice(AMDGPUSubtarget *ST) : mSTM(ST) { + mHWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities); + mSWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities); + setCaps(); + DeviceFlag = OCL_DEVICE_ALL; +} + +AMDGPUDevice::~AMDGPUDevice() { + mHWBits.clear(); + mSWBits.clear(); +} + +size_t AMDGPUDevice::getMaxGDSSize() const { + return 0; +} + +uint32_t +AMDGPUDevice::getDeviceFlag() const { + return DeviceFlag; +} + +size_t AMDGPUDevice::getMaxNumCBs() const { + if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) { + return HW_MAX_NUM_CB; + } + + return 0; +} + +size_t AMDGPUDevice::getMaxCBSize() const { + if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) { + return MAX_CB_SIZE; + } + + return 0; +} + +size_t AMDGPUDevice::getMaxScratchSize() const { + return 65536; +} + +uint32_t AMDGPUDevice::getStackAlignment() const { + return 16; +} + +void AMDGPUDevice::setCaps() { + mSWBits.set(AMDGPUDeviceInfo::HalfOps); + mSWBits.set(AMDGPUDeviceInfo::ByteOps); + mSWBits.set(AMDGPUDeviceInfo::ShortOps); + mSWBits.set(AMDGPUDeviceInfo::HW64BitDivMod); + if (mSTM->isOverride(AMDGPUDeviceInfo::NoInline)) { + mSWBits.set(AMDGPUDeviceInfo::NoInline); + } + if (mSTM->isOverride(AMDGPUDeviceInfo::MacroDB)) { + mSWBits.set(AMDGPUDeviceInfo::MacroDB); + } + if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) { + mSWBits.set(AMDGPUDeviceInfo::ConstantMem); + } else { + mHWBits.set(AMDGPUDeviceInfo::ConstantMem); + } + if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) { + mSWBits.set(AMDGPUDeviceInfo::PrivateMem); + } else { + mHWBits.set(AMDGPUDeviceInfo::PrivateMem); + } + if (mSTM->isOverride(AMDGPUDeviceInfo::BarrierDetect)) { + mSWBits.set(AMDGPUDeviceInfo::BarrierDetect); + } + mSWBits.set(AMDGPUDeviceInfo::ByteLDSOps); + mSWBits.set(AMDGPUDeviceInfo::LongOps); +} + +AMDGPUDeviceInfo::ExecutionMode +AMDGPUDevice::getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const { + if (mHWBits[Caps]) { + assert(!mSWBits[Caps] && "Cannot set both SW and HW caps"); + return AMDGPUDeviceInfo::Hardware; + } + + if (mSWBits[Caps]) { + assert(!mHWBits[Caps] && "Cannot set both SW and HW caps"); + return AMDGPUDeviceInfo::Software; + } + + return AMDGPUDeviceInfo::Unsupported; + +} + +bool AMDGPUDevice::isSupported(AMDGPUDeviceInfo::Caps Mode) const { + return getExecutionMode(Mode) != AMDGPUDeviceInfo::Unsupported; +} + +bool AMDGPUDevice::usesHardware(AMDGPUDeviceInfo::Caps Mode) const { + return getExecutionMode(Mode) == AMDGPUDeviceInfo::Hardware; +} + +bool AMDGPUDevice::usesSoftware(AMDGPUDeviceInfo::Caps Mode) const { + return getExecutionMode(Mode) == AMDGPUDeviceInfo::Software; +} + +std::string +AMDGPUDevice::getDataLayout() const { + return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16" + "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32" + "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64" + "-v96:128:128-v128:128:128-v192:256:256-v256:256:256" + "-v512:512:512-v1024:1024:1024-v2048:2048:2048" + "-n8:16:32:64"); +} diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDevice.h llvm-r600/lib/Target/R600/AMDILDevice.h --- llvm-3.2.src/lib/Target/R600/AMDILDevice.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILDevice.h 2013-01-25 19:43:57.440049721 +0100 @@ -0,0 +1,117 @@ +//===---- AMDILDevice.h - Define Device Data for AMDGPU -----*- C++ -*------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface for the subtarget data classes. +// +/// This file will define the interface that each generation needs to +/// implement in order to correctly answer queries on the capabilities of the +/// specific hardware. +//===----------------------------------------------------------------------===// +#ifndef AMDILDEVICEIMPL_H +#define AMDILDEVICEIMPL_H +#include "AMDIL.h" +#include "llvm/ADT/BitVector.h" + +namespace llvm { + class AMDGPUSubtarget; + class MCStreamer; +//===----------------------------------------------------------------------===// +// Interface for data that is specific to a single device +//===----------------------------------------------------------------------===// +class AMDGPUDevice { +public: + AMDGPUDevice(AMDGPUSubtarget *ST); + virtual ~AMDGPUDevice(); + + // Enum values for the various memory types. + enum { + RAW_UAV_ID = 0, + ARENA_UAV_ID = 1, + LDS_ID = 2, + GDS_ID = 3, + SCRATCH_ID = 4, + CONSTANT_ID = 5, + GLOBAL_ID = 6, + MAX_IDS = 7 + } IO_TYPE_IDS; + + /// \returns The max LDS size that the hardware supports. Size is in + /// bytes. + virtual size_t getMaxLDSSize() const = 0; + + /// \returns The max GDS size that the hardware supports if the GDS is + /// supported by the hardware. Size is in bytes. + virtual size_t getMaxGDSSize() const; + + /// \returns The max number of hardware constant address spaces that + /// are supported by this device. + virtual size_t getMaxNumCBs() const; + + /// \returns The max number of bytes a single hardware constant buffer + /// can support. Size is in bytes. + virtual size_t getMaxCBSize() const; + + /// \returns The max number of bytes allowed by the hardware scratch + /// buffer. Size is in bytes. + virtual size_t getMaxScratchSize() const; + + /// \brief Get the flag that corresponds to the device. + virtual uint32_t getDeviceFlag() const; + + /// \returns The number of work-items that exist in a single hardware + /// wavefront. + virtual size_t getWavefrontSize() const = 0; + + /// \brief Get the generational name of this specific device. + virtual uint32_t getGeneration() const = 0; + + /// \brief Get the stack alignment of this specific device. + virtual uint32_t getStackAlignment() const; + + /// \brief Get the resource ID for this specific device. + virtual uint32_t getResourceID(uint32_t DeviceID) const = 0; + + /// \brief Get the max number of UAV's for this device. + virtual uint32_t getMaxNumUAVs() const = 0; + + + // API utilizing more detailed capabilities of each family of + // cards. If a capability is supported, then either usesHardware or + // usesSoftware returned true. If usesHardware returned true, then + // usesSoftware must return false for the same capability. Hardware + // execution means that the feature is done natively by the hardware + // and is not emulated by the softare. Software execution means + // that the feature could be done in the hardware, but there is + // software that emulates it with possibly using the hardware for + // support since the hardware does not fully comply with OpenCL + // specs. + + bool isSupported(AMDGPUDeviceInfo::Caps Mode) const; + bool usesHardware(AMDGPUDeviceInfo::Caps Mode) const; + bool usesSoftware(AMDGPUDeviceInfo::Caps Mode) const; + virtual std::string getDataLayout() const; + static const unsigned int MAX_LDS_SIZE_700 = 16384; + static const unsigned int MAX_LDS_SIZE_800 = 32768; + static const unsigned int WavefrontSize = 64; + static const unsigned int HalfWavefrontSize = 32; + static const unsigned int QuarterWavefrontSize = 16; +protected: + virtual void setCaps(); + llvm::BitVector mHWBits; + llvm::BitVector mSWBits; + AMDGPUSubtarget *mSTM; + uint32_t DeviceFlag; +private: + AMDGPUDeviceInfo::ExecutionMode + getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const; +}; + +} // namespace llvm +#endif // AMDILDEVICEIMPL_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDeviceInfo.cpp llvm-r600/lib/Target/R600/AMDILDeviceInfo.cpp --- llvm-3.2.src/lib/Target/R600/AMDILDeviceInfo.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILDeviceInfo.cpp 2013-01-25 19:43:57.440049721 +0100 @@ -0,0 +1,94 @@ +//===-- AMDILDeviceInfo.cpp - AMDILDeviceInfo class -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Function that creates DeviceInfo from a device name and other information. +// +//==-----------------------------------------------------------------------===// +#include "AMDILDevices.h" +#include "AMDGPUSubtarget.h" + +using namespace llvm; +namespace llvm { +namespace AMDGPUDeviceInfo { + +AMDGPUDevice* getDeviceFromName(const std::string &deviceName, + AMDGPUSubtarget *ptr, + bool is64bit, bool is64on32bit) { + if (deviceName.c_str()[2] == '7') { + switch (deviceName.c_str()[3]) { + case '1': + return new AMDGPU710Device(ptr); + case '7': + return new AMDGPU770Device(ptr); + default: + return new AMDGPU7XXDevice(ptr); + } + } else if (deviceName == "cypress") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUCypressDevice(ptr); + } else if (deviceName == "juniper") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUEvergreenDevice(ptr); + } else if (deviceName == "redwood") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPURedwoodDevice(ptr); + } else if (deviceName == "cedar") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUCedarDevice(ptr); + } else if (deviceName == "barts" || deviceName == "turks") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUNIDevice(ptr); + } else if (deviceName == "cayman") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUCaymanDevice(ptr); + } else if (deviceName == "caicos") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUNIDevice(ptr); + } else if (deviceName == "SI") { + return new AMDGPUSIDevice(ptr); + } else { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPU7XXDevice(ptr); + } +} +} // End namespace AMDGPUDeviceInfo +} // End namespace llvm diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDeviceInfo.h llvm-r600/lib/Target/R600/AMDILDeviceInfo.h --- llvm-3.2.src/lib/Target/R600/AMDILDeviceInfo.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILDeviceInfo.h 2013-01-25 19:43:57.440049721 +0100 @@ -0,0 +1,88 @@ +//===-- AMDILDeviceInfo.h - Constants for describing devices --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//==-----------------------------------------------------------------------===// +#ifndef AMDILDEVICEINFO_H +#define AMDILDEVICEINFO_H + + +#include + +namespace llvm { + class AMDGPUDevice; + class AMDGPUSubtarget; + namespace AMDGPUDeviceInfo { + /// Each Capabilities can be executed using a hardware instruction, + /// emulated with a sequence of software instructions, or not + /// supported at all. + enum ExecutionMode { + Unsupported = 0, ///< Unsupported feature on the card(Default value) + /// This is the execution mode that is set if the feature is emulated in + /// software. + Software, + /// This execution mode is set if the feature exists natively in hardware + Hardware + }; + + enum Caps { + HalfOps = 0x1, ///< Half float is supported or not. + DoubleOps = 0x2, ///< Double is supported or not. + ByteOps = 0x3, ///< Byte(char) is support or not. + ShortOps = 0x4, ///< Short is supported or not. + LongOps = 0x5, ///< Long is supported or not. + Images = 0x6, ///< Images are supported or not. + ByteStores = 0x7, ///< ByteStores available(!HD4XXX). + ConstantMem = 0x8, ///< Constant/CB memory. + LocalMem = 0x9, ///< Local/LDS memory. + PrivateMem = 0xA, ///< Scratch/Private/Stack memory. + RegionMem = 0xB, ///< OCL GDS Memory Extension. + FMA = 0xC, ///< Use HW FMA or SW FMA. + ArenaSegment = 0xD, ///< Use for Arena UAV per pointer 12-1023. + MultiUAV = 0xE, ///< Use for UAV per Pointer 0-7. + Reserved0 = 0xF, ///< ReservedFlag + NoAlias = 0x10, ///< Cached loads. + Signed24BitOps = 0x11, ///< Peephole Optimization. + /// Debug mode implies that no hardware features or optimizations + /// are performned and that all memory access go through a single + /// uav(Arena on HD5XXX/HD6XXX and Raw on HD4XXX). + Debug = 0x12, + CachedMem = 0x13, ///< Cached mem is available or not. + BarrierDetect = 0x14, ///< Detect duplicate barriers. + Reserved1 = 0x15, ///< Reserved flag + ByteLDSOps = 0x16, ///< Flag to specify if byte LDS ops are available. + ArenaVectors = 0x17, ///< Flag to specify if vector loads from arena work. + TmrReg = 0x18, ///< Flag to specify if Tmr register is supported. + NoInline = 0x19, ///< Flag to specify that no inlining should occur. + MacroDB = 0x1A, ///< Flag to specify that backend handles macrodb. + HW64BitDivMod = 0x1B, ///< Flag for backend to generate 64bit div/mod. + ArenaUAV = 0x1C, ///< Flag to specify that arena uav is supported. + PrivateUAV = 0x1D, ///< Flag to specify that private memory uses uav's. + /// If more capabilities are required, then + /// this number needs to be increased. + /// All capabilities must come before this + /// number. + MaxNumberCapabilities = 0x20 + }; + /// These have to be in order with the older generations + /// having the lower number enumerations. + enum Generation { + HD4XXX = 0, ///< 7XX based devices. + HD5XXX, ///< Evergreen based devices. + HD6XXX, ///< NI/Evergreen+ based devices. + HD7XXX, ///< Southern Islands based devices. + HDTEST, ///< Experimental feature testing device. + HDNUMGEN + }; + + + AMDGPUDevice* + getDeviceFromName(const std::string &name, AMDGPUSubtarget *ptr, + bool is64bit = false, bool is64on32bit = false); + } // namespace AMDILDeviceInfo +} // namespace llvm +#endif // AMDILDEVICEINFO_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDevices.h llvm-r600/lib/Target/R600/AMDILDevices.h --- llvm-3.2.src/lib/Target/R600/AMDILDevices.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILDevices.h 2013-01-25 19:43:57.440049721 +0100 @@ -0,0 +1,19 @@ +//===-- AMDILDevices.h - Consolidate AMDIL Device headers -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//==-----------------------------------------------------------------------===// +#ifndef AMDIL_DEVICES_H +#define AMDIL_DEVICES_H +// Include all of the device specific header files +#include "AMDIL7XXDevice.h" +#include "AMDILDevice.h" +#include "AMDILEvergreenDevice.h" +#include "AMDILNIDevice.h" +#include "AMDILSIDevice.h" + +#endif // AMDIL_DEVICES_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILEvergreenDevice.cpp llvm-r600/lib/Target/R600/AMDILEvergreenDevice.cpp --- llvm-3.2.src/lib/Target/R600/AMDILEvergreenDevice.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILEvergreenDevice.cpp 2013-01-25 19:43:57.440049721 +0100 @@ -0,0 +1,169 @@ +//===-- AMDILEvergreenDevice.cpp - Device Info for Evergreen --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//==-----------------------------------------------------------------------===// +#include "AMDILEvergreenDevice.h" + +using namespace llvm; + +AMDGPUEvergreenDevice::AMDGPUEvergreenDevice(AMDGPUSubtarget *ST) +: AMDGPUDevice(ST) { + setCaps(); + std::string name = ST->getDeviceName(); + if (name == "cedar") { + DeviceFlag = OCL_DEVICE_CEDAR; + } else if (name == "redwood") { + DeviceFlag = OCL_DEVICE_REDWOOD; + } else if (name == "cypress") { + DeviceFlag = OCL_DEVICE_CYPRESS; + } else { + DeviceFlag = OCL_DEVICE_JUNIPER; + } +} + +AMDGPUEvergreenDevice::~AMDGPUEvergreenDevice() { +} + +size_t AMDGPUEvergreenDevice::getMaxLDSSize() const { + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { + return MAX_LDS_SIZE_800; + } else { + return 0; + } +} +size_t AMDGPUEvergreenDevice::getMaxGDSSize() const { + if (usesHardware(AMDGPUDeviceInfo::RegionMem)) { + return MAX_LDS_SIZE_800; + } else { + return 0; + } +} +uint32_t AMDGPUEvergreenDevice::getMaxNumUAVs() const { + return 12; +} + +uint32_t AMDGPUEvergreenDevice::getResourceID(uint32_t id) const { + switch(id) { + default: + assert(0 && "ID type passed in is unknown!"); + break; + case CONSTANT_ID: + case RAW_UAV_ID: + return GLOBAL_RETURN_RAW_UAV_ID; + case GLOBAL_ID: + case ARENA_UAV_ID: + return DEFAULT_ARENA_UAV_ID; + case LDS_ID: + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { + return DEFAULT_LDS_ID; + } else { + return DEFAULT_ARENA_UAV_ID; + } + case GDS_ID: + if (usesHardware(AMDGPUDeviceInfo::RegionMem)) { + return DEFAULT_GDS_ID; + } else { + return DEFAULT_ARENA_UAV_ID; + } + case SCRATCH_ID: + if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) { + return DEFAULT_SCRATCH_ID; + } else { + return DEFAULT_ARENA_UAV_ID; + } + }; + return 0; +} + +size_t AMDGPUEvergreenDevice::getWavefrontSize() const { + return AMDGPUDevice::WavefrontSize; +} + +uint32_t AMDGPUEvergreenDevice::getGeneration() const { + return AMDGPUDeviceInfo::HD5XXX; +} + +void AMDGPUEvergreenDevice::setCaps() { + mSWBits.set(AMDGPUDeviceInfo::ArenaSegment); + mHWBits.set(AMDGPUDeviceInfo::ArenaUAV); + mHWBits.set(AMDGPUDeviceInfo::HW64BitDivMod); + mSWBits.reset(AMDGPUDeviceInfo::HW64BitDivMod); + mSWBits.set(AMDGPUDeviceInfo::Signed24BitOps); + if (mSTM->isOverride(AMDGPUDeviceInfo::ByteStores)) { + mHWBits.set(AMDGPUDeviceInfo::ByteStores); + } + if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) { + mSWBits.set(AMDGPUDeviceInfo::LocalMem); + mSWBits.set(AMDGPUDeviceInfo::RegionMem); + } else { + mHWBits.set(AMDGPUDeviceInfo::LocalMem); + mHWBits.set(AMDGPUDeviceInfo::RegionMem); + } + mHWBits.set(AMDGPUDeviceInfo::Images); + if (mSTM->isOverride(AMDGPUDeviceInfo::NoAlias)) { + mHWBits.set(AMDGPUDeviceInfo::NoAlias); + } + mHWBits.set(AMDGPUDeviceInfo::CachedMem); + if (mSTM->isOverride(AMDGPUDeviceInfo::MultiUAV)) { + mHWBits.set(AMDGPUDeviceInfo::MultiUAV); + } + mHWBits.set(AMDGPUDeviceInfo::ByteLDSOps); + mSWBits.reset(AMDGPUDeviceInfo::ByteLDSOps); + mHWBits.set(AMDGPUDeviceInfo::ArenaVectors); + mHWBits.set(AMDGPUDeviceInfo::LongOps); + mSWBits.reset(AMDGPUDeviceInfo::LongOps); + mHWBits.set(AMDGPUDeviceInfo::TmrReg); +} + +AMDGPUCypressDevice::AMDGPUCypressDevice(AMDGPUSubtarget *ST) + : AMDGPUEvergreenDevice(ST) { + setCaps(); +} + +AMDGPUCypressDevice::~AMDGPUCypressDevice() { +} + +void AMDGPUCypressDevice::setCaps() { + if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) { + mHWBits.set(AMDGPUDeviceInfo::DoubleOps); + mHWBits.set(AMDGPUDeviceInfo::FMA); + } +} + + +AMDGPUCedarDevice::AMDGPUCedarDevice(AMDGPUSubtarget *ST) + : AMDGPUEvergreenDevice(ST) { + setCaps(); +} + +AMDGPUCedarDevice::~AMDGPUCedarDevice() { +} + +void AMDGPUCedarDevice::setCaps() { + mSWBits.set(AMDGPUDeviceInfo::FMA); +} + +size_t AMDGPUCedarDevice::getWavefrontSize() const { + return AMDGPUDevice::QuarterWavefrontSize; +} + +AMDGPURedwoodDevice::AMDGPURedwoodDevice(AMDGPUSubtarget *ST) + : AMDGPUEvergreenDevice(ST) { + setCaps(); +} + +AMDGPURedwoodDevice::~AMDGPURedwoodDevice() { +} + +void AMDGPURedwoodDevice::setCaps() { + mSWBits.set(AMDGPUDeviceInfo::FMA); +} + +size_t AMDGPURedwoodDevice::getWavefrontSize() const { + return AMDGPUDevice::HalfWavefrontSize; +} diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILEvergreenDevice.h llvm-r600/lib/Target/R600/AMDILEvergreenDevice.h --- llvm-3.2.src/lib/Target/R600/AMDILEvergreenDevice.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILEvergreenDevice.h 2013-01-25 19:43:57.440049721 +0100 @@ -0,0 +1,93 @@ +//==- AMDILEvergreenDevice.h - Define Evergreen Device for AMDIL -*- C++ -*--=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface for the subtarget data classes. +/// +/// This file will define the interface that each generation needs to +/// implement in order to correctly answer queries on the capabilities of the +/// specific hardware. +//===----------------------------------------------------------------------===// +#ifndef AMDILEVERGREENDEVICE_H +#define AMDILEVERGREENDEVICE_H +#include "AMDILDevice.h" +#include "AMDGPUSubtarget.h" + +namespace llvm { + class AMDGPUSubtarget; +//===----------------------------------------------------------------------===// +// Evergreen generation of devices and their respective sub classes +//===----------------------------------------------------------------------===// + + +/// \brief The AMDGPUEvergreenDevice is the base device class for all of the Evergreen +/// series of cards. +/// +/// This class contains information required to differentiate +/// the Evergreen device from the generic AMDGPUDevice. This device represents +/// that capabilities of the 'Juniper' cards, also known as the HD57XX. +class AMDGPUEvergreenDevice : public AMDGPUDevice { +public: + AMDGPUEvergreenDevice(AMDGPUSubtarget *ST); + virtual ~AMDGPUEvergreenDevice(); + virtual size_t getMaxLDSSize() const; + virtual size_t getMaxGDSSize() const; + virtual size_t getWavefrontSize() const; + virtual uint32_t getGeneration() const; + virtual uint32_t getMaxNumUAVs() const; + virtual uint32_t getResourceID(uint32_t) const; +protected: + virtual void setCaps(); +}; + +/// The AMDGPUCypressDevice is similiar to the AMDGPUEvergreenDevice, except it has +/// support for double precision operations. This device is used to represent +/// both the Cypress and Hemlock cards, which are commercially known as HD58XX +/// and HD59XX cards. +class AMDGPUCypressDevice : public AMDGPUEvergreenDevice { +public: + AMDGPUCypressDevice(AMDGPUSubtarget *ST); + virtual ~AMDGPUCypressDevice(); +private: + virtual void setCaps(); +}; + + +/// \brief The AMDGPUCedarDevice is the class that represents all of the 'Cedar' based +/// devices. +/// +/// This class differs from the base AMDGPUEvergreenDevice in that the +/// device is a ~quarter of the 'Juniper'. These are commercially known as the +/// HD54XX and HD53XX series of cards. +class AMDGPUCedarDevice : public AMDGPUEvergreenDevice { +public: + AMDGPUCedarDevice(AMDGPUSubtarget *ST); + virtual ~AMDGPUCedarDevice(); + virtual size_t getWavefrontSize() const; +private: + virtual void setCaps(); +}; + +/// \brief The AMDGPURedwoodDevice is the class the represents all of the 'Redwood' based +/// devices. +/// +/// This class differs from the base class, in that these devices are +/// considered about half of a 'Juniper' device. These are commercially known as +/// the HD55XX and HD56XX series of cards. +class AMDGPURedwoodDevice : public AMDGPUEvergreenDevice { +public: + AMDGPURedwoodDevice(AMDGPUSubtarget *ST); + virtual ~AMDGPURedwoodDevice(); + virtual size_t getWavefrontSize() const; +private: + virtual void setCaps(); +}; + +} // namespace llvm +#endif // AMDILEVERGREENDEVICE_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILFrameLowering.cpp llvm-r600/lib/Target/R600/AMDILFrameLowering.cpp --- llvm-3.2.src/lib/Target/R600/AMDILFrameLowering.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILFrameLowering.cpp 2013-01-25 19:43:57.440049721 +0100 @@ -0,0 +1,47 @@ +//===----------------------- AMDILFrameLowering.cpp -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface to describe a layout of a stack frame on a AMDGPU target +/// machine. +// +//===----------------------------------------------------------------------===// +#include "AMDILFrameLowering.h" +#include "llvm/CodeGen/MachineFrameInfo.h" + +using namespace llvm; +AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, + int LAO, unsigned TransAl) + : TargetFrameLowering(D, StackAl, LAO, TransAl) { +} + +AMDGPUFrameLowering::~AMDGPUFrameLowering() { +} + +int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + return MFI->getObjectOffset(FI); +} + +const TargetFrameLowering::SpillSlot * +AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { + NumEntries = 0; + return 0; +} +void +AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const { +} +void +AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { +} +bool +AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { + return false; +} diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILFrameLowering.h llvm-r600/lib/Target/R600/AMDILFrameLowering.h --- llvm-3.2.src/lib/Target/R600/AMDILFrameLowering.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILFrameLowering.h 2013-01-25 19:43:57.443383054 +0100 @@ -0,0 +1,40 @@ +//===--------------------- AMDILFrameLowering.h -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface to describe a layout of a stack frame on a AMDIL target +/// machine. +// +//===----------------------------------------------------------------------===// +#ifndef AMDILFRAME_LOWERING_H +#define AMDILFRAME_LOWERING_H + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + +/// \brief Information about the stack frame layout on the AMDGPU targets. +/// +/// It holds the direction of the stack growth, the known stack alignment on +/// entry to each function, and the offset to the locals area. +/// See TargetFrameInfo for more comments. +class AMDGPUFrameLowering : public TargetFrameLowering { +public: + AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, + unsigned TransAl = 1); + virtual ~AMDGPUFrameLowering(); + virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const; + virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const; + virtual void emitPrologue(MachineFunction &MF) const; + virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + virtual bool hasFP(const MachineFunction &MF) const; +}; +} // namespace llvm +#endif // AMDILFRAME_LOWERING_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDIL.h llvm-r600/lib/Target/R600/AMDIL.h --- llvm-3.2.src/lib/Target/R600/AMDIL.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDIL.h 2013-01-25 19:43:57.433383055 +0100 @@ -0,0 +1,122 @@ +//===-- AMDIL.h - Top-level interface for AMDIL representation --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// This file contains the entry points for global functions defined in the LLVM +/// AMDGPU back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDIL_H +#define AMDIL_H + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Target/TargetMachine.h" + +#define ARENA_SEGMENT_RESERVED_UAVS 12 +#define DEFAULT_ARENA_UAV_ID 8 +#define DEFAULT_RAW_UAV_ID 7 +#define GLOBAL_RETURN_RAW_UAV_ID 11 +#define HW_MAX_NUM_CB 8 +#define MAX_NUM_UNIQUE_UAVS 8 +#define OPENCL_MAX_NUM_ATOMIC_COUNTERS 8 +#define OPENCL_MAX_READ_IMAGES 128 +#define OPENCL_MAX_WRITE_IMAGES 8 +#define OPENCL_MAX_SAMPLERS 16 + +// The next two values can never be zero, as zero is the ID that is +// used to assert against. +#define DEFAULT_LDS_ID 1 +#define DEFAULT_GDS_ID 1 +#define DEFAULT_SCRATCH_ID 1 +#define DEFAULT_VEC_SLOTS 8 + +#define OCL_DEVICE_RV710 0x0001 +#define OCL_DEVICE_RV730 0x0002 +#define OCL_DEVICE_RV770 0x0004 +#define OCL_DEVICE_CEDAR 0x0008 +#define OCL_DEVICE_REDWOOD 0x0010 +#define OCL_DEVICE_JUNIPER 0x0020 +#define OCL_DEVICE_CYPRESS 0x0040 +#define OCL_DEVICE_CAICOS 0x0080 +#define OCL_DEVICE_TURKS 0x0100 +#define OCL_DEVICE_BARTS 0x0200 +#define OCL_DEVICE_CAYMAN 0x0400 +#define OCL_DEVICE_ALL 0x3FFF + +/// The number of function ID's that are reserved for +/// internal compiler usage. +const unsigned int RESERVED_FUNCS = 1024; + +namespace llvm { +class AMDGPUInstrPrinter; +class FunctionPass; +class MCAsmInfo; +class raw_ostream; +class Target; +class TargetMachine; + +// Instruction selection passes. +FunctionPass* + createAMDGPUISelDag(TargetMachine &TM); +FunctionPass* + createAMDGPUPeepholeOpt(TargetMachine &TM); + +// Pre emit passes. +FunctionPass* + createAMDGPUCFGPreparationPass(TargetMachine &TM); +FunctionPass* + createAMDGPUCFGStructurizerPass(TargetMachine &TM); + +extern Target TheAMDGPUTarget; +} // end namespace llvm; + +// Include device information enumerations +#include "AMDILDeviceInfo.h" + +namespace llvm { +/// OpenCL uses address spaces to differentiate between +/// various memory regions on the hardware. On the CPU +/// all of the address spaces point to the same memory, +/// however on the GPU, each address space points to +/// a seperate piece of memory that is unique from other +/// memory locations. +namespace AMDGPUAS { +enum AddressSpaces { + PRIVATE_ADDRESS = 0, ///< Address space for private memory. + GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). + CONSTANT_ADDRESS = 2, ///< Address space for constant memory + LOCAL_ADDRESS = 3, ///< Address space for local memory. + REGION_ADDRESS = 4, ///< Address space for region memory. + ADDRESS_NONE = 5, ///< Address space for unknown memory. + PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) + PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) + USER_SGPR_ADDRESS = 8, ///< Address space for USER_SGPRS on SI + CONSTANT_BUFFER_0 = 9, + CONSTANT_BUFFER_1 = 10, + CONSTANT_BUFFER_2 = 11, + CONSTANT_BUFFER_3 = 12, + CONSTANT_BUFFER_4 = 13, + CONSTANT_BUFFER_5 = 14, + CONSTANT_BUFFER_6 = 15, + CONSTANT_BUFFER_7 = 16, + CONSTANT_BUFFER_8 = 17, + CONSTANT_BUFFER_9 = 18, + CONSTANT_BUFFER_10 = 19, + CONSTANT_BUFFER_11 = 20, + CONSTANT_BUFFER_12 = 21, + CONSTANT_BUFFER_13 = 22, + CONSTANT_BUFFER_14 = 23, + CONSTANT_BUFFER_15 = 24, + LAST_ADDRESS = 25 +}; + +} // namespace AMDGPUAS + +} // end namespace llvm +#endif // AMDIL_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILInstrInfo.td llvm-r600/lib/Target/R600/AMDILInstrInfo.td --- llvm-3.2.src/lib/Target/R600/AMDILInstrInfo.td 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILInstrInfo.td 2013-01-25 19:43:57.443383054 +0100 @@ -0,0 +1,208 @@ +//===------------ AMDILInstrInfo.td - AMDIL Target ------*-tablegen-*------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file describes the AMDIL instructions in TableGen format. +// +//===----------------------------------------------------------------------===// +// AMDIL Instruction Predicate Definitions +// Predicate that is set to true if the hardware supports double precision +// divide +def HasHWDDiv : Predicate<"Subtarget.device()" + "->getGeneration() > AMDGPUDeviceInfo::HD4XXX && " + "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">; + +// Predicate that is set to true if the hardware supports double, but not double +// precision divide in hardware +def HasSWDDiv : Predicate<"Subtarget.device()" + "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&" + "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">; + +// Predicate that is set to true if the hardware support 24bit signed +// math ops. Otherwise a software expansion to 32bit math ops is used instead. +def HasHWSign24Bit : Predicate<"Subtarget.device()" + "->getGeneration() > AMDGPUDeviceInfo::HD5XXX">; + +// Predicate that is set to true if 64bit operations are supported or not +def HasHW64Bit : Predicate<"Subtarget.device()" + "->usesHardware(AMDGPUDeviceInfo::LongOps)">; +def HasSW64Bit : Predicate<"Subtarget.device()" + "->usesSoftware(AMDGPUDeviceInfo::LongOps)">; + +// Predicate that is set to true if the timer register is supported +def HasTmrRegister : Predicate<"Subtarget.device()" + "->isSupported(AMDGPUDeviceInfo::TmrReg)">; +// Predicate that is true if we are at least evergreen series +def HasDeviceIDInst : Predicate<"Subtarget.device()" + "->getGeneration() >= AMDGPUDeviceInfo::HD5XXX">; + +// Predicate that is true if we have region address space. +def hasRegionAS : Predicate<"Subtarget.device()" + "->usesHardware(AMDGPUDeviceInfo::RegionMem)">; + +// Predicate that is false if we don't have region address space. +def noRegionAS : Predicate<"!Subtarget.device()" + "->isSupported(AMDGPUDeviceInfo::RegionMem)">; + + +// Predicate that is set to true if 64bit Mul is supported in the IL or not +def HasHW64Mul : Predicate<"Subtarget.calVersion()" + ">= CAL_VERSION_SC_139" + "&& Subtarget.device()" + "->getGeneration() >=" + "AMDGPUDeviceInfo::HD5XXX">; +def HasSW64Mul : Predicate<"Subtarget.calVersion()" + "< CAL_VERSION_SC_139">; +// Predicate that is set to true if 64bit Div/Mod is supported in the IL or not +def HasHW64DivMod : Predicate<"Subtarget.device()" + "->usesHardware(AMDGPUDeviceInfo::HW64BitDivMod)">; +def HasSW64DivMod : Predicate<"Subtarget.device()" + "->usesSoftware(AMDGPUDeviceInfo::HW64BitDivMod)">; + +// Predicate that is set to true if 64bit pointer are used. +def Has64BitPtr : Predicate<"Subtarget.is64bit()">; +def Has32BitPtr : Predicate<"!Subtarget.is64bit()">; +//===--------------------------------------------------------------------===// +// Custom Operands +//===--------------------------------------------------------------------===// +def brtarget : Operand; + +//===--------------------------------------------------------------------===// +// Custom Selection DAG Type Profiles +//===--------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// Generic Profile Types +//===----------------------------------------------------------------------===// + +def SDTIL_GenBinaryOp : SDTypeProfile<1, 2, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2> + ]>; +def SDTIL_GenTernaryOp : SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3> + ]>; +def SDTIL_GenVecBuild : SDTypeProfile<1, 1, [ + SDTCisEltOfVec<1, 0> + ]>; + +//===----------------------------------------------------------------------===// +// Flow Control Profile Types +//===----------------------------------------------------------------------===// +// Branch instruction where second and third are basic blocks +def SDTIL_BRCond : SDTypeProfile<0, 2, [ + SDTCisVT<0, OtherVT> + ]>; + +//===--------------------------------------------------------------------===// +// Custom Selection DAG Nodes +//===--------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// Flow Control DAG Nodes +//===----------------------------------------------------------------------===// +def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>; + +//===----------------------------------------------------------------------===// +// Call/Return DAG Nodes +//===----------------------------------------------------------------------===// +def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; + +//===--------------------------------------------------------------------===// +// Instructions +//===--------------------------------------------------------------------===// +// Floating point math functions +def IL_div_inf : SDNode<"AMDGPUISD::DIV_INF", SDTIL_GenBinaryOp>; +def IL_mad : SDNode<"AMDGPUISD::MAD", SDTIL_GenTernaryOp>; + +//===----------------------------------------------------------------------===// +// Integer functions +//===----------------------------------------------------------------------===// +def IL_umul : SDNode<"AMDGPUISD::UMUL" , SDTIntBinOp, + [SDNPCommutative, SDNPAssociative]>; + +//===--------------------------------------------------------------------===// +// Custom Pattern DAG Nodes +//===--------------------------------------------------------------------===// +def global_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast(N)); +}]>; + +//===----------------------------------------------------------------------===// +// Load pattern fragments +//===----------------------------------------------------------------------===// +// Global address space loads +def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isGlobalLoad(dyn_cast(N)); +}]>; +// Constant address space loads +def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isConstantLoad(dyn_cast(N), -1); +}]>; + +//===----------------------------------------------------------------------===// +// Complex addressing mode patterns +//===----------------------------------------------------------------------===// +def ADDR : ComplexPattern; +def ADDRF : ComplexPattern; +def ADDR64 : ComplexPattern; +def ADDR64F : ComplexPattern; + +//===----------------------------------------------------------------------===// +// Instruction format classes +//===----------------------------------------------------------------------===// +class ILFormat pattern> +: Instruction { + + let Namespace = "AMDGPU"; + dag OutOperandList = outs; + dag InOperandList = ins; + let Pattern = pattern; + let AsmString = !strconcat(asmstr, "\n"); + let isPseudo = 1; + let Itinerary = NullALU; + bit hasIEEEFlag = 0; + bit hasZeroOpFlag = 0; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +//===--------------------------------------------------------------------===// +// Multiclass Instruction formats +//===--------------------------------------------------------------------===// +// Multiclass that handles branch instructions +multiclass BranchConditional { + def _i32 : ILFormat<(outs), + (ins brtarget:$target, GPRI32:$src0), + "; i32 Pseudo branch instruction", + [(Op bb:$target, GPRI32:$src0)]>; + def _f32 : ILFormat<(outs), + (ins brtarget:$target, GPRF32:$src0), + "; f32 Pseudo branch instruction", + [(Op bb:$target, GPRF32:$src0)]>; +} + +// Only scalar types should generate flow control +multiclass BranchInstr { + def _i32 : ILFormat<(outs), (ins GPRI32:$src), + !strconcat(name, " $src"), []>; + def _f32 : ILFormat<(outs), (ins GPRF32:$src), + !strconcat(name, " $src"), []>; +} +// Only scalar types should generate flow control +multiclass BranchInstr2 { + def _i32 : ILFormat<(outs), (ins GPRI32:$src0, GPRI32:$src1), + !strconcat(name, " $src0, $src1"), []>; + def _f32 : ILFormat<(outs), (ins GPRF32:$src0, GPRF32:$src1), + !strconcat(name, " $src0, $src1"), []>; +} + +//===--------------------------------------------------------------------===// +// Intrinsics support +//===--------------------------------------------------------------------===// +include "AMDILIntrinsics.td" diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILIntrinsicInfo.cpp llvm-r600/lib/Target/R600/AMDILIntrinsicInfo.cpp --- llvm-3.2.src/lib/Target/R600/AMDILIntrinsicInfo.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILIntrinsicInfo.cpp 2013-01-25 19:43:57.446716388 +0100 @@ -0,0 +1,79 @@ +//===- AMDILIntrinsicInfo.cpp - AMDGPU Intrinsic Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Implementation of the IntrinsicInfo class. +// +//===-----------------------------------------------------------------------===// + +#include "AMDILIntrinsicInfo.h" +#include "AMDIL.h" +#include "AMDGPUSubtarget.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Intrinsics.h" +#include "llvm/Module.h" + +using namespace llvm; + +#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN +#include "AMDGPUGenIntrinsics.inc" +#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN + +AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm) + : TargetIntrinsicInfo() { +} + +std::string +AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys, + unsigned int numTys) const { + static const char* const names[] = { +#define GET_INTRINSIC_NAME_TABLE +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_NAME_TABLE + }; + + if (IntrID < Intrinsic::num_intrinsics) { + return 0; + } + assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics + && "Invalid intrinsic ID"); + + std::string Result(names[IntrID - Intrinsic::num_intrinsics]); + return Result; +} + +unsigned int +AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const { +#define GET_FUNCTION_RECOGNIZER +#include "AMDGPUGenIntrinsics.inc" +#undef GET_FUNCTION_RECOGNIZER + AMDGPUIntrinsic::ID IntrinsicID + = (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; + IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name); + + if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) { + return IntrinsicID; + } + return 0; +} + +bool +AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { + // Overload Table +#define GET_INTRINSIC_OVERLOAD_TABLE +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_OVERLOAD_TABLE +} + +Function* +AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + Type **Tys, + unsigned numTys) const { + assert(!"Not implemented"); +} diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILIntrinsicInfo.h llvm-r600/lib/Target/R600/AMDILIntrinsicInfo.h --- llvm-3.2.src/lib/Target/R600/AMDILIntrinsicInfo.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILIntrinsicInfo.h 2013-01-25 19:43:57.446716388 +0100 @@ -0,0 +1,49 @@ +//===- AMDILIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. +// +//===-----------------------------------------------------------------------===// +#ifndef AMDIL_INTRINSICS_H +#define AMDIL_INTRINSICS_H + +#include "llvm/Intrinsics.h" +#include "llvm/Target/TargetIntrinsicInfo.h" + +namespace llvm { +class TargetMachine; + +namespace AMDGPUIntrinsic { +enum ID { + last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, +#define GET_INTRINSIC_ENUM_VALUES +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_ENUM_VALUES + , num_AMDGPU_intrinsics +}; + +} // end namespace AMDGPUIntrinsic + +class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { +public: + AMDGPUIntrinsicInfo(TargetMachine *tm); + std::string getName(unsigned int IntrId, Type **Tys = 0, + unsigned int numTys = 0) const; + unsigned int lookupName(const char *Name, unsigned int Len) const; + bool isOverloaded(unsigned int IID) const; + Function *getDeclaration(Module *M, unsigned int ID, + Type **Tys = 0, + unsigned int numTys = 0) const; +}; + +} // end namespace llvm + +#endif // AMDIL_INTRINSICS_H + diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILIntrinsics.td llvm-r600/lib/Target/R600/AMDILIntrinsics.td --- llvm-3.2.src/lib/Target/R600/AMDILIntrinsics.td 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILIntrinsics.td 2013-01-25 19:43:57.446716388 +0100 @@ -0,0 +1,242 @@ +//===- AMDILIntrinsics.td - Defines AMDIL Intrinscs -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file defines all of the amdil-specific intrinsics +// +//===---------------------------------------------------------------===// +//===--------------------------------------------------------------------===// +// Intrinsic classes +// Generic versions of the above classes but for Target specific intrinsics +// instead of SDNode patterns. +//===--------------------------------------------------------------------===// +let TargetPrefix = "AMDIL", isTarget = 1 in { + class VoidIntLong : + Intrinsic<[llvm_i64_ty], [], []>; + class VoidIntInt : + Intrinsic<[llvm_i32_ty], [], []>; + class VoidIntBool : + Intrinsic<[llvm_i32_ty], [], []>; + class UnaryIntInt : + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class UnaryIntFloat : + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class ConvertIntFTOI : + Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>; + class ConvertIntITOF : + Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty], [IntrNoMem]>; + class UnaryIntNoRetInt : + Intrinsic<[], [llvm_anyint_ty], []>; + class UnaryIntNoRetFloat : + Intrinsic<[], [llvm_anyfloat_ty], []>; + class BinaryIntInt : + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class BinaryIntFloat : + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class BinaryIntNoRetInt : + Intrinsic<[], [llvm_anyint_ty, LLVMMatchType<0>], []>; + class BinaryIntNoRetFloat : + Intrinsic<[], [llvm_anyfloat_ty, LLVMMatchType<0>], []>; + class TernaryIntInt : + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class TernaryIntFloat : + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class QuaternaryIntInt : + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class UnaryAtomicInt : + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + class BinaryAtomicInt : + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + class TernaryAtomicInt : + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; + class UnaryAtomicIntNoRet : + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + class BinaryAtomicIntNoRet : + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + class TernaryAtomicIntNoRet : + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; +} + +let TargetPrefix = "AMDIL", isTarget = 1 in { + def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt; + + def int_AMDIL_bit_extract_i32 : GCCBuiltin<"__amdil_ibit_extract">, + TernaryIntInt; + def int_AMDIL_bit_extract_u32 : GCCBuiltin<"__amdil_ubit_extract">, + TernaryIntInt; + def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">, + UnaryIntInt; + def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">, + UnaryIntInt; + def int_AMDIL_bit_find_first_lo : GCCBuiltin<"__amdil_ffb_lo">, + UnaryIntInt; + def int_AMDIL_bit_find_first_hi : GCCBuiltin<"__amdil_ffb_hi">, + UnaryIntInt; + def int_AMDIL_bit_find_first_sgn : GCCBuiltin<"__amdil_ffb_signed">, + UnaryIntInt; + def int_AMDIL_media_bitalign : GCCBuiltin<"__amdil_bitalign">, + TernaryIntInt; + def int_AMDIL_media_bytealign : GCCBuiltin<"__amdil_bytealign">, + TernaryIntInt; + def int_AMDIL_bit_insert_u32 : GCCBuiltin<"__amdil_ubit_insert">, + QuaternaryIntInt; + def int_AMDIL_bfi : GCCBuiltin<"__amdil_bfi">, + TernaryIntInt; + def int_AMDIL_bfm : GCCBuiltin<"__amdil_bfm">, + BinaryIntInt; + def int_AMDIL_mad_i32 : GCCBuiltin<"__amdil_imad">, + TernaryIntInt; + def int_AMDIL_mad_u32 : GCCBuiltin<"__amdil_umad">, + TernaryIntInt; + def int_AMDIL_mad : GCCBuiltin<"__amdil_mad">, + TernaryIntFloat; + def int_AMDIL_mulhi_i32 : GCCBuiltin<"__amdil_imul_high">, + BinaryIntInt; + def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">, + BinaryIntInt; + def int_AMDIL_mul24_i32 : GCCBuiltin<"__amdil_imul24">, + BinaryIntInt; + def int_AMDIL_mul24_u32 : GCCBuiltin<"__amdil_umul24">, + BinaryIntInt; + def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">, + BinaryIntInt; + def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">, + BinaryIntInt; + def int_AMDIL_mad24_i32 : GCCBuiltin<"__amdil_imad24">, + TernaryIntInt; + def int_AMDIL_mad24_u32 : GCCBuiltin<"__amdil_umad24">, + TernaryIntInt; + def int_AMDIL_carry_i32 : GCCBuiltin<"__amdil_carry">, + BinaryIntInt; + def int_AMDIL_borrow_i32 : GCCBuiltin<"__amdil_borrow">, + BinaryIntInt; + def int_AMDIL_min_i32 : GCCBuiltin<"__amdil_imin">, + BinaryIntInt; + def int_AMDIL_min_u32 : GCCBuiltin<"__amdil_umin">, + BinaryIntInt; + def int_AMDIL_min : GCCBuiltin<"__amdil_min">, + BinaryIntFloat; + def int_AMDIL_max_i32 : GCCBuiltin<"__amdil_imax">, + BinaryIntInt; + def int_AMDIL_max_u32 : GCCBuiltin<"__amdil_umax">, + BinaryIntInt; + def int_AMDIL_max : GCCBuiltin<"__amdil_max">, + BinaryIntFloat; + def int_AMDIL_media_lerp_u4 : GCCBuiltin<"__amdil_u4lerp">, + TernaryIntInt; + def int_AMDIL_media_sad : GCCBuiltin<"__amdil_sad">, + TernaryIntInt; + def int_AMDIL_media_sad_hi : GCCBuiltin<"__amdil_sadhi">, + TernaryIntInt; + def int_AMDIL_fraction : GCCBuiltin<"__amdil_fraction">, + UnaryIntFloat; + def int_AMDIL_clamp : GCCBuiltin<"__amdil_clamp">, + TernaryIntFloat; + def int_AMDIL_pireduce : GCCBuiltin<"__amdil_pireduce">, + UnaryIntFloat; + def int_AMDIL_round_nearest : GCCBuiltin<"__amdil_round_nearest">, + UnaryIntFloat; + def int_AMDIL_round_neginf : GCCBuiltin<"__amdil_round_neginf">, + UnaryIntFloat; + def int_AMDIL_round_zero : GCCBuiltin<"__amdil_round_zero">, + UnaryIntFloat; + def int_AMDIL_acos : GCCBuiltin<"__amdil_acos">, + UnaryIntFloat; + def int_AMDIL_atan : GCCBuiltin<"__amdil_atan">, + UnaryIntFloat; + def int_AMDIL_asin : GCCBuiltin<"__amdil_asin">, + UnaryIntFloat; + def int_AMDIL_cos : GCCBuiltin<"__amdil_cos">, + UnaryIntFloat; + def int_AMDIL_cos_vec : GCCBuiltin<"__amdil_cos_vec">, + UnaryIntFloat; + def int_AMDIL_tan : GCCBuiltin<"__amdil_tan">, + UnaryIntFloat; + def int_AMDIL_sin : GCCBuiltin<"__amdil_sin">, + UnaryIntFloat; + def int_AMDIL_sin_vec : GCCBuiltin<"__amdil_sin_vec">, + UnaryIntFloat; + def int_AMDIL_pow : GCCBuiltin<"__amdil_pow">, BinaryIntFloat; + def int_AMDIL_div : GCCBuiltin<"__amdil_div">, BinaryIntFloat; + def int_AMDIL_udiv : GCCBuiltin<"__amdil_udiv">, BinaryIntInt; + def int_AMDIL_sqrt: GCCBuiltin<"__amdil_sqrt">, + UnaryIntFloat; + def int_AMDIL_sqrt_vec: GCCBuiltin<"__amdil_sqrt_vec">, + UnaryIntFloat; + def int_AMDIL_exp : GCCBuiltin<"__amdil_exp">, + UnaryIntFloat; + def int_AMDIL_exp_vec : GCCBuiltin<"__amdil_exp_vec">, + UnaryIntFloat; + def int_AMDIL_exn : GCCBuiltin<"__amdil_exn">, + UnaryIntFloat; + def int_AMDIL_log_vec : GCCBuiltin<"__amdil_log_vec">, + UnaryIntFloat; + def int_AMDIL_ln : GCCBuiltin<"__amdil_ln">, + UnaryIntFloat; + def int_AMDIL_sign: GCCBuiltin<"__amdil_sign">, + UnaryIntFloat; + def int_AMDIL_fma: GCCBuiltin<"__amdil_fma">, + TernaryIntFloat; + def int_AMDIL_rsq : GCCBuiltin<"__amdil_rsq">, + UnaryIntFloat; + def int_AMDIL_rsq_vec : GCCBuiltin<"__amdil_rsq_vec">, + UnaryIntFloat; + def int_AMDIL_length : GCCBuiltin<"__amdil_length">, + UnaryIntFloat; + def int_AMDIL_lerp : GCCBuiltin<"__amdil_lerp">, + TernaryIntFloat; + def int_AMDIL_media_sad4 : GCCBuiltin<"__amdil_sad4">, + Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty, + llvm_v4i32_ty, llvm_i32_ty], []>; + + def int_AMDIL_frexp_f64 : GCCBuiltin<"__amdil_frexp">, + Intrinsic<[llvm_v2i64_ty], [llvm_double_ty], []>; + def int_AMDIL_ldexp : GCCBuiltin<"__amdil_ldexp">, + Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyint_ty], []>; + def int_AMDIL_drcp : GCCBuiltin<"__amdil_rcp">, + Intrinsic<[llvm_double_ty], [llvm_double_ty], []>; + def int_AMDIL_convert_f16_f32 : GCCBuiltin<"__amdil_half_to_float">, + ConvertIntITOF; + def int_AMDIL_convert_f32_f16 : GCCBuiltin<"__amdil_float_to_half">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_i32_rpi : GCCBuiltin<"__amdil_float_to_int_rpi">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_i32_flr : GCCBuiltin<"__amdil_float_to_int_flr">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_f16_near : GCCBuiltin<"__amdil_float_to_half_near">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_f16_neg_inf : GCCBuiltin<"__amdil_float_to_half_neg_inf">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_f16_plus_inf : GCCBuiltin<"__amdil_float_to_half_plus_inf">, + ConvertIntFTOI; + def int_AMDIL_media_convert_f2v4u8 : GCCBuiltin<"__amdil_f_2_u4">, + Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], []>; + def int_AMDIL_media_unpack_byte_0 : GCCBuiltin<"__amdil_unpack_0">, + ConvertIntITOF; + def int_AMDIL_media_unpack_byte_1 : GCCBuiltin<"__amdil_unpack_1">, + ConvertIntITOF; + def int_AMDIL_media_unpack_byte_2 : GCCBuiltin<"__amdil_unpack_2">, + ConvertIntITOF; + def int_AMDIL_media_unpack_byte_3 : GCCBuiltin<"__amdil_unpack_3">, + ConvertIntITOF; + def int_AMDIL_dp2_add : GCCBuiltin<"__amdil_dp2_add">, + Intrinsic<[llvm_float_ty], [llvm_v2f32_ty, + llvm_v2f32_ty, llvm_float_ty], []>; + def int_AMDIL_dp2 : GCCBuiltin<"__amdil_dp2">, + Intrinsic<[llvm_float_ty], [llvm_v2f32_ty, + llvm_v2f32_ty], []>; + def int_AMDIL_dp3 : GCCBuiltin<"__amdil_dp3">, + Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, + llvm_v4f32_ty], []>; + def int_AMDIL_dp4 : GCCBuiltin<"__amdil_dp4">, + Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, + llvm_v4f32_ty], []>; +} diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILISelDAGToDAG.cpp llvm-r600/lib/Target/R600/AMDILISelDAGToDAG.cpp --- llvm-3.2.src/lib/Target/R600/AMDILISelDAGToDAG.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILISelDAGToDAG.cpp 2013-01-25 19:43:57.443383054 +0100 @@ -0,0 +1,567 @@ +//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Defines an instruction selector for the AMDGPU target. +// +//===----------------------------------------------------------------------===// +#include "AMDGPUInstrInfo.h" +#include "AMDGPUISelLowering.h" // For AMDGPUISD +#include "AMDGPURegisterInfo.h" +#include "AMDILDevices.h" +#include "R600InstrInfo.h" +#include "llvm/ADT/ValueMap.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Support/Compiler.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include +#include + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Instruction Selector Implementation +//===----------------------------------------------------------------------===// + +namespace { +/// AMDGPU specific code to select AMDGPU machine instructions for +/// SelectionDAG operations. +class AMDGPUDAGToDAGISel : public SelectionDAGISel { + // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can + // make the right decision when generating code for different targets. + const AMDGPUSubtarget &Subtarget; +public: + AMDGPUDAGToDAGISel(TargetMachine &TM); + virtual ~AMDGPUDAGToDAGISel(); + + SDNode *Select(SDNode *N); + virtual const char *getPassName() const; + +private: + inline SDValue getSmallIPtrImm(unsigned Imm); + bool FoldOperands(unsigned, const R600InstrInfo *, std::vector &); + + // Complex pattern selectors + bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); + bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); + bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); + + static bool checkType(const Value *ptr, unsigned int addrspace); + static const Value *getBasePointerValue(const Value *V); + + static bool isGlobalStore(const StoreSDNode *N); + static bool isPrivateStore(const StoreSDNode *N); + static bool isLocalStore(const StoreSDNode *N); + static bool isRegionStore(const StoreSDNode *N); + + static bool isCPLoad(const LoadSDNode *N); + static bool isConstantLoad(const LoadSDNode *N, int cbID); + static bool isGlobalLoad(const LoadSDNode *N); + static bool isParamLoad(const LoadSDNode *N); + static bool isPrivateLoad(const LoadSDNode *N); + static bool isLocalLoad(const LoadSDNode *N); + static bool isRegionLoad(const LoadSDNode *N); + + bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); + bool SelectGlobalValueVariableOffset(SDValue Addr, + SDValue &BaseReg, SDValue& Offset); + bool SelectADDR8BitOffset(SDValue Addr, SDValue& Base, SDValue& Offset); + bool SelectADDRReg(SDValue Addr, SDValue& Base, SDValue& Offset); + bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); + + // Include the pieces autogenerated from the target description. +#include "AMDGPUGenDAGISel.inc" +}; +} // end anonymous namespace + +/// \brief This pass converts a legalized DAG into a AMDGPU-specific +// DAG, ready for instruction scheduling. +FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM + ) { + return new AMDGPUDAGToDAGISel(TM); +} + +AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM + ) + : SelectionDAGISel(TM), Subtarget(TM.getSubtarget()) { +} + +AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() { +} + +SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); +} + +bool AMDGPUDAGToDAGISel::SelectADDRParam( + SDValue Addr, SDValue& R1, SDValue& R2) { + + if (Addr.getOpcode() == ISD::FrameIndex) { + if (FrameIndexSDNode *FIN = dyn_cast(Addr)) { + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + R2 = CurDAG->getTargetConstant(0, MVT::i32); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, MVT::i32); + } + } else if (Addr.getOpcode() == ISD::ADD) { + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, MVT::i32); + } + return true; +} + +bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + return SelectADDRParam(Addr, R1, R2); +} + + +bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + + if (Addr.getOpcode() == ISD::FrameIndex) { + if (FrameIndexSDNode *FIN = dyn_cast(Addr)) { + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64); + R2 = CurDAG->getTargetConstant(0, MVT::i64); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, MVT::i64); + } + } else if (Addr.getOpcode() == ISD::ADD) { + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, MVT::i64); + } + return true; +} + +SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { + unsigned int Opc = N->getOpcode(); + if (N->isMachineOpcode()) { + return NULL; // Already selected. + } + switch (Opc) { + default: break; + case ISD::FrameIndex: { + if (FrameIndexSDNode *FIN = dyn_cast(N)) { + unsigned int FI = FIN->getIndex(); + EVT OpVT = N->getValueType(0); + unsigned int NewOpc = AMDGPU::COPY; + SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i32); + return CurDAG->SelectNodeTo(N, NewOpc, OpVT, TFI); + } + break; + } + case ISD::ConstantFP: + case ISD::Constant: { + const AMDGPUSubtarget &ST = TM.getSubtarget(); + // XXX: Custom immediate lowering not implemented yet. Instead we use + // pseudo instructions defined in SIInstructions.td + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + break; + } + const R600InstrInfo *TII = static_cast(TM.getInstrInfo()); + + uint64_t ImmValue = 0; + unsigned ImmReg = AMDGPU::ALU_LITERAL_X; + + if (N->getOpcode() == ISD::ConstantFP) { + // XXX: 64-bit Immediates not supported yet + assert(N->getValueType(0) != MVT::f64); + + ConstantFPSDNode *C = dyn_cast(N); + APFloat Value = C->getValueAPF(); + float FloatValue = Value.convertToFloat(); + if (FloatValue == 0.0) { + ImmReg = AMDGPU::ZERO; + } else if (FloatValue == 0.5) { + ImmReg = AMDGPU::HALF; + } else if (FloatValue == 1.0) { + ImmReg = AMDGPU::ONE; + } else { + ImmValue = Value.bitcastToAPInt().getZExtValue(); + } + } else { + // XXX: 64-bit Immediates not supported yet + assert(N->getValueType(0) != MVT::i64); + + ConstantSDNode *C = dyn_cast(N); + if (C->getZExtValue() == 0) { + ImmReg = AMDGPU::ZERO; + } else if (C->getZExtValue() == 1) { + ImmReg = AMDGPU::ONE_INT; + } else { + ImmValue = C->getZExtValue(); + } + } + + for (SDNode::use_iterator Use = N->use_begin(), Next = llvm::next(Use); + Use != SDNode::use_end(); Use = Next) { + Next = llvm::next(Use); + std::vector Ops; + for (unsigned i = 0; i < Use->getNumOperands(); ++i) { + Ops.push_back(Use->getOperand(i)); + } + + if (!Use->isMachineOpcode()) { + if (ImmReg == AMDGPU::ALU_LITERAL_X) { + // We can only use literal constants (e.g. AMDGPU::ZERO, + // AMDGPU::ONE, etc) in machine opcodes. + continue; + } + } else { + if (!TII->isALUInstr(Use->getMachineOpcode())) { + continue; + } + + int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(), R600Operands::IMM); + assert(ImmIdx != -1); + + // subtract one from ImmIdx, because the DST operand is usually index + // 0 for MachineInstrs, but we have no DST in the Ops vector. + ImmIdx--; + + // Check that we aren't already using an immediate. + // XXX: It's possible for an instruction to have more than one + // immediate operand, but this is not supported yet. + if (ImmReg == AMDGPU::ALU_LITERAL_X) { + ConstantSDNode *C = dyn_cast(Use->getOperand(ImmIdx)); + assert(C); + + if (C->getZExtValue() != 0) { + // This instruction is already using an immediate. + continue; + } + + // Set the immediate value + Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32); + } + } + // Set the immediate register + Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32); + + CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands()); + } + break; + } + } + SDNode *Result = SelectCode(N); + + // Fold operands of selected node + + const AMDGPUSubtarget &ST = TM.getSubtarget(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + const R600InstrInfo *TII = + static_cast(TM.getInstrInfo()); + if (Result && TII->isALUInstr(Result->getMachineOpcode())) { + bool IsModified = false; + do { + std::vector Ops; + for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end(); + I != E; ++I) + Ops.push_back(*I); + IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops); + if (IsModified) { + Result = CurDAG->MorphNodeTo(Result, Result->getOpcode(), + Result->getVTList(), Ops.data(), Ops.size()); + } + } while (IsModified); + } + } + + return Result; +} + +bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode, + const R600InstrInfo *TII, std::vector &Ops) { + int OperandIdx[] = { + TII->getOperandIdx(Opcode, R600Operands::SRC0), + TII->getOperandIdx(Opcode, R600Operands::SRC1), + TII->getOperandIdx(Opcode, R600Operands::SRC2) + }; + int SelIdx[] = { + TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL), + TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL), + TII->getOperandIdx(Opcode, R600Operands::SRC2_SEL) + }; + for (unsigned i = 0; i < 3; i++) { + if (OperandIdx[i] < 0) + return false; + SDValue Operand = Ops[OperandIdx[i] - 1]; + switch (Operand.getOpcode()) { + case AMDGPUISD::CONST_ADDRESS: { + SDValue CstOffset; + if (!Operand.getValueType().isVector() && + SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) { + Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32); + Ops[SelIdx[i] - 1] = CstOffset; + return true; + } + } + break; + default: + break; + } + } + return false; +} + +bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) { + if (!ptr) { + return false; + } + Type *ptrType = ptr->getType(); + return dyn_cast(ptrType)->getAddressSpace() == addrspace; +} + +const Value * AMDGPUDAGToDAGISel::getBasePointerValue(const Value *V) { + if (!V) { + return NULL; + } + const Value *ret = NULL; + ValueMap ValueBitMap; + std::queue > ValueQueue; + ValueQueue.push(V); + while (!ValueQueue.empty()) { + V = ValueQueue.front(); + if (ValueBitMap.find(V) == ValueBitMap.end()) { + ValueBitMap[V] = true; + if (dyn_cast(V) && dyn_cast(V->getType())) { + ret = V; + break; + } else if (dyn_cast(V)) { + ret = V; + break; + } else if (dyn_cast(V)) { + const ConstantExpr *CE = dyn_cast(V); + if (CE) { + ValueQueue.push(CE->getOperand(0)); + } + } else if (const AllocaInst *AI = dyn_cast(V)) { + ret = AI; + break; + } else if (const Instruction *I = dyn_cast(V)) { + uint32_t numOps = I->getNumOperands(); + for (uint32_t x = 0; x < numOps; ++x) { + ValueQueue.push(I->getOperand(x)); + } + } else { + assert(!"Found a Value that we didn't know how to handle!"); + } + } + ValueQueue.pop(); + } + return ret; +} + +bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) { + return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) { + return (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS) + && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS) + && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS)); +} + +bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { + return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { + return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) { + if (checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)) { + return true; + } + MachineMemOperand *MMO = N->getMemOperand(); + const Value *V = MMO->getValue(); + const Value *BV = getBasePointerValue(V); + if (MMO + && MMO->getValue() + && ((V && dyn_cast(V)) + || (BV && dyn_cast( + getBasePointerValue(MMO->getValue()))))) { + return checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS); + } else { + return false; + } +} + +bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) { + return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) { + return checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) { + return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) { + return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) { + MachineMemOperand *MMO = N->getMemOperand(); + if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) { + if (MMO) { + const Value *V = MMO->getValue(); + const PseudoSourceValue *PSV = dyn_cast(V); + if (PSV && PSV == PseudoSourceValue::getConstantPool()) { + return true; + } + } + } + return false; +} + +bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) { + if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) { + // Check to make sure we are not a constant pool load or a constant load + // that is marked as a private load + if (isCPLoad(N) || isConstantLoad(N, -1)) { + return false; + } + } + if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS) + && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS) + && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS) + && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS) + && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS) + && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS)) { + return true; + } + return false; +} + +const char *AMDGPUDAGToDAGISel::getPassName() const { + return "AMDGPU DAG->DAG Pattern Instruction Selection"; +} + +#ifdef DEBUGTMP +#undef INT64_C +#endif +#undef DEBUGTMP + +///==== AMDGPU Functions ====/// + +bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, + SDValue& IntPtr) { + if (ConstantSDNode *Cst = dyn_cast(Addr)) { + IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, + SDValue& BaseReg, SDValue &Offset) { + if (!dyn_cast(Addr)) { + BaseReg = Addr; + Offset = CurDAG->getIntPtrConstant(0, true); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectADDR8BitOffset(SDValue Addr, SDValue& Base, + SDValue& Offset) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + + + if (Addr.getOpcode() == ISD::ADD) { + bool Match = false; + + // Find the base ptr and the offset + for (unsigned i = 0; i < Addr.getNumOperands(); i++) { + SDValue Arg = Addr.getOperand(i); + ConstantSDNode * OffsetNode = dyn_cast(Arg); + // This arg isn't a constant so it must be the base PTR. + if (!OffsetNode) { + Base = Addr.getOperand(i); + continue; + } + // Check if the constant argument fits in 8-bits. The offset is in bytes + // so we need to convert it to dwords. + if (isUInt<8>(OffsetNode->getZExtValue() >> 2)) { + Match = true; + Offset = CurDAG->getTargetConstant(OffsetNode->getZExtValue() >> 2, + MVT::i32); + } + } + return Match; + } + + // Default case, no offset + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode * IMMOffset; + + if (Addr.getOpcode() == ISD::ADD + && (IMMOffset = dyn_cast(Addr.getOperand(1))) + && isInt<16>(IMMOffset->getZExtValue())) { + + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32); + return true; + // If the pointer address is constant, we can move it to the offset field. + } else if ((IMMOffset = dyn_cast(Addr)) + && isInt<16>(IMMOffset->getZExtValue())) { + Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), + CurDAG->getEntryNode().getDebugLoc(), + AMDGPU::ZERO, MVT::i32); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32); + return true; + } + + // Default case, no offset + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectADDRReg(SDValue Addr, SDValue& Base, + SDValue& Offset) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress || + Addr.getOpcode() != ISD::ADD) { + return false; + } + + Base = Addr.getOperand(0); + Offset = Addr.getOperand(1); + + return true; +} diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILISelLowering.cpp llvm-r600/lib/Target/R600/AMDILISelLowering.cpp --- llvm-3.2.src/lib/Target/R600/AMDILISelLowering.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILISelLowering.cpp 2013-01-25 19:43:57.443383054 +0100 @@ -0,0 +1,651 @@ +//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief TargetLowering functions borrowed from AMDIL. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUISelLowering.h" +#include "AMDGPURegisterInfo.h" +#include "AMDILDevices.h" +#include "AMDILIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CallingConv.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; +//===----------------------------------------------------------------------===// +// Calling Convention Implementation +//===----------------------------------------------------------------------===// +#include "AMDGPUGenCallingConv.inc" + +//===----------------------------------------------------------------------===// +// TargetLowering Implementation Help Functions End +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// TargetLowering Class Implementation Begins +//===----------------------------------------------------------------------===// +void AMDGPUTargetLowering::InitAMDILLowering() { + int types[] = { + (int)MVT::i8, + (int)MVT::i16, + (int)MVT::i32, + (int)MVT::f32, + (int)MVT::f64, + (int)MVT::i64, + (int)MVT::v2i8, + (int)MVT::v4i8, + (int)MVT::v2i16, + (int)MVT::v4i16, + (int)MVT::v4f32, + (int)MVT::v4i32, + (int)MVT::v2f32, + (int)MVT::v2i32, + (int)MVT::v2f64, + (int)MVT::v2i64 + }; + + int IntTypes[] = { + (int)MVT::i8, + (int)MVT::i16, + (int)MVT::i32, + (int)MVT::i64 + }; + + int FloatTypes[] = { + (int)MVT::f32, + (int)MVT::f64 + }; + + int VectorTypes[] = { + (int)MVT::v2i8, + (int)MVT::v4i8, + (int)MVT::v2i16, + (int)MVT::v4i16, + (int)MVT::v4f32, + (int)MVT::v4i32, + (int)MVT::v2f32, + (int)MVT::v2i32, + (int)MVT::v2f64, + (int)MVT::v2i64 + }; + size_t NumTypes = sizeof(types) / sizeof(*types); + size_t NumFloatTypes = sizeof(FloatTypes) / sizeof(*FloatTypes); + size_t NumIntTypes = sizeof(IntTypes) / sizeof(*IntTypes); + size_t NumVectorTypes = sizeof(VectorTypes) / sizeof(*VectorTypes); + + const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget(); + // These are the current register classes that are + // supported + + for (unsigned int x = 0; x < NumTypes; ++x) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x]; + + //FIXME: SIGN_EXTEND_INREG is not meaningful for floating point types + // We cannot sextinreg, expand to shifts + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom); + setOperationAction(ISD::SUBE, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::BRCOND, VT, Custom); + setOperationAction(ISD::BR_JT, VT, Expand); + setOperationAction(ISD::BRIND, VT, Expand); + // TODO: Implement custom UREM/SREM routines + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + if (VT != MVT::i64 && VT != MVT::v2i64) { + setOperationAction(ISD::SDIV, VT, Custom); + } + } + for (unsigned int x = 0; x < NumFloatTypes; ++x) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x]; + + // IL does not have these operations for floating point types + setOperationAction(ISD::FP_ROUND_INREG, VT, Expand); + setOperationAction(ISD::SETOLT, VT, Expand); + setOperationAction(ISD::SETOGE, VT, Expand); + setOperationAction(ISD::SETOGT, VT, Expand); + setOperationAction(ISD::SETOLE, VT, Expand); + setOperationAction(ISD::SETULT, VT, Expand); + setOperationAction(ISD::SETUGE, VT, Expand); + setOperationAction(ISD::SETUGT, VT, Expand); + setOperationAction(ISD::SETULE, VT, Expand); + } + + for (unsigned int x = 0; x < NumIntTypes; ++x) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x]; + + // GPU also does not have divrem function for signed or unsigned + setOperationAction(ISD::SDIVREM, VT, Expand); + + // GPU does not have [S|U]MUL_LOHI functions as a single instruction + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + + // GPU doesn't have a rotl, rotr, or byteswap instruction + setOperationAction(ISD::ROTR, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + + // GPU doesn't have any counting operators + setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + } + + for (unsigned int ii = 0; ii < NumVectorTypes; ++ii) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii]; + + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + // setOperationAction(ISD::VSETCC, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + + } + if (STM.device()->isSupported(AMDGPUDeviceInfo::LongOps)) { + setOperationAction(ISD::MULHU, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::v2i64, Expand); + setOperationAction(ISD::MULHS, MVT::i64, Expand); + setOperationAction(ISD::MULHS, MVT::v2i64, Expand); + setOperationAction(ISD::ADD, MVT::v2i64, Expand); + setOperationAction(ISD::SREM, MVT::v2i64, Expand); + setOperationAction(ISD::Constant , MVT::i64 , Legal); + setOperationAction(ISD::SDIV, MVT::v2i64, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand); + setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand); + } + if (STM.device()->isSupported(AMDGPUDeviceInfo::DoubleOps)) { + // we support loading/storing v2f64 but not operations on the type + setOperationAction(ISD::FADD, MVT::v2f64, Expand); + setOperationAction(ISD::FSUB, MVT::v2f64, Expand); + setOperationAction(ISD::FMUL, MVT::v2f64, Expand); + setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); + setOperationAction(ISD::ConstantFP , MVT::f64 , Legal); + // We want to expand vector conversions into their scalar + // counterparts. + setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand); + setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand); + setOperationAction(ISD::FABS, MVT::f64, Expand); + setOperationAction(ISD::FABS, MVT::v2f64, Expand); + } + // TODO: Fix the UDIV24 algorithm so it works for these + // types correctly. This needs vector comparisons + // for this to work correctly. + setOperationAction(ISD::UDIV, MVT::v2i8, Expand); + setOperationAction(ISD::UDIV, MVT::v4i8, Expand); + setOperationAction(ISD::UDIV, MVT::v2i16, Expand); + setOperationAction(ISD::UDIV, MVT::v4i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom); + setOperationAction(ISD::SUBC, MVT::Other, Expand); + setOperationAction(ISD::ADDE, MVT::Other, Expand); + setOperationAction(ISD::ADDC, MVT::Other, Expand); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); + + + // Use the default implementation. + setOperationAction(ISD::ConstantFP , MVT::f32 , Legal); + setOperationAction(ISD::Constant , MVT::i32 , Legal); + + setSchedulingPreference(Sched::RegPressure); + setPow2DivIsCheap(false); + setSelectIsExpensive(true); + setJumpIsExpensive(true); + + maxStoresPerMemcpy = 4096; + maxStoresPerMemmove = 4096; + maxStoresPerMemset = 4096; + +} + +bool +AMDGPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, unsigned Intrinsic) const { + return false; +} + +// The backend supports 32 and 64 bit floating point immediates +bool +AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32 + || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) { + return true; + } else { + return false; + } +} + +bool +AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { + if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32 + || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) { + return false; + } else { + return true; + } +} + + +// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to +// be zero. Op is expected to be a target specific node. Used by DAG +// combiner. + +void +AMDGPUTargetLowering::computeMaskedBitsForTargetNode( + const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { + APInt KnownZero2; + APInt KnownOne2; + KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything + switch (Op.getOpcode()) { + default: break; + case ISD::SELECT_CC: + DAG.ComputeMaskedBits( + Op.getOperand(1), + KnownZero, + KnownOne, + Depth + 1 + ); + DAG.ComputeMaskedBits( + Op.getOperand(0), + KnownZero2, + KnownOne2 + ); + assert((KnownZero & KnownOne) == 0 + && "Bits known to be one AND zero?"); + assert((KnownZero2 & KnownOne2) == 0 + && "Bits known to be one AND zero?"); + // Only known if known in both the LHS and RHS + KnownOne &= KnownOne2; + KnownZero &= KnownZero2; + break; + }; +} + +//===----------------------------------------------------------------------===// +// Other Lowering Hooks +//===----------------------------------------------------------------------===// + +SDValue +AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { + EVT OVT = Op.getValueType(); + SDValue DST; + if (OVT.getScalarType() == MVT::i64) { + DST = LowerSDIV64(Op, DAG); + } else if (OVT.getScalarType() == MVT::i32) { + DST = LowerSDIV32(Op, DAG); + } else if (OVT.getScalarType() == MVT::i16 + || OVT.getScalarType() == MVT::i8) { + DST = LowerSDIV24(Op, DAG); + } else { + DST = SDValue(Op.getNode(), 0); + } + return DST; +} + +SDValue +AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const { + EVT OVT = Op.getValueType(); + SDValue DST; + if (OVT.getScalarType() == MVT::i64) { + DST = LowerSREM64(Op, DAG); + } else if (OVT.getScalarType() == MVT::i32) { + DST = LowerSREM32(Op, DAG); + } else if (OVT.getScalarType() == MVT::i16) { + DST = LowerSREM16(Op, DAG); + } else if (OVT.getScalarType() == MVT::i8) { + DST = LowerSREM8(Op, DAG); + } else { + DST = SDValue(Op.getNode(), 0); + } + return DST; +} + +SDValue +AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { + SDValue Data = Op.getOperand(0); + VTSDNode *BaseType = cast(Op.getOperand(1)); + DebugLoc DL = Op.getDebugLoc(); + EVT DVT = Data.getValueType(); + EVT BVT = BaseType->getVT(); + unsigned baseBits = BVT.getScalarType().getSizeInBits(); + unsigned srcBits = DVT.isSimple() ? DVT.getScalarType().getSizeInBits() : 1; + unsigned shiftBits = srcBits - baseBits; + if (srcBits < 32) { + // If the op is less than 32 bits, then it needs to extend to 32bits + // so it can properly keep the upper bits valid. + EVT IVT = genIntType(32, DVT.isVector() ? DVT.getVectorNumElements() : 1); + Data = DAG.getNode(ISD::ZERO_EXTEND, DL, IVT, Data); + shiftBits = 32 - baseBits; + DVT = IVT; + } + SDValue Shift = DAG.getConstant(shiftBits, DVT); + // Shift left by 'Shift' bits. + Data = DAG.getNode(ISD::SHL, DL, DVT, Data, Shift); + // Signed shift Right by 'Shift' bits. + Data = DAG.getNode(ISD::SRA, DL, DVT, Data, Shift); + if (srcBits < 32) { + // Once the sign extension is done, the op needs to be converted to + // its original type. + Data = DAG.getSExtOrTrunc(Data, DL, Op.getOperand(0).getValueType()); + } + return Data; +} +EVT +AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const { + int iSize = (size * numEle); + int vEle = (iSize >> ((size == 64) ? 6 : 5)); + if (!vEle) { + vEle = 1; + } + if (size == 64) { + if (vEle == 1) { + return EVT(MVT::i64); + } else { + return EVT(MVT::getVectorVT(MVT::i64, vEle)); + } + } else { + if (vEle == 1) { + return EVT(MVT::i32); + } else { + return EVT(MVT::getVectorVT(MVT::i32, vEle)); + } + } +} + +SDValue +AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Jump = Op.getOperand(2); + SDValue Result; + Result = DAG.getNode( + AMDGPUISD::BRANCH_COND, + Op.getDebugLoc(), + Op.getValueType(), + Chain, Jump, Cond); + return Result; +} + +SDValue +AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + MVT INTTY; + MVT FLTTY; + if (!OVT.isVector()) { + INTTY = MVT::i32; + FLTTY = MVT::f32; + } else if (OVT.getVectorNumElements() == 2) { + INTTY = MVT::v2i32; + FLTTY = MVT::v2f32; + } else if (OVT.getVectorNumElements() == 4) { + INTTY = MVT::v4i32; + FLTTY = MVT::v4f32; + } + unsigned bitsize = OVT.getScalarType().getSizeInBits(); + // char|short jq = ia ^ ib; + SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS); + + // jq = jq >> (bitsize - 2) + jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT)); + + // jq = jq | 0x1 + jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT)); + + // jq = (int)jq + jq = DAG.getSExtOrTrunc(jq, DL, INTTY); + + // int ia = (int)LHS; + SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY); + + // int ib, (int)RHS; + SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY); + + // float fa = (float)ia; + SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia); + + // float fb = (float)ib; + SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib); + + // float fq = native_divide(fa, fb); + SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb); + + // fq = trunc(fq); + fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq); + + // float fqneg = -fq; + SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq); + + // float fr = mad(fqneg, fb, fa); + SDValue fr = DAG.getNode(AMDGPUISD::MAD, DL, FLTTY, fqneg, fb, fa); + + // int iq = (int)fq; + SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); + + // fr = fabs(fr); + fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr); + + // fb = fabs(fb); + fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb); + + // int cv = fr >= fb; + SDValue cv; + if (INTTY == MVT::i32) { + cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); + } else { + cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); + } + // jq = (cv ? jq : 0); + jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq, + DAG.getConstant(0, OVT)); + // dst = iq + jq; + iq = DAG.getSExtOrTrunc(iq, DL, OVT); + iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq); + return iq; +} + +SDValue +AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + // The LowerSDIV32 function generates equivalent to the following IL. + // mov r0, LHS + // mov r1, RHS + // ilt r10, r0, 0 + // ilt r11, r1, 0 + // iadd r0, r0, r10 + // iadd r1, r1, r11 + // ixor r0, r0, r10 + // ixor r1, r1, r11 + // udiv r0, r0, r1 + // ixor r10, r10, r11 + // iadd r0, r0, r10 + // ixor DST, r0, r10 + + // mov r0, LHS + SDValue r0 = LHS; + + // mov r1, RHS + SDValue r1 = RHS; + + // ilt r10, r0, 0 + SDValue r10 = DAG.getSelectCC(DL, + r0, DAG.getConstant(0, OVT), + DAG.getConstant(-1, MVT::i32), + DAG.getConstant(0, MVT::i32), + ISD::SETLT); + + // ilt r11, r1, 0 + SDValue r11 = DAG.getSelectCC(DL, + r1, DAG.getConstant(0, OVT), + DAG.getConstant(-1, MVT::i32), + DAG.getConstant(0, MVT::i32), + ISD::SETLT); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // iadd r1, r1, r11 + r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); + + // ixor r0, r0, r10 + r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + + // ixor r1, r1, r11 + r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); + + // udiv r0, r0, r1 + r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1); + + // ixor r10, r10, r11 + r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // ixor DST, r0, r10 + SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + return DST; +} + +SDValue +AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const { + return SDValue(Op.getNode(), 0); +} + +SDValue +AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + MVT INTTY = MVT::i32; + if (OVT == MVT::v2i8) { + INTTY = MVT::v2i32; + } else if (OVT == MVT::v4i8) { + INTTY = MVT::v4i32; + } + SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY); + SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY); + LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS); + LHS = DAG.getSExtOrTrunc(LHS, DL, OVT); + return LHS; +} + +SDValue +AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + MVT INTTY = MVT::i32; + if (OVT == MVT::v2i16) { + INTTY = MVT::v2i32; + } else if (OVT == MVT::v4i16) { + INTTY = MVT::v4i32; + } + SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY); + SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY); + LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS); + LHS = DAG.getSExtOrTrunc(LHS, DL, OVT); + return LHS; +} + +SDValue +AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + // The LowerSREM32 function generates equivalent to the following IL. + // mov r0, LHS + // mov r1, RHS + // ilt r10, r0, 0 + // ilt r11, r1, 0 + // iadd r0, r0, r10 + // iadd r1, r1, r11 + // ixor r0, r0, r10 + // ixor r1, r1, r11 + // udiv r20, r0, r1 + // umul r20, r20, r1 + // sub r0, r0, r20 + // iadd r0, r0, r10 + // ixor DST, r0, r10 + + // mov r0, LHS + SDValue r0 = LHS; + + // mov r1, RHS + SDValue r1 = RHS; + + // ilt r10, r0, 0 + SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT); + + // ilt r11, r1, 0 + SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // iadd r1, r1, r11 + r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); + + // ixor r0, r0, r10 + r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + + // ixor r1, r1, r11 + r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); + + // udiv r20, r0, r1 + SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1); + + // umul r20, r20, r1 + r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1); + + // sub r0, r0, r20 + r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // ixor DST, r0, r10 + SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + return DST; +} + +SDValue +AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const { + return SDValue(Op.getNode(), 0); +} diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILNIDevice.cpp llvm-r600/lib/Target/R600/AMDILNIDevice.cpp --- llvm-3.2.src/lib/Target/R600/AMDILNIDevice.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILNIDevice.cpp 2013-01-25 19:43:57.446716388 +0100 @@ -0,0 +1,65 @@ +//===-- AMDILNIDevice.cpp - Device Info for Northern Islands devices ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//==-----------------------------------------------------------------------===// +#include "AMDILNIDevice.h" +#include "AMDILEvergreenDevice.h" +#include "AMDGPUSubtarget.h" + +using namespace llvm; + +AMDGPUNIDevice::AMDGPUNIDevice(AMDGPUSubtarget *ST) + : AMDGPUEvergreenDevice(ST) { + std::string name = ST->getDeviceName(); + if (name == "caicos") { + DeviceFlag = OCL_DEVICE_CAICOS; + } else if (name == "turks") { + DeviceFlag = OCL_DEVICE_TURKS; + } else if (name == "cayman") { + DeviceFlag = OCL_DEVICE_CAYMAN; + } else { + DeviceFlag = OCL_DEVICE_BARTS; + } +} +AMDGPUNIDevice::~AMDGPUNIDevice() { +} + +size_t +AMDGPUNIDevice::getMaxLDSSize() const { + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { + return MAX_LDS_SIZE_900; + } else { + return 0; + } +} + +uint32_t +AMDGPUNIDevice::getGeneration() const { + return AMDGPUDeviceInfo::HD6XXX; +} + + +AMDGPUCaymanDevice::AMDGPUCaymanDevice(AMDGPUSubtarget *ST) + : AMDGPUNIDevice(ST) { + setCaps(); +} + +AMDGPUCaymanDevice::~AMDGPUCaymanDevice() { +} + +void +AMDGPUCaymanDevice::setCaps() { + if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) { + mHWBits.set(AMDGPUDeviceInfo::DoubleOps); + mHWBits.set(AMDGPUDeviceInfo::FMA); + } + mHWBits.set(AMDGPUDeviceInfo::Signed24BitOps); + mSWBits.reset(AMDGPUDeviceInfo::Signed24BitOps); + mSWBits.set(AMDGPUDeviceInfo::ArenaSegment); +} + diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILNIDevice.h llvm-r600/lib/Target/R600/AMDILNIDevice.h --- llvm-3.2.src/lib/Target/R600/AMDILNIDevice.h 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILNIDevice.h 2013-01-25 19:43:57.446716388 +0100 @@ -0,0 +1,57 @@ +//===------- AMDILNIDevice.h - Define NI Device for AMDIL -*- C++ -*------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +/// \file +/// \brief Interface for the subtarget data classes. +/// +/// This file will define the interface that each generation needs to +/// implement in order to correctly answer queries on the capabilities of the +/// specific hardware. +//===---------------------------------------------------------------------===// +#ifndef AMDILNIDEVICE_H +#define AMDILNIDEVICE_H +#include "AMDILEvergreenDevice.h" +#include "AMDGPUSubtarget.h" + +namespace llvm { + +class AMDGPUSubtarget; +//===---------------------------------------------------------------------===// +// NI generation of devices and their respective sub classes +//===---------------------------------------------------------------------===// + +/// \brief The AMDGPUNIDevice is the base class for all Northern Island series of +/// cards. +/// +/// It is very similiar to the AMDGPUEvergreenDevice, with the major +/// exception being differences in wavefront size and hardware capabilities. The +/// NI devices are all 64 wide wavefronts and also add support for signed 24 bit +/// integer operations +class AMDGPUNIDevice : public AMDGPUEvergreenDevice { +public: + AMDGPUNIDevice(AMDGPUSubtarget*); + virtual ~AMDGPUNIDevice(); + virtual size_t getMaxLDSSize() const; + virtual uint32_t getGeneration() const; +}; + +/// Just as the AMDGPUCypressDevice is the double capable version of the +/// AMDGPUEvergreenDevice, the AMDGPUCaymanDevice is the double capable version +/// of the AMDGPUNIDevice. The other major difference is that the Cayman Device +/// has 4 wide ALU's, whereas the rest of the NI family is a 5 wide. +class AMDGPUCaymanDevice: public AMDGPUNIDevice { +public: + AMDGPUCaymanDevice(AMDGPUSubtarget*); + virtual ~AMDGPUCaymanDevice(); +private: + virtual void setCaps(); +}; + +static const unsigned int MAX_LDS_SIZE_900 = AMDGPUDevice::MAX_LDS_SIZE_800; +} // namespace llvm +#endif // AMDILNIDEVICE_H diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILPeepholeOptimizer.cpp llvm-r600/lib/Target/R600/AMDILPeepholeOptimizer.cpp --- llvm-3.2.src/lib/Target/R600/AMDILPeepholeOptimizer.cpp 1970-01-01 01:00:00.000000000 +0100 +++ llvm-r600/lib/Target/R600/AMDILPeepholeOptimizer.cpp 2013-01-25 19:43:57.450049721 +0100 @@ -0,0 +1,1256 @@ +//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//==-----------------------------------------------------------------------===// + +#define DEBUG_TYPE "PeepholeOpt" +#ifdef DEBUG +#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) +#else +#define DEBUGME 0 +#endif + +#include "AMDILDevices.h" +#include "AMDGPUInstrInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Constants.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" + +#include + +#if 0 +STATISTIC(PointerAssignments, "Number of dynamic pointer " + "assigments discovered"); +STATISTIC(PointerSubtract, "Number of pointer subtractions discovered"); +#endif + +using namespace llvm; +// The Peephole optimization pass is used to do simple last minute optimizations +// that are required for correct code or to remove redundant functions +namespace { + +class OpaqueType; + +class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass { +public: + TargetMachine &TM; + static char ID; + AMDGPUPeepholeOpt(TargetMachine &tm); + ~AMDGPUPeepholeOpt(); + const char *getPassName() const; + bool runOnFunction(Function &F); + bool doInitialization(Module &M); + bool doFinalization(Module &M); + void getAnalysisUsage(AnalysisUsage &AU) const; +protected: +private: + // Function to initiate all of the instruction level optimizations. + bool instLevelOptimizations(BasicBlock::iterator *inst); + // Quick check to see if we need to dump all of the pointers into the + // arena. If this is correct, then we set all pointers to exist in arena. This + // is a workaround for aliasing of pointers in a struct/union. + bool dumpAllIntoArena(Function &F); + // Because I don't want to invalidate any pointers while in the + // safeNestedForEachFunction. I push atomic conversions to a vector and handle + // it later. This function does the conversions if required. + void doAtomicConversionIfNeeded(Function &F); + // Because __amdil_is_constant cannot be properly evaluated if + // optimizations are disabled, the call's are placed in a vector + // and evaluated after the __amdil_image* functions are evaluated + // which should allow the __amdil_is_constant function to be + // evaluated correctly. + void doIsConstCallConversionIfNeeded(); + bool mChanged; + bool mDebug; + bool mConvertAtomics; + CodeGenOpt::Level optLevel; + // Run a series of tests to see if we can optimize a CALL instruction. + bool optimizeCallInst(BasicBlock::iterator *bbb); + // A peephole optimization to optimize bit extract sequences. + bool optimizeBitExtract(Instruction *inst); + // A peephole optimization to optimize bit insert sequences. + bool optimizeBitInsert(Instruction *inst); + bool setupBitInsert(Instruction *base, + Instruction *&src, + Constant *&mask, + Constant *&shift); + // Expand the bit field insert instruction on versions of OpenCL that + // don't support it. + bool expandBFI(CallInst *CI); + // Expand the bit field mask instruction on version of OpenCL that + // don't support it. + bool expandBFM(CallInst *CI); + // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in + // this case we need to expand them. These functions check for 24bit functions + // and then expand. + bool isSigned24BitOps(CallInst *CI); + void expandSigned24BitOps(CallInst *CI); + // One optimization that can occur is that if the required workgroup size is + // specified then the result of get_local_size is known at compile time and + // can be returned accordingly. + bool isRWGLocalOpt(CallInst *CI); + // On northern island cards, the division is slightly less accurate than on + // previous generations, so we need to utilize a more accurate division. So we + // can translate the accurate divide to a normal divide on all other cards. + bool convertAccurateDivide(CallInst *CI); + void expandAccurateDivide(CallInst *CI); + // If the alignment is set incorrectly, it can produce really inefficient + // code. This checks for this scenario and fixes it if possible. + bool correctMisalignedMemOp(Instruction *inst); + + // If we are in no opt mode, then we need to make sure that + // local samplers are properly propagated as constant propagation + // doesn't occur and we need to know the value of kernel defined + // samplers at compile time. + bool propagateSamplerInst(CallInst *CI); + + // Helper functions + + // Group of functions that recursively calculate the size of a structure based + // on it's sub-types. + size_t getTypeSize(Type * const T, bool dereferencePtr = false); + size_t getTypeSize(StructType * const ST, bool dereferencePtr = false); + size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false); + size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false); + size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false); + size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false); + size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false); + size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false); + + LLVMContext *mCTX; + Function *mF; + const AMDGPUSubtarget *mSTM; + SmallVector< std::pair, 16> atomicFuncs; + SmallVector isConstVec; +}; // class AMDGPUPeepholeOpt + char AMDGPUPeepholeOpt::ID = 0; + +// A template function that has two levels of looping before calling the +// function with a pointer to the current iterator. +template +Function safeNestedForEach(InputIterator First, InputIterator Last, + SecondIterator S, Function F) { + for ( ; First != Last; ++First) { + SecondIterator sf, sl; + for (sf = First->begin(), sl = First->end(); + sf != sl; ) { + if (!F(&sf)) { + ++sf; + } + } + } + return F; +} + +} // anonymous namespace + +namespace llvm { + FunctionPass * + createAMDGPUPeepholeOpt(TargetMachine &tm) { + return new AMDGPUPeepholeOpt(tm); + } +} // llvm namespace + +AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm) + : FunctionPass(ID), TM(tm) { + mDebug = DEBUGME; + optLevel = TM.getOptLevel(); + +} + +AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() { +} + +const char * +AMDGPUPeepholeOpt::getPassName() const { + return "AMDGPU PeepHole Optimization Pass"; +} + +bool +containsPointerType(Type *Ty) { + if (!Ty) { + return false; + } + switch(Ty->getTypeID()) { + default: + return false; + case Type::StructTyID: { + const StructType *ST = dyn_cast(Ty); + for (StructType::element_iterator stb = ST->element_begin(), + ste = ST->element_end(); stb != ste; ++stb) { + if (!containsPointerType(*stb)) { + continue; + } + return true; + } + break; + } + case Type::VectorTyID: + case Type::ArrayTyID: + return containsPointerType(dyn_cast(Ty)->getElementType()); + case Type::PointerTyID: + return true; + }; + return false; +} + +bool +AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) { + bool dumpAll = false; + for (Function::const_arg_iterator cab = F.arg_begin(), + cae = F.arg_end(); cab != cae; ++cab) { + const Argument *arg = cab; + const PointerType *PT = dyn_cast(arg->getType()); + if (!PT) { + continue; + } + Type *DereferencedType = PT->getElementType(); + if (!dyn_cast(DereferencedType) + ) { + continue; + } + if (!containsPointerType(DereferencedType)) { + continue; + } + // FIXME: Because a pointer inside of a struct/union may be aliased to + // another pointer we need to take the conservative approach and place all + // pointers into the arena until more advanced detection is implemented. + dumpAll = true; + } + return dumpAll; +} +void +AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() { + if (isConstVec.empty()) { + return; + } + for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) { + CallInst *CI = isConstVec[x]; + Constant *CV = dyn_cast(CI->getOperand(0)); + Type *aType = Type::getInt32Ty(*mCTX); + Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) + : ConstantInt::get(aType, 0); + CI->replaceAllUsesWith(Val); + CI->eraseFromParent(); + } + isConstVec.clear(); +} +void +AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) { + // Don't do anything if we don't have any atomic operations. + if (atomicFuncs.empty()) { + return; + } + // Change the function name for the atomic if it is required + uint32_t size = atomicFuncs.size(); + for (uint32_t x = 0; x < size; ++x) { + atomicFuncs[x].first->setOperand( + atomicFuncs[x].first->getNumOperands()-1, + atomicFuncs[x].second); + + } + mChanged = true; + if (mConvertAtomics) { + return; + } +} + +bool +AMDGPUPeepholeOpt::runOnFunction(Function &MF) { + mChanged = false; + mF = &MF; + mSTM = &TM.getSubtarget(); + if (mDebug) { + MF.dump(); + } + mCTX = &MF.getType()->getContext(); + mConvertAtomics = true; + safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(), + std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations), + this)); + + doAtomicConversionIfNeeded(MF); + doIsConstCallConversionIfNeeded(); + + if (mDebug) { + MF.dump(); + } + return mChanged; +} + +bool +AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) { + Instruction *inst = (*bbb); + CallInst *CI = dyn_cast(inst); + if (!CI) { + return false; + } + if (isSigned24BitOps(CI)) { + expandSigned24BitOps(CI); + ++(*bbb); + CI->eraseFromParent(); + return true; + } + if (propagateSamplerInst(CI)) { + return false; + } + if (expandBFI(CI) || expandBFM(CI)) { + ++(*bbb); + CI->eraseFromParent(); + return true; + } + if (convertAccurateDivide(CI)) { + expandAccurateDivide(CI); + ++(*bbb); + CI->eraseFromParent(); + return true; + } + + StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName(); + if (calleeName.startswith("__amdil_is_constant")) { + // If we do not have optimizations, then this + // cannot be properly evaluated, so we add the + // call instruction to a vector and process + // them at the end of processing after the + // samplers have been correctly handled. + if (optLevel == CodeGenOpt::None) { + isConstVec.push_back(CI); + return false; + } else { + Constant *CV = dyn_cast(CI->getOperand(0)); + Type *aType = Type::getInt32Ty(*mCTX); + Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) + : ConstantInt::get(aType, 0); + CI->replaceAllUsesWith(Val); + ++(*bbb); + CI->eraseFromParent(); + return true; + } + } + + if (calleeName.equals("__amdil_is_asic_id_i32")) { + ConstantInt *CV = dyn_cast(CI->getOperand(0)); + Type *aType = Type::getInt32Ty(*mCTX); + Value *Val = CV; + if (Val) { + Val = ConstantInt::get(aType, + mSTM->device()->getDeviceFlag() & CV->getZExtValue()); + } else { + Val = ConstantInt::get(aType, 0); + } + CI->replaceAllUsesWith(Val); + ++(*bbb); + CI->eraseFromParent(); + return true; + } + Function *F = dyn_cast(CI->getOperand(CI->getNumOperands()-1)); + if (!F) { + return false; + } + if (F->getName().startswith("__atom") && !CI->getNumUses() + && F->getName().find("_xchg") == StringRef::npos) { + std::string buffer(F->getName().str() + "_noret"); + F = dyn_cast( + F->getParent()->getOrInsertFunction(buffer, F->getFunctionType())); + atomicFuncs.push_back(std::make_pair (CI, F)); + } + + if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment) + && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) { + return false; + } + if (!mConvertAtomics) { + return false; + } + StringRef name = F->getName(); + if (name.startswith("__atom") && name.find("_g") != StringRef::npos) { + mConvertAtomics = false; + } + return false; +} + +bool +AMDGPUPeepholeOpt::setupBitInsert(Instruction *base, + Instruction *&src, + Constant *&mask, + Constant *&shift) { + if (!base) { + if (mDebug) { + dbgs() << "Null pointer passed into function.\n"; + } + return false; + } + bool andOp = false; + if (base->getOpcode() == Instruction::Shl) { + shift = dyn_cast(base->getOperand(1)); + } else if (base->getOpcode() == Instruction::And) { + mask = dyn_cast(base->getOperand(1)); + andOp = true; + } else { + if (mDebug) { + dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n"; + } + // If the base is neither a Shl or a And, we don't fit any of the patterns above. + return false; + } + src = dyn_cast(base->getOperand(0)); + if (!src) { + if (mDebug) { + dbgs() << "Failed setup since the base operand is not an instruction!\n"; + } + return false; + } + // If we find an 'and' operation, then we don't need to + // find the next operation as we already know the + // bits that are valid at this point. + if (andOp) { + return true; + } + if (src->getOpcode() == Instruction::Shl && !shift) { + shift = dyn_cast(src->getOperand(1)); + src = dyn_cast(src->getOperand(0)); + } else if (src->getOpcode() == Instruction::And && !mask) { + mask = dyn_cast(src->getOperand(1)); + } + if (!mask && !shift) { + if (mDebug) { + dbgs() << "Failed setup since both mask and shift are NULL!\n"; + } + // Did not find a constant mask or a shift. + return false; + } + return true; +} +bool +AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) { + if (!inst) { + return false; + } + if (!inst->isBinaryOp()) { + return false; + } + if (inst->getOpcode() != Instruction::Or) { + return false; + } + if (optLevel == CodeGenOpt::None) { + return false; + } + // We want to do an optimization on a sequence of ops that in the end equals a + // single ISA instruction. + // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F) + // Some simplified versions of this pattern are as follows: + // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0 + // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E + // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B + // (A & B) | (D << F) when (1 << F) >= B + // (A << C) | (D & E) when (1 << C) >= E + if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { + // The HD4XXX hardware doesn't support the ubit_insert instruction. + return false; + } + Type *aType = inst->getType(); + bool isVector = aType->isVectorTy(); + int numEle = 1; + // This optimization only works on 32bit integers. + if (aType->getScalarType() + != Type::getInt32Ty(inst->getContext())) { + return false; + } + if (isVector) { + const VectorType *VT = dyn_cast(aType); + numEle = VT->getNumElements(); + // We currently cannot support more than 4 elements in a intrinsic and we + // cannot support Vec3 types. + if (numEle > 4 || numEle == 3) { + return false; + } + } + // TODO: Handle vectors. + if (isVector) { + if (mDebug) { + dbgs() << "!!! Vectors are not supported yet!\n"; + } + return false; + } + Instruction *LHSSrc = NULL, *RHSSrc = NULL; + Constant *LHSMask = NULL, *RHSMask = NULL; + Constant *LHSShift = NULL, *RHSShift = NULL; + Instruction *LHS = dyn_cast(inst->getOperand(0)); + Instruction *RHS = dyn_cast(inst->getOperand(1)); + if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) { + if (mDebug) { + dbgs() << "Found an OR Operation that failed setup!\n"; + inst->dump(); + if (LHS) { LHS->dump(); } + if (LHSSrc) { LHSSrc->dump(); } + if (LHSMask) { LHSMask->dump(); } + if (LHSShift) { LHSShift->dump(); } + } + // There was an issue with the setup for BitInsert. + return false; + } + if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) { + if (mDebug) { + dbgs() << "Found an OR Operation that failed setup!\n"; + inst->dump(); + if (RHS) { RHS->dump(); } + if (RHSSrc) { RHSSrc->dump(); } + if (RHSMask) { RHSMask->dump(); } + if (RHSShift) { RHSShift->dump(); } + } + // There was an issue with the setup for BitInsert. + return false; + } + if (mDebug) { + dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n"; + dbgs() << "Op: "; inst->dump(); + dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; } + } + Constant *offset = NULL; + Constant *width = NULL; + uint32_t lhsMaskVal = 0, rhsMaskVal = 0; + uint32_t lhsShiftVal = 0, rhsShiftVal = 0; + uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0; + uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0; + lhsMaskVal = (LHSMask + ? dyn_cast(LHSMask)->getZExtValue() : 0); + rhsMaskVal = (RHSMask + ? dyn_cast(RHSMask)->getZExtValue() : 0); + lhsShiftVal = (LHSShift + ? dyn_cast(LHSShift)->getZExtValue() : 0); + rhsShiftVal = (RHSShift + ? dyn_cast