1 diff -Nur -x .git llvm-3.2.src/autoconf/configure.ac llvm-r600/autoconf/configure.ac
2 --- llvm-3.2.src/autoconf/configure.ac 2012-11-21 17:13:35.000000000 +0100
3 +++ llvm-r600/autoconf/configure.ac 2013-01-25 19:43:56.096716416 +0100
6 if test ${enableval} != "disable"
8 + if test ${enableval} = "AMDGPU"
10 + AC_MSG_ERROR([The AMDGPU target has been renamed to R600, please reconfigure with --enable-experimental-targets=R600])
13 TARGETS_TO_BUILD="$enableval $TARGETS_TO_BUILD"
16 diff -Nur -x .git llvm-3.2.src/configure llvm-r600/configure
17 --- llvm-3.2.src/configure 2012-11-21 17:13:35.000000000 +0100
18 +++ llvm-r600/configure 2013-01-25 19:43:56.173383081 +0100
19 @@ -5473,6 +5473,13 @@
21 if test ${enableval} != "disable"
23 + if test ${enableval} = "AMDGPU"
25 + { { echo "$as_me:$LINENO: error: The AMDGPU target has been renamed to R600, please reconfigure with --enable-experimental-targets=R600" >&5
26 +echo "$as_me: error: The AMDGPU target has been renamed to R600, please reconfigure with --enable-experimental-targets=R600" >&2;}
27 + { (exit 1); exit 1; }; }
30 TARGETS_TO_BUILD="$enableval $TARGETS_TO_BUILD"
33 @@ -10316,7 +10323,7 @@
34 lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
35 lt_status=$lt_dlunknown
36 cat > conftest.$ac_ext <<EOF
37 -#line 10317 "configure"
38 +#line 10326 "configure"
42 diff -Nur -x .git llvm-3.2.src/include/llvm/IntrinsicsR600.td llvm-r600/include/llvm/IntrinsicsR600.td
43 --- llvm-3.2.src/include/llvm/IntrinsicsR600.td 1970-01-01 01:00:00.000000000 +0100
44 +++ llvm-r600/include/llvm/IntrinsicsR600.td 2013-01-25 19:43:56.433383075 +0100
46 +//===- IntrinsicsR600.td - Defines R600 intrinsics ---------*- tablegen -*-===//
48 +// The LLVM Compiler Infrastructure
50 +// This file is distributed under the University of Illinois Open Source
51 +// License. See LICENSE.TXT for details.
53 +//===----------------------------------------------------------------------===//
55 +// This file defines all of the R600-specific intrinsics.
57 +//===----------------------------------------------------------------------===//
59 +let TargetPrefix = "r600" in {
61 +class R600ReadPreloadRegisterIntrinsic<string name>
62 + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
65 +multiclass R600ReadPreloadRegisterIntrinsic_xyz<string prefix> {
66 + def _x : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_x")>;
67 + def _y : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_y")>;
68 + def _z : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_z")>;
71 +defm int_r600_read_global_size : R600ReadPreloadRegisterIntrinsic_xyz <
72 + "__builtin_r600_read_global_size">;
73 +defm int_r600_read_local_size : R600ReadPreloadRegisterIntrinsic_xyz <
74 + "__builtin_r600_read_local_size">;
75 +defm int_r600_read_ngroups : R600ReadPreloadRegisterIntrinsic_xyz <
76 + "__builtin_r600_read_ngroups">;
77 +defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz <
78 + "__builtin_r600_read_tgid">;
79 +defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz <
80 + "__builtin_r600_read_tidig">;
81 +} // End TargetPrefix = "r600"
82 diff -Nur -x .git llvm-3.2.src/include/llvm/Intrinsics.td llvm-r600/include/llvm/Intrinsics.td
83 --- llvm-3.2.src/include/llvm/Intrinsics.td 2012-10-20 01:00:20.000000000 +0200
84 +++ llvm-r600/include/llvm/Intrinsics.td 2013-01-25 19:43:56.426716409 +0100
86 include "llvm/IntrinsicsHexagon.td"
87 include "llvm/IntrinsicsNVVM.td"
88 include "llvm/IntrinsicsMips.td"
89 +include "llvm/IntrinsicsR600.td"
90 diff -Nur -x .git llvm-3.2.src/lib/CodeGen/SelectionDAG/DAGCombiner.cpp llvm-r600/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
91 --- llvm-3.2.src/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 2012-11-26 18:01:12.000000000 +0100
92 +++ llvm-r600/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 2013-01-25 19:43:56.720049736 +0100
93 @@ -8514,11 +8514,8 @@
94 if (Opcode == ISD::DELETED_NODE &&
95 (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) {
97 - // If not supported by target, bail out.
98 - if (TLI.getOperationAction(Opcode, VT) != TargetLowering::Legal &&
99 - TLI.getOperationAction(Opcode, VT) != TargetLowering::Custom)
106 @@ -8543,6 +8540,10 @@
107 assert(SrcVT != MVT::Other && "Cannot determine source type!");
109 EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars);
111 + if (!TLI.isOperationLegalOrCustom(Opcode, NVT))
114 SmallVector<SDValue, 8> Opnds;
115 for (unsigned i = 0; i != NumInScalars; ++i) {
116 SDValue In = N->getOperand(i);
117 diff -Nur -x .git llvm-3.2.src/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp llvm-r600/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
118 --- llvm-3.2.src/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp 2012-10-24 19:25:11.000000000 +0200
119 +++ llvm-r600/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp 2013-01-25 19:43:56.733383069 +0100
123 case TargetLowering::Promote: {
124 - assert(VT.isVector() && "Unknown legal promote case!");
125 - Value = DAG.getNode(ISD::BITCAST, dl,
126 - TLI.getTypeToPromoteTo(ISD::STORE, VT), Value);
127 + EVT NVT = TLI.getTypeToPromoteTo(ISD::STORE, VT);
128 + assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
129 + "Can only promote stores to same size type");
130 + Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value);
132 DAG.getStore(Chain, dl, Value, Ptr,
133 ST->getPointerInfo(), isVolatile,
137 case TargetLowering::Promote: {
138 - // Only promote a load of vector type to another.
139 - assert(VT.isVector() && "Cannot promote this load!");
140 - // Change base type to a different vector type.
141 EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT);
142 + assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
143 + "Can only promote loads to same size type");
145 SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
146 LD->isVolatile(), LD->isNonTemporal(),
147 diff -Nur -x .git llvm-3.2.src/lib/Target/LLVMBuild.txt llvm-r600/lib/Target/LLVMBuild.txt
148 --- llvm-3.2.src/lib/Target/LLVMBuild.txt 2012-07-16 20:19:46.000000000 +0200
149 +++ llvm-r600/lib/Target/LLVMBuild.txt 2013-01-25 19:43:57.173383060 +0100
151 ;===------------------------------------------------------------------------===;
154 -subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC Sparc X86 XCore
155 +subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC R600 Sparc X86 XCore
157 ; This is a special group whose required libraries are extended (by llvm-build)
158 ; with the best execution engine (the native JIT, if available, or the
159 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUAsmPrinter.cpp llvm-r600/lib/Target/R600/AMDGPUAsmPrinter.cpp
160 --- llvm-3.2.src/lib/Target/R600/AMDGPUAsmPrinter.cpp 1970-01-01 01:00:00.000000000 +0100
161 +++ llvm-r600/lib/Target/R600/AMDGPUAsmPrinter.cpp 2013-01-25 19:43:57.423383055 +0100
163 +//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer -------------------===//
165 +// The LLVM Compiler Infrastructure
167 +// This file is distributed under the University of Illinois Open Source
168 +// License. See LICENSE.TXT for details.
170 +//===----------------------------------------------------------------------===//
174 +/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
175 +/// code. When passed an MCAsmStreamer it prints assembly and when passed
176 +/// an MCObjectStreamer it outputs binary code.
178 +//===----------------------------------------------------------------------===//
182 +#include "AMDGPUAsmPrinter.h"
184 +#include "SIMachineFunctionInfo.h"
185 +#include "SIRegisterInfo.h"
186 +#include "llvm/MC/MCStreamer.h"
187 +#include "llvm/Target/TargetLoweringObjectFile.h"
188 +#include "llvm/Support/TargetRegistry.h"
190 +using namespace llvm;
193 +static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
194 + MCStreamer &Streamer) {
195 + return new AMDGPUAsmPrinter(tm, Streamer);
198 +extern "C" void LLVMInitializeR600AsmPrinter() {
199 + TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
202 +/// We need to override this function so we can avoid
203 +/// the call to EmitFunctionHeader(), which the MCPureStreamer can't handle.
204 +bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
205 + const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
206 + if (STM.dumpCode()) {
207 +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
211 + SetupMachineFunction(MF);
212 + OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
213 + if (STM.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
214 + EmitProgramInfo(MF);
216 + EmitFunctionBody();
220 +void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) {
221 + unsigned MaxSGPR = 0;
222 + unsigned MaxVGPR = 0;
223 + bool VCCUsed = false;
224 + const SIRegisterInfo * RI =
225 + static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
227 + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
228 + BB != BB_E; ++BB) {
229 + MachineBasicBlock &MBB = *BB;
230 + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
232 + MachineInstr &MI = *I;
234 + unsigned numOperands = MI.getNumOperands();
235 + for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
236 + MachineOperand & MO = MI.getOperand(op_idx);
238 + unsigned width = 0;
239 + bool isSGPR = false;
246 + if (reg == AMDGPU::VCC) {
253 + case AMDGPU::SI_LITERAL_CONSTANT:
254 + case AMDGPU::SREG_LIT_0:
259 + if (AMDGPU::SReg_32RegClass.contains(reg)) {
262 + } else if (AMDGPU::VReg_32RegClass.contains(reg)) {
265 + } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
268 + } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
271 + } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
274 + } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
277 + } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
281 + assert(!"Unknown register class");
283 + hwReg = RI->getEncodingValue(reg);
284 + maxUsed = hwReg + width - 1;
286 + MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
288 + MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
296 + SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
297 + OutStreamer.EmitIntValue(MaxSGPR + 1, 4);
298 + OutStreamer.EmitIntValue(MaxVGPR + 1, 4);
299 + OutStreamer.EmitIntValue(MFI->SPIPSInputAddr, 4);
301 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUAsmPrinter.h llvm-r600/lib/Target/R600/AMDGPUAsmPrinter.h
302 --- llvm-3.2.src/lib/Target/R600/AMDGPUAsmPrinter.h 1970-01-01 01:00:00.000000000 +0100
303 +++ llvm-r600/lib/Target/R600/AMDGPUAsmPrinter.h 2013-01-25 19:43:57.426716388 +0100
305 +//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code -------------------===//
307 +// The LLVM Compiler Infrastructure
309 +// This file is distributed under the University of Illinois Open Source
310 +// License. See LICENSE.TXT for details.
312 +//===----------------------------------------------------------------------===//
315 +/// \brief AMDGPU Assembly printer class.
317 +//===----------------------------------------------------------------------===//
319 +#ifndef AMDGPU_ASMPRINTER_H
320 +#define AMDGPU_ASMPRINTER_H
322 +#include "llvm/CodeGen/AsmPrinter.h"
326 +class AMDGPUAsmPrinter : public AsmPrinter {
329 + explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
330 + : AsmPrinter(TM, Streamer) { }
332 + virtual bool runOnMachineFunction(MachineFunction &MF);
334 + virtual const char *getPassName() const {
335 + return "AMDGPU Assembly Printer";
338 + /// \brief Emit register usage information so that the GPU driver
339 + /// can correctly setup the GPU state.
340 + void EmitProgramInfo(MachineFunction &MF);
342 + /// Implemented in AMDGPUMCInstLower.cpp
343 + virtual void EmitInstruction(const MachineInstr *MI);
346 +} // End namespace llvm
348 +#endif //AMDGPU_ASMPRINTER_H
349 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUCodeEmitter.h llvm-r600/lib/Target/R600/AMDGPUCodeEmitter.h
350 --- llvm-3.2.src/lib/Target/R600/AMDGPUCodeEmitter.h 1970-01-01 01:00:00.000000000 +0100
351 +++ llvm-r600/lib/Target/R600/AMDGPUCodeEmitter.h 2013-01-25 19:43:57.426716388 +0100
353 +//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===//
355 +// The LLVM Compiler Infrastructure
357 +// This file is distributed under the University of Illinois Open Source
358 +// License. See LICENSE.TXT for details.
360 +//===----------------------------------------------------------------------===//
363 +/// \brief CodeEmitter interface for R600 and SI codegen.
365 +//===----------------------------------------------------------------------===//
367 +#ifndef AMDGPUCODEEMITTER_H
368 +#define AMDGPUCODEEMITTER_H
372 +class AMDGPUCodeEmitter {
374 + uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
375 + virtual uint64_t getMachineOpValue(const MachineInstr &MI,
376 + const MachineOperand &MO) const { return 0; }
377 + virtual unsigned GPR4AlignEncode(const MachineInstr &MI,
378 + unsigned OpNo) const {
381 + virtual unsigned GPR2AlignEncode(const MachineInstr &MI,
382 + unsigned OpNo) const {
385 + virtual uint64_t VOPPostEncode(const MachineInstr &MI,
386 + uint64_t Value) const {
389 + virtual uint64_t i32LiteralEncode(const MachineInstr &MI,
390 + unsigned OpNo) const {
393 + virtual uint32_t SMRDmemriEncode(const MachineInstr &MI, unsigned OpNo)
399 +} // End namespace llvm
401 +#endif // AMDGPUCODEEMITTER_H
402 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUConvertToISA.cpp llvm-r600/lib/Target/R600/AMDGPUConvertToISA.cpp
403 --- llvm-3.2.src/lib/Target/R600/AMDGPUConvertToISA.cpp 1970-01-01 01:00:00.000000000 +0100
404 +++ llvm-r600/lib/Target/R600/AMDGPUConvertToISA.cpp 2013-01-25 19:43:57.426716388 +0100
406 +//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===//
408 +// The LLVM Compiler Infrastructure
410 +// This file is distributed under the University of Illinois Open Source
411 +// License. See LICENSE.TXT for details.
413 +//===----------------------------------------------------------------------===//
416 +/// \brief This pass lowers AMDIL machine instructions to the appropriate
417 +/// hardware instructions.
419 +//===----------------------------------------------------------------------===//
422 +#include "AMDGPUInstrInfo.h"
423 +#include "llvm/CodeGen/MachineFunctionPass.h"
425 +using namespace llvm;
429 +class AMDGPUConvertToISAPass : public MachineFunctionPass {
436 + AMDGPUConvertToISAPass(TargetMachine &tm) :
437 + MachineFunctionPass(ID), TM(tm) { }
439 + virtual bool runOnMachineFunction(MachineFunction &MF);
441 + virtual const char *getPassName() const {return "AMDGPU Convert to ISA";}
445 +} // End anonymous namespace
447 +char AMDGPUConvertToISAPass::ID = 0;
449 +FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) {
450 + return new AMDGPUConvertToISAPass(tm);
453 +bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF) {
454 + const AMDGPUInstrInfo * TII =
455 + static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo());
457 + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
458 + BB != BB_E; ++BB) {
459 + MachineBasicBlock &MBB = *BB;
460 + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
462 + MachineInstr &MI = *I;
463 + TII->convertToISA(MI, MF, MBB.findDebugLoc(I));
468 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPU.h llvm-r600/lib/Target/R600/AMDGPU.h
469 --- llvm-3.2.src/lib/Target/R600/AMDGPU.h 1970-01-01 01:00:00.000000000 +0100
470 +++ llvm-r600/lib/Target/R600/AMDGPU.h 2013-01-25 19:43:57.423383055 +0100
472 +//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=//
474 +// The LLVM Compiler Infrastructure
476 +// This file is distributed under the University of Illinois Open Source
477 +// License. See LICENSE.TXT for details.
480 +//===----------------------------------------------------------------------===//
485 +#include "AMDGPUTargetMachine.h"
486 +#include "llvm/Support/TargetRegistry.h"
487 +#include "llvm/Target/TargetMachine.h"
492 +class AMDGPUTargetMachine;
495 +FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
496 +FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
497 +FunctionPass *createR600LowerConstCopy(TargetMachine &tm);
500 +FunctionPass *createSIAnnotateControlFlowPass();
501 +FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
502 +FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
503 +FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
504 +FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm);
505 +FunctionPass *createSIInsertWaits(TargetMachine &tm);
507 +// Passes common to R600 and SI
508 +Pass *createAMDGPUStructurizeCFGPass();
509 +FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
511 +} // End namespace llvm
513 +namespace ShaderType {
523 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.cpp llvm-r600/lib/Target/R600/AMDGPUInstrInfo.cpp
524 --- llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.cpp 1970-01-01 01:00:00.000000000 +0100
525 +++ llvm-r600/lib/Target/R600/AMDGPUInstrInfo.cpp 2013-01-25 19:43:57.426716388 +0100
527 +//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===//
529 +// The LLVM Compiler Infrastructure
531 +// This file is distributed under the University of Illinois Open Source
532 +// License. See LICENSE.TXT for details.
534 +//===----------------------------------------------------------------------===//
537 +/// \brief Implementation of the TargetInstrInfo class that is common to all
540 +//===----------------------------------------------------------------------===//
542 +#include "AMDGPUInstrInfo.h"
543 +#include "AMDGPURegisterInfo.h"
544 +#include "AMDGPUTargetMachine.h"
546 +#include "llvm/CodeGen/MachineFrameInfo.h"
547 +#include "llvm/CodeGen/MachineInstrBuilder.h"
548 +#include "llvm/CodeGen/MachineRegisterInfo.h"
550 +#define GET_INSTRINFO_CTOR
551 +#include "AMDGPUGenInstrInfo.inc"
553 +using namespace llvm;
555 +AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm)
556 + : AMDGPUGenInstrInfo(0,0), RI(tm, *this), TM(tm) { }
558 +const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const {
562 +bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
563 + unsigned &SrcReg, unsigned &DstReg,
564 + unsigned &SubIdx) const {
565 +// TODO: Implement this function
569 +unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
570 + int &FrameIndex) const {
571 +// TODO: Implement this function
575 +unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
576 + int &FrameIndex) const {
577 +// TODO: Implement this function
581 +bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
582 + const MachineMemOperand *&MMO,
583 + int &FrameIndex) const {
584 +// TODO: Implement this function
587 +unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI,
588 + int &FrameIndex) const {
589 +// TODO: Implement this function
592 +unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI,
593 + int &FrameIndex) const {
594 +// TODO: Implement this function
597 +bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI,
598 + const MachineMemOperand *&MMO,
599 + int &FrameIndex) const {
600 +// TODO: Implement this function
605 +AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
606 + MachineBasicBlock::iterator &MBBI,
607 + LiveVariables *LV) const {
608 +// TODO: Implement this function
611 +bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter,
612 + MachineBasicBlock &MBB) const {
613 + while (iter != MBB.end()) {
614 + switch (iter->getOpcode()) {
617 + case AMDGPU::BRANCH_COND_i32:
618 + case AMDGPU::BRANCH_COND_f32:
619 + case AMDGPU::BRANCH:
627 +MachineBasicBlock::iterator skipFlowControl(MachineBasicBlock *MBB) {
628 + MachineBasicBlock::iterator tmp = MBB->end();
629 + if (!MBB->size()) {
633 + if (tmp->getOpcode() == AMDGPU::ENDLOOP
634 + || tmp->getOpcode() == AMDGPU::ENDIF
635 + || tmp->getOpcode() == AMDGPU::ELSE) {
636 + if (tmp == MBB->begin()) {
649 +AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
650 + MachineBasicBlock::iterator MI,
651 + unsigned SrcReg, bool isKill,
653 + const TargetRegisterClass *RC,
654 + const TargetRegisterInfo *TRI) const {
655 + assert(!"Not Implemented");
659 +AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
660 + MachineBasicBlock::iterator MI,
661 + unsigned DestReg, int FrameIndex,
662 + const TargetRegisterClass *RC,
663 + const TargetRegisterInfo *TRI) const {
664 + assert(!"Not Implemented");
668 +AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
670 + const SmallVectorImpl<unsigned> &Ops,
671 + int FrameIndex) const {
672 +// TODO: Implement this function
676 +AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
678 + const SmallVectorImpl<unsigned> &Ops,
679 + MachineInstr *LoadMI) const {
680 + // TODO: Implement this function
684 +AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
685 + const SmallVectorImpl<unsigned> &Ops) const {
686 + // TODO: Implement this function
690 +AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
691 + unsigned Reg, bool UnfoldLoad,
693 + SmallVectorImpl<MachineInstr*> &NewMIs) const {
694 + // TODO: Implement this function
699 +AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
700 + SmallVectorImpl<SDNode*> &NewNodes) const {
701 + // TODO: Implement this function
706 +AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
707 + bool UnfoldLoad, bool UnfoldStore,
708 + unsigned *LoadRegIndex) const {
709 + // TODO: Implement this function
713 +bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
714 + int64_t Offset1, int64_t Offset2,
715 + unsigned NumLoads) const {
716 + assert(Offset2 > Offset1
717 + && "Second offset should be larger than first offset!");
718 + // If we have less than 16 loads in a row, and the offsets are within 16,
719 + // then schedule together.
720 + // TODO: Make the loads schedule near if it fits in a cacheline
721 + return (NumLoads < 16 && (Offset2 - Offset1) < 16);
725 +AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
727 + // TODO: Implement this function
730 +void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB,
731 + MachineBasicBlock::iterator MI) const {
732 + // TODO: Implement this function
735 +bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const {
736 + // TODO: Implement this function
740 +AMDGPUInstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
741 + const SmallVectorImpl<MachineOperand> &Pred2)
743 + // TODO: Implement this function
747 +bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI,
748 + std::vector<MachineOperand> &Pred) const {
749 + // TODO: Implement this function
753 +bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const {
754 + // TODO: Implement this function
755 + return MI->getDesc().isPredicable();
759 +AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
760 + // TODO: Implement this function
764 +void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
765 + DebugLoc DL) const {
766 + MachineRegisterInfo &MRI = MF.getRegInfo();
767 + const AMDGPURegisterInfo & RI = getRegisterInfo();
769 + for (unsigned i = 0; i < MI.getNumOperands(); i++) {
770 + MachineOperand &MO = MI.getOperand(i);
771 + // Convert dst regclass to one that is supported by the ISA
772 + if (MO.isReg() && MO.isDef()) {
773 + if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
774 + const TargetRegisterClass * oldRegClass = MRI.getRegClass(MO.getReg());
775 + const TargetRegisterClass * newRegClass = RI.getISARegClass(oldRegClass);
777 + assert(newRegClass);
779 + MRI.setRegClass(MO.getReg(), newRegClass);
784 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.h llvm-r600/lib/Target/R600/AMDGPUInstrInfo.h
785 --- llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.h 1970-01-01 01:00:00.000000000 +0100
786 +++ llvm-r600/lib/Target/R600/AMDGPUInstrInfo.h 2013-01-25 19:43:57.430049721 +0100
788 +//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===//
790 +// The LLVM Compiler Infrastructure
792 +// This file is distributed under the University of Illinois Open Source
793 +// License. See LICENSE.TXT for details.
795 +//===----------------------------------------------------------------------===//
798 +/// \brief Contains the definition of a TargetInstrInfo class that is common
799 +/// to all AMD GPUs.
801 +//===----------------------------------------------------------------------===//
803 +#ifndef AMDGPUINSTRUCTIONINFO_H
804 +#define AMDGPUINSTRUCTIONINFO_H
806 +#include "AMDGPURegisterInfo.h"
807 +#include "AMDGPUInstrInfo.h"
808 +#include "llvm/Target/TargetInstrInfo.h"
812 +#define GET_INSTRINFO_HEADER
813 +#define GET_INSTRINFO_ENUM
814 +#include "AMDGPUGenInstrInfo.inc"
816 +#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT
817 +#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT
818 +#define OPCODE_IS_ZERO AMDGPU::PRED_SETE
819 +#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE
823 +class AMDGPUTargetMachine;
824 +class MachineFunction;
826 +class MachineInstrBuilder;
828 +class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
830 + const AMDGPURegisterInfo RI;
832 + bool getNextBranchInstr(MachineBasicBlock::iterator &iter,
833 + MachineBasicBlock &MBB) const;
835 + explicit AMDGPUInstrInfo(TargetMachine &tm);
837 + virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0;
839 + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
840 + unsigned &DstReg, unsigned &SubIdx) const;
842 + unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
843 + unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
844 + int &FrameIndex) const;
845 + bool hasLoadFromStackSlot(const MachineInstr *MI,
846 + const MachineMemOperand *&MMO,
847 + int &FrameIndex) const;
848 + unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
849 + unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI,
850 + int &FrameIndex) const;
851 + bool hasStoreFromStackSlot(const MachineInstr *MI,
852 + const MachineMemOperand *&MMO,
853 + int &FrameIndex) const;
856 + convertToThreeAddress(MachineFunction::iterator &MFI,
857 + MachineBasicBlock::iterator &MBBI,
858 + LiveVariables *LV) const;
861 + virtual void copyPhysReg(MachineBasicBlock &MBB,
862 + MachineBasicBlock::iterator MI, DebugLoc DL,
863 + unsigned DestReg, unsigned SrcReg,
864 + bool KillSrc) const = 0;
866 + void storeRegToStackSlot(MachineBasicBlock &MBB,
867 + MachineBasicBlock::iterator MI,
868 + unsigned SrcReg, bool isKill, int FrameIndex,
869 + const TargetRegisterClass *RC,
870 + const TargetRegisterInfo *TRI) const;
871 + void loadRegFromStackSlot(MachineBasicBlock &MBB,
872 + MachineBasicBlock::iterator MI,
873 + unsigned DestReg, int FrameIndex,
874 + const TargetRegisterClass *RC,
875 + const TargetRegisterInfo *TRI) const;
878 + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
880 + const SmallVectorImpl<unsigned> &Ops,
881 + int FrameIndex) const;
882 + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
884 + const SmallVectorImpl<unsigned> &Ops,
885 + MachineInstr *LoadMI) const;
887 + bool canFoldMemoryOperand(const MachineInstr *MI,
888 + const SmallVectorImpl<unsigned> &Ops) const;
889 + bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
890 + unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
891 + SmallVectorImpl<MachineInstr *> &NewMIs) const;
892 + bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
893 + SmallVectorImpl<SDNode *> &NewNodes) const;
894 + unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
895 + bool UnfoldLoad, bool UnfoldStore,
896 + unsigned *LoadRegIndex = 0) const;
897 + bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
898 + int64_t Offset1, int64_t Offset2,
899 + unsigned NumLoads) const;
901 + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
902 + void insertNoop(MachineBasicBlock &MBB,
903 + MachineBasicBlock::iterator MI) const;
904 + bool isPredicated(const MachineInstr *MI) const;
905 + bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
906 + const SmallVectorImpl<MachineOperand> &Pred2) const;
907 + bool DefinesPredicate(MachineInstr *MI,
908 + std::vector<MachineOperand> &Pred) const;
909 + bool isPredicable(MachineInstr *MI) const;
910 + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
912 + // Helper functions that check the opcode for status information
913 + bool isLoadInst(llvm::MachineInstr *MI) const;
914 + bool isExtLoadInst(llvm::MachineInstr *MI) const;
915 + bool isSWSExtLoadInst(llvm::MachineInstr *MI) const;
916 + bool isSExtLoadInst(llvm::MachineInstr *MI) const;
917 + bool isZExtLoadInst(llvm::MachineInstr *MI) const;
918 + bool isAExtLoadInst(llvm::MachineInstr *MI) const;
919 + bool isStoreInst(llvm::MachineInstr *MI) const;
920 + bool isTruncStoreInst(llvm::MachineInstr *MI) const;
922 + virtual MachineInstr* getMovImmInstr(MachineFunction *MF, unsigned DstReg,
923 + int64_t Imm) const = 0;
924 + virtual unsigned getIEQOpcode() const = 0;
925 + virtual bool isMov(unsigned opcode) const = 0;
927 + /// \brief Convert the AMDIL MachineInstr to a supported ISA
929 + virtual void convertToISA(MachineInstr & MI, MachineFunction &MF,
930 + DebugLoc DL) const;
934 +} // End llvm namespace
936 +#endif // AMDGPUINSTRUCTIONINFO_H
937 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.td llvm-r600/lib/Target/R600/AMDGPUInstrInfo.td
938 --- llvm-3.2.src/lib/Target/R600/AMDGPUInstrInfo.td 1970-01-01 01:00:00.000000000 +0100
939 +++ llvm-r600/lib/Target/R600/AMDGPUInstrInfo.td 2013-01-25 19:43:57.430049721 +0100
941 +//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===//
943 +// The LLVM Compiler Infrastructure
945 +// This file is distributed under the University of Illinois Open Source
946 +// License. See LICENSE.TXT for details.
948 +//===----------------------------------------------------------------------===//
950 +// This file contains DAG node definitions for the AMDGPU target.
952 +//===----------------------------------------------------------------------===//
954 +//===----------------------------------------------------------------------===//
955 +// AMDGPU DAG Profiles
956 +//===----------------------------------------------------------------------===//
958 +def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
959 + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
962 +//===----------------------------------------------------------------------===//
966 +// out = ((a << 32) | b) >> c
968 +// Can be used to optimize rotl:
969 +// rotl(a, b) = bitalign(a, a, 32 - b)
970 +def AMDGPUbitalign : SDNode<"AMDGPUISD::BITALIGN", AMDGPUDTIntTernaryOp>;
972 +// This argument to this node is a dword address.
973 +def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
975 +// out = a - floor(a)
976 +def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
978 +// out = max(a, b) a and b are floats
979 +def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
980 + [SDNPCommutative, SDNPAssociative]
983 +// out = max(a, b) a and b are signed ints
984 +def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
985 + [SDNPCommutative, SDNPAssociative]
988 +// out = max(a, b) a and b are unsigned ints
989 +def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp,
990 + [SDNPCommutative, SDNPAssociative]
993 +// out = min(a, b) a and b are floats
994 +def AMDGPUfmin : SDNode<"AMDGPUISD::FMIN", SDTFPBinOp,
995 + [SDNPCommutative, SDNPAssociative]
998 +// out = min(a, b) a and b are signed ints
999 +def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp,
1000 + [SDNPCommutative, SDNPAssociative]
1003 +// out = min(a, b) a and b are unsigned ints
1004 +def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp,
1005 + [SDNPCommutative, SDNPAssociative]
1008 +// urecip - This operation is a helper for integer division, it returns the
1009 +// result of 1 / a as a fractional unsigned integer.
1010 +// out = (2^32 / a) + e
1011 +// e is rounding error
1012 +def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
1014 +def fpow : SDNode<"ISD::FPOW", SDTFPBinOp>;
1015 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUInstructions.td llvm-r600/lib/Target/R600/AMDGPUInstructions.td
1016 --- llvm-3.2.src/lib/Target/R600/AMDGPUInstructions.td 1970-01-01 01:00:00.000000000 +0100
1017 +++ llvm-r600/lib/Target/R600/AMDGPUInstructions.td 2013-01-25 19:43:57.430049721 +0100
1019 +//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===//
1021 +// The LLVM Compiler Infrastructure
1023 +// This file is distributed under the University of Illinois Open Source
1024 +// License. See LICENSE.TXT for details.
1026 +//===----------------------------------------------------------------------===//
1028 +// This file contains instruction defs that are common to all hw codegen
1031 +//===----------------------------------------------------------------------===//
1033 +class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction {
1034 + field bits<16> AMDILOp = 0;
1035 + field bits<3> Gen = 0;
1037 + let Namespace = "AMDGPU";
1038 + let OutOperandList = outs;
1039 + let InOperandList = ins;
1040 + let AsmString = asm;
1041 + let Pattern = pattern;
1042 + let Itinerary = NullALU;
1043 + let TSFlags{42-40} = Gen;
1044 + let TSFlags{63-48} = AMDILOp;
1047 +class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
1048 + : AMDGPUInst<outs, ins, asm, pattern> {
1050 + field bits<32> Inst = 0xffffffff;
1054 +def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
1056 +def COND_EQ : PatLeaf <
1058 + [{switch(N->get()){{default: return false;
1059 + case ISD::SETOEQ: case ISD::SETUEQ:
1060 + case ISD::SETEQ: return true;}}}]
1063 +def COND_NE : PatLeaf <
1065 + [{switch(N->get()){{default: return false;
1066 + case ISD::SETONE: case ISD::SETUNE:
1067 + case ISD::SETNE: return true;}}}]
1069 +def COND_GT : PatLeaf <
1071 + [{switch(N->get()){{default: return false;
1072 + case ISD::SETOGT: case ISD::SETUGT:
1073 + case ISD::SETGT: return true;}}}]
1076 +def COND_GE : PatLeaf <
1078 + [{switch(N->get()){{default: return false;
1079 + case ISD::SETOGE: case ISD::SETUGE:
1080 + case ISD::SETGE: return true;}}}]
1083 +def COND_LT : PatLeaf <
1085 + [{switch(N->get()){{default: return false;
1086 + case ISD::SETOLT: case ISD::SETULT:
1087 + case ISD::SETLT: return true;}}}]
1090 +def COND_LE : PatLeaf <
1092 + [{switch(N->get()){{default: return false;
1093 + case ISD::SETOLE: case ISD::SETULE:
1094 + case ISD::SETLE: return true;}}}]
1097 +//===----------------------------------------------------------------------===//
1098 +// Load/Store Pattern Fragments
1099 +//===----------------------------------------------------------------------===//
1101 +def zextloadi8_global : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{
1102 + return isGlobalLoad(dyn_cast<LoadSDNode>(N));
1106 +int TWO_PI = 0x40c90fdb;
1107 +int PI = 0x40490fdb;
1108 +int TWO_PI_INV = 0x3e22f983;
1110 +def CONST : Constants;
1112 +def FP_ZERO : PatLeaf <
1114 + [{return N->getValueAPF().isZero();}]
1117 +def FP_ONE : PatLeaf <
1119 + [{return N->isExactlyValue(1.0);}]
1122 +let isCodeGenOnly = 1, isPseudo = 1, usesCustomInserter = 1 in {
1124 +class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
1127 + "CLAMP $dst, $src0",
1128 + [(set rc:$dst, (int_AMDIL_clamp rc:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
1131 +class FABS <RegisterClass rc> : AMDGPUShaderInst <
1134 + "FABS $dst, $src0",
1135 + [(set rc:$dst, (fabs rc:$src0))]
1138 +class FNEG <RegisterClass rc> : AMDGPUShaderInst <
1141 + "FNEG $dst, $src0",
1142 + [(set rc:$dst, (fneg rc:$src0))]
1145 +def SHADER_TYPE : AMDGPUShaderInst <
1147 + (ins i32imm:$type),
1148 + "SHADER_TYPE $type",
1149 + [(int_AMDGPU_shader_type imm:$type)]
1152 +} // End isCodeGenOnly = 1, isPseudo = 1, usesCustomInserter = 1
1154 +/* Generic helper patterns for intrinsics */
1155 +/* -------------------------------------- */
1157 +class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul,
1158 + RegisterClass rc> : Pat <
1159 + (fpow rc:$src0, rc:$src1),
1160 + (exp_ieee (mul rc:$src1, (log_ieee rc:$src0)))
1163 +/* Other helper patterns */
1164 +/* --------------------- */
1166 +/* Extract element pattern */
1167 +class Extract_Element <ValueType sub_type, ValueType vec_type,
1168 + RegisterClass vec_class, int sub_idx,
1169 + SubRegIndex sub_reg>: Pat<
1170 + (sub_type (vector_extract (vec_type vec_class:$src), sub_idx)),
1171 + (EXTRACT_SUBREG vec_class:$src, sub_reg)
1174 +/* Insert element pattern */
1175 +class Insert_Element <ValueType elem_type, ValueType vec_type,
1176 + RegisterClass elem_class, RegisterClass vec_class,
1177 + int sub_idx, SubRegIndex sub_reg> : Pat <
1179 + (vec_type (vector_insert (vec_type vec_class:$vec),
1180 + (elem_type elem_class:$elem), sub_idx)),
1181 + (INSERT_SUBREG vec_class:$vec, elem_class:$elem, sub_reg)
1184 +// Vector Build pattern
1185 +class Vector_Build <ValueType vecType, RegisterClass vectorClass,
1186 + ValueType elemType, RegisterClass elemClass> : Pat <
1187 + (vecType (build_vector (elemType elemClass:$x), (elemType elemClass:$y),
1188 + (elemType elemClass:$z), (elemType elemClass:$w))),
1189 + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
1190 + (vecType (IMPLICIT_DEF)), elemClass:$x, sel_x), elemClass:$y, sel_y),
1191 + elemClass:$z, sel_z), elemClass:$w, sel_w)
1194 +// bitconvert pattern
1195 +class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat <
1196 + (dt (bitconvert (st rc:$src0))),
1200 +class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat <
1201 + (vt (AMDGPUdwordaddr (vt rc:$addr))),
1205 +include "R600Instructions.td"
1207 +include "SIInstrInfo.td"
1209 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUIntrinsics.td llvm-r600/lib/Target/R600/AMDGPUIntrinsics.td
1210 --- llvm-3.2.src/lib/Target/R600/AMDGPUIntrinsics.td 1970-01-01 01:00:00.000000000 +0100
1211 +++ llvm-r600/lib/Target/R600/AMDGPUIntrinsics.td 2013-01-25 19:43:57.430049721 +0100
1213 +//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*-----------===//
1215 +// The LLVM Compiler Infrastructure
1217 +// This file is distributed under the University of Illinois Open Source
1218 +// License. See LICENSE.TXT for details.
1220 +//===----------------------------------------------------------------------===//
1222 +// This file defines intrinsics that are used by all hw codegen targets.
1224 +//===----------------------------------------------------------------------===//
1226 +let TargetPrefix = "AMDGPU", isTarget = 1 in {
1228 + def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
1229 + def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>;
1230 + def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
1231 + def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
1232 + def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
1234 + def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
1235 + def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1236 + def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1237 + def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
1238 + def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
1239 + def int_AMDGPU_kilp : Intrinsic<[], [], []>;
1240 + def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1241 + def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1242 + def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1243 + def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
1244 + def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
1245 + def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1246 + def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1247 + def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1248 + def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1249 + def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1250 + def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
1251 + def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1252 + def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1253 + def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1254 + def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1255 + def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1256 + def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1257 + def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
1258 + def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1259 + def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1260 + def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1261 + def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1262 + def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1263 + def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
1264 + def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
1266 + def int_AMDGPU_shader_type : Intrinsic<[], [llvm_i32_ty], []>;
1269 +let TargetPrefix = "TGSI", isTarget = 1 in {
1271 + def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>;
1274 +include "SIIntrinsics.td"
1275 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUISelLowering.cpp llvm-r600/lib/Target/R600/AMDGPUISelLowering.cpp
1276 --- llvm-3.2.src/lib/Target/R600/AMDGPUISelLowering.cpp 1970-01-01 01:00:00.000000000 +0100
1277 +++ llvm-r600/lib/Target/R600/AMDGPUISelLowering.cpp 2013-01-25 19:43:57.426716388 +0100
1279 +//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
1281 +// The LLVM Compiler Infrastructure
1283 +// This file is distributed under the University of Illinois Open Source
1284 +// License. See LICENSE.TXT for details.
1286 +//===----------------------------------------------------------------------===//
1289 +/// \brief This is the parent TargetLowering class for hardware code gen
1292 +//===----------------------------------------------------------------------===//
1294 +#include "AMDGPUISelLowering.h"
1295 +#include "AMDILIntrinsicInfo.h"
1296 +#include "llvm/CodeGen/MachineFunction.h"
1297 +#include "llvm/CodeGen/MachineRegisterInfo.h"
1298 +#include "llvm/CodeGen/SelectionDAG.h"
1299 +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
1301 +using namespace llvm;
1303 +AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
1304 + TargetLowering(TM, new TargetLoweringObjectFileELF()) {
1306 + // Initialize target lowering borrowed from AMDIL
1307 + InitAMDILLowering();
1309 + // We need to custom lower some of the intrinsics
1310 + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1312 + // Library functions. These default to Expand, but we have instructions
1314 + setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1315 + setOperationAction(ISD::FEXP2, MVT::f32, Legal);
1316 + setOperationAction(ISD::FPOW, MVT::f32, Legal);
1317 + setOperationAction(ISD::FLOG2, MVT::f32, Legal);
1318 + setOperationAction(ISD::FABS, MVT::f32, Legal);
1319 + setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1320 + setOperationAction(ISD::FRINT, MVT::f32, Legal);
1322 + // Lower floating point store/load to integer store/load to reduce the number
1323 + // of patterns in tablegen.
1324 + setOperationAction(ISD::STORE, MVT::f32, Promote);
1325 + AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
1327 + setOperationAction(ISD::STORE, MVT::v4f32, Promote);
1328 + AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
1330 + setOperationAction(ISD::LOAD, MVT::f32, Promote);
1331 + AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
1333 + setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
1334 + AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
1336 + setOperationAction(ISD::UDIV, MVT::i32, Expand);
1337 + setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
1338 + setOperationAction(ISD::UREM, MVT::i32, Expand);
1341 +//===---------------------------------------------------------------------===//
1342 +// TargetLowering Callbacks
1343 +//===---------------------------------------------------------------------===//
1345 +SDValue AMDGPUTargetLowering::LowerFormalArguments(
1347 + CallingConv::ID CallConv,
1349 + const SmallVectorImpl<ISD::InputArg> &Ins,
1350 + DebugLoc DL, SelectionDAG &DAG,
1351 + SmallVectorImpl<SDValue> &InVals) const {
1352 + for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1353 + InVals.push_back(SDValue());
1358 +SDValue AMDGPUTargetLowering::LowerReturn(
1360 + CallingConv::ID CallConv,
1362 + const SmallVectorImpl<ISD::OutputArg> &Outs,
1363 + const SmallVectorImpl<SDValue> &OutVals,
1364 + DebugLoc DL, SelectionDAG &DAG) const {
1365 + return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
1368 +//===---------------------------------------------------------------------===//
1369 +// Target specific lowering
1370 +//===---------------------------------------------------------------------===//
1372 +SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
1374 + switch (Op.getOpcode()) {
1376 + Op.getNode()->dump();
1377 + assert(0 && "Custom lowering code for this"
1378 + "instruction is not implemented yet!");
1380 + // AMDIL DAG lowering
1381 + case ISD::SDIV: return LowerSDIV(Op, DAG);
1382 + case ISD::SREM: return LowerSREM(Op, DAG);
1383 + case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1384 + case ISD::BRCOND: return LowerBRCOND(Op, DAG);
1385 + // AMDGPU DAG lowering
1386 + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
1387 + case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1392 +SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
1393 + SelectionDAG &DAG) const {
1394 + unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1395 + DebugLoc DL = Op.getDebugLoc();
1396 + EVT VT = Op.getValueType();
1398 + switch (IntrinsicID) {
1399 + default: return Op;
1400 + case AMDGPUIntrinsic::AMDIL_abs:
1401 + return LowerIntrinsicIABS(Op, DAG);
1402 + case AMDGPUIntrinsic::AMDIL_exp:
1403 + return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
1404 + case AMDGPUIntrinsic::AMDGPU_lrp:
1405 + return LowerIntrinsicLRP(Op, DAG);
1406 + case AMDGPUIntrinsic::AMDIL_fraction:
1407 + return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
1408 + case AMDGPUIntrinsic::AMDIL_mad:
1409 + return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1),
1410 + Op.getOperand(2), Op.getOperand(3));
1411 + case AMDGPUIntrinsic::AMDIL_max:
1412 + return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1),
1413 + Op.getOperand(2));
1414 + case AMDGPUIntrinsic::AMDGPU_imax:
1415 + return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
1416 + Op.getOperand(2));
1417 + case AMDGPUIntrinsic::AMDGPU_umax:
1418 + return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1),
1419 + Op.getOperand(2));
1420 + case AMDGPUIntrinsic::AMDIL_min:
1421 + return DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1),
1422 + Op.getOperand(2));
1423 + case AMDGPUIntrinsic::AMDGPU_imin:
1424 + return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1),
1425 + Op.getOperand(2));
1426 + case AMDGPUIntrinsic::AMDGPU_umin:
1427 + return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
1428 + Op.getOperand(2));
1429 + case AMDGPUIntrinsic::AMDIL_round_nearest:
1430 + return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
1434 +/// IABS(a) = SMAX(sub(0, a), a)
1435 +SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
1436 + SelectionDAG &DAG) const {
1438 + DebugLoc DL = Op.getDebugLoc();
1439 + EVT VT = Op.getValueType();
1440 + SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
1441 + Op.getOperand(1));
1443 + return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1));
1446 +/// Linear Interpolation
1447 +/// LRP(a, b, c) = muladd(a, b, (1 - a) * c)
1448 +SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
1449 + SelectionDAG &DAG) const {
1450 + DebugLoc DL = Op.getDebugLoc();
1451 + EVT VT = Op.getValueType();
1452 + SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
1453 + DAG.getConstantFP(1.0f, MVT::f32),
1454 + Op.getOperand(1));
1455 + SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA,
1456 + Op.getOperand(3));
1457 + return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1),
1462 +/// \brief Generate Min/Max node
1463 +SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
1464 + SelectionDAG &DAG) const {
1465 + DebugLoc DL = Op.getDebugLoc();
1466 + EVT VT = Op.getValueType();
1468 + SDValue LHS = Op.getOperand(0);
1469 + SDValue RHS = Op.getOperand(1);
1470 + SDValue True = Op.getOperand(2);
1471 + SDValue False = Op.getOperand(3);
1472 + SDValue CC = Op.getOperand(4);
1474 + if (VT != MVT::f32 ||
1475 + !((LHS == True && RHS == False) || (LHS == False && RHS == True))) {
1479 + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1480 + switch (CCOpcode) {
1487 + case ISD::SETFALSE:
1488 + case ISD::SETFALSE2:
1489 + case ISD::SETTRUE:
1490 + case ISD::SETTRUE2:
1493 + assert(0 && "Operation should already be optimised !");
1499 + case ISD::SETLT: {
1501 + return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
1503 + return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS);
1510 + case ISD::SETOGT: {
1512 + return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS);
1514 + return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
1516 + case ISD::SETCC_INVALID:
1517 + assert(0 && "Invalid setcc condcode !");
1524 +SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1525 + SelectionDAG &DAG) const {
1526 + DebugLoc DL = Op.getDebugLoc();
1527 + EVT VT = Op.getValueType();
1529 + SDValue Num = Op.getOperand(0);
1530 + SDValue Den = Op.getOperand(1);
1532 + SmallVector<SDValue, 8> Results;
1534 + // RCP = URECIP(Den) = 2^32 / Den + e
1535 + // e is rounding error.
1536 + SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1538 +  // RCP_LO = umulo(RCP, Den)
1539 + SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);
1541 +  // RCP_HI = mulhu (RCP, Den)
1542 + SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1544 + // NEG_RCP_LO = -RCP_LO
1545 + SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
1548 + // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1549 + SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
1550 + NEG_RCP_LO, RCP_LO,
1552 + // Calculate the rounding error from the URECIP instruction
1553 + // E = mulhu(ABS_RCP_LO, RCP)
1554 + SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1556 + // RCP_A_E = RCP + E
1557 + SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1559 + // RCP_S_E = RCP - E
1560 + SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1562 + // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
1563 + SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
1566 + // Quotient = mulhu(Tmp0, Num)
1567 + SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1569 + // Num_S_Remainder = Quotient * Den
1570 + SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den);
1572 + // Remainder = Num - Num_S_Remainder
1573 + SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1575 + // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1576 + SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1577 + DAG.getConstant(-1, VT),
1578 + DAG.getConstant(0, VT),
1580 + // Remainder_GE_Zero = (Remainder >= 0 ? -1 : 0)
1581 + SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Remainder,
1582 + DAG.getConstant(0, VT),
1583 + DAG.getConstant(-1, VT),
1584 + DAG.getConstant(0, VT),
1586 + // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1587 + SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1588 + Remainder_GE_Zero);
1590 + // Calculate Division result:
1592 + // Quotient_A_One = Quotient + 1
1593 + SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1594 + DAG.getConstant(1, VT));
1596 + // Quotient_S_One = Quotient - 1
1597 + SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1598 + DAG.getConstant(1, VT));
1600 + // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1601 + SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
1602 + Quotient, Quotient_A_One, ISD::SETEQ);
1604 + // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1605 + Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
1606 + Quotient_S_One, Div, ISD::SETEQ);
1608 + // Calculate Rem result:
1610 + // Remainder_S_Den = Remainder - Den
1611 + SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1613 + // Remainder_A_Den = Remainder + Den
1614 + SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1616 + // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1617 + SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
1618 + Remainder, Remainder_S_Den, ISD::SETEQ);
1620 + // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1621 + Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
1622 + Remainder_A_Den, Rem, ISD::SETEQ);
1626 + return DAG.getMergeValues(Ops, 2, DL);
1629 +//===----------------------------------------------------------------------===//
1630 +// Helper functions
1631 +//===----------------------------------------------------------------------===//
1633 +bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
1634 + if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
1635 + return CFP->isExactlyValue(1.0);
1637 + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
1638 + return C->isAllOnesValue();
1643 +bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
1644 + if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
1645 + return CFP->getValueAPF().isZero();
1647 + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
1648 + return C->isNullValue();
1653 +SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
1654 + const TargetRegisterClass *RC,
1655 + unsigned Reg, EVT VT) const {
1656 + MachineFunction &MF = DAG.getMachineFunction();
1657 + MachineRegisterInfo &MRI = MF.getRegInfo();
1658 + unsigned VirtualRegister;
1659 + if (!MRI.isLiveIn(Reg)) {
1660 + VirtualRegister = MRI.createVirtualRegister(RC);
1661 + MRI.addLiveIn(Reg, VirtualRegister);
1663 + VirtualRegister = MRI.getLiveInVirtReg(Reg);
1665 + return DAG.getRegister(VirtualRegister, VT);
1668 +#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
1670 +const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
1672 + default: return 0;
1673 + // AMDIL DAG nodes
1674 + NODE_NAME_CASE(MAD);
1675 + NODE_NAME_CASE(CALL);
1676 + NODE_NAME_CASE(UMUL);
1677 + NODE_NAME_CASE(DIV_INF);
1678 + NODE_NAME_CASE(RET_FLAG);
1679 + NODE_NAME_CASE(BRANCH_COND);
1681 + // AMDGPU DAG nodes
1682 + NODE_NAME_CASE(DWORDADDR)
1683 + NODE_NAME_CASE(FRACT)
1684 + NODE_NAME_CASE(FMAX)
1685 + NODE_NAME_CASE(SMAX)
1686 + NODE_NAME_CASE(UMAX)
1687 + NODE_NAME_CASE(FMIN)
1688 + NODE_NAME_CASE(SMIN)
1689 + NODE_NAME_CASE(UMIN)
1690 + NODE_NAME_CASE(URECIP)
1691 + NODE_NAME_CASE(INTERP)
1692 + NODE_NAME_CASE(INTERP_P0)
1693 + NODE_NAME_CASE(EXPORT)
1694 + NODE_NAME_CASE(CONST_ADDRESS)
1697 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUISelLowering.h llvm-r600/lib/Target/R600/AMDGPUISelLowering.h
1698 --- llvm-3.2.src/lib/Target/R600/AMDGPUISelLowering.h 1970-01-01 01:00:00.000000000 +0100
1699 +++ llvm-r600/lib/Target/R600/AMDGPUISelLowering.h 2013-01-25 19:43:57.426716388 +0100
1701 +//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
1703 +// The LLVM Compiler Infrastructure
1705 +// This file is distributed under the University of Illinois Open Source
1706 +// License. See LICENSE.TXT for details.
1708 +//===----------------------------------------------------------------------===//
1711 +/// \brief Interface definition of the TargetLowering class that is common
1712 +/// to all AMD GPUs.
1714 +//===----------------------------------------------------------------------===//
1716 +#ifndef AMDGPUISELLOWERING_H
1717 +#define AMDGPUISELLOWERING_H
1719 +#include "llvm/Target/TargetLowering.h"
1723 +class MachineRegisterInfo;
1725 +class AMDGPUTargetLowering : public TargetLowering {
1727 + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
1728 + SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
1732 + /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
1733 + /// MachineFunction.
1735 + /// \returns a RegisterSDNode representing Reg.
1736 + SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
1737 + unsigned Reg, EVT VT) const;
1739 + bool isHWTrueValue(SDValue Op) const;
1740 + bool isHWFalseValue(SDValue Op) const;
1743 + AMDGPUTargetLowering(TargetMachine &TM);
1745 + virtual SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
1747 + const SmallVectorImpl<ISD::InputArg> &Ins,
1748 + DebugLoc DL, SelectionDAG &DAG,
1749 + SmallVectorImpl<SDValue> &InVals) const;
1751 + virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
1753 + const SmallVectorImpl<ISD::OutputArg> &Outs,
1754 + const SmallVectorImpl<SDValue> &OutVals,
1755 + DebugLoc DL, SelectionDAG &DAG) const;
1757 + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
1758 + SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
1759 + SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
1760 + SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const;
1761 + virtual const char* getTargetNodeName(unsigned Opcode) const;
1763 +// Functions defined in AMDILISelLowering.cpp
1766 + /// \brief Determine which of the bits specified in \p Mask are known to be
1767 + /// either zero or one and return them in the \p KnownZero and \p KnownOne
1769 + virtual void computeMaskedBitsForTargetNode(const SDValue Op,
1772 + const SelectionDAG &DAG,
1773 + unsigned Depth = 0) const;
1775 + virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info,
1776 + const CallInst &I, unsigned Intrinsic) const;
1778 + /// We want to mark f32/f64 floating point values as legal.
1779 + bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
1781 + /// We don't want to shrink f64/f32 constants.
1782 + bool ShouldShrinkFPConstant(EVT VT) const;
1785 + void InitAMDILLowering();
1786 + SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const;
1787 + SDValue LowerSREM8(SDValue Op, SelectionDAG &DAG) const;
1788 + SDValue LowerSREM16(SDValue Op, SelectionDAG &DAG) const;
1789 + SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const;
1790 + SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const;
1791 + SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
1792 + SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const;
1793 + SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const;
1794 + SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const;
1795 + SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
1796 + EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const;
1797 + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
1798 + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
1801 +namespace AMDGPUISD {
1804 + // AMDIL ISD Opcodes
1805 + FIRST_NUMBER = ISD::BUILTIN_OP_END,
1806 + MAD, // 32bit Fused Multiply Add instruction
1807 + CALL, // Function call based on a single integer
1808 + UMUL, // 32bit unsigned multiplication
1809 + DIV_INF, // Divide with infinity returned on zero divisor
1812 + // End AMDIL ISD Opcodes
1827 + LAST_AMDGPU_ISD_NUMBER
1831 +} // End namespace AMDGPUISD
1836 + SI_FIRST = AMDGPUISD::LAST_AMDGPU_ISD_NUMBER,
1841 +} // End namespace SIISD
1843 +} // End namespace llvm
1845 +#endif // AMDGPUISELLOWERING_H
1846 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUMCInstLower.cpp llvm-r600/lib/Target/R600/AMDGPUMCInstLower.cpp
1847 --- llvm-3.2.src/lib/Target/R600/AMDGPUMCInstLower.cpp 1970-01-01 01:00:00.000000000 +0100
1848 +++ llvm-r600/lib/Target/R600/AMDGPUMCInstLower.cpp 2013-01-25 19:43:57.430049721 +0100
1850 +//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===//
1852 +// The LLVM Compiler Infrastructure
1854 +// This file is distributed under the University of Illinois Open Source
1855 +// License. See LICENSE.TXT for details.
1857 +//===----------------------------------------------------------------------===//
1860 +/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst.
1862 +//===----------------------------------------------------------------------===//
1865 +#include "AMDGPUMCInstLower.h"
1866 +#include "AMDGPUAsmPrinter.h"
1867 +#include "R600InstrInfo.h"
1868 +#include "llvm/CodeGen/MachineBasicBlock.h"
1869 +#include "llvm/CodeGen/MachineInstr.h"
1870 +#include "llvm/Constants.h"
1871 +#include "llvm/MC/MCInst.h"
1872 +#include "llvm/MC/MCStreamer.h"
1873 +#include "llvm/MC/MCExpr.h"
1874 +#include "llvm/Support/ErrorHandling.h"
1876 +using namespace llvm;
1878 +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx):
1882 +void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
1883 + OutMI.setOpcode(MI->getOpcode());
1885 + for (unsigned i = 0, e = MI->getNumExplicitOperands(); i != e; ++i) {
1886 + const MachineOperand &MO = MI->getOperand(i);
1889 + switch (MO.getType()) {
1891 + llvm_unreachable("unknown operand type");
1892 + case MachineOperand::MO_FPImmediate: {
1893 + const APFloat &FloatValue = MO.getFPImm()->getValueAPF();
1894 + assert(&FloatValue.getSemantics() == &APFloat::IEEEsingle &&
1895 + "Only floating point immediates are supported at the moment.");
1896 + MCOp = MCOperand::CreateFPImm(FloatValue.convertToFloat());
1899 + case MachineOperand::MO_Immediate:
1900 + MCOp = MCOperand::CreateImm(MO.getImm());
1902 + case MachineOperand::MO_Register:
1903 + MCOp = MCOperand::CreateReg(MO.getReg());
1905 + case MachineOperand::MO_MachineBasicBlock:
1906 + MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
1907 + MO.getMBB()->getSymbol(), Ctx));
1909 + OutMI.addOperand(MCOp);
1913 +void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
1914 + AMDGPUMCInstLower MCInstLowering(OutContext);
1916 + if (MI->isBundle()) {
1917 + const MachineBasicBlock *MBB = MI->getParent();
1918 + MachineBasicBlock::const_instr_iterator I = MI;
1920 + while (I != MBB->end() && I->isInsideBundle()) {
1921 + MCInst MCBundleInst;
1922 + const MachineInstr *BundledInst = I;
1923 + MCInstLowering.lower(BundledInst, MCBundleInst);
1924 + OutStreamer.EmitInstruction(MCBundleInst);
1929 + MCInstLowering.lower(MI, TmpInst);
1930 + OutStreamer.EmitInstruction(TmpInst);
1933 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUMCInstLower.h llvm-r600/lib/Target/R600/AMDGPUMCInstLower.h
1934 --- llvm-3.2.src/lib/Target/R600/AMDGPUMCInstLower.h 1970-01-01 01:00:00.000000000 +0100
1935 +++ llvm-r600/lib/Target/R600/AMDGPUMCInstLower.h 2013-01-25 19:43:57.430049721 +0100
1937 +//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===//
1939 +// The LLVM Compiler Infrastructure
1941 +// This file is distributed under the University of Illinois Open Source
1942 +// License. See LICENSE.TXT for details.
1945 +//===----------------------------------------------------------------------===//
1947 +#ifndef AMDGPU_MCINSTLOWER_H
1948 +#define AMDGPU_MCINSTLOWER_H
1954 +class MachineInstr;
1956 +class AMDGPUMCInstLower {
1961 + AMDGPUMCInstLower(MCContext &ctx);
1963 + /// \brief Lower a MachineInstr to an MCInst
1964 + void lower(const MachineInstr *MI, MCInst &OutMI) const;
1968 +} // End namespace llvm
1970 +#endif //AMDGPU_MCINSTLOWER_H
1971 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.cpp llvm-r600/lib/Target/R600/AMDGPURegisterInfo.cpp
1972 --- llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.cpp 1970-01-01 01:00:00.000000000 +0100
1973 +++ llvm-r600/lib/Target/R600/AMDGPURegisterInfo.cpp 2013-01-25 19:43:57.430049721 +0100
1975 +//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===//
1977 +// The LLVM Compiler Infrastructure
1979 +// This file is distributed under the University of Illinois Open Source
1980 +// License. See LICENSE.TXT for details.
1982 +//===----------------------------------------------------------------------===//
1985 +/// \brief Parent TargetRegisterInfo class common to all hw codegen targets.
1987 +//===----------------------------------------------------------------------===//
1989 +#include "AMDGPURegisterInfo.h"
1990 +#include "AMDGPUTargetMachine.h"
1992 +using namespace llvm;
1994 +AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm,
1995 + const TargetInstrInfo &tii)
1996 +: AMDGPUGenRegisterInfo(0),
2001 +//===----------------------------------------------------------------------===//
2002 +// Function handling callbacks - Functions are a seldom used feature of GPUs, so
2003 +// they are not supported at this time.
2004 +//===----------------------------------------------------------------------===//
2006 +const uint16_t AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister;
2008 +const uint16_t* AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
2010 + return &CalleeSavedReg;
2013 +void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
2015 + RegScavenger *RS) const {
2016 + assert(!"Subroutines not supported yet");
2019 +unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
2020 + assert(!"Subroutines not supported yet");
2024 +#define GET_REGINFO_TARGET_DESC
2025 +#include "AMDGPUGenRegisterInfo.inc"
2026 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.h llvm-r600/lib/Target/R600/AMDGPURegisterInfo.h
2027 --- llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.h 1970-01-01 01:00:00.000000000 +0100
2028 +++ llvm-r600/lib/Target/R600/AMDGPURegisterInfo.h 2013-01-25 19:43:57.430049721 +0100
2030 +//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===//
2032 +// The LLVM Compiler Infrastructure
2034 +// This file is distributed under the University of Illinois Open Source
2035 +// License. See LICENSE.TXT for details.
2037 +//===----------------------------------------------------------------------===//
2040 +/// \brief TargetRegisterInfo interface that is implemented by all hw codegen
2043 +//===----------------------------------------------------------------------===//
2045 +#ifndef AMDGPUREGISTERINFO_H
2046 +#define AMDGPUREGISTERINFO_H
2048 +#include "llvm/ADT/BitVector.h"
2049 +#include "llvm/Target/TargetRegisterInfo.h"
2051 +#define GET_REGINFO_HEADER
2052 +#define GET_REGINFO_ENUM
2053 +#include "AMDGPUGenRegisterInfo.inc"
2057 +class AMDGPUTargetMachine;
2058 +class TargetInstrInfo;
2060 +struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
2061 + TargetMachine &TM;
2062 + const TargetInstrInfo &TII;
2063 + static const uint16_t CalleeSavedReg;
2065 + AMDGPURegisterInfo(TargetMachine &tm, const TargetInstrInfo &tii);
2067 + virtual BitVector getReservedRegs(const MachineFunction &MF) const {
2068 + assert(!"Unimplemented"); return BitVector();
2071 + /// \param RC is an AMDIL reg class.
2073 + /// \returns The ISA reg class that is equivalent to \p RC.
2074 + virtual const TargetRegisterClass * getISARegClass(
2075 + const TargetRegisterClass * RC) const {
2076 + assert(!"Unimplemented"); return NULL;
2079 + virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const {
2080 + assert(!"Unimplemented"); return NULL;
2083 + const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const;
2084 + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
2085 + RegScavenger *RS) const;
2086 + unsigned getFrameRegister(const MachineFunction &MF) const;
2090 +} // End namespace llvm
2092 +#endif // AMDGPUREGISTERINFO_H
2093 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.td llvm-r600/lib/Target/R600/AMDGPURegisterInfo.td
2094 --- llvm-3.2.src/lib/Target/R600/AMDGPURegisterInfo.td 1970-01-01 01:00:00.000000000 +0100
2095 +++ llvm-r600/lib/Target/R600/AMDGPURegisterInfo.td 2013-01-25 19:43:57.433383055 +0100
2097 +//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===//
2099 +// The LLVM Compiler Infrastructure
2101 +// This file is distributed under the University of Illinois Open Source
2102 +// License. See LICENSE.TXT for details.
2104 +//===----------------------------------------------------------------------===//
2106 +// Tablegen register definitions common to all hw codegen targets.
2108 +//===----------------------------------------------------------------------===//
2110 +let Namespace = "AMDGPU" in {
2111 + def sel_x : SubRegIndex;
2112 + def sel_y : SubRegIndex;
2113 + def sel_z : SubRegIndex;
2114 + def sel_w : SubRegIndex;
2117 +include "R600RegisterInfo.td"
2118 +include "SIRegisterInfo.td"
2119 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUStructurizeCFG.cpp llvm-r600/lib/Target/R600/AMDGPUStructurizeCFG.cpp
2120 --- llvm-3.2.src/lib/Target/R600/AMDGPUStructurizeCFG.cpp 1970-01-01 01:00:00.000000000 +0100
2121 +++ llvm-r600/lib/Target/R600/AMDGPUStructurizeCFG.cpp 2013-01-25 19:43:57.433383055 +0100
2123 +//===-- AMDGPUStructurizeCFG.cpp - Structurize the CFG ------------------===//
2125 +// The LLVM Compiler Infrastructure
2127 +// This file is distributed under the University of Illinois Open Source
2128 +// License. See LICENSE.TXT for details.
2130 +//===----------------------------------------------------------------------===//
2133 +/// The pass implemented in this file transforms the programs control flow
2134 +/// graph into a form that's suitable for code generation on hardware that
2135 +/// implements control flow by execution masking. This currently includes all
2136 +/// AMD GPUs but may as well be useful for other types of hardware.
2138 +//===----------------------------------------------------------------------===//
2140 +#include "AMDGPU.h"
2141 +#include "llvm/Module.h"
2142 +#include "llvm/ADT/SCCIterator.h"
2143 +#include "llvm/Analysis/RegionIterator.h"
2144 +#include "llvm/Analysis/RegionInfo.h"
2145 +#include "llvm/Analysis/RegionPass.h"
2146 +#include "llvm/Transforms/Utils/SSAUpdater.h"
2148 +using namespace llvm;
2152 +// Definition of the complex types used in this pass.
2154 +typedef std::pair<BasicBlock *, Value *> BBValuePair;
2155 +typedef ArrayRef<BasicBlock*> BBVecRef;
2157 +typedef SmallVector<RegionNode*, 8> RNVector;
2158 +typedef SmallVector<BasicBlock*, 8> BBVector;
2159 +typedef SmallVector<BBValuePair, 2> BBValueVector;
2161 +typedef DenseMap<PHINode *, BBValueVector> PhiMap;
2162 +typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap;
2163 +typedef DenseMap<BasicBlock *, Value *> BBPredicates;
2164 +typedef DenseMap<BasicBlock *, BBPredicates> PredMap;
2165 +typedef DenseMap<BasicBlock *, unsigned> VisitedMap;
2167 +// The name for newly created blocks.
2169 +static const char *FlowBlockName = "Flow";
2171 +/// @brief Transforms the control flow graph on one single entry/exit region
2174 +/// After the transform all "If"/"Then"/"Else" style control flow looks like
2186 +/// | | 1 = "If" block, calculates the condition
2187 +/// 4 | 2 = "Then" subregion, runs if the condition is true
2188 +/// | / 3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow
2189 +/// |/ 4 = "Else" optional subregion, runs if the condition is false
2190 +/// 5 5 = "End" block, also rejoins the control flow
2193 +/// Control flow is expressed as a branch where the true exit goes into the
2194 +/// "Then"/"Else" region, while the false exit skips the region
2195 +/// The condition for the optional "Else" region is expressed as a PHI node.
2196 +/// The incoming values of the PHI node are true for the "If" edge and false
2197 +/// for the "Then" edge.
2199 +/// Additionally to that even complicated loops look like this:
2206 +/// | / 1 = "Entry" block
2207 +/// |/ 2 = "Loop" optional subregion, with all exits at "Flow" block
2208 +/// 3 3 = "Flow" block, with back edge to entry block
2212 +/// The back edge of the "Flow" block is always on the false side of the branch
2213 +/// while the true side continues the general flow. So the loop condition
2214 +/// consists of a network of PHI nodes where the true incoming values express
2215 +/// breaks and the false values express continue states.
2216 +class AMDGPUStructurizeCFG : public RegionPass {
2221 + ConstantInt *BoolTrue;
2222 + ConstantInt *BoolFalse;
2223 + UndefValue *BoolUndef;
2226 + Region *ParentRegion;
2228 + DominatorTree *DT;
2231 + VisitedMap Visited;
2232 + PredMap Predicates;
2233 + BBPhiMap DeletedPhis;
2234 + BBVector FlowsInserted;
2236 + BasicBlock *LoopStart;
2237 + BasicBlock *LoopEnd;
2238 + BBPredicates LoopPred;
2240 + void orderNodes();
2242 + void buildPredicate(BranchInst *Term, unsigned Idx,
2243 + BBPredicates &Pred, bool Invert);
2245 + void analyzeBlock(BasicBlock *BB);
2247 + void analyzeLoop(BasicBlock *BB, unsigned &LoopIdx);
2249 + void collectInfos();
2251 + bool dominatesPredicates(BasicBlock *A, BasicBlock *B);
2253 + void killTerminator(BasicBlock *BB);
2255 + RegionNode *skipChained(RegionNode *Node);
2257 + void delPhiValues(BasicBlock *From, BasicBlock *To);
2259 + void addPhiValues(BasicBlock *From, BasicBlock *To);
2261 + BasicBlock *getNextFlow(BasicBlock *Prev);
2263 + bool isPredictableTrue(BasicBlock *Prev, BasicBlock *Node);
2265 + BasicBlock *wireFlowBlock(BasicBlock *Prev, RegionNode *Node);
2267 + void createFlow();
2269 + void insertConditions();
2271 + void rebuildSSA();
2274 + AMDGPUStructurizeCFG():
2277 + initializeRegionInfoPass(*PassRegistry::getPassRegistry());
2280 + virtual bool doInitialization(Region *R, RGPassManager &RGM);
2282 + virtual bool runOnRegion(Region *R, RGPassManager &RGM);
2284 + virtual const char *getPassName() const {
2285 + return "AMDGPU simplify control flow";
2288 + void getAnalysisUsage(AnalysisUsage &AU) const {
2290 + AU.addRequired<DominatorTree>();
2291 + AU.addPreserved<DominatorTree>();
2292 + RegionPass::getAnalysisUsage(AU);
2297 +} // end anonymous namespace
2299 +char AMDGPUStructurizeCFG::ID = 0;
2301 +/// \brief Initialize the types and constants used in the pass
2302 +bool AMDGPUStructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
2303 + LLVMContext &Context = R->getEntry()->getContext();
2305 + Boolean = Type::getInt1Ty(Context);
2306 + BoolTrue = ConstantInt::getTrue(Context);
2307 + BoolFalse = ConstantInt::getFalse(Context);
2308 + BoolUndef = UndefValue::get(Boolean);
2313 +/// \brief Build up the general order of nodes
2314 +void AMDGPUStructurizeCFG::orderNodes() {
2315 + scc_iterator<Region *> I = scc_begin(ParentRegion),
2316 + E = scc_end(ParentRegion);
2317 + for (Order.clear(); I != E; ++I) {
2318 + std::vector<RegionNode *> &Nodes = *I;
2319 + Order.append(Nodes.begin(), Nodes.end());
2323 +/// \brief Build blocks and loop predicates
2324 +void AMDGPUStructurizeCFG::buildPredicate(BranchInst *Term, unsigned Idx,
2325 + BBPredicates &Pred, bool Invert) {
2326 + Value *True = Invert ? BoolFalse : BoolTrue;
2327 + Value *False = Invert ? BoolTrue : BoolFalse;
2329 + RegionInfo *RI = ParentRegion->getRegionInfo();
2330 + BasicBlock *BB = Term->getParent();
2332 + // Handle the case where multiple regions start at the same block
2333 + Region *R = BB != ParentRegion->getEntry() ?
2334 + RI->getRegionFor(BB) : ParentRegion;
2336 + if (R == ParentRegion) {
2337 + // It's a top level block in our region
2338 + Value *Cond = True;
2339 + if (Term->isConditional()) {
2340 + BasicBlock *Other = Term->getSuccessor(!Idx);
2342 + if (Visited.count(Other)) {
2343 + if (!Pred.count(Other))
2344 + Pred[Other] = False;
2346 + if (!Pred.count(BB))
2350 + Cond = Term->getCondition();
2352 + if (Idx != Invert)
2353 + Cond = BinaryOperator::CreateNot(Cond, "", Term);
2358 + } else if (ParentRegion->contains(R)) {
2359 + // It's a block in a sub region
2360 + while(R->getParent() != ParentRegion)
2361 + R = R->getParent();
2363 + Pred[R->getEntry()] = True;
2366 + // It's a branch from outside into our parent region
2371 +/// \brief Analyze the successors of each block and build up predicates
2372 +void AMDGPUStructurizeCFG::analyzeBlock(BasicBlock *BB) {
2373 + pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
2374 + BBPredicates &Pred = Predicates[BB];
2376 + for (; PI != PE; ++PI) {
2377 + BranchInst *Term = cast<BranchInst>((*PI)->getTerminator());
2379 + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
2380 + BasicBlock *Succ = Term->getSuccessor(i);
2383 + buildPredicate(Term, i, Pred, false);
2388 +/// \brief Analyze the conditions leading back to a previous block (loop back edges)
2389 +void AMDGPUStructurizeCFG::analyzeLoop(BasicBlock *BB, unsigned &LoopIdx) {
2390 + BranchInst *Term = cast<BranchInst>(BB->getTerminator());
2392 + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
2393 + BasicBlock *Succ = Term->getSuccessor(i);
2395 + // Ignore it if it's not a back edge
2396 + if (!Visited.count(Succ))
2399 + buildPredicate(Term, i, LoopPred, true);
2402 + if (Visited[Succ] < LoopIdx) {
2403 + LoopIdx = Visited[Succ];
2409 +/// \brief Collect various loop and predicate infos
2410 +void AMDGPUStructurizeCFG::collectInfos() {
2411 + unsigned Number = 0, LoopIdx = ~0;
2413 + // Reset predicate
2414 + Predicates.clear();
2417 + LoopStart = LoopEnd = 0;
2420 + RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend();
2421 + for (Visited.clear(); OI != OE; Visited[(*OI++)->getEntry()] = ++Number) {
2423 + // Analyze all the conditions leading to a node
2424 + analyzeBlock((*OI)->getEntry());
2426 + if ((*OI)->isSubRegion())
2429 + // Find the first/last loop nodes and loop predicates
2430 + analyzeLoop((*OI)->getNodeAs<BasicBlock>(), LoopIdx);
2434 +/// \brief Does A dominate all the predicates of B ?
2435 +bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *A, BasicBlock *B) {
2436 + BBPredicates &Preds = Predicates[B];
2437 + for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end();
2440 + if (!DT->dominates(A, PI->first))
2446 +/// \brief Remove phi values from all successors and then remove the terminator.
2447 +void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) {
2448 + TerminatorInst *Term = BB->getTerminator();
2452 + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
2455 + delPhiValues(BB, *SI);
2458 + Term->eraseFromParent();
2461 +/// First: Skip forward to the first region node that either isn't a subregion or not
2462 +/// dominating its exit, remove all the skipped nodes from the node order.
2464 +/// Second: Handle the first successor directly if the resulting nodes successor
2465 +/// predicates are still dominated by the original entry
2466 +RegionNode *AMDGPUStructurizeCFG::skipChained(RegionNode *Node) {
2467 + BasicBlock *Entry = Node->getEntry();
2469 + // Skip forward as long as it is just a linear flow
2471 + BasicBlock *Entry = Node->getEntry();
2474 + if (Node->isSubRegion()) {
2475 + Exit = Node->getNodeAs<Region>()->getExit();
2477 + TerminatorInst *Term = Entry->getTerminator();
2478 + if (Term->getNumSuccessors() != 1)
2480 + Exit = Term->getSuccessor(0);
2483 + // It's a back edge, break here so we can insert a loop node
2484 + if (!Visited.count(Exit))
2487 +  // More than one edge is pointing to exit
2488 + if (!DT->dominates(Entry, Exit))
2491 + RegionNode *Next = ParentRegion->getNode(Exit);
2492 + RNVector::iterator I = std::find(Order.begin(), Order.end(), Next);
2493 + assert(I != Order.end());
2495 + Visited.erase(Next->getEntry());
2500 + BasicBlock *BB = Node->getEntry();
2501 + TerminatorInst *Term = BB->getTerminator();
2502 + if (Term->getNumSuccessors() != 2)
2505 +  // Our node has exactly two successors, check if we can handle
2506 + // any of them directly
2507 + BasicBlock *Succ = Term->getSuccessor(0);
2508 + if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ)) {
2509 + Succ = Term->getSuccessor(1);
2510 + if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ))
2513 + BasicBlock *Succ2 = Term->getSuccessor(1);
2514 + if (Visited.count(Succ2) && Visited[Succ] > Visited[Succ2] &&
2515 + dominatesPredicates(Entry, Succ2))
2519 + RegionNode *Next = ParentRegion->getNode(Succ);
2520 + RNVector::iterator E = Order.end();
2521 + RNVector::iterator I = std::find(Order.begin(), E, Next);
2524 + killTerminator(BB);
2525 + FlowsInserted.push_back(BB);
2526 + Visited.erase(Succ);
2528 + return ParentRegion->getNode(wireFlowBlock(BB, Next));
2531 +/// \brief Remove all PHI values coming from "From" into "To" and remember
2532 +/// them in DeletedPhis
2533 +void AMDGPUStructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
2534 + PhiMap &Map = DeletedPhis[To];
2535 + for (BasicBlock::iterator I = To->begin(), E = To->end();
2536 + I != E && isa<PHINode>(*I);) {
2538 + PHINode &Phi = cast<PHINode>(*I++);
2539 + while (Phi.getBasicBlockIndex(From) != -1) {
2540 + Value *Deleted = Phi.removeIncomingValue(From, false);
2541 + Map[&Phi].push_back(std::make_pair(From, Deleted));
2546 +/// \brief Add the PHI values back once we know the new predecessor
2547 +void AMDGPUStructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
2548 + if (!DeletedPhis.count(To))
2551 + PhiMap &Map = DeletedPhis[To];
2552 + SSAUpdater Updater;
2554 + for (PhiMap::iterator I = Map.begin(), E = Map.end(); I != E; ++I) {
2556 + PHINode *Phi = I->first;
2557 + Updater.Initialize(Phi->getType(), "");
2558 + BasicBlock *Fallback = To;
2559 + bool HaveFallback = false;
2561 + for (BBValueVector::iterator VI = I->second.begin(), VE = I->second.end();
2564 + Updater.AddAvailableValue(VI->first, VI->second);
2565 + BasicBlock *Dom = DT->findNearestCommonDominator(Fallback, VI->first);
2566 + if (Dom == VI->first)
2567 + HaveFallback = true;
2568 + else if (Dom != Fallback)
2569 + HaveFallback = false;
2572 + if (!HaveFallback) {
2573 + Value *Undef = UndefValue::get(Phi->getType());
2574 + Updater.AddAvailableValue(Fallback, Undef);
2577 + Phi->addIncoming(Updater.GetValueAtEndOfBlock(From), From);
2579 + DeletedPhis.erase(To);
2582 +/// \brief Create a new flow node and update dominator tree and region info
2583 +BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Prev) {
2584 + LLVMContext &Context = Func->getContext();
2585 + BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
2586 + Order.back()->getEntry();
2587 + BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
2589 + DT->addNewBlock(Flow, Prev);
2590 + ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion);
2591 + FlowsInserted.push_back(Flow);
2595 +/// \brief Can we predict that this node will always be called?
2596 +bool AMDGPUStructurizeCFG::isPredictableTrue(BasicBlock *Prev,
2597 + BasicBlock *Node) {
2598 + BBPredicates &Preds = Predicates[Node];
2599 + bool Dominated = false;
2601 + for (BBPredicates::iterator I = Preds.begin(), E = Preds.end();
2604 + if (I->second != BoolTrue)
2607 + if (!Dominated && DT->dominates(I->first, Prev))
2613 +/// \brief Wire up the new control flow by inserting or updating the branch
2614 +/// instructions at node exits
2615 +BasicBlock *AMDGPUStructurizeCFG::wireFlowBlock(BasicBlock *Prev,
2616 + RegionNode *Node) {
2617 + BasicBlock *Entry = Node->getEntry();
2619 + if (LoopStart == Entry) {
2621 + LoopPred[Prev] = BoolTrue;
2624 +  // Wire it up temporarily, skipChained may recurse into us
2625 + BranchInst::Create(Entry, Prev);
2626 + DT->changeImmediateDominator(Entry, Prev);
2627 + addPhiValues(Prev, Entry);
2629 + Node = skipChained(Node);
2631 + BasicBlock *Next = getNextFlow(Prev);
2632 + if (!isPredictableTrue(Prev, Entry)) {
2633 + // Let Prev point to entry and next block
2634 + Prev->getTerminator()->eraseFromParent();
2635 + BranchInst::Create(Entry, Next, BoolUndef, Prev);
2637 + DT->changeImmediateDominator(Next, Entry);
2640 + // Let node exit(s) point to next block
2641 + if (Node->isSubRegion()) {
2642 + Region *SubRegion = Node->getNodeAs<Region>();
2643 + BasicBlock *Exit = SubRegion->getExit();
2645 + // Find all the edges from the sub region to the exit
2647 + for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) {
2648 + if (SubRegion->contains(*I))
2649 + ToDo.push_back(*I);
2652 + // Modify the edges to point to the new flow block
2653 + for (BBVector::iterator I = ToDo.begin(), E = ToDo.end(); I != E; ++I) {
2654 + delPhiValues(*I, Exit);
2655 + TerminatorInst *Term = (*I)->getTerminator();
2656 + Term->replaceUsesOfWith(Exit, Next);
2659 + // Update the region info
2660 + SubRegion->replaceExit(Next);
2663 + BasicBlock *BB = Node->getNodeAs<BasicBlock>();
2664 + killTerminator(BB);
2665 + BranchInst::Create(Next, BB);
2667 + if (BB == LoopEnd)
2674 +/// Destroy node order and visited map, build up flow order instead.
2675 +/// After this function control flow looks like it should be, but
2676 +/// branches only have undefined conditions.
2677 +void AMDGPUStructurizeCFG::createFlow() {
2678 + DeletedPhis.clear();
2680 + BasicBlock *Prev = Order.pop_back_val()->getEntry();
2681 + assert(Prev == ParentRegion->getEntry() && "Incorrect node order!");
2682 + Visited.erase(Prev);
2684 + if (LoopStart == Prev) {
2685 + // Loop starts at entry, split entry so that we can predicate it
2686 + BasicBlock::iterator Insert = Prev->getFirstInsertionPt();
2687 + BasicBlock *Split = Prev->splitBasicBlock(Insert, FlowBlockName);
2688 + DT->addNewBlock(Split, Prev);
2689 + ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion);
2690 + Predicates[Split] = Predicates[Prev];
2691 + Order.push_back(ParentRegion->getBBNode(Split));
2692 + LoopPred[Prev] = BoolTrue;
2694 + } else if (LoopStart == Order.back()->getEntry()) {
2695 + // Loop starts behind entry, split entry so that we can jump to it
2696 + Instruction *Term = Prev->getTerminator();
2697 + BasicBlock *Split = Prev->splitBasicBlock(Term, FlowBlockName);
2698 + DT->addNewBlock(Split, Prev);
2699 + ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion);
2703 + killTerminator(Prev);
2704 + FlowsInserted.clear();
2705 + FlowsInserted.push_back(Prev);
2707 + while (!Order.empty()) {
2708 + RegionNode *Node = Order.pop_back_val();
2709 + Visited.erase(Node->getEntry());
2710 + Prev = wireFlowBlock(Prev, Node);
2711 + if (LoopStart && !LoopEnd) {
2712 + // Create an extra loop end node
2714 + Prev = getNextFlow(LoopEnd);
2715 + BranchInst::Create(Prev, LoopStart, BoolUndef, LoopEnd);
2716 + addPhiValues(LoopEnd, LoopStart);
2720 + BasicBlock *Exit = ParentRegion->getExit();
2721 + BranchInst::Create(Exit, Prev);
2722 + addPhiValues(Prev, Exit);
2723 + if (DT->dominates(ParentRegion->getEntry(), Exit))
2724 + DT->changeImmediateDominator(Exit, Prev);
2726 + if (LoopStart && LoopEnd) {
2727 + BBVector::iterator FI = std::find(FlowsInserted.begin(),
2728 + FlowsInserted.end(),
2730 + for (; *FI != LoopEnd; ++FI) {
2731 + addPhiValues(*FI, (*FI)->getTerminator()->getSuccessor(0));
2735 + assert(Order.empty());
2736 + assert(Visited.empty());
2737 + assert(DeletedPhis.empty());
2740 +/// \brief Insert the missing branch conditions
2741 +void AMDGPUStructurizeCFG::insertConditions() {
2742 + SSAUpdater PhiInserter;
2744 + for (BBVector::iterator FI = FlowsInserted.begin(), FE = FlowsInserted.end();
2747 + BranchInst *Term = cast<BranchInst>((*FI)->getTerminator());
2748 + if (Term->isUnconditional())
2751 + PhiInserter.Initialize(Boolean, "");
2752 + PhiInserter.AddAvailableValue(&Func->getEntryBlock(), BoolFalse);
2754 + BasicBlock *Succ = Term->getSuccessor(0);
2755 + BBPredicates &Preds = (*FI == LoopEnd) ? LoopPred : Predicates[Succ];
2756 + for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end();
2759 + PhiInserter.AddAvailableValue(PI->first, PI->second);
2762 + Term->setCondition(PhiInserter.GetValueAtEndOfBlock(*FI));
2766 +/// Handle a rare case where the disintegrated nodes instructions
2767 +/// no longer dominate all their uses. Not sure if this is really necessary
2768 +void AMDGPUStructurizeCFG::rebuildSSA() {
2769 + SSAUpdater Updater;
2770 + for (Region::block_iterator I = ParentRegion->block_begin(),
2771 + E = ParentRegion->block_end();
2774 + BasicBlock *BB = *I;
2775 + for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
2778 + bool Initialized = false;
2779 + for (Use *I = &II->use_begin().getUse(), *Next; I; I = Next) {
2781 + Next = I->getNext();
2783 + Instruction *User = cast<Instruction>(I->getUser());
2784 + if (User->getParent() == BB) {
2787 + } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
2788 + if (UserPN->getIncomingBlock(*I) == BB)
2792 + if (DT->dominates(II, User))
2795 + if (!Initialized) {
2796 + Value *Undef = UndefValue::get(II->getType());
2797 + Updater.Initialize(II->getType(), "");
2798 + Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
2799 + Updater.AddAvailableValue(BB, II);
2800 + Initialized = true;
2802 + Updater.RewriteUseAfterInsertions(*I);
2808 +/// \brief Run the transformation for each region found
2809 +bool AMDGPUStructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
2810 + if (R->isTopLevelRegion())
2813 + Func = R->getEntry()->getParent();
2816 + DT = &getAnalysis<DominatorTree>();
2821 + insertConditions();
2826 + Predicates.clear();
2827 + DeletedPhis.clear();
2828 + FlowsInserted.clear();
2833 +/// \brief Create the pass
2834 +Pass *llvm::createAMDGPUStructurizeCFGPass() {
2835 + return new AMDGPUStructurizeCFG();
2837 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUSubtarget.cpp llvm-r600/lib/Target/R600/AMDGPUSubtarget.cpp
2838 --- llvm-3.2.src/lib/Target/R600/AMDGPUSubtarget.cpp 1970-01-01 01:00:00.000000000 +0100
2839 +++ llvm-r600/lib/Target/R600/AMDGPUSubtarget.cpp 2013-01-25 19:43:57.433383055 +0100
2841 +//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2843 +// The LLVM Compiler Infrastructure
2845 +// This file is distributed under the University of Illinois Open Source
2846 +// License. See LICENSE.TXT for details.
2848 +//===----------------------------------------------------------------------===//
2851 +/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
2853 +//===----------------------------------------------------------------------===//
2855 +#include "AMDGPUSubtarget.h"
2857 +using namespace llvm;
2859 +#define GET_SUBTARGETINFO_ENUM
2860 +#define GET_SUBTARGETINFO_TARGET_DESC
2861 +#define GET_SUBTARGETINFO_CTOR
2862 +#include "AMDGPUGenSubtargetInfo.inc"
2864 +AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
2865 + AMDGPUGenSubtargetInfo(TT, CPU, FS), DumpCode(false) {
2866 + InstrItins = getInstrItineraryForCPU(CPU);
2868 + memset(CapsOverride, 0, sizeof(*CapsOverride)
2869 + * AMDGPUDeviceInfo::MaxNumberCapabilities);
2871 + StringRef GPU = CPU;
2873 + DefaultSize[0] = 64;
2874 + DefaultSize[1] = 1;
2875 + DefaultSize[2] = 1;
2876 + ParseSubtargetFeatures(GPU, FS);
2878 + Device = AMDGPUDeviceInfo::getDeviceFromName(DevName, this, Is64bit);
2881 +AMDGPUSubtarget::~AMDGPUSubtarget() {
2886 +AMDGPUSubtarget::isOverride(AMDGPUDeviceInfo::Caps caps) const {
2887 + assert(caps < AMDGPUDeviceInfo::MaxNumberCapabilities &&
2888 + "Caps index is out of bounds!");
2889 + return CapsOverride[caps];
2892 +AMDGPUSubtarget::is64bit() const {
2896 +AMDGPUSubtarget::isTargetELF() const {
2900 +AMDGPUSubtarget::getDefaultSize(uint32_t dim) const {
2904 + return DefaultSize[dim];
2909 +AMDGPUSubtarget::getDataLayout() const {
2911 + return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16"
2912 + "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
2913 + "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
2914 + "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
2915 + "-v512:512:512-v1024:1024:1024-v2048:2048:2048-a0:0:64");
2917 + return Device->getDataLayout();
2921 +AMDGPUSubtarget::getDeviceName() const {
2924 +const AMDGPUDevice *
2925 +AMDGPUSubtarget::device() const {
2928 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUSubtarget.h llvm-r600/lib/Target/R600/AMDGPUSubtarget.h
2929 --- llvm-3.2.src/lib/Target/R600/AMDGPUSubtarget.h 1970-01-01 01:00:00.000000000 +0100
2930 +++ llvm-r600/lib/Target/R600/AMDGPUSubtarget.h 2013-01-25 19:43:57.433383055 +0100
2932 +//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====//
2934 +// The LLVM Compiler Infrastructure
2936 +// This file is distributed under the University of Illinois Open Source
2937 +// License. See LICENSE.TXT for details.
2939 +//==-----------------------------------------------------------------------===//
2942 +/// \brief AMDGPU specific subclass of TargetSubtarget.
2944 +//===----------------------------------------------------------------------===//
2946 +#ifndef AMDGPUSUBTARGET_H
2947 +#define AMDGPUSUBTARGET_H
2948 +#include "AMDILDevice.h"
2949 +#include "llvm/ADT/StringExtras.h"
2950 +#include "llvm/ADT/StringRef.h"
2951 +#include "llvm/Target/TargetSubtargetInfo.h"
2953 +#define GET_SUBTARGETINFO_HEADER
2954 +#include "AMDGPUGenSubtargetInfo.inc"
2956 +#define MAX_CB_SIZE (1 << 16)
2960 +class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
2962 + bool CapsOverride[AMDGPUDeviceInfo::MaxNumberCapabilities];
2963 + const AMDGPUDevice *Device;
2964 + size_t DefaultSize[3];
2965 + std::string DevName;
2971 + InstrItineraryData InstrItins;
2974 + AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS);
2975 + virtual ~AMDGPUSubtarget();
2977 + const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
2978 + virtual void ParseSubtargetFeatures(llvm::StringRef CPU, llvm::StringRef FS);
2980 + bool isOverride(AMDGPUDeviceInfo::Caps) const;
2981 + bool is64bit() const;
2983 + // Helper functions to simplify if statements
2984 + bool isTargetELF() const;
2985 + const AMDGPUDevice* device() const;
2986 + std::string getDataLayout() const;
2987 + std::string getDeviceName() const;
2988 + virtual size_t getDefaultSize(uint32_t dim) const;
2989 + bool dumpCode() const { return DumpCode; }
2990 + bool r600ALUEncoding() const { return R600ALUInst; }
2994 +} // End namespace llvm
2996 +#endif // AMDGPUSUBTARGET_H
2997 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUTargetMachine.cpp llvm-r600/lib/Target/R600/AMDGPUTargetMachine.cpp
2998 --- llvm-3.2.src/lib/Target/R600/AMDGPUTargetMachine.cpp 1970-01-01 01:00:00.000000000 +0100
2999 +++ llvm-r600/lib/Target/R600/AMDGPUTargetMachine.cpp 2013-01-25 19:43:57.433383055 +0100
3001 +//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
3003 +// The LLVM Compiler Infrastructure
3005 +// This file is distributed under the University of Illinois Open Source
3006 +// License. See LICENSE.TXT for details.
3008 +//===----------------------------------------------------------------------===//
3011 +/// \brief The AMDGPU target machine contains all of the hardware specific
3012 +/// information needed to emit code for R600 and SI GPUs.
3014 +//===----------------------------------------------------------------------===//
3016 +#include "AMDGPUTargetMachine.h"
3017 +#include "AMDGPU.h"
3018 +#include "R600ISelLowering.h"
3019 +#include "R600InstrInfo.h"
3020 +#include "SIISelLowering.h"
3021 +#include "SIInstrInfo.h"
3022 +#include "llvm/Analysis/Passes.h"
3023 +#include "llvm/Analysis/Verifier.h"
3024 +#include "llvm/CodeGen/MachineFunctionAnalysis.h"
3025 +#include "llvm/CodeGen/MachineModuleInfo.h"
3026 +#include "llvm/CodeGen/Passes.h"
3027 +#include "llvm/MC/MCAsmInfo.h"
3028 +#include "llvm/PassManager.h"
3029 +#include "llvm/Support/TargetRegistry.h"
3030 +#include "llvm/Support/raw_os_ostream.h"
3031 +#include "llvm/Transforms/IPO.h"
3032 +#include "llvm/Transforms/Scalar.h"
3033 +#include <llvm/CodeGen/Passes.h>
3035 +using namespace llvm;
3037 +extern "C" void LLVMInitializeR600Target() {
3038 + // Register the target
3039 + RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget);
3042 +AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
3043 + StringRef CPU, StringRef FS,
3044 + TargetOptions Options,
3045 + Reloc::Model RM, CodeModel::Model CM,
3046 + CodeGenOpt::Level OptLevel
3049 + LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
3050 + Subtarget(TT, CPU, FS),
3051 + Layout(Subtarget.getDataLayout()),
3052 + FrameLowering(TargetFrameLowering::StackGrowsUp,
3053 + Subtarget.device()->getStackAlignment(), 0),
3054 + IntrinsicInfo(this),
3055 + InstrItins(&Subtarget.getInstrItineraryData()) {
3056 + // TLInfo uses InstrInfo so it must be initialized after.
3057 + if (Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
3058 + InstrInfo = new R600InstrInfo(*this);
3059 + TLInfo = new R600TargetLowering(*this);
3061 + InstrInfo = new SIInstrInfo(*this);
3062 + TLInfo = new SITargetLowering(*this);
3066 +AMDGPUTargetMachine::~AMDGPUTargetMachine() {
3070 +class AMDGPUPassConfig : public TargetPassConfig {
3072 + AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM)
3073 + : TargetPassConfig(TM, PM) {}
3075 + AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
3076 + return getTM<AMDGPUTargetMachine>();
3079 + virtual bool addPreISel();
3080 + virtual bool addInstSelector();
3081 + virtual bool addPreRegAlloc();
3082 + virtual bool addPostRegAlloc();
3083 + virtual bool addPreSched2();
3084 + virtual bool addPreEmitPass();
3086 +} // End of anonymous namespace
3088 +TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) {
3089 + return new AMDGPUPassConfig(this, PM);
3093 +AMDGPUPassConfig::addPreISel() {
3094 + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
3095 + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
3096 + addPass(createAMDGPUStructurizeCFGPass());
3097 + addPass(createSIAnnotateControlFlowPass());
3102 +bool AMDGPUPassConfig::addInstSelector() {
3103 + addPass(createAMDGPUPeepholeOpt(*TM));
3104 + addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
3108 +bool AMDGPUPassConfig::addPreRegAlloc() {
3109 + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
3111 + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
3112 + addPass(createSIAssignInterpRegsPass(*TM));
3114 + addPass(createAMDGPUConvertToISAPass(*TM));
3118 +bool AMDGPUPassConfig::addPostRegAlloc() {
3119 + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
3121 + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
3122 + addPass(createSIInsertWaits(*TM));
3127 +bool AMDGPUPassConfig::addPreSched2() {
3129 + addPass(&IfConverterID);
3133 +bool AMDGPUPassConfig::addPreEmitPass() {
3134 + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
3135 + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
3136 + addPass(createAMDGPUCFGPreparationPass(*TM));
3137 + addPass(createAMDGPUCFGStructurizerPass(*TM));
3138 + addPass(createR600ExpandSpecialInstrsPass(*TM));
3139 + addPass(createR600LowerConstCopy(*TM));
3140 + addPass(&FinalizeMachineBundlesID);
3142 + addPass(createSILowerLiteralConstantsPass(*TM));
3143 + addPass(createSILowerControlFlowPass(*TM));
3149 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPUTargetMachine.h llvm-r600/lib/Target/R600/AMDGPUTargetMachine.h
3150 --- llvm-3.2.src/lib/Target/R600/AMDGPUTargetMachine.h 1970-01-01 01:00:00.000000000 +0100
3151 +++ llvm-r600/lib/Target/R600/AMDGPUTargetMachine.h 2013-01-25 19:43:57.433383055 +0100
3153 +//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===//
3155 +// The LLVM Compiler Infrastructure
3157 +// This file is distributed under the University of Illinois Open Source
3158 +// License. See LICENSE.TXT for details.
3160 +//===----------------------------------------------------------------------===//
3163 +/// \brief The AMDGPU TargetMachine interface definition for hw codegen targets.
3165 +//===----------------------------------------------------------------------===//
3167 +#ifndef AMDGPU_TARGET_MACHINE_H
3168 +#define AMDGPU_TARGET_MACHINE_H
3170 +#include "AMDGPUInstrInfo.h"
3171 +#include "AMDGPUSubtarget.h"
3172 +#include "AMDILFrameLowering.h"
3173 +#include "AMDILIntrinsicInfo.h"
3174 +#include "R600ISelLowering.h"
3175 +#include "llvm/ADT/OwningPtr.h"
3176 +#include "llvm/DataLayout.h"
3180 +MCAsmInfo* createMCAsmInfo(const Target &T, StringRef TT);
3182 +class AMDGPUTargetMachine : public LLVMTargetMachine {
3184 + AMDGPUSubtarget Subtarget;
3185 + const DataLayout Layout;
3186 + AMDGPUFrameLowering FrameLowering;
3187 + AMDGPUIntrinsicInfo IntrinsicInfo;
3188 + const AMDGPUInstrInfo * InstrInfo;
3189 + AMDGPUTargetLowering * TLInfo;
3190 + const InstrItineraryData* InstrItins;
3193 + AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS,
3195 + TargetOptions Options,
3196 + Reloc::Model RM, CodeModel::Model CM,
3197 + CodeGenOpt::Level OL);
3198 + ~AMDGPUTargetMachine();
3199 + virtual const AMDGPUFrameLowering* getFrameLowering() const {
3200 + return &FrameLowering;
3202 + virtual const AMDGPUIntrinsicInfo* getIntrinsicInfo() const {
3203 + return &IntrinsicInfo;
3205 + virtual const AMDGPUInstrInfo *getInstrInfo() const {return InstrInfo;}
3206 + virtual const AMDGPUSubtarget *getSubtargetImpl() const {return &Subtarget; }
3207 + virtual const AMDGPURegisterInfo *getRegisterInfo() const {
3208 + return &InstrInfo->getRegisterInfo();
3210 + virtual AMDGPUTargetLowering * getTargetLowering() const {
3213 + virtual const InstrItineraryData* getInstrItineraryData() const {
3214 + return InstrItins;
3216 + virtual const DataLayout* getDataLayout() const { return &Layout; }
3217 + virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
3220 +} // End namespace llvm
3222 +#endif // AMDGPU_TARGET_MACHINE_H
3223 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDGPU.td llvm-r600/lib/Target/R600/AMDGPU.td
3224 --- llvm-3.2.src/lib/Target/R600/AMDGPU.td 1970-01-01 01:00:00.000000000 +0100
3225 +++ llvm-r600/lib/Target/R600/AMDGPU.td 2013-01-25 19:43:57.423383055 +0100
3227 +//===-- AMDIL.td - AMDIL Tablegen files --*- tablegen -*-------------------===//
3229 +// The LLVM Compiler Infrastructure
3231 +// This file is distributed under the University of Illinois Open Source
3232 +// License. See LICENSE.TXT for details.
3234 +//==-----------------------------------------------------------------------===//
3236 +// Include AMDIL TD files
3237 +include "AMDILBase.td"
3240 +def AMDGPUInstrInfo : InstrInfo {
3241 + let guessInstructionProperties = 1;
3244 +//===----------------------------------------------------------------------===//
3245 +// Declare the target which we are implementing
3246 +//===----------------------------------------------------------------------===//
3247 +def AMDGPUAsmWriter : AsmWriter {
3248 + string AsmWriterClassName = "InstPrinter";
3250 + bit isMCAsmWriter = 1;
3253 +def AMDGPU : Target {
3254 + // Pull in Instruction Info:
3255 + let InstructionSet = AMDGPUInstrInfo;
3256 + let AssemblyWriters = [AMDGPUAsmWriter];
3259 +// Include AMDGPU TD files
3260 +include "R600Schedule.td"
3261 +include "SISchedule.td"
3262 +include "Processors.td"
3263 +include "AMDGPUInstrInfo.td"
3264 +include "AMDGPUIntrinsics.td"
3265 +include "AMDGPURegisterInfo.td"
3266 +include "AMDGPUInstructions.td"
3267 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDIL7XXDevice.cpp llvm-r600/lib/Target/R600/AMDIL7XXDevice.cpp
3268 --- llvm-3.2.src/lib/Target/R600/AMDIL7XXDevice.cpp 1970-01-01 01:00:00.000000000 +0100
3269 +++ llvm-r600/lib/Target/R600/AMDIL7XXDevice.cpp 2013-01-25 19:43:57.433383055 +0100
3271 +//===-- AMDIL7XXDevice.cpp - Device Info for 7XX GPUs ---------------------===//
3273 +// The LLVM Compiler Infrastructure
3275 +// This file is distributed under the University of Illinois Open Source
3276 +// License. See LICENSE.TXT for details.
3279 +//==-----------------------------------------------------------------------===//
3280 +#include "AMDIL7XXDevice.h"
3281 +#include "AMDGPUSubtarget.h"
3282 +#include "AMDILDevice.h"
3284 +using namespace llvm;
3286 +AMDGPU7XXDevice::AMDGPU7XXDevice(AMDGPUSubtarget *ST) : AMDGPUDevice(ST) {
3288 + std::string name = mSTM->getDeviceName();
3289 + if (name == "rv710") {
3290 + DeviceFlag = OCL_DEVICE_RV710;
3291 + } else if (name == "rv730") {
3292 + DeviceFlag = OCL_DEVICE_RV730;
3294 + DeviceFlag = OCL_DEVICE_RV770;
3298 +AMDGPU7XXDevice::~AMDGPU7XXDevice() {
3301 +void AMDGPU7XXDevice::setCaps() {
3302 + mSWBits.set(AMDGPUDeviceInfo::LocalMem);
3305 +size_t AMDGPU7XXDevice::getMaxLDSSize() const {
3306 + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
3307 + return MAX_LDS_SIZE_700;
3312 +size_t AMDGPU7XXDevice::getWavefrontSize() const {
3313 + return AMDGPUDevice::HalfWavefrontSize;
3316 +uint32_t AMDGPU7XXDevice::getGeneration() const {
3317 + return AMDGPUDeviceInfo::HD4XXX;
3320 +uint32_t AMDGPU7XXDevice::getResourceID(uint32_t DeviceID) const {
3321 + switch (DeviceID) {
3323 + assert(0 && "ID type passed in is unknown!");
3328 + case ARENA_UAV_ID:
3331 + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
3332 + return DEFAULT_LDS_ID;
3336 + if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) {
3337 + return DEFAULT_SCRATCH_ID;
3341 + assert(0 && "GDS UAV ID is not supported on this chip");
3342 + if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
3343 + return DEFAULT_GDS_ID;
3351 +uint32_t AMDGPU7XXDevice::getMaxNumUAVs() const {
3355 +AMDGPU770Device::AMDGPU770Device(AMDGPUSubtarget *ST): AMDGPU7XXDevice(ST) {
3359 +AMDGPU770Device::~AMDGPU770Device() {
3362 +void AMDGPU770Device::setCaps() {
3363 + if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
3364 + mSWBits.set(AMDGPUDeviceInfo::FMA);
3365 + mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
3367 + mSWBits.set(AMDGPUDeviceInfo::BarrierDetect);
3368 + mHWBits.reset(AMDGPUDeviceInfo::LongOps);
3369 + mSWBits.set(AMDGPUDeviceInfo::LongOps);
3370 + mSWBits.set(AMDGPUDeviceInfo::LocalMem);
3373 +size_t AMDGPU770Device::getWavefrontSize() const {
3374 + return AMDGPUDevice::WavefrontSize;
3377 +AMDGPU710Device::AMDGPU710Device(AMDGPUSubtarget *ST) : AMDGPU7XXDevice(ST) {
3380 +AMDGPU710Device::~AMDGPU710Device() {
3383 +size_t AMDGPU710Device::getWavefrontSize() const {
3384 + return AMDGPUDevice::QuarterWavefrontSize;
3386 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDIL7XXDevice.h llvm-r600/lib/Target/R600/AMDIL7XXDevice.h
3387 --- llvm-3.2.src/lib/Target/R600/AMDIL7XXDevice.h 1970-01-01 01:00:00.000000000 +0100
3388 +++ llvm-r600/lib/Target/R600/AMDIL7XXDevice.h 2013-01-25 19:43:57.436716388 +0100
3390 +//==-- AMDIL7XXDevice.h - Define 7XX Device for AMDIL ----------*- C++ -*--===//
3392 +// The LLVM Compiler Infrastructure
3394 +// This file is distributed under the University of Illinois Open Source
3395 +// License. See LICENSE.TXT for details.
3397 +//==-----------------------------------------------------------------------===//
3399 +/// \brief Interface for the subtarget data classes.
3401 +/// This file will define the interface that each generation needs to
3402 +/// implement in order to correctly answer queries on the capabilities of the
3403 +/// specific hardware.
3404 +//===----------------------------------------------------------------------===//
3405 +#ifndef AMDIL7XXDEVICEIMPL_H
3406 +#define AMDIL7XXDEVICEIMPL_H
3407 +#include "AMDILDevice.h"
3410 +class AMDGPUSubtarget;
3412 +//===----------------------------------------------------------------------===//
3413 +// 7XX generation of devices and their respective sub classes
3414 +//===----------------------------------------------------------------------===//
3416 +/// \brief The AMDGPU7XXDevice class represents the generic 7XX device.
3418 +/// All 7XX devices are derived from this class. The AMDGPU7XX device will only
3419 +/// support the minimal features that are required to be considered OpenCL 1.0
3420 +/// compliant and nothing more.
3421 +class AMDGPU7XXDevice : public AMDGPUDevice {
3423 + AMDGPU7XXDevice(AMDGPUSubtarget *ST);
3424 + virtual ~AMDGPU7XXDevice();
3425 + virtual size_t getMaxLDSSize() const;
3426 + virtual size_t getWavefrontSize() const;
3427 + virtual uint32_t getGeneration() const;
3428 + virtual uint32_t getResourceID(uint32_t DeviceID) const;
3429 + virtual uint32_t getMaxNumUAVs() const;
3432 + virtual void setCaps();
3435 +/// \brief The AMDGPU770Device class represents the RV770 chip and it's
3436 +/// derivative cards.
3438 +/// The difference between this device and the base class is this device
3439 +/// adds support for double precision and has a larger wavefront size.
3440 +class AMDGPU770Device : public AMDGPU7XXDevice {
3442 + AMDGPU770Device(AMDGPUSubtarget *ST);
3443 + virtual ~AMDGPU770Device();
3444 + virtual size_t getWavefrontSize() const;
3446 + virtual void setCaps();
3449 +/// \brief The AMDGPU710Device class derives from the 7XX base class.
3451 +/// This class is a smaller derivative, so we need to overload some of the
3452 +/// functions in order to correctly specify this information.
3453 +class AMDGPU710Device : public AMDGPU7XXDevice {
3455 + AMDGPU710Device(AMDGPUSubtarget *ST);
3456 + virtual ~AMDGPU710Device();
3457 + virtual size_t getWavefrontSize() const;
3460 +} // namespace llvm
3461 +#endif // AMDIL7XXDEVICEIMPL_H
3462 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILBase.td llvm-r600/lib/Target/R600/AMDILBase.td
3463 --- llvm-3.2.src/lib/Target/R600/AMDILBase.td 1970-01-01 01:00:00.000000000 +0100
3464 +++ llvm-r600/lib/Target/R600/AMDILBase.td 2013-01-25 19:43:57.436716388 +0100
3466 +//===- AMDIL.td - AMDIL Target Machine -------------*- tablegen -*-===//
3468 +// The LLVM Compiler Infrastructure
3470 +// This file is distributed under the University of Illinois Open Source
3471 +// License. See LICENSE.TXT for details.
3473 +//===----------------------------------------------------------------------===//
3474 +// Target-independent interfaces which we are implementing
3475 +//===----------------------------------------------------------------------===//
3477 +include "llvm/Target/Target.td"
3479 +// Dummy Instruction itineraries for pseudo instructions
3480 +def ALU_NULL : FuncUnit;
3481 +def NullALU : InstrItinClass;
3483 +//===----------------------------------------------------------------------===//
3484 +// AMDIL Subtarget features.
3485 +//===----------------------------------------------------------------------===//
3486 +def FeatureFP64 : SubtargetFeature<"fp64",
3487 + "CapsOverride[AMDGPUDeviceInfo::DoubleOps]",
3489 + "Enable 64bit double precision operations">;
3490 +def FeatureByteAddress : SubtargetFeature<"byte_addressable_store",
3491 + "CapsOverride[AMDGPUDeviceInfo::ByteStores]",
3493 + "Enable byte addressable stores">;
3494 +def FeatureBarrierDetect : SubtargetFeature<"barrier_detect",
3495 + "CapsOverride[AMDGPUDeviceInfo::BarrierDetect]",
3497 + "Enable duplicate barrier detection(HD5XXX or later).">;
3498 +def FeatureImages : SubtargetFeature<"images",
3499 + "CapsOverride[AMDGPUDeviceInfo::Images]",
3501 + "Enable image functions">;
3502 +def FeatureMultiUAV : SubtargetFeature<"multi_uav",
3503 + "CapsOverride[AMDGPUDeviceInfo::MultiUAV]",
3505 + "Generate multiple UAV code(HD5XXX family or later)">;
3506 +def FeatureMacroDB : SubtargetFeature<"macrodb",
3507 + "CapsOverride[AMDGPUDeviceInfo::MacroDB]",
3509 + "Use internal macrodb, instead of macrodb in driver">;
3510 +def FeatureNoAlias : SubtargetFeature<"noalias",
3511 + "CapsOverride[AMDGPUDeviceInfo::NoAlias]",
3513 + "assert that all kernel argument pointers are not aliased">;
3514 +def FeatureNoInline : SubtargetFeature<"no-inline",
3515 + "CapsOverride[AMDGPUDeviceInfo::NoInline]",
3517 + "specify whether to not inline functions">;
3519 +def Feature64BitPtr : SubtargetFeature<"64BitPtr",
3522 + "Specify if 64bit addressing should be used.">;
3524 +def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr",
3527 + "Specify if 64bit sized pointers with 32bit addressing should be used.">;
3528 +def FeatureDebug : SubtargetFeature<"debug",
3529 + "CapsOverride[AMDGPUDeviceInfo::Debug]",
3531 + "Debug mode is enabled, so disable hardware accelerated address spaces.">;
3532 +def FeatureDumpCode : SubtargetFeature <"DumpCode",
3535 + "Dump MachineInstrs in the CodeEmitter">;
3537 +def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
3540 + "Older version of ALU instructions encoding.">;
3543 +//===----------------------------------------------------------------------===//
3544 +// Register File, Calling Conv, Instruction Descriptions
3545 +//===----------------------------------------------------------------------===//
3548 +include "AMDILRegisterInfo.td"
3549 +include "AMDILInstrInfo.td"
3551 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILCFGStructurizer.cpp llvm-r600/lib/Target/R600/AMDILCFGStructurizer.cpp
3552 --- llvm-3.2.src/lib/Target/R600/AMDILCFGStructurizer.cpp 1970-01-01 01:00:00.000000000 +0100
3553 +++ llvm-r600/lib/Target/R600/AMDILCFGStructurizer.cpp 2013-01-25 19:43:57.436716388 +0100
3555 +//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===//
3557 +// The LLVM Compiler Infrastructure
3559 +// This file is distributed under the University of Illinois Open Source
3560 +// License. See LICENSE.TXT for details.
3563 +//==-----------------------------------------------------------------------===//
3566 +#define DEBUG_TYPE "structcfg"
3568 +#include "AMDGPUInstrInfo.h"
3570 +#include "llvm/ADT/SCCIterator.h"
3571 +#include "llvm/ADT/SmallVector.h"
3572 +#include "llvm/ADT/Statistic.h"
3573 +#include "llvm/Analysis/DominatorInternals.h"
3574 +#include "llvm/Analysis/Dominators.h"
3575 +#include "llvm/CodeGen/MachinePostDominators.h"
3576 +#include "llvm/CodeGen/MachineDominators.h"
3577 +#include "llvm/CodeGen/MachineFunction.h"
3578 +#include "llvm/CodeGen/MachineFunctionAnalysis.h"
3579 +#include "llvm/CodeGen/MachineFunctionPass.h"
3580 +#include "llvm/CodeGen/MachineInstrBuilder.h"
3581 +#include "llvm/CodeGen/MachineJumpTableInfo.h"
3582 +#include "llvm/CodeGen/MachineLoopInfo.h"
3583 +#include "llvm/CodeGen/MachineRegisterInfo.h"
3584 +#include "llvm/Target/TargetInstrInfo.h"
3586 +using namespace llvm;
3588 +// TODO: move-begin.
3590 +//===----------------------------------------------------------------------===//
3592 +// Statistics for CFGStructurizer.
3594 +//===----------------------------------------------------------------------===//
3596 +STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern "
3598 +STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern "
3600 +STATISTIC(numLoopbreakPatternMatch, "CFGStructurizer number of loop-break "
3601 + "pattern matched");
3602 +STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue "
3603 + "pattern matched");
3604 +STATISTIC(numLoopPatternMatch, "CFGStructurizer number of loop pattern "
3606 +STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks");
3607 +STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions");
3609 +//===----------------------------------------------------------------------===//
3611 +// Miscellaneous utility for CFGStructurizer.
3613 +//===----------------------------------------------------------------------===//
3614 +namespace llvmCFGStruct {
3615 +#define SHOWNEWINSTR(i) \
3616 + if (DEBUGME) errs() << "New instr: " << *i << "\n"
3618 +#define SHOWNEWBLK(b, msg) \
3620 + errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
3624 +#define SHOWBLK_DETAIL(b, msg) \
3627 + errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
3628 + b->print(errs()); \
3633 +#define INVALIDSCCNUM -1
3634 +#define INVALIDREGNUM 0
3636 +template<class LoopinfoT>
3637 +void PrintLoopinfo(const LoopinfoT &LoopInfo, llvm::raw_ostream &OS) {
3638 + for (typename LoopinfoT::iterator iter = LoopInfo.begin(),
3639 + iterEnd = LoopInfo.end();
3640 + iter != iterEnd; ++iter) {
3641 + (*iter)->print(OS, 0);
3645 +template<class NodeT>
3646 +void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
3647 + size_t sz = Src.size();
3648 + for (size_t i = 0; i < sz/2; ++i) {
3649 + NodeT *t = Src[i];
3650 + Src[i] = Src[sz - i - 1];
3651 + Src[sz - i - 1] = t;
3655 +} //end namespace llvmCFGStruct
3657 +//===----------------------------------------------------------------------===//
3659 +// supporting data structure for CFGStructurizer
3661 +//===----------------------------------------------------------------------===//
3663 +namespace llvmCFGStruct {
3664 +template<class PassT>
3665 +struct CFGStructTraits {
3668 +template <class InstrT>
3669 +class BlockInformation {
3673 + //SmallVector<InstrT*, DEFAULT_VEC_SLOTS> succInstr;
3674 + //Instructions defining the corresponding successor.
3675 + BlockInformation() : isRetired(false), sccNum(INVALIDSCCNUM) {}
3678 +template <class BlockT, class InstrT, class RegiT>
3679 +class LandInformation {
3682 + std::set<RegiT> breakInitRegs; //Registers that need to "reg = 0", before
3683 + //WHILELOOP(thisloop) init before entering
3685 + std::set<RegiT> contInitRegs; //Registers that need to "reg = 0", after
3686 + //WHILELOOP(thisloop) init after entering
3688 + std::set<RegiT> endbranchInitRegs; //Init before entering this loop, at loop
3689 + //land block, branch cond on this reg.
3690 + std::set<RegiT> breakOnRegs; //registers that need to "if (reg) break
3691 + //endif" after ENDLOOP(thisloop) break
3692 + //outerLoopOf(thisLoop).
3693 + std::set<RegiT> contOnRegs; //registers that need to "if (reg) continue
3694 + //endif" after ENDLOOP(thisloop) continue on
3695 + //outerLoopOf(thisLoop).
3696 + LandInformation() : landBlk(NULL) {}
3699 +} //end of namespace llvmCFGStruct
3701 +//===----------------------------------------------------------------------===//
3705 +//===----------------------------------------------------------------------===//
3707 +namespace llvmCFGStruct {
3708 +// bixia TODO: port it to BasicBlock, not just MachineBasicBlock.
3709 +template<class PassT>
3710 +class CFGStructurizer {
3713 + Not_SinglePath = 0,
3714 + SinglePath_InPath = 1,
3715 + SinglePath_NotInPath = 2
3719 + typedef typename PassT::InstructionType InstrT;
3720 + typedef typename PassT::FunctionType FuncT;
3721 + typedef typename PassT::DominatortreeType DomTreeT;
3722 + typedef typename PassT::PostDominatortreeType PostDomTreeT;
3723 + typedef typename PassT::DomTreeNodeType DomTreeNodeT;
3724 + typedef typename PassT::LoopinfoType LoopInfoT;
3726 + typedef GraphTraits<FuncT *> FuncGTraits;
3727 + //typedef FuncGTraits::nodes_iterator BlockIterator;
3728 + typedef typename FuncT::iterator BlockIterator;
3730 + typedef typename FuncGTraits::NodeType BlockT;
3731 + typedef GraphTraits<BlockT *> BlockGTraits;
3732 + typedef GraphTraits<Inverse<BlockT *> > InvBlockGTraits;
3733 + //typedef BlockGTraits::succ_iterator InstructionIterator;
3734 + typedef typename BlockT::iterator InstrIterator;
3736 + typedef CFGStructTraits<PassT> CFGTraits;
3737 + typedef BlockInformation<InstrT> BlockInfo;
3738 + typedef std::map<BlockT *, BlockInfo *> BlockInfoMap;
3740 + typedef int RegiT;
3741 + typedef typename PassT::LoopType LoopT;
3742 + typedef LandInformation<BlockT, InstrT, RegiT> LoopLandInfo;
3743 + typedef std::map<LoopT *, LoopLandInfo *> LoopLandInfoMap;
3744 + //landing info for loop break
3745 + typedef SmallVector<BlockT *, 32> BlockTSmallerVector;
3748 + CFGStructurizer();
3749 + ~CFGStructurizer();
3751 + /// Perform the CFG structurization
3752 + bool run(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri);
3754 + /// Perform the CFG preparation
3755 + bool prepare(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri);
3758 + void reversePredicateSetter(typename BlockT::iterator);
3759 + void orderBlocks();
3760 + void printOrderedBlocks(llvm::raw_ostream &OS);
3761 + int patternMatch(BlockT *CurBlock);
3762 + int patternMatchGroup(BlockT *CurBlock);
3764 + int serialPatternMatch(BlockT *CurBlock);
3765 + int ifPatternMatch(BlockT *CurBlock);
3766 + int switchPatternMatch(BlockT *CurBlock);
3767 + int loopendPatternMatch(BlockT *CurBlock);
3768 + int loopPatternMatch(BlockT *CurBlock);
3770 + int loopbreakPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
3771 + int loopcontPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
3772 + //int loopWithoutBreak(BlockT *);
3774 + void handleLoopbreak (BlockT *ExitingBlock, LoopT *ExitingLoop,
3775 + BlockT *ExitBlock, LoopT *exitLoop, BlockT *landBlock);
3776 + void handleLoopcontBlock(BlockT *ContingBlock, LoopT *contingLoop,
3777 + BlockT *ContBlock, LoopT *contLoop);
3778 + bool isSameloopDetachedContbreak(BlockT *Src1Block, BlockT *Src2Block);
3779 + int handleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
3780 + BlockT *FalseBlock);
3781 + int handleJumpintoIfImp(BlockT *HeadBlock, BlockT *TrueBlock,
3782 + BlockT *FalseBlock);
3783 + int improveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
3784 + BlockT *FalseBlock, BlockT **LandBlockPtr);
3785 + void showImproveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
3786 + BlockT *FalseBlock, BlockT *LandBlock,
3787 + bool Detail = false);
3788 + PathToKind singlePathTo(BlockT *SrcBlock, BlockT *DstBlock,
3789 + bool AllowSideEntry = true);
3790 + BlockT *singlePathEnd(BlockT *srcBlock, BlockT *DstBlock,
3791 + bool AllowSideEntry = true);
3792 + int cloneOnSideEntryTo(BlockT *PreBlock, BlockT *SrcBlock, BlockT *DstBlock);
3793 + void mergeSerialBlock(BlockT *DstBlock, BlockT *srcBlock);
3795 + void mergeIfthenelseBlock(InstrT *BranchInstr, BlockT *CurBlock,
3796 + BlockT *TrueBlock, BlockT *FalseBlock,
3797 + BlockT *LandBlock);
3798 + void mergeLooplandBlock(BlockT *DstBlock, LoopLandInfo *LoopLand);
3799 + void mergeLoopbreakBlock(BlockT *ExitingBlock, BlockT *ExitBlock,
3800 + BlockT *ExitLandBlock, RegiT SetReg);
3801 + void settleLoopcontBlock(BlockT *ContingBlock, BlockT *ContBlock,
3803 + BlockT *relocateLoopcontBlock(LoopT *ParentLoopRep, LoopT *LoopRep,
3804 + std::set<BlockT*> &ExitBlockSet,
3805 + BlockT *ExitLandBlk);
3806 + BlockT *addLoopEndbranchBlock(LoopT *LoopRep,
3807 + BlockTSmallerVector &ExitingBlocks,
3808 + BlockTSmallerVector &ExitBlocks);
3809 + BlockT *normalizeInfiniteLoopExit(LoopT *LoopRep);
3810 + void removeUnconditionalBranch(BlockT *SrcBlock);
3811 + void removeRedundantConditionalBranch(BlockT *SrcBlock);
3812 + void addDummyExitBlock(SmallVector<BlockT *, DEFAULT_VEC_SLOTS> &RetBlocks);
3814 + void removeSuccessor(BlockT *SrcBlock);
3815 + BlockT *cloneBlockForPredecessor(BlockT *CurBlock, BlockT *PredBlock);
3816 + BlockT *exitingBlock2ExitBlock (LoopT *LoopRep, BlockT *exitingBlock);
3818 + void migrateInstruction(BlockT *SrcBlock, BlockT *DstBlock,
3819 + InstrIterator InsertPos);
3821 + void recordSccnum(BlockT *SrcBlock, int SCCNum);
3822 + int getSCCNum(BlockT *srcBlk);
3824 + void retireBlock(BlockT *DstBlock, BlockT *SrcBlock);
3825 + bool isRetiredBlock(BlockT *SrcBlock);
3826 + bool isActiveLoophead(BlockT *CurBlock);
3827 + bool needMigrateBlock(BlockT *Block);
3829 + BlockT *recordLoopLandBlock(LoopT *LoopRep, BlockT *LandBlock,
3830 + BlockTSmallerVector &exitBlocks,
3831 + std::set<BlockT*> &ExitBlockSet);
3832 + void setLoopLandBlock(LoopT *LoopRep, BlockT *Block = NULL);
3833 + BlockT *getLoopLandBlock(LoopT *LoopRep);
3834 + LoopLandInfo *getLoopLandInfo(LoopT *LoopRep);
3836 + void addLoopBreakOnReg(LoopT *LoopRep, RegiT RegNum);
3837 + void addLoopContOnReg(LoopT *LoopRep, RegiT RegNum);
3838 + void addLoopBreakInitReg(LoopT *LoopRep, RegiT RegNum);
3839 + void addLoopContInitReg(LoopT *LoopRep, RegiT RegNum);
3840 + void addLoopEndbranchInitReg(LoopT *LoopRep, RegiT RegNum);
3842 + bool hasBackEdge(BlockT *curBlock);
3843 + unsigned getLoopDepth (LoopT *LoopRep);
3844 + int countActiveBlock(
3845 + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterStart,
3846 + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterEnd);
3847 + BlockT *findNearestCommonPostDom(std::set<BlockT *>&);
3848 + BlockT *findNearestCommonPostDom(BlockT *Block1, BlockT *Block2);
3851 + DomTreeT *domTree;
3852 + PostDomTreeT *postDomTree;
3853 + LoopInfoT *loopInfo;
3857 + BlockInfoMap blockInfoMap;
3858 + LoopLandInfoMap loopLandInfoMap;
3859 + SmallVector<BlockT *, DEFAULT_VEC_SLOTS> orderedBlks;
3860 + const AMDGPURegisterInfo *TRI;
3862 +}; //template class CFGStructurizer
3864 +template<class PassT> CFGStructurizer<PassT>::CFGStructurizer()
3865 + : domTree(NULL), postDomTree(NULL), loopInfo(NULL) {
3868 +template<class PassT> CFGStructurizer<PassT>::~CFGStructurizer() {
3869 + for (typename BlockInfoMap::iterator I = blockInfoMap.begin(),
3870 + E = blockInfoMap.end(); I != E; ++I) {
3875 +template<class PassT>
3876 +bool CFGStructurizer<PassT>::prepare(FuncT &func, PassT &pass,
3877 + const AMDGPURegisterInfo * tri) {
3882 + bool changed = false;
3884 + //FIXME: if not reducible flow graph, make it so ???
3887 + errs() << "AMDGPUCFGStructurizer::prepare\n";
3890 + loopInfo = CFGTraits::getLoopInfo(pass);
3892 + errs() << "LoopInfo:\n";
3893 + PrintLoopinfo(*loopInfo, errs());
3898 + errs() << "Ordered blocks:\n";
3899 + printOrderedBlocks(errs());
3902 + SmallVector<BlockT *, DEFAULT_VEC_SLOTS> retBlks;
3904 + for (typename LoopInfoT::iterator iter = loopInfo->begin(),
3905 + iterEnd = loopInfo->end();
3906 + iter != iterEnd; ++iter) {
3907 + LoopT* loopRep = (*iter);
3908 + BlockTSmallerVector exitingBlks;
3909 + loopRep->getExitingBlocks(exitingBlks);
3911 + if (exitingBlks.size() == 0) {
3912 + BlockT* dummyExitBlk = normalizeInfiniteLoopExit(loopRep);
3913 + if (dummyExitBlk != NULL)
3914 + retBlks.push_back(dummyExitBlk);
3918 + // Remove unconditional branch instr.
3919 + // Add dummy exit block iff there are multiple returns.
3921 + for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
3922 + iterBlk = orderedBlks.begin(), iterEndBlk = orderedBlks.end();
3923 + iterBlk != iterEndBlk;
3925 + BlockT *curBlk = *iterBlk;
3926 + removeUnconditionalBranch(curBlk);
3927 + removeRedundantConditionalBranch(curBlk);
3928 + if (CFGTraits::isReturnBlock(curBlk)) {
3929 + retBlks.push_back(curBlk);
3931 + assert(curBlk->succ_size() <= 2);
3934 + if (retBlks.size() >= 2) {
3935 + addDummyExitBlock(retBlks);
3940 +} //CFGStructurizer::prepare
3942 +template<class PassT>
3943 +bool CFGStructurizer<PassT>::run(FuncT &func, PassT &pass,
3944 + const AMDGPURegisterInfo * tri) {
3949 + //Assume reducible CFG...
3951 + errs() << "AMDGPUCFGStructurizer::run\n";
3955 + domTree = CFGTraits::getDominatorTree(pass);
3957 + domTree->print(errs(), (const llvm::Module*)0);
3960 + postDomTree = CFGTraits::getPostDominatorTree(pass);
3962 + postDomTree->print(errs());
3965 + loopInfo = CFGTraits::getLoopInfo(pass);
3967 + errs() << "LoopInfo:\n";
3968 + PrintLoopinfo(*loopInfo, errs());
3973 + //Use the worse block ordering to test the algorithm.
3974 + ReverseVector(orderedBlks);
3978 + errs() << "Ordered blocks:\n";
3979 + printOrderedBlocks(errs());
3982 + bool finish = false;
3984 + bool makeProgress = false;
3985 + int numRemainedBlk = countActiveBlock(orderedBlks.begin(),
3986 + orderedBlks.end());
3991 + errs() << "numIter = " << numIter
3992 + << ", numRemaintedBlk = " << numRemainedBlk << "\n";
3995 + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
3996 + iterBlk = orderedBlks.begin();
3997 + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
3998 + iterBlkEnd = orderedBlks.end();
4000 + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
4001 + sccBeginIter = iterBlk;
4002 + BlockT *sccBeginBlk = NULL;
4003 + int sccNumBlk = 0; // The number of active blocks, init to a
4004 + // maximum possible number.
4005 + int sccNumIter; // Number of iteration in this SCC.
4007 + while (iterBlk != iterBlkEnd) {
4008 + curBlk = *iterBlk;
4010 + if (sccBeginBlk == NULL) {
4011 + sccBeginIter = iterBlk;
4012 + sccBeginBlk = curBlk;
4014 + sccNumBlk = numRemainedBlk; // Init to maximum possible number.
4016 + errs() << "start processing SCC" << getSCCNum(sccBeginBlk);
4021 + if (!isRetiredBlock(curBlk)) {
4022 + patternMatch(curBlk);
4027 + bool contNextScc = true;
4028 + if (iterBlk == iterBlkEnd
4029 + || getSCCNum(sccBeginBlk) != getSCCNum(*iterBlk)) {
4030 + // Just finish one scc.
4032 + int sccRemainedNumBlk = countActiveBlock(sccBeginIter, iterBlk);
4033 + if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= sccNumBlk) {
4035 + errs() << "Can't reduce SCC " << getSCCNum(curBlk)
4036 + << ", sccNumIter = " << sccNumIter;
4037 + errs() << "doesn't make any progress\n";
4039 + contNextScc = true;
4040 + } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < sccNumBlk) {
4041 + sccNumBlk = sccRemainedNumBlk;
4042 + iterBlk = sccBeginIter;
4043 + contNextScc = false;
4045 + errs() << "repeat processing SCC" << getSCCNum(curBlk)
4046 + << "sccNumIter = " << sccNumIter << "\n";
4050 + // Finish the current scc.
4051 + contNextScc = true;
4054 + // Continue on next component in the current scc.
4055 + contNextScc = false;
4058 + if (contNextScc) {
4059 + sccBeginBlk = NULL;
4061 + } //while, "one iteration" over the function.
4063 + BlockT *entryBlk = FuncGTraits::nodes_begin(&func);
4064 + if (entryBlk->succ_size() == 0) {
4067 + errs() << "Reduce to one block\n";
4070 + int newnumRemainedBlk
4071 + = countActiveBlock(orderedBlks.begin(), orderedBlks.end());
4072 + // consider cloned blocks ??
4073 + if (newnumRemainedBlk == 1 || newnumRemainedBlk < numRemainedBlk) {
4074 + makeProgress = true;
4075 + numRemainedBlk = newnumRemainedBlk;
4077 + makeProgress = false;
4079 + errs() << "No progress\n";
4083 + } while (!finish && makeProgress);
4085 + // Misc wrap up to maintain the consistency of the Function representation.
4086 + CFGTraits::wrapup(FuncGTraits::nodes_begin(&func));
4088 + // Detach retired Block, release memory.
4089 + for (typename BlockInfoMap::iterator iterMap = blockInfoMap.begin(),
4090 + iterEndMap = blockInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
4091 + if ((*iterMap).second && (*iterMap).second->isRetired) {
4092 + assert(((*iterMap).first)->getNumber() != -1);
4094 + errs() << "Erase BB" << ((*iterMap).first)->getNumber() << "\n";
4096 + (*iterMap).first->eraseFromParent(); //Remove from the parent Function.
4098 + delete (*iterMap).second;
4100 + blockInfoMap.clear();
4102 + // clear loopLandInfoMap
4103 + for (typename LoopLandInfoMap::iterator iterMap = loopLandInfoMap.begin(),
4104 + iterEndMap = loopLandInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
4105 + delete (*iterMap).second;
4107 + loopLandInfoMap.clear();
4114 + assert(!"IRREDUCIBL_CF");
4118 +} //CFGStructurizer::run
4120 +/// Print the ordered Blocks.
4122 +template<class PassT>
4123 +void CFGStructurizer<PassT>::printOrderedBlocks(llvm::raw_ostream &os) {
4125 + for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
4126 + iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end();
4127 + iterBlk != iterBlkEnd;
4129 + os << "BB" << (*iterBlk)->getNumber();
4130 + os << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
4131 + if (i != 0 && i % 10 == 0) {
4137 +} //printOrderedBlocks
4139 +/// Compute the reversed DFS post order of Blocks
4141 +template<class PassT> void CFGStructurizer<PassT>::orderBlocks() {
4144 + for (scc_iterator<FuncT *> sccIter = scc_begin(funcRep),
4145 + sccEnd = scc_end(funcRep); sccIter != sccEnd; ++sccIter, ++sccNum) {
4146 + std::vector<BlockT *> &sccNext = *sccIter;
4147 + for (typename std::vector<BlockT *>::const_iterator
4148 + blockIter = sccNext.begin(), blockEnd = sccNext.end();
4149 + blockIter != blockEnd; ++blockIter) {
4151 + orderedBlks.push_back(bb);
4152 + recordSccnum(bb, sccNum);
4156 + //walk through all the block in func to check for unreachable
4157 + for (BlockIterator blockIter1 = FuncGTraits::nodes_begin(funcRep),
4158 + blockEnd1 = FuncGTraits::nodes_end(funcRep);
4159 + blockIter1 != blockEnd1; ++blockIter1) {
4160 + BlockT *bb = &(*blockIter1);
4161 + sccNum = getSCCNum(bb);
4162 + if (sccNum == INVALIDSCCNUM) {
4163 + errs() << "unreachable block BB" << bb->getNumber() << "\n";
4168 +template<class PassT> int CFGStructurizer<PassT>::patternMatch(BlockT *curBlk) {
4173 + errs() << "Begin patternMatch BB" << curBlk->getNumber() << "\n";
4176 + while ((curMatch = patternMatchGroup(curBlk)) > 0) {
4177 + numMatch += curMatch;
4181 + errs() << "End patternMatch BB" << curBlk->getNumber()
4182 + << ", numMatch = " << numMatch << "\n";
4188 +template<class PassT>
4189 +int CFGStructurizer<PassT>::patternMatchGroup(BlockT *curBlk) {
4191 + numMatch += serialPatternMatch(curBlk);
4192 + numMatch += ifPatternMatch(curBlk);
4193 + numMatch += loopendPatternMatch(curBlk);
4194 + numMatch += loopPatternMatch(curBlk);
4196 +}//patternMatchGroup
4198 +template<class PassT>
4199 +int CFGStructurizer<PassT>::serialPatternMatch(BlockT *curBlk) {
4200 + if (curBlk->succ_size() != 1) {
4204 + BlockT *childBlk = *curBlk->succ_begin();
4205 + if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) {
4209 + mergeSerialBlock(curBlk, childBlk);
4210 + ++numSerialPatternMatch;
4212 +} //serialPatternMatch
4214 +template<class PassT>
4215 +int CFGStructurizer<PassT>::ifPatternMatch(BlockT *curBlk) {
4217 + if (curBlk->succ_size() != 2) {
4221 + if (hasBackEdge(curBlk)) {
4225 + InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(curBlk);
4226 + if (branchInstr == NULL) {
4230 + assert(CFGTraits::isCondBranch(branchInstr));
4232 + BlockT *trueBlk = CFGTraits::getTrueBranch(branchInstr);
4233 + BlockT *falseBlk = CFGTraits::getFalseBranch(curBlk, branchInstr);
4238 + if (trueBlk->succ_size() == 1 && falseBlk->succ_size() == 1
4239 + && *trueBlk->succ_begin() == *falseBlk->succ_begin()) {
4240 + landBlk = *trueBlk->succ_begin();
4241 + } else if (trueBlk->succ_size() == 0 && falseBlk->succ_size() == 0) {
4243 + } else if (trueBlk->succ_size() == 1 && *trueBlk->succ_begin() == falseBlk) {
4244 + landBlk = falseBlk;
4246 + } else if (falseBlk->succ_size() == 1
4247 + && *falseBlk->succ_begin() == trueBlk) {
4248 + landBlk = trueBlk;
4250 + } else if (falseBlk->succ_size() == 1
4251 + && isSameloopDetachedContbreak(trueBlk, falseBlk)) {
4252 + landBlk = *falseBlk->succ_begin();
4253 + } else if (trueBlk->succ_size() == 1
4254 + && isSameloopDetachedContbreak(falseBlk, trueBlk)) {
4255 + landBlk = *trueBlk->succ_begin();
4257 + return handleJumpintoIf(curBlk, trueBlk, falseBlk);
4260 + // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the
4261 + // new BB created for landBlk==NULL may introduce new challenge to the
4262 + // reduction process.
4263 + if (landBlk != NULL &&
4264 + ((trueBlk && trueBlk->pred_size() > 1)
4265 + || (falseBlk && falseBlk->pred_size() > 1))) {
4266 + cloned += improveSimpleJumpintoIf(curBlk, trueBlk, falseBlk, &landBlk);
4269 + if (trueBlk && trueBlk->pred_size() > 1) {
4270 + trueBlk = cloneBlockForPredecessor(trueBlk, curBlk);
4274 + if (falseBlk && falseBlk->pred_size() > 1) {
4275 + falseBlk = cloneBlockForPredecessor(falseBlk, curBlk);
4279 + mergeIfthenelseBlock(branchInstr, curBlk, trueBlk, falseBlk, landBlk);
4281 + ++numIfPatternMatch;
4283 + numClonedBlock += cloned;
4285 + return 1 + cloned;
4288 +template<class PassT>
4289 +int CFGStructurizer<PassT>::switchPatternMatch(BlockT *curBlk) {
4291 +} //switchPatternMatch
4293 +template<class PassT>
4294 +int CFGStructurizer<PassT>::loopendPatternMatch(BlockT *curBlk) {
4295 + LoopT *loopRep = loopInfo->getLoopFor(curBlk);
4296 + typename std::vector<LoopT *> nestedLoops;
4298 + nestedLoops.push_back(loopRep);
4299 + loopRep = loopRep->getParentLoop();
4302 + if (nestedLoops.size() == 0) {
4306 + // Process nested loop outside->inside, so "continue" to an outside loop won't
4307 + // be mistaken as "break" of the current loop.
4309 + for (typename std::vector<LoopT *>::reverse_iterator
4310 + iter = nestedLoops.rbegin(), iterEnd = nestedLoops.rend();
4311 + iter != iterEnd; ++iter) {
4314 + if (getLoopLandBlock(loopRep) != NULL) {
4318 + BlockT *loopHeader = loopRep->getHeader();
4320 + int numBreak = loopbreakPatternMatch(loopRep, loopHeader);
4322 + if (numBreak == -1) {
4326 + int numCont = loopcontPatternMatch(loopRep, loopHeader);
4327 + num += numBreak + numCont;
4331 +} //loopendPatternMatch
4333 +template<class PassT>
4334 +int CFGStructurizer<PassT>::loopPatternMatch(BlockT *curBlk) {
4335 + if (curBlk->succ_size() != 0) {
4340 + LoopT *loopRep = loopInfo->getLoopFor(curBlk);
4341 + while (loopRep && loopRep->getHeader() == curBlk) {
4342 + LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
4344 + BlockT *landBlk = loopLand->landBlk;
4346 + if (!isRetiredBlock(landBlk)) {
4347 + mergeLooplandBlock(curBlk, loopLand);
4351 + loopRep = loopRep->getParentLoop();
4354 + numLoopPatternMatch += numLoop;
4357 +} //loopPatternMatch
4359 +template<class PassT>
4360 +int CFGStructurizer<PassT>::loopbreakPatternMatch(LoopT *loopRep,
4361 + BlockT *loopHeader) {
4362 + BlockTSmallerVector exitingBlks;
4363 + loopRep->getExitingBlocks(exitingBlks);
4366 + errs() << "Loop has " << exitingBlks.size() << " exiting blocks\n";
4369 + if (exitingBlks.size() == 0) {
4370 + setLoopLandBlock(loopRep);
4374 + // Compute the corresponding exitBlks and exit block set.
4375 + BlockTSmallerVector exitBlks;
4376 + std::set<BlockT *> exitBlkSet;
4377 + for (typename BlockTSmallerVector::const_iterator iter = exitingBlks.begin(),
4378 + iterEnd = exitingBlks.end(); iter != iterEnd; ++iter) {
4379 + BlockT *exitingBlk = *iter;
4380 + BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
4381 + exitBlks.push_back(exitBlk);
4382 + exitBlkSet.insert(exitBlk); //non-duplicate insert
4385 + assert(exitBlkSet.size() > 0);
4386 + assert(exitBlks.size() == exitingBlks.size());
4389 + errs() << "Loop has " << exitBlkSet.size() << " exit blocks\n";
4392 + // Find exitLandBlk.
4393 + BlockT *exitLandBlk = NULL;
4394 + int numCloned = 0;
4395 + int numSerial = 0;
4397 + if (exitBlkSet.size() == 1) {
4398 + exitLandBlk = *exitBlkSet.begin();
4400 + exitLandBlk = findNearestCommonPostDom(exitBlkSet);
4402 + if (exitLandBlk == NULL) {
4406 + bool allInPath = true;
4407 + bool allNotInPath = true;
4408 + for (typename std::set<BlockT*>::const_iterator
4409 + iter = exitBlkSet.begin(),
4410 + iterEnd = exitBlkSet.end();
4411 + iter != iterEnd; ++iter) {
4412 + BlockT *exitBlk = *iter;
4414 + PathToKind pathKind = singlePathTo(exitBlk, exitLandBlk, true);
4416 + errs() << "BB" << exitBlk->getNumber()
4417 + << " to BB" << exitLandBlk->getNumber() << " PathToKind="
4418 + << pathKind << "\n";
4421 + allInPath = allInPath && (pathKind == SinglePath_InPath);
4422 + allNotInPath = allNotInPath && (pathKind == SinglePath_NotInPath);
4424 + if (!allInPath && !allNotInPath) {
4426 + errs() << "singlePath check fail\n";
4430 + } // check all exit blocks
4432 + if (allNotInPath) {
4434 + // TODO: Simplify, maybe separate function?
4435 + LoopT *parentLoopRep = loopRep->getParentLoop();
4436 + BlockT *parentLoopHeader = NULL;
4437 + if (parentLoopRep)
4438 + parentLoopHeader = parentLoopRep->getHeader();
4440 + if (exitLandBlk == parentLoopHeader &&
4441 + (exitLandBlk = relocateLoopcontBlock(parentLoopRep,
4444 + exitLandBlk)) != NULL) {
4446 + errs() << "relocateLoopcontBlock success\n";
4448 + } else if ((exitLandBlk = addLoopEndbranchBlock(loopRep,
4450 + exitBlks)) != NULL) {
4452 + errs() << "insertEndbranchBlock success\n";
4456 + errs() << "loop exit fail\n";
4462 + // Handle side entry to exit path.
4464 + exitBlkSet.clear();
4465 + for (typename BlockTSmallerVector::iterator iterExiting =
4466 + exitingBlks.begin(),
4467 + iterExitingEnd = exitingBlks.end();
4468 + iterExiting != iterExitingEnd; ++iterExiting) {
4469 + BlockT *exitingBlk = *iterExiting;
4470 + BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
4471 + BlockT *newExitBlk = exitBlk;
4473 + if (exitBlk != exitLandBlk && exitBlk->pred_size() > 1) {
4474 + newExitBlk = cloneBlockForPredecessor(exitBlk, exitingBlk);
4478 + numCloned += cloneOnSideEntryTo(exitingBlk, newExitBlk, exitLandBlk);
4480 + exitBlks.push_back(newExitBlk);
4481 + exitBlkSet.insert(newExitBlk);
4484 + for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
4485 + iterExitEnd = exitBlks.end();
4486 + iterExit != iterExitEnd; ++iterExit) {
4487 + BlockT *exitBlk = *iterExit;
4488 + numSerial += serialPatternMatch(exitBlk);
4491 + for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
4492 + iterExitEnd = exitBlks.end();
4493 + iterExit != iterExitEnd; ++iterExit) {
4494 + BlockT *exitBlk = *iterExit;
4495 + if (exitBlk->pred_size() > 1) {
4496 + if (exitBlk != exitLandBlk) {
4500 + if (exitBlk != exitLandBlk &&
4501 + (exitBlk->succ_size() != 1 ||
4502 + *exitBlk->succ_begin() != exitLandBlk)) {
4509 + exitLandBlk = recordLoopLandBlock(loopRep, exitLandBlk, exitBlks, exitBlkSet);
4511 + // Fold break into the breaking block. Leverage across level breaks.
4512 + assert(exitingBlks.size() == exitBlks.size());
4513 + for (typename BlockTSmallerVector::const_iterator iterExit = exitBlks.begin(),
4514 + iterExiting = exitingBlks.begin(), iterExitEnd = exitBlks.end();
4515 + iterExit != iterExitEnd; ++iterExit, ++iterExiting) {
4516 + BlockT *exitBlk = *iterExit;
4517 + BlockT *exitingBlk = *iterExiting;
4518 + assert(exitBlk->pred_size() == 1 || exitBlk == exitLandBlk);
4519 + LoopT *exitingLoop = loopInfo->getLoopFor(exitingBlk);
4520 + handleLoopbreak(exitingBlk, exitingLoop, exitBlk, loopRep, exitLandBlk);
4523 + int numBreak = static_cast<int>(exitingBlks.size());
4524 + numLoopbreakPatternMatch += numBreak;
4525 + numClonedBlock += numCloned;
4526 + return numBreak + numSerial + numCloned;
4527 +} //loopbreakPatternMatch
4529 +template<class PassT>
4530 +int CFGStructurizer<PassT>::loopcontPatternMatch(LoopT *loopRep,
4531 + BlockT *loopHeader) {
4533 + SmallVector<BlockT *, DEFAULT_VEC_SLOTS> contBlk;
4534 + for (typename InvBlockGTraits::ChildIteratorType iter =
4535 + InvBlockGTraits::child_begin(loopHeader),
4536 + iterEnd = InvBlockGTraits::child_end(loopHeader);
4537 + iter != iterEnd; ++iter) {
4538 + BlockT *curBlk = *iter;
4539 + if (loopRep->contains(curBlk)) {
4540 + handleLoopcontBlock(curBlk, loopInfo->getLoopFor(curBlk),
4541 + loopHeader, loopRep);
4542 + contBlk.push_back(curBlk);
4547 + for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator
4548 + iter = contBlk.begin(), iterEnd = contBlk.end();
4549 + iter != iterEnd; ++iter) {
4550 + (*iter)->removeSuccessor(loopHeader);
4553 + numLoopcontPatternMatch += numCont;
4556 +} //loopcontPatternMatch
4559 +template<class PassT>
4560 +bool CFGStructurizer<PassT>::isSameloopDetachedContbreak(BlockT *src1Blk,
4561 + BlockT *src2Blk) {
4562 + // return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in the
4563 + // same loop with LoopLandInfo without explicitly keeping track of
4564 + // loopContBlks and loopBreakBlks, this is a method to get the information.
4566 + if (src1Blk->succ_size() == 0) {
4567 + LoopT *loopRep = loopInfo->getLoopFor(src1Blk);
4568 + if (loopRep != NULL && loopRep == loopInfo->getLoopFor(src2Blk)) {
4569 + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
4570 + if (theEntry != NULL) {
4572 + errs() << "isLoopContBreakBlock yes src1 = BB"
4573 + << src1Blk->getNumber()
4574 + << " src2 = BB" << src2Blk->getNumber() << "\n";
4581 +} //isSameloopDetachedContbreak
4583 +template<class PassT>
4584 +int CFGStructurizer<PassT>::handleJumpintoIf(BlockT *headBlk,
4586 + BlockT *falseBlk) {
4587 + int num = handleJumpintoIfImp(headBlk, trueBlk, falseBlk);
4590 + errs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
4592 + num = handleJumpintoIfImp(headBlk, falseBlk, trueBlk);
4597 +template<class PassT>
4598 +int CFGStructurizer<PassT>::handleJumpintoIfImp(BlockT *headBlk,
4600 + BlockT *falseBlk) {
4604 + //trueBlk could be the common post dominator
4605 + downBlk = trueBlk;
4608 + errs() << "handleJumpintoIfImp head = BB" << headBlk->getNumber()
4609 + << " true = BB" << trueBlk->getNumber()
4610 + << ", numSucc=" << trueBlk->succ_size()
4611 + << " false = BB" << falseBlk->getNumber() << "\n";
4616 + errs() << "check down = BB" << downBlk->getNumber();
4619 + if (singlePathTo(falseBlk, downBlk) == SinglePath_InPath) {
4621 + errs() << " working\n";
4624 + num += cloneOnSideEntryTo(headBlk, trueBlk, downBlk);
4625 + num += cloneOnSideEntryTo(headBlk, falseBlk, downBlk);
4627 + numClonedBlock += num;
4628 + num += serialPatternMatch(*headBlk->succ_begin());
4629 + num += serialPatternMatch(*(++headBlk->succ_begin()));
4630 + num += ifPatternMatch(headBlk);
4636 + errs() << " not working\n";
4638 + downBlk = (downBlk->succ_size() == 1) ? (*downBlk->succ_begin()) : NULL;
4639 + } // walk down the postDomTree
4642 +} //handleJumpintoIf
4644 +template<class PassT>
4645 +void CFGStructurizer<PassT>::showImproveSimpleJumpintoIf(BlockT *headBlk,
4650 + errs() << "head = BB" << headBlk->getNumber()
4651 + << " size = " << headBlk->size();
4654 + headBlk->print(errs());
4659 + errs() << ", true = BB" << trueBlk->getNumber() << " size = "
4660 + << trueBlk->size() << " numPred = " << trueBlk->pred_size();
4663 + trueBlk->print(errs());
4668 + errs() << ", false = BB" << falseBlk->getNumber() << " size = "
4669 + << falseBlk->size() << " numPred = " << falseBlk->pred_size();
4672 + falseBlk->print(errs());
4677 + errs() << ", land = BB" << landBlk->getNumber() << " size = "
4678 + << landBlk->size() << " numPred = " << landBlk->pred_size();
4681 + landBlk->print(errs());
4687 +} //showImproveSimpleJumpintoIf
4689 +template<class PassT>
4690 +int CFGStructurizer<PassT>::improveSimpleJumpintoIf(BlockT *headBlk,
4693 + BlockT **plandBlk) {
4694 + bool migrateTrue = false;
4695 + bool migrateFalse = false;
4697 + BlockT *landBlk = *plandBlk;
4699 + assert((trueBlk == NULL || trueBlk->succ_size() <= 1)
4700 + && (falseBlk == NULL || falseBlk->succ_size() <= 1));
4702 + if (trueBlk == falseBlk) {
4706 + migrateTrue = needMigrateBlock(trueBlk);
4707 + migrateFalse = needMigrateBlock(falseBlk);
4709 + if (!migrateTrue && !migrateFalse) {
4713 + // If we need to migrate either trueBlk or falseBlk, migrate the rest that
4714 + // have more than one predecessor. Without doing this, its predecessor
4715 + // rather than headBlk will have undefined value in initReg.
4716 + if (!migrateTrue && trueBlk && trueBlk->pred_size() > 1) {
4717 + migrateTrue = true;
4719 + if (!migrateFalse && falseBlk && falseBlk->pred_size() > 1) {
4720 + migrateFalse = true;
4724 + errs() << "before improveSimpleJumpintoIf: ";
4725 + showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
4728 + // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
4730 + // new: headBlk => if () {initReg = 1; org trueBlk branch} else
4731 + // {initReg = 0; org falseBlk branch }
4732 + // => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
4734 + // if landBlk->pred_size() > 2, put the about if-else inside
4735 + // if (initReg !=2) {...}
4737 + // add initReg = initVal to headBlk
4739 + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
4740 + unsigned initReg =
4741 + funcRep->getRegInfo().createVirtualRegister(I32RC);
4742 + if (!migrateTrue || !migrateFalse) {
4743 + int initVal = migrateTrue ? 0 : 1;
4744 + CFGTraits::insertAssignInstrBefore(headBlk, passRep, initReg, initVal);
4747 + int numNewBlk = 0;
4749 + if (landBlk == NULL) {
4750 + landBlk = funcRep->CreateMachineBasicBlock();
4751 + funcRep->push_back(landBlk); //insert to function
4754 + trueBlk->addSuccessor(landBlk);
4756 + headBlk->addSuccessor(landBlk);
4760 + falseBlk->addSuccessor(landBlk);
4762 + headBlk->addSuccessor(landBlk);
4768 + bool landBlkHasOtherPred = (landBlk->pred_size() > 2);
4770 + //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
4771 + typename BlockT::iterator insertPos =
4772 + CFGTraits::getInstrPos
4773 + (landBlk, CFGTraits::insertInstrBefore(landBlk, AMDGPU::ENDIF, passRep));
4775 + if (landBlkHasOtherPred) {
4777 + funcRep->getRegInfo().createVirtualRegister(I32RC);
4778 + CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 2);
4779 + unsigned cmpResReg =
4780 + funcRep->getRegInfo().createVirtualRegister(I32RC);
4782 + CFGTraits::insertCompareInstrBefore(landBlk, insertPos, passRep, cmpResReg,
4784 + CFGTraits::insertCondBranchBefore(landBlk, insertPos,
4785 + AMDGPU::IF_PREDICATE_SET, passRep,
4786 + cmpResReg, DebugLoc());
4789 + CFGTraits::insertCondBranchBefore(landBlk, insertPos, AMDGPU::IF_PREDICATE_SET,
4790 + passRep, initReg, DebugLoc());
4792 + if (migrateTrue) {
4793 + migrateInstruction(trueBlk, landBlk, insertPos);
4794 + // need to unconditionally insert the assignment to ensure a path from its
4795 + // predecessor rather than headBlk has valid value in initReg if
4796 + // (initVal != 1).
4797 + CFGTraits::insertAssignInstrBefore(trueBlk, passRep, initReg, 1);
4799 + CFGTraits::insertInstrBefore(insertPos, AMDGPU::ELSE, passRep);
4801 + if (migrateFalse) {
4802 + migrateInstruction(falseBlk, landBlk, insertPos);
4803 + // need to unconditionally insert the assignment to ensure a path from its
4804 + // predecessor rather than headBlk has valid value in initReg if
4806 + CFGTraits::insertAssignInstrBefore(falseBlk, passRep, initReg, 0);
4809 + if (landBlkHasOtherPred) {
4811 + CFGTraits::insertInstrBefore(insertPos, AMDGPU::ENDIF, passRep);
4813 + // put initReg = 2 to other predecessors of landBlk
4814 + for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
4815 + predIterEnd = landBlk->pred_end(); predIter != predIterEnd;
4817 + BlockT *curBlk = *predIter;
4818 + if (curBlk != trueBlk && curBlk != falseBlk) {
4819 + CFGTraits::insertAssignInstrBefore(curBlk, passRep, initReg, 2);
4824 + errs() << "result from improveSimpleJumpintoIf: ";
4825 + showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
4829 + *plandBlk = landBlk;
4832 +} //improveSimpleJumpintoIf
4834 +template<class PassT>
4835 +void CFGStructurizer<PassT>::handleLoopbreak(BlockT *exitingBlk,
4836 + LoopT *exitingLoop,
4839 + BlockT *landBlk) {
4841 + errs() << "Trying to break loop-depth = " << getLoopDepth(exitLoop)
4842 + << " from loop-depth = " << getLoopDepth(exitingLoop) << "\n";
4844 + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
4846 + RegiT initReg = INVALIDREGNUM;
4847 + if (exitingLoop != exitLoop) {
4848 + initReg = static_cast<int>
4849 + (funcRep->getRegInfo().createVirtualRegister(I32RC));
4850 + assert(initReg != INVALIDREGNUM);
4851 + addLoopBreakInitReg(exitLoop, initReg);
4852 + while (exitingLoop != exitLoop && exitingLoop) {
4853 + addLoopBreakOnReg(exitingLoop, initReg);
4854 + exitingLoop = exitingLoop->getParentLoop();
4856 + assert(exitingLoop == exitLoop);
4859 + mergeLoopbreakBlock(exitingBlk, exitBlk, landBlk, initReg);
4861 +} //handleLoopbreak
4863 +template<class PassT>
4864 +void CFGStructurizer<PassT>::handleLoopcontBlock(BlockT *contingBlk,
4865 + LoopT *contingLoop,
4867 + LoopT *contLoop) {
4869 + errs() << "loopcontPattern cont = BB" << contingBlk->getNumber()
4870 + << " header = BB" << contBlk->getNumber() << "\n";
4872 + errs() << "Trying to continue loop-depth = "
4873 + << getLoopDepth(contLoop)
4874 + << " from loop-depth = " << getLoopDepth(contingLoop) << "\n";
4877 + RegiT initReg = INVALIDREGNUM;
4878 + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
4879 + if (contingLoop != contLoop) {
4880 + initReg = static_cast<int>
4881 + (funcRep->getRegInfo().createVirtualRegister(I32RC));
4882 + assert(initReg != INVALIDREGNUM);
4883 + addLoopContInitReg(contLoop, initReg);
4884 + while (contingLoop && contingLoop->getParentLoop() != contLoop) {
4885 + addLoopBreakOnReg(contingLoop, initReg); //not addLoopContOnReg
4886 + contingLoop = contingLoop->getParentLoop();
4888 + assert(contingLoop && contingLoop->getParentLoop() == contLoop);
4889 + addLoopContOnReg(contingLoop, initReg);
4892 + settleLoopcontBlock(contingBlk, contBlk, initReg);
4893 +} //handleLoopcontBlock
4895 +template<class PassT>
4896 +void CFGStructurizer<PassT>::mergeSerialBlock(BlockT *dstBlk, BlockT *srcBlk) {
4898 + errs() << "serialPattern BB" << dstBlk->getNumber()
4899 + << " <= BB" << srcBlk->getNumber() << "\n";
4901 + dstBlk->splice(dstBlk->end(), srcBlk, srcBlk->begin(), srcBlk->end());
4903 + dstBlk->removeSuccessor(srcBlk);
4904 + CFGTraits::cloneSuccessorList(dstBlk, srcBlk);
4906 + removeSuccessor(srcBlk);
4907 + retireBlock(dstBlk, srcBlk);
4908 +} //mergeSerialBlock
4910 +template<class PassT>
4911 +void CFGStructurizer<PassT>::mergeIfthenelseBlock(InstrT *branchInstr,
4915 + BlockT *landBlk) {
4917 + errs() << "ifPattern BB" << curBlk->getNumber();
4920 + errs() << "BB" << trueBlk->getNumber();
4922 + errs() << " } else ";
4925 + errs() << "BB" << falseBlk->getNumber();
4927 + errs() << " }\n ";
4928 + errs() << "landBlock: ";
4929 + if (landBlk == NULL) {
4932 + errs() << "BB" << landBlk->getNumber();
4937 + int oldOpcode = branchInstr->getOpcode();
4938 + DebugLoc branchDL = branchInstr->getDebugLoc();
4948 + typename BlockT::iterator branchInstrPos =
4949 + CFGTraits::getInstrPos(curBlk, branchInstr);
4950 + CFGTraits::insertCondBranchBefore(branchInstrPos,
4951 + CFGTraits::getBranchNzeroOpcode(oldOpcode),
4956 + curBlk->splice(branchInstrPos, trueBlk, trueBlk->begin(), trueBlk->end());
4957 + curBlk->removeSuccessor(trueBlk);
4958 + if (landBlk && trueBlk->succ_size()!=0) {
4959 + trueBlk->removeSuccessor(landBlk);
4961 + retireBlock(curBlk, trueBlk);
4963 + CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ELSE, passRep);
4966 + curBlk->splice(branchInstrPos, falseBlk, falseBlk->begin(),
4968 + curBlk->removeSuccessor(falseBlk);
4969 + if (landBlk && falseBlk->succ_size() != 0) {
4970 + falseBlk->removeSuccessor(landBlk);
4972 + retireBlock(curBlk, falseBlk);
4974 + CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ENDIF, passRep);
4976 + branchInstr->eraseFromParent();
4978 + if (landBlk && trueBlk && falseBlk) {
4979 + curBlk->addSuccessor(landBlk);
4982 +} //mergeIfthenelseBlock
4984 +template<class PassT>
4985 +void CFGStructurizer<PassT>::mergeLooplandBlock(BlockT *dstBlk,
4986 + LoopLandInfo *loopLand) {
4987 + BlockT *landBlk = loopLand->landBlk;
4990 + errs() << "loopPattern header = BB" << dstBlk->getNumber()
4991 + << " land = BB" << landBlk->getNumber() << "\n";
4994 + // Loop contInitRegs are init at the beginning of the loop.
4995 + for (typename std::set<RegiT>::const_iterator iter =
4996 + loopLand->contInitRegs.begin(),
4997 + iterEnd = loopLand->contInitRegs.end(); iter != iterEnd; ++iter) {
4998 + CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
5001 + /* we last inserted the DebugLoc in the
5002 + * BREAK_LOGICALZ_i32 or AMDGPU::BREAK_LOGICALNZ statement in the current dstBlk.
5003 + * search for the DebugLoc in that statement.
5004 + * if not found, we have to insert the empty/default DebugLoc */
5005 + InstrT *loopBreakInstr = CFGTraits::getLoopBreakInstr(dstBlk);
5006 + DebugLoc DLBreak = (loopBreakInstr) ? loopBreakInstr->getDebugLoc() : DebugLoc();
5008 + CFGTraits::insertInstrBefore(dstBlk, AMDGPU::WHILELOOP, passRep, DLBreak);
5009 + // Loop breakInitRegs are init before entering the loop.
5010 + for (typename std::set<RegiT>::const_iterator iter =
5011 + loopLand->breakInitRegs.begin(),
5012 + iterEnd = loopLand->breakInitRegs.end(); iter != iterEnd; ++iter) {
5013 + CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
5015 + // Loop endbranchInitRegs are init before entering the loop.
5016 + for (typename std::set<RegiT>::const_iterator iter =
5017 + loopLand->endbranchInitRegs.begin(),
5018 + iterEnd = loopLand->endbranchInitRegs.end(); iter != iterEnd; ++iter) {
5019 + CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
5022 + /* we last inserted the DebugLoc in the continue statement in the current dstBlk
5023 + * search for the DebugLoc in the continue statement.
5024 + * if not found, we have to insert the empty/default DebugLoc */
5025 + InstrT *continueInstr = CFGTraits::getContinueInstr(dstBlk);
5026 + DebugLoc DLContinue = (continueInstr) ? continueInstr->getDebugLoc() : DebugLoc();
5028 + CFGTraits::insertInstrEnd(dstBlk, AMDGPU::ENDLOOP, passRep, DLContinue);
5029 + // Loop breakOnRegs are check after the ENDLOOP: break the loop outside this
5031 + for (typename std::set<RegiT>::const_iterator iter =
5032 + loopLand->breakOnRegs.begin(),
5033 + iterEnd = loopLand->breakOnRegs.end(); iter != iterEnd; ++iter) {
5034 + CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::PREDICATED_BREAK, passRep,
5038 + // Loop contOnRegs are check after the ENDLOOP: cont the loop outside this
5040 + for (std::set<RegiT>::const_iterator iter = loopLand->contOnRegs.begin(),
5041 + iterEnd = loopLand->contOnRegs.end(); iter != iterEnd; ++iter) {
5042 + CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::CONTINUE_LOGICALNZ_i32,
5046 + dstBlk->splice(dstBlk->end(), landBlk, landBlk->begin(), landBlk->end());
5048 + for (typename BlockT::succ_iterator iter = landBlk->succ_begin(),
5049 + iterEnd = landBlk->succ_end(); iter != iterEnd; ++iter) {
5050 + dstBlk->addSuccessor(*iter); // *iter's predecessor is also taken care of.
5053 + removeSuccessor(landBlk);
5054 + retireBlock(dstBlk, landBlk);
5055 +} //mergeLooplandBlock
5057 +template<class PassT>
5058 +void CFGStructurizer<PassT>::reversePredicateSetter(typename BlockT::iterator I) {
5060 + if (I->getOpcode() == AMDGPU::PRED_X) {
5061 + switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
5062 + case OPCODE_IS_ZERO_INT:
5063 + static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO_INT);
5065 + case OPCODE_IS_NOT_ZERO_INT:
5066 + static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO_INT);
5068 + case OPCODE_IS_ZERO:
5069 + static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO);
5071 + case OPCODE_IS_NOT_ZERO:
5072 + static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO);
5075 + assert(0 && "PRED_X Opcode invalid!");
5081 +template<class PassT>
5082 +void CFGStructurizer<PassT>::mergeLoopbreakBlock(BlockT *exitingBlk,
5084 + BlockT *exitLandBlk,
5087 + errs() << "loopbreakPattern exiting = BB" << exitingBlk->getNumber()
5088 + << " exit = BB" << exitBlk->getNumber()
5089 + << " land = BB" << exitLandBlk->getNumber() << "\n";
5092 + InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(exitingBlk);
5093 + assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
5095 + DebugLoc DL = branchInstr->getDebugLoc();
5097 + BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
5099 + // transform exitingBlk to
5101 + // exitBlk (if exitBlk != exitLandBlk)
5105 + // successor = {orgSuccessor(exitingBlk) - exitBlk}
5107 + typename BlockT::iterator branchInstrPos =
5108 + CFGTraits::getInstrPos(exitingBlk, branchInstr);
5110 + if (exitBlk == exitLandBlk && setReg == INVALIDREGNUM) {
5113 + if (trueBranch != exitBlk) {
5114 + reversePredicateSetter(branchInstrPos);
5116 + CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL);
5118 + if (trueBranch != exitBlk) {
5119 + reversePredicateSetter(branchInstr);
5121 + CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL);
5122 + if (exitBlk != exitLandBlk) {
5123 + //splice is insert-before ...
5124 + exitingBlk->splice(branchInstrPos, exitBlk, exitBlk->begin(),
5127 + if (setReg != INVALIDREGNUM) {
5128 + CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
5130 + CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::BREAK, passRep);
5133 + //now branchInst can be erase safely
5134 + branchInstr->eraseFromParent();
5136 + //now take care of successors, retire blocks
5137 + exitingBlk->removeSuccessor(exitBlk);
5138 + if (exitBlk != exitLandBlk) {
5139 + //splice is insert-before ...
5140 + exitBlk->removeSuccessor(exitLandBlk);
5141 + retireBlock(exitingBlk, exitBlk);
5144 +} //mergeLoopbreakBlock
5146 +template<class PassT>
5147 +void CFGStructurizer<PassT>::settleLoopcontBlock(BlockT *contingBlk,
5151 + errs() << "settleLoopcontBlock conting = BB"
5152 + << contingBlk->getNumber()
5153 + << ", cont = BB" << contBlk->getNumber() << "\n";
5156 + InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(contingBlk);
5157 + if (branchInstr) {
5158 + assert(CFGTraits::isCondBranch(branchInstr));
5159 + typename BlockT::iterator branchInstrPos =
5160 + CFGTraits::getInstrPos(contingBlk, branchInstr);
5161 + BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
5162 + int oldOpcode = branchInstr->getOpcode();
5163 + DebugLoc DL = branchInstr->getDebugLoc();
5165 + // transform contingBlk to
5167 + // move instr after branchInstr
5173 + // successor = {orgSuccessor(contingBlk) - loopHeader}
5175 + bool useContinueLogical =
5176 + (setReg == INVALIDREGNUM && (&*contingBlk->rbegin()) == branchInstr);
5178 + if (useContinueLogical == false) {
5179 + int branchOpcode =
5180 + trueBranch == contBlk ? CFGTraits::getBranchNzeroOpcode(oldOpcode)
5181 + : CFGTraits::getBranchZeroOpcode(oldOpcode);
5183 + CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
5185 + if (setReg != INVALIDREGNUM) {
5186 + CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
5187 + // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
5188 + CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, DL);
5190 + // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
5191 + CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, DL);
5194 + CFGTraits::insertInstrEnd(contingBlk, AMDGPU::ENDIF, passRep, DL);
5196 + int branchOpcode =
5197 + trueBranch == contBlk ? CFGTraits::getContinueNzeroOpcode(oldOpcode)
5198 + : CFGTraits::getContinueZeroOpcode(oldOpcode);
5200 + CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
5203 + branchInstr->eraseFromParent();
5205 + // if we've arrived here then we've already erased the branch instruction
5206 + // travel back up the basic block to see the last reference of our debug location
5207 + // we've just inserted that reference here so it should be representative
5208 + if (setReg != INVALIDREGNUM) {
5209 + CFGTraits::insertAssignInstrBefore(contingBlk, passRep, setReg, 1);
5210 + // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
5211 + CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
5213 + // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
5214 + CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
5218 +} //settleLoopcontBlock
5220 +// BBs in exitBlkSet are determined as in break-path for loopRep,
5221 +// before we can put code for BBs as inside loop-body for loopRep
5222 +// check whether those BBs are determined as cont-BB for parentLoopRep
5224 +// If so, generate a new BB newBlk
5225 +// (1) set newBlk common successor of BBs in exitBlkSet
5226 +// (2) change the continue-instr in BBs in exitBlkSet to break-instr
5227 +// (3) generate continue-instr in newBlk
5229 +template<class PassT>
5230 +typename CFGStructurizer<PassT>::BlockT *
5231 +CFGStructurizer<PassT>::relocateLoopcontBlock(LoopT *parentLoopRep,
5233 + std::set<BlockT *> &exitBlkSet,
5234 + BlockT *exitLandBlk) {
5235 + std::set<BlockT *> endBlkSet;
5239 + for (typename std::set<BlockT *>::const_iterator iter = exitBlkSet.begin(),
5240 + iterEnd = exitBlkSet.end();
5241 + iter != iterEnd; ++iter) {
5242 + BlockT *exitBlk = *iter;
5243 + BlockT *endBlk = singlePathEnd(exitBlk, exitLandBlk);
5245 + if (endBlk == NULL || CFGTraits::getContinueInstr(endBlk) == NULL)
5248 + endBlkSet.insert(endBlk);
5251 + BlockT *newBlk = funcRep->CreateMachineBasicBlock();
5252 + funcRep->push_back(newBlk); //insert to function
5253 + CFGTraits::insertInstrEnd(newBlk, AMDGPU::CONTINUE, passRep);
5254 + SHOWNEWBLK(newBlk, "New continue block: ");
5256 + for (typename std::set<BlockT*>::const_iterator iter = endBlkSet.begin(),
5257 + iterEnd = endBlkSet.end();
5258 + iter != iterEnd; ++iter) {
5259 + BlockT *endBlk = *iter;
5260 + InstrT *contInstr = CFGTraits::getContinueInstr(endBlk);
5262 + contInstr->eraseFromParent();
5264 + endBlk->addSuccessor(newBlk);
5266 + errs() << "Add new continue Block to BB"
5267 + << endBlk->getNumber() << " successors\n";
5272 +} //relocateLoopcontBlock
5275 +// LoopEndbranchBlock is a BB created by the CFGStructurizer to use as
5276 +// LoopLandBlock. This BB branch on the loop endBranchInit register to the
5277 +// pathes corresponding to the loop exiting branches.
5279 +template<class PassT>
5280 +typename CFGStructurizer<PassT>::BlockT *
5281 +CFGStructurizer<PassT>::addLoopEndbranchBlock(LoopT *loopRep,
5282 + BlockTSmallerVector &exitingBlks,
5283 + BlockTSmallerVector &exitBlks) {
5284 + const AMDGPUInstrInfo *tii =
5285 + static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
5286 + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
5288 + RegiT endBranchReg = static_cast<int>
5289 + (funcRep->getRegInfo().createVirtualRegister(I32RC));
5290 + assert(endBranchReg >= 0);
5292 + // reg = 0 before entering the loop
5293 + addLoopEndbranchInitReg(loopRep, endBranchReg);
5295 + uint32_t numBlks = static_cast<uint32_t>(exitingBlks.size());
5296 + assert(numBlks >=2 && numBlks == exitBlks.size());
5298 + BlockT *preExitingBlk = exitingBlks[0];
5299 + BlockT *preExitBlk = exitBlks[0];
5300 + BlockT *preBranchBlk = funcRep->CreateMachineBasicBlock();
5301 + funcRep->push_back(preBranchBlk); //insert to function
5302 + SHOWNEWBLK(preBranchBlk, "New loopEndbranch block: ");
5304 + BlockT *newLandBlk = preBranchBlk;
5306 + CFGTraits::replaceInstrUseOfBlockWith(preExitingBlk, preExitBlk,
5308 + preExitingBlk->removeSuccessor(preExitBlk);
5309 + preExitingBlk->addSuccessor(newLandBlk);
5311 + //it is redundant to add reg = 0 to exitingBlks[0]
5313 + // For 1..n th exiting path (the last iteration handles two pathes) create the
5314 + // branch to the previous path and the current path.
5315 + for (uint32_t i = 1; i < numBlks; ++i) {
5316 + BlockT *curExitingBlk = exitingBlks[i];
5317 + BlockT *curExitBlk = exitBlks[i];
5318 + BlockT *curBranchBlk;
5320 + if (i == numBlks - 1) {
5321 + curBranchBlk = curExitBlk;
5323 + curBranchBlk = funcRep->CreateMachineBasicBlock();
5324 + funcRep->push_back(curBranchBlk); //insert to function
5325 + SHOWNEWBLK(curBranchBlk, "New loopEndbranch block: ");
5328 + // Add reg = i to exitingBlks[i].
5329 + CFGTraits::insertAssignInstrBefore(curExitingBlk, passRep,
5332 + // Remove the edge (exitingBlks[i] exitBlks[i]) add new edge
5333 + // (exitingBlks[i], newLandBlk).
5334 + CFGTraits::replaceInstrUseOfBlockWith(curExitingBlk, curExitBlk,
5336 + curExitingBlk->removeSuccessor(curExitBlk);
5337 + curExitingBlk->addSuccessor(newLandBlk);
5339 + // add to preBranchBlk the branch instruction:
5340 + // if (endBranchReg == preVal)
5345 + // preValReg = i - 1
5348 + RegiT preValReg = static_cast<int>
5349 + (funcRep->getRegInfo().createVirtualRegister(I32RC));
5351 + preBranchBlk->insert(preBranchBlk->begin(),
5352 + tii->getMovImmInstr(preBranchBlk->getParent(), preValReg,
5355 + // condResReg = (endBranchReg == preValReg)
5356 + RegiT condResReg = static_cast<int>
5357 + (funcRep->getRegInfo().createVirtualRegister(I32RC));
5358 + BuildMI(preBranchBlk, DL, tii->get(tii->getIEQOpcode()), condResReg)
5359 + .addReg(endBranchReg).addReg(preValReg);
5361 + BuildMI(preBranchBlk, DL, tii->get(AMDGPU::BRANCH_COND_i32))
5362 + .addMBB(preExitBlk).addReg(condResReg);
5364 + preBranchBlk->addSuccessor(preExitBlk);
5365 + preBranchBlk->addSuccessor(curBranchBlk);
5367 + // Update preExitingBlk, preExitBlk, preBranchBlk.
5368 + preExitingBlk = curExitingBlk;
5369 + preExitBlk = curExitBlk;
5370 + preBranchBlk = curBranchBlk;
5372 + } //end for 1 .. n blocks
5374 + return newLandBlk;
5375 +} //addLoopEndbranchBlock
5377 +template<class PassT>
5378 +typename CFGStructurizer<PassT>::PathToKind
5379 +CFGStructurizer<PassT>::singlePathTo(BlockT *srcBlk, BlockT *dstBlk,
5380 + bool allowSideEntry) {
5383 + if (srcBlk == dstBlk) {
5384 + return SinglePath_InPath;
5387 + while (srcBlk && srcBlk->succ_size() == 1) {
5388 + srcBlk = *srcBlk->succ_begin();
5389 + if (srcBlk == dstBlk) {
5390 + return SinglePath_InPath;
5393 + if (!allowSideEntry && srcBlk->pred_size() > 1) {
5394 + return Not_SinglePath;
5398 + if (srcBlk && srcBlk->succ_size()==0) {
5399 + return SinglePath_NotInPath;
5402 + return Not_SinglePath;
5405 +// If there is a single path from srcBlk to dstBlk, return the last block before
5406 +// dstBlk If there is a single path from srcBlk->end without dstBlk, return the
5407 +// last block in the path Otherwise, return NULL
5408 +template<class PassT>
5409 +typename CFGStructurizer<PassT>::BlockT *
5410 +CFGStructurizer<PassT>::singlePathEnd(BlockT *srcBlk, BlockT *dstBlk,
5411 + bool allowSideEntry) {
5414 + if (srcBlk == dstBlk) {
5418 + if (srcBlk->succ_size() == 0) {
5422 + while (srcBlk && srcBlk->succ_size() == 1) {
5423 + BlockT *preBlk = srcBlk;
5425 + srcBlk = *srcBlk->succ_begin();
5426 + if (srcBlk == NULL) {
5430 + if (!allowSideEntry && srcBlk->pred_size() > 1) {
5435 + if (srcBlk && srcBlk->succ_size()==0) {
5443 +template<class PassT>
5444 +int CFGStructurizer<PassT>::cloneOnSideEntryTo(BlockT *preBlk, BlockT *srcBlk,
5447 + assert(preBlk->isSuccessor(srcBlk));
5448 + while (srcBlk && srcBlk != dstBlk) {
5449 + assert(srcBlk->succ_size() == 1);
5450 + if (srcBlk->pred_size() > 1) {
5451 + srcBlk = cloneBlockForPredecessor(srcBlk, preBlk);
5456 + srcBlk = *srcBlk->succ_begin();
5460 +} //cloneOnSideEntryTo
5462 +template<class PassT>
5463 +typename CFGStructurizer<PassT>::BlockT *
5464 +CFGStructurizer<PassT>::cloneBlockForPredecessor(BlockT *curBlk,
5465 + BlockT *predBlk) {
5466 + assert(predBlk->isSuccessor(curBlk) &&
5467 + "succBlk is not a prececessor of curBlk");
5469 + BlockT *cloneBlk = CFGTraits::clone(curBlk); //clone instructions
5470 + CFGTraits::replaceInstrUseOfBlockWith(predBlk, curBlk, cloneBlk);
5471 + //srcBlk, oldBlk, newBlk
5473 + predBlk->removeSuccessor(curBlk);
5474 + predBlk->addSuccessor(cloneBlk);
5476 + // add all successor to cloneBlk
5477 + CFGTraits::cloneSuccessorList(cloneBlk, curBlk);
5479 + numClonedInstr += curBlk->size();
5482 + errs() << "Cloned block: " << "BB"
5483 + << curBlk->getNumber() << "size " << curBlk->size() << "\n";
5486 + SHOWNEWBLK(cloneBlk, "result of Cloned block: ");
5489 +} //cloneBlockForPredecessor
5491 +template<class PassT>
5492 +typename CFGStructurizer<PassT>::BlockT *
5493 +CFGStructurizer<PassT>::exitingBlock2ExitBlock(LoopT *loopRep,
5494 + BlockT *exitingBlk) {
5495 + BlockT *exitBlk = NULL;
5497 + for (typename BlockT::succ_iterator iterSucc = exitingBlk->succ_begin(),
5498 + iterSuccEnd = exitingBlk->succ_end();
5499 + iterSucc != iterSuccEnd; ++iterSucc) {
5500 + BlockT *curBlk = *iterSucc;
5501 + if (!loopRep->contains(curBlk)) {
5502 + assert(exitBlk == NULL);
5507 + assert(exitBlk != NULL);
5510 +} //exitingBlock2ExitBlock
5512 +template<class PassT>
5513 +void CFGStructurizer<PassT>::migrateInstruction(BlockT *srcBlk,
5515 + InstrIterator insertPos) {
5516 + InstrIterator spliceEnd;
5517 + //look for the input branchinstr, not the AMDGPU branchinstr
5518 + InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
5519 + if (branchInstr == NULL) {
5521 + errs() << "migrateInstruction don't see branch instr\n" ;
5523 + spliceEnd = srcBlk->end();
5526 + errs() << "migrateInstruction see branch instr\n" ;
5527 + branchInstr->dump();
5529 + spliceEnd = CFGTraits::getInstrPos(srcBlk, branchInstr);
5532 + errs() << "migrateInstruction before splice dstSize = " << dstBlk->size()
5533 + << "srcSize = " << srcBlk->size() << "\n";
5536 + //splice insert before insertPos
5537 + dstBlk->splice(insertPos, srcBlk, srcBlk->begin(), spliceEnd);
5540 + errs() << "migrateInstruction after splice dstSize = " << dstBlk->size()
5541 + << "srcSize = " << srcBlk->size() << "\n";
5543 +} //migrateInstruction
5545 +// normalizeInfiniteLoopExit change
5547 +// uncond_br LoopHeader
5551 +// cond_br 1 LoopHeader dummyExit
5552 +// and return the newly added dummy exit block
5554 +template<class PassT>
5555 +typename CFGStructurizer<PassT>::BlockT *
5556 +CFGStructurizer<PassT>::normalizeInfiniteLoopExit(LoopT* LoopRep) {
5557 + BlockT *loopHeader;
5558 + BlockT *loopLatch;
5559 + loopHeader = LoopRep->getHeader();
5560 + loopLatch = LoopRep->getLoopLatch();
5561 + BlockT *dummyExitBlk = NULL;
5562 + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
5563 + if (loopHeader!=NULL && loopLatch!=NULL) {
5564 + InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(loopLatch);
5565 + if (branchInstr!=NULL && CFGTraits::isUncondBranch(branchInstr)) {
5566 + dummyExitBlk = funcRep->CreateMachineBasicBlock();
5567 + funcRep->push_back(dummyExitBlk); //insert to function
5568 + SHOWNEWBLK(dummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
5570 + if (DEBUGME) errs() << "Old branch instr: " << *branchInstr << "\n";
5572 + typename BlockT::iterator insertPos =
5573 + CFGTraits::getInstrPos(loopLatch, branchInstr);
5575 + funcRep->getRegInfo().createVirtualRegister(I32RC);
5576 + CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 1);
5577 + InstrT *newInstr =
5578 + CFGTraits::insertInstrBefore(insertPos, AMDGPU::BRANCH_COND_i32, passRep);
5579 + MachineInstrBuilder(newInstr).addMBB(loopHeader).addReg(immReg, false);
5581 + SHOWNEWINSTR(newInstr);
5583 + branchInstr->eraseFromParent();
5584 + loopLatch->addSuccessor(dummyExitBlk);
5588 + return dummyExitBlk;
5589 +} //normalizeInfiniteLoopExit
5591 +template<class PassT>
5592 +void CFGStructurizer<PassT>::removeUnconditionalBranch(BlockT *srcBlk) {
5593 + InstrT *branchInstr;
5595 + // I saw two unconditional branch in one basic block in example
5596 + // test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
5597 + while ((branchInstr = CFGTraits::getLoopendBlockBranchInstr(srcBlk))
5598 + && CFGTraits::isUncondBranch(branchInstr)) {
5600 + errs() << "Removing unconditional branch instruction" ;
5601 + branchInstr->dump();
5603 + branchInstr->eraseFromParent();
5605 +} //removeUnconditionalBranch
5607 +template<class PassT>
5608 +void CFGStructurizer<PassT>::removeRedundantConditionalBranch(BlockT *srcBlk) {
5609 + if (srcBlk->succ_size() == 2) {
5610 + BlockT *blk1 = *srcBlk->succ_begin();
5611 + BlockT *blk2 = *(++srcBlk->succ_begin());
5613 + if (blk1 == blk2) {
5614 + InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
5615 + assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
5617 + errs() << "Removing unneeded conditional branch instruction" ;
5618 + branchInstr->dump();
5620 + branchInstr->eraseFromParent();
5621 + SHOWNEWBLK(blk1, "Removing redundant successor");
5622 + srcBlk->removeSuccessor(blk1);
5625 +} //removeRedundantConditionalBranch
5627 +template<class PassT>
5628 +void CFGStructurizer<PassT>::addDummyExitBlock(SmallVector<BlockT*,
5629 + DEFAULT_VEC_SLOTS> &retBlks) {
5630 + BlockT *dummyExitBlk = funcRep->CreateMachineBasicBlock();
5631 + funcRep->push_back(dummyExitBlk); //insert to function
5632 + CFGTraits::insertInstrEnd(dummyExitBlk, AMDGPU::RETURN, passRep);
5634 + for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator iter =
5636 + iterEnd = retBlks.end(); iter != iterEnd; ++iter) {
5637 + BlockT *curBlk = *iter;
5638 + InstrT *curInstr = CFGTraits::getReturnInstr(curBlk);
5640 + curInstr->eraseFromParent();
5642 + curBlk->addSuccessor(dummyExitBlk);
5644 + errs() << "Add dummyExitBlock to BB" << curBlk->getNumber()
5645 + << " successors\n";
5649 + SHOWNEWBLK(dummyExitBlk, "DummyExitBlock: ");
5650 +} //addDummyExitBlock
5652 +template<class PassT>
5653 +void CFGStructurizer<PassT>::removeSuccessor(BlockT *srcBlk) {
5654 + while (srcBlk->succ_size()) {
5655 + srcBlk->removeSuccessor(*srcBlk->succ_begin());
5659 +template<class PassT>
5660 +void CFGStructurizer<PassT>::recordSccnum(BlockT *srcBlk, int sccNum) {
5661 + BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
5663 + if (srcBlkInfo == NULL) {
5664 + srcBlkInfo = new BlockInfo();
5667 + srcBlkInfo->sccNum = sccNum;
5670 +template<class PassT>
5671 +int CFGStructurizer<PassT>::getSCCNum(BlockT *srcBlk) {
5672 + BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
5673 + return srcBlkInfo ? srcBlkInfo->sccNum : INVALIDSCCNUM;
5676 +template<class PassT>
5677 +void CFGStructurizer<PassT>::retireBlock(BlockT *dstBlk, BlockT *srcBlk) {
5679 + errs() << "Retiring BB" << srcBlk->getNumber() << "\n";
5682 + BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
5684 + if (srcBlkInfo == NULL) {
5685 + srcBlkInfo = new BlockInfo();
5688 + srcBlkInfo->isRetired = true;
5689 + assert(srcBlk->succ_size() == 0 && srcBlk->pred_size() == 0
5690 + && "can't retire block yet");
5693 +template<class PassT>
5694 +bool CFGStructurizer<PassT>::isRetiredBlock(BlockT *srcBlk) {
5695 + BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
5696 + return (srcBlkInfo && srcBlkInfo->isRetired);
5699 +template<class PassT>
5700 +bool CFGStructurizer<PassT>::isActiveLoophead(BlockT *curBlk) {
5701 + LoopT *loopRep = loopInfo->getLoopFor(curBlk);
5702 + while (loopRep && loopRep->getHeader() == curBlk) {
5703 + LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
5705 + if(loopLand == NULL)
5708 + BlockT *landBlk = loopLand->landBlk;
5710 + if (!isRetiredBlock(landBlk)) {
5714 + loopRep = loopRep->getParentLoop();
5718 +} //isActiveLoophead
5720 +template<class PassT>
5721 +bool CFGStructurizer<PassT>::needMigrateBlock(BlockT *blk) {
5722 + const unsigned blockSizeThreshold = 30;
5723 + const unsigned cloneInstrThreshold = 100;
5725 + bool multiplePreds = blk && (blk->pred_size() > 1);
5727 + if(!multiplePreds)
5730 + unsigned blkSize = blk->size();
5731 + return ((blkSize > blockSizeThreshold)
5732 + && (blkSize * (blk->pred_size() - 1) > cloneInstrThreshold));
5733 +} //needMigrateBlock
5735 +template<class PassT>
5736 +typename CFGStructurizer<PassT>::BlockT *
5737 +CFGStructurizer<PassT>::recordLoopLandBlock(LoopT *loopRep, BlockT *landBlk,
5738 + BlockTSmallerVector &exitBlks,
5739 + std::set<BlockT *> &exitBlkSet) {
5740 + SmallVector<BlockT *, DEFAULT_VEC_SLOTS> inpathBlks; //in exit path blocks
5742 + for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
5743 + predIterEnd = landBlk->pred_end();
5744 + predIter != predIterEnd; ++predIter) {
5745 + BlockT *curBlk = *predIter;
5746 + if (loopRep->contains(curBlk) || exitBlkSet.count(curBlk)) {
5747 + inpathBlks.push_back(curBlk);
5751 + //if landBlk has predecessors that are not in the given loop,
5752 + //create a new block
5753 + BlockT *newLandBlk = landBlk;
5754 + if (inpathBlks.size() != landBlk->pred_size()) {
5755 + newLandBlk = funcRep->CreateMachineBasicBlock();
5756 + funcRep->push_back(newLandBlk); //insert to function
5757 + newLandBlk->addSuccessor(landBlk);
5758 + for (typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::iterator iter =
5759 + inpathBlks.begin(),
5760 + iterEnd = inpathBlks.end(); iter != iterEnd; ++iter) {
5761 + BlockT *curBlk = *iter;
5762 + CFGTraits::replaceInstrUseOfBlockWith(curBlk, landBlk, newLandBlk);
5763 + //srcBlk, oldBlk, newBlk
5764 + curBlk->removeSuccessor(landBlk);
5765 + curBlk->addSuccessor(newLandBlk);
5767 + for (size_t i = 0, tot = exitBlks.size(); i < tot; ++i) {
5768 + if (exitBlks[i] == landBlk) {
5769 + exitBlks[i] = newLandBlk;
5772 + SHOWNEWBLK(newLandBlk, "NewLandingBlock: ");
5775 + setLoopLandBlock(loopRep, newLandBlk);
5777 + return newLandBlk;
5778 +} // recordLoopbreakLand
5780 +template<class PassT>
5781 +void CFGStructurizer<PassT>::setLoopLandBlock(LoopT *loopRep, BlockT *blk) {
5782 + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
5784 + if (theEntry == NULL) {
5785 + theEntry = new LoopLandInfo();
5787 + assert(theEntry->landBlk == NULL);
5789 + if (blk == NULL) {
5790 + blk = funcRep->CreateMachineBasicBlock();
5791 + funcRep->push_back(blk); //insert to function
5792 + SHOWNEWBLK(blk, "DummyLandingBlock for loop without break: ");
5795 + theEntry->landBlk = blk;
5798 + errs() << "setLoopLandBlock loop-header = BB"
5799 + << loopRep->getHeader()->getNumber()
5800 + << " landing-block = BB" << blk->getNumber() << "\n";
5802 +} // setLoopLandBlock
5804 +template<class PassT>
5805 +void CFGStructurizer<PassT>::addLoopBreakOnReg(LoopT *loopRep, RegiT regNum) {
5806 + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
5808 + if (theEntry == NULL) {
5809 + theEntry = new LoopLandInfo();
5812 + theEntry->breakOnRegs.insert(regNum);
5815 + errs() << "addLoopBreakOnReg loop-header = BB"
5816 + << loopRep->getHeader()->getNumber()
5817 + << " regNum = " << regNum << "\n";
5819 +} // addLoopBreakOnReg
5821 +template<class PassT>
5822 +void CFGStructurizer<PassT>::addLoopContOnReg(LoopT *loopRep, RegiT regNum) {
5823 + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
5825 + if (theEntry == NULL) {
5826 + theEntry = new LoopLandInfo();
5828 + theEntry->contOnRegs.insert(regNum);
5831 + errs() << "addLoopContOnReg loop-header = BB"
5832 + << loopRep->getHeader()->getNumber()
5833 + << " regNum = " << regNum << "\n";
5835 +} // addLoopContOnReg
5837 +template<class PassT>
5838 +void CFGStructurizer<PassT>::addLoopBreakInitReg(LoopT *loopRep, RegiT regNum) {
5839 + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
5841 + if (theEntry == NULL) {
5842 + theEntry = new LoopLandInfo();
5844 + theEntry->breakInitRegs.insert(regNum);
5847 + errs() << "addLoopBreakInitReg loop-header = BB"
5848 + << loopRep->getHeader()->getNumber()
5849 + << " regNum = " << regNum << "\n";
5851 +} // addLoopBreakInitReg
5853 +template<class PassT>
5854 +void CFGStructurizer<PassT>::addLoopContInitReg(LoopT *loopRep, RegiT regNum) {
5855 + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
5857 + if (theEntry == NULL) {
5858 + theEntry = new LoopLandInfo();
5860 + theEntry->contInitRegs.insert(regNum);
5863 + errs() << "addLoopContInitReg loop-header = BB"
5864 + << loopRep->getHeader()->getNumber()
5865 + << " regNum = " << regNum << "\n";
5867 +} // addLoopContInitReg
5869 +template<class PassT>
5870 +void CFGStructurizer<PassT>::addLoopEndbranchInitReg(LoopT *loopRep,
5872 + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
5874 + if (theEntry == NULL) {
5875 + theEntry = new LoopLandInfo();
5877 + theEntry->endbranchInitRegs.insert(regNum);
5880 + errs() << "addLoopEndbranchInitReg loop-header = BB"
5881 + << loopRep->getHeader()->getNumber()
5882 + << " regNum = " << regNum << "\n";
5884 +} // addLoopEndbranchInitReg
5886 +template<class PassT>
5887 +typename CFGStructurizer<PassT>::LoopLandInfo *
5888 +CFGStructurizer<PassT>::getLoopLandInfo(LoopT *loopRep) {
5889 + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
5892 +} // getLoopLandInfo
5894 +template<class PassT>
5895 +typename CFGStructurizer<PassT>::BlockT *
5896 +CFGStructurizer<PassT>::getLoopLandBlock(LoopT *loopRep) {
5897 + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
5899 + return theEntry ? theEntry->landBlk : NULL;
5900 +} // getLoopLandBlock
5903 +template<class PassT>
5904 +bool CFGStructurizer<PassT>::hasBackEdge(BlockT *curBlk) {
5905 + LoopT *loopRep = loopInfo->getLoopFor(curBlk);
5906 + if (loopRep == NULL)
5909 + BlockT *loopHeader = loopRep->getHeader();
5911 + return curBlk->isSuccessor(loopHeader);
5915 +template<class PassT>
5916 +unsigned CFGStructurizer<PassT>::getLoopDepth(LoopT *loopRep) {
5917 + return loopRep ? loopRep->getLoopDepth() : 0;
5920 +template<class PassT>
5921 +int CFGStructurizer<PassT>::countActiveBlock
5922 +(typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterStart,
5923 + typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterEnd) {
5925 + while (iterStart != iterEnd) {
5926 + if (!isRetiredBlock(*iterStart)) {
5933 +} //countActiveBlock
5935 +// This is work around solution for findNearestCommonDominator not avaiable to
5936 +// post dom a proper fix should go to Dominators.h.
5938 +template<class PassT>
5939 +typename CFGStructurizer<PassT>::BlockT*
5940 +CFGStructurizer<PassT>::findNearestCommonPostDom(BlockT *blk1, BlockT *blk2) {
5942 + if (postDomTree->dominates(blk1, blk2)) {
5945 + if (postDomTree->dominates(blk2, blk1)) {
5949 + DomTreeNodeT *node1 = postDomTree->getNode(blk1);
5950 + DomTreeNodeT *node2 = postDomTree->getNode(blk2);
5952 + // Handle newly cloned node.
5953 + if (node1 == NULL && blk1->succ_size() == 1) {
5954 + return findNearestCommonPostDom(*blk1->succ_begin(), blk2);
5956 + if (node2 == NULL && blk2->succ_size() == 1) {
5957 + return findNearestCommonPostDom(blk1, *blk2->succ_begin());
5960 + if (node1 == NULL || node2 == NULL) {
5964 + node1 = node1->getIDom();
5966 + if (postDomTree->dominates(node1, node2)) {
5967 + return node1->getBlock();
5969 + node1 = node1->getIDom();
5975 +template<class PassT>
5976 +typename CFGStructurizer<PassT>::BlockT *
5977 +CFGStructurizer<PassT>::findNearestCommonPostDom
5978 +(typename std::set<BlockT *> &blks) {
5979 + BlockT *commonDom;
5980 + typename std::set<BlockT *>::const_iterator iter = blks.begin();
5981 + typename std::set<BlockT *>::const_iterator iterEnd = blks.end();
5982 + for (commonDom = *iter; iter != iterEnd && commonDom != NULL; ++iter) {
5983 + BlockT *curBlk = *iter;
5984 + if (curBlk != commonDom) {
5985 + commonDom = findNearestCommonPostDom(curBlk, commonDom);
5990 + errs() << "Common post dominator for exit blocks is ";
5992 + errs() << "BB" << commonDom->getNumber() << "\n";
5994 + errs() << "NULL\n";
5999 +} //findNearestCommonPostDom
6001 +} //end namespace llvm
6006 +//===----------------------------------------------------------------------===//
6008 +// CFGStructurizer for AMDGPU
6010 +//===----------------------------------------------------------------------===//
6013 +using namespace llvmCFGStruct;
6016 +class AMDGPUCFGStructurizer : public MachineFunctionPass {
6018 + typedef MachineInstr InstructionType;
6019 + typedef MachineFunction FunctionType;
6020 + typedef MachineBasicBlock BlockType;
6021 + typedef MachineLoopInfo LoopinfoType;
6022 + typedef MachineDominatorTree DominatortreeType;
6023 + typedef MachinePostDominatorTree PostDominatortreeType;
6024 + typedef MachineDomTreeNode DomTreeNodeType;
6025 + typedef MachineLoop LoopType;
6028 + TargetMachine &TM;
6029 + const TargetInstrInfo *TII;
6030 + const AMDGPURegisterInfo *TRI;
6033 + AMDGPUCFGStructurizer(char &pid, TargetMachine &tm);
6034 + const TargetInstrInfo *getTargetInstrInfo() const;
6040 +} //end of namespace llvm
6041 +AMDGPUCFGStructurizer::AMDGPUCFGStructurizer(char &pid, TargetMachine &tm)
6042 +: MachineFunctionPass(pid), TM(tm), TII(tm.getInstrInfo()),
6043 + TRI(static_cast<const AMDGPURegisterInfo *>(tm.getRegisterInfo())) {
6046 +const TargetInstrInfo *AMDGPUCFGStructurizer::getTargetInstrInfo() const {
6049 +//===----------------------------------------------------------------------===//
6053 +//===----------------------------------------------------------------------===//
6056 +using namespace llvmCFGStruct;
6059 +class AMDGPUCFGPrepare : public AMDGPUCFGStructurizer {
6064 + AMDGPUCFGPrepare(TargetMachine &tm);
6066 + virtual const char *getPassName() const;
6067 + virtual void getAnalysisUsage(AnalysisUsage &AU) const;
6069 + bool runOnMachineFunction(MachineFunction &F);
6075 +char AMDGPUCFGPrepare::ID = 0;
6076 +} //end of namespace llvm
6078 +AMDGPUCFGPrepare::AMDGPUCFGPrepare(TargetMachine &tm)
6079 + : AMDGPUCFGStructurizer(ID, tm ) {
6081 +const char *AMDGPUCFGPrepare::getPassName() const {
6082 + return "AMD IL Control Flow Graph Preparation Pass";
6085 +void AMDGPUCFGPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
6086 + AU.addPreserved<MachineFunctionAnalysis>();
6087 + AU.addRequired<MachineFunctionAnalysis>();
6088 + AU.addRequired<MachineDominatorTree>();
6089 + AU.addRequired<MachinePostDominatorTree>();
6090 + AU.addRequired<MachineLoopInfo>();
6093 +//===----------------------------------------------------------------------===//
6097 +//===----------------------------------------------------------------------===//
6100 +using namespace llvmCFGStruct;
6103 +class AMDGPUCFGPerform : public AMDGPUCFGStructurizer {
6108 + AMDGPUCFGPerform(TargetMachine &tm);
6109 + virtual const char *getPassName() const;
6110 + virtual void getAnalysisUsage(AnalysisUsage &AU) const;
6111 + bool runOnMachineFunction(MachineFunction &F);
6117 +char AMDGPUCFGPerform::ID = 0;
6118 +} //end of namespace llvm
6120 + AMDGPUCFGPerform::AMDGPUCFGPerform(TargetMachine &tm)
6121 +: AMDGPUCFGStructurizer(ID, tm) {
6124 +const char *AMDGPUCFGPerform::getPassName() const {
6125 + return "AMD IL Control Flow Graph structurizer Pass";
6128 +void AMDGPUCFGPerform::getAnalysisUsage(AnalysisUsage &AU) const {
6129 + AU.addPreserved<MachineFunctionAnalysis>();
6130 + AU.addRequired<MachineFunctionAnalysis>();
6131 + AU.addRequired<MachineDominatorTree>();
6132 + AU.addRequired<MachinePostDominatorTree>();
6133 + AU.addRequired<MachineLoopInfo>();
6136 +//===----------------------------------------------------------------------===//
6138 +// CFGStructTraits<AMDGPUCFGStructurizer>
6140 +//===----------------------------------------------------------------------===//
6142 +namespace llvmCFGStruct {
6143 +// this class is tailor to the AMDGPU backend
6145 +struct CFGStructTraits<AMDGPUCFGStructurizer> {
6146 + typedef int RegiT;
6148 + static int getBranchNzeroOpcode(int oldOpcode) {
6149 + switch(oldOpcode) {
6150 + case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
6151 + case AMDGPU::BRANCH_COND_i32:
6152 + case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
6154 + assert(0 && "internal error");
6159 + static int getBranchZeroOpcode(int oldOpcode) {
6160 + switch(oldOpcode) {
6161 + case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
6162 + case AMDGPU::BRANCH_COND_i32:
6163 + case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
6165 + assert(0 && "internal error");
6170 + static int getContinueNzeroOpcode(int oldOpcode) {
6171 + switch(oldOpcode) {
6172 + case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
6174 + assert(0 && "internal error");
6179 + static int getContinueZeroOpcode(int oldOpcode) {
6180 + switch(oldOpcode) {
6181 + case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
6183 + assert(0 && "internal error");
6188 + static MachineBasicBlock *getTrueBranch(MachineInstr *instr) {
6189 + return instr->getOperand(0).getMBB();
6192 + static void setTrueBranch(MachineInstr *instr, MachineBasicBlock *blk) {
6193 + instr->getOperand(0).setMBB(blk);
6196 + static MachineBasicBlock *
6197 + getFalseBranch(MachineBasicBlock *blk, MachineInstr *instr) {
6198 + assert(blk->succ_size() == 2);
6199 + MachineBasicBlock *trueBranch = getTrueBranch(instr);
6200 + MachineBasicBlock::succ_iterator iter = blk->succ_begin();
6201 + MachineBasicBlock::succ_iterator iterNext = iter;
6204 + return (*iter == trueBranch) ? *iterNext : *iter;
6207 + static bool isCondBranch(MachineInstr *instr) {
6208 + switch (instr->getOpcode()) {
6209 + case AMDGPU::JUMP:
6210 + return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() != 0;
6211 + case AMDGPU::BRANCH_COND_i32:
6212 + case AMDGPU::BRANCH_COND_f32:
6220 + static bool isUncondBranch(MachineInstr *instr) {
6221 + switch (instr->getOpcode()) {
6222 + case AMDGPU::JUMP:
6223 + return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() == 0;
6224 + case AMDGPU::BRANCH:
6232 + static DebugLoc getLastDebugLocInBB(MachineBasicBlock *blk) {
6233 + //get DebugLoc from the first MachineBasicBlock instruction with debug info
6235 + for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end(); ++iter) {
6236 + MachineInstr *instr = &(*iter);
6237 + if (instr->getDebugLoc().isUnknown() == false) {
6238 + DL = instr->getDebugLoc();
6244 + static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *blk) {
6245 + MachineBasicBlock::reverse_iterator iter = blk->rbegin();
6246 + MachineInstr *instr = &*iter;
6247 + if (instr && (isCondBranch(instr) || isUncondBranch(instr))) {
6253 + // The correct naming for this is getPossibleLoopendBlockBranchInstr.
6255 + // BB with backward-edge could have move instructions after the branch
6256 + // instruction. Such move instructions "belong to" the loop backward-edge.
6258 + static MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *blk) {
6259 + const AMDGPUInstrInfo * TII = static_cast<const AMDGPUInstrInfo *>(
6260 + blk->getParent()->getTarget().getInstrInfo());
6262 + for (MachineBasicBlock::reverse_iterator iter = blk->rbegin(),
6263 + iterEnd = blk->rend(); iter != iterEnd; ++iter) {
6264 + // FIXME: Simplify
6265 + MachineInstr *instr = &*iter;
6267 + if (isCondBranch(instr) || isUncondBranch(instr)) {
6269 + } else if (!TII->isMov(instr->getOpcode())) {
6277 + static MachineInstr *getReturnInstr(MachineBasicBlock *blk) {
6278 + MachineBasicBlock::reverse_iterator iter = blk->rbegin();
6279 + if (iter != blk->rend()) {
6280 + MachineInstr *instr = &(*iter);
6281 + if (instr->getOpcode() == AMDGPU::RETURN) {
6288 + static MachineInstr *getContinueInstr(MachineBasicBlock *blk) {
6289 + MachineBasicBlock::reverse_iterator iter = blk->rbegin();
6290 + if (iter != blk->rend()) {
6291 + MachineInstr *instr = &(*iter);
6292 + if (instr->getOpcode() == AMDGPU::CONTINUE) {
6299 + static MachineInstr *getLoopBreakInstr(MachineBasicBlock *blk) {
6300 + for (MachineBasicBlock::iterator iter = blk->begin(); (iter != blk->end()); ++iter) {
6301 + MachineInstr *instr = &(*iter);
6302 + if (instr->getOpcode() == AMDGPU::PREDICATED_BREAK) {
6309 + static bool isReturnBlock(MachineBasicBlock *blk) {
6310 + MachineInstr *instr = getReturnInstr(blk);
6311 + bool isReturn = (blk->succ_size() == 0);
6314 + } else if (isReturn) {
6316 + errs() << "BB" << blk->getNumber()
6317 + <<" is return block without RETURN instr\n";
6324 + static MachineBasicBlock::iterator
6325 + getInstrPos(MachineBasicBlock *blk, MachineInstr *instr) {
6326 + assert(instr->getParent() == blk && "instruction doesn't belong to block");
6327 + MachineBasicBlock::iterator iter = blk->begin();
6328 + MachineBasicBlock::iterator iterEnd = blk->end();
6329 + while (&(*iter) != instr && iter != iterEnd) {
6333 + assert(iter != iterEnd);
6337 + static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
6338 + AMDGPUCFGStructurizer *passRep) {
6339 + return insertInstrBefore(blk,newOpcode,passRep,DebugLoc());
6340 + } //insertInstrBefore
6342 + static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
6343 + AMDGPUCFGStructurizer *passRep, DebugLoc DL) {
6344 + const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
6345 + MachineInstr *newInstr =
6346 + blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
6348 + MachineBasicBlock::iterator res;
6349 + if (blk->begin() != blk->end()) {
6350 + blk->insert(blk->begin(), newInstr);
6352 + blk->push_back(newInstr);
6355 + SHOWNEWINSTR(newInstr);
6358 + } //insertInstrBefore
6360 + static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
6361 + AMDGPUCFGStructurizer *passRep) {
6362 + insertInstrEnd(blk,newOpcode,passRep,DebugLoc());
6363 + } //insertInstrEnd
6365 + static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
6366 + AMDGPUCFGStructurizer *passRep, DebugLoc DL) {
6367 + const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
6368 + MachineInstr *newInstr = blk->getParent()
6369 + ->CreateMachineInstr(tii->get(newOpcode), DL);
6371 + blk->push_back(newInstr);
6372 + //assume the instruction doesn't take any reg operand ...
6374 + SHOWNEWINSTR(newInstr);
6375 + } //insertInstrEnd
6377 + static MachineInstr *insertInstrBefore(MachineBasicBlock::iterator instrPos,
6379 + AMDGPUCFGStructurizer *passRep) {
6380 + MachineInstr *oldInstr = &(*instrPos);
6381 + const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
6382 + MachineBasicBlock *blk = oldInstr->getParent();
6383 + MachineInstr *newInstr =
6384 + blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
6387 + blk->insert(instrPos, newInstr);
6388 + //assume the instruction doesn't take any reg operand ...
6390 + SHOWNEWINSTR(newInstr);
6392 + } //insertInstrBefore
6394 + static void insertCondBranchBefore(MachineBasicBlock::iterator instrPos,
6396 + AMDGPUCFGStructurizer *passRep,
6398 + MachineInstr *oldInstr = &(*instrPos);
6399 + const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
6400 + MachineBasicBlock *blk = oldInstr->getParent();
6401 + MachineInstr *newInstr =
6402 + blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
6405 + blk->insert(instrPos, newInstr);
6406 + MachineInstrBuilder(newInstr).addReg(oldInstr->getOperand(1).getReg(),
6409 + SHOWNEWINSTR(newInstr);
6410 + //erase later oldInstr->eraseFromParent();
6411 + } //insertCondBranchBefore
6413 + static void insertCondBranchBefore(MachineBasicBlock *blk,
6414 + MachineBasicBlock::iterator insertPos,
6416 + AMDGPUCFGStructurizer *passRep,
6419 + const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
6421 + MachineInstr *newInstr =
6422 + blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
6425 + blk->insert(insertPos, newInstr);
6426 + MachineInstrBuilder(newInstr).addReg(regNum, false);
6428 + SHOWNEWINSTR(newInstr);
6429 + } //insertCondBranchBefore
6431 + static void insertCondBranchEnd(MachineBasicBlock *blk,
6433 + AMDGPUCFGStructurizer *passRep,
6435 + const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
6436 + MachineInstr *newInstr =
6437 + blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DebugLoc());
6439 + blk->push_back(newInstr);
6440 + MachineInstrBuilder(newInstr).addReg(regNum, false);
6442 + SHOWNEWINSTR(newInstr);
6443 + } //insertCondBranchEnd
6446 + static void insertAssignInstrBefore(MachineBasicBlock::iterator instrPos,
6447 + AMDGPUCFGStructurizer *passRep,
6448 + RegiT regNum, int regVal) {
6449 + MachineInstr *oldInstr = &(*instrPos);
6450 + const AMDGPUInstrInfo *tii =
6451 + static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
6452 + MachineBasicBlock *blk = oldInstr->getParent();
6453 + MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
6455 + blk->insert(instrPos, newInstr);
6457 + SHOWNEWINSTR(newInstr);
6458 + } //insertAssignInstrBefore
6460 + static void insertAssignInstrBefore(MachineBasicBlock *blk,
6461 + AMDGPUCFGStructurizer *passRep,
6462 + RegiT regNum, int regVal) {
6463 + const AMDGPUInstrInfo *tii =
6464 + static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
6466 + MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
6468 + if (blk->begin() != blk->end()) {
6469 + blk->insert(blk->begin(), newInstr);
6471 + blk->push_back(newInstr);
6474 + SHOWNEWINSTR(newInstr);
6476 + } //insertAssignInstrBefore
6478 + static void insertCompareInstrBefore(MachineBasicBlock *blk,
6479 + MachineBasicBlock::iterator instrPos,
6480 + AMDGPUCFGStructurizer *passRep,
6481 + RegiT dstReg, RegiT src1Reg,
6483 + const AMDGPUInstrInfo *tii =
6484 + static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
6485 + MachineInstr *newInstr =
6486 + blk->getParent()->CreateMachineInstr(tii->get(tii->getIEQOpcode()), DebugLoc());
6488 + MachineInstrBuilder(newInstr).addReg(dstReg, RegState::Define); //set target
6489 + MachineInstrBuilder(newInstr).addReg(src1Reg); //set src value
6490 + MachineInstrBuilder(newInstr).addReg(src2Reg); //set src value
6492 + blk->insert(instrPos, newInstr);
6493 + SHOWNEWINSTR(newInstr);
6495 + } //insertCompareInstrBefore
6497 + static void cloneSuccessorList(MachineBasicBlock *dstBlk,
6498 + MachineBasicBlock *srcBlk) {
6499 + for (MachineBasicBlock::succ_iterator iter = srcBlk->succ_begin(),
6500 + iterEnd = srcBlk->succ_end(); iter != iterEnd; ++iter) {
6501 + dstBlk->addSuccessor(*iter); // *iter's predecessor is also taken care of
6503 + } //cloneSuccessorList
6505 + static MachineBasicBlock *clone(MachineBasicBlock *srcBlk) {
6506 + MachineFunction *func = srcBlk->getParent();
6507 + MachineBasicBlock *newBlk = func->CreateMachineBasicBlock();
6508 + func->push_back(newBlk); //insert to function
6509 + for (MachineBasicBlock::iterator iter = srcBlk->begin(),
6510 + iterEnd = srcBlk->end();
6511 + iter != iterEnd; ++iter) {
6512 + MachineInstr *instr = func->CloneMachineInstr(iter);
6513 + newBlk->push_back(instr);
6518 + //MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose because
6519 +//the AMDGPU instruction is not recognized as a terminator; fix this and retire
6521 + static void replaceInstrUseOfBlockWith(MachineBasicBlock *srcBlk,
6522 + MachineBasicBlock *oldBlk,
6523 + MachineBasicBlock *newBlk) {
6524 + MachineInstr *branchInstr = getLoopendBlockBranchInstr(srcBlk);
6525 + if (branchInstr && isCondBranch(branchInstr) &&
6526 + getTrueBranch(branchInstr) == oldBlk) {
6527 + setTrueBranch(branchInstr, newBlk);
6531 + static void wrapup(MachineBasicBlock *entryBlk) {
6532 + assert((!entryBlk->getParent()->getJumpTableInfo()
6533 + || entryBlk->getParent()->getJumpTableInfo()->isEmpty())
6534 + && "found a jump table");
6536 + //collect continue right before endloop
6537 + SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> contInstr;
6538 + MachineBasicBlock::iterator pre = entryBlk->begin();
6539 + MachineBasicBlock::iterator iterEnd = entryBlk->end();
6540 + MachineBasicBlock::iterator iter = pre;
6541 + while (iter != iterEnd) {
6542 + if (pre->getOpcode() == AMDGPU::CONTINUE
6543 + && iter->getOpcode() == AMDGPU::ENDLOOP) {
6544 + contInstr.push_back(pre);
6550 + //delete continue right before endloop
6551 + for (unsigned i = 0; i < contInstr.size(); ++i) {
6552 + contInstr[i]->eraseFromParent();
6555 + // TODO to fix up jump table so later phase won't be confused. if
6556 + // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
6557 + // there isn't such an interface yet. alternatively, replace all the other
6558 + // blocks in the jump table with the entryBlk //}
6562 + static MachineDominatorTree *getDominatorTree(AMDGPUCFGStructurizer &pass) {
6563 + return &pass.getAnalysis<MachineDominatorTree>();
6566 + static MachinePostDominatorTree*
6567 + getPostDominatorTree(AMDGPUCFGStructurizer &pass) {
6568 + return &pass.getAnalysis<MachinePostDominatorTree>();
6571 + static MachineLoopInfo *getLoopInfo(AMDGPUCFGStructurizer &pass) {
6572 + return &pass.getAnalysis<MachineLoopInfo>();
6574 +}; // template class CFGStructTraits
6575 +} //end of namespace llvmCFGStruct
6577 +// createAMDGPUCFGPreparationPass- Returns a pass
6578 +FunctionPass *llvm::createAMDGPUCFGPreparationPass(TargetMachine &tm
6580 + return new AMDGPUCFGPrepare(tm );
6583 +bool AMDGPUCFGPrepare::runOnMachineFunction(MachineFunction &func) {
6584 + return llvmCFGStruct::CFGStructurizer<AMDGPUCFGStructurizer>().prepare(func,
6589 +// createAMDGPUCFGStructurizerPass- Returns a pass
6590 +FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm
6592 + return new AMDGPUCFGPerform(tm );
6595 +bool AMDGPUCFGPerform::runOnMachineFunction(MachineFunction &func) {
6596 + return llvmCFGStruct::CFGStructurizer<AMDGPUCFGStructurizer>().run(func,
6600 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDevice.cpp llvm-r600/lib/Target/R600/AMDILDevice.cpp
6601 --- llvm-3.2.src/lib/Target/R600/AMDILDevice.cpp 1970-01-01 01:00:00.000000000 +0100
6602 +++ llvm-r600/lib/Target/R600/AMDILDevice.cpp 2013-01-25 19:43:57.440049721 +0100
6604 +//===-- AMDILDevice.cpp - Base class for AMDIL Devices --------------------===//
6606 +// The LLVM Compiler Infrastructure
6608 +// This file is distributed under the University of Illinois Open Source
6609 +// License. See LICENSE.TXT for details.
6612 +//==-----------------------------------------------------------------------===//
6613 +#include "AMDILDevice.h"
6614 +#include "AMDGPUSubtarget.h"
6616 +using namespace llvm;
6617 +// Default implementation for all of the classes.
6618 +AMDGPUDevice::AMDGPUDevice(AMDGPUSubtarget *ST) : mSTM(ST) {
6619 + mHWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities);
6620 + mSWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities);
6622 + DeviceFlag = OCL_DEVICE_ALL;
6625 +AMDGPUDevice::~AMDGPUDevice() {
6630 +size_t AMDGPUDevice::getMaxGDSSize() const {
6635 +AMDGPUDevice::getDeviceFlag() const {
6636 + return DeviceFlag;
6639 +size_t AMDGPUDevice::getMaxNumCBs() const {
6640 + if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) {
6641 + return HW_MAX_NUM_CB;
6647 +size_t AMDGPUDevice::getMaxCBSize() const {
6648 + if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) {
6649 + return MAX_CB_SIZE;
6655 +size_t AMDGPUDevice::getMaxScratchSize() const {
6659 +uint32_t AMDGPUDevice::getStackAlignment() const {
6663 +void AMDGPUDevice::setCaps() {
6664 + mSWBits.set(AMDGPUDeviceInfo::HalfOps);
6665 + mSWBits.set(AMDGPUDeviceInfo::ByteOps);
6666 + mSWBits.set(AMDGPUDeviceInfo::ShortOps);
6667 + mSWBits.set(AMDGPUDeviceInfo::HW64BitDivMod);
6668 + if (mSTM->isOverride(AMDGPUDeviceInfo::NoInline)) {
6669 + mSWBits.set(AMDGPUDeviceInfo::NoInline);
6671 + if (mSTM->isOverride(AMDGPUDeviceInfo::MacroDB)) {
6672 + mSWBits.set(AMDGPUDeviceInfo::MacroDB);
6674 + if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {
6675 + mSWBits.set(AMDGPUDeviceInfo::ConstantMem);
6677 + mHWBits.set(AMDGPUDeviceInfo::ConstantMem);
6679 + if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {
6680 + mSWBits.set(AMDGPUDeviceInfo::PrivateMem);
6682 + mHWBits.set(AMDGPUDeviceInfo::PrivateMem);
6684 + if (mSTM->isOverride(AMDGPUDeviceInfo::BarrierDetect)) {
6685 + mSWBits.set(AMDGPUDeviceInfo::BarrierDetect);
6687 + mSWBits.set(AMDGPUDeviceInfo::ByteLDSOps);
6688 + mSWBits.set(AMDGPUDeviceInfo::LongOps);
6691 +AMDGPUDeviceInfo::ExecutionMode
6692 +AMDGPUDevice::getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const {
6693 + if (mHWBits[Caps]) {
6694 + assert(!mSWBits[Caps] && "Cannot set both SW and HW caps");
6695 + return AMDGPUDeviceInfo::Hardware;
6698 + if (mSWBits[Caps]) {
6699 + assert(!mHWBits[Caps] && "Cannot set both SW and HW caps");
6700 + return AMDGPUDeviceInfo::Software;
6703 + return AMDGPUDeviceInfo::Unsupported;
6707 +bool AMDGPUDevice::isSupported(AMDGPUDeviceInfo::Caps Mode) const {
6708 + return getExecutionMode(Mode) != AMDGPUDeviceInfo::Unsupported;
6711 +bool AMDGPUDevice::usesHardware(AMDGPUDeviceInfo::Caps Mode) const {
6712 + return getExecutionMode(Mode) == AMDGPUDeviceInfo::Hardware;
6715 +bool AMDGPUDevice::usesSoftware(AMDGPUDeviceInfo::Caps Mode) const {
6716 + return getExecutionMode(Mode) == AMDGPUDeviceInfo::Software;
6720 +AMDGPUDevice::getDataLayout() const {
6721 + return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16"
6722 + "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
6723 + "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
6724 + "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
6725 + "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
6728 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDevice.h llvm-r600/lib/Target/R600/AMDILDevice.h
6729 --- llvm-3.2.src/lib/Target/R600/AMDILDevice.h 1970-01-01 01:00:00.000000000 +0100
6730 +++ llvm-r600/lib/Target/R600/AMDILDevice.h 2013-01-25 19:43:57.440049721 +0100
6732 +//===---- AMDILDevice.h - Define Device Data for AMDGPU -----*- C++ -*------===//
6734 +// The LLVM Compiler Infrastructure
6736 +// This file is distributed under the University of Illinois Open Source
6737 +// License. See LICENSE.TXT for details.
6739 +//==-----------------------------------------------------------------------===//
6742 +/// \brief Interface for the subtarget data classes.
6744 +/// This file will define the interface that each generation needs to
6745 +/// implement in order to correctly answer queries on the capabilities of the
6746 +/// specific hardware.
6747 +//===----------------------------------------------------------------------===//
6748 +#ifndef AMDILDEVICEIMPL_H
6749 +#define AMDILDEVICEIMPL_H
6751 +#include "llvm/ADT/BitVector.h"
6754 + class AMDGPUSubtarget;
6756 +//===----------------------------------------------------------------------===//
6757 +// Interface for data that is specific to a single device
6758 +//===----------------------------------------------------------------------===//
6759 +class AMDGPUDevice {
6761 + AMDGPUDevice(AMDGPUSubtarget *ST);
6762 + virtual ~AMDGPUDevice();
6764 + // Enum values for the various memory types.
6776 + /// \returns The max LDS size that the hardware supports. Size is in
6778 + virtual size_t getMaxLDSSize() const = 0;
6780 + /// \returns The max GDS size that the hardware supports if the GDS is
6781 + /// supported by the hardware. Size is in bytes.
6782 + virtual size_t getMaxGDSSize() const;
6784 + /// \returns The max number of hardware constant address spaces that
6785 + /// are supported by this device.
6786 + virtual size_t getMaxNumCBs() const;
6788 + /// \returns The max number of bytes a single hardware constant buffer
6789 + /// can support. Size is in bytes.
6790 + virtual size_t getMaxCBSize() const;
6792 + /// \returns The max number of bytes allowed by the hardware scratch
6793 + /// buffer. Size is in bytes.
6794 + virtual size_t getMaxScratchSize() const;
6796 + /// \brief Get the flag that corresponds to the device.
6797 + virtual uint32_t getDeviceFlag() const;
6799 + /// \returns The number of work-items that exist in a single hardware
6801 + virtual size_t getWavefrontSize() const = 0;
6803 + /// \brief Get the generational name of this specific device.
6804 + virtual uint32_t getGeneration() const = 0;
6806 + /// \brief Get the stack alignment of this specific device.
6807 + virtual uint32_t getStackAlignment() const;
6809 + /// \brief Get the resource ID for this specific device.
6810 + virtual uint32_t getResourceID(uint32_t DeviceID) const = 0;
6812 + /// \brief Get the max number of UAV's for this device.
6813 + virtual uint32_t getMaxNumUAVs() const = 0;
6816 + // API utilizing more detailed capabilities of each family of
6817 + // cards. If a capability is supported, then either usesHardware or
6818 + // usesSoftware returned true. If usesHardware returned true, then
6819 + // usesSoftware must return false for the same capability. Hardware
6820 + // execution means that the feature is done natively by the hardware
6821 + // and is not emulated by the software. Software execution means
6822 + // that the feature could be done in the hardware, but there is
6823 + // software that emulates it with possibly using the hardware for
6824 + // support since the hardware does not fully comply with OpenCL
6827 + bool isSupported(AMDGPUDeviceInfo::Caps Mode) const;
6828 + bool usesHardware(AMDGPUDeviceInfo::Caps Mode) const;
6829 + bool usesSoftware(AMDGPUDeviceInfo::Caps Mode) const;
6830 + virtual std::string getDataLayout() const;
6831 + static const unsigned int MAX_LDS_SIZE_700 = 16384;
6832 + static const unsigned int MAX_LDS_SIZE_800 = 32768;
6833 + static const unsigned int WavefrontSize = 64;
6834 + static const unsigned int HalfWavefrontSize = 32;
6835 + static const unsigned int QuarterWavefrontSize = 16;
6837 + virtual void setCaps();
6838 + llvm::BitVector mHWBits;
6839 + llvm::BitVector mSWBits;
6840 + AMDGPUSubtarget *mSTM;
6841 + uint32_t DeviceFlag;
6843 + AMDGPUDeviceInfo::ExecutionMode
6844 + getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const;
6847 +} // namespace llvm
6848 +#endif // AMDILDEVICEIMPL_H
6849 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDeviceInfo.cpp llvm-r600/lib/Target/R600/AMDILDeviceInfo.cpp
6850 --- llvm-3.2.src/lib/Target/R600/AMDILDeviceInfo.cpp 1970-01-01 01:00:00.000000000 +0100
6851 +++ llvm-r600/lib/Target/R600/AMDILDeviceInfo.cpp 2013-01-25 19:43:57.440049721 +0100
6853 +//===-- AMDILDeviceInfo.cpp - AMDILDeviceInfo class -----------------------===//
6855 +// The LLVM Compiler Infrastructure
6857 +// This file is distributed under the University of Illinois Open Source
6858 +// License. See LICENSE.TXT for details.
6860 +//==-----------------------------------------------------------------------===//
6863 +/// \brief Function that creates DeviceInfo from a device name and other information.
6865 +//==-----------------------------------------------------------------------===//
6866 +#include "AMDILDevices.h"
6867 +#include "AMDGPUSubtarget.h"
6869 +using namespace llvm;
6871 +namespace AMDGPUDeviceInfo {
6873 +AMDGPUDevice* getDeviceFromName(const std::string &deviceName,
6874 + AMDGPUSubtarget *ptr,
6875 + bool is64bit, bool is64on32bit) {
6876 + if (deviceName.c_str()[2] == '7') {
6877 + switch (deviceName.c_str()[3]) {
6879 + return new AMDGPU710Device(ptr);
6881 + return new AMDGPU770Device(ptr);
6883 + return new AMDGPU7XXDevice(ptr);
6885 + } else if (deviceName == "cypress") {
6887 + assert(!is64bit && "This device does not support 64bit pointers!");
6888 + assert(!is64on32bit && "This device does not support 64bit"
6889 + " on 32bit pointers!");
6891 + return new AMDGPUCypressDevice(ptr);
6892 + } else if (deviceName == "juniper") {
6894 + assert(!is64bit && "This device does not support 64bit pointers!");
6895 + assert(!is64on32bit && "This device does not support 64bit"
6896 + " on 32bit pointers!");
6898 + return new AMDGPUEvergreenDevice(ptr);
6899 + } else if (deviceName == "redwood") {
6901 + assert(!is64bit && "This device does not support 64bit pointers!");
6902 + assert(!is64on32bit && "This device does not support 64bit"
6903 + " on 32bit pointers!");
6905 + return new AMDGPURedwoodDevice(ptr);
6906 + } else if (deviceName == "cedar") {
6908 + assert(!is64bit && "This device does not support 64bit pointers!");
6909 + assert(!is64on32bit && "This device does not support 64bit"
6910 + " on 32bit pointers!");
6912 + return new AMDGPUCedarDevice(ptr);
6913 + } else if (deviceName == "barts" || deviceName == "turks") {
6915 + assert(!is64bit && "This device does not support 64bit pointers!");
6916 + assert(!is64on32bit && "This device does not support 64bit"
6917 + " on 32bit pointers!");
6919 + return new AMDGPUNIDevice(ptr);
6920 + } else if (deviceName == "cayman") {
6922 + assert(!is64bit && "This device does not support 64bit pointers!");
6923 + assert(!is64on32bit && "This device does not support 64bit"
6924 + " on 32bit pointers!");
6926 + return new AMDGPUCaymanDevice(ptr);
6927 + } else if (deviceName == "caicos") {
6929 + assert(!is64bit && "This device does not support 64bit pointers!");
6930 + assert(!is64on32bit && "This device does not support 64bit"
6931 + " on 32bit pointers!");
6933 + return new AMDGPUNIDevice(ptr);
6934 + } else if (deviceName == "SI") {
6935 + return new AMDGPUSIDevice(ptr);
6938 + assert(!is64bit && "This device does not support 64bit pointers!");
6939 + assert(!is64on32bit && "This device does not support 64bit"
6940 + " on 32bit pointers!");
6942 + return new AMDGPU7XXDevice(ptr);
6945 +} // End namespace AMDGPUDeviceInfo
6946 +} // End namespace llvm
6947 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDeviceInfo.h llvm-r600/lib/Target/R600/AMDILDeviceInfo.h
6948 --- llvm-3.2.src/lib/Target/R600/AMDILDeviceInfo.h 1970-01-01 01:00:00.000000000 +0100
6949 +++ llvm-r600/lib/Target/R600/AMDILDeviceInfo.h 2013-01-25 19:43:57.440049721 +0100
6951 +//===-- AMDILDeviceInfo.h - Constants for describing devices --------------===//
6953 +// The LLVM Compiler Infrastructure
6955 +// This file is distributed under the University of Illinois Open Source
6956 +// License. See LICENSE.TXT for details.
6959 +//==-----------------------------------------------------------------------===//
6960 +#ifndef AMDILDEVICEINFO_H
6961 +#define AMDILDEVICEINFO_H
6967 + class AMDGPUDevice;
6968 + class AMDGPUSubtarget;
6969 + namespace AMDGPUDeviceInfo {
6970 + /// Each Capabilities can be executed using a hardware instruction,
6971 + /// emulated with a sequence of software instructions, or not
6972 + /// supported at all.
6973 + enum ExecutionMode {
6974 + Unsupported = 0, ///< Unsupported feature on the card(Default value)
6975 + /// This is the execution mode that is set if the feature is emulated in
6978 + /// This execution mode is set if the feature exists natively in hardware
6983 + HalfOps = 0x1, ///< Half float is supported or not.
6984 + DoubleOps = 0x2, ///< Double is supported or not.
6985 + ByteOps = 0x3, ///< Byte(char) is support or not.
6986 + ShortOps = 0x4, ///< Short is supported or not.
6987 + LongOps = 0x5, ///< Long is supported or not.
6988 + Images = 0x6, ///< Images are supported or not.
6989 + ByteStores = 0x7, ///< ByteStores available(!HD4XXX).
6990 + ConstantMem = 0x8, ///< Constant/CB memory.
6991 + LocalMem = 0x9, ///< Local/LDS memory.
6992 + PrivateMem = 0xA, ///< Scratch/Private/Stack memory.
6993 + RegionMem = 0xB, ///< OCL GDS Memory Extension.
6994 + FMA = 0xC, ///< Use HW FMA or SW FMA.
6995 + ArenaSegment = 0xD, ///< Use for Arena UAV per pointer 12-1023.
6996 + MultiUAV = 0xE, ///< Use for UAV per Pointer 0-7.
6997 + Reserved0 = 0xF, ///< ReservedFlag
6998 + NoAlias = 0x10, ///< Cached loads.
6999 + Signed24BitOps = 0x11, ///< Peephole Optimization.
7000 + /// Debug mode implies that no hardware features or optimizations
7001 + /// are performed and that all memory accesses go through a single
7002 + /// uav(Arena on HD5XXX/HD6XXX and Raw on HD4XXX).
7004 + CachedMem = 0x13, ///< Cached mem is available or not.
7005 + BarrierDetect = 0x14, ///< Detect duplicate barriers.
7006 + Reserved1 = 0x15, ///< Reserved flag
7007 + ByteLDSOps = 0x16, ///< Flag to specify if byte LDS ops are available.
7008 + ArenaVectors = 0x17, ///< Flag to specify if vector loads from arena work.
7009 + TmrReg = 0x18, ///< Flag to specify if Tmr register is supported.
7010 + NoInline = 0x19, ///< Flag to specify that no inlining should occur.
7011 + MacroDB = 0x1A, ///< Flag to specify that backend handles macrodb.
7012 + HW64BitDivMod = 0x1B, ///< Flag for backend to generate 64bit div/mod.
7013 + ArenaUAV = 0x1C, ///< Flag to specify that arena uav is supported.
7014 + PrivateUAV = 0x1D, ///< Flag to specify that private memory uses uav's.
7015 + /// If more capabilities are required, then
7016 + /// this number needs to be increased.
7017 + /// All capabilities must come before this
7019 + MaxNumberCapabilities = 0x20
7021 + /// These have to be in order with the older generations
7022 + /// having the lower number enumerations.
7024 + HD4XXX = 0, ///< 7XX based devices.
7025 + HD5XXX, ///< Evergreen based devices.
7026 + HD6XXX, ///< NI/Evergreen+ based devices.
7027 + HD7XXX, ///< Southern Islands based devices.
7028 + HDTEST, ///< Experimental feature testing device.
7034 + getDeviceFromName(const std::string &name, AMDGPUSubtarget *ptr,
7035 + bool is64bit = false, bool is64on32bit = false);
7036 + } // namespace AMDGPUDeviceInfo
7037 +} // namespace llvm
7038 +#endif // AMDILDEVICEINFO_H
7039 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILDevices.h llvm-r600/lib/Target/R600/AMDILDevices.h
7040 --- llvm-3.2.src/lib/Target/R600/AMDILDevices.h 1970-01-01 01:00:00.000000000 +0100
7041 +++ llvm-r600/lib/Target/R600/AMDILDevices.h 2013-01-25 19:43:57.440049721 +0100
7043 +//===-- AMDILDevices.h - Consolidate AMDIL Device headers -----------------===//
7045 +// The LLVM Compiler Infrastructure
7047 +// This file is distributed under the University of Illinois Open Source
7048 +// License. See LICENSE.TXT for details.
7051 +//==-----------------------------------------------------------------------===//
7052 +#ifndef AMDIL_DEVICES_H
7053 +#define AMDIL_DEVICES_H
7054 +// Include all of the device specific header files
7055 +#include "AMDIL7XXDevice.h"
7056 +#include "AMDILDevice.h"
7057 +#include "AMDILEvergreenDevice.h"
7058 +#include "AMDILNIDevice.h"
7059 +#include "AMDILSIDevice.h"
7061 +#endif // AMDIL_DEVICES_H
7062 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILEvergreenDevice.cpp llvm-r600/lib/Target/R600/AMDILEvergreenDevice.cpp
7063 --- llvm-3.2.src/lib/Target/R600/AMDILEvergreenDevice.cpp 1970-01-01 01:00:00.000000000 +0100
7064 +++ llvm-r600/lib/Target/R600/AMDILEvergreenDevice.cpp 2013-01-25 19:43:57.440049721 +0100
7066 +//===-- AMDILEvergreenDevice.cpp - Device Info for Evergreen --------------===//
7068 +// The LLVM Compiler Infrastructure
7070 +// This file is distributed under the University of Illinois Open Source
7071 +// License. See LICENSE.TXT for details.
7074 +//==-----------------------------------------------------------------------===//
7075 +#include "AMDILEvergreenDevice.h"
7077 +using namespace llvm;
7079 +AMDGPUEvergreenDevice::AMDGPUEvergreenDevice(AMDGPUSubtarget *ST)
7080 +: AMDGPUDevice(ST) {
7082 + std::string name = ST->getDeviceName();
7083 + if (name == "cedar") {
7084 + DeviceFlag = OCL_DEVICE_CEDAR;
7085 + } else if (name == "redwood") {
7086 + DeviceFlag = OCL_DEVICE_REDWOOD;
7087 + } else if (name == "cypress") {
7088 + DeviceFlag = OCL_DEVICE_CYPRESS;
7090 + DeviceFlag = OCL_DEVICE_JUNIPER;
7094 +AMDGPUEvergreenDevice::~AMDGPUEvergreenDevice() {
7097 +size_t AMDGPUEvergreenDevice::getMaxLDSSize() const {
7098 + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
7099 + return MAX_LDS_SIZE_800;
7104 +size_t AMDGPUEvergreenDevice::getMaxGDSSize() const {
7105 + if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
7106 + return MAX_LDS_SIZE_800;
7111 +uint32_t AMDGPUEvergreenDevice::getMaxNumUAVs() const {
7115 +uint32_t AMDGPUEvergreenDevice::getResourceID(uint32_t id) const {
7118 + assert(0 && "ID type passed in is unknown!");
7122 + return GLOBAL_RETURN_RAW_UAV_ID;
7124 + case ARENA_UAV_ID:
7125 + return DEFAULT_ARENA_UAV_ID;
7127 + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
7128 + return DEFAULT_LDS_ID;
7130 + return DEFAULT_ARENA_UAV_ID;
7133 + if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
7134 + return DEFAULT_GDS_ID;
7136 + return DEFAULT_ARENA_UAV_ID;
7139 + if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) {
7140 + return DEFAULT_SCRATCH_ID;
7142 + return DEFAULT_ARENA_UAV_ID;
7148 +size_t AMDGPUEvergreenDevice::getWavefrontSize() const {
7149 + return AMDGPUDevice::WavefrontSize;
7152 +uint32_t AMDGPUEvergreenDevice::getGeneration() const {
7153 + return AMDGPUDeviceInfo::HD5XXX;
7156 +void AMDGPUEvergreenDevice::setCaps() {
7157 + mSWBits.set(AMDGPUDeviceInfo::ArenaSegment);
7158 + mHWBits.set(AMDGPUDeviceInfo::ArenaUAV);
7159 + mHWBits.set(AMDGPUDeviceInfo::HW64BitDivMod);
7160 + mSWBits.reset(AMDGPUDeviceInfo::HW64BitDivMod);
7161 + mSWBits.set(AMDGPUDeviceInfo::Signed24BitOps);
7162 + if (mSTM->isOverride(AMDGPUDeviceInfo::ByteStores)) {
7163 + mHWBits.set(AMDGPUDeviceInfo::ByteStores);
7165 + if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {
7166 + mSWBits.set(AMDGPUDeviceInfo::LocalMem);
7167 + mSWBits.set(AMDGPUDeviceInfo::RegionMem);
7169 + mHWBits.set(AMDGPUDeviceInfo::LocalMem);
7170 + mHWBits.set(AMDGPUDeviceInfo::RegionMem);
7172 + mHWBits.set(AMDGPUDeviceInfo::Images);
7173 + if (mSTM->isOverride(AMDGPUDeviceInfo::NoAlias)) {
7174 + mHWBits.set(AMDGPUDeviceInfo::NoAlias);
7176 + mHWBits.set(AMDGPUDeviceInfo::CachedMem);
7177 + if (mSTM->isOverride(AMDGPUDeviceInfo::MultiUAV)) {
7178 + mHWBits.set(AMDGPUDeviceInfo::MultiUAV);
7180 + mHWBits.set(AMDGPUDeviceInfo::ByteLDSOps);
7181 + mSWBits.reset(AMDGPUDeviceInfo::ByteLDSOps);
7182 + mHWBits.set(AMDGPUDeviceInfo::ArenaVectors);
7183 + mHWBits.set(AMDGPUDeviceInfo::LongOps);
7184 + mSWBits.reset(AMDGPUDeviceInfo::LongOps);
7185 + mHWBits.set(AMDGPUDeviceInfo::TmrReg);
7188 +AMDGPUCypressDevice::AMDGPUCypressDevice(AMDGPUSubtarget *ST)
7189 + : AMDGPUEvergreenDevice(ST) {
7193 +AMDGPUCypressDevice::~AMDGPUCypressDevice() {
7196 +void AMDGPUCypressDevice::setCaps() {
7197 + if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
7198 + mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
7199 + mHWBits.set(AMDGPUDeviceInfo::FMA);
7204 +AMDGPUCedarDevice::AMDGPUCedarDevice(AMDGPUSubtarget *ST)
7205 + : AMDGPUEvergreenDevice(ST) {
7209 +AMDGPUCedarDevice::~AMDGPUCedarDevice() {
7212 +void AMDGPUCedarDevice::setCaps() {
7213 + mSWBits.set(AMDGPUDeviceInfo::FMA);
7216 +size_t AMDGPUCedarDevice::getWavefrontSize() const {
7217 + return AMDGPUDevice::QuarterWavefrontSize;
7220 +AMDGPURedwoodDevice::AMDGPURedwoodDevice(AMDGPUSubtarget *ST)
7221 + : AMDGPUEvergreenDevice(ST) {
7225 +AMDGPURedwoodDevice::~AMDGPURedwoodDevice() {
7228 +void AMDGPURedwoodDevice::setCaps() {
7229 + mSWBits.set(AMDGPUDeviceInfo::FMA);
7232 +size_t AMDGPURedwoodDevice::getWavefrontSize() const {
7233 + return AMDGPUDevice::HalfWavefrontSize;
7235 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILEvergreenDevice.h llvm-r600/lib/Target/R600/AMDILEvergreenDevice.h
7236 --- llvm-3.2.src/lib/Target/R600/AMDILEvergreenDevice.h 1970-01-01 01:00:00.000000000 +0100
7237 +++ llvm-r600/lib/Target/R600/AMDILEvergreenDevice.h 2013-01-25 19:43:57.440049721 +0100
7239 +//==- AMDILEvergreenDevice.h - Define Evergreen Device for AMDIL -*- C++ -*--=//
7241 +// The LLVM Compiler Infrastructure
7243 +// This file is distributed under the University of Illinois Open Source
7244 +// License. See LICENSE.TXT for details.
7246 +//==-----------------------------------------------------------------------===//
7249 +/// \brief Interface for the subtarget data classes.
7251 +/// This file will define the interface that each generation needs to
7252 +/// implement in order to correctly answer queries on the capabilities of the
7253 +/// specific hardware.
7254 +//===----------------------------------------------------------------------===//
7255 +#ifndef AMDILEVERGREENDEVICE_H
7256 +#define AMDILEVERGREENDEVICE_H
7257 +#include "AMDILDevice.h"
7258 +#include "AMDGPUSubtarget.h"
7261 + class AMDGPUSubtarget;
7262 +//===----------------------------------------------------------------------===//
7263 +// Evergreen generation of devices and their respective sub classes
7264 +//===----------------------------------------------------------------------===//
7267 +/// \brief The AMDGPUEvergreenDevice is the base device class for all of the Evergreen
7268 +/// series of cards.
7270 +/// This class contains information required to differentiate
7271 +/// the Evergreen device from the generic AMDGPUDevice. This device represents
7272 +/// the capabilities of the 'Juniper' cards, also known as the HD57XX.
7273 +class AMDGPUEvergreenDevice : public AMDGPUDevice {
7275 + AMDGPUEvergreenDevice(AMDGPUSubtarget *ST);
7276 + virtual ~AMDGPUEvergreenDevice();
7277 + virtual size_t getMaxLDSSize() const;
7278 + virtual size_t getMaxGDSSize() const;
7279 + virtual size_t getWavefrontSize() const;
7280 + virtual uint32_t getGeneration() const;
7281 + virtual uint32_t getMaxNumUAVs() const;
7282 + virtual uint32_t getResourceID(uint32_t) const;
7284 + virtual void setCaps();
7287 +/// The AMDGPUCypressDevice is similar to the AMDGPUEvergreenDevice, except it has
7288 +/// support for double precision operations. This device is used to represent
7289 +/// both the Cypress and Hemlock cards, which are commercially known as HD58XX
7290 +/// and HD59XX cards.
7291 +class AMDGPUCypressDevice : public AMDGPUEvergreenDevice {
7293 + AMDGPUCypressDevice(AMDGPUSubtarget *ST);
7294 + virtual ~AMDGPUCypressDevice();
7296 + virtual void setCaps();
7300 +/// \brief The AMDGPUCedarDevice is the class that represents all of the 'Cedar' based
7303 +/// This class differs from the base AMDGPUEvergreenDevice in that the
7304 +/// device is a ~quarter of the 'Juniper'. These are commercially known as the
7305 +/// HD54XX and HD53XX series of cards.
7306 +class AMDGPUCedarDevice : public AMDGPUEvergreenDevice {
7308 + AMDGPUCedarDevice(AMDGPUSubtarget *ST);
7309 + virtual ~AMDGPUCedarDevice();
7310 + virtual size_t getWavefrontSize() const;
7312 + virtual void setCaps();
7315 +/// \brief The AMDGPURedwoodDevice is the class that represents all of the 'Redwood' based
7318 +/// This class differs from the base class, in that these devices are
7319 +/// considered about half of a 'Juniper' device. These are commercially known as
7320 +/// the HD55XX and HD56XX series of cards.
7321 +class AMDGPURedwoodDevice : public AMDGPUEvergreenDevice {
7323 + AMDGPURedwoodDevice(AMDGPUSubtarget *ST);
7324 + virtual ~AMDGPURedwoodDevice();
7325 + virtual size_t getWavefrontSize() const;
7327 + virtual void setCaps();
7330 +} // namespace llvm
7331 +#endif // AMDILEVERGREENDEVICE_H
7332 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILFrameLowering.cpp llvm-r600/lib/Target/R600/AMDILFrameLowering.cpp
7333 --- llvm-3.2.src/lib/Target/R600/AMDILFrameLowering.cpp 1970-01-01 01:00:00.000000000 +0100
7334 +++ llvm-r600/lib/Target/R600/AMDILFrameLowering.cpp 2013-01-25 19:43:57.440049721 +0100
7336 +//===----------------------- AMDILFrameLowering.cpp -----------------*- C++ -*-===//
7338 +// The LLVM Compiler Infrastructure
7340 +// This file is distributed under the University of Illinois Open Source
7341 +// License. See LICENSE.TXT for details.
7343 +//==-----------------------------------------------------------------------===//
7346 +/// \brief Interface to describe a layout of a stack frame on a AMDGPU target
7349 +//===----------------------------------------------------------------------===//
7350 +#include "AMDILFrameLowering.h"
7351 +#include "llvm/CodeGen/MachineFrameInfo.h"
7353 +using namespace llvm;
7354 +AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
7355 + int LAO, unsigned TransAl)
7356 + : TargetFrameLowering(D, StackAl, LAO, TransAl) {
7359 +AMDGPUFrameLowering::~AMDGPUFrameLowering() {
7362 +int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
7364 + const MachineFrameInfo *MFI = MF.getFrameInfo();
7365 + return MFI->getObjectOffset(FI);
7368 +const TargetFrameLowering::SpillSlot *
7369 +AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
7374 +AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const {
7377 +AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {
7380 +AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const {
7383 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILFrameLowering.h llvm-r600/lib/Target/R600/AMDILFrameLowering.h
7384 --- llvm-3.2.src/lib/Target/R600/AMDILFrameLowering.h 1970-01-01 01:00:00.000000000 +0100
7385 +++ llvm-r600/lib/Target/R600/AMDILFrameLowering.h 2013-01-25 19:43:57.443383054 +0100
7387 +//===--------------------- AMDILFrameLowering.h -----------------*- C++ -*-===//
7389 +// The LLVM Compiler Infrastructure
7391 +// This file is distributed under the University of Illinois Open Source
7392 +// License. See LICENSE.TXT for details.
7394 +//===----------------------------------------------------------------------===//
7397 +/// \brief Interface to describe a layout of a stack frame on an AMDIL target
7400 +//===----------------------------------------------------------------------===//
7401 +#ifndef AMDILFRAME_LOWERING_H
7402 +#define AMDILFRAME_LOWERING_H
7404 +#include "llvm/CodeGen/MachineFunction.h"
7405 +#include "llvm/Target/TargetFrameLowering.h"
7409 +/// \brief Information about the stack frame layout on the AMDGPU targets.
7411 +/// It holds the direction of the stack growth, the known stack alignment on
7412 +/// entry to each function, and the offset to the locals area.
7413 +/// See TargetFrameInfo for more comments.
7414 +class AMDGPUFrameLowering : public TargetFrameLowering {
7416 + AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO,
7417 + unsigned TransAl = 1);
7418 + virtual ~AMDGPUFrameLowering();
7419 + virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
7420 + virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const;
7421 + virtual void emitPrologue(MachineFunction &MF) const;
7422 + virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
7423 + virtual bool hasFP(const MachineFunction &MF) const;
7425 +} // namespace llvm
7426 +#endif // AMDILFRAME_LOWERING_H
7427 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDIL.h llvm-r600/lib/Target/R600/AMDIL.h
7428 --- llvm-3.2.src/lib/Target/R600/AMDIL.h 1970-01-01 01:00:00.000000000 +0100
7429 +++ llvm-r600/lib/Target/R600/AMDIL.h 2013-01-25 19:43:57.433383055 +0100
7431 +//===-- AMDIL.h - Top-level interface for AMDIL representation --*- C++ -*-===//
7433 +// The LLVM Compiler Infrastructure
7435 +// This file is distributed under the University of Illinois Open Source
7436 +// License. See LICENSE.TXT for details.
7438 +//==-----------------------------------------------------------------------===//
7440 +/// This file contains the entry points for global functions defined in the LLVM
7441 +/// AMDGPU back-end.
7443 +//===----------------------------------------------------------------------===//
7448 +#include "llvm/CodeGen/MachineFunction.h"
7449 +#include "llvm/Target/TargetMachine.h"
7451 +#define ARENA_SEGMENT_RESERVED_UAVS 12
7452 +#define DEFAULT_ARENA_UAV_ID 8
7453 +#define DEFAULT_RAW_UAV_ID 7
7454 +#define GLOBAL_RETURN_RAW_UAV_ID 11
7455 +#define HW_MAX_NUM_CB 8
7456 +#define MAX_NUM_UNIQUE_UAVS 8
7457 +#define OPENCL_MAX_NUM_ATOMIC_COUNTERS 8
7458 +#define OPENCL_MAX_READ_IMAGES 128
7459 +#define OPENCL_MAX_WRITE_IMAGES 8
7460 +#define OPENCL_MAX_SAMPLERS 16
7462 +// The next two values can never be zero, as zero is the ID that is
7463 +// used to assert against.
7464 +#define DEFAULT_LDS_ID 1
7465 +#define DEFAULT_GDS_ID 1
7466 +#define DEFAULT_SCRATCH_ID 1
7467 +#define DEFAULT_VEC_SLOTS 8
7469 +#define OCL_DEVICE_RV710 0x0001
7470 +#define OCL_DEVICE_RV730 0x0002
7471 +#define OCL_DEVICE_RV770 0x0004
7472 +#define OCL_DEVICE_CEDAR 0x0008
7473 +#define OCL_DEVICE_REDWOOD 0x0010
7474 +#define OCL_DEVICE_JUNIPER 0x0020
7475 +#define OCL_DEVICE_CYPRESS 0x0040
7476 +#define OCL_DEVICE_CAICOS 0x0080
7477 +#define OCL_DEVICE_TURKS 0x0100
7478 +#define OCL_DEVICE_BARTS 0x0200
7479 +#define OCL_DEVICE_CAYMAN 0x0400
7480 +#define OCL_DEVICE_ALL 0x3FFF
7482 +/// The number of function ID's that are reserved for
7483 +/// internal compiler usage.
7484 +const unsigned int RESERVED_FUNCS = 1024;
7487 +class AMDGPUInstrPrinter;
7488 +class FunctionPass;
7492 +class TargetMachine;
7494 +// Instruction selection passes.
7496 + createAMDGPUISelDag(TargetMachine &TM);
7498 + createAMDGPUPeepholeOpt(TargetMachine &TM);
7500 +// Pre emit passes.
7502 + createAMDGPUCFGPreparationPass(TargetMachine &TM);
7504 + createAMDGPUCFGStructurizerPass(TargetMachine &TM);
7506 +extern Target TheAMDGPUTarget;
7507 +} // end namespace llvm;
7509 +// Include device information enumerations
7510 +#include "AMDILDeviceInfo.h"
7513 +/// OpenCL uses address spaces to differentiate between
7514 +/// various memory regions on the hardware. On the CPU
7515 +/// all of the address spaces point to the same memory,
7516 +/// however on the GPU, each address space points to
7517 +/// a separate piece of memory that is unique from other
7518 +/// memory locations.
7519 +namespace AMDGPUAS {
7520 +enum AddressSpaces {
7521 + PRIVATE_ADDRESS = 0, ///< Address space for private memory.
7522 + GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
7523 + CONSTANT_ADDRESS = 2, ///< Address space for constant memory
7524 + LOCAL_ADDRESS = 3, ///< Address space for local memory.
7525 + REGION_ADDRESS = 4, ///< Address space for region memory.
7526 + ADDRESS_NONE = 5, ///< Address space for unknown memory.
7527 + PARAM_D_ADDRESS = 6, ///< Address space for direct addressable parameter memory (CONST0)
7528 + PARAM_I_ADDRESS = 7, ///< Address space for indirect addressable parameter memory (VTX1)
7529 + USER_SGPR_ADDRESS = 8, ///< Address space for USER_SGPRS on SI
7530 + CONSTANT_BUFFER_0 = 9,
7531 + CONSTANT_BUFFER_1 = 10,
7532 + CONSTANT_BUFFER_2 = 11,
7533 + CONSTANT_BUFFER_3 = 12,
7534 + CONSTANT_BUFFER_4 = 13,
7535 + CONSTANT_BUFFER_5 = 14,
7536 + CONSTANT_BUFFER_6 = 15,
7537 + CONSTANT_BUFFER_7 = 16,
7538 + CONSTANT_BUFFER_8 = 17,
7539 + CONSTANT_BUFFER_9 = 18,
7540 + CONSTANT_BUFFER_10 = 19,
7541 + CONSTANT_BUFFER_11 = 20,
7542 + CONSTANT_BUFFER_12 = 21,
7543 + CONSTANT_BUFFER_13 = 22,
7544 + CONSTANT_BUFFER_14 = 23,
7545 + CONSTANT_BUFFER_15 = 24,
7549 +} // namespace AMDGPUAS
7551 +} // end namespace llvm
7553 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILInstrInfo.td llvm-r600/lib/Target/R600/AMDILInstrInfo.td
7554 --- llvm-3.2.src/lib/Target/R600/AMDILInstrInfo.td 1970-01-01 01:00:00.000000000 +0100
7555 +++ llvm-r600/lib/Target/R600/AMDILInstrInfo.td 2013-01-25 19:43:57.443383054 +0100
7557 +//===------------ AMDILInstrInfo.td - AMDIL Target ------*-tablegen-*------===//
7559 +// The LLVM Compiler Infrastructure
7561 +// This file is distributed under the University of Illinois Open Source
7562 +// License. See LICENSE.TXT for details.
7564 +//==-----------------------------------------------------------------------===//
7566 +// This file describes the AMDIL instructions in TableGen format.
7568 +//===----------------------------------------------------------------------===//
7569 +// AMDIL Instruction Predicate Definitions
7570 +// Predicate that is set to true if the hardware supports double precision
7572 +def HasHWDDiv : Predicate<"Subtarget.device()"
7573 + "->getGeneration() > AMDGPUDeviceInfo::HD4XXX && "
7574 + "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">;
7576 +// Predicate that is set to true if the hardware supports double, but not double
7577 +// precision divide in hardware
7578 +def HasSWDDiv : Predicate<"Subtarget.device()"
7579 + "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&"
7580 + "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">;
7582 +// Predicate that is set to true if the hardware support 24bit signed
7583 +// math ops. Otherwise a software expansion to 32bit math ops is used instead.
7584 +def HasHWSign24Bit : Predicate<"Subtarget.device()"
7585 + "->getGeneration() > AMDGPUDeviceInfo::HD5XXX">;
7587 +// Predicate that is set to true if 64bit operations are supported or not
7588 +def HasHW64Bit : Predicate<"Subtarget.device()"
7589 + "->usesHardware(AMDGPUDeviceInfo::LongOps)">;
7590 +def HasSW64Bit : Predicate<"Subtarget.device()"
7591 + "->usesSoftware(AMDGPUDeviceInfo::LongOps)">;
7593 +// Predicate that is set to true if the timer register is supported
7594 +def HasTmrRegister : Predicate<"Subtarget.device()"
7595 + "->isSupported(AMDGPUDeviceInfo::TmrReg)">;
7596 +// Predicate that is true if we are at least evergreen series
7597 +def HasDeviceIDInst : Predicate<"Subtarget.device()"
7598 + "->getGeneration() >= AMDGPUDeviceInfo::HD5XXX">;
7600 +// Predicate that is true if we have region address space.
7601 +def hasRegionAS : Predicate<"Subtarget.device()"
7602 + "->usesHardware(AMDGPUDeviceInfo::RegionMem)">;
7604 +// Predicate that is false if we don't have region address space.
7605 +def noRegionAS : Predicate<"!Subtarget.device()"
7606 + "->isSupported(AMDGPUDeviceInfo::RegionMem)">;
7609 +// Predicate that is set to true if 64bit Mul is supported in the IL or not
7610 +def HasHW64Mul : Predicate<"Subtarget.calVersion()"
7611 + ">= CAL_VERSION_SC_139"
7612 + "&& Subtarget.device()"
7613 + "->getGeneration() >="
7614 + "AMDGPUDeviceInfo::HD5XXX">;
7615 +def HasSW64Mul : Predicate<"Subtarget.calVersion()"
7616 + "< CAL_VERSION_SC_139">;
7617 +// Predicate that is set to true if 64bit Div/Mod is supported in the IL or not
7618 +def HasHW64DivMod : Predicate<"Subtarget.device()"
7619 + "->usesHardware(AMDGPUDeviceInfo::HW64BitDivMod)">;
7620 +def HasSW64DivMod : Predicate<"Subtarget.device()"
7621 + "->usesSoftware(AMDGPUDeviceInfo::HW64BitDivMod)">;
7623 +// Predicate that is set to true if 64bit pointer are used.
7624 +def Has64BitPtr : Predicate<"Subtarget.is64bit()">;
7625 +def Has32BitPtr : Predicate<"!Subtarget.is64bit()">;
7626 +//===--------------------------------------------------------------------===//
7628 +//===--------------------------------------------------------------------===//
7629 +def brtarget : Operand<OtherVT>;
7631 +//===--------------------------------------------------------------------===//
7632 +// Custom Selection DAG Type Profiles
7633 +//===--------------------------------------------------------------------===//
7634 +//===----------------------------------------------------------------------===//
7635 +// Generic Profile Types
7636 +//===----------------------------------------------------------------------===//
7638 +def SDTIL_GenBinaryOp : SDTypeProfile<1, 2, [
7639 + SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
7641 +def SDTIL_GenTernaryOp : SDTypeProfile<1, 3, [
7642 + SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3>
7644 +def SDTIL_GenVecBuild : SDTypeProfile<1, 1, [
7645 + SDTCisEltOfVec<1, 0>
7648 +//===----------------------------------------------------------------------===//
7649 +// Flow Control Profile Types
7650 +//===----------------------------------------------------------------------===//
7651 +// Branch instruction where second and third are basic blocks
7652 +def SDTIL_BRCond : SDTypeProfile<0, 2, [
7653 + SDTCisVT<0, OtherVT>
7656 +//===--------------------------------------------------------------------===//
7657 +// Custom Selection DAG Nodes
7658 +//===--------------------------------------------------------------------===//
7659 +//===----------------------------------------------------------------------===//
7660 +// Flow Control DAG Nodes
7661 +//===----------------------------------------------------------------------===//
7662 +def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>;
7664 +//===----------------------------------------------------------------------===//
7665 +// Call/Return DAG Nodes
7666 +//===----------------------------------------------------------------------===//
7667 +def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
7668 + [SDNPHasChain, SDNPOptInGlue]>;
7670 +//===--------------------------------------------------------------------===//
7672 +//===--------------------------------------------------------------------===//
7673 +// Floating point math functions
7674 +def IL_div_inf : SDNode<"AMDGPUISD::DIV_INF", SDTIL_GenBinaryOp>;
7675 +def IL_mad : SDNode<"AMDGPUISD::MAD", SDTIL_GenTernaryOp>;
7677 +//===----------------------------------------------------------------------===//
7678 +// Integer functions
7679 +//===----------------------------------------------------------------------===//
7680 +def IL_umul : SDNode<"AMDGPUISD::UMUL" , SDTIntBinOp,
7681 + [SDNPCommutative, SDNPAssociative]>;
7683 +//===--------------------------------------------------------------------===//
7684 +// Custom Pattern DAG Nodes
7685 +//===--------------------------------------------------------------------===//
7686 +def global_store : PatFrag<(ops node:$val, node:$ptr),
7687 + (store node:$val, node:$ptr), [{
7688 + return isGlobalStore(dyn_cast<StoreSDNode>(N));
7691 +//===----------------------------------------------------------------------===//
7692 +// Load pattern fragments
7693 +//===----------------------------------------------------------------------===//
7694 +// Global address space loads
7695 +def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
7696 + return isGlobalLoad(dyn_cast<LoadSDNode>(N));
7698 +// Constant address space loads
7699 +def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
7700 + return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
7703 +//===----------------------------------------------------------------------===//
7704 +// Complex addressing mode patterns
7705 +//===----------------------------------------------------------------------===//
7706 +def ADDR : ComplexPattern<i32, 2, "SelectADDR", [], []>;
7707 +def ADDRF : ComplexPattern<i32, 2, "SelectADDR", [frameindex], []>;
7708 +def ADDR64 : ComplexPattern<i64, 2, "SelectADDR64", [], []>;
7709 +def ADDR64F : ComplexPattern<i64, 2, "SelectADDR64", [frameindex], []>;
7711 +//===----------------------------------------------------------------------===//
7712 +// Instruction format classes
7713 +//===----------------------------------------------------------------------===//
7714 +class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
7717 + let Namespace = "AMDGPU";
7718 + dag OutOperandList = outs;
7719 + dag InOperandList = ins;
7720 + let Pattern = pattern;
7721 + let AsmString = !strconcat(asmstr, "\n");
7723 + let Itinerary = NullALU;
7724 + bit hasIEEEFlag = 0;
7725 + bit hasZeroOpFlag = 0;
7728 + let hasSideEffects = 0;
7731 +//===--------------------------------------------------------------------===//
7732 +// Multiclass Instruction formats
7733 +//===--------------------------------------------------------------------===//
7734 +// Multiclass that handles branch instructions
7735 +multiclass BranchConditional<SDNode Op> {
7736 + def _i32 : ILFormat<(outs),
7737 + (ins brtarget:$target, GPRI32:$src0),
7738 + "; i32 Pseudo branch instruction",
7739 + [(Op bb:$target, GPRI32:$src0)]>;
7740 + def _f32 : ILFormat<(outs),
7741 + (ins brtarget:$target, GPRF32:$src0),
7742 + "; f32 Pseudo branch instruction",
7743 + [(Op bb:$target, GPRF32:$src0)]>;
7746 +// Only scalar types should generate flow control
7747 +multiclass BranchInstr<string name> {
7748 + def _i32 : ILFormat<(outs), (ins GPRI32:$src),
7749 + !strconcat(name, " $src"), []>;
7750 + def _f32 : ILFormat<(outs), (ins GPRF32:$src),
7751 + !strconcat(name, " $src"), []>;
7753 +// Only scalar types should generate flow control
7754 +multiclass BranchInstr2<string name> {
7755 + def _i32 : ILFormat<(outs), (ins GPRI32:$src0, GPRI32:$src1),
7756 + !strconcat(name, " $src0, $src1"), []>;
7757 + def _f32 : ILFormat<(outs), (ins GPRF32:$src0, GPRF32:$src1),
7758 + !strconcat(name, " $src0, $src1"), []>;
7761 +//===--------------------------------------------------------------------===//
7762 +// Intrinsics support
7763 +//===--------------------------------------------------------------------===//
7764 +include "AMDILIntrinsics.td"
7765 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILIntrinsicInfo.cpp llvm-r600/lib/Target/R600/AMDILIntrinsicInfo.cpp
7766 --- llvm-3.2.src/lib/Target/R600/AMDILIntrinsicInfo.cpp 1970-01-01 01:00:00.000000000 +0100
7767 +++ llvm-r600/lib/Target/R600/AMDILIntrinsicInfo.cpp 2013-01-25 19:43:57.446716388 +0100
7769 +//===- AMDILIntrinsicInfo.cpp - AMDGPU Intrinsic Information ------*- C++ -*-===//
7771 +// The LLVM Compiler Infrastructure
7773 +// This file is distributed under the University of Illinois Open Source
7774 +// License. See LICENSE.TXT for details.
7776 +//==-----------------------------------------------------------------------===//
7779 +/// \brief AMDGPU Implementation of the IntrinsicInfo class.
7781 +//===-----------------------------------------------------------------------===//
7783 +#include "AMDILIntrinsicInfo.h"
7785 +#include "AMDGPUSubtarget.h"
7786 +#include "llvm/DerivedTypes.h"
7787 +#include "llvm/Intrinsics.h"
7788 +#include "llvm/Module.h"
7790 +using namespace llvm;
7792 +#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
7793 +#include "AMDGPUGenIntrinsics.inc"
7794 +#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
7796 +AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm)
7797 + : TargetIntrinsicInfo() {
7801 +AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys,
7802 + unsigned int numTys) const {
7803 + static const char* const names[] = {
7804 +#define GET_INTRINSIC_NAME_TABLE
7805 +#include "AMDGPUGenIntrinsics.inc"
7806 +#undef GET_INTRINSIC_NAME_TABLE
7809 + if (IntrID < Intrinsic::num_intrinsics) {
7812 + assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics
7813 + && "Invalid intrinsic ID");
7815 + std::string Result(names[IntrID - Intrinsic::num_intrinsics]);
7820 +AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const {
7821 +#define GET_FUNCTION_RECOGNIZER
7822 +#include "AMDGPUGenIntrinsics.inc"
7823 +#undef GET_FUNCTION_RECOGNIZER
7824 + AMDGPUIntrinsic::ID IntrinsicID
7825 + = (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic;
7826 + IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name);
7828 + if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) {
7829 + return IntrinsicID;
7835 +AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
7837 +#define GET_INTRINSIC_OVERLOAD_TABLE
7838 +#include "AMDGPUGenIntrinsics.inc"
7839 +#undef GET_INTRINSIC_OVERLOAD_TABLE
7843 +AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
7845 + unsigned numTys) const {
7846 + assert(!"Not implemented");
7848 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILIntrinsicInfo.h llvm-r600/lib/Target/R600/AMDILIntrinsicInfo.h
7849 --- llvm-3.2.src/lib/Target/R600/AMDILIntrinsicInfo.h 1970-01-01 01:00:00.000000000 +0100
7850 +++ llvm-r600/lib/Target/R600/AMDILIntrinsicInfo.h 2013-01-25 19:43:57.446716388 +0100
7852 +//===- AMDILIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===//
7854 +// The LLVM Compiler Infrastructure
7856 +// This file is distributed under the University of Illinois Open Source
7857 +// License. See LICENSE.TXT for details.
7859 +//==-----------------------------------------------------------------------===//
7862 +/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
7864 +//===-----------------------------------------------------------------------===//
7865 +#ifndef AMDIL_INTRINSICS_H
7866 +#define AMDIL_INTRINSICS_H
7868 +#include "llvm/Intrinsics.h"
7869 +#include "llvm/Target/TargetIntrinsicInfo.h"
7872 +class TargetMachine;
7874 +namespace AMDGPUIntrinsic {
7876 + last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1,
7877 +#define GET_INTRINSIC_ENUM_VALUES
7878 +#include "AMDGPUGenIntrinsics.inc"
7879 +#undef GET_INTRINSIC_ENUM_VALUES
7880 + , num_AMDGPU_intrinsics
7883 +} // end namespace AMDGPUIntrinsic
7885 +class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo {
7887 + AMDGPUIntrinsicInfo(TargetMachine *tm);
7888 + std::string getName(unsigned int IntrId, Type **Tys = 0,
7889 + unsigned int numTys = 0) const;
7890 + unsigned int lookupName(const char *Name, unsigned int Len) const;
7891 + bool isOverloaded(unsigned int IID) const;
7892 + Function *getDeclaration(Module *M, unsigned int ID,
7894 + unsigned int numTys = 0) const;
7897 +} // end namespace llvm
7899 +#endif // AMDIL_INTRINSICS_H
7901 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILIntrinsics.td llvm-r600/lib/Target/R600/AMDILIntrinsics.td
7902 --- llvm-3.2.src/lib/Target/R600/AMDILIntrinsics.td 1970-01-01 01:00:00.000000000 +0100
7903 +++ llvm-r600/lib/Target/R600/AMDILIntrinsics.td 2013-01-25 19:43:57.446716388 +0100
7905 +//===- AMDILIntrinsics.td - Defines AMDIL Intrinsics -*- tablegen -*-===//
7907 +// The LLVM Compiler Infrastructure
7909 +// This file is distributed under the University of Illinois Open Source
7910 +// License. See LICENSE.TXT for details.
7912 +//==-----------------------------------------------------------------------===//
7914 +// This file defines all of the amdil-specific intrinsics
7916 +//===---------------------------------------------------------------===//
7917 +//===--------------------------------------------------------------------===//
7918 +// Intrinsic classes
7919 +// Generic versions of the above classes but for Target specific intrinsics
7920 +// instead of SDNode patterns.
7921 +//===--------------------------------------------------------------------===//
7922 +let TargetPrefix = "AMDIL", isTarget = 1 in {
7923 + class VoidIntLong :
7924 + Intrinsic<[llvm_i64_ty], [], []>;
7925 + class VoidIntInt :
7926 + Intrinsic<[llvm_i32_ty], [], []>;
7927 + class VoidIntBool :
7928 + Intrinsic<[llvm_i32_ty], [], []>;
7929 + class UnaryIntInt :
7930 + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
7931 + class UnaryIntFloat :
7932 + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
7933 + class ConvertIntFTOI :
7934 + Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>;
7935 + class ConvertIntITOF :
7936 + Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty], [IntrNoMem]>;
7937 + class UnaryIntNoRetInt :
7938 + Intrinsic<[], [llvm_anyint_ty], []>;
7939 + class UnaryIntNoRetFloat :
7940 + Intrinsic<[], [llvm_anyfloat_ty], []>;
7941 + class BinaryIntInt :
7942 + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
7943 + class BinaryIntFloat :
7944 + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
7945 + class BinaryIntNoRetInt :
7946 + Intrinsic<[], [llvm_anyint_ty, LLVMMatchType<0>], []>;
7947 + class BinaryIntNoRetFloat :
7948 + Intrinsic<[], [llvm_anyfloat_ty, LLVMMatchType<0>], []>;
7949 + class TernaryIntInt :
7950 + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
7951 + LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
7952 + class TernaryIntFloat :
7953 + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>,
7954 + LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
7955 + class QuaternaryIntInt :
7956 + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
7957 + LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
7958 + class UnaryAtomicInt :
7959 + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
7960 + class BinaryAtomicInt :
7961 + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
7962 + class TernaryAtomicInt :
7963 + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
7964 + class UnaryAtomicIntNoRet :
7965 + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
7966 + class BinaryAtomicIntNoRet :
7967 + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
7968 + class TernaryAtomicIntNoRet :
7969 + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
7972 +let TargetPrefix = "AMDIL", isTarget = 1 in {
7973 + def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt;
7975 + def int_AMDIL_bit_extract_i32 : GCCBuiltin<"__amdil_ibit_extract">,
7977 + def int_AMDIL_bit_extract_u32 : GCCBuiltin<"__amdil_ubit_extract">,
7979 + def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">,
7981 + def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">,
7983 + def int_AMDIL_bit_find_first_lo : GCCBuiltin<"__amdil_ffb_lo">,
7985 + def int_AMDIL_bit_find_first_hi : GCCBuiltin<"__amdil_ffb_hi">,
7987 + def int_AMDIL_bit_find_first_sgn : GCCBuiltin<"__amdil_ffb_signed">,
7989 + def int_AMDIL_media_bitalign : GCCBuiltin<"__amdil_bitalign">,
7991 + def int_AMDIL_media_bytealign : GCCBuiltin<"__amdil_bytealign">,
7993 + def int_AMDIL_bit_insert_u32 : GCCBuiltin<"__amdil_ubit_insert">,
7995 + def int_AMDIL_bfi : GCCBuiltin<"__amdil_bfi">,
7997 + def int_AMDIL_bfm : GCCBuiltin<"__amdil_bfm">,
7999 + def int_AMDIL_mad_i32 : GCCBuiltin<"__amdil_imad">,
8001 + def int_AMDIL_mad_u32 : GCCBuiltin<"__amdil_umad">,
8003 + def int_AMDIL_mad : GCCBuiltin<"__amdil_mad">,
8005 + def int_AMDIL_mulhi_i32 : GCCBuiltin<"__amdil_imul_high">,
8007 + def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">,
8009 + def int_AMDIL_mul24_i32 : GCCBuiltin<"__amdil_imul24">,
8011 + def int_AMDIL_mul24_u32 : GCCBuiltin<"__amdil_umul24">,
8013 + def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">,
8015 + def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">,
8017 + def int_AMDIL_mad24_i32 : GCCBuiltin<"__amdil_imad24">,
8019 + def int_AMDIL_mad24_u32 : GCCBuiltin<"__amdil_umad24">,
8021 + def int_AMDIL_carry_i32 : GCCBuiltin<"__amdil_carry">,
8023 + def int_AMDIL_borrow_i32 : GCCBuiltin<"__amdil_borrow">,
8025 + def int_AMDIL_min_i32 : GCCBuiltin<"__amdil_imin">,
8027 + def int_AMDIL_min_u32 : GCCBuiltin<"__amdil_umin">,
8029 + def int_AMDIL_min : GCCBuiltin<"__amdil_min">,
8031 + def int_AMDIL_max_i32 : GCCBuiltin<"__amdil_imax">,
8033 + def int_AMDIL_max_u32 : GCCBuiltin<"__amdil_umax">,
8035 + def int_AMDIL_max : GCCBuiltin<"__amdil_max">,
8037 + def int_AMDIL_media_lerp_u4 : GCCBuiltin<"__amdil_u4lerp">,
8039 + def int_AMDIL_media_sad : GCCBuiltin<"__amdil_sad">,
8041 + def int_AMDIL_media_sad_hi : GCCBuiltin<"__amdil_sadhi">,
8043 + def int_AMDIL_fraction : GCCBuiltin<"__amdil_fraction">,
8045 + def int_AMDIL_clamp : GCCBuiltin<"__amdil_clamp">,
8047 + def int_AMDIL_pireduce : GCCBuiltin<"__amdil_pireduce">,
8049 + def int_AMDIL_round_nearest : GCCBuiltin<"__amdil_round_nearest">,
8051 + def int_AMDIL_round_neginf : GCCBuiltin<"__amdil_round_neginf">,
8053 + def int_AMDIL_round_zero : GCCBuiltin<"__amdil_round_zero">,
8055 + def int_AMDIL_acos : GCCBuiltin<"__amdil_acos">,
8057 + def int_AMDIL_atan : GCCBuiltin<"__amdil_atan">,
8059 + def int_AMDIL_asin : GCCBuiltin<"__amdil_asin">,
8061 + def int_AMDIL_cos : GCCBuiltin<"__amdil_cos">,
8063 + def int_AMDIL_cos_vec : GCCBuiltin<"__amdil_cos_vec">,
8065 + def int_AMDIL_tan : GCCBuiltin<"__amdil_tan">,
8067 + def int_AMDIL_sin : GCCBuiltin<"__amdil_sin">,
8069 + def int_AMDIL_sin_vec : GCCBuiltin<"__amdil_sin_vec">,
8071 + def int_AMDIL_pow : GCCBuiltin<"__amdil_pow">, BinaryIntFloat;
8072 + def int_AMDIL_div : GCCBuiltin<"__amdil_div">, BinaryIntFloat;
8073 + def int_AMDIL_udiv : GCCBuiltin<"__amdil_udiv">, BinaryIntInt;
8074 + def int_AMDIL_sqrt: GCCBuiltin<"__amdil_sqrt">,
8076 + def int_AMDIL_sqrt_vec: GCCBuiltin<"__amdil_sqrt_vec">,
8078 + def int_AMDIL_exp : GCCBuiltin<"__amdil_exp">,
8080 + def int_AMDIL_exp_vec : GCCBuiltin<"__amdil_exp_vec">,
8082 + def int_AMDIL_exn : GCCBuiltin<"__amdil_exn">,
8084 + def int_AMDIL_log_vec : GCCBuiltin<"__amdil_log_vec">,
8086 + def int_AMDIL_ln : GCCBuiltin<"__amdil_ln">,
8088 + def int_AMDIL_sign: GCCBuiltin<"__amdil_sign">,
8090 + def int_AMDIL_fma: GCCBuiltin<"__amdil_fma">,
8092 + def int_AMDIL_rsq : GCCBuiltin<"__amdil_rsq">,
8094 + def int_AMDIL_rsq_vec : GCCBuiltin<"__amdil_rsq_vec">,
8096 + def int_AMDIL_length : GCCBuiltin<"__amdil_length">,
8098 + def int_AMDIL_lerp : GCCBuiltin<"__amdil_lerp">,
8100 + def int_AMDIL_media_sad4 : GCCBuiltin<"__amdil_sad4">,
8101 + Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty,
8102 + llvm_v4i32_ty, llvm_i32_ty], []>;
8104 + def int_AMDIL_frexp_f64 : GCCBuiltin<"__amdil_frexp">,
8105 + Intrinsic<[llvm_v2i64_ty], [llvm_double_ty], []>;
8106 + def int_AMDIL_ldexp : GCCBuiltin<"__amdil_ldexp">,
8107 + Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyint_ty], []>;
8108 + def int_AMDIL_drcp : GCCBuiltin<"__amdil_rcp">,
8109 + Intrinsic<[llvm_double_ty], [llvm_double_ty], []>;
8110 + def int_AMDIL_convert_f16_f32 : GCCBuiltin<"__amdil_half_to_float">,
8112 + def int_AMDIL_convert_f32_f16 : GCCBuiltin<"__amdil_float_to_half">,
8114 + def int_AMDIL_convert_f32_i32_rpi : GCCBuiltin<"__amdil_float_to_int_rpi">,
8116 + def int_AMDIL_convert_f32_i32_flr : GCCBuiltin<"__amdil_float_to_int_flr">,
8118 + def int_AMDIL_convert_f32_f16_near : GCCBuiltin<"__amdil_float_to_half_near">,
8120 + def int_AMDIL_convert_f32_f16_neg_inf : GCCBuiltin<"__amdil_float_to_half_neg_inf">,
8122 + def int_AMDIL_convert_f32_f16_plus_inf : GCCBuiltin<"__amdil_float_to_half_plus_inf">,
8124 + def int_AMDIL_media_convert_f2v4u8 : GCCBuiltin<"__amdil_f_2_u4">,
8125 + Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], []>;
8126 + def int_AMDIL_media_unpack_byte_0 : GCCBuiltin<"__amdil_unpack_0">,
8128 + def int_AMDIL_media_unpack_byte_1 : GCCBuiltin<"__amdil_unpack_1">,
8130 + def int_AMDIL_media_unpack_byte_2 : GCCBuiltin<"__amdil_unpack_2">,
8132 + def int_AMDIL_media_unpack_byte_3 : GCCBuiltin<"__amdil_unpack_3">,
8134 + def int_AMDIL_dp2_add : GCCBuiltin<"__amdil_dp2_add">,
8135 + Intrinsic<[llvm_float_ty], [llvm_v2f32_ty,
8136 + llvm_v2f32_ty, llvm_float_ty], []>;
8137 + def int_AMDIL_dp2 : GCCBuiltin<"__amdil_dp2">,
8138 + Intrinsic<[llvm_float_ty], [llvm_v2f32_ty,
8139 + llvm_v2f32_ty], []>;
8140 + def int_AMDIL_dp3 : GCCBuiltin<"__amdil_dp3">,
8141 + Intrinsic<[llvm_float_ty], [llvm_v4f32_ty,
8142 + llvm_v4f32_ty], []>;
8143 + def int_AMDIL_dp4 : GCCBuiltin<"__amdil_dp4">,
8144 + Intrinsic<[llvm_float_ty], [llvm_v4f32_ty,
8145 + llvm_v4f32_ty], []>;
8147 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILISelDAGToDAG.cpp llvm-r600/lib/Target/R600/AMDILISelDAGToDAG.cpp
8148 --- llvm-3.2.src/lib/Target/R600/AMDILISelDAGToDAG.cpp 1970-01-01 01:00:00.000000000 +0100
8149 +++ llvm-r600/lib/Target/R600/AMDILISelDAGToDAG.cpp 2013-01-25 19:43:57.443383054 +0100
8151 +//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===//
8153 +// The LLVM Compiler Infrastructure
8155 +// This file is distributed under the University of Illinois Open Source
8156 +// License. See LICENSE.TXT for details.
8158 +//==-----------------------------------------------------------------------===//
8161 +/// \brief Defines an instruction selector for the AMDGPU target.
8163 +//===----------------------------------------------------------------------===//
8164 +#include "AMDGPUInstrInfo.h"
8165 +#include "AMDGPUISelLowering.h" // For AMDGPUISD
8166 +#include "AMDGPURegisterInfo.h"
8167 +#include "AMDILDevices.h"
8168 +#include "R600InstrInfo.h"
8169 +#include "llvm/ADT/ValueMap.h"
8170 +#include "llvm/CodeGen/PseudoSourceValue.h"
8171 +#include "llvm/CodeGen/SelectionDAGISel.h"
8172 +#include "llvm/Support/Compiler.h"
8173 +#include "llvm/CodeGen/SelectionDAG.h"
8177 +using namespace llvm;
8179 +//===----------------------------------------------------------------------===//
8180 +// Instruction Selector Implementation
8181 +//===----------------------------------------------------------------------===//
8184 +/// AMDGPU specific code to select AMDGPU machine instructions for
8185 +/// SelectionDAG operations.
8186 +class AMDGPUDAGToDAGISel : public SelectionDAGISel {
8187 + // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
8188 + // make the right decision when generating code for different targets.
8189 + const AMDGPUSubtarget &Subtarget;
8191 + AMDGPUDAGToDAGISel(TargetMachine &TM);
8192 + virtual ~AMDGPUDAGToDAGISel();
8194 + SDNode *Select(SDNode *N);
8195 + virtual const char *getPassName() const;
8198 + inline SDValue getSmallIPtrImm(unsigned Imm);
8199 + bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
8201 + // Complex pattern selectors
8202 + bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
8203 + bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
8204 + bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
8206 + static bool checkType(const Value *ptr, unsigned int addrspace);
8207 + static const Value *getBasePointerValue(const Value *V);
8209 + static bool isGlobalStore(const StoreSDNode *N);
8210 + static bool isPrivateStore(const StoreSDNode *N);
8211 + static bool isLocalStore(const StoreSDNode *N);
8212 + static bool isRegionStore(const StoreSDNode *N);
8214 + static bool isCPLoad(const LoadSDNode *N);
8215 + static bool isConstantLoad(const LoadSDNode *N, int cbID);
8216 + static bool isGlobalLoad(const LoadSDNode *N);
8217 + static bool isParamLoad(const LoadSDNode *N);
8218 + static bool isPrivateLoad(const LoadSDNode *N);
8219 + static bool isLocalLoad(const LoadSDNode *N);
8220 + static bool isRegionLoad(const LoadSDNode *N);
8222 + bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
8223 + bool SelectGlobalValueVariableOffset(SDValue Addr,
8224 + SDValue &BaseReg, SDValue& Offset);
8225 + bool SelectADDR8BitOffset(SDValue Addr, SDValue& Base, SDValue& Offset);
8226 + bool SelectADDRReg(SDValue Addr, SDValue& Base, SDValue& Offset);
8227 + bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
8229 + // Include the pieces autogenerated from the target description.
8230 +#include "AMDGPUGenDAGISel.inc"
8232 +} // end anonymous namespace
8234 +/// \brief This pass converts a legalized DAG into a AMDGPU-specific
8235 +// DAG, ready for instruction scheduling.
8236 +FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM
8238 + return new AMDGPUDAGToDAGISel(TM);
8241 +AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM
8243 + : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<AMDGPUSubtarget>()) {
8246 +AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
8249 +SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) {
8250 + return CurDAG->getTargetConstant(Imm, MVT::i32);
8253 +bool AMDGPUDAGToDAGISel::SelectADDRParam(
8254 + SDValue Addr, SDValue& R1, SDValue& R2) {
8256 + if (Addr.getOpcode() == ISD::FrameIndex) {
8257 + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
8258 + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
8259 + R2 = CurDAG->getTargetConstant(0, MVT::i32);
8262 + R2 = CurDAG->getTargetConstant(0, MVT::i32);
8264 + } else if (Addr.getOpcode() == ISD::ADD) {
8265 + R1 = Addr.getOperand(0);
8266 + R2 = Addr.getOperand(1);
8269 + R2 = CurDAG->getTargetConstant(0, MVT::i32);
8274 +bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) {
8275 + if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
8276 + Addr.getOpcode() == ISD::TargetGlobalAddress) {
8279 + return SelectADDRParam(Addr, R1, R2);
8283 +bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
8284 + if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
8285 + Addr.getOpcode() == ISD::TargetGlobalAddress) {
8289 + if (Addr.getOpcode() == ISD::FrameIndex) {
8290 + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
8291 + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
8292 + R2 = CurDAG->getTargetConstant(0, MVT::i64);
8295 + R2 = CurDAG->getTargetConstant(0, MVT::i64);
8297 + } else if (Addr.getOpcode() == ISD::ADD) {
8298 + R1 = Addr.getOperand(0);
8299 + R2 = Addr.getOperand(1);
8302 + R2 = CurDAG->getTargetConstant(0, MVT::i64);
8307 +SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
8308 + unsigned int Opc = N->getOpcode();
8309 + if (N->isMachineOpcode()) {
8310 + return NULL; // Already selected.
8314 + case ISD::FrameIndex: {
8315 + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(N)) {
8316 + unsigned int FI = FIN->getIndex();
8317 + EVT OpVT = N->getValueType(0);
8318 + unsigned int NewOpc = AMDGPU::COPY;
8319 + SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i32);
8320 + return CurDAG->SelectNodeTo(N, NewOpc, OpVT, TFI);
8324 + case ISD::ConstantFP:
8325 + case ISD::Constant: {
8326 + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
8327 + // XXX: Custom immediate lowering not implemented yet. Instead we use
8328 + // pseudo instructions defined in SIInstructions.td
8329 + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
8332 + const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo());
8334 + uint64_t ImmValue = 0;
8335 + unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
8337 + if (N->getOpcode() == ISD::ConstantFP) {
8338 + // XXX: 64-bit Immediates not supported yet
8339 + assert(N->getValueType(0) != MVT::f64);
8341 + ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N);
8342 + APFloat Value = C->getValueAPF();
8343 + float FloatValue = Value.convertToFloat();
8344 + if (FloatValue == 0.0) {
8345 + ImmReg = AMDGPU::ZERO;
8346 + } else if (FloatValue == 0.5) {
8347 + ImmReg = AMDGPU::HALF;
8348 + } else if (FloatValue == 1.0) {
8349 + ImmReg = AMDGPU::ONE;
8351 + ImmValue = Value.bitcastToAPInt().getZExtValue();
8354 + // XXX: 64-bit Immediates not supported yet
8355 + assert(N->getValueType(0) != MVT::i64);
8357 + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
8358 + if (C->getZExtValue() == 0) {
8359 + ImmReg = AMDGPU::ZERO;
8360 + } else if (C->getZExtValue() == 1) {
8361 + ImmReg = AMDGPU::ONE_INT;
8363 + ImmValue = C->getZExtValue();
8367 + for (SDNode::use_iterator Use = N->use_begin(), Next = llvm::next(Use);
8368 + Use != SDNode::use_end(); Use = Next) {
8369 + Next = llvm::next(Use);
8370 + std::vector<SDValue> Ops;
8371 + for (unsigned i = 0; i < Use->getNumOperands(); ++i) {
8372 + Ops.push_back(Use->getOperand(i));
8375 + if (!Use->isMachineOpcode()) {
8376 + if (ImmReg == AMDGPU::ALU_LITERAL_X) {
8377 + // We can only use literal constants (e.g. AMDGPU::ZERO,
8378 + // AMDGPU::ONE, etc) in machine opcodes.
8382 + if (!TII->isALUInstr(Use->getMachineOpcode())) {
8386 + int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(), R600Operands::IMM);
8387 + assert(ImmIdx != -1);
8389 + // subtract one from ImmIdx, because the DST operand is usually index
8390 + // 0 for MachineInstrs, but we have no DST in the Ops vector.
8393 + // Check that we aren't already using an immediate.
8394 + // XXX: It's possible for an instruction to have more than one
8395 + // immediate operand, but this is not supported yet.
8396 + if (ImmReg == AMDGPU::ALU_LITERAL_X) {
8397 + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx));
8400 + if (C->getZExtValue() != 0) {
8401 + // This instruction is already using an immediate.
8405 + // Set the immediate value
8406 + Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32);
8409 + // Set the immediate register
8410 + Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32);
8412 + CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands());
8417 + SDNode *Result = SelectCode(N);
8419 + // Fold operands of selected node
8421 + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
8422 + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
8423 + const R600InstrInfo *TII =
8424 + static_cast<const R600InstrInfo*>(TM.getInstrInfo());
8425 + if (Result && TII->isALUInstr(Result->getMachineOpcode())) {
8426 + bool IsModified = false;
8428 + std::vector<SDValue> Ops;
8429 + for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
8431 + Ops.push_back(*I);
8432 + IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops);
8434 + Result = CurDAG->MorphNodeTo(Result, Result->getOpcode(),
8435 + Result->getVTList(), Ops.data(), Ops.size());
8437 + } while (IsModified);
8444 +bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
8445 + const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
8446 + int OperandIdx[] = {
8447 + TII->getOperandIdx(Opcode, R600Operands::SRC0),
8448 + TII->getOperandIdx(Opcode, R600Operands::SRC1),
8449 + TII->getOperandIdx(Opcode, R600Operands::SRC2)
8452 + TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL),
8453 + TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL),
8454 + TII->getOperandIdx(Opcode, R600Operands::SRC2_SEL)
8456 + for (unsigned i = 0; i < 3; i++) {
8457 + if (OperandIdx[i] < 0)
8459 + SDValue Operand = Ops[OperandIdx[i] - 1];
8460 + switch (Operand.getOpcode()) {
8461 + case AMDGPUISD::CONST_ADDRESS: {
8462 + SDValue CstOffset;
8463 + if (!Operand.getValueType().isVector() &&
8464 + SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) {
8465 + Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
8466 + Ops[SelIdx[i] - 1] = CstOffset;
8478 +bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) {
8482 + Type *ptrType = ptr->getType();
8483 + return dyn_cast<PointerType>(ptrType)->getAddressSpace() == addrspace;
8486 +const Value * AMDGPUDAGToDAGISel::getBasePointerValue(const Value *V) {
8490 + const Value *ret = NULL;
8491 + ValueMap<const Value *, bool> ValueBitMap;
8492 + std::queue<const Value *, std::list<const Value *> > ValueQueue;
8493 + ValueQueue.push(V);
8494 + while (!ValueQueue.empty()) {
8495 + V = ValueQueue.front();
8496 + if (ValueBitMap.find(V) == ValueBitMap.end()) {
8497 + ValueBitMap[V] = true;
8498 + if (dyn_cast<Argument>(V) && dyn_cast<PointerType>(V->getType())) {
8501 + } else if (dyn_cast<GlobalVariable>(V)) {
8504 + } else if (dyn_cast<Constant>(V)) {
8505 + const ConstantExpr *CE = dyn_cast<ConstantExpr>(V);
8507 + ValueQueue.push(CE->getOperand(0));
8509 + } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
8512 + } else if (const Instruction *I = dyn_cast<Instruction>(V)) {
8513 + uint32_t numOps = I->getNumOperands();
8514 + for (uint32_t x = 0; x < numOps; ++x) {
8515 + ValueQueue.push(I->getOperand(x));
8518 + assert(!"Found a Value that we didn't know how to handle!");
8526 +bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
8527 + return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
8530 +bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
8531 + return (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
8532 + && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
8533 + && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS));
8536 +bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
8537 + return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
8540 +bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
8541 + return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
8544 +bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) {
8545 + if (checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)) {
8548 + MachineMemOperand *MMO = N->getMemOperand();
8549 + const Value *V = MMO->getValue();
8550 + const Value *BV = getBasePointerValue(V);
8552 + && MMO->getValue()
8553 + && ((V && dyn_cast<GlobalValue>(V))
8554 + || (BV && dyn_cast<GlobalValue>(
8555 + getBasePointerValue(MMO->getValue()))))) {
8556 + return checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS);
8562 +bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) {
8563 + return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
8566 +bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) {
8567 + return checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS);
8570 +bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) {
8571 + return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
8574 +bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) {
8575 + return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
8578 +bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) {
8579 + MachineMemOperand *MMO = N->getMemOperand();
8580 + if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
8582 + const Value *V = MMO->getValue();
8583 + const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V);
8584 + if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
8592 +bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) {
8593 + if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
8594 + // Check to make sure we are not a constant pool load or a constant load
8595 + // that is marked as a private load
8596 + if (isCPLoad(N) || isConstantLoad(N, -1)) {
8600 + if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
8601 + && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
8602 + && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS)
8603 + && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)
8604 + && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS)
8605 + && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS)) {
8611 +const char *AMDGPUDAGToDAGISel::getPassName() const {
8612 + return "AMDGPU DAG->DAG Pattern Instruction Selection";
8620 +///==== AMDGPU Functions ====///
8622 +bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
8623 + SDValue& IntPtr) {
8624 + if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
8625 + IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true);
8631 +bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
8632 + SDValue& BaseReg, SDValue &Offset) {
8633 + if (!dyn_cast<ConstantSDNode>(Addr)) {
8635 + Offset = CurDAG->getIntPtrConstant(0, true);
8641 +bool AMDGPUDAGToDAGISel::SelectADDR8BitOffset(SDValue Addr, SDValue& Base,
8642 + SDValue& Offset) {
8643 + if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
8644 + Addr.getOpcode() == ISD::TargetGlobalAddress) {
8649 + if (Addr.getOpcode() == ISD::ADD) {
8650 + bool Match = false;
8652 + // Find the base ptr and the offset
8653 + for (unsigned i = 0; i < Addr.getNumOperands(); i++) {
8654 + SDValue Arg = Addr.getOperand(i);
8655 + ConstantSDNode * OffsetNode = dyn_cast<ConstantSDNode>(Arg);
8656 + // This arg isn't a constant so it must be the base PTR.
8657 + if (!OffsetNode) {
8658 + Base = Addr.getOperand(i);
8661 + // Check if the constant argument fits in 8-bits. The offset is in bytes
8662 + // so we need to convert it to dwords.
8663 + if (isUInt<8>(OffsetNode->getZExtValue() >> 2)) {
8665 + Offset = CurDAG->getTargetConstant(OffsetNode->getZExtValue() >> 2,
8672 + // Default case, no offset
8674 + Offset = CurDAG->getTargetConstant(0, MVT::i32);
8678 +bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
8679 + SDValue &Offset) {
8680 + ConstantSDNode * IMMOffset;
8682 + if (Addr.getOpcode() == ISD::ADD
8683 + && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
8684 + && isInt<16>(IMMOffset->getZExtValue())) {
8686 + Base = Addr.getOperand(0);
8687 + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
8689 + // If the pointer address is constant, we can move it to the offset field.
8690 + } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
8691 + && isInt<16>(IMMOffset->getZExtValue())) {
8692 + Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
8693 + CurDAG->getEntryNode().getDebugLoc(),
8694 + AMDGPU::ZERO, MVT::i32);
8695 + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
8699 + // Default case, no offset
8701 + Offset = CurDAG->getTargetConstant(0, MVT::i32);
8705 +bool AMDGPUDAGToDAGISel::SelectADDRReg(SDValue Addr, SDValue& Base,
8706 + SDValue& Offset) {
8707 + if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
8708 + Addr.getOpcode() == ISD::TargetGlobalAddress ||
8709 + Addr.getOpcode() != ISD::ADD) {
8713 + Base = Addr.getOperand(0);
8714 + Offset = Addr.getOperand(1);
8718 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILISelLowering.cpp llvm-r600/lib/Target/R600/AMDILISelLowering.cpp
8719 --- llvm-3.2.src/lib/Target/R600/AMDILISelLowering.cpp 1970-01-01 01:00:00.000000000 +0100
8720 +++ llvm-r600/lib/Target/R600/AMDILISelLowering.cpp 2013-01-25 19:43:57.443383054 +0100
8722 +//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===//
8724 +// The LLVM Compiler Infrastructure
8726 +// This file is distributed under the University of Illinois Open Source
8727 +// License. See LICENSE.TXT for details.
8729 +//==-----------------------------------------------------------------------===//
8732 +/// \brief TargetLowering functions borrowed from AMDIL.
8734 +//===----------------------------------------------------------------------===//
8736 +#include "AMDGPUISelLowering.h"
8737 +#include "AMDGPURegisterInfo.h"
8738 +#include "AMDILDevices.h"
8739 +#include "AMDILIntrinsicInfo.h"
8740 +#include "AMDGPUSubtarget.h"
8741 +#include "llvm/CallingConv.h"
8742 +#include "llvm/CodeGen/MachineFrameInfo.h"
8743 +#include "llvm/CodeGen/MachineRegisterInfo.h"
8744 +#include "llvm/CodeGen/PseudoSourceValue.h"
8745 +#include "llvm/CodeGen/SelectionDAG.h"
8746 +#include "llvm/CodeGen/SelectionDAGNodes.h"
8747 +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
8748 +#include "llvm/DerivedTypes.h"
8749 +#include "llvm/Instructions.h"
8750 +#include "llvm/Intrinsics.h"
8751 +#include "llvm/Support/raw_ostream.h"
8752 +#include "llvm/Target/TargetInstrInfo.h"
8753 +#include "llvm/Target/TargetOptions.h"
8755 +using namespace llvm;
8756 +//===----------------------------------------------------------------------===//
8757 +// Calling Convention Implementation
8758 +//===----------------------------------------------------------------------===//
8759 +#include "AMDGPUGenCallingConv.inc"
8761 +//===----------------------------------------------------------------------===//
8762 +// TargetLowering Implementation Help Functions End
8763 +//===----------------------------------------------------------------------===//
8765 +//===----------------------------------------------------------------------===//
8766 +// TargetLowering Class Implementation Begins
8767 +//===----------------------------------------------------------------------===//
8768 +void AMDGPUTargetLowering::InitAMDILLowering() {
8788 + int IntTypes[] = {
8795 + int FloatTypes[] = {
8800 + int VectorTypes[] = {
8812 + size_t NumTypes = sizeof(types) / sizeof(*types);
8813 + size_t NumFloatTypes = sizeof(FloatTypes) / sizeof(*FloatTypes);
8814 + size_t NumIntTypes = sizeof(IntTypes) / sizeof(*IntTypes);
8815 + size_t NumVectorTypes = sizeof(VectorTypes) / sizeof(*VectorTypes);
8817 + const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>();
8818 + // These are the current register classes that are
8821 + for (unsigned int x = 0; x < NumTypes; ++x) {
8822 + MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
8824 + //FIXME: SIGN_EXTEND_INREG is not meaningful for floating point types
8825 + // We cannot sextinreg, expand to shifts
8826 + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
8827 + setOperationAction(ISD::SUBE, VT, Expand);
8828 + setOperationAction(ISD::SUBC, VT, Expand);
8829 + setOperationAction(ISD::ADDE, VT, Expand);
8830 + setOperationAction(ISD::ADDC, VT, Expand);
8831 + setOperationAction(ISD::BRCOND, VT, Custom);
8832 + setOperationAction(ISD::BR_JT, VT, Expand);
8833 + setOperationAction(ISD::BRIND, VT, Expand);
8834 + // TODO: Implement custom UREM/SREM routines
8835 + setOperationAction(ISD::SREM, VT, Expand);
8836 + setOperationAction(ISD::SMUL_LOHI, VT, Expand);
8837 + setOperationAction(ISD::UMUL_LOHI, VT, Expand);
8838 + if (VT != MVT::i64 && VT != MVT::v2i64) {
8839 + setOperationAction(ISD::SDIV, VT, Custom);
8842 + for (unsigned int x = 0; x < NumFloatTypes; ++x) {
8843 + MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x];
8845 + // IL does not have these operations for floating point types
8846 + setOperationAction(ISD::FP_ROUND_INREG, VT, Expand);
8847 + setOperationAction(ISD::SETOLT, VT, Expand);
8848 + setOperationAction(ISD::SETOGE, VT, Expand);
8849 + setOperationAction(ISD::SETOGT, VT, Expand);
8850 + setOperationAction(ISD::SETOLE, VT, Expand);
8851 + setOperationAction(ISD::SETULT, VT, Expand);
8852 + setOperationAction(ISD::SETUGE, VT, Expand);
8853 + setOperationAction(ISD::SETUGT, VT, Expand);
8854 + setOperationAction(ISD::SETULE, VT, Expand);
8857 + for (unsigned int x = 0; x < NumIntTypes; ++x) {
8858 + MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x];
8860 + // GPU also does not have divrem function for signed or unsigned
8861 + setOperationAction(ISD::SDIVREM, VT, Expand);
8863 + // GPU does not have [S|U]MUL_LOHI functions as a single instruction
8864 + setOperationAction(ISD::SMUL_LOHI, VT, Expand);
8865 + setOperationAction(ISD::UMUL_LOHI, VT, Expand);
8867 + // GPU doesn't have a rotl, rotr, or byteswap instruction
8868 + setOperationAction(ISD::ROTR, VT, Expand);
8869 + setOperationAction(ISD::BSWAP, VT, Expand);
8871 + // GPU doesn't have any counting operators
8872 + setOperationAction(ISD::CTPOP, VT, Expand);
8873 + setOperationAction(ISD::CTTZ, VT, Expand);
8874 + setOperationAction(ISD::CTLZ, VT, Expand);
8877 + for (unsigned int ii = 0; ii < NumVectorTypes; ++ii) {
8878 + MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii];
8880 + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
8881 + setOperationAction(ISD::SDIVREM, VT, Expand);
8882 + setOperationAction(ISD::SMUL_LOHI, VT, Expand);
8883 + // setOperationAction(ISD::VSETCC, VT, Expand);
8884 + setOperationAction(ISD::SELECT_CC, VT, Expand);
8887 + if (STM.device()->isSupported(AMDGPUDeviceInfo::LongOps)) {
8888 + setOperationAction(ISD::MULHU, MVT::i64, Expand);
8889 + setOperationAction(ISD::MULHU, MVT::v2i64, Expand);
8890 + setOperationAction(ISD::MULHS, MVT::i64, Expand);
8891 + setOperationAction(ISD::MULHS, MVT::v2i64, Expand);
8892 + setOperationAction(ISD::ADD, MVT::v2i64, Expand);
8893 + setOperationAction(ISD::SREM, MVT::v2i64, Expand);
8894 + setOperationAction(ISD::Constant , MVT::i64 , Legal);
8895 + setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
8896 + setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand);
8897 + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand);
8898 + setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand);
8899 + setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand);
8901 + if (STM.device()->isSupported(AMDGPUDeviceInfo::DoubleOps)) {
8902 + // we support loading/storing v2f64 but not operations on the type
8903 + setOperationAction(ISD::FADD, MVT::v2f64, Expand);
8904 + setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
8905 + setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
8906 + setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand);
8907 + setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
8908 + setOperationAction(ISD::ConstantFP , MVT::f64 , Legal);
8909 + // We want to expand vector conversions into their scalar
8911 + setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand);
8912 + setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand);
8913 + setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand);
8914 + setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand);
8915 + setOperationAction(ISD::FABS, MVT::f64, Expand);
8916 + setOperationAction(ISD::FABS, MVT::v2f64, Expand);
8918 + // TODO: Fix the UDIV24 algorithm so it works for these
8919 + // types correctly. This needs vector comparisons
8920 + // for this to work correctly.
8921 + setOperationAction(ISD::UDIV, MVT::v2i8, Expand);
8922 + setOperationAction(ISD::UDIV, MVT::v4i8, Expand);
8923 + setOperationAction(ISD::UDIV, MVT::v2i16, Expand);
8924 + setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
8925 + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
8926 + setOperationAction(ISD::SUBC, MVT::Other, Expand);
8927 + setOperationAction(ISD::ADDE, MVT::Other, Expand);
8928 + setOperationAction(ISD::ADDC, MVT::Other, Expand);
8929 + setOperationAction(ISD::BRCOND, MVT::Other, Custom);
8930 + setOperationAction(ISD::BR_JT, MVT::Other, Expand);
8931 + setOperationAction(ISD::BRIND, MVT::Other, Expand);
8932 + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
8935 + // Use the default implementation.
8936 + setOperationAction(ISD::ConstantFP , MVT::f32 , Legal);
8937 + setOperationAction(ISD::Constant , MVT::i32 , Legal);
8939 + setSchedulingPreference(Sched::RegPressure);
8940 + setPow2DivIsCheap(false);
8941 + setSelectIsExpensive(true);
8942 + setJumpIsExpensive(true);
8944 + maxStoresPerMemcpy = 4096;
8945 + maxStoresPerMemmove = 4096;
8946 + maxStoresPerMemset = 4096;
8951 +AMDGPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
8952 + const CallInst &I, unsigned Intrinsic) const {
8956 +// The backend supports 32 and 64 bit floating point immediates
8958 +AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
8959 + if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
8960 + || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
8968 +AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
8969 + if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
8970 + || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
8978 +// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to
8979 +// be zero. Op is expected to be a target specific node. Used by DAG
8983 +AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
8987 + const SelectionDAG &DAG,
8988 + unsigned Depth) const {
8991 + KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything
8992 + switch (Op.getOpcode()) {
8994 + case ISD::SELECT_CC:
8995 + DAG.ComputeMaskedBits(
9001 + DAG.ComputeMaskedBits(
9006 + assert((KnownZero & KnownOne) == 0
9007 + && "Bits known to be one AND zero?");
9008 + assert((KnownZero2 & KnownOne2) == 0
9009 + && "Bits known to be one AND zero?");
9010 + // Only known if known in both the LHS and RHS
9011 + KnownOne &= KnownOne2;
9012 + KnownZero &= KnownZero2;
9017 +//===----------------------------------------------------------------------===//
9018 +// Other Lowering Hooks
9019 +//===----------------------------------------------------------------------===//
9022 +AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
9023 + EVT OVT = Op.getValueType();
9025 + if (OVT.getScalarType() == MVT::i64) {
9026 + DST = LowerSDIV64(Op, DAG);
9027 + } else if (OVT.getScalarType() == MVT::i32) {
9028 + DST = LowerSDIV32(Op, DAG);
9029 + } else if (OVT.getScalarType() == MVT::i16
9030 + || OVT.getScalarType() == MVT::i8) {
9031 + DST = LowerSDIV24(Op, DAG);
9033 + DST = SDValue(Op.getNode(), 0);
9039 +AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
9040 + EVT OVT = Op.getValueType();
9042 + if (OVT.getScalarType() == MVT::i64) {
9043 + DST = LowerSREM64(Op, DAG);
9044 + } else if (OVT.getScalarType() == MVT::i32) {
9045 + DST = LowerSREM32(Op, DAG);
9046 + } else if (OVT.getScalarType() == MVT::i16) {
9047 + DST = LowerSREM16(Op, DAG);
9048 + } else if (OVT.getScalarType() == MVT::i8) {
9049 + DST = LowerSREM8(Op, DAG);
9051 + DST = SDValue(Op.getNode(), 0);
9057 +AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const {
9058 + SDValue Data = Op.getOperand(0);
9059 + VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1));
9060 + DebugLoc DL = Op.getDebugLoc();
9061 + EVT DVT = Data.getValueType();
9062 + EVT BVT = BaseType->getVT();
9063 + unsigned baseBits = BVT.getScalarType().getSizeInBits();
9064 + unsigned srcBits = DVT.isSimple() ? DVT.getScalarType().getSizeInBits() : 1;
9065 + unsigned shiftBits = srcBits - baseBits;
9066 + if (srcBits < 32) {
9067 + // If the op is less than 32 bits, then it needs to extend to 32bits
9068 + // so it can properly keep the upper bits valid.
9069 + EVT IVT = genIntType(32, DVT.isVector() ? DVT.getVectorNumElements() : 1);
9070 + Data = DAG.getNode(ISD::ZERO_EXTEND, DL, IVT, Data);
9071 + shiftBits = 32 - baseBits;
9074 + SDValue Shift = DAG.getConstant(shiftBits, DVT);
9075 + // Shift left by 'Shift' bits.
9076 + Data = DAG.getNode(ISD::SHL, DL, DVT, Data, Shift);
9077 + // Signed shift Right by 'Shift' bits.
9078 + Data = DAG.getNode(ISD::SRA, DL, DVT, Data, Shift);
9079 + if (srcBits < 32) {
9080 + // Once the sign extension is done, the op needs to be converted to
9081 + // its original type.
9082 + Data = DAG.getSExtOrTrunc(Data, DL, Op.getOperand(0).getValueType());
9087 +AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const {
9088 + int iSize = (size * numEle);
9089 + int vEle = (iSize >> ((size == 64) ? 6 : 5));
9095 + return EVT(MVT::i64);
9097 + return EVT(MVT::getVectorVT(MVT::i64, vEle));
9101 + return EVT(MVT::i32);
9103 + return EVT(MVT::getVectorVT(MVT::i32, vEle));
9109 +AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
9110 + SDValue Chain = Op.getOperand(0);
9111 + SDValue Cond = Op.getOperand(1);
9112 + SDValue Jump = Op.getOperand(2);
9114 + Result = DAG.getNode(
9115 + AMDGPUISD::BRANCH_COND,
9117 + Op.getValueType(),
9118 + Chain, Jump, Cond);
9123 +AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
9124 + DebugLoc DL = Op.getDebugLoc();
9125 + EVT OVT = Op.getValueType();
9126 + SDValue LHS = Op.getOperand(0);
9127 + SDValue RHS = Op.getOperand(1);
9130 + if (!OVT.isVector()) {
9133 + } else if (OVT.getVectorNumElements() == 2) {
9134 + INTTY = MVT::v2i32;
9135 + FLTTY = MVT::v2f32;
9136 + } else if (OVT.getVectorNumElements() == 4) {
9137 + INTTY = MVT::v4i32;
9138 + FLTTY = MVT::v4f32;
9140 + unsigned bitsize = OVT.getScalarType().getSizeInBits();
9141 + // char|short jq = ia ^ ib;
9142 + SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
9144 + // jq = jq >> (bitsize - 2)
9145 + jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));
9148 + jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
9151 + jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
9153 + // int ia = (int)LHS;
9154 + SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
9156 + // int ib = (int)RHS;
9157 + SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
9159 + // float fa = (float)ia;
9160 + SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
9162 + // float fb = (float)ib;
9163 + SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
9165 + // float fq = native_divide(fa, fb);
9166 + SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb);
9168 + // fq = trunc(fq);
9169 + fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
9171 + // float fqneg = -fq;
9172 + SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
9174 + // float fr = mad(fqneg, fb, fa);
9175 + SDValue fr = DAG.getNode(AMDGPUISD::MAD, DL, FLTTY, fqneg, fb, fa);
9177 + // int iq = (int)fq;
9178 + SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
9181 + fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
9184 + fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
9186 + // int cv = fr >= fb;
9188 + if (INTTY == MVT::i32) {
9189 + cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
9191 + cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
9193 + // jq = (cv ? jq : 0);
9194 + jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq,
9195 + DAG.getConstant(0, OVT));
9197 + iq = DAG.getSExtOrTrunc(iq, DL, OVT);
9198 + iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
9203 +AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
9204 + DebugLoc DL = Op.getDebugLoc();
9205 + EVT OVT = Op.getValueType();
9206 + SDValue LHS = Op.getOperand(0);
9207 + SDValue RHS = Op.getOperand(1);
9208 + // The LowerSDIV32 function generates equivalent to the following IL.
9213 + // iadd r0, r0, r10
9214 + // iadd r1, r1, r11
9215 + // ixor r0, r0, r10
9216 + // ixor r1, r1, r11
9217 + // udiv r0, r0, r1
9218 + // ixor r10, r10, r11
9219 + // iadd r0, r0, r10
9220 + // ixor DST, r0, r10
9229 + SDValue r10 = DAG.getSelectCC(DL,
9230 + r0, DAG.getConstant(0, OVT),
9231 + DAG.getConstant(-1, MVT::i32),
9232 + DAG.getConstant(0, MVT::i32),
9236 + SDValue r11 = DAG.getSelectCC(DL,
9237 + r1, DAG.getConstant(0, OVT),
9238 + DAG.getConstant(-1, MVT::i32),
9239 + DAG.getConstant(0, MVT::i32),
9242 + // iadd r0, r0, r10
9243 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
9245 + // iadd r1, r1, r11
9246 + r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
9248 + // ixor r0, r0, r10
9249 + r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
9251 + // ixor r1, r1, r11
9252 + r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
9254 + // udiv r0, r0, r1
9255 + r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
9257 + // ixor r10, r10, r11
9258 + r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
9260 + // iadd r0, r0, r10
9261 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
9263 + // ixor DST, r0, r10
9264 + SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
9269 +AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
9270 + return SDValue(Op.getNode(), 0);
9274 +AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const {
9275 + DebugLoc DL = Op.getDebugLoc();
9276 + EVT OVT = Op.getValueType();
9277 + MVT INTTY = MVT::i32;
9278 + if (OVT == MVT::v2i8) {
9279 + INTTY = MVT::v2i32;
9280 + } else if (OVT == MVT::v4i8) {
9281 + INTTY = MVT::v4i32;
9283 + SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
9284 + SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
9285 + LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
9286 + LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
9291 +AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const {
9292 + DebugLoc DL = Op.getDebugLoc();
9293 + EVT OVT = Op.getValueType();
9294 + MVT INTTY = MVT::i32;
9295 + if (OVT == MVT::v2i16) {
9296 + INTTY = MVT::v2i32;
9297 + } else if (OVT == MVT::v4i16) {
9298 + INTTY = MVT::v4i32;
9300 + SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
9301 + SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
9302 + LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
9303 + LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
9308 +AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
9309 + DebugLoc DL = Op.getDebugLoc();
9310 + EVT OVT = Op.getValueType();
9311 + SDValue LHS = Op.getOperand(0);
9312 + SDValue RHS = Op.getOperand(1);
9313 + // The LowerSREM32 function generates equivalent to the following IL.
9318 + // iadd r0, r0, r10
9319 + // iadd r1, r1, r11
9320 + // ixor r0, r0, r10
9321 + // ixor r1, r1, r11
9322 + // udiv r20, r0, r1
9323 + // umul r20, r20, r1
9324 + // sub r0, r0, r20
9325 + // iadd r0, r0, r10
9326 + // ixor DST, r0, r10
9335 + SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);
9338 + SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);
9340 + // iadd r0, r0, r10
9341 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
9343 + // iadd r1, r1, r11
9344 + r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
9346 + // ixor r0, r0, r10
9347 + r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
9349 + // ixor r1, r1, r11
9350 + r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
9352 + // udiv r20, r0, r1
9353 + SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1);
9355 + // umul r20, r20, r1
9356 + r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);
9358 + // sub r0, r0, r20
9359 + r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
9361 + // iadd r0, r0, r10
9362 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
9364 + // ixor DST, r0, r10
9365 + SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
9370 +AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
9371 + return SDValue(Op.getNode(), 0);
9373 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILNIDevice.cpp llvm-r600/lib/Target/R600/AMDILNIDevice.cpp
9374 --- llvm-3.2.src/lib/Target/R600/AMDILNIDevice.cpp 1970-01-01 01:00:00.000000000 +0100
9375 +++ llvm-r600/lib/Target/R600/AMDILNIDevice.cpp 2013-01-25 19:43:57.446716388 +0100
9377 +//===-- AMDILNIDevice.cpp - Device Info for Northern Islands devices ------===//
9379 +// The LLVM Compiler Infrastructure
9381 +// This file is distributed under the University of Illinois Open Source
9382 +// License. See LICENSE.TXT for details.
9385 +//==-----------------------------------------------------------------------===//
9386 +#include "AMDILNIDevice.h"
9387 +#include "AMDILEvergreenDevice.h"
9388 +#include "AMDGPUSubtarget.h"
9390 +using namespace llvm;
9392 +AMDGPUNIDevice::AMDGPUNIDevice(AMDGPUSubtarget *ST)
9393 + : AMDGPUEvergreenDevice(ST) {
9394 + std::string name = ST->getDeviceName();
9395 + if (name == "caicos") {
9396 + DeviceFlag = OCL_DEVICE_CAICOS;
9397 + } else if (name == "turks") {
9398 + DeviceFlag = OCL_DEVICE_TURKS;
9399 + } else if (name == "cayman") {
9400 + DeviceFlag = OCL_DEVICE_CAYMAN;
9402 + DeviceFlag = OCL_DEVICE_BARTS;
9405 +AMDGPUNIDevice::~AMDGPUNIDevice() {
9409 +AMDGPUNIDevice::getMaxLDSSize() const {
9410 + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
9411 + return MAX_LDS_SIZE_900;
9418 +AMDGPUNIDevice::getGeneration() const {
9419 + return AMDGPUDeviceInfo::HD6XXX;
9423 +AMDGPUCaymanDevice::AMDGPUCaymanDevice(AMDGPUSubtarget *ST)
9424 + : AMDGPUNIDevice(ST) {
9428 +AMDGPUCaymanDevice::~AMDGPUCaymanDevice() {
9432 +AMDGPUCaymanDevice::setCaps() {
9433 + if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
9434 + mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
9435 + mHWBits.set(AMDGPUDeviceInfo::FMA);
9437 + mHWBits.set(AMDGPUDeviceInfo::Signed24BitOps);
9438 + mSWBits.reset(AMDGPUDeviceInfo::Signed24BitOps);
9439 + mSWBits.set(AMDGPUDeviceInfo::ArenaSegment);
9442 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILNIDevice.h llvm-r600/lib/Target/R600/AMDILNIDevice.h
9443 --- llvm-3.2.src/lib/Target/R600/AMDILNIDevice.h 1970-01-01 01:00:00.000000000 +0100
9444 +++ llvm-r600/lib/Target/R600/AMDILNIDevice.h 2013-01-25 19:43:57.446716388 +0100
9446 +//===------- AMDILNIDevice.h - Define NI Device for AMDIL -*- C++ -*------===//
9448 +// The LLVM Compiler Infrastructure
9450 +// This file is distributed under the University of Illinois Open Source
9451 +// License. See LICENSE.TXT for details.
9453 +//==-----------------------------------------------------------------------===//
9455 +/// \brief Interface for the subtarget data classes.
9457 +/// This file will define the interface that each generation needs to
9458 +/// implement in order to correctly answer queries on the capabilities of the
9459 +/// specific hardware.
9460 +//===---------------------------------------------------------------------===//
9461 +#ifndef AMDILNIDEVICE_H
9462 +#define AMDILNIDEVICE_H
9463 +#include "AMDILEvergreenDevice.h"
9464 +#include "AMDGPUSubtarget.h"
9468 +class AMDGPUSubtarget;
9469 +//===---------------------------------------------------------------------===//
9470 +// NI generation of devices and their respective sub classes
9471 +//===---------------------------------------------------------------------===//
9473 +/// \brief The AMDGPUNIDevice is the base class for all Northern Island series of
9476 +/// It is very similar to the AMDGPUEvergreenDevice, with the major
9477 +/// exception being differences in wavefront size and hardware capabilities. The
9478 +/// NI devices are all 64 wide wavefronts and also add support for signed 24 bit
9479 +/// integer operations
9480 +class AMDGPUNIDevice : public AMDGPUEvergreenDevice {
9482 + AMDGPUNIDevice(AMDGPUSubtarget*);
9483 + virtual ~AMDGPUNIDevice();
9484 + virtual size_t getMaxLDSSize() const;
9485 + virtual uint32_t getGeneration() const;
9488 +/// Just as the AMDGPUCypressDevice is the double capable version of the
9489 +/// AMDGPUEvergreenDevice, the AMDGPUCaymanDevice is the double capable version
9490 +/// of the AMDGPUNIDevice. The other major difference is that the Cayman Device
9491 +/// has 4 wide ALU's, whereas the rest of the NI family is a 5 wide.
9492 +class AMDGPUCaymanDevice: public AMDGPUNIDevice {
9494 + AMDGPUCaymanDevice(AMDGPUSubtarget*);
9495 + virtual ~AMDGPUCaymanDevice();
9497 + virtual void setCaps();
9500 +static const unsigned int MAX_LDS_SIZE_900 = AMDGPUDevice::MAX_LDS_SIZE_800;
9501 +} // namespace llvm
9502 +#endif // AMDILNIDEVICE_H
9503 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILPeepholeOptimizer.cpp llvm-r600/lib/Target/R600/AMDILPeepholeOptimizer.cpp
9504 --- llvm-3.2.src/lib/Target/R600/AMDILPeepholeOptimizer.cpp 1970-01-01 01:00:00.000000000 +0100
9505 +++ llvm-r600/lib/Target/R600/AMDILPeepholeOptimizer.cpp 2013-01-25 19:43:57.450049721 +0100
9507 +//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
9509 +// The LLVM Compiler Infrastructure
9511 +// This file is distributed under the University of Illinois Open Source
9512 +// License. See LICENSE.TXT for details.
9515 +//==-----------------------------------------------------------------------===//
9517 +#define DEBUG_TYPE "PeepholeOpt"
9519 +#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
9524 +#include "AMDILDevices.h"
9525 +#include "AMDGPUInstrInfo.h"
9526 +#include "llvm/ADT/Statistic.h"
9527 +#include "llvm/ADT/StringExtras.h"
9528 +#include "llvm/ADT/StringRef.h"
9529 +#include "llvm/ADT/Twine.h"
9530 +#include "llvm/Constants.h"
9531 +#include "llvm/CodeGen/MachineFunction.h"
9532 +#include "llvm/CodeGen/MachineFunctionAnalysis.h"
9533 +#include "llvm/Function.h"
9534 +#include "llvm/Instructions.h"
9535 +#include "llvm/Module.h"
9536 +#include "llvm/Support/Debug.h"
9537 +#include "llvm/Support/MathExtras.h"
9542 +STATISTIC(PointerAssignments, "Number of dynamic pointer "
9543 + "assigments discovered");
9544 +STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
9547 +using namespace llvm;
9548 +// The Peephole optimization pass is used to do simple last minute optimizations
9549 +// that are required for correct code or to remove redundant functions
9554 +class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
9556 + TargetMachine &TM;
9558 + AMDGPUPeepholeOpt(TargetMachine &tm);
9559 + ~AMDGPUPeepholeOpt();
9560 + const char *getPassName() const;
9561 + bool runOnFunction(Function &F);
9562 + bool doInitialization(Module &M);
9563 + bool doFinalization(Module &M);
9564 + void getAnalysisUsage(AnalysisUsage &AU) const;
9567 + // Function to initiate all of the instruction level optimizations.
9568 + bool instLevelOptimizations(BasicBlock::iterator *inst);
9569 + // Quick check to see if we need to dump all of the pointers into the
9570 + // arena. If this is correct, then we set all pointers to exist in arena. This
9571 + // is a workaround for aliasing of pointers in a struct/union.
9572 + bool dumpAllIntoArena(Function &F);
9573 + // Because I don't want to invalidate any pointers while in the
9574 + // safeNestedForEachFunction. I push atomic conversions to a vector and handle
9575 + // it later. This function does the conversions if required.
9576 + void doAtomicConversionIfNeeded(Function &F);
9577 + // Because __amdil_is_constant cannot be properly evaluated if
9578 + // optimizations are disabled, the call's are placed in a vector
9579 + // and evaluated after the __amdil_image* functions are evaluated
9580 + // which should allow the __amdil_is_constant function to be
9581 + // evaluated correctly.
9582 + void doIsConstCallConversionIfNeeded();
9585 + bool mConvertAtomics;
9586 + CodeGenOpt::Level optLevel;
9587 + // Run a series of tests to see if we can optimize a CALL instruction.
9588 + bool optimizeCallInst(BasicBlock::iterator *bbb);
9589 + // A peephole optimization to optimize bit extract sequences.
9590 + bool optimizeBitExtract(Instruction *inst);
9591 + // A peephole optimization to optimize bit insert sequences.
9592 + bool optimizeBitInsert(Instruction *inst);
9593 + bool setupBitInsert(Instruction *base,
9594 + Instruction *&src,
9596 + Constant *&shift);
9597 + // Expand the bit field insert instruction on versions of OpenCL that
9598 + // don't support it.
9599 + bool expandBFI(CallInst *CI);
9600 + // Expand the bit field mask instruction on version of OpenCL that
9601 + // don't support it.
9602 + bool expandBFM(CallInst *CI);
9603 + // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
9604 + // this case we need to expand them. These functions check for 24bit functions
9605 + // and then expand.
9606 + bool isSigned24BitOps(CallInst *CI);
9607 + void expandSigned24BitOps(CallInst *CI);
9608 + // One optimization that can occur is that if the required workgroup size is
9609 + // specified then the result of get_local_size is known at compile time and
9610 + // can be returned accordingly.
9611 + bool isRWGLocalOpt(CallInst *CI);
9612 + // On northern island cards, the division is slightly less accurate than on
9613 + // previous generations, so we need to utilize a more accurate division. So we
9614 + // can translate the accurate divide to a normal divide on all other cards.
9615 + bool convertAccurateDivide(CallInst *CI);
9616 + void expandAccurateDivide(CallInst *CI);
9617 + // If the alignment is set incorrectly, it can produce really inefficient
9618 + // code. This checks for this scenario and fixes it if possible.
9619 + bool correctMisalignedMemOp(Instruction *inst);
9621 + // If we are in no opt mode, then we need to make sure that
9622 + // local samplers are properly propagated as constant propagation
9623 + // doesn't occur and we need to know the value of kernel defined
9624 + // samplers at compile time.
9625 + bool propagateSamplerInst(CallInst *CI);
9627 + // Helper functions
9629 + // Group of functions that recursively calculate the size of a structure based
9630 + // on its sub-types.
9631 + size_t getTypeSize(Type * const T, bool dereferencePtr = false);
9632 + size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
9633 + size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
9634 + size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
9635 + size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
9636 + size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
9637 + size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
9638 + size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
9640 + LLVMContext *mCTX;
9642 + const AMDGPUSubtarget *mSTM;
9643 + SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
9644 + SmallVector<CallInst *, 16> isConstVec;
9645 +}; // class AMDGPUPeepholeOpt
9646 + char AMDGPUPeepholeOpt::ID = 0;
9648 +// A template function that has two levels of looping before calling the
9649 +// function with a pointer to the current iterator.
9650 +template<class InputIterator, class SecondIterator, class Function>
9651 +Function safeNestedForEach(InputIterator First, InputIterator Last,
9652 + SecondIterator S, Function F) {
9653 + for ( ; First != Last; ++First) {
9654 + SecondIterator sf, sl;
9655 + for (sf = First->begin(), sl = First->end();
9665 +} // anonymous namespace
9669 + createAMDGPUPeepholeOpt(TargetMachine &tm) {
9670 + return new AMDGPUPeepholeOpt(tm);
9672 +} // llvm namespace
9674 +AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
9675 + : FunctionPass(ID), TM(tm) {
9677 + optLevel = TM.getOptLevel();
9681 +AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() {
9685 +AMDGPUPeepholeOpt::getPassName() const {
9686 + return "AMDGPU PeepHole Optimization Pass";
9690 +containsPointerType(Type *Ty) {
9694 + switch(Ty->getTypeID()) {
9697 + case Type::StructTyID: {
9698 + const StructType *ST = dyn_cast<StructType>(Ty);
9699 + for (StructType::element_iterator stb = ST->element_begin(),
9700 + ste = ST->element_end(); stb != ste; ++stb) {
9701 + if (!containsPointerType(*stb)) {
9708 + case Type::VectorTyID:
9709 + case Type::ArrayTyID:
9710 + return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
9711 + case Type::PointerTyID:
9718 +AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) {
9719 + bool dumpAll = false;
9720 + for (Function::const_arg_iterator cab = F.arg_begin(),
9721 + cae = F.arg_end(); cab != cae; ++cab) {
9722 + const Argument *arg = cab;
9723 + const PointerType *PT = dyn_cast<PointerType>(arg->getType());
9727 + Type *DereferencedType = PT->getElementType();
9728 + if (!dyn_cast<StructType>(DereferencedType)
9732 + if (!containsPointerType(DereferencedType)) {
9735 + // FIXME: Because a pointer inside of a struct/union may be aliased to
9736 + // another pointer we need to take the conservative approach and place all
9737 + // pointers into the arena until more advanced detection is implemented.
9743 +AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
9744 + if (isConstVec.empty()) {
9747 + for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
9748 + CallInst *CI = isConstVec[x];
9749 + Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
9750 + Type *aType = Type::getInt32Ty(*mCTX);
9751 + Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
9752 + : ConstantInt::get(aType, 0);
9753 + CI->replaceAllUsesWith(Val);
9754 + CI->eraseFromParent();
9756 + isConstVec.clear();
9759 +AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) {
9760 + // Don't do anything if we don't have any atomic operations.
9761 + if (atomicFuncs.empty()) {
9764 + // Change the function name for the atomic if it is required
9765 + uint32_t size = atomicFuncs.size();
9766 + for (uint32_t x = 0; x < size; ++x) {
9767 + atomicFuncs[x].first->setOperand(
9768 + atomicFuncs[x].first->getNumOperands()-1,
9769 + atomicFuncs[x].second);
9773 + if (mConvertAtomics) {
9779 +AMDGPUPeepholeOpt::runOnFunction(Function &MF) {
9782 + mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
9786 + mCTX = &MF.getType()->getContext();
9787 + mConvertAtomics = true;
9788 + safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
9789 + std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
9792 + doAtomicConversionIfNeeded(MF);
9793 + doIsConstCallConversionIfNeeded();
9802 +AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) {
9803 + Instruction *inst = (*bbb);
9804 + CallInst *CI = dyn_cast<CallInst>(inst);
9808 + if (isSigned24BitOps(CI)) {
9809 + expandSigned24BitOps(CI);
9811 + CI->eraseFromParent();
9814 + if (propagateSamplerInst(CI)) {
9817 + if (expandBFI(CI) || expandBFM(CI)) {
9819 + CI->eraseFromParent();
9822 + if (convertAccurateDivide(CI)) {
9823 + expandAccurateDivide(CI);
9825 + CI->eraseFromParent();
9829 + StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
9830 + if (calleeName.startswith("__amdil_is_constant")) {
9831 + // If we do not have optimizations, then this
9832 + // cannot be properly evaluated, so we add the
9833 + // call instruction to a vector and process
9834 + // them at the end of processing after the
9835 + // samplers have been correctly handled.
9836 + if (optLevel == CodeGenOpt::None) {
9837 + isConstVec.push_back(CI);
9840 + Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
9841 + Type *aType = Type::getInt32Ty(*mCTX);
9842 + Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
9843 + : ConstantInt::get(aType, 0);
9844 + CI->replaceAllUsesWith(Val);
9846 + CI->eraseFromParent();
9851 + if (calleeName.equals("__amdil_is_asic_id_i32")) {
9852 + ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
9853 + Type *aType = Type::getInt32Ty(*mCTX);
9856 + Val = ConstantInt::get(aType,
9857 + mSTM->device()->getDeviceFlag() & CV->getZExtValue());
9859 + Val = ConstantInt::get(aType, 0);
9861 + CI->replaceAllUsesWith(Val);
9863 + CI->eraseFromParent();
9866 + Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
9870 + if (F->getName().startswith("__atom") && !CI->getNumUses()
9871 + && F->getName().find("_xchg") == StringRef::npos) {
9872 + std::string buffer(F->getName().str() + "_noret");
9873 + F = dyn_cast<Function>(
9874 + F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
9875 + atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F));
9878 + if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
9879 + && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
9882 + if (!mConvertAtomics) {
9885 + StringRef name = F->getName();
9886 + if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
9887 + mConvertAtomics = false;
9893 +AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
9894 + Instruction *&src,
9896 + Constant *&shift) {
9899 + dbgs() << "Null pointer passed into function.\n";
9903 + bool andOp = false;
9904 + if (base->getOpcode() == Instruction::Shl) {
9905 + shift = dyn_cast<Constant>(base->getOperand(1));
9906 + } else if (base->getOpcode() == Instruction::And) {
9907 + mask = dyn_cast<Constant>(base->getOperand(1));
9911 + dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
9913 + // If the base is neither a Shl nor an And, we don't fit any of the patterns above.
9916 + src = dyn_cast<Instruction>(base->getOperand(0));
9919 + dbgs() << "Failed setup since the base operand is not an instruction!\n";
9923 + // If we find an 'and' operation, then we don't need to
9924 + // find the next operation as we already know the
9925 + // bits that are valid at this point.
9929 + if (src->getOpcode() == Instruction::Shl && !shift) {
9930 + shift = dyn_cast<Constant>(src->getOperand(1));
9931 + src = dyn_cast<Instruction>(src->getOperand(0));
9932 + } else if (src->getOpcode() == Instruction::And && !mask) {
9933 + mask = dyn_cast<Constant>(src->getOperand(1));
9935 + if (!mask && !shift) {
9937 + dbgs() << "Failed setup since both mask and shift are NULL!\n";
9939 + // Did not find a constant mask or a shift.
9945 +AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) {
9949 + if (!inst->isBinaryOp()) {
9952 + if (inst->getOpcode() != Instruction::Or) {
9955 + if (optLevel == CodeGenOpt::None) {
9958 + // We want to do an optimization on a sequence of ops that in the end equals a
9959 + // single ISA instruction.
9960 + // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
9961 + // Some simplified versions of this pattern are as follows:
9962 + // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
9963 + // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
9964 + // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
9965 + // (A & B) | (D << F) when (1 << F) >= B
9966 + // (A << C) | (D & E) when (1 << C) >= E
9967 + if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
9968 + // The HD4XXX hardware doesn't support the ubit_insert instruction.
9971 + Type *aType = inst->getType();
9972 + bool isVector = aType->isVectorTy();
9974 + // This optimization only works on 32bit integers.
9975 + if (aType->getScalarType()
9976 + != Type::getInt32Ty(inst->getContext())) {
9980 + const VectorType *VT = dyn_cast<VectorType>(aType);
9981 + numEle = VT->getNumElements();
9982 + // We currently cannot support more than 4 elements in an intrinsic and we
9983 + // cannot support Vec3 types.
9984 + if (numEle > 4 || numEle == 3) {
9988 + // TODO: Handle vectors.
9991 + dbgs() << "!!! Vectors are not supported yet!\n";
9995 + Instruction *LHSSrc = NULL, *RHSSrc = NULL;
9996 + Constant *LHSMask = NULL, *RHSMask = NULL;
9997 + Constant *LHSShift = NULL, *RHSShift = NULL;
9998 + Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
9999 + Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
10000 + if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
10002 + dbgs() << "Found an OR Operation that failed setup!\n";
10004 + if (LHS) { LHS->dump(); }
10005 + if (LHSSrc) { LHSSrc->dump(); }
10006 + if (LHSMask) { LHSMask->dump(); }
10007 + if (LHSShift) { LHSShift->dump(); }
10009 + // There was an issue with the setup for BitInsert.
10012 + if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
10014 + dbgs() << "Found an OR Operation that failed setup!\n";
10016 + if (RHS) { RHS->dump(); }
10017 + if (RHSSrc) { RHSSrc->dump(); }
10018 + if (RHSMask) { RHSMask->dump(); }
10019 + if (RHSShift) { RHSShift->dump(); }
10021 + // There was an issue with the setup for BitInsert.
10025 + dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
10026 + dbgs() << "Op: "; inst->dump();
10027 + dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
10028 + dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
10029 + dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
10030 + dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
10031 + dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
10032 + dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
10033 + dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
10034 + dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
10036 + Constant *offset = NULL;
10037 + Constant *width = NULL;
10038 + uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
10039 + uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
10040 + uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
10041 + uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
10042 + lhsMaskVal = (LHSMask
10043 + ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
10044 + rhsMaskVal = (RHSMask
10045 + ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
10046 + lhsShiftVal = (LHSShift
10047 + ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
10048 + rhsShiftVal = (RHSShift
10049 + ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
10050 + lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
10051 + rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
10052 + lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
10053 + rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
10054 + // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
10056 + dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
10057 + dbgs() << (LHSShift ? " << C)" : ")") << " | ((D" ;
10058 + dbgs() << (RHSMask ? " & E)" : ")");
10059 + dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
10060 + dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
10061 + dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
10062 + dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
10063 + dbgs() << "width(B) = " << lhsMaskWidth;
10064 + dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
10065 + dbgs() << "offset(B) = " << lhsMaskOffset;
10066 + dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
10067 + dbgs() << "Constraints: \n";
10068 + dbgs() << "\t(1) B ^ E == 0\n";
10069 + dbgs() << "\t(2-LHS) B is a mask\n";
10070 + dbgs() << "\t(2-LHS) E is a mask\n";
10071 + dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
10072 + dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
10074 + if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
10076 + dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
10077 + dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
10078 + dbgs() << "Failed constraint 1!\n";
10083 + dbgs() << "LHS = " << lhsMaskOffset << "";
10084 + dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
10085 + dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
10086 + dbgs() << "\nRHS = " << rhsMaskOffset << "";
10087 + dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
10088 + dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
10091 + if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
10092 + offset = ConstantInt::get(aType, lhsMaskOffset, false);
10093 + width = ConstantInt::get(aType, lhsMaskWidth, false);
10095 + if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
10097 + dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
10098 + dbgs() << "Failed constraint 2!\n";
10103 + LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
10105 + } else if (lhsShiftVal != lhsMaskOffset) {
10106 + LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
10110 + dbgs() << "Optimizing LHS!\n";
10112 + } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
10113 + offset = ConstantInt::get(aType, rhsMaskOffset, false);
10114 + width = ConstantInt::get(aType, rhsMaskWidth, false);
10117 + if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
10119 + dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
10120 + dbgs() << "Failed constraint 2!\n";
10125 + LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
10127 + } else if (rhsShiftVal != rhsMaskOffset) {
10128 + LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
10132 + dbgs() << "Optimizing RHS!\n";
10136 + dbgs() << "Failed constraint 3!\n";
10141 + dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
10142 + dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
10143 + dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
10144 + dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
10146 + if (!offset || !width) {
10148 + dbgs() << "Either width or offset are NULL, failed detection!\n";
10152 + // Lets create the function signature.
10153 + std::vector<Type *> callTypes;
10154 + callTypes.push_back(aType);
10155 + callTypes.push_back(aType);
10156 + callTypes.push_back(aType);
10157 + callTypes.push_back(aType);
10158 + FunctionType *funcType = FunctionType::get(aType, callTypes, false);
10159 + std::string name = "__amdil_ubit_insert";
10160 + if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
10162 + dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
10163 + getOrInsertFunction(llvm::StringRef(name), funcType));
10164 + Value *Operands[4] = {
10170 + CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
10172 + dbgs() << "Old Inst: ";
10174 + dbgs() << "New Inst: ";
10176 + dbgs() << "\n\n";
10178 + CI->insertBefore(inst);
10179 + inst->replaceAllUsesWith(CI);
10184 +AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) {
10188 + if (!inst->isBinaryOp()) {
10191 + if (inst->getOpcode() != Instruction::And) {
10194 + if (optLevel == CodeGenOpt::None) {
10197 + // We want to do some simple optimizations on Shift right/And patterns. The
10198 + // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
10199 + // value smaller than 32 and C is a mask. If C is a constant value, then the
10200 + // following transformation can occur. For signed integers, it turns into the
10201 + // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
10202 + // integers, it turns into the function call dst =
10203 + // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
10204 + // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
10205 + // Evergreen hardware.
10206 + if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
10207 + // This does not work on HD4XXX hardware.
10210 + Type *aType = inst->getType();
10211 + bool isVector = aType->isVectorTy();
10213 + // XXX Support vector types
10218 + // This only works on 32bit integers
10219 + if (aType->getScalarType()
10220 + != Type::getInt32Ty(inst->getContext())) {
10224 + const VectorType *VT = dyn_cast<VectorType>(aType);
10225 + numEle = VT->getNumElements();
10226 + // We currently cannot support more than 4 elements in an intrinsic and we
10227 + // cannot support Vec3 types.
10228 + if (numEle > 4 || numEle == 3) {
10232 + BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
10233 + // If the first operand is not a shift instruction, then we can return as it
10234 + // doesn't match this pattern.
10235 + if (!ShiftInst || !ShiftInst->isShift()) {
10238 + // If we are a shift left, then we don't match this pattern.
10239 + if (ShiftInst->getOpcode() == Instruction::Shl) {
10242 + bool isSigned = ShiftInst->isArithmeticShift();
10243 + Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
10244 + Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
10245 + // Lets make sure that the shift value and the and mask are constant integers.
10246 + if (!AndMask || !ShrVal) {
10249 + Constant *newMaskConst;
10250 + Constant *shiftValConst;
10252 + // Handle the vector case
10253 + std::vector<Constant *> maskVals;
10254 + std::vector<Constant *> shiftVals;
10255 + ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
10256 + ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
10257 + Type *scalarType = AndMaskVec->getType()->getScalarType();
10258 + assert(AndMaskVec->getNumOperands() ==
10259 + ShrValVec->getNumOperands() && "cannot have a "
10260 + "combination where the number of elements to a "
10261 + "shift and an and are different!");
10262 + for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
10263 + ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
10264 + ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
10265 + if (!AndCI || !ShiftIC) {
10268 + uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
10269 + if (!isMask_32(maskVal)) {
10272 + maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
10273 + uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
10274 + // If the mask or shiftval is greater than the bitcount, then break out.
10275 + if (maskVal >= 32 || shiftVal >= 32) {
10278 + // If the mask val is greater than the number of original bits left
10279 + // then this optimization is invalid.
10280 + if (maskVal > (32 - shiftVal)) {
10283 + maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
10284 + shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
10286 + newMaskConst = ConstantVector::get(maskVals);
10287 + shiftValConst = ConstantVector::get(shiftVals);
10289 + // Handle the scalar case
10290 + uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
10291 + // This must be a mask value where all lower bits are set to 1 and then any
10292 + // bit higher is set to 0.
10293 + if (!isMask_32(maskVal)) {
10296 + maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
10297 + // Count the number of bits set in the mask, this is the width of the
10298 + // resulting bit set that is extracted from the source value.
10299 + uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
10300 + // If the mask or shift val is greater than the bitcount, then break out.
10301 + if (maskVal >= 32 || shiftVal >= 32) {
10304 + // If the mask val is greater than the number of original bits left then
10305 + // this optimization is invalid.
10306 + if (maskVal > (32 - shiftVal)) {
10309 + newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
10310 + shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
10312 + // Lets create the function signature.
10313 + std::vector<Type *> callTypes;
10314 + callTypes.push_back(aType);
10315 + callTypes.push_back(aType);
10316 + callTypes.push_back(aType);
10317 + FunctionType *funcType = FunctionType::get(aType, callTypes, false);
10318 + std::string name = "llvm.AMDGPU.bit.extract.u32";
10320 + name += ".v" + itostr(numEle) + "i32";
10324 + // Lets create the function.
10326 + dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
10327 + getOrInsertFunction(llvm::StringRef(name), funcType));
10328 + Value *Operands[3] = {
10329 + ShiftInst->getOperand(0),
10333 + // Lets create the Call with the operands
10334 + CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
10335 + CI->setDoesNotAccessMemory();
10336 + CI->insertBefore(inst);
10337 + inst->replaceAllUsesWith(CI);
10342 +AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
10346 + Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
10347 + if (!LHS->getName().startswith("__amdil_bfi")) {
10350 + Type* type = CI->getOperand(0)->getType();
10351 + Constant *negOneConst = NULL;
10352 + if (type->isVectorTy()) {
10353 + std::vector<Constant *> negOneVals;
10354 + negOneConst = ConstantInt::get(CI->getContext(),
10355 + APInt(32, StringRef("-1"), 10));
10356 + for (size_t x = 0,
10357 + y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
10358 + negOneVals.push_back(negOneConst);
10360 + negOneConst = ConstantVector::get(negOneVals);
10362 + negOneConst = ConstantInt::get(CI->getContext(),
10363 + APInt(32, StringRef("-1"), 10));
10365 + // __amdil_bfi => (A & B) | (~A & C)
10366 + BinaryOperator *lhs =
10367 + BinaryOperator::Create(Instruction::And, CI->getOperand(0),
10368 + CI->getOperand(1), "bfi_and", CI);
10369 + BinaryOperator *rhs =
10370 + BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
10372 + rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
10374 + lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
10375 + CI->replaceAllUsesWith(lhs);
10380 +AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
10384 + Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
10385 + if (!LHS->getName().startswith("__amdil_bfm")) {
10388 + // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
10389 + Constant *newMaskConst = NULL;
10390 + Constant *newShiftConst = NULL;
10391 + Type* type = CI->getOperand(0)->getType();
10392 + if (type->isVectorTy()) {
10393 + std::vector<Constant*> newMaskVals, newShiftVals;
10394 + newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
10395 + newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
10396 + for (size_t x = 0,
10397 + y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
10398 + newMaskVals.push_back(newMaskConst);
10399 + newShiftVals.push_back(newShiftConst);
10401 + newMaskConst = ConstantVector::get(newMaskVals);
10402 + newShiftConst = ConstantVector::get(newShiftVals);
10404 + newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
10405 + newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
10407 + BinaryOperator *lhs =
10408 + BinaryOperator::Create(Instruction::And, CI->getOperand(0),
10409 + newMaskConst, "bfm_mask", CI);
10410 + lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
10411 + lhs, "bfm_shl", CI);
10412 + lhs = BinaryOperator::Create(Instruction::Sub, lhs,
10413 + newShiftConst, "bfm_sub", CI);
10414 + BinaryOperator *rhs =
10415 + BinaryOperator::Create(Instruction::And, CI->getOperand(1),
10416 + newMaskConst, "bfm_mask", CI);
10417 + lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
10418 + CI->replaceAllUsesWith(lhs);
10423 +AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) {
10424 + Instruction *inst = (*bbb);
10425 + if (optimizeCallInst(bbb)) {
10428 + if (optimizeBitExtract(inst)) {
10431 + if (optimizeBitInsert(inst)) {
10434 + if (correctMisalignedMemOp(inst)) {
10440 +AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
10441 + LoadInst *linst = dyn_cast<LoadInst>(inst);
10442 + StoreInst *sinst = dyn_cast<StoreInst>(inst);
10443 + unsigned alignment;
10444 + Type* Ty = inst->getType();
10446 + alignment = linst->getAlignment();
10447 + Ty = inst->getType();
10448 + } else if (sinst) {
10449 + alignment = sinst->getAlignment();
10450 + Ty = sinst->getValueOperand()->getType();
10454 + unsigned size = getTypeSize(Ty);
10455 + if (size == alignment || size < alignment) {
10458 + if (!Ty->isStructTy()) {
10461 + if (alignment < 4) {
10463 + linst->setAlignment(0);
10465 + } else if (sinst) {
10466 + sinst->setAlignment(0);
10473 +AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) {
10477 + Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
10478 + std::string namePrefix = LHS->getName().substr(0, 14);
10479 + if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
10480 + && namePrefix != "__amdil__imul24_high") {
10483 + if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
10490 +AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) {
10491 + assert(isSigned24BitOps(CI) && "Must be a "
10492 + "signed 24 bit operation to call this function!");
10493 + Value *LHS = CI->getOperand(CI->getNumOperands()-1);
10494 + // On 7XX and 8XX we do not have signed 24bit, so we need to
10495 + // expand it to the following:
10496 + // imul24 turns into 32bit imul
10497 + // imad24 turns into 32bit imad
10498 + // imul24_high turns into 32bit imulhigh
10499 + if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
10500 + Type *aType = CI->getOperand(0)->getType();
10501 + bool isVector = aType->isVectorTy();
10502 + int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
10503 + std::vector<Type*> callTypes;
10504 + callTypes.push_back(CI->getOperand(0)->getType());
10505 + callTypes.push_back(CI->getOperand(1)->getType());
10506 + callTypes.push_back(CI->getOperand(2)->getType());
10507 + FunctionType *funcType =
10508 + FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
10509 + std::string name = "__amdil_imad";
10511 + name += "_v" + itostr(numEle) + "i32";
10515 + Function *Func = dyn_cast<Function>(
10516 + CI->getParent()->getParent()->getParent()->
10517 + getOrInsertFunction(llvm::StringRef(name), funcType));
10518 + Value *Operands[3] = {
10519 + CI->getOperand(0),
10520 + CI->getOperand(1),
10521 + CI->getOperand(2)
10523 + CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
10524 + nCI->insertBefore(CI);
10525 + CI->replaceAllUsesWith(nCI);
10526 + } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
10527 + BinaryOperator *mulOp =
10528 + BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
10529 + CI->getOperand(1), "imul24", CI);
10530 + CI->replaceAllUsesWith(mulOp);
10531 + } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
10532 + Type *aType = CI->getOperand(0)->getType();
10534 + bool isVector = aType->isVectorTy();
10535 + int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
10536 + std::vector<Type*> callTypes;
10537 + callTypes.push_back(CI->getOperand(0)->getType());
10538 + callTypes.push_back(CI->getOperand(1)->getType());
10539 + FunctionType *funcType =
10540 + FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
10541 + std::string name = "__amdil_imul_high";
10543 + name += "_v" + itostr(numEle) + "i32";
10547 + Function *Func = dyn_cast<Function>(
10548 + CI->getParent()->getParent()->getParent()->
10549 + getOrInsertFunction(llvm::StringRef(name), funcType));
10550 + Value *Operands[2] = {
10551 + CI->getOperand(0),
10552 + CI->getOperand(1)
10554 + CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
10555 + nCI->insertBefore(CI);
10556 + CI->replaceAllUsesWith(nCI);
10561 +AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) {
10562 + return (CI != NULL
10563 + && CI->getOperand(CI->getNumOperands() - 1)->getName()
10564 + == "__amdil_get_local_size_int");
10568 +AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) {
10572 + if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
10573 + && (mSTM->getDeviceName() == "cayman")) {
10576 + return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
10577 + == "__amdil_improved_div";
10581 +AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) {
10582 + assert(convertAccurateDivide(CI)
10583 + && "expanding accurate divide can only happen if it is expandable!");
10584 + BinaryOperator *divOp =
10585 + BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
10586 + CI->getOperand(1), "fdiv32", CI);
10587 + CI->replaceAllUsesWith(divOp);
10591 +AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
10592 + if (optLevel != CodeGenOpt::None) {
10600 + unsigned funcNameIdx = 0;
10601 + funcNameIdx = CI->getNumOperands() - 1;
10602 + StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
10603 + if (calleeName != "__amdil_image2d_read_norm"
10604 + && calleeName != "__amdil_image2d_read_unnorm"
10605 + && calleeName != "__amdil_image3d_read_norm"
10606 + && calleeName != "__amdil_image3d_read_unnorm") {
10610 + unsigned samplerIdx = 2;
10612 + Value *sampler = CI->getOperand(samplerIdx);
10613 + LoadInst *lInst = dyn_cast<LoadInst>(sampler);
10618 + if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
10622 + GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
10623 + // If we are loading from what is not a global value, then we
10624 + // fail and return.
10629 + // If we don't have an initializer or we have an initializer and
10630 + // the initializer is not a 32bit integer, we fail.
10631 + if (!gv->hasInitializer()
10632 + || !gv->getInitializer()->getType()->isIntegerTy(32)) {
10636 + // Now that we have the global variable initializer, lets replace
10637 + // all uses of the load instruction with the samplerVal and
10638 + // reparse the __amdil_is_constant() function.
10639 + Constant *samplerVal = gv->getInitializer();
10640 + lInst->replaceAllUsesWith(samplerVal);
10645 +AMDGPUPeepholeOpt::doInitialization(Module &M) {
10650 +AMDGPUPeepholeOpt::doFinalization(Module &M) {
10655 +AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const {
10656 + AU.addRequired<MachineFunctionAnalysis>();
10657 + FunctionPass::getAnalysisUsage(AU);
10658 + AU.setPreservesAll();
10661 +size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
10666 + switch (T->getTypeID()) {
10667 + case Type::X86_FP80TyID:
10668 + case Type::FP128TyID:
10669 + case Type::PPC_FP128TyID:
10670 + case Type::LabelTyID:
10671 + assert(0 && "These types are not supported by this backend");
10673 + case Type::FloatTyID:
10674 + case Type::DoubleTyID:
10675 + size = T->getPrimitiveSizeInBits() >> 3;
10677 + case Type::PointerTyID:
10678 + size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
10680 + case Type::IntegerTyID:
10681 + size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
10683 + case Type::StructTyID:
10684 + size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
10686 + case Type::ArrayTyID:
10687 + size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
10689 + case Type::FunctionTyID:
10690 + size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
10692 + case Type::VectorTyID:
10693 + size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
10699 +size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
10700 + bool dereferencePtr) {
10706 + StructType::element_iterator eib;
10707 + StructType::element_iterator eie;
10708 + for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
10710 + size += getTypeSize(curType, dereferencePtr);
10715 +size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
10716 + bool dereferencePtr) {
10717 + return IT ? (IT->getBitWidth() >> 3) : 0;
10720 +size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
10721 + bool dereferencePtr) {
10722 + assert(0 && "Should not be able to calculate the size of an function type");
10726 +size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
10727 + bool dereferencePtr) {
10728 + return (size_t)(AT ? (getTypeSize(AT->getElementType(),
10729 + dereferencePtr) * AT->getNumElements())
10733 +size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
10734 + bool dereferencePtr) {
10735 + return VT ? (VT->getBitWidth() >> 3) : 0;
10738 +size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
10739 + bool dereferencePtr) {
10743 + Type *CT = PT->getElementType();
10744 + if (CT->getTypeID() == Type::StructTyID &&
10745 + PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
10746 + return getTypeSize(dyn_cast<StructType>(CT));
10747 + } else if (dereferencePtr) {
10749 + for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
10750 + size += getTypeSize(PT->getContainedType(x), dereferencePtr);
10758 +size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
10759 + bool dereferencePtr) {
10760 + //assert(0 && "Should not be able to calculate the size of an opaque type");
10763 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILRegisterInfo.td llvm-r600/lib/Target/R600/AMDILRegisterInfo.td
10764 --- llvm-3.2.src/lib/Target/R600/AMDILRegisterInfo.td 1970-01-01 01:00:00.000000000 +0100
10765 +++ llvm-r600/lib/Target/R600/AMDILRegisterInfo.td 2013-01-25 19:43:57.450049721 +0100
10767 +//===- AMDILRegisterInfo.td - AMDIL Register defs ----------*- tablegen -*-===//
10769 +// The LLVM Compiler Infrastructure
10771 +// This file is distributed under the University of Illinois Open Source
10772 +// License. See LICENSE.TXT for details.
10774 +//==-----------------------------------------------------------------------===//
10776 +// Declarations that describe the AMDIL register file
10778 +//===----------------------------------------------------------------------===//
10780 +class AMDILReg<bits<16> num, string n> : Register<n> {
10781 + field bits<16> Value;
10783 + let Namespace = "AMDGPU";
10786 +// We will start with 8 registers for each class before expanding to more
10787 +// Since the swizzle is added based on the register class, we can leave it
10788 +// off here and just specify different registers for different register classes
10789 +def R1 : AMDILReg<1, "r1">, DwarfRegNum<[1]>;
10790 +def R2 : AMDILReg<2, "r2">, DwarfRegNum<[2]>;
10791 +def R3 : AMDILReg<3, "r3">, DwarfRegNum<[3]>;
10792 +def R4 : AMDILReg<4, "r4">, DwarfRegNum<[4]>;
10793 +def R5 : AMDILReg<5, "r5">, DwarfRegNum<[5]>;
10794 +def R6 : AMDILReg<6, "r6">, DwarfRegNum<[6]>;
10795 +def R7 : AMDILReg<7, "r7">, DwarfRegNum<[7]>;
10796 +def R8 : AMDILReg<8, "r8">, DwarfRegNum<[8]>;
10797 +def R9 : AMDILReg<9, "r9">, DwarfRegNum<[9]>;
10798 +def R10 : AMDILReg<10, "r10">, DwarfRegNum<[10]>;
10799 +def R11 : AMDILReg<11, "r11">, DwarfRegNum<[11]>;
10800 +def R12 : AMDILReg<12, "r12">, DwarfRegNum<[12]>;
10801 +def R13 : AMDILReg<13, "r13">, DwarfRegNum<[13]>;
10802 +def R14 : AMDILReg<14, "r14">, DwarfRegNum<[14]>;
10803 +def R15 : AMDILReg<15, "r15">, DwarfRegNum<[15]>;
10804 +def R16 : AMDILReg<16, "r16">, DwarfRegNum<[16]>;
10805 +def R17 : AMDILReg<17, "r17">, DwarfRegNum<[17]>;
10806 +def R18 : AMDILReg<18, "r18">, DwarfRegNum<[18]>;
10807 +def R19 : AMDILReg<19, "r19">, DwarfRegNum<[19]>;
10808 +def R20 : AMDILReg<20, "r20">, DwarfRegNum<[20]>;
10810 +// All registers between 1000 and 1024 are reserved and cannot be used
10811 +// unless commented in this section
10812 +// r1021-r1025 are used to dynamically calculate the local/group/thread/region/region_local ID's
10813 +// r1020 is used to hold the frame index for local arrays
10814 +// r1019 is used to hold the dynamic stack allocation pointer
10815 +// r1018 is used as a temporary register for handwritten code
10816 +// r1017 is used as a temporary register for handwritten code
10817 +// r1016 is used as a temporary register for load/store code
10818 +// r1015 is used as a temporary register for data segment offset
10819 +// r1014 is used as a temporary register for store code
10820 +// r1013 is used as the section data pointer register
10821 +// r1012-r1010 and r1001-r1008 are used for temporary I/O registers
10822 +// r1009 is used as the frame pointer register
10823 +// r999 is used as the mem register.
10824 +// r998 is used as the return address register.
10825 +//def R1025 : AMDILReg<1025, "r1025">, DwarfRegNum<[1025]>;
10826 +//def R1024 : AMDILReg<1024, "r1024">, DwarfRegNum<[1024]>;
10827 +//def R1023 : AMDILReg<1023, "r1023">, DwarfRegNum<[1023]>;
10828 +//def R1022 : AMDILReg<1022, "r1022">, DwarfRegNum<[1022]>;
10829 +//def R1021 : AMDILReg<1021, "r1021">, DwarfRegNum<[1021]>;
10830 +//def R1020 : AMDILReg<1020, "r1020">, DwarfRegNum<[1020]>;
10831 +def SP : AMDILReg<1019, "r1019">, DwarfRegNum<[1019]>;
10832 +def T1 : AMDILReg<1018, "r1018">, DwarfRegNum<[1018]>;
10833 +def T2 : AMDILReg<1017, "r1017">, DwarfRegNum<[1017]>;
10834 +def T3 : AMDILReg<1016, "r1016">, DwarfRegNum<[1016]>;
10835 +def T4 : AMDILReg<1015, "r1015">, DwarfRegNum<[1015]>;
10836 +def T5 : AMDILReg<1014, "r1014">, DwarfRegNum<[1014]>;
10837 +def SDP : AMDILReg<1013, "r1013">, DwarfRegNum<[1013]>;
10838 +def R1012: AMDILReg<1012, "r1012">, DwarfRegNum<[1012]>;
10839 +def R1011: AMDILReg<1011, "r1011">, DwarfRegNum<[1011]>;
10840 +def R1010: AMDILReg<1010, "r1010">, DwarfRegNum<[1010]>;
10841 +def DFP : AMDILReg<1009, "r1009">, DwarfRegNum<[1009]>;
10842 +def R1008: AMDILReg<1008, "r1008">, DwarfRegNum<[1008]>;
10843 +def R1007: AMDILReg<1007, "r1007">, DwarfRegNum<[1007]>;
10844 +def R1006: AMDILReg<1006, "r1006">, DwarfRegNum<[1006]>;
10845 +def R1005: AMDILReg<1005, "r1005">, DwarfRegNum<[1005]>;
10846 +def R1004: AMDILReg<1004, "r1004">, DwarfRegNum<[1004]>;
10847 +def R1003: AMDILReg<1003, "r1003">, DwarfRegNum<[1003]>;
10848 +def R1002: AMDILReg<1002, "r1002">, DwarfRegNum<[1002]>;
10849 +def R1001: AMDILReg<1001, "r1001">, DwarfRegNum<[1001]>;
10850 +def MEM : AMDILReg<999, "mem">, DwarfRegNum<[999]>;
10851 +def RA : AMDILReg<998, "r998">, DwarfRegNum<[998]>;
10852 +def FP : AMDILReg<997, "r997">, DwarfRegNum<[997]>;
10853 +def GPRI16 : RegisterClass<"AMDGPU", [i16], 16,
10854 + (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
10855 + let AltOrders = [(add (sequence "R%u", 1, 20))];
10856 + let AltOrderSelect = [{
10860 +def GPRI32 : RegisterClass<"AMDGPU", [i32], 32,
10861 + (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
10862 + let AltOrders = [(add (sequence "R%u", 1, 20))];
10863 + let AltOrderSelect = [{
10867 +def GPRF32 : RegisterClass<"AMDGPU", [f32], 32,
10868 + (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
10869 + let AltOrders = [(add (sequence "R%u", 1, 20))];
10870 + let AltOrderSelect = [{
10874 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILSIDevice.cpp llvm-r600/lib/Target/R600/AMDILSIDevice.cpp
10875 --- llvm-3.2.src/lib/Target/R600/AMDILSIDevice.cpp 1970-01-01 01:00:00.000000000 +0100
10876 +++ llvm-r600/lib/Target/R600/AMDILSIDevice.cpp 2013-01-25 19:43:57.450049721 +0100
10878 +//===-- AMDILSIDevice.cpp - Device Info for Southern Islands GPUs ---------===//
10880 +// The LLVM Compiler Infrastructure
10882 +// This file is distributed under the University of Illinois Open Source
10883 +// License. See LICENSE.TXT for details.
10886 +//==-----------------------------------------------------------------------===//
10887 +#include "AMDILSIDevice.h"
10888 +#include "AMDILEvergreenDevice.h"
10889 +#include "AMDILNIDevice.h"
10890 +#include "AMDGPUSubtarget.h"
10892 +using namespace llvm;
10894 +AMDGPUSIDevice::AMDGPUSIDevice(AMDGPUSubtarget *ST)
10895 + : AMDGPUEvergreenDevice(ST) {
10897 +AMDGPUSIDevice::~AMDGPUSIDevice() {
10901 +AMDGPUSIDevice::getMaxLDSSize() const {
10902 + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
10903 + return MAX_LDS_SIZE_900;
10910 +AMDGPUSIDevice::getGeneration() const {
10911 + return AMDGPUDeviceInfo::HD7XXX;
10915 +AMDGPUSIDevice::getDataLayout() const {
10916 + return std::string("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16"
10917 + "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
10918 + "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
10919 + "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
10920 + "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
10923 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/AMDILSIDevice.h llvm-r600/lib/Target/R600/AMDILSIDevice.h
10924 --- llvm-3.2.src/lib/Target/R600/AMDILSIDevice.h 1970-01-01 01:00:00.000000000 +0100
10925 +++ llvm-r600/lib/Target/R600/AMDILSIDevice.h 2013-01-25 19:43:57.450049721 +0100
10927 +//===------- AMDILSIDevice.h - Define SI Device for AMDIL -*- C++ -*------===//
10929 +// The LLVM Compiler Infrastructure
10931 +// This file is distributed under the University of Illinois Open Source
10932 +// License. See LICENSE.TXT for details.
10934 +//==-----------------------------------------------------------------------===//
10937 +/// \brief Interface for the subtarget data classes.
10939 +/// This file will define the interface that each generation needs to
10940 +/// implement in order to correctly answer queries on the capabilities of the
10941 +/// specific hardware.
10942 +//===---------------------------------------------------------------------===//
10943 +#ifndef AMDILSIDEVICE_H
10944 +#define AMDILSIDEVICE_H
10945 +#include "AMDILEvergreenDevice.h"
10948 +class AMDGPUSubtarget;
10949 +//===---------------------------------------------------------------------===//
10950 +// SI generation of devices and their respective sub classes
10951 +//===---------------------------------------------------------------------===//
10953 +/// \brief The AMDGPUSIDevice is the base class for all Southern Island series
10955 +class AMDGPUSIDevice : public AMDGPUEvergreenDevice {
10957 + AMDGPUSIDevice(AMDGPUSubtarget*);
10958 + virtual ~AMDGPUSIDevice();
10959 + virtual size_t getMaxLDSSize() const;
10960 + virtual uint32_t getGeneration() const;
10961 + virtual std::string getDataLayout() const;
10964 +} // namespace llvm
10965 +#endif // AMDILSIDEVICE_H
10966 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/CMakeLists.txt llvm-r600/lib/Target/R600/CMakeLists.txt
10967 --- llvm-3.2.src/lib/Target/R600/CMakeLists.txt 1970-01-01 01:00:00.000000000 +0100
10968 +++ llvm-r600/lib/Target/R600/CMakeLists.txt 2013-01-25 19:43:57.453383054 +0100
10970 +set(LLVM_TARGET_DEFINITIONS AMDGPU.td)
10972 +tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info)
10973 +tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info)
10974 +tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel)
10975 +tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv)
10976 +tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget)
10977 +tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic)
10978 +tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
10979 +tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer)
10980 +tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
10981 +add_public_tablegen_target(AMDGPUCommonTableGen)
10983 +add_llvm_target(AMDGPUCodeGen
10984 + AMDIL7XXDevice.cpp
10985 + AMDILCFGStructurizer.cpp
10987 + AMDILDeviceInfo.cpp
10988 + AMDILEvergreenDevice.cpp
10989 + AMDILFrameLowering.cpp
10990 + AMDILIntrinsicInfo.cpp
10991 + AMDILISelDAGToDAG.cpp
10992 + AMDILISelLowering.cpp
10993 + AMDILNIDevice.cpp
10994 + AMDILPeepholeOptimizer.cpp
10995 + AMDILSIDevice.cpp
10996 + AMDGPUAsmPrinter.cpp
10997 + AMDGPUMCInstLower.cpp
10998 + AMDGPUSubtarget.cpp
10999 + AMDGPUTargetMachine.cpp
11000 + AMDGPUISelLowering.cpp
11001 + AMDGPUConvertToISA.cpp
11002 + AMDGPUInstrInfo.cpp
11003 + AMDGPURegisterInfo.cpp
11004 + R600ExpandSpecialInstrs.cpp
11005 + R600InstrInfo.cpp
11006 + R600ISelLowering.cpp
11007 + R600LowerConstCopy.cpp
11008 + R600MachineFunctionInfo.cpp
11009 + R600RegisterInfo.cpp
11010 + SIAssignInterpRegs.cpp
11012 + SIISelLowering.cpp
11013 + SILowerLiteralConstants.cpp
11014 + SILowerControlFlow.cpp
11015 + SIMachineFunctionInfo.cpp
11016 + SIRegisterInfo.cpp
11017 + SIFixSGPRLiveness.cpp
11020 +add_dependencies(LLVMR600CodeGen intrinsics_gen)
11022 +add_subdirectory(InstPrinter)
11023 +add_subdirectory(TargetInfo)
11024 +add_subdirectory(MCTargetDesc)
11025 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp llvm-r600/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
11026 --- llvm-3.2.src/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp 1970-01-01 01:00:00.000000000 +0100
11027 +++ llvm-r600/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp 2013-01-25 19:43:57.456716387 +0100
11029 +//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===//
11031 +// The LLVM Compiler Infrastructure
11033 +// This file is distributed under the University of Illinois Open Source
11034 +// License. See LICENSE.TXT for details.
11037 +//===----------------------------------------------------------------------===//
11039 +#include "AMDGPUInstPrinter.h"
11040 +#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
11041 +#include "llvm/MC/MCInst.h"
11043 +using namespace llvm;
11045 +void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
11046 + StringRef Annot) {
11047 + printInstruction(MI, OS);
11049 + printAnnotation(OS, Annot);
11052 +void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
11053 + raw_ostream &O) {
11055 + const MCOperand &Op = MI->getOperand(OpNo);
11056 + if (Op.isReg()) {
11057 + switch (Op.getReg()) {
11058 + // This is the default predicate state, so we don't need to print it.
11059 + case AMDGPU::PRED_SEL_OFF: break;
11060 + default: O << getRegisterName(Op.getReg()); break;
11062 + } else if (Op.isImm()) {
11063 + O << Op.getImm();
11064 + } else if (Op.isFPImm()) {
11065 + O << Op.getFPImm();
11067 + assert(!"unknown operand type in printOperand");
11071 +void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
11072 + raw_ostream &O) {
11073 + printOperand(MI, OpNo, O);
11075 + printOperand(MI, OpNo + 1, O);
11078 +void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
11079 + raw_ostream &O, StringRef Asm) {
11080 + const MCOperand &Op = MI->getOperand(OpNo);
11081 + assert(Op.isImm());
11082 + if (Op.getImm() == 1) {
11087 +void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
11088 + raw_ostream &O) {
11089 + printIfSet(MI, OpNo, O, "|");
11092 +void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
11093 + raw_ostream &O) {
11094 + printIfSet(MI, OpNo, O, "_SAT");
11097 +void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
11098 + raw_ostream &O) {
11104 + L.i = MI->getOperand(OpNo).getImm();
11105 + O << L.i << "(" << L.f << ")";
11108 +void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
11109 + raw_ostream &O) {
11110 + printIfSet(MI, OpNo, O, " *");
11113 +void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
11114 + raw_ostream &O) {
11115 + printIfSet(MI, OpNo, O, "-");
11118 +void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
11119 + raw_ostream &O) {
11120 + switch (MI->getOperand(OpNo).getImm()) {
11134 +void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo,
11135 + raw_ostream &O) {
11136 + const MCOperand &Op = MI->getOperand(OpNo);
11137 + if (Op.getImm() != 0) {
11138 + O << " + " << Op.getImm();
11142 +void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
11143 + raw_ostream &O) {
11144 + printIfSet(MI, OpNo, O, "ExecMask,");
11147 +void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
11148 + raw_ostream &O) {
11149 + printIfSet(MI, OpNo, O, "Pred,");
11152 +void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
11153 + raw_ostream &O) {
11154 + const MCOperand &Op = MI->getOperand(OpNo);
11155 + if (Op.getImm() == 0) {
11156 + O << " (MASKED)";
11160 +void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo,
11161 + raw_ostream &O) {
11162 + const char * chans = "XYZW";
11163 + int sel = MI->getOperand(OpNo).getImm();
11165 + int chan = sel & 3;
11168 + if (sel >= 512) {
11170 + int cb = sel >> 12;
11172 + O << cb << "[" << sel << "]";
11173 + } else if (sel >= 448) {
11176 + } else if (sel >= 0){
11181 + O << "." << chans[chan];
11184 +#include "AMDGPUGenAsmWriter.inc"
11185 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h llvm-r600/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
11186 --- llvm-3.2.src/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h 1970-01-01 01:00:00.000000000 +0100
11187 +++ llvm-r600/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h 2013-01-25 19:43:57.456716387 +0100
11189 +//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===//
11191 +// The LLVM Compiler Infrastructure
11193 +// This file is distributed under the University of Illinois Open Source
11194 +// License. See LICENSE.TXT for details.
11196 +//===----------------------------------------------------------------------===//
11199 +//===----------------------------------------------------------------------===//
11201 +#ifndef AMDGPUINSTPRINTER_H
11202 +#define AMDGPUINSTPRINTER_H
11204 +#include "llvm/ADT/StringRef.h"
11205 +#include "llvm/MC/MCInstPrinter.h"
11206 +#include "llvm/Support/raw_ostream.h"
11210 +class AMDGPUInstPrinter : public MCInstPrinter {
11212 + AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
11213 + const MCRegisterInfo &MRI)
11214 + : MCInstPrinter(MAI, MII, MRI) {}
11216 + //Autogenerated by tblgen
11217 + void printInstruction(const MCInst *MI, raw_ostream &O);
11218 + static const char *getRegisterName(unsigned RegNo);
11220 + virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
11223 + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11224 + void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11225 + void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm);
11226 + void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11227 + void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11228 + void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11229 + void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11230 + void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11231 + void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11232 + void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11233 + void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11234 + void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11235 + void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11236 + void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
11239 +} // End namespace llvm
11241 +#endif // AMDGPUINSTPRINTER_H
11242 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/InstPrinter/CMakeLists.txt llvm-r600/lib/Target/R600/InstPrinter/CMakeLists.txt
11243 --- llvm-3.2.src/lib/Target/R600/InstPrinter/CMakeLists.txt 1970-01-01 01:00:00.000000000 +0100
11244 +++ llvm-r600/lib/Target/R600/InstPrinter/CMakeLists.txt 2013-01-25 19:43:57.456716387 +0100
11246 +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
11248 +add_llvm_library(LLVMR600AsmPrinter
11249 + AMDGPUInstPrinter.cpp
11252 +add_dependencies(LLVMR600AsmPrinter R600CommonTableGen)
11253 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/InstPrinter/LLVMBuild.txt llvm-r600/lib/Target/R600/InstPrinter/LLVMBuild.txt
11254 --- llvm-3.2.src/lib/Target/R600/InstPrinter/LLVMBuild.txt 1970-01-01 01:00:00.000000000 +0100
11255 +++ llvm-r600/lib/Target/R600/InstPrinter/LLVMBuild.txt 2013-01-25 19:43:57.456716387 +0100
11257 +;===- ./lib/Target/R600/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===;
11259 +; The LLVM Compiler Infrastructure
11261 +; This file is distributed under the University of Illinois Open Source
11262 +; License. See LICENSE.TXT for details.
11264 +;===------------------------------------------------------------------------===;
11266 +; This is an LLVMBuild description file for the components in this subdirectory.
11268 +; For more information on the LLVMBuild system, please see:
11270 +; http://llvm.org/docs/LLVMBuild.html
11272 +;===------------------------------------------------------------------------===;
11276 +name = R600AsmPrinter
11278 +required_libraries = MC Support
11279 +add_to_library_groups = R600
11281 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/InstPrinter/Makefile llvm-r600/lib/Target/R600/InstPrinter/Makefile
11282 --- llvm-3.2.src/lib/Target/R600/InstPrinter/Makefile 1970-01-01 01:00:00.000000000 +0100
11283 +++ llvm-r600/lib/Target/R600/InstPrinter/Makefile 2013-01-25 19:43:57.456716387 +0100
11285 +#===- lib/Target/R600/InstPrinter/Makefile -----------------*- Makefile -*-===##
11287 +# The LLVM Compiler Infrastructure
11289 +# This file is distributed under the University of Illinois Open Source
11290 +# License. See LICENSE.TXT for details.
11292 +##===----------------------------------------------------------------------===##
11293 +LEVEL = ../../../..
11294 +LIBRARYNAME = LLVMR600AsmPrinter
11296 +# Hack: we need to include the 'main' R600 target directory to grab private headers
11297 +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
11299 +include $(LEVEL)/Makefile.common
11300 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/LLVMBuild.txt llvm-r600/lib/Target/R600/LLVMBuild.txt
11301 --- llvm-3.2.src/lib/Target/R600/LLVMBuild.txt 1970-01-01 01:00:00.000000000 +0100
11302 +++ llvm-r600/lib/Target/R600/LLVMBuild.txt 2013-01-25 19:43:57.456716387 +0100
11304 +;===- ./lib/Target/R600/LLVMBuild.txt --------------------------*- Conf -*--===;
11306 +; The LLVM Compiler Infrastructure
11308 +; This file is distributed under the University of Illinois Open Source
11309 +; License. See LICENSE.TXT for details.
11311 +;===------------------------------------------------------------------------===;
11313 +; This is an LLVMBuild description file for the components in this subdirectory.
11315 +; For more information on the LLVMBuild system, please see:
11317 +; http://llvm.org/docs/LLVMBuild.html
11319 +;===------------------------------------------------------------------------===;
11322 +subdirectories = InstPrinter MCTargetDesc TargetInfo
11325 +type = TargetGroup
11328 +has_asmprinter = 1
11332 +name = R600CodeGen
11334 +required_libraries = AsmPrinter CodeGen Core SelectionDAG Support Target MC R600AsmPrinter R600Desc R600Info
11335 +add_to_library_groups = R600
11336 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/Makefile llvm-r600/lib/Target/R600/Makefile
11337 --- llvm-3.2.src/lib/Target/R600/Makefile 1970-01-01 01:00:00.000000000 +0100
11338 +++ llvm-r600/lib/Target/R600/Makefile 2013-01-25 19:43:57.460049721 +0100
11340 +##===- lib/Target/R600/Makefile ---------------------------*- Makefile -*-===##
11342 +# The LLVM Compiler Infrastructure
11344 +# This file is distributed under the University of Illinois Open Source
11345 +# License. See LICENSE.TXT for details.
11347 +##===----------------------------------------------------------------------===##
11350 +LIBRARYNAME = LLVMR600CodeGen
11353 +# Make sure that tblgen is run, first thing.
11354 +BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \
11355 + AMDGPUGenDAGISel.inc AMDGPUGenSubtargetInfo.inc \
11356 + AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \
11357 + AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \
11358 + AMDGPUGenAsmWriter.inc
11360 +DIRS = InstPrinter TargetInfo MCTargetDesc
11362 +include $(LEVEL)/Makefile.common
11363 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
11364 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp 1970-01-01 01:00:00.000000000 +0100
11365 +++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp 2013-01-25 19:43:57.456716387 +0100
11367 +//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===//
11369 +// The LLVM Compiler Infrastructure
11371 +// This file is distributed under the University of Illinois Open Source
11372 +// License. See LICENSE.TXT for details.
11375 +//===----------------------------------------------------------------------===//
11377 +#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
11378 +#include "llvm/ADT/StringRef.h"
11379 +#include "llvm/MC/MCAsmBackend.h"
11380 +#include "llvm/MC/MCAssembler.h"
11381 +#include "llvm/MC/MCObjectWriter.h"
11382 +#include "llvm/MC/MCValue.h"
11383 +#include "llvm/Support/TargetRegistry.h"
11385 +using namespace llvm;
11389 +class AMDGPUMCObjectWriter : public MCObjectWriter {
11391 + AMDGPUMCObjectWriter(raw_ostream &OS) : MCObjectWriter(OS, true) { }
11392 + virtual void ExecutePostLayoutBinding(MCAssembler &Asm,
11393 + const MCAsmLayout &Layout) {
11394 + //XXX: Implement if necessary.
11396 + virtual void RecordRelocation(const MCAssembler &Asm,
11397 + const MCAsmLayout &Layout,
11398 + const MCFragment *Fragment,
11399 + const MCFixup &Fixup,
11400 + MCValue Target, uint64_t &FixedValue) {
11401 + assert(!"Not implemented");
11404 + virtual void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout);
11408 +class AMDGPUAsmBackend : public MCAsmBackend {
11410 + AMDGPUAsmBackend(const Target &T)
11411 + : MCAsmBackend() {}
11413 + virtual AMDGPUMCObjectWriter *createObjectWriter(raw_ostream &OS) const;
11414 + virtual unsigned getNumFixupKinds() const { return 0; };
11415 + virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
11416 + uint64_t Value) const;
11417 + virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
11418 + const MCInstFragment *DF,
11419 + const MCAsmLayout &Layout) const {
11422 + virtual void relaxInstruction(const MCInst &Inst, MCInst &Res) const {
11423 + assert(!"Not implemented");
11425 + virtual bool mayNeedRelaxation(const MCInst &Inst) const { return false; }
11426 + virtual bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
11431 +} //End anonymous namespace
11433 +void AMDGPUMCObjectWriter::WriteObject(MCAssembler &Asm,
11434 + const MCAsmLayout &Layout) {
11435 + for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) {
11436 + Asm.writeSectionData(I, Layout);
11440 +MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT,
11442 + return new AMDGPUAsmBackend(T);
11445 +AMDGPUMCObjectWriter * AMDGPUAsmBackend::createObjectWriter(
11446 + raw_ostream &OS) const {
11447 + return new AMDGPUMCObjectWriter(OS);
11450 +void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
11451 + unsigned DataSize, uint64_t Value) const {
11453 + uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset());
11454 + assert(Fixup.getKind() == FK_PCRel_4);
11455 + *Dst = (Value - 4) / 4;
11457 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
11458 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp 1970-01-01 01:00:00.000000000 +0100
11459 +++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp 2013-01-25 19:43:57.456716387 +0100
11461 +//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===//
11463 +// The LLVM Compiler Infrastructure
11465 +// This file is distributed under the University of Illinois Open Source
11466 +// License. See LICENSE.TXT for details.
11469 +//===----------------------------------------------------------------------===//
11471 +#include "AMDGPUMCAsmInfo.h"
11473 +using namespace llvm;
11474 +AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() {
11475 + HasSingleParameterDotFile = false;
11476 + WeakDefDirective = 0;
11477 + //===------------------------------------------------------------------===//
11478 + HasSubsectionsViaSymbols = true;
11479 + HasMachoZeroFillDirective = false;
11480 + HasMachoTBSSDirective = false;
11481 + HasStaticCtorDtorReferenceInStaticMode = false;
11482 + LinkerRequiresNonEmptyDwarfLines = true;
11483 + MaxInstLength = 16;
11485 + SeparatorString = "\n";
11486 + CommentColumn = 40;
11487 + CommentString = ";";
11488 + LabelSuffix = ":";
11489 + GlobalPrefix = "@";
11490 + PrivateGlobalPrefix = ";.";
11491 + LinkerPrivateGlobalPrefix = "!";
11492 + InlineAsmStart = ";#ASMSTART";
11493 + InlineAsmEnd = ";#ASMEND";
11494 + AssemblerDialect = 0;
11495 + AllowQuotesInName = false;
11496 + AllowNameToStartWithDigit = false;
11497 + AllowPeriodsInName = false;
11499 + //===--- Data Emission Directives -------------------------------------===//
11500 + ZeroDirective = ".zero";
11501 + AsciiDirective = ".ascii\t";
11502 + AscizDirective = ".asciz\t";
11503 + Data8bitsDirective = ".byte\t";
11504 + Data16bitsDirective = ".short\t";
11505 + Data32bitsDirective = ".long\t";
11506 + Data64bitsDirective = ".quad\t";
11507 + GPRel32Directive = 0;
11508 + SunStyleELFSectionSwitchSyntax = true;
11509 + UsesELFSectionDirectiveForBSS = true;
11510 + HasMicrosoftFastStdCallMangling = false;
11512 + //===--- Alignment Information ----------------------------------------===//
11513 + AlignDirective = ".align\t";
11514 + AlignmentIsInBytes = true;
11515 + TextAlignFillValue = 0;
11517 + //===--- Global Variable Emission Directives --------------------------===//
11518 + GlobalDirective = ".global";
11519 + ExternDirective = ".extern";
11520 + HasSetDirective = false;
11521 + HasAggressiveSymbolFolding = true;
11522 + COMMDirectiveAlignmentIsInBytes = false;
11523 + HasDotTypeDotSizeDirective = false;
11524 + HasNoDeadStrip = true;
11525 + HasSymbolResolver = false;
11526 + WeakRefDirective = ".weakref\t";
11527 + LinkOnceDirective = 0;
11528 + //===--- Dwarf Emission Directives -----------------------------------===//
11529 + HasLEB128 = true;
11530 + SupportsDebugInformation = true;
11531 + ExceptionsType = ExceptionHandling::None;
11532 + DwarfUsesInlineInfoSection = false;
11533 + DwarfSectionOffsetDirective = ".offset";
11538 +AMDGPUMCAsmInfo::getDataASDirective(unsigned int Size, unsigned int AS) const {
11543 +AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const {
11546 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
11547 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h 1970-01-01 01:00:00.000000000 +0100
11548 +++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h 2013-01-25 19:43:57.456716387 +0100
11550 +//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface ----------===//
11552 +// The LLVM Compiler Infrastructure
11554 +// This file is distributed under the University of Illinois Open Source
11555 +// License. See LICENSE.TXT for details.
11557 +//===----------------------------------------------------------------------===//
11561 +//===----------------------------------------------------------------------===//
11563 +#ifndef AMDGPUMCASMINFO_H
11564 +#define AMDGPUMCASMINFO_H
11566 +#include "llvm/MC/MCAsmInfo.h"
11572 +class AMDGPUMCAsmInfo : public MCAsmInfo {
11574 + explicit AMDGPUMCAsmInfo(const Target &T, StringRef &TT);
11575 + const char* getDataASDirective(unsigned int Size, unsigned int AS) const;
11576 + const MCSection* getNonexecutableStackSection(MCContext &CTX) const;
11578 +} // namespace llvm
11579 +#endif // AMDGPUMCASMINFO_H
11580 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
11581 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h 1970-01-01 01:00:00.000000000 +0100
11582 +++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h 2013-01-25 19:43:57.456716387 +0100
11584 +//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===//
11586 +// The LLVM Compiler Infrastructure
11588 +// This file is distributed under the University of Illinois Open Source
11589 +// License. See LICENSE.TXT for details.
11591 +//===----------------------------------------------------------------------===//
11594 +/// \brief CodeEmitter interface for R600 and SI codegen.
11596 +//===----------------------------------------------------------------------===//
11598 +#ifndef AMDGPUCODEEMITTER_H
11599 +#define AMDGPUCODEEMITTER_H
11601 +#include "llvm/MC/MCCodeEmitter.h"
11602 +#include "llvm/Support/raw_ostream.h"
11609 +class AMDGPUMCCodeEmitter : public MCCodeEmitter {
11612 + uint64_t getBinaryCodeForInstr(const MCInst &MI,
11613 + SmallVectorImpl<MCFixup> &Fixups) const;
11615 + virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
11616 + SmallVectorImpl<MCFixup> &Fixups) const {
11620 + virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo,
11621 + SmallVectorImpl<MCFixup> &Fixups) const {
11624 + virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo,
11625 + SmallVectorImpl<MCFixup> &Fixups) const {
11628 + virtual uint64_t VOPPostEncode(const MCInst &MI, uint64_t Value) const {
11631 + virtual uint64_t i32LiteralEncode(const MCInst &MI, unsigned OpNo,
11632 + SmallVectorImpl<MCFixup> &Fixups) const {
11635 + virtual uint32_t SMRDmemriEncode(const MCInst &MI, unsigned OpNo,
11636 + SmallVectorImpl<MCFixup> &Fixups) const {
11641 +} // End namespace llvm
11643 +#endif // AMDGPUCODEEMITTER_H
11644 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
11645 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp 1970-01-01 01:00:00.000000000 +0100
11646 +++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp 2013-01-25 19:43:57.460049721 +0100
11648 +//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===//
11650 +// The LLVM Compiler Infrastructure
11652 +// This file is distributed under the University of Illinois Open Source
11653 +// License. See LICENSE.TXT for details.
11655 +//===----------------------------------------------------------------------===//
11658 +/// \brief This file provides AMDGPU specific target descriptions.
11660 +//===----------------------------------------------------------------------===//
11662 +#include "AMDGPUMCTargetDesc.h"
11663 +#include "AMDGPUMCAsmInfo.h"
11664 +#include "InstPrinter/AMDGPUInstPrinter.h"
11665 +#include "llvm/MC/MachineLocation.h"
11666 +#include "llvm/MC/MCCodeGenInfo.h"
11667 +#include "llvm/MC/MCInstrInfo.h"
11668 +#include "llvm/MC/MCRegisterInfo.h"
11669 +#include "llvm/MC/MCStreamer.h"
11670 +#include "llvm/MC/MCSubtargetInfo.h"
11671 +#include "llvm/Support/ErrorHandling.h"
11672 +#include "llvm/Support/TargetRegistry.h"
11674 +#define GET_INSTRINFO_MC_DESC
11675 +#include "AMDGPUGenInstrInfo.inc"
11677 +#define GET_SUBTARGETINFO_MC_DESC
11678 +#include "AMDGPUGenSubtargetInfo.inc"
11680 +#define GET_REGINFO_MC_DESC
11681 +#include "AMDGPUGenRegisterInfo.inc"
11683 +using namespace llvm;
11685 +static MCInstrInfo *createAMDGPUMCInstrInfo() {
11686 + MCInstrInfo *X = new MCInstrInfo();
11687 + InitAMDGPUMCInstrInfo(X);
11691 +static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) {
11692 + MCRegisterInfo *X = new MCRegisterInfo();
11693 + InitAMDGPUMCRegisterInfo(X, 0);
11697 +static MCSubtargetInfo *createAMDGPUMCSubtargetInfo(StringRef TT, StringRef CPU,
11699 + MCSubtargetInfo * X = new MCSubtargetInfo();
11700 + InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS);
11704 +static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM,
11705 + CodeModel::Model CM,
11706 + CodeGenOpt::Level OL) {
11707 + MCCodeGenInfo *X = new MCCodeGenInfo();
11708 + X->InitMCCodeGenInfo(RM, CM, OL);
11712 +static MCInstPrinter *createAMDGPUMCInstPrinter(const Target &T,
11713 + unsigned SyntaxVariant,
11714 + const MCAsmInfo &MAI,
11715 + const MCInstrInfo &MII,
11716 + const MCRegisterInfo &MRI,
11717 + const MCSubtargetInfo &STI) {
11718 + return new AMDGPUInstPrinter(MAI, MII, MRI);
11721 +static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII,
11722 + const MCRegisterInfo &MRI,
11723 + const MCSubtargetInfo &STI,
11724 + MCContext &Ctx) {
11725 + if (STI.getFeatureBits() & AMDGPU::Feature64BitPtr) {
11726 + return createSIMCCodeEmitter(MCII, MRI, STI, Ctx);
11728 + return createR600MCCodeEmitter(MCII, MRI, STI, Ctx);
11732 +static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
11733 + MCContext &Ctx, MCAsmBackend &MAB,
11734 + raw_ostream &_OS,
11735 + MCCodeEmitter *_Emitter,
11737 + bool NoExecStack) {
11738 + return createPureStreamer(Ctx, MAB, _OS, _Emitter);
11741 +extern "C" void LLVMInitializeR600TargetMC() {
11743 + RegisterMCAsmInfo<AMDGPUMCAsmInfo> Y(TheAMDGPUTarget);
11745 + TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDGPUMCCodeGenInfo);
11747 + TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDGPUMCInstrInfo);
11749 + TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDGPUMCRegisterInfo);
11751 + TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDGPUMCSubtargetInfo);
11753 + TargetRegistry::RegisterMCInstPrinter(TheAMDGPUTarget, createAMDGPUMCInstPrinter);
11755 + TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, createAMDGPUMCCodeEmitter);
11757 + TargetRegistry::RegisterMCAsmBackend(TheAMDGPUTarget, createAMDGPUAsmBackend);
11759 + TargetRegistry::RegisterMCObjectStreamer(TheAMDGPUTarget, createMCStreamer);
11761 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
11762 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h 1970-01-01 01:00:00.000000000 +0100
11763 +++ llvm-r600/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h 2013-01-25 19:43:57.460049721 +0100
11765 +//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===//
11767 +// The LLVM Compiler Infrastructure
11769 +// This file is distributed under the University of Illinois Open Source
11770 +// License. See LICENSE.TXT for details.
11772 +//===----------------------------------------------------------------------===//
11775 +/// \brief Provides AMDGPU specific target descriptions.
11777 +//===----------------------------------------------------------------------===//
11780 +#ifndef AMDGPUMCTARGETDESC_H
11781 +#define AMDGPUMCTARGETDESC_H
11783 +#include "llvm/ADT/StringRef.h"
11786 +class MCAsmBackend;
11787 +class MCCodeEmitter;
11789 +class MCInstrInfo;
11790 +class MCRegisterInfo;
11791 +class MCSubtargetInfo;
11794 +extern Target TheAMDGPUTarget;
11796 +MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
11797 + const MCRegisterInfo &MRI,
11798 + const MCSubtargetInfo &STI,
11801 +MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
11802 + const MCRegisterInfo &MRI,
11803 + const MCSubtargetInfo &STI,
11806 +MCAsmBackend *createAMDGPUAsmBackend(const Target &T, StringRef TT,
11808 +} // End llvm namespace
11810 +#define GET_REGINFO_ENUM
11811 +#include "AMDGPUGenRegisterInfo.inc"
11813 +#define GET_INSTRINFO_ENUM
11814 +#include "AMDGPUGenInstrInfo.inc"
11816 +#define GET_SUBTARGETINFO_ENUM
11817 +#include "AMDGPUGenSubtargetInfo.inc"
11819 +#endif // AMDGPUMCTARGETDESC_H
11820 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/CMakeLists.txt llvm-r600/lib/Target/R600/MCTargetDesc/CMakeLists.txt
11821 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/CMakeLists.txt 1970-01-01 01:00:00.000000000 +0100
11822 +++ llvm-r600/lib/Target/R600/MCTargetDesc/CMakeLists.txt 2013-01-25 19:43:57.460049721 +0100
11825 +add_llvm_library(LLVMR600Desc
11826 + AMDGPUAsmBackend.cpp
11827 + AMDGPUMCTargetDesc.cpp
11828 + AMDGPUMCAsmInfo.cpp
11829 + R600MCCodeEmitter.cpp
11830 + SIMCCodeEmitter.cpp
11833 +add_dependencies(LLVMR600Desc AMDGPUCommonTableGen)
11834 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/LLVMBuild.txt llvm-r600/lib/Target/R600/MCTargetDesc/LLVMBuild.txt
11835 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/LLVMBuild.txt 1970-01-01 01:00:00.000000000 +0100
11836 +++ llvm-r600/lib/Target/R600/MCTargetDesc/LLVMBuild.txt 2013-01-25 19:43:57.460049721 +0100
11838 +;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===;
11840 +; The LLVM Compiler Infrastructure
11842 +; This file is distributed under the University of Illinois Open Source
11843 +; License. See LICENSE.TXT for details.
11845 +;===------------------------------------------------------------------------===;
11847 +; This is an LLVMBuild description file for the components in this subdirectory.
11849 +; For more information on the LLVMBuild system, please see:
11851 +; http://llvm.org/docs/LLVMBuild.html
11853 +;===------------------------------------------------------------------------===;
11859 +required_libraries = R600AsmPrinter R600Info MC
11860 +add_to_library_groups = R600
11861 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/Makefile llvm-r600/lib/Target/R600/MCTargetDesc/Makefile
11862 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/Makefile 1970-01-01 01:00:00.000000000 +0100
11863 +++ llvm-r600/lib/Target/R600/MCTargetDesc/Makefile 2013-01-25 19:43:57.460049721 +0100
11865 +##===- lib/Target/AMDGPU/TargetDesc/Makefile ----------------*- Makefile -*-===##
11867 +# The LLVM Compiler Infrastructure
11869 +# This file is distributed under the University of Illinois Open Source
11870 +# License. See LICENSE.TXT for details.
11872 +##===----------------------------------------------------------------------===##
11874 +LEVEL = ../../../..
11875 +LIBRARYNAME = LLVMR600Desc
11877 +# Hack: we need to include 'main' target directory to grab private headers
11878 +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
11880 +include $(LEVEL)/Makefile.common
11881 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp llvm-r600/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
11882 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp 1970-01-01 01:00:00.000000000 +0100
11883 +++ llvm-r600/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp 2013-01-25 19:43:57.460049721 +0100
11885 +//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===//
11887 +// The LLVM Compiler Infrastructure
11889 +// This file is distributed under the University of Illinois Open Source
11890 +// License. See LICENSE.TXT for details.
11892 +//===----------------------------------------------------------------------===//
11896 +/// This code emitter outputs bytecode that is understood by the r600g driver
11897 +/// in the Mesa [1] project. The bytecode is very similar to the hardware's ISA,
11898 +/// but it still needs to be run through a finalizer in order to be executed
11901 +/// [1] http://www.mesa3d.org/
11903 +//===----------------------------------------------------------------------===//
11905 +#include "R600Defines.h"
11906 +#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
11907 +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
11908 +#include "llvm/MC/MCCodeEmitter.h"
11909 +#include "llvm/MC/MCContext.h"
11910 +#include "llvm/MC/MCInst.h"
11911 +#include "llvm/MC/MCInstrInfo.h"
11912 +#include "llvm/MC/MCRegisterInfo.h"
11913 +#include "llvm/MC/MCSubtargetInfo.h"
11914 +#include "llvm/Support/raw_ostream.h"
11916 +#include <stdio.h>
11918 +#define SRC_BYTE_COUNT 11
11919 +#define DST_BYTE_COUNT 5
11921 +using namespace llvm;
11925 +class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
11926 + R600MCCodeEmitter(const R600MCCodeEmitter &); // DO NOT IMPLEMENT
11927 + void operator=(const R600MCCodeEmitter &); // DO NOT IMPLEMENT
11928 + const MCInstrInfo &MCII;
11929 + const MCRegisterInfo &MRI;
11930 + const MCSubtargetInfo &STI;
11935 + R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
11936 + const MCSubtargetInfo &sti, MCContext &ctx)
11937 + : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { }
11939 + /// \brief Encode the instruction and write it to the OS.
11940 + virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
11941 + SmallVectorImpl<MCFixup> &Fixups) const;
11943 + /// \returns the encoding for an MCOperand.
11944 + virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
11945 + SmallVectorImpl<MCFixup> &Fixups) const;
11948 + void EmitALUInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups,
11949 + raw_ostream &OS) const;
11950 + void EmitSrc(const MCInst &MI, unsigned OpIdx, raw_ostream &OS) const;
11951 + void EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, unsigned SelOpIdx,
11952 + raw_ostream &OS) const;
11953 + void EmitDst(const MCInst &MI, raw_ostream &OS) const;
11954 + void EmitTexInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups,
11955 + raw_ostream &OS) const;
11956 + void EmitFCInstr(const MCInst &MI, raw_ostream &OS) const;
11958 + void EmitNullBytes(unsigned int byteCount, raw_ostream &OS) const;
11960 + void EmitByte(unsigned int byte, raw_ostream &OS) const;
11962 + void EmitTwoBytes(uint32_t bytes, raw_ostream &OS) const;
11964 + void Emit(uint32_t value, raw_ostream &OS) const;
11965 + void Emit(uint64_t value, raw_ostream &OS) const;
11967 + unsigned getHWRegChan(unsigned reg) const;
11968 + unsigned getHWReg(unsigned regNo) const;
11970 + bool isFCOp(unsigned opcode) const;
11971 + bool isTexOp(unsigned opcode) const;
11972 + bool isFlagSet(const MCInst &MI, unsigned Operand, unsigned Flag) const;
11976 +} // End anonymous namespace
11995 + FC_IF_PREDICATE = 0,
12000 + FC_BREAK_PREDICATE,
12004 +enum TextureTypes {
12010 + TEXTURE_SHADOW1D,
12011 + TEXTURE_SHADOW2D,
12012 + TEXTURE_SHADOWRECT,
12013 + TEXTURE_1D_ARRAY,
12014 + TEXTURE_2D_ARRAY,
12015 + TEXTURE_SHADOW1D_ARRAY,
12016 + TEXTURE_SHADOW2D_ARRAY
12019 +MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
12020 + const MCRegisterInfo &MRI,
12021 + const MCSubtargetInfo &STI,
12022 + MCContext &Ctx) {
12023 + return new R600MCCodeEmitter(MCII, MRI, STI, Ctx);
12026 +void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
12027 + SmallVectorImpl<MCFixup> &Fixups) const {
12028 + if (isTexOp(MI.getOpcode())) {
12029 + EmitTexInstr(MI, Fixups, OS);
12030 + } else if (isFCOp(MI.getOpcode())){
12031 + EmitFCInstr(MI, OS);
12032 + } else if (MI.getOpcode() == AMDGPU::RETURN ||
12033 + MI.getOpcode() == AMDGPU::BUNDLE ||
12034 + MI.getOpcode() == AMDGPU::KILL) {
12037 + switch(MI.getOpcode()) {
12038 + case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
12039 + case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
12040 + uint64_t inst = getBinaryCodeForInstr(MI, Fixups);
12041 + EmitByte(INSTR_NATIVE, OS);
12045 + case AMDGPU::CONSTANT_LOAD_eg:
12046 + case AMDGPU::VTX_READ_PARAM_8_eg:
12047 + case AMDGPU::VTX_READ_PARAM_16_eg:
12048 + case AMDGPU::VTX_READ_PARAM_32_eg:
12049 + case AMDGPU::VTX_READ_GLOBAL_8_eg:
12050 + case AMDGPU::VTX_READ_GLOBAL_32_eg:
12051 + case AMDGPU::VTX_READ_GLOBAL_128_eg:
12052 + case AMDGPU::TEX_VTX_CONSTBUF: {
12053 + uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
12054 + uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
12056 + EmitByte(INSTR_VTX, OS);
12057 + Emit(InstWord01, OS);
12058 + Emit(InstWord2, OS);
12061 + case AMDGPU::EG_ExportSwz:
12062 + case AMDGPU::R600_ExportSwz:
12063 + case AMDGPU::EG_ExportBuf:
12064 + case AMDGPU::R600_ExportBuf: {
12065 + uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
12066 + EmitByte(INSTR_EXPORT, OS);
12072 + EmitALUInstr(MI, Fixups, OS);
12078 +void R600MCCodeEmitter::EmitALUInstr(const MCInst &MI,
12079 + SmallVectorImpl<MCFixup> &Fixups,
12080 + raw_ostream &OS) const {
12081 + const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode());
12083 + // Emit instruction type
12084 + EmitByte(INSTR_ALU, OS);
12086 + uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
12088 + // Older ALUs have a different encoding for instructions with one or two sources.
12090 + if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) &&
12091 + !(MCDesc.TSFlags & R600_InstFlag::OP3)) {
12092 + uint64_t ISAOpCode = InstWord01 & (0x3FFULL << 39);
12093 + InstWord01 &= ~(0x3FFULL << 39);
12094 + InstWord01 |= ISAOpCode << 1;
12097 + unsigned SrcNum = MCDesc.TSFlags & R600_InstFlag::OP3 ? 3 :
12098 + MCDesc.TSFlags & R600_InstFlag::OP2 ? 2 : 1;
12100 + EmitByte(SrcNum, OS);
12102 + const unsigned SrcOps[3][2] = {
12103 + {R600Operands::SRC0, R600Operands::SRC0_SEL},
12104 + {R600Operands::SRC1, R600Operands::SRC1_SEL},
12105 + {R600Operands::SRC2, R600Operands::SRC2_SEL}
12108 + for (unsigned SrcIdx = 0; SrcIdx < SrcNum; ++SrcIdx) {
12109 + unsigned RegOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][0]];
12110 + unsigned SelOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][1]];
12111 + EmitSrcISA(MI, RegOpIdx, SelOpIdx, OS);
12114 + Emit(InstWord01, OS);
12118 +void R600MCCodeEmitter::EmitSrc(const MCInst &MI, unsigned OpIdx,
12119 + raw_ostream &OS) const {
12120 + const MCOperand &MO = MI.getOperand(OpIdx);
12126 + // Emit the source select (2 bytes). For GPRs, this is the register index.
12127 + // For other potential instruction operands, (e.g. constant registers) the
12128 + // value of the source select is defined in the r600isa docs.
12129 + if (MO.isReg()) {
12130 + unsigned reg = MO.getReg();
12131 + EmitTwoBytes(getHWReg(reg), OS);
12132 + if (reg == AMDGPU::ALU_LITERAL_X) {
12133 + unsigned ImmOpIndex = MI.getNumOperands() - 1;
12134 + MCOperand ImmOp = MI.getOperand(ImmOpIndex);
12135 + if (ImmOp.isFPImm()) {
12136 + Value.f = ImmOp.getFPImm();
12138 + assert(ImmOp.isImm());
12139 + Value.i = ImmOp.getImm();
12143 + // XXX: Handle other operand types.
12144 + EmitTwoBytes(0, OS);
12147 + // Emit the source channel (1 byte)
12148 + if (MO.isReg()) {
12149 + EmitByte(getHWRegChan(MO.getReg()), OS);
12154 + // XXX: Emit isNegated (1 byte)
12155 + if ((!(isFlagSet(MI, OpIdx, MO_FLAG_ABS)))
12156 + && (isFlagSet(MI, OpIdx, MO_FLAG_NEG) ||
12158 + (MO.getReg() == AMDGPU::NEG_ONE || MO.getReg() == AMDGPU::NEG_HALF)))){
12164 + // Emit isAbsolute (1 byte)
12165 + if (isFlagSet(MI, OpIdx, MO_FLAG_ABS)) {
12171 + // XXX: Emit relative addressing mode (1 byte)
12174 + // Emit kc_bank, This will be adjusted later by r600_asm
12177 + // Emit the literal value, if applicable (4 bytes).
12178 + Emit(Value.i, OS);
12182 +void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned RegOpIdx,
12183 + unsigned SelOpIdx, raw_ostream &OS) const {
12184 + const MCOperand &RegMO = MI.getOperand(RegOpIdx);
12185 + const MCOperand &SelMO = MI.getOperand(SelOpIdx);
12190 + } InlineConstant;
12191 + InlineConstant.i = 0;
12192 + // Emit source type (1 byte) and source select (4 bytes). For GPRs type is 0
12193 + // and select is 0 (GPR index is encoded in the instr encoding. For constants
12194 + // type is 1 and select is the original const select passed from the driver.
12195 + unsigned Reg = RegMO.getReg();
12196 + if (Reg == AMDGPU::ALU_CONST) {
12198 + uint32_t Sel = SelMO.getImm();
12202 + Emit((uint32_t)0, OS);
12205 + if (Reg == AMDGPU::ALU_LITERAL_X) {
12206 + unsigned ImmOpIndex = MI.getNumOperands() - 1;
12207 + MCOperand ImmOp = MI.getOperand(ImmOpIndex);
12208 + if (ImmOp.isFPImm()) {
12209 + InlineConstant.f = ImmOp.getFPImm();
12211 + assert(ImmOp.isImm());
12212 + InlineConstant.i = ImmOp.getImm();
12216 + // Emit the literal value, if applicable (4 bytes).
12217 + Emit(InlineConstant.i, OS);
12220 +void R600MCCodeEmitter::EmitTexInstr(const MCInst &MI,
12221 + SmallVectorImpl<MCFixup> &Fixups,
12222 + raw_ostream &OS) const {
12224 + unsigned Opcode = MI.getOpcode();
12225 + bool hasOffsets = (Opcode == AMDGPU::TEX_LD);
12226 + unsigned OpOffset = hasOffsets ? 3 : 0;
12227 + int64_t Resource = MI.getOperand(OpOffset + 2).getImm();
12228 + int64_t Sampler = MI.getOperand(OpOffset + 3).getImm();
12229 + int64_t TextureType = MI.getOperand(OpOffset + 4).getImm();
12230 + unsigned srcSelect[4] = {0, 1, 2, 3};
12232 + // Emit instruction type
12235 + // Emit instruction
12236 + EmitByte(getBinaryCodeForInstr(MI, Fixups), OS);
12238 + // Emit resource id
12239 + EmitByte(Resource, OS);
12241 + // Emit source register
12242 + EmitByte(getHWReg(MI.getOperand(1).getReg()), OS);
12244 + // XXX: Emit src isRelativeAddress
12247 + // Emit destination register
12248 + EmitByte(getHWReg(MI.getOperand(0).getReg()), OS);
12250 + // XXX: Emit dst isRelativeAddress
12253 + // XXX: Emit dst select
12254 + EmitByte(0, OS); // X
12255 + EmitByte(1, OS); // Y
12256 + EmitByte(2, OS); // Z
12257 + EmitByte(3, OS); // W
12259 + // XXX: Emit lod bias
12262 + // XXX: Emit coord types
12263 + unsigned coordType[4] = {1, 1, 1, 1};
12265 + if (TextureType == TEXTURE_RECT
12266 + || TextureType == TEXTURE_SHADOWRECT) {
12267 + coordType[ELEMENT_X] = 0;
12268 + coordType[ELEMENT_Y] = 0;
12271 + if (TextureType == TEXTURE_1D_ARRAY
12272 + || TextureType == TEXTURE_SHADOW1D_ARRAY) {
12273 + if (Opcode == AMDGPU::TEX_SAMPLE_C_L || Opcode == AMDGPU::TEX_SAMPLE_C_LB) {
12274 + coordType[ELEMENT_Y] = 0;
12276 + coordType[ELEMENT_Z] = 0;
12277 + srcSelect[ELEMENT_Z] = ELEMENT_Y;
12279 + } else if (TextureType == TEXTURE_2D_ARRAY
12280 + || TextureType == TEXTURE_SHADOW2D_ARRAY) {
12281 + coordType[ELEMENT_Z] = 0;
12284 + for (unsigned i = 0; i < 4; i++) {
12285 + EmitByte(coordType[i], OS);
12288 + // XXX: Emit offsets
12290 + for (unsigned i = 2; i < 5; i++)
12291 + EmitByte(MI.getOperand(i).getImm()<<1, OS);
12293 + EmitNullBytes(3, OS);
12295 + // Emit sampler id
12296 + EmitByte(Sampler, OS);
12298 + // XXX:Emit source select
12299 + if ((TextureType == TEXTURE_SHADOW1D
12300 + || TextureType == TEXTURE_SHADOW2D
12301 + || TextureType == TEXTURE_SHADOWRECT
12302 + || TextureType == TEXTURE_SHADOW1D_ARRAY)
12303 + && Opcode != AMDGPU::TEX_SAMPLE_C_L
12304 + && Opcode != AMDGPU::TEX_SAMPLE_C_LB) {
12305 + srcSelect[ELEMENT_W] = ELEMENT_Z;
12308 + for (unsigned i = 0; i < 4; i++) {
12309 + EmitByte(srcSelect[i], OS);
12313 +void R600MCCodeEmitter::EmitFCInstr(const MCInst &MI, raw_ostream &OS) const {
12315 + // Emit instruction type
12316 + EmitByte(INSTR_FC, OS);
12319 + unsigned NumOperands = MI.getNumOperands();
12320 + if (NumOperands > 0) {
12321 + assert(NumOperands == 1);
12322 + EmitSrc(MI, 0, OS);
12324 + EmitNullBytes(SRC_BYTE_COUNT, OS);
12327 + // Emit FC Instruction
12328 + enum FCInstr instr;
12329 + switch (MI.getOpcode()) {
12330 + case AMDGPU::PREDICATED_BREAK:
12331 + instr = FC_BREAK_PREDICATE;
12333 + case AMDGPU::CONTINUE:
12334 + instr = FC_CONTINUE;
12336 + case AMDGPU::IF_PREDICATE_SET:
12337 + instr = FC_IF_PREDICATE;
12339 + case AMDGPU::ELSE:
12342 + case AMDGPU::ENDIF:
12343 + instr = FC_ENDIF;
12345 + case AMDGPU::ENDLOOP:
12346 + instr = FC_ENDLOOP;
12348 + case AMDGPU::WHILELOOP:
12349 + instr = FC_BGNLOOP;
12355 + EmitByte(instr, OS);
12358 +void R600MCCodeEmitter::EmitNullBytes(unsigned int ByteCount,
12359 + raw_ostream &OS) const {
12361 + for (unsigned int i = 0; i < ByteCount; i++) {
12366 +void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const {
12367 + OS.write((uint8_t) Byte & 0xff);
12370 +void R600MCCodeEmitter::EmitTwoBytes(unsigned int Bytes,
12371 + raw_ostream &OS) const {
12372 + OS.write((uint8_t) (Bytes & 0xff));
12373 + OS.write((uint8_t) ((Bytes >> 8) & 0xff));
12376 +void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
12377 + for (unsigned i = 0; i < 4; i++) {
12378 + OS.write((uint8_t) ((Value >> (8 * i)) & 0xff));
12382 +void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const {
12383 + for (unsigned i = 0; i < 8; i++) {
12384 + EmitByte((Value >> (8 * i)) & 0xff, OS);
12388 +unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const {
12389 + return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT;
12392 +unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
12393 + return MRI.getEncodingValue(RegNo) & HW_REG_MASK;
12396 +uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
12397 + const MCOperand &MO,
12398 + SmallVectorImpl<MCFixup> &Fixup) const {
12399 + if (MO.isReg()) {
12400 + if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) {
12401 + return MRI.getEncodingValue(MO.getReg());
12403 + return getHWReg(MO.getReg());
12405 + } else if (MO.isImm()) {
12406 + return MO.getImm();
12413 +//===----------------------------------------------------------------------===//
12414 +// Encoding helper functions
12415 +//===----------------------------------------------------------------------===//
12417 +bool R600MCCodeEmitter::isFCOp(unsigned opcode) const {
12419 + default: return false;
12420 + case AMDGPU::PREDICATED_BREAK:
12421 + case AMDGPU::CONTINUE:
12422 + case AMDGPU::IF_PREDICATE_SET:
12423 + case AMDGPU::ELSE:
12424 + case AMDGPU::ENDIF:
12425 + case AMDGPU::ENDLOOP:
12426 + case AMDGPU::WHILELOOP:
12431 +bool R600MCCodeEmitter::isTexOp(unsigned opcode) const {
12433 + default: return false;
12434 + case AMDGPU::TEX_LD:
12435 + case AMDGPU::TEX_GET_TEXTURE_RESINFO:
12436 + case AMDGPU::TEX_SAMPLE:
12437 + case AMDGPU::TEX_SAMPLE_C:
12438 + case AMDGPU::TEX_SAMPLE_L:
12439 + case AMDGPU::TEX_SAMPLE_C_L:
12440 + case AMDGPU::TEX_SAMPLE_LB:
12441 + case AMDGPU::TEX_SAMPLE_C_LB:
12442 + case AMDGPU::TEX_SAMPLE_G:
12443 + case AMDGPU::TEX_SAMPLE_C_G:
12444 + case AMDGPU::TEX_GET_GRADIENTS_H:
12445 + case AMDGPU::TEX_GET_GRADIENTS_V:
12446 + case AMDGPU::TEX_SET_GRADIENTS_H:
12447 + case AMDGPU::TEX_SET_GRADIENTS_V:
12452 +bool R600MCCodeEmitter::isFlagSet(const MCInst &MI, unsigned Operand,
12453 + unsigned Flag) const {
12454 + const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode());
12455 + unsigned FlagIndex = GET_FLAG_OPERAND_IDX(MCDesc.TSFlags);
12456 + if (FlagIndex == 0) {
12459 + assert(MI.getOperand(FlagIndex).isImm());
12460 + return !!((MI.getOperand(FlagIndex).getImm() >>
12461 + (NUM_MO_FLAGS * Operand)) & Flag);
12464 +#include "AMDGPUGenMCCodeEmitter.inc"
12465 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp llvm-r600/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
12466 --- llvm-3.2.src/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp 1970-01-01 01:00:00.000000000 +0100
12467 +++ llvm-r600/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp 2013-01-25 19:43:57.460049721 +0100
12469 +//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===//
12471 +// The LLVM Compiler Infrastructure
12473 +// This file is distributed under the University of Illinois Open Source
12474 +// License. See LICENSE.TXT for details.
12476 +//===----------------------------------------------------------------------===//
12479 +/// \brief The SI code emitter produces machine code that can be executed
12480 +/// directly on the GPU device.
12482 +//===----------------------------------------------------------------------===//
12484 +#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
12485 +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
12486 +#include "llvm/MC/MCCodeEmitter.h"
12487 +#include "llvm/MC/MCContext.h"
12488 +#include "llvm/MC/MCInst.h"
12489 +#include "llvm/MC/MCInstrInfo.h"
12490 +#include "llvm/MC/MCRegisterInfo.h"
12491 +#include "llvm/MC/MCSubtargetInfo.h"
12492 +#include "llvm/MC/MCFixup.h"
12493 +#include "llvm/Support/raw_ostream.h"
12495 +#define VGPR_BIT(src_idx) (1ULL << (9 * src_idx - 1))
12496 +#define SI_INSTR_FLAGS_ENCODING_MASK 0xf
12498 +// These must be kept in sync with SIInstructions.td and also the
12499 +// InstrEncodingInfo array in SIInstrInfo.cpp.
12501 +// NOTE: This enum is only used to identify the encoding type within LLVM,
12502 +// the actual encoding type that is part of the instruction format is different
12503 +namespace SIInstrEncodingType {
12524 +using namespace llvm;
12527 +class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
12528 + SIMCCodeEmitter(const SIMCCodeEmitter &); // DO NOT IMPLEMENT
12529 + void operator=(const SIMCCodeEmitter &); // DO NOT IMPLEMENT
12530 + const MCInstrInfo &MCII;
12531 + const MCRegisterInfo &MRI;
12532 + const MCSubtargetInfo &STI;
12536 + SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
12537 + const MCSubtargetInfo &sti, MCContext &ctx)
12538 + : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { }
12540 + ~SIMCCodeEmitter() { }
12542 + /// \brief Encode the instruction and write it to the OS.
12543 + virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
12544 + SmallVectorImpl<MCFixup> &Fixups) const;
12546 + /// \returns the encoding for an MCOperand.
12547 + virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
12548 + SmallVectorImpl<MCFixup> &Fixups) const;
12552 + /// \brief Encode a sequence of registers with the correct alignment.
12553 + unsigned GPRAlign(const MCInst &MI, unsigned OpNo, unsigned shift) const;
12555 + /// \brief Encoding for when 2 consecutive registers are used
12556 + virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo,
12557 + SmallVectorImpl<MCFixup> &Fixup) const;
12559 + /// \brief Encoding for when 4 consecutive registers are used
12560 + virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo,
12561 + SmallVectorImpl<MCFixup> &Fixup) const;
12563 + /// \brief Encoding for SMRD indexed loads
12564 + virtual uint32_t SMRDmemriEncode(const MCInst &MI, unsigned OpNo,
12565 + SmallVectorImpl<MCFixup> &Fixup) const;
12567 + /// \brief Post-Encoder method for VOP instructions
12568 + virtual uint64_t VOPPostEncode(const MCInst &MI, uint64_t Value) const;
12572 + /// \returns the SIInstrEncodingType for this instruction.
12573 + unsigned getEncodingType(const MCInst &MI) const;
12575 + /// \brief Get the size in bytes of this instruction's encoding.
12576 + unsigned getEncodingBytes(const MCInst &MI) const;
12578 + /// \returns the hardware encoding for a register
12579 + unsigned getRegBinaryCode(unsigned reg) const;
12581 + /// \brief Generated function that returns the hardware encoding for
12583 + unsigned getHWRegNum(unsigned reg) const;
12587 +} // End anonymous namespace
12589 +MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
12590 + const MCRegisterInfo &MRI,
12591 + const MCSubtargetInfo &STI,
12592 + MCContext &Ctx) {
12593 + return new SIMCCodeEmitter(MCII, MRI, STI, Ctx);
12596 +void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
12597 + SmallVectorImpl<MCFixup> &Fixups) const {
12598 + uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups);
12599 + unsigned bytes = getEncodingBytes(MI);
12600 + for (unsigned i = 0; i < bytes; i++) {
12601 + OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff));
12605 +uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
12606 + const MCOperand &MO,
12607 + SmallVectorImpl<MCFixup> &Fixups) const {
12608 + if (MO.isReg()) {
12609 + return getRegBinaryCode(MO.getReg());
12610 + } else if (MO.isImm()) {
12611 + return MO.getImm();
12612 + } else if (MO.isFPImm()) {
12613 + // XXX: Not all instructions can use inline literals
12614 + // XXX: We should make sure this is a 32-bit constant
12619 + Imm.F = MO.getFPImm();
12621 + } else if (MO.isExpr()) {
12622 + const MCExpr *Expr = MO.getExpr();
12623 + MCFixupKind Kind = MCFixupKind(FK_PCRel_4);
12624 + Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
12627 + llvm_unreachable("Encoding of this operand type is not supported yet.");
12632 +//===----------------------------------------------------------------------===//
12633 +// Custom Operand Encodings
12634 +//===----------------------------------------------------------------------===//
12636 +unsigned SIMCCodeEmitter::GPRAlign(const MCInst &MI, unsigned OpNo,
12637 + unsigned shift) const {
12638 + unsigned regCode = getRegBinaryCode(MI.getOperand(OpNo).getReg());
12639 + return regCode >> shift;
12642 +unsigned SIMCCodeEmitter::GPR2AlignEncode(const MCInst &MI,
12644 + SmallVectorImpl<MCFixup> &Fixup) const {
12645 + return GPRAlign(MI, OpNo, 1);
12648 +unsigned SIMCCodeEmitter::GPR4AlignEncode(const MCInst &MI,
12650 + SmallVectorImpl<MCFixup> &Fixup) const {
12651 + return GPRAlign(MI, OpNo, 2);
12654 +#define SMRD_OFFSET_MASK 0xff
12655 +#define SMRD_IMM_SHIFT 8
12656 +#define SMRD_SBASE_MASK 0x3f
12657 +#define SMRD_SBASE_SHIFT 9
12658 +/// This function is responsible for encoding the offset
12659 +/// and the base ptr for SMRD instructions it should return a bit string in
12662 +/// OFFSET = bits{7-0}
12664 +/// SBASE = bits{14-9}
12666 +uint32_t SIMCCodeEmitter::SMRDmemriEncode(const MCInst &MI, unsigned OpNo,
12667 + SmallVectorImpl<MCFixup> &Fixup) const {
12668 + uint32_t Encoding;
12670 + const MCOperand &OffsetOp = MI.getOperand(OpNo + 1);
12672 + //XXX: Use this function for SMRD loads with register offsets
12673 + assert(OffsetOp.isImm());
12676 + (getMachineOpValue(MI, OffsetOp, Fixup) & SMRD_OFFSET_MASK)
12677 + | (1 << SMRD_IMM_SHIFT) //XXX If the Offset is a register we shouldn't set this bit
12678 + | ((GPR2AlignEncode(MI, OpNo, Fixup) & SMRD_SBASE_MASK) << SMRD_SBASE_SHIFT)
12684 +//===----------------------------------------------------------------------===//
12685 +// Post Encoder Callbacks
12686 +//===----------------------------------------------------------------------===//
12688 +uint64_t SIMCCodeEmitter::VOPPostEncode(const MCInst &MI, uint64_t Value) const{
12689 + unsigned encodingType = getEncodingType(MI);
12690 + unsigned numSrcOps;
12691 + unsigned vgprBitOffset;
12693 + if (encodingType == SIInstrEncodingType::VOP3) {
12695 + vgprBitOffset = 32;
12698 + vgprBitOffset = 0;
12701 + // Add one to skip over the destination reg operand.
12702 + for (unsigned opIdx = 1; opIdx < numSrcOps + 1; opIdx++) {
12703 + const MCOperand &MO = MI.getOperand(opIdx);
12704 + if (MO.isReg()) {
12705 + unsigned reg = MI.getOperand(opIdx).getReg();
12706 + if (AMDGPUMCRegisterClasses[AMDGPU::VReg_32RegClassID].contains(reg) ||
12707 + AMDGPUMCRegisterClasses[AMDGPU::VReg_64RegClassID].contains(reg)) {
12708 + Value |= (VGPR_BIT(opIdx)) << vgprBitOffset;
12710 + } else if (MO.isFPImm()) {
12715 + // XXX: Not all instructions can use inline literals
12716 + // XXX: We should make sure this is a 32-bit constant
12717 + Imm.f = MO.getFPImm();
12718 + Value |= ((uint64_t)Imm.i) << 32;
12724 +//===----------------------------------------------------------------------===//
12725 +// Encoding helper functions
12726 +//===----------------------------------------------------------------------===//
12728 +unsigned SIMCCodeEmitter::getEncodingType(const MCInst &MI) const {
12729 + return MCII.get(MI.getOpcode()).TSFlags & SI_INSTR_FLAGS_ENCODING_MASK;
12732 +unsigned SIMCCodeEmitter::getEncodingBytes(const MCInst &MI) const {
12734 + // These instructions aren't real instructions with an encoding type, so
12735 + // we need to manually specify their size.
12736 + switch (MI.getOpcode()) {
12738 + case AMDGPU::SI_LOAD_LITERAL_I32:
12739 + case AMDGPU::SI_LOAD_LITERAL_F32:
12743 + unsigned encoding_type = getEncodingType(MI);
12744 + switch (encoding_type) {
12745 + case SIInstrEncodingType::EXP:
12746 + case SIInstrEncodingType::LDS:
12747 + case SIInstrEncodingType::MUBUF:
12748 + case SIInstrEncodingType::MTBUF:
12749 + case SIInstrEncodingType::MIMG:
12750 + case SIInstrEncodingType::VOP3:
12758 +unsigned SIMCCodeEmitter::getRegBinaryCode(unsigned reg) const {
12760 + case AMDGPU::M0: return 124;
12761 + case AMDGPU::SREG_LIT_0: return 128;
12762 + case AMDGPU::SI_LITERAL_CONSTANT: return 255;
12763 + default: return MRI.getEncodingValue(reg);
12767 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/Processors.td llvm-r600/lib/Target/R600/Processors.td
12768 --- llvm-3.2.src/lib/Target/R600/Processors.td 1970-01-01 01:00:00.000000000 +0100
12769 +++ llvm-r600/lib/Target/R600/Processors.td 2013-01-25 19:43:57.460049721 +0100
12771 +//===-- Processors.td - TODO: Add brief description -------===//
12773 +// The LLVM Compiler Infrastructure
12775 +// This file is distributed under the University of Illinois Open Source
12776 +// License. See LICENSE.TXT for details.
12778 +//===----------------------------------------------------------------------===//
12780 +// AMDIL processors supported.
12782 +//===----------------------------------------------------------------------===//
12784 +class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features>
12785 +: Processor<Name, itin, Features>;
12786 +def : Proc<"r600", R600_EG_Itin, [FeatureR600ALUInst]>;
12787 +def : Proc<"rv710", R600_EG_Itin, []>;
12788 +def : Proc<"rv730", R600_EG_Itin, []>;
12789 +def : Proc<"rv770", R600_EG_Itin, [FeatureFP64]>;
12790 +def : Proc<"cedar", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
12791 +def : Proc<"redwood", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
12792 +def : Proc<"juniper", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
12793 +def : Proc<"cypress", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
12794 +def : Proc<"barts", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
12795 +def : Proc<"turks", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
12796 +def : Proc<"caicos", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
12797 +def : Proc<"cayman", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
12798 +def : Proc<"SI", SI_Itin, [Feature64BitPtr]>;
12800 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600Defines.h llvm-r600/lib/Target/R600/R600Defines.h
12801 --- llvm-3.2.src/lib/Target/R600/R600Defines.h 1970-01-01 01:00:00.000000000 +0100
12802 +++ llvm-r600/lib/Target/R600/R600Defines.h 2013-01-25 19:43:57.460049721 +0100
12804 +//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===//
12806 +// The LLVM Compiler Infrastructure
12808 +// This file is distributed under the University of Illinois Open Source
12809 +// License. See LICENSE.TXT for details.
12812 +//===----------------------------------------------------------------------===//
12814 +#ifndef R600DEFINES_H_
12815 +#define R600DEFINES_H_
12817 +#include "llvm/MC/MCRegisterInfo.h"
12820 +#define MO_FLAG_CLAMP (1 << 0)
12821 +#define MO_FLAG_NEG (1 << 1)
12822 +#define MO_FLAG_ABS (1 << 2)
12823 +#define MO_FLAG_MASK (1 << 3)
12824 +#define MO_FLAG_PUSH (1 << 4)
12825 +#define MO_FLAG_NOT_LAST (1 << 5)
12826 +#define MO_FLAG_LAST (1 << 6)
12827 +#define NUM_MO_FLAGS 7
12829 +/// \brief Helper for getting the operand index for the instruction flags
12831 +#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3)
12833 +namespace R600_InstFlag {
12835 + TRANS_ONLY = (1 << 0),
12837 + REDUCTION = (1 << 2),
12841 + VECTOR = (1 << 6),
12842 + //FlagOperand bits 7, 8
12843 + NATIVE_OPERANDS = (1 << 9),
12849 +#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS)
12851 +/// \brief Defines for extracting register information from register encoding
12852 +#define HW_REG_MASK 0x1ff
12853 +#define HW_CHAN_SHIFT 9
12855 +namespace R600Operands {
12858 + UPDATE_EXEC_MASK,
12859 + UPDATE_PREDICATE,
12884 + const static int ALUOpTable[3][R600Operands::COUNT] = {
12885 +// W C S S S S S S S S S S S
12886 +// R O D L S R R R R S R R R R S R R R L P
12887 +// D U I M R A R C C C C R C C C C R C C C A R I
12888 +// S E U T O E M C 0 0 0 0 C 1 1 1 1 C 2 2 2 S E M
12889 +// T M P E D L P 0 N R A S 1 N R A S 2 N R S T D M
12890 + {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,11,12},
12891 + {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,13,14,15,16,-1,-1,-1,-1,17,18,19},
12892 + {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17}
12897 +#endif // R600DEFINES_H_
12898 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600ExpandSpecialInstrs.cpp llvm-r600/lib/Target/R600/R600ExpandSpecialInstrs.cpp
12899 --- llvm-3.2.src/lib/Target/R600/R600ExpandSpecialInstrs.cpp 1970-01-01 01:00:00.000000000 +0100
12900 +++ llvm-r600/lib/Target/R600/R600ExpandSpecialInstrs.cpp 2013-01-25 19:43:57.463383054 +0100
12902 +//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===//
12904 +// The LLVM Compiler Infrastructure
12906 +// This file is distributed under the University of Illinois Open Source
12907 +// License. See LICENSE.TXT for details.
12909 +//===----------------------------------------------------------------------===//
12912 +/// Vector, Reduction, and Cube instructions need to fill the entire instruction
12913 +/// group to work correctly. This pass expands these individual instructions
12914 +/// into several instructions that will completely fill the instruction group.
12916 +//===----------------------------------------------------------------------===//
12918 +#include "AMDGPU.h"
12919 +#include "R600Defines.h"
12920 +#include "R600InstrInfo.h"
12921 +#include "R600RegisterInfo.h"
12922 +#include "R600MachineFunctionInfo.h"
12923 +#include "llvm/CodeGen/MachineFunctionPass.h"
12924 +#include "llvm/CodeGen/MachineInstrBuilder.h"
12925 +#include "llvm/CodeGen/MachineRegisterInfo.h"
12927 +using namespace llvm;
12931 +class R600ExpandSpecialInstrsPass : public MachineFunctionPass {
12935 + const R600InstrInfo *TII;
12937 + bool ExpandInputPerspective(MachineInstr& MI);
12938 + bool ExpandInputConstant(MachineInstr& MI);
12941 + R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
12942 + TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { }
12944 + virtual bool runOnMachineFunction(MachineFunction &MF);
12946 + const char *getPassName() const {
12947 + return "R600 Expand special instructions pass";
12951 +} // End anonymous namespace
12953 +char R600ExpandSpecialInstrsPass::ID = 0;
12955 +FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
12956 + return new R600ExpandSpecialInstrsPass(TM);
12959 +bool R600ExpandSpecialInstrsPass::ExpandInputPerspective(MachineInstr &MI) {
12960 + const R600RegisterInfo &TRI = TII->getRegisterInfo();
12961 + if (MI.getOpcode() != AMDGPU::input_perspective)
12964 + MachineBasicBlock::iterator I = &MI;
12965 + unsigned DstReg = MI.getOperand(0).getReg();
12966 + R600MachineFunctionInfo *MFI = MI.getParent()->getParent()
12967 + ->getInfo<R600MachineFunctionInfo>();
12968 + unsigned IJIndexBase;
12970 + // In Evergreen ISA doc section 8.3.2 :
12971 + // We need to interpolate XY and ZW in two different instruction groups.
12972 + // An INTERP_* must occupy all 4 slots of an instruction group.
12973 + // Output of INTERP_XY is written in X,Y slots
12974 + // Output of INTERP_ZW is written in Z,W slots
12976 + // Thus interpolation requires the following sequences :
12978 + // AnyGPR.x = INTERP_ZW; (Write Masked Out)
12979 + // AnyGPR.y = INTERP_ZW; (Write Masked Out)
12980 + // DstGPR.z = INTERP_ZW;
12981 + // DstGPR.w = INTERP_ZW; (End of first IG)
12982 + // DstGPR.x = INTERP_XY;
12983 + // DstGPR.y = INTERP_XY;
12984 + // AnyGPR.z = INTERP_XY; (Write Masked Out)
12985 + // AnyGPR.w = INTERP_XY; (Write Masked Out) (End of second IG)
12987 + switch (MI.getOperand(1).getImm()) {
12989 + IJIndexBase = MFI->GetIJPerspectiveIndex();
12992 + IJIndexBase = MFI->GetIJLinearIndex();
12995 + assert(0 && "Unknow ij index");
12998 + for (unsigned i = 0; i < 8; i++) {
12999 + unsigned IJIndex = AMDGPU::R600_TReg32RegClass.getRegister(
13000 + 2 * IJIndexBase + ((i + 1) % 2));
13001 + unsigned ReadReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
13002 + MI.getOperand(2).getImm());
13005 + unsigned Sel = AMDGPU::sel_x;
13007 + case 0:Sel = AMDGPU::sel_x;break;
13008 + case 1:Sel = AMDGPU::sel_y;break;
13009 + case 2:Sel = AMDGPU::sel_z;break;
13010 + case 3:Sel = AMDGPU::sel_w;break;
13014 + unsigned Res = TRI.getSubReg(DstReg, Sel);
13016 + unsigned Opcode = (i < 4)?AMDGPU::INTERP_ZW:AMDGPU::INTERP_XY;
13018 + MachineBasicBlock &MBB = *(MI.getParent());
13019 + MachineInstr *NewMI =
13020 + TII->buildDefaultInstruction(MBB, I, Opcode, Res, IJIndex, ReadReg);
13022 + if (!(i> 1 && i < 6)) {
13023 + TII->addFlag(NewMI, 0, MO_FLAG_MASK);
13027 + TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
13030 + MI.eraseFromParent();
13035 +bool R600ExpandSpecialInstrsPass::ExpandInputConstant(MachineInstr &MI) {
13036 + const R600RegisterInfo &TRI = TII->getRegisterInfo();
13037 + if (MI.getOpcode() != AMDGPU::input_constant)
13040 + MachineBasicBlock::iterator I = &MI;
13041 + unsigned DstReg = MI.getOperand(0).getReg();
13043 + for (unsigned i = 0; i < 4; i++) {
13044 + unsigned ReadReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
13045 + MI.getOperand(1).getImm());
13047 + unsigned Sel = AMDGPU::sel_x;
13049 + case 0:Sel = AMDGPU::sel_x;break;
13050 + case 1:Sel = AMDGPU::sel_y;break;
13051 + case 2:Sel = AMDGPU::sel_z;break;
13052 + case 3:Sel = AMDGPU::sel_w;break;
13056 + unsigned Res = TRI.getSubReg(DstReg, Sel);
13058 + MachineBasicBlock &MBB = *(MI.getParent());
13059 + MachineInstr *NewMI = TII->buildDefaultInstruction(
13060 + MBB, I, AMDGPU::INTERP_LOAD_P0, Res, ReadReg);
13063 + TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
13066 + MI.eraseFromParent();
13071 +bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
13073 + const R600RegisterInfo &TRI = TII->getRegisterInfo();
13075 + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
13076 + BB != BB_E; ++BB) {
13077 + MachineBasicBlock &MBB = *BB;
13078 + MachineBasicBlock::iterator I = MBB.begin();
13079 + while (I != MBB.end()) {
13080 + MachineInstr &MI = *I;
13081 + I = llvm::next(I);
13083 + switch (MI.getOpcode()) {
13085 + // Expand PRED_X to one of the PRED_SET instructions.
13086 + case AMDGPU::PRED_X: {
13087 + uint64_t Flags = MI.getOperand(3).getImm();
13088 + // The native opcode used by PRED_X is stored as an immediate in the
13089 + // third operand.
13090 + MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
13091 + MI.getOperand(2).getImm(), // opcode
13092 + MI.getOperand(0).getReg(), // dst
13093 + MI.getOperand(1).getReg(), // src0
13094 + AMDGPU::ZERO); // src1
13095 + TII->addFlag(PredSet, 0, MO_FLAG_MASK);
13096 + if (Flags & MO_FLAG_PUSH) {
13097 + TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1);
13099 + TII->setImmOperand(PredSet, R600Operands::UPDATE_PREDICATE, 1);
13101 + MI.eraseFromParent();
13104 + case AMDGPU::BREAK:
13105 + MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
13106 + AMDGPU::PRED_SETE_INT,
13107 + AMDGPU::PREDICATE_BIT,
13110 + TII->addFlag(PredSet, 0, MO_FLAG_MASK);
13111 + TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1);
13113 + BuildMI(MBB, I, MBB.findDebugLoc(I),
13114 + TII->get(AMDGPU::PREDICATED_BREAK))
13115 + .addReg(AMDGPU::PREDICATE_BIT);
13116 + MI.eraseFromParent();
13120 + if (ExpandInputPerspective(MI))
13122 + if (ExpandInputConstant(MI))
13125 + bool IsReduction = TII->isReductionOp(MI.getOpcode());
13126 + bool IsVector = TII->isVector(MI);
13127 + bool IsCube = TII->isCubeOp(MI.getOpcode());
13128 + if (!IsReduction && !IsVector && !IsCube) {
13132 + // Expand the instruction
13134 + // Reduction instructions:
13135 + // T0_X = DP4 T1_XYZW, T2_XYZW
13137 + // TO_X = DP4 T1_X, T2_X
13138 + // TO_Y (write masked) = DP4 T1_Y, T2_Y
13139 + // TO_Z (write masked) = DP4 T1_Z, T2_Z
13140 + // TO_W (write masked) = DP4 T1_W, T2_W
13142 + // Vector instructions:
13143 + // T0_X = MULLO_INT T1_X, T2_X
13145 + // T0_X = MULLO_INT T1_X, T2_X
13146 + // T0_Y (write masked) = MULLO_INT T1_X, T2_X
13147 + // T0_Z (write masked) = MULLO_INT T1_X, T2_X
13148 + // T0_W (write masked) = MULLO_INT T1_X, T2_X
13150 + // Cube instructions:
13151 + // T0_XYZW = CUBE T1_XYZW
13153 + // TO_X = CUBE T1_Z, T1_Y
13154 + // T0_Y = CUBE T1_Z, T1_X
13155 + // T0_Z = CUBE T1_X, T1_Z
13156 + // T0_W = CUBE T1_Y, T1_Z
13157 + for (unsigned Chan = 0; Chan < 4; Chan++) {
13158 + unsigned DstReg = MI.getOperand(
13159 + TII->getOperandIdx(MI, R600Operands::DST)).getReg();
13160 + unsigned Src0 = MI.getOperand(
13161 + TII->getOperandIdx(MI, R600Operands::SRC0)).getReg();
13162 + unsigned Src1 = 0;
13164 + // Determine the correct source registers
13166 + int Src1Idx = TII->getOperandIdx(MI, R600Operands::SRC1);
13167 + if (Src1Idx != -1) {
13168 + Src1 = MI.getOperand(Src1Idx).getReg();
13171 + if (IsReduction) {
13172 + unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
13173 + Src0 = TRI.getSubReg(Src0, SubRegIndex);
13174 + Src1 = TRI.getSubReg(Src1, SubRegIndex);
13175 + } else if (IsCube) {
13176 + static const int CubeSrcSwz[] = {2, 2, 0, 1};
13177 + unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]);
13178 + unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
13179 + Src1 = TRI.getSubReg(Src0, SubRegIndex1);
13180 + Src0 = TRI.getSubReg(Src0, SubRegIndex0);
13183 + // Determine the correct destination registers;
13184 + bool Mask = false;
13185 + bool NotLast = true;
13187 + unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
13188 + DstReg = TRI.getSubReg(DstReg, SubRegIndex);
13190 + // Mask the write if the original instruction does not write to
13191 + // the current Channel.
13192 + Mask = (Chan != TRI.getHWRegChan(DstReg));
13193 + unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
13194 + DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
13197 + // Set the IsLast bit
13198 + NotLast = (Chan != 3 );
13200 + // Add the new instruction
13201 + unsigned Opcode = MI.getOpcode();
13202 + switch (Opcode) {
13203 + case AMDGPU::CUBE_r600_pseudo:
13204 + Opcode = AMDGPU::CUBE_r600_real;
13206 + case AMDGPU::CUBE_eg_pseudo:
13207 + Opcode = AMDGPU::CUBE_eg_real;
13209 + case AMDGPU::DOT4_r600_pseudo:
13210 + Opcode = AMDGPU::DOT4_r600_real;
13212 + case AMDGPU::DOT4_eg_pseudo:
13213 + Opcode = AMDGPU::DOT4_eg_real;
13219 + MachineInstr *NewMI =
13220 + TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1);
13222 + NewMI->setIsInsideBundle(Chan != 0);
13224 + TII->addFlag(NewMI, 0, MO_FLAG_MASK);
13227 + TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
13230 + MI.eraseFromParent();
13235 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600InstrInfo.cpp llvm-r600/lib/Target/R600/R600InstrInfo.cpp
13236 --- llvm-3.2.src/lib/Target/R600/R600InstrInfo.cpp 1970-01-01 01:00:00.000000000 +0100
13237 +++ llvm-r600/lib/Target/R600/R600InstrInfo.cpp 2013-01-25 19:43:57.466716387 +0100
13239 +//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===//
13241 +// The LLVM Compiler Infrastructure
13243 +// This file is distributed under the University of Illinois Open Source
13244 +// License. See LICENSE.TXT for details.
13246 +//===----------------------------------------------------------------------===//
13249 +/// \brief R600 Implementation of TargetInstrInfo.
13251 +//===----------------------------------------------------------------------===//
13253 +#include "R600InstrInfo.h"
13254 +#include "AMDGPUTargetMachine.h"
13255 +#include "AMDGPUSubtarget.h"
13256 +#include "R600Defines.h"
13257 +#include "R600RegisterInfo.h"
13258 +#include "llvm/CodeGen/MachineInstrBuilder.h"
13260 +#define GET_INSTRINFO_CTOR
13261 +#include "AMDGPUGenDFAPacketizer.inc"
13263 +using namespace llvm;
13265 +R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm)
13266 + : AMDGPUInstrInfo(tm),
13270 +const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const {
13274 +bool R600InstrInfo::isTrig(const MachineInstr &MI) const {
13275 + return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG;
13278 +bool R600InstrInfo::isVector(const MachineInstr &MI) const {
13279 + return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR;
13283 +R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
13284 + MachineBasicBlock::iterator MI, DebugLoc DL,
13285 + unsigned DestReg, unsigned SrcReg,
13286 + bool KillSrc) const {
13287 + if (AMDGPU::R600_Reg128RegClass.contains(DestReg)
13288 + && AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
13289 + for (unsigned I = 0; I < 4; I++) {
13290 + unsigned SubRegIndex = RI.getSubRegFromChannel(I);
13291 + buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
13292 + RI.getSubReg(DestReg, SubRegIndex),
13293 + RI.getSubReg(SrcReg, SubRegIndex))
13295 + RegState::Define | RegState::Implicit);
13299 + // We can't copy vec4 registers
13300 + assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg)
13301 + && !AMDGPU::R600_Reg128RegClass.contains(SrcReg));
13303 + MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
13304 + DestReg, SrcReg);
13305 + NewMI->getOperand(getOperandIdx(*NewMI, R600Operands::SRC0))
13306 + .setIsKill(KillSrc);
13310 +MachineInstr * R600InstrInfo::getMovImmInstr(MachineFunction *MF,
13311 + unsigned DstReg, int64_t Imm) const {
13312 + MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::MOV), DebugLoc());
13313 + MachineInstrBuilder(MI).addReg(DstReg, RegState::Define);
13314 + MachineInstrBuilder(MI).addReg(AMDGPU::ALU_LITERAL_X);
13315 + MachineInstrBuilder(MI).addImm(Imm);
13316 + MachineInstrBuilder(MI).addReg(0); // PREDICATE_BIT
13321 +unsigned R600InstrInfo::getIEQOpcode() const {
13322 + return AMDGPU::SETE_INT;
13325 +bool R600InstrInfo::isMov(unsigned Opcode) const {
13329 + default: return false;
13330 + case AMDGPU::MOV:
13331 + case AMDGPU::MOV_IMM_F32:
13332 + case AMDGPU::MOV_IMM_I32:
13337 +// Some instructions act as place holders to emulate operations that the GPU
13338 +// hardware does automatically. This function can be used to check if
13339 +// an opcode falls into this category.
13340 +bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const {
13341 + switch (Opcode) {
13342 + default: return false;
13343 + case AMDGPU::RETURN:
13344 + case AMDGPU::RESERVE_REG:
13349 +bool R600InstrInfo::isReductionOp(unsigned Opcode) const {
13351 + default: return false;
13352 + case AMDGPU::DOT4_r600_pseudo:
13353 + case AMDGPU::DOT4_eg_pseudo:
13358 +bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
13360 + default: return false;
13361 + case AMDGPU::CUBE_r600_pseudo:
13362 + case AMDGPU::CUBE_r600_real:
13363 + case AMDGPU::CUBE_eg_pseudo:
13364 + case AMDGPU::CUBE_eg_real:
13369 +bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
13370 + unsigned TargetFlags = get(Opcode).TSFlags;
13372 + return ((TargetFlags & R600_InstFlag::OP1) |
13373 + (TargetFlags & R600_InstFlag::OP2) |
13374 + (TargetFlags & R600_InstFlag::OP3));
13377 +DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM,
13378 + const ScheduleDAG *DAG) const {
13379 + const InstrItineraryData *II = TM->getInstrItineraryData();
13380 + return TM->getSubtarget<AMDGPUSubtarget>().createDFAPacketizer(II);
13384 +isPredicateSetter(unsigned Opcode) {
13385 + switch (Opcode) {
13386 + case AMDGPU::PRED_X:
13393 +static MachineInstr *
13394 +findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
13395 + MachineBasicBlock::iterator I) {
13396 + while (I != MBB.begin()) {
13398 + MachineInstr *MI = I;
13399 + if (isPredicateSetter(MI->getOpcode()))
13407 +R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
13408 + MachineBasicBlock *&TBB,
13409 + MachineBasicBlock *&FBB,
13410 + SmallVectorImpl<MachineOperand> &Cond,
13411 + bool AllowModify) const {
13412 + // Most of the following comes from the ARM implementation of AnalyzeBranch
13414 + // If the block has no terminators, it just falls into the block after it.
13415 + MachineBasicBlock::iterator I = MBB.end();
13416 + if (I == MBB.begin())
13419 + while (I->isDebugValue()) {
13420 + if (I == MBB.begin())
13424 + if (static_cast<MachineInstr *>(I)->getOpcode() != AMDGPU::JUMP) {
13428 + // Get the last instruction in the block.
13429 + MachineInstr *LastInst = I;
13431 + // If there is only one terminator instruction, process it.
13432 + unsigned LastOpc = LastInst->getOpcode();
13433 + if (I == MBB.begin() ||
13434 + static_cast<MachineInstr *>(--I)->getOpcode() != AMDGPU::JUMP) {
13435 + if (LastOpc == AMDGPU::JUMP) {
13436 + if(!isPredicated(LastInst)) {
13437 + TBB = LastInst->getOperand(0).getMBB();
13440 + MachineInstr *predSet = I;
13441 + while (!isPredicateSetter(predSet->getOpcode())) {
13444 + TBB = LastInst->getOperand(0).getMBB();
13445 + Cond.push_back(predSet->getOperand(1));
13446 + Cond.push_back(predSet->getOperand(2));
13447 + Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
13451 + return true; // Can't handle indirect branch.
13454 + // Get the instruction before it if it is a terminator.
13455 + MachineInstr *SecondLastInst = I;
13456 + unsigned SecondLastOpc = SecondLastInst->getOpcode();
13458 + // If the block ends with a B and a Bcc, handle it.
13459 + if (SecondLastOpc == AMDGPU::JUMP &&
13460 + isPredicated(SecondLastInst) &&
13461 + LastOpc == AMDGPU::JUMP &&
13462 + !isPredicated(LastInst)) {
13463 + MachineInstr *predSet = --I;
13464 + while (!isPredicateSetter(predSet->getOpcode())) {
13467 + TBB = SecondLastInst->getOperand(0).getMBB();
13468 + FBB = LastInst->getOperand(0).getMBB();
13469 + Cond.push_back(predSet->getOperand(1));
13470 + Cond.push_back(predSet->getOperand(2));
13471 + Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
13475 + // Otherwise, can't handle this.
13479 +int R600InstrInfo::getBranchInstr(const MachineOperand &op) const {
13480 + const MachineInstr *MI = op.getParent();
13482 + switch (MI->getDesc().OpInfo->RegClass) {
13483 + default: // FIXME: fallthrough??
13484 + case AMDGPU::GPRI32RegClassID: return AMDGPU::BRANCH_COND_i32;
13485 + case AMDGPU::GPRF32RegClassID: return AMDGPU::BRANCH_COND_f32;
13490 +R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
13491 + MachineBasicBlock *TBB,
13492 + MachineBasicBlock *FBB,
13493 + const SmallVectorImpl<MachineOperand> &Cond,
13494 + DebugLoc DL) const {
13495 + assert(TBB && "InsertBranch must not be told to insert a fallthrough");
13498 + if (Cond.empty()) {
13499 + BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB).addReg(0);
13502 + MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
13503 + assert(PredSet && "No previous predicate !");
13504 + addFlag(PredSet, 0, MO_FLAG_PUSH);
13505 + PredSet->getOperand(2).setImm(Cond[1].getImm());
13507 + BuildMI(&MBB, DL, get(AMDGPU::JUMP))
13509 + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
13513 + MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
13514 + assert(PredSet && "No previous predicate !");
13515 + addFlag(PredSet, 0, MO_FLAG_PUSH);
13516 + PredSet->getOperand(2).setImm(Cond[1].getImm());
13517 + BuildMI(&MBB, DL, get(AMDGPU::JUMP))
13519 + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
13520 + BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB).addReg(0);
13526 +R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
13528 + // Note : we leave PRED* instructions there.
13529 + // They may be needed when predicating instructions.
13531 + MachineBasicBlock::iterator I = MBB.end();
13533 + if (I == MBB.begin()) {
13537 + switch (I->getOpcode()) {
13540 + case AMDGPU::JUMP:
13541 + if (isPredicated(I)) {
13542 + MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
13543 + clearFlag(predSet, 0, MO_FLAG_PUSH);
13545 + I->eraseFromParent();
13550 + if (I == MBB.begin()) {
13554 + switch (I->getOpcode()) {
13555 + // FIXME: only one case??
13558 + case AMDGPU::JUMP:
13559 + if (isPredicated(I)) {
13560 + MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
13561 + clearFlag(predSet, 0, MO_FLAG_PUSH);
13563 + I->eraseFromParent();
13570 +R600InstrInfo::isPredicated(const MachineInstr *MI) const {
13571 + int idx = MI->findFirstPredOperandIdx();
13575 + unsigned Reg = MI->getOperand(idx).getReg();
13577 + default: return false;
13578 + case AMDGPU::PRED_SEL_ONE:
13579 + case AMDGPU::PRED_SEL_ZERO:
13580 + case AMDGPU::PREDICATE_BIT:
13586 +R600InstrInfo::isPredicable(MachineInstr *MI) const {
13587 + // XXX: KILL* instructions can be predicated, but they must be the last
13588 + // instruction in a clause, so this means any instructions after them cannot
13589 + // be predicated. Until we have proper support for instruction clauses in the
13590 + // backend, we will mark KILL* instructions as unpredicable.
13592 + if (MI->getOpcode() == AMDGPU::KILLGT) {
13595 + return AMDGPUInstrInfo::isPredicable(MI);
13601 +R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
13602 + unsigned NumCyles,
13603 + unsigned ExtraPredCycles,
13604 + const BranchProbability &Probability) const{
13609 +R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
13610 + unsigned NumTCycles,
13611 + unsigned ExtraTCycles,
13612 + MachineBasicBlock &FMBB,
13613 + unsigned NumFCycles,
13614 + unsigned ExtraFCycles,
13615 + const BranchProbability &Probability) const {
13620 +R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
13621 + unsigned NumCyles,
13622 + const BranchProbability &Probability)
13628 +R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
13629 + MachineBasicBlock &FMBB) const {
13635 +R600InstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
13636 + MachineOperand &MO = Cond[1];
13637 + switch (MO.getImm()) {
13638 + case OPCODE_IS_ZERO_INT:
13639 + MO.setImm(OPCODE_IS_NOT_ZERO_INT);
13641 + case OPCODE_IS_NOT_ZERO_INT:
13642 + MO.setImm(OPCODE_IS_ZERO_INT);
13644 + case OPCODE_IS_ZERO:
13645 + MO.setImm(OPCODE_IS_NOT_ZERO);
13647 + case OPCODE_IS_NOT_ZERO:
13648 + MO.setImm(OPCODE_IS_ZERO);
13654 + MachineOperand &MO2 = Cond[2];
13655 + switch (MO2.getReg()) {
13656 + case AMDGPU::PRED_SEL_ZERO:
13657 + MO2.setReg(AMDGPU::PRED_SEL_ONE);
13659 + case AMDGPU::PRED_SEL_ONE:
13660 + MO2.setReg(AMDGPU::PRED_SEL_ZERO);
13669 +R600InstrInfo::DefinesPredicate(MachineInstr *MI,
13670 + std::vector<MachineOperand> &Pred) const {
13671 + return isPredicateSetter(MI->getOpcode());
13676 +R600InstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
13677 + const SmallVectorImpl<MachineOperand> &Pred2) const {
13683 +R600InstrInfo::PredicateInstruction(MachineInstr *MI,
13684 + const SmallVectorImpl<MachineOperand> &Pred) const {
13685 + int PIdx = MI->findFirstPredOperandIdx();
13687 + if (PIdx != -1) {
13688 + MachineOperand &PMO = MI->getOperand(PIdx);
13689 + PMO.setReg(Pred[2].getReg());
13690 + MachineInstrBuilder(MI).addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
13697 +unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
13698 + const MachineInstr *MI,
13699 + unsigned *PredCost) const {
13705 +MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB,
13706 + MachineBasicBlock::iterator I,
13709 + unsigned Src0Reg,
13710 + unsigned Src1Reg) const {
13711 + MachineInstrBuilder MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode),
13715 + MIB.addImm(0) // $update_exec_mask
13716 + .addImm(0); // $update_predicate
13718 + MIB.addImm(1) // $write
13719 + .addImm(0) // $omod
13720 + .addImm(0) // $dst_rel
13721 + .addImm(0) // $dst_clamp
13722 + .addReg(Src0Reg) // $src0
13723 + .addImm(0) // $src0_neg
13724 + .addImm(0) // $src0_rel
13725 + .addImm(0) // $src0_abs
13726 + .addImm(-1); // $src0_sel
13729 + MIB.addReg(Src1Reg) // $src1
13730 + .addImm(0) // $src1_neg
13731 + .addImm(0) // $src1_rel
13732 + .addImm(0) // $src1_abs
13733 + .addImm(-1); // $src1_sel
13736 + //XXX: The r600g finalizer expects this to be 1, once we've moved the
13737 + //scheduling to the backend, we can change the default to 0.
13738 + MIB.addImm(1) // $last
13739 + .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel
13740 + .addImm(0); // $literal
13745 +MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
13746 + MachineBasicBlock::iterator I,
13748 + uint64_t Imm) const {
13749 + MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg,
13750 + AMDGPU::ALU_LITERAL_X);
13751 + setImmOperand(MovImm, R600Operands::IMM, Imm);
13755 +int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
13756 + R600Operands::Ops Op) const {
13757 + return getOperandIdx(MI.getOpcode(), Op);
13760 +int R600InstrInfo::getOperandIdx(unsigned Opcode,
13761 + R600Operands::Ops Op) const {
13762 + unsigned TargetFlags = get(Opcode).TSFlags;
13763 + unsigned OpTableIdx;
13765 + if (!HAS_NATIVE_OPERANDS(TargetFlags)) {
13767 + case R600Operands::DST: return 0;
13768 + case R600Operands::SRC0: return 1;
13769 + case R600Operands::SRC1: return 2;
13770 + case R600Operands::SRC2: return 3;
13772 + assert(!"Unknown operand type for instruction");
13777 + if (TargetFlags & R600_InstFlag::OP1) {
13779 + } else if (TargetFlags & R600_InstFlag::OP2) {
13782 + assert((TargetFlags & R600_InstFlag::OP3) && "OP1, OP2, or OP3 not defined "
13783 + "for this instruction");
13787 + return R600Operands::ALUOpTable[OpTableIdx][Op];
13790 +void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op,
13791 + int64_t Imm) const {
13792 + int Idx = getOperandIdx(*MI, Op);
13793 + assert(Idx != -1 && "Operand not supported for this instruction.");
13794 + assert(MI->getOperand(Idx).isImm());
13795 + MI->getOperand(Idx).setImm(Imm);
13798 +//===----------------------------------------------------------------------===//
13799 +// Instruction flag getters/setters
13800 +//===----------------------------------------------------------------------===//
13802 +bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const {
13803 + return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0;
13806 +MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
13807 + unsigned Flag) const {
13808 + unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
13809 + int FlagIndex = 0;
13811 + // If we pass something other than the default value of Flag to this
13812 + // function, it means we want to set a flag on an instruction
13813 + // that uses native encoding.
13814 + assert(HAS_NATIVE_OPERANDS(TargetFlags));
13815 + bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3;
13817 + case MO_FLAG_CLAMP:
13818 + FlagIndex = getOperandIdx(*MI, R600Operands::CLAMP);
13820 + case MO_FLAG_MASK:
13821 + FlagIndex = getOperandIdx(*MI, R600Operands::WRITE);
13823 + case MO_FLAG_NOT_LAST:
13824 + case MO_FLAG_LAST:
13825 + FlagIndex = getOperandIdx(*MI, R600Operands::LAST);
13827 + case MO_FLAG_NEG:
13828 + switch (SrcIdx) {
13829 + case 0: FlagIndex = getOperandIdx(*MI, R600Operands::SRC0_NEG); break;
13830 + case 1: FlagIndex = getOperandIdx(*MI, R600Operands::SRC1_NEG); break;
13831 + case 2: FlagIndex = getOperandIdx(*MI, R600Operands::SRC2_NEG); break;
13835 + case MO_FLAG_ABS:
13836 + assert(!IsOP3 && "Cannot set absolute value modifier for OP3 "
13837 + "instructions.");
13838 + switch (SrcIdx) {
13839 + case 0: FlagIndex = getOperandIdx(*MI, R600Operands::SRC0_ABS); break;
13840 + case 1: FlagIndex = getOperandIdx(*MI, R600Operands::SRC1_ABS); break;
13848 + assert(FlagIndex != -1 && "Flag not supported for this instruction");
13850 + FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags);
13851 + assert(FlagIndex != 0 &&
13852 + "Instruction flags not supported for this instruction");
13855 + MachineOperand &FlagOp = MI->getOperand(FlagIndex);
13856 + assert(FlagOp.isImm());
13860 +void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand,
13861 + unsigned Flag) const {
13862 + unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
13866 + if (HAS_NATIVE_OPERANDS(TargetFlags)) {
13867 + MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag);
13868 + if (Flag == MO_FLAG_NOT_LAST) {
13869 + clearFlag(MI, Operand, MO_FLAG_LAST);
13870 + } else if (Flag == MO_FLAG_MASK) {
13871 + clearFlag(MI, Operand, Flag);
13873 + FlagOp.setImm(1);
13876 + MachineOperand &FlagOp = getFlagOp(MI, Operand);
13877 + FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand)));
13881 +void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand,
13882 + unsigned Flag) const {
13883 + unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
13884 + if (HAS_NATIVE_OPERANDS(TargetFlags)) {
13885 + MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag);
13886 + FlagOp.setImm(0);
13888 + MachineOperand &FlagOp = getFlagOp(MI);
13889 + unsigned InstFlags = FlagOp.getImm();
13890 + InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand));
13891 + FlagOp.setImm(InstFlags);
13894 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600InstrInfo.h llvm-r600/lib/Target/R600/R600InstrInfo.h
13895 --- llvm-3.2.src/lib/Target/R600/R600InstrInfo.h 1970-01-01 01:00:00.000000000 +0100
13896 +++ llvm-r600/lib/Target/R600/R600InstrInfo.h 2013-01-25 19:43:57.466716387 +0100
13898 +//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===//
13900 +// The LLVM Compiler Infrastructure
13902 +// This file is distributed under the University of Illinois Open Source
13903 +// License. See LICENSE.TXT for details.
13905 +//===----------------------------------------------------------------------===//
13908 +/// \brief Interface definition for R600InstrInfo
13910 +//===----------------------------------------------------------------------===//
13912 +#ifndef R600INSTRUCTIONINFO_H_
13913 +#define R600INSTRUCTIONINFO_H_
13915 +#include "AMDIL.h"
13916 +#include "AMDGPUInstrInfo.h"
13917 +#include "R600Defines.h"
13918 +#include "R600RegisterInfo.h"
13924 + class AMDGPUTargetMachine;
13925 + class DFAPacketizer;
13926 + class ScheduleDAG;
13927 + class MachineFunction;
13928 + class MachineInstr;
13929 + class MachineInstrBuilder;
13931 + class R600InstrInfo : public AMDGPUInstrInfo {
13933 + const R600RegisterInfo RI;
13935 + int getBranchInstr(const MachineOperand &op) const;
13938 + explicit R600InstrInfo(AMDGPUTargetMachine &tm);
13940 + const R600RegisterInfo &getRegisterInfo() const;
13941 + virtual void copyPhysReg(MachineBasicBlock &MBB,
13942 + MachineBasicBlock::iterator MI, DebugLoc DL,
13943 + unsigned DestReg, unsigned SrcReg,
13944 + bool KillSrc) const;
13946 + bool isTrig(const MachineInstr &MI) const;
13947 + bool isPlaceHolderOpcode(unsigned opcode) const;
13948 + bool isReductionOp(unsigned opcode) const;
13949 + bool isCubeOp(unsigned opcode) const;
13951 + /// \returns true if this \p Opcode represents an ALU instruction.
13952 + bool isALUInstr(unsigned Opcode) const;
13954 + /// \brief Vector instructions are instructions that must fill all
13955 + /// instruction slots within an instruction group.
13956 + bool isVector(const MachineInstr &MI) const;
13958 + virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg,
13959 + int64_t Imm) const;
13961 + virtual unsigned getIEQOpcode() const;
13962 + virtual bool isMov(unsigned Opcode) const;
13964 + DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM,
13965 + const ScheduleDAG *DAG) const;
13967 + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
13969 + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
13970 + SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const;
13972 + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const;
13974 + unsigned RemoveBranch(MachineBasicBlock &MBB) const;
13976 + bool isPredicated(const MachineInstr *MI) const;
13978 + bool isPredicable(MachineInstr *MI) const;
13981 + isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
13982 + const BranchProbability &Probability) const;
13984 + bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
13985 + unsigned ExtraPredCycles,
13986 + const BranchProbability &Probability) const ;
13989 + isProfitableToIfCvt(MachineBasicBlock &TMBB,
13990 + unsigned NumTCycles, unsigned ExtraTCycles,
13991 + MachineBasicBlock &FMBB,
13992 + unsigned NumFCycles, unsigned ExtraFCycles,
13993 + const BranchProbability &Probability) const;
13995 + bool DefinesPredicate(MachineInstr *MI,
13996 + std::vector<MachineOperand> &Pred) const;
13998 + bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
13999 + const SmallVectorImpl<MachineOperand> &Pred2) const;
14001 + bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
14002 + MachineBasicBlock &FMBB) const;
14004 + bool PredicateInstruction(MachineInstr *MI,
14005 + const SmallVectorImpl<MachineOperand> &Pred) const;
14007 + unsigned int getInstrLatency(const InstrItineraryData *ItinData,
14008 + const MachineInstr *MI,
14009 + unsigned *PredCost = 0) const;
14011 + virtual int getInstrLatency(const InstrItineraryData *ItinData,
14012 + SDNode *Node) const { return 1;}
14014 + /// You can use this function to avoid manually specifying each instruction
14015 + /// modifier operand when building a new instruction.
14017 + /// \returns a MachineInstr with all the instruction modifiers initialized
14018 + /// to their default values.
14019 + MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB,
14020 + MachineBasicBlock::iterator I,
14023 + unsigned Src0Reg,
14024 + unsigned Src1Reg = 0) const;
14026 + MachineInstr *buildMovImm(MachineBasicBlock &BB,
14027 + MachineBasicBlock::iterator I,
14029 + uint64_t Imm) const;
14031 + /// \brief Get the index of Op in the MachineInstr.
14033 + /// \returns -1 if the Instruction does not contain the specified \p Op.
14034 + int getOperandIdx(const MachineInstr &MI, R600Operands::Ops Op) const;
14036 + /// \brief Get the index of \p Op for the given Opcode.
14038 + /// \returns -1 if the Instruction does not contain the specified \p Op.
14039 + int getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const;
14041 + /// \brief Helper function for setting instruction flag values.
14042 + void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) const;
14044 + /// \returns true if this instruction has an operand for storing target flags.
14045 + bool hasFlagOperand(const MachineInstr &MI) const;
14047 + ///\brief Add one of the MO_FLAG* flags to the specified \p Operand.
14048 + void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
14050 + ///\brief Determine if the specified \p Flag is set on this \p Operand.
14051 + bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const;
14053 + /// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2)
14054 + /// \param Flag The flag being set.
14056 + /// \returns the operand containing the flags for this instruction.
14057 + MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0,
14058 + unsigned Flag = 0) const;
14060 + /// \brief Clear the specified flag on the instruction.
14061 + void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
14064 +} // End llvm namespace
14066 +#endif // R600INSTRINFO_H_
14067 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600Instructions.td llvm-r600/lib/Target/R600/R600Instructions.td
14068 --- llvm-3.2.src/lib/Target/R600/R600Instructions.td 1970-01-01 01:00:00.000000000 +0100
14069 +++ llvm-r600/lib/Target/R600/R600Instructions.td 2013-01-25 19:43:57.466716387 +0100
14071 +//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===//
14073 +// The LLVM Compiler Infrastructure
14075 +// This file is distributed under the University of Illinois Open Source
14076 +// License. See LICENSE.TXT for details.
14078 +//===----------------------------------------------------------------------===//
14080 +// R600 Tablegen instruction definitions
14082 +//===----------------------------------------------------------------------===//
14084 +include "R600Intrinsics.td"
14086 +class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern,
14087 + InstrItinClass itin>
14088 + : AMDGPUInst <outs, ins, asm, pattern> {
14090 + field bits<64> Inst;
14093 + bit isVector = 0;
14094 + bits<2> FlagOperandIdx = 0;
14097 + bit HasNativeOperands = 0;
14099 + bits<11> op_code = inst;
14100 + //let Inst = inst;
14101 + let Namespace = "AMDGPU";
14102 + let OutOperandList = outs;
14103 + let InOperandList = ins;
14104 + let AsmString = asm;
14105 + let Pattern = pattern;
14106 + let Itinerary = itin;
14108 + let TSFlags{4} = Trig;
14109 + let TSFlags{5} = Op3;
14111 + // Vector instructions are instructions that must fill all slots in an
14112 + // instruction group
14113 + let TSFlags{6} = isVector;
14114 + let TSFlags{8-7} = FlagOperandIdx;
14115 + let TSFlags{9} = HasNativeOperands;
14116 + let TSFlags{10} = Op1;
14117 + let TSFlags{11} = Op2;
14120 +class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> :
14121 + AMDGPUInst <outs, ins, asm, pattern> {
14122 + field bits<64> Inst;
14124 + let Namespace = "AMDGPU";
14127 +def MEMxi : Operand<iPTR> {
14128 + let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index);
14129 + let PrintMethod = "printMemOperand";
14132 +def MEMrr : Operand<iPTR> {
14133 + let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index);
14136 +// Operands for non-registers
14138 +class InstFlag<string PM = "printOperand", int Default = 0>
14139 + : OperandWithDefaultOps <i32, (ops (i32 Default))> {
14140 + let PrintMethod = PM;
14143 +// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers
14144 +def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> {
14145 + let PrintMethod = "printSel";
14148 +def LITERAL : InstFlag<"printLiteral">;
14150 +def WRITE : InstFlag <"printWrite", 1>;
14151 +def OMOD : InstFlag <"printOMOD">;
14152 +def REL : InstFlag <"printRel">;
14153 +def CLAMP : InstFlag <"printClamp">;
14154 +def NEG : InstFlag <"printNeg">;
14155 +def ABS : InstFlag <"printAbs">;
14156 +def UEM : InstFlag <"printUpdateExecMask">;
14157 +def UP : InstFlag <"printUpdatePred">;
14159 +// XXX: The r600g finalizer in Mesa expects last to be one in most cases.
14160 +// Once we start using the packetizer in this backend we should have this
14162 +def LAST : InstFlag<"printLast", 1>;
14164 +def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>;
14165 +def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
14166 +def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
14167 +def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>;
14168 +def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>;
14170 +class R600ALU_Word0 {
14171 + field bits<32> Word0;
14174 + bits<1> src0_neg;
14175 + bits<1> src0_rel;
14177 + bits<1> src1_rel;
14178 + bits<1> src1_neg;
14179 + bits<3> index_mode = 0;
14180 + bits<2> pred_sel;
14183 + bits<9> src0_sel = src0{8-0};
14184 + bits<2> src0_chan = src0{10-9};
14185 + bits<9> src1_sel = src1{8-0};
14186 + bits<2> src1_chan = src1{10-9};
14188 + let Word0{8-0} = src0_sel;
14189 + let Word0{9} = src0_rel;
14190 + let Word0{11-10} = src0_chan;
14191 + let Word0{12} = src0_neg;
14192 + let Word0{21-13} = src1_sel;
14193 + let Word0{22} = src1_rel;
14194 + let Word0{24-23} = src1_chan;
14195 + let Word0{25} = src1_neg;
14196 + let Word0{28-26} = index_mode;
14197 + let Word0{30-29} = pred_sel;
14198 + let Word0{31} = last;
14201 +class R600ALU_Word1 {
14202 + field bits<32> Word1;
14205 + bits<3> bank_swizzle = 0;
14209 + bits<7> dst_sel = dst{6-0};
14210 + bits<2> dst_chan = dst{10-9};
14212 + let Word1{20-18} = bank_swizzle;
14213 + let Word1{27-21} = dst_sel;
14214 + let Word1{28} = dst_rel;
14215 + let Word1{30-29} = dst_chan;
14216 + let Word1{31} = clamp;
14219 +class R600ALU_Word1_OP2 <bits<11> alu_inst> : R600ALU_Word1{
14221 + bits<1> src0_abs;
14222 + bits<1> src1_abs;
14223 + bits<1> update_exec_mask;
14224 + bits<1> update_pred;
14228 + let Word1{0} = src0_abs;
14229 + let Word1{1} = src1_abs;
14230 + let Word1{2} = update_exec_mask;
14231 + let Word1{3} = update_pred;
14232 + let Word1{4} = write;
14233 + let Word1{6-5} = omod;
14234 + let Word1{17-7} = alu_inst;
14237 +class R600ALU_Word1_OP3 <bits<5> alu_inst> : R600ALU_Word1{
14240 + bits<1> src2_rel;
14241 + bits<1> src2_neg;
14243 + bits<9> src2_sel = src2{8-0};
14244 + bits<2> src2_chan = src2{10-9};
14246 + let Word1{8-0} = src2_sel;
14247 + let Word1{9} = src2_rel;
14248 + let Word1{11-10} = src2_chan;
14249 + let Word1{12} = src2_neg;
14250 + let Word1{17-13} = alu_inst;
14254 + field bits<32> Word0;
14257 + bits<2> FETCH_TYPE;
14258 + bits<1> FETCH_WHOLE_QUAD;
14259 + bits<8> BUFFER_ID;
14261 + bits<2> SRC_SEL_X;
14262 + bits<6> MEGA_FETCH_COUNT;
14264 + let Word0{4-0} = VC_INST;
14265 + let Word0{6-5} = FETCH_TYPE;
14266 + let Word0{7} = FETCH_WHOLE_QUAD;
14267 + let Word0{15-8} = BUFFER_ID;
14268 + let Word0{22-16} = SRC_GPR;
14269 + let Word0{23} = SRC_REL;
14270 + let Word0{25-24} = SRC_SEL_X;
14271 + let Word0{31-26} = MEGA_FETCH_COUNT;
14274 +class VTX_WORD1_GPR {
14275 + field bits<32> Word1;
14278 + bits<3> DST_SEL_X;
14279 + bits<3> DST_SEL_Y;
14280 + bits<3> DST_SEL_Z;
14281 + bits<3> DST_SEL_W;
14282 + bits<1> USE_CONST_FIELDS;
14283 + bits<6> DATA_FORMAT;
14284 + bits<2> NUM_FORMAT_ALL;
14285 + bits<1> FORMAT_COMP_ALL;
14286 + bits<1> SRF_MODE_ALL;
14288 + let Word1{6-0} = DST_GPR;
14289 + let Word1{7} = DST_REL;
14290 + let Word1{8} = 0; // Reserved
14291 + let Word1{11-9} = DST_SEL_X;
14292 + let Word1{14-12} = DST_SEL_Y;
14293 + let Word1{17-15} = DST_SEL_Z;
14294 + let Word1{20-18} = DST_SEL_W;
14295 + let Word1{21} = USE_CONST_FIELDS;
14296 + let Word1{27-22} = DATA_FORMAT;
14297 + let Word1{29-28} = NUM_FORMAT_ALL;
14298 + let Word1{30} = FORMAT_COMP_ALL;
14299 + let Word1{31} = SRF_MODE_ALL;
14303 +XXX: R600 subtarget uses a slightly different encoding than the other
14304 +subtargets. We currently handle this in R600MCCodeEmitter, but we may
14305 +want to use these instruction classes in the future.
14307 +class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 {
14309 + bits<1> fog_merge;
14310 + bits<10> alu_inst;
14312 + let Inst{37} = fog_merge;
14313 + let Inst{39-38} = omod;
14314 + let Inst{49-40} = alu_inst;
14317 +class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 {
14319 + bits<11> alu_inst;
14321 + let Inst{38-37} = omod;
14322 + let Inst{49-39} = alu_inst;
14326 +def R600_Pred : PredicateOperand<i32, (ops R600_Predicate),
14327 + (ops PRED_SEL_OFF)>;
14330 +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
14332 +// Class for instructions with only one source register.
14333 +// If you add new ins to this instruction, make sure they are listed before
14334 +// $literal, because the backend currently assumes that the last operand is
14335 +// a literal. Also be sure to update the enum R600Op1OperandIndex::ROI in
14336 +// R600Defines.h, R600InstrInfo::buildDefaultInstruction(),
14337 +// and R600InstrInfo::getOperandIdx().
14338 +class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
14339 + InstrItinClass itin = AnyALU> :
14341 + (outs R600_Reg32:$dst),
14342 + (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
14343 + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
14344 + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
14345 + !strconcat(opName,
14346 + "$clamp $dst$write$dst_rel$omod, "
14347 + "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, "
14348 + "$literal $pred_sel$last"),
14352 + R600ALU_Word1_OP2 <inst> {
14355 + let src1_rel = 0;
14356 + let src1_neg = 0;
14357 + let src1_abs = 0;
14358 + let update_exec_mask = 0;
14359 + let update_pred = 0;
14360 + let HasNativeOperands = 1;
14362 + let DisableEncoding = "$literal";
14364 + let Inst{31-0} = Word0;
14365 + let Inst{63-32} = Word1;
14368 +class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
14369 + InstrItinClass itin = AnyALU> :
14370 + R600_1OP <inst, opName,
14371 + [(set R600_Reg32:$dst, (node R600_Reg32:$src0))]
14374 +// If you add or change the operands for R600_2OP instructions, you must
14375 +// also update the R600Op2OperandIndex::ROI enum in R600Defines.h,
14376 +// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx().
14377 +class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
14378 + InstrItinClass itin = AnyALU> :
14380 + (outs R600_Reg32:$dst),
14381 + (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write,
14382 + OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
14383 + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
14384 + R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel,
14385 + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
14386 + !strconcat(opName,
14387 + "$clamp $update_exec_mask$update_pred$dst$write$dst_rel$omod, "
14388 + "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, "
14389 + "$src1_neg$src1_abs$src1$src1_sel$src1_abs$src1_rel, "
14390 + "$literal $pred_sel$last"),
14394 + R600ALU_Word1_OP2 <inst> {
14396 + let HasNativeOperands = 1;
14398 + let DisableEncoding = "$literal";
14400 + let Inst{31-0} = Word0;
14401 + let Inst{63-32} = Word1;
14404 +class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
14405 + InstrItinClass itim = AnyALU> :
14406 + R600_2OP <inst, opName,
14407 + [(set R600_Reg32:$dst, (node R600_Reg32:$src0,
14408 + R600_Reg32:$src1))]
14411 +// If you add or change the operands for R600_3OP instructions, you must
14412 +// also update the R600Op3OperandIndex::ROI enum in R600Defines.h,
14413 +// R600InstrInfo::buildDefaultInstruction(), and
14414 +// R600InstrInfo::getOperandIdx().
14415 +class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
14416 + InstrItinClass itin = AnyALU> :
14418 + (outs R600_Reg32:$dst),
14419 + (ins REL:$dst_rel, CLAMP:$clamp,
14420 + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel,
14421 + R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel,
14422 + R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel,
14423 + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
14424 + !strconcat(opName, "$clamp $dst$dst_rel, "
14425 + "$src0_neg$src0$src0_sel$src0_rel, "
14426 + "$src1_neg$src1$src1_sel$src1_rel, "
14427 + "$src2_neg$src2$src2_sel$src2_rel, "
14428 + "$literal $pred_sel$last"),
14432 + R600ALU_Word1_OP3<inst>{
14434 + let HasNativeOperands = 1;
14435 + let DisableEncoding = "$literal";
14438 + let Inst{31-0} = Word0;
14439 + let Inst{63-32} = Word1;
14442 +class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
14443 + InstrItinClass itin = VecALU> :
14445 + (outs R600_Reg32:$dst),
14451 +class R600_TEX <bits<11> inst, string opName, list<dag> pattern,
14452 + InstrItinClass itin = AnyALU> :
14454 + (outs R600_Reg128:$dst),
14455 + (ins R600_Reg128:$src0, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
14456 + !strconcat(opName, "$dst, $src0, $resourceId, $samplerId, $textureTarget"),
14459 + let Inst {10-0} = inst;
14462 +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
14464 +def TEX_SHADOW : PatLeaf<
14466 + [{uint32_t TType = (uint32_t)N->getZExtValue();
14467 + return (TType >= 6 && TType <= 8) || (TType >= 11 && TType <= 13);
14471 +def TEX_RECT : PatLeaf<
14473 + [{uint32_t TType = (uint32_t)N->getZExtValue();
14474 + return TType == 5;
14478 +class EG_CF_RAT <bits <8> cf_inst, bits <6> rat_inst, bits<4> rat_id, dag outs,
14479 + dag ins, string asm, list<dag> pattern> :
14480 + InstR600ISA <outs, ins, asm, pattern> {
14482 + bits<7> INDEX_GPR;
14487 + bits<2> ELEM_SIZE;
14489 + bits<12> ARRAY_SIZE;
14490 + bits<4> COMP_MASK;
14491 + bits<4> BURST_COUNT;
14497 + // CF_ALLOC_EXPORT_WORD0_RAT
14498 + let Inst{3-0} = rat_id;
14499 + let Inst{9-4} = rat_inst;
14500 + let Inst{10} = 0; // Reserved
14501 + let Inst{12-11} = RIM;
14502 + let Inst{14-13} = TYPE;
14503 + let Inst{21-15} = RW_GPR;
14504 + let Inst{22} = RW_REL;
14505 + let Inst{29-23} = INDEX_GPR;
14506 + let Inst{31-30} = ELEM_SIZE;
14508 + // CF_ALLOC_EXPORT_WORD1_BUF
14509 + let Inst{43-32} = ARRAY_SIZE;
14510 + let Inst{47-44} = COMP_MASK;
14511 + let Inst{51-48} = BURST_COUNT;
14512 + let Inst{52} = VPM;
14513 + let Inst{53} = eop;
14514 + let Inst{61-54} = cf_inst;
14515 + let Inst{62} = MARK;
14516 + let Inst{63} = BARRIER;
14519 +class LoadParamFrag <PatFrag load_type> : PatFrag <
14520 + (ops node:$ptr), (load_type node:$ptr),
14521 + [{ return isParamLoad(dyn_cast<LoadSDNode>(N)); }]
14524 +def load_param : LoadParamFrag<load>;
14525 +def load_param_zexti8 : LoadParamFrag<zextloadi8>;
14526 +def load_param_zexti16 : LoadParamFrag<zextloadi16>;
14528 +def isR600 : Predicate<"Subtarget.device()"
14529 + "->getGeneration() == AMDGPUDeviceInfo::HD4XXX">;
14530 +def isR700 : Predicate<"Subtarget.device()"
14531 + "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&"
14532 + "Subtarget.device()->getDeviceFlag()"
14533 + ">= OCL_DEVICE_RV710">;
14534 +def isEG : Predicate<
14535 + "Subtarget.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX && "
14536 + "Subtarget.device()->getGeneration() < AMDGPUDeviceInfo::HD7XXX && "
14537 + "Subtarget.device()->getDeviceFlag() != OCL_DEVICE_CAYMAN">;
14539 +def isCayman : Predicate<"Subtarget.device()"
14540 + "->getDeviceFlag() == OCL_DEVICE_CAYMAN">;
14541 +def isEGorCayman : Predicate<"Subtarget.device()"
14542 + "->getGeneration() == AMDGPUDeviceInfo::HD5XXX"
14543 + "|| Subtarget.device()->getGeneration() =="
14544 + "AMDGPUDeviceInfo::HD6XXX">;
14546 +def isR600toCayman : Predicate<
14547 + "Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX">;
14549 +//===----------------------------------------------------------------------===//
14551 +//===----------------------------------------------------------------------===//
14553 +def INTERP: SDNode<"AMDGPUISD::INTERP",
14554 + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisInt<1>, SDTCisInt<2>]>
14557 +def INTERP_P0: SDNode<"AMDGPUISD::INTERP_P0",
14558 + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisInt<1>]>
14561 +def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
14562 + SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
14566 +//===----------------------------------------------------------------------===//
14567 +// Interpolation Instructions
14568 +//===----------------------------------------------------------------------===//
14570 +let usesCustomInserter = 1 in {
14571 +def input_perspective : AMDGPUShaderInst <
14572 + (outs R600_Reg128:$dst),
14573 + (ins i32imm:$src0, i32imm:$src1),
14574 + "input_perspective $src0 $src1 : dst",
14575 + [(set R600_Reg128:$dst, (INTERP (i32 imm:$src0), (i32 imm:$src1)))]>;
14576 +} // End usesCustomInserter = 1
14578 +def input_constant : AMDGPUShaderInst <
14579 + (outs R600_Reg128:$dst),
14580 + (ins i32imm:$src),
14581 + "input_perspective $src : dst",
14582 + [(set R600_Reg128:$dst, (INTERP_P0 (i32 imm:$src)))]>;
14586 +def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
14587 + let bank_swizzle = 5;
14590 +def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> {
14591 + let bank_swizzle = 5;
14594 +def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>;
14596 +//===----------------------------------------------------------------------===//
14597 +// Export Instructions
14598 +//===----------------------------------------------------------------------===//
14600 +def ExportType : SDTypeProfile<0, 5, [SDTCisFP<0>, SDTCisInt<1>]>;
14602 +def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType,
14603 + [SDNPHasChain, SDNPSideEffect]>;
14605 +class ExportWord0 {
14606 + field bits<32> Word0;
14608 + bits<13> arraybase;
14611 + bits<2> elem_size;
14613 + let Word0{12-0} = arraybase;
14614 + let Word0{14-13} = type;
14615 + let Word0{21-15} = gpr;
14616 + let Word0{22} = 0; // RW_REL
14617 + let Word0{29-23} = 0; // INDEX_GPR
14618 + let Word0{31-30} = elem_size;
14621 +class ExportSwzWord1 {
14622 + field bits<32> Word1;
14631 + let Word1{2-0} = sw_x;
14632 + let Word1{5-3} = sw_y;
14633 + let Word1{8-6} = sw_z;
14634 + let Word1{11-9} = sw_w;
14637 +class ExportBufWord1 {
14638 + field bits<32> Word1;
14640 + bits<12> arraySize;
14641 + bits<4> compMask;
14645 + let Word1{11-0} = arraySize;
14646 + let Word1{15-12} = compMask;
14649 +multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
14650 + def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg),
14652 + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sel_x),
14653 + 0, 61, 0, 7, 7, 7, cf_inst, 0)
14656 + def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg),
14658 + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sel_x),
14659 + 0, 61, 7, 0, 7, 7, cf_inst, 0)
14662 + def : Pat<(int_R600_store_pixel_dummy),
14664 + (v4f32 (IMPLICIT_DEF)), 0, 0, 7, 7, 7, 7, cf_inst, 0)
14667 + def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 0),
14668 + (i32 imm:$type), (i32 imm:$arraybase), (i32 imm)),
14669 + (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase,
14670 + 0, 1, 2, 3, cf_inst, 0)
14672 + def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 1),
14673 + (i32 imm:$type), (i32 imm:$arraybase), (i32 imm)),
14674 + (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase,
14675 + 0, 1, 2, 3, cf_inst, 0)
14678 + def : Pat<(int_R600_store_swizzle (v4f32 R600_Reg128:$src), imm:$arraybase,
14680 + (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase,
14681 + 0, 1, 2, 3, cf_inst, 0)
14685 +multiclass SteamOutputExportPattern<Instruction ExportInst,
14686 + bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> {
14688 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
14689 + (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)),
14690 + (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
14691 + 4095, imm:$mask, buf0inst, 0)>;
14693 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
14694 + (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)),
14695 + (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
14696 + 4095, imm:$mask, buf1inst, 0)>;
14698 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
14699 + (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)),
14700 + (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
14701 + 4095, imm:$mask, buf2inst, 0)>;
14703 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
14704 + (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)),
14705 + (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
14706 + 4095, imm:$mask, buf3inst, 0)>;
14709 +let isTerminator = 1, usesCustomInserter = 1 in {
14711 +class ExportSwzInst : InstR600ISA<(
14713 + (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
14714 + i32imm:$sw_x, i32imm:$sw_y, i32imm:$sw_z, i32imm:$sw_w, i32imm:$inst,
14716 + !strconcat("EXPORT", " $gpr"),
14717 + []>, ExportWord0, ExportSwzWord1 {
14718 + let elem_size = 3;
14719 + let Inst{31-0} = Word0;
14720 + let Inst{63-32} = Word1;
14723 +} // End isTerminator = 1, usesCustomInserter = 1
14725 +class ExportBufInst : InstR600ISA<(
14727 + (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
14728 + i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop),
14729 + !strconcat("EXPORT", " $gpr"),
14730 + []>, ExportWord0, ExportBufWord1 {
14731 + let elem_size = 0;
14732 + let Inst{31-0} = Word0;
14733 + let Inst{63-32} = Word1;
14736 +let Predicates = [isR600toCayman] in {
14738 +//===----------------------------------------------------------------------===//
14739 +// Common Instructions R600, R700, Evergreen, Cayman
14740 +//===----------------------------------------------------------------------===//
14742 +def ADD : R600_2OP_Helper <0x0, "ADD", fadd>;
14743 +// Non-IEEE MUL: 0 * anything = 0
14744 +def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>;
14745 +def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>;
14746 +def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax>;
14747 +def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>;
14749 +// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td,
14750 +// so some of the instruction names don't match the asm string.
14751 +// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics.
14752 +def SETE : R600_2OP <
14754 + [(set R600_Reg32:$dst,
14755 + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
14759 +def SGT : R600_2OP <
14761 + [(set R600_Reg32:$dst,
14762 + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
14766 +def SGE : R600_2OP <
14768 + [(set R600_Reg32:$dst,
14769 + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
14773 +def SNE : R600_2OP <
14775 + [(set R600_Reg32:$dst,
14776 + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
14780 +def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
14781 +def TRUNC : R600_1OP_Helper <0x11, "TRUNC", int_AMDGPU_trunc>;
14782 +def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>;
14783 +def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>;
14784 +def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>;
14786 +def MOV : R600_1OP <0x19, "MOV", []>;
14788 +let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
14790 +class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst <
14791 + (outs R600_Reg32:$dst),
14792 + (ins immType:$imm),
14797 +} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1
14799 +def MOV_IMM_I32 : MOV_IMM<i32, i32imm>;
14802 + (MOV_IMM_I32 imm:$val)
14805 +def MOV_IMM_F32 : MOV_IMM<f32, f32imm>;
14808 + (MOV_IMM_F32 fpimm:$val)
14811 +def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>;
14812 +def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>;
14813 +def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>;
14814 +def PRED_SETNE : R600_2OP <0x23, "PRED_SETNE", []>;
14816 +let hasSideEffects = 1 in {
14818 +def KILLGT : R600_2OP <0x2D, "KILLGT", []>;
14820 +} // end hasSideEffects
14822 +def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>;
14823 +def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>;
14824 +def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>;
14825 +def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>;
14826 +def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>;
14827 +def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>;
14828 +def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", AMDGPUsmax>;
14829 +def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", AMDGPUsmin>;
14830 +def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", AMDGPUumax>;
14831 +def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", AMDGPUumin>;
14833 +def SETE_INT : R600_2OP <
14834 + 0x3A, "SETE_INT",
14835 + [(set (i32 R600_Reg32:$dst),
14836 + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETEQ))]
14839 +def SETGT_INT : R600_2OP <
14841 + [(set (i32 R600_Reg32:$dst),
14842 + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGT))]
14845 +def SETGE_INT : R600_2OP <
14846 + 0x3C, "SETGE_INT",
14847 + [(set (i32 R600_Reg32:$dst),
14848 + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGE))]
14851 +def SETNE_INT : R600_2OP <
14852 + 0x3D, "SETNE_INT",
14853 + [(set (i32 R600_Reg32:$dst),
14854 + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETNE))]
14857 +def SETGT_UINT : R600_2OP <
14858 + 0x3E, "SETGT_UINT",
14859 + [(set (i32 R600_Reg32:$dst),
14860 + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGT))]
14863 +def SETGE_UINT : R600_2OP <
14864 + 0x3F, "SETGE_UINT",
14865 + [(set (i32 R600_Reg32:$dst),
14866 + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGE))]
14869 +def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>;
14870 +def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGE_INT", []>;
14871 +def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>;
14872 +def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>;
14874 +def CNDE_INT : R600_3OP <
14875 + 0x1C, "CNDE_INT",
14876 + [(set (i32 R600_Reg32:$dst),
14877 + (selectcc (i32 R600_Reg32:$src0), 0,
14878 + (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2),
14882 +def CNDGE_INT : R600_3OP <
14883 + 0x1E, "CNDGE_INT",
14884 + [(set (i32 R600_Reg32:$dst),
14885 + (selectcc (i32 R600_Reg32:$src0), 0,
14886 + (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2),
14890 +def CNDGT_INT : R600_3OP <
14891 + 0x1D, "CNDGT_INT",
14892 + [(set (i32 R600_Reg32:$dst),
14893 + (selectcc (i32 R600_Reg32:$src0), 0,
14894 + (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2),
14898 +//===----------------------------------------------------------------------===//
14899 +// Texture instructions
14900 +//===----------------------------------------------------------------------===//
14902 +def TEX_LD : R600_TEX <
14904 + [(set R600_Reg128:$dst, (int_AMDGPU_txf R600_Reg128:$src0, imm:$src1, imm:$src2, imm:$src3, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
14906 +let AsmString = "TEX_LD $dst, $src0, $src1, $src2, $src3, $resourceId, $samplerId, $textureTarget";
14907 +let InOperandList = (ins R600_Reg128:$src0, i32imm:$src1, i32imm:$src2, i32imm:$src3, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget);
14910 +def TEX_GET_TEXTURE_RESINFO : R600_TEX <
14911 + 0x04, "TEX_GET_TEXTURE_RESINFO",
14912 + [(set R600_Reg128:$dst, (int_AMDGPU_txq R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
14915 +def TEX_GET_GRADIENTS_H : R600_TEX <
14916 + 0x07, "TEX_GET_GRADIENTS_H",
14917 + [(set R600_Reg128:$dst, (int_AMDGPU_ddx R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
14920 +def TEX_GET_GRADIENTS_V : R600_TEX <
14921 + 0x08, "TEX_GET_GRADIENTS_V",
14922 + [(set R600_Reg128:$dst, (int_AMDGPU_ddy R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
14925 +def TEX_SET_GRADIENTS_H : R600_TEX <
14926 + 0x0B, "TEX_SET_GRADIENTS_H",
14930 +def TEX_SET_GRADIENTS_V : R600_TEX <
14931 + 0x0C, "TEX_SET_GRADIENTS_V",
14935 +def TEX_SAMPLE : R600_TEX <
14936 + 0x10, "TEX_SAMPLE",
14937 + [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
14940 +def TEX_SAMPLE_C : R600_TEX <
14941 + 0x18, "TEX_SAMPLE_C",
14942 + [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
14945 +def TEX_SAMPLE_L : R600_TEX <
14946 + 0x11, "TEX_SAMPLE_L",
14947 + [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
14950 +def TEX_SAMPLE_C_L : R600_TEX <
14951 + 0x19, "TEX_SAMPLE_C_L",
14952 + [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
14955 +def TEX_SAMPLE_LB : R600_TEX <
14956 + 0x12, "TEX_SAMPLE_LB",
14957 + [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0,imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
14960 +def TEX_SAMPLE_C_LB : R600_TEX <
14961 + 0x1A, "TEX_SAMPLE_C_LB",
14962 + [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
14965 +def TEX_SAMPLE_G : R600_TEX <
14966 + 0x14, "TEX_SAMPLE_G",
14970 +def TEX_SAMPLE_C_G : R600_TEX <
14971 + 0x1C, "TEX_SAMPLE_C_G",
14975 +//===----------------------------------------------------------------------===//
14976 +// Helper classes for common instructions
14977 +//===----------------------------------------------------------------------===//
14979 +class MUL_LIT_Common <bits<5> inst> : R600_3OP <
14984 +class MULADD_Common <bits<5> inst> : R600_3OP <
14986 + [(set (f32 R600_Reg32:$dst),
14987 + (IL_mad R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2))]
14990 +class CNDE_Common <bits<5> inst> : R600_3OP <
14992 + [(set R600_Reg32:$dst,
14993 + (selectcc (f32 R600_Reg32:$src0), FP_ZERO,
14994 + (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2),
14998 +class CNDGT_Common <bits<5> inst> : R600_3OP <
15000 + [(set R600_Reg32:$dst,
15001 + (selectcc (f32 R600_Reg32:$src0), FP_ZERO,
15002 + (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2),
15006 +class CNDGE_Common <bits<5> inst> : R600_3OP <
15008 + [(set R600_Reg32:$dst,
15009 + (selectcc (f32 R600_Reg32:$src0), FP_ZERO,
15010 + (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2),
15014 +multiclass DOT4_Common <bits<11> inst> {
15016 + def _pseudo : R600_REDUCTION <inst,
15017 + (ins R600_Reg128:$src0, R600_Reg128:$src1),
15018 + "DOT4 $dst $src0, $src1",
15019 + [(set R600_Reg32:$dst, (int_AMDGPU_dp4 R600_Reg128:$src0, R600_Reg128:$src1))]
15022 + def _real : R600_2OP <inst, "DOT4", []>;
15025 +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
15026 +multiclass CUBE_Common <bits<11> inst> {
15028 + def _pseudo : InstR600 <
15030 + (outs R600_Reg128:$dst),
15031 + (ins R600_Reg128:$src),
15032 + "CUBE $dst $src",
15033 + [(set R600_Reg128:$dst, (int_AMDGPU_cube R600_Reg128:$src))],
15036 + let isPseudo = 1;
15039 + def _real : R600_2OP <inst, "CUBE", []>;
15041 +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
15043 +class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
15044 + inst, "EXP_IEEE", fexp2
15047 +class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper <
15048 + inst, "FLT_TO_INT", fp_to_sint
15051 +class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
15052 + inst, "INT_TO_FLT", sint_to_fp
15055 +class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper <
15056 + inst, "FLT_TO_UINT", fp_to_uint
15059 +class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
15060 + inst, "UINT_TO_FLT", uint_to_fp
15063 +class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP <
15064 + inst, "LOG_CLAMPED", []
15067 +class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
15068 + inst, "LOG_IEEE", flog2
15071 +class LSHL_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHL", shl>;
15072 +class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>;
15073 +class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>;
15074 +class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper <
15075 + inst, "MULHI_INT", mulhs
15077 +class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper <
15078 + inst, "MULHI", mulhu
15080 +class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper <
15081 + inst, "MULLO_INT", mul
15083 +class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []>;
15085 +class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP <
15086 + inst, "RECIP_CLAMPED", []
15089 +class RECIP_IEEE_Common <bits<11> inst> : R600_1OP <
15090 + inst, "RECIP_IEEE", [(set R600_Reg32:$dst, (fdiv FP_ONE, R600_Reg32:$src0))]
15093 +class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
15094 + inst, "RECIP_UINT", AMDGPUurecip
15097 +class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
15098 + inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq
15101 +class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP <
15102 + inst, "RECIPSQRT_IEEE", []
15105 +class SIN_Common <bits<11> inst> : R600_1OP <
15106 + inst, "SIN", []>{
15110 +class COS_Common <bits<11> inst> : R600_1OP <
15111 + inst, "COS", []> {
15115 +//===----------------------------------------------------------------------===//
15116 +// Helper patterns for complex intrinsics
15117 +//===----------------------------------------------------------------------===//
15119 +multiclass DIV_Common <InstR600 recip_ieee> {
15121 + (int_AMDGPU_div R600_Reg32:$src0, R600_Reg32:$src1),
15122 + (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
15126 + (fdiv R600_Reg32:$src0, R600_Reg32:$src1),
15127 + (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
15131 +class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> : Pat <
15132 + (int_TGSI_lit_z R600_Reg32:$src_x, R600_Reg32:$src_y, R600_Reg32:$src_w),
15133 + (exp_ieee (mul_lit (log_clamped (MAX R600_Reg32:$src_y, (f32 ZERO))), R600_Reg32:$src_w, R600_Reg32:$src_x))
15136 +//===----------------------------------------------------------------------===//
15137 +// R600 / R700 Instructions
15138 +//===----------------------------------------------------------------------===//
15140 +let Predicates = [isR600] in {
15142 + def MUL_LIT_r600 : MUL_LIT_Common<0x0C>;
15143 + def MULADD_r600 : MULADD_Common<0x10>;
15144 + def CNDE_r600 : CNDE_Common<0x18>;
15145 + def CNDGT_r600 : CNDGT_Common<0x19>;
15146 + def CNDGE_r600 : CNDGE_Common<0x1A>;
15147 + defm DOT4_r600 : DOT4_Common<0x50>;
15148 + defm CUBE_r600 : CUBE_Common<0x52>;
15149 + def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>;
15150 + def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>;
15151 + def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>;
15152 + def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>;
15153 + def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>;
15154 + def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>;
15155 + def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>;
15156 + def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>;
15157 + def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>;
15158 + def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>;
15159 + def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>;
15160 + def SIN_r600 : SIN_Common<0x6E>;
15161 + def COS_r600 : COS_Common<0x6F>;
15162 + def ASHR_r600 : ASHR_Common<0x70>;
15163 + def LSHR_r600 : LSHR_Common<0x71>;
15164 + def LSHL_r600 : LSHL_Common<0x72>;
15165 + def MULLO_INT_r600 : MULLO_INT_Common<0x73>;
15166 + def MULHI_INT_r600 : MULHI_INT_Common<0x74>;
15167 + def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>;
15168 + def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>;
15169 + def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>;
15171 + defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
15172 + def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
15174 + def : Pat<(fsqrt R600_Reg32:$src),
15175 + (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_r600 R600_Reg32:$src))>;
15177 + def R600_ExportSwz : ExportSwzInst {
15178 + let Word1{20-17} = 1; // BURST_COUNT
15179 + let Word1{21} = eop;
15180 + let Word1{22} = 1; // VALID_PIXEL_MODE
15181 + let Word1{30-23} = inst;
15182 + let Word1{31} = 1; // BARRIER
15184 + defm : ExportPattern<R600_ExportSwz, 39>;
15186 + def R600_ExportBuf : ExportBufInst {
15187 + let Word1{20-17} = 1; // BURST_COUNT
15188 + let Word1{21} = eop;
15189 + let Word1{22} = 1; // VALID_PIXEL_MODE
15190 + let Word1{30-23} = inst;
15191 + let Word1{31} = 1; // BARRIER
15193 + defm : SteamOutputExportPattern<R600_ExportBuf, 0x20, 0x21, 0x22, 0x23>;
15196 +// Helper pattern for normalizing inputs to trigonometric instructions for R700+
15198 +class COS_PAT <InstR600 trig> : Pat<
15199 + (fcos R600_Reg32:$src),
15200 + (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
15203 +class SIN_PAT <InstR600 trig> : Pat<
15204 + (fsin R600_Reg32:$src),
15205 + (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
15208 +//===----------------------------------------------------------------------===//
15209 +// R700 Only instructions
15210 +//===----------------------------------------------------------------------===//
15212 +let Predicates = [isR700] in {
15213 + def SIN_r700 : SIN_Common<0x6E>;
15214 + def COS_r700 : COS_Common<0x6F>;
15216 + // R700 normalizes inputs to SIN/COS the same as EG
15217 + def : SIN_PAT <SIN_r700>;
15218 + def : COS_PAT <COS_r700>;
15221 +//===----------------------------------------------------------------------===//
15222 +// Evergreen Only instructions
15223 +//===----------------------------------------------------------------------===//
15225 +let Predicates = [isEG] in {
15227 +def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>;
15228 +defm DIV_eg : DIV_Common<RECIP_IEEE_eg>;
15230 +def MULLO_INT_eg : MULLO_INT_Common<0x8F>;
15231 +def MULHI_INT_eg : MULHI_INT_Common<0x90>;
15232 +def MULLO_UINT_eg : MULLO_UINT_Common<0x91>;
15233 +def MULHI_UINT_eg : MULHI_UINT_Common<0x92>;
15234 +def RECIP_UINT_eg : RECIP_UINT_Common<0x94>;
15235 +def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>;
15236 +def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
15237 +def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
15238 +def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
15239 +def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
15240 +def SIN_eg : SIN_Common<0x8D>;
15241 +def COS_eg : COS_Common<0x8E>;
15243 +def : SIN_PAT <SIN_eg>;
15244 +def : COS_PAT <COS_eg>;
15245 +def : Pat<(fsqrt R600_Reg32:$src),
15246 + (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_eg R600_Reg32:$src))>;
15247 +} // End Predicates = [isEG]
15249 +//===----------------------------------------------------------------------===//
15250 +// Evergreen / Cayman Instructions
15251 +//===----------------------------------------------------------------------===//
15253 +let Predicates = [isEGorCayman] in {
15255 + // BFE_UINT - bit_extract, an optimization for mask and shift
15260 + // bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width)
15262 + // Example Usage:
15263 + // (Offset, Width)
15265 + // (0, 8) = (Input << 24) >> 24 = (Input & 0xff) >> 0
15266 + // (8, 8) = (Input << 16) >> 24 = (Input & 0xffff) >> 8
15267 + // (16,8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16
15268 + // (24,8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24
15269 + def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT",
15270 + [(set R600_Reg32:$dst, (int_AMDIL_bit_extract_u32 R600_Reg32:$src0,
15271 + R600_Reg32:$src1,
15272 + R600_Reg32:$src2))],
15276 + def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT",
15277 + [(set R600_Reg32:$dst, (AMDGPUbitalign R600_Reg32:$src0, R600_Reg32:$src1,
15278 + R600_Reg32:$src2))],
15282 + def MULADD_eg : MULADD_Common<0x14>;
15283 + def ASHR_eg : ASHR_Common<0x15>;
15284 + def LSHR_eg : LSHR_Common<0x16>;
15285 + def LSHL_eg : LSHL_Common<0x17>;
15286 + def CNDE_eg : CNDE_Common<0x19>;
15287 + def CNDGT_eg : CNDGT_Common<0x1A>;
15288 + def CNDGE_eg : CNDGE_Common<0x1B>;
15289 + def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
15290 + def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
15291 + defm DOT4_eg : DOT4_Common<0xBE>;
15292 + defm CUBE_eg : CUBE_Common<0xC0>;
15294 + def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>;
15296 + def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
15297 + let Pattern = [];
15300 + def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>;
15302 + def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> {
15303 + let Pattern = [];
15306 + def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
15308 + // TRUNC is used for the FLT_TO_INT instructions to work around a
15309 + // perceived problem where the rounding modes are applied differently
15310 + // depending on the instruction and the slot they are in.
15312 + // https://bugs.freedesktop.org/show_bug.cgi?id=50232
15313 + // Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c
15315 + // XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes,
15316 + // which do not need to be truncated since the fp values are 0.0f or 1.0f.
15317 + // We should look into handling these cases separately.
15318 + def : Pat<(fp_to_sint R600_Reg32:$src0),
15319 + (FLT_TO_INT_eg (TRUNC R600_Reg32:$src0))>;
15321 + def : Pat<(fp_to_uint R600_Reg32:$src0),
15322 + (FLT_TO_UINT_eg (TRUNC R600_Reg32:$src0))>;
15324 + def EG_ExportSwz : ExportSwzInst {
15325 + let Word1{19-16} = 1; // BURST_COUNT
15326 + let Word1{20} = 1; // VALID_PIXEL_MODE
15327 + let Word1{21} = eop;
15328 + let Word1{29-22} = inst;
15329 + let Word1{30} = 0; // MARK
15330 + let Word1{31} = 1; // BARRIER
15332 + defm : ExportPattern<EG_ExportSwz, 83>;
15334 + def EG_ExportBuf : ExportBufInst {
15335 + let Word1{19-16} = 1; // BURST_COUNT
15336 + let Word1{20} = 1; // VALID_PIXEL_MODE
15337 + let Word1{21} = eop;
15338 + let Word1{29-22} = inst;
15339 + let Word1{30} = 0; // MARK
15340 + let Word1{31} = 1; // BARRIER
15342 + defm : SteamOutputExportPattern<EG_ExportBuf, 0x40, 0x41, 0x42, 0x43>;
15344 +//===----------------------------------------------------------------------===//
15345 +// Memory read/write instructions
15346 +//===----------------------------------------------------------------------===//
15347 +let usesCustomInserter = 1 in {
15349 +class RAT_WRITE_CACHELESS_eg <dag ins, bits<4> comp_mask, string name,
15350 + list<dag> pattern>
15351 + : EG_CF_RAT <0x57, 0x2, 0, (outs), ins,
15352 + !strconcat(name, " $rw_gpr, $index_gpr, $eop"), pattern> {
15354 + // XXX: Have a separate instruction for non-indexed writes.
15357 + let ELEM_SIZE = 0;
15359 + let ARRAY_SIZE = 0;
15360 + let COMP_MASK = comp_mask;
15361 + let BURST_COUNT = 0;
15367 +} // End usesCustomInserter = 1
15370 +def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg <
15371 + (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
15372 + 0x1, "RAT_WRITE_CACHELESS_32_eg",
15373 + [(global_store (i32 R600_TReg32_X:$rw_gpr), R600_TReg32_X:$index_gpr)]
15377 +def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
15378 + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
15379 + 0xf, "RAT_WRITE_CACHELESS_128",
15380 + [(global_store (v4i32 R600_Reg128:$rw_gpr), R600_TReg32_X:$index_gpr)]
15383 +class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
15384 + : InstR600ISA <outs, (ins MEMxi:$ptr), name#" $dst, $ptr", pattern>,
15385 + VTX_WORD1_GPR, VTX_WORD0 {
15389 + let FETCH_TYPE = 2;
15390 + let FETCH_WHOLE_QUAD = 0;
15391 + let BUFFER_ID = buffer_id;
15393 + // XXX: We can infer this field based on the SRC_GPR. This would allow us
15394 + // to store vertex addresses in any channel, not just X.
15395 + let SRC_SEL_X = 0;
15397 + // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL,
15398 + // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored,
15399 + // however, based on my testing if USE_CONST_FIELDS is set, then all
15400 + // these fields need to be set to 0.
15401 + let USE_CONST_FIELDS = 0;
15402 + let NUM_FORMAT_ALL = 1;
15403 + let FORMAT_COMP_ALL = 0;
15404 + let SRF_MODE_ALL = 0;
15406 + let Inst{31-0} = Word0;
15407 + let Inst{63-32} = Word1;
15408 + // LLVM can only encode 64-bit instructions, so these fields are manually
15409 + // encoded in R600CodeEmitter
15411 + // bits<16> OFFSET;
15412 + // bits<2> ENDIAN_SWAP = 0;
15413 + // bits<1> CONST_BUF_NO_STRIDE = 0;
15414 + // bits<1> MEGA_FETCH = 0;
15415 + // bits<1> ALT_CONST = 0;
15416 + // bits<2> BUFFER_INDEX_MODE = 0;
15420 + // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
15421 + // is done in R600CodeEmitter
15423 + // Inst{79-64} = OFFSET;
15424 + // Inst{81-80} = ENDIAN_SWAP;
15425 + // Inst{82} = CONST_BUF_NO_STRIDE;
15426 + // Inst{83} = MEGA_FETCH;
15427 + // Inst{84} = ALT_CONST;
15428 + // Inst{86-85} = BUFFER_INDEX_MODE;
15429 + // Inst{95-86} = 0; Reserved
15431 + // VTX_WORD3 (Padding)
15433 + // Inst{127-96} = 0;
15436 +class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
15437 + : VTX_READ_eg <"VTX_READ_8", buffer_id, (outs R600_TReg32_X:$dst),
15440 + let MEGA_FETCH_COUNT = 1;
15441 + let DST_SEL_X = 0;
15442 + let DST_SEL_Y = 7; // Masked
15443 + let DST_SEL_Z = 7; // Masked
15444 + let DST_SEL_W = 7; // Masked
15445 + let DATA_FORMAT = 1; // FMT_8
15448 +class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern>
15449 + : VTX_READ_eg <"VTX_READ_16", buffer_id, (outs R600_TReg32_X:$dst),
15451 + let MEGA_FETCH_COUNT = 2;
15452 + let DST_SEL_X = 0;
15453 + let DST_SEL_Y = 7; // Masked
15454 + let DST_SEL_Z = 7; // Masked
15455 + let DST_SEL_W = 7; // Masked
15456 + let DATA_FORMAT = 5; // FMT_16
15460 +class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
15461 + : VTX_READ_eg <"VTX_READ_32", buffer_id, (outs R600_TReg32_X:$dst),
15464 + let MEGA_FETCH_COUNT = 4;
15465 + let DST_SEL_X = 0;
15466 + let DST_SEL_Y = 7; // Masked
15467 + let DST_SEL_Z = 7; // Masked
15468 + let DST_SEL_W = 7; // Masked
15469 + let DATA_FORMAT = 0xD; // COLOR_32
15471 + // This is not really necessary, but there were some GPU hangs that appeared
15472 + // to be caused by ALU instructions in the next instruction group that wrote
15473 + // to the $ptr registers of the VTX_READ.
15475 + // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
15476 + // %T2_X<def> = MOV %ZERO
15477 + //Adding this constraint prevents this from happening.
15478 + let Constraints = "$ptr.ptr = $dst";
15481 +class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
15482 + : VTX_READ_eg <"VTX_READ_128", buffer_id, (outs R600_Reg128:$dst),
15485 + let MEGA_FETCH_COUNT = 16;
15486 + let DST_SEL_X = 0;
15487 + let DST_SEL_Y = 1;
15488 + let DST_SEL_Z = 2;
15489 + let DST_SEL_W = 3;
15490 + let DATA_FORMAT = 0x22; // COLOR_32_32_32_32
15492 + // XXX: Need to force VTX_READ_128 instructions to write to the same register
15493 + // that holds its buffer address to avoid potential hangs. We can't use
15494 + // the same constraint as VTX_READ_32_eg, because the $ptr.ptr and $dst
15495 + // registers are different sizes.
15498 +//===----------------------------------------------------------------------===//
15499 +// VTX Read from parameter memory space
15500 +//===----------------------------------------------------------------------===//
15502 +def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0,
15503 + [(set (i32 R600_TReg32_X:$dst), (load_param_zexti8 ADDRVTX_READ:$ptr))]
15506 +def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0,
15507 + [(set (i32 R600_TReg32_X:$dst), (load_param_zexti16 ADDRVTX_READ:$ptr))]
15510 +def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
15511 + [(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))]
15514 +//===----------------------------------------------------------------------===//
15515 +// VTX Read from global memory space
15516 +//===----------------------------------------------------------------------===//
15519 +def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1,
15520 + [(set (i32 R600_TReg32_X:$dst), (zextloadi8_global ADDRVTX_READ:$ptr))]
15524 +def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
15525 + [(set (i32 R600_TReg32_X:$dst), (global_load ADDRVTX_READ:$ptr))]
15529 +def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
15530 + [(set (v4i32 R600_Reg128:$dst), (global_load ADDRVTX_READ:$ptr))]
15533 +//===----------------------------------------------------------------------===//
15535 +// XXX: We are currently storing all constants in the global address space.
15536 +//===----------------------------------------------------------------------===//
15538 +def CONSTANT_LOAD_eg : VTX_READ_32_eg <1,
15539 + [(set (i32 R600_TReg32_X:$dst), (constant_load ADDRVTX_READ:$ptr))]
15544 +let Predicates = [isCayman] in {
15546 +let isVector = 1 in {
15548 +def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
15550 +def MULLO_INT_cm : MULLO_INT_Common<0x8F>;
15551 +def MULHI_INT_cm : MULHI_INT_Common<0x90>;
15552 +def MULLO_UINT_cm : MULLO_UINT_Common<0x91>;
15553 +def MULHI_UINT_cm : MULHI_UINT_Common<0x92>;
15554 +def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>;
15555 +def EXP_IEEE_cm : EXP_IEEE_Common<0x81>;
15556 +def LOG_IEEE_ : LOG_IEEE_Common<0x83>;
15557 +def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>;
15558 +def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>;
15559 +def SIN_cm : SIN_Common<0x8D>;
15560 +def COS_cm : COS_Common<0x8E>;
15561 +} // End isVector = 1
15563 +def : SIN_PAT <SIN_cm>;
15564 +def : COS_PAT <COS_cm>;
15566 +defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
15568 +// RECIP_UINT emulation for Cayman
15570 + (AMDGPUurecip R600_Reg32:$src0),
15571 + (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg R600_Reg32:$src0)),
15572 + (MOV_IMM_I32 0x4f800000)))
15576 +def : Pat<(fsqrt R600_Reg32:$src),
15577 + (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm R600_Reg32:$src))>;
15581 +//===----------------------------------------------------------------------===//
15582 +// Branch Instructions
15583 +//===----------------------------------------------------------------------===//
15586 +def IF_PREDICATE_SET : ILFormat<(outs), (ins GPRI32:$src),
15587 + "IF_PREDICATE_SET $src", []>;
15589 +def PREDICATED_BREAK : ILFormat<(outs), (ins GPRI32:$src),
15590 + "PREDICATED_BREAK $src", []>;
15592 +//===----------------------------------------------------------------------===//
15593 +// Pseudo instructions
15594 +//===----------------------------------------------------------------------===//
15596 +let isPseudo = 1 in {
15598 +def PRED_X : InstR600 <
15599 + 0, (outs R600_Predicate_Bit:$dst),
15600 + (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags),
15601 + "", [], NullALU> {
15602 + let FlagOperandIdx = 3;
15605 +let isTerminator = 1, isBranch = 1, isBarrier = 1 in {
15607 +def JUMP : InstR600 <0x10,
15609 + (ins brtarget:$target, R600_Pred:$p),
15610 + "JUMP $target ($p)",
15614 +} // End isTerminator = 1, isBranch = 1, isBarrier = 1
15616 +let usesCustomInserter = 1 in {
15618 +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in {
15620 +def MASK_WRITE : AMDGPUShaderInst <
15622 + (ins R600_Reg32:$src),
15623 + "MASK_WRITE $src",
15627 +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1
15630 +def RESERVE_REG : AMDGPUShaderInst <
15632 + (ins i32imm:$src),
15633 + "RESERVE_REG $src",
15634 + [(int_AMDGPU_reserve_reg imm:$src)]
15636 +def TXD: AMDGPUShaderInst <
15637 + (outs R600_Reg128:$dst),
15638 + (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
15639 + "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
15640 + [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
15643 +def TXD_SHADOW: AMDGPUShaderInst <
15644 + (outs R600_Reg128:$dst),
15645 + (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
15646 + "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
15647 + [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
15650 +} // End isPseudo = 1
15651 +} // End usesCustomInserter = 1
15653 +def CLAMP_R600 : CLAMP <R600_Reg32>;
15654 +def FABS_R600 : FABS<R600_Reg32>;
15655 +def FNEG_R600 : FNEG<R600_Reg32>;
15657 +//===---------------------------------------------------------------------===//
15658 +// Return instruction
15659 +//===---------------------------------------------------------------------===//
15660 +let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in {
15661 + def RETURN : ILFormat<(outs), (ins variable_ops),
15662 + "RETURN", [(IL_retflag)]>;
15666 +//===----------------------------------------------------------------------===//
15667 +// Constant Buffer Addressing Support
15668 +//===----------------------------------------------------------------------===//
15670 +let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
15671 +def CONST_COPY : Instruction {
15672 + let OutOperandList = (outs R600_Reg32:$dst);
15673 + let InOperandList = (ins i32imm:$src);
15674 + let Pattern = [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))];
15675 + let AsmString = "CONST_COPY";
15676 + let neverHasSideEffects = 1;
15677 + let isAsCheapAsAMove = 1;
15678 + let Itinerary = NullALU;
15680 +} // end isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"
15682 +def TEX_VTX_CONSTBUF :
15683 + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr), "VTX_READ_eg $dst, $ptr",
15684 + [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr))]>,
15685 + VTX_WORD1_GPR, VTX_WORD0 {
15688 + let FETCH_TYPE = 2;
15689 + let FETCH_WHOLE_QUAD = 0;
15690 + let BUFFER_ID = 0;
15692 + let SRC_SEL_X = 0;
15694 + let USE_CONST_FIELDS = 0;
15695 + let NUM_FORMAT_ALL = 2;
15696 + let FORMAT_COMP_ALL = 1;
15697 + let SRF_MODE_ALL = 1;
15698 + let MEGA_FETCH_COUNT = 16;
15699 + let DST_SEL_X = 0;
15700 + let DST_SEL_Y = 1;
15701 + let DST_SEL_Z = 2;
15702 + let DST_SEL_W = 3;
15703 + let DATA_FORMAT = 35;
15705 + let Inst{31-0} = Word0;
15706 + let Inst{63-32} = Word1;
15708 +// LLVM can only encode 64-bit instructions, so these fields are manually
15709 +// encoded in R600CodeEmitter
15711 +// bits<16> OFFSET;
15712 +// bits<2> ENDIAN_SWAP = 0;
15713 +// bits<1> CONST_BUF_NO_STRIDE = 0;
15714 +// bits<1> MEGA_FETCH = 0;
15715 +// bits<1> ALT_CONST = 0;
15716 +// bits<2> BUFFER_INDEX_MODE = 0;
15720 +// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
15721 +// is done in R600CodeEmitter
15723 +// Inst{79-64} = OFFSET;
15724 +// Inst{81-80} = ENDIAN_SWAP;
15725 +// Inst{82} = CONST_BUF_NO_STRIDE;
15726 +// Inst{83} = MEGA_FETCH;
15727 +// Inst{84} = ALT_CONST;
15728 +// Inst{86-85} = BUFFER_INDEX_MODE;
15729 +// Inst{95-86} = 0; Reserved
15731 +// VTX_WORD3 (Padding)
15733 +// Inst{127-96} = 0;
15737 +//===--------------------------------------------------------------------===//
15738 +// Instructions support
15739 +//===--------------------------------------------------------------------===//
15740 +//===---------------------------------------------------------------------===//
15741 +// Custom Inserter for Branches and returns, this eventually will be a
15743 +//===---------------------------------------------------------------------===//
15744 +let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
15745 + def BRANCH : ILFormat<(outs), (ins brtarget:$target),
15746 + "; Pseudo unconditional branch instruction",
15747 + [(br bb:$target)]>;
15748 + defm BRANCH_COND : BranchConditional<IL_brcond>;
15751 +//===---------------------------------------------------------------------===//
15752 +// Flow and Program control Instructions
15753 +//===---------------------------------------------------------------------===//
15754 +let isTerminator=1 in {
15755 + def SWITCH : ILFormat< (outs), (ins GPRI32:$src),
15756 + !strconcat("SWITCH", " $src"), []>;
15757 + def CASE : ILFormat< (outs), (ins GPRI32:$src),
15758 + !strconcat("CASE", " $src"), []>;
15759 + def BREAK : ILFormat< (outs), (ins),
15761 + def CONTINUE : ILFormat< (outs), (ins),
15763 + def DEFAULT : ILFormat< (outs), (ins),
15765 + def ELSE : ILFormat< (outs), (ins),
15767 + def ENDSWITCH : ILFormat< (outs), (ins),
15768 + "ENDSWITCH", []>;
15769 + def ENDMAIN : ILFormat< (outs), (ins),
15771 + def END : ILFormat< (outs), (ins),
15773 + def ENDFUNC : ILFormat< (outs), (ins),
15775 + def ENDIF : ILFormat< (outs), (ins),
15777 + def WHILELOOP : ILFormat< (outs), (ins),
15779 + def ENDLOOP : ILFormat< (outs), (ins),
15781 + def FUNC : ILFormat< (outs), (ins),
15783 + def RETDYN : ILFormat< (outs), (ins),
15785 + // This opcode has custom swizzle pattern encoded in Swizzle Encoder
15786 + defm IF_LOGICALNZ : BranchInstr<"IF_LOGICALNZ">;
15787 + // This opcode has custom swizzle pattern encoded in Swizzle Encoder
15788 + defm IF_LOGICALZ : BranchInstr<"IF_LOGICALZ">;
15789 + // This opcode has custom swizzle pattern encoded in Swizzle Encoder
15790 + defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">;
15791 + // This opcode has custom swizzle pattern encoded in Swizzle Encoder
15792 + defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">;
15793 + // This opcode has custom swizzle pattern encoded in Swizzle Encoder
15794 + defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">;
15795 + // This opcode has custom swizzle pattern encoded in Swizzle Encoder
15796 + defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">;
15797 + defm IFC : BranchInstr2<"IFC">;
15798 + defm BREAKC : BranchInstr2<"BREAKC">;
15799 + defm CONTINUEC : BranchInstr2<"CONTINUEC">;
15802 +//===----------------------------------------------------------------------===//
15804 +//===----------------------------------------------------------------------===//
15806 +//CNDGE_INT extra pattern
15808 + (selectcc (i32 R600_Reg32:$src0), -1, (i32 R600_Reg32:$src1),
15809 + (i32 R600_Reg32:$src2), COND_GT),
15810 + (CNDGE_INT R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2)
15815 + (int_AMDGPU_kilp),
15816 + (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
15820 + (int_AMDGPU_kill R600_Reg32:$src0),
15821 + (MASK_WRITE (KILLGT (f32 ZERO), (f32 R600_Reg32:$src0)))
15824 +// SGT Reverse args
15826 + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LT),
15827 + (SGT R600_Reg32:$src1, R600_Reg32:$src0)
15830 +// SGE Reverse args
15832 + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LE),
15833 + (SGE R600_Reg32:$src1, R600_Reg32:$src0)
15836 +// SETGT_INT reverse args
15838 + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLT),
15839 + (SETGT_INT R600_Reg32:$src1, R600_Reg32:$src0)
15842 +// SETGE_INT reverse args
15844 + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLE),
15845 + (SETGE_INT R600_Reg32:$src1, R600_Reg32:$src0)
15848 +// SETGT_UINT reverse args
15850 + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULT),
15851 + (SETGT_UINT R600_Reg32:$src1, R600_Reg32:$src0)
15854 +// SETGE_UINT reverse args
15856 + (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULE),
15857 + (SETGE_UINT R600_Reg32:$src1, R600_Reg32:$src0)
15860 +// The next two patterns are special cases for handling 'true if ordered' and
15861 +// 'true if unordered' conditionals. The assumption here is that the behavior of
15862 +// SETE and SNE conforms to the Direct3D 10 rules for floating point values
15863 +// described here:
15864 +// http://msdn.microsoft.com/en-us/library/windows/desktop/cc308050.aspx#alpha_32_bit
15865 +// We assume that SETE returns false when one of the operands is NAN and
15866 +// SNE returns true when one of the operands is NaN
15868 +//SETE - 'true if ordered'
15870 + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETO),
15871 + (SETE R600_Reg32:$src0, R600_Reg32:$src1)
15874 +//SNE - 'true if unordered'
15876 + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETUO),
15877 + (SNE R600_Reg32:$src0, R600_Reg32:$src1)
15880 +def : Extract_Element <f32, v4f32, R600_Reg128, 0, sel_x>;
15881 +def : Extract_Element <f32, v4f32, R600_Reg128, 1, sel_y>;
15882 +def : Extract_Element <f32, v4f32, R600_Reg128, 2, sel_z>;
15883 +def : Extract_Element <f32, v4f32, R600_Reg128, 3, sel_w>;
15885 +def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 0, sel_x>;
15886 +def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 1, sel_y>;
15887 +def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 2, sel_z>;
15888 +def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 3, sel_w>;
15890 +def : Extract_Element <i32, v4i32, R600_Reg128, 0, sel_x>;
15891 +def : Extract_Element <i32, v4i32, R600_Reg128, 1, sel_y>;
15892 +def : Extract_Element <i32, v4i32, R600_Reg128, 2, sel_z>;
15893 +def : Extract_Element <i32, v4i32, R600_Reg128, 3, sel_w>;
15895 +def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 0, sel_x>;
15896 +def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 1, sel_y>;
15897 +def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 2, sel_z>;
15898 +def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sel_w>;
15900 +def : Vector_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
15901 +def : Vector_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
15903 +// bitconvert patterns
15905 +def : BitConvert <i32, f32, R600_Reg32>;
15906 +def : BitConvert <f32, i32, R600_Reg32>;
15907 +def : BitConvert <v4f32, v4i32, R600_Reg128>;
15908 +def : BitConvert <v4i32, v4f32, R600_Reg128>;
15910 +// DWORDADDR pattern
15911 +def : DwordAddrPat <i32, R600_Reg32>;
15913 +} // End isR600toCayman Predicate
15914 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600Intrinsics.td llvm-r600/lib/Target/R600/R600Intrinsics.td
15915 --- llvm-3.2.src/lib/Target/R600/R600Intrinsics.td 1970-01-01 01:00:00.000000000 +0100
15916 +++ llvm-r600/lib/Target/R600/R600Intrinsics.td 2013-01-25 19:43:57.466716387 +0100
15918 +//===-- R600Intrinsics.td - R600 Intrinsic defs --------*- tablegen -*-----===//
15920 +// The LLVM Compiler Infrastructure
15922 +// This file is distributed under the University of Illinois Open Source
15923 +// License. See LICENSE.TXT for details.
15925 +//===----------------------------------------------------------------------===//
15927 +// R600 Intrinsic Definitions
15929 +//===----------------------------------------------------------------------===//
15931 +let TargetPrefix = "R600", isTarget = 1 in {
15932 + def int_R600_load_input : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
15933 + def int_R600_load_input_perspective :
15934 + Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>;
15935 + def int_R600_load_input_constant :
15936 + Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>;
15937 + def int_R600_load_input_linear :
15938 + Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>;
15939 + def int_R600_store_swizzle :
15940 + Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
15941 + def int_R600_store_stream_output :
15942 + Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
15943 + def int_R600_store_pixel_color :
15944 + Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
15945 + def int_R600_store_pixel_depth :
15946 + Intrinsic<[], [llvm_float_ty], []>;
15947 + def int_R600_store_pixel_stencil :
15948 + Intrinsic<[], [llvm_float_ty], []>;
15949 + def int_R600_store_pixel_dummy :
15950 + Intrinsic<[], [], []>;
15952 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600ISelLowering.cpp llvm-r600/lib/Target/R600/R600ISelLowering.cpp
15953 --- llvm-3.2.src/lib/Target/R600/R600ISelLowering.cpp 1970-01-01 01:00:00.000000000 +0100
15954 +++ llvm-r600/lib/Target/R600/R600ISelLowering.cpp 2013-01-25 19:43:57.463383054 +0100
15956 +//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
15958 +// The LLVM Compiler Infrastructure
15960 +// This file is distributed under the University of Illinois Open Source
15961 +// License. See LICENSE.TXT for details.
15963 +//===----------------------------------------------------------------------===//
15966 +/// \brief Custom DAG lowering for R600
15968 +//===----------------------------------------------------------------------===//
15970 +#include "R600ISelLowering.h"
15971 +#include "R600Defines.h"
15972 +#include "R600InstrInfo.h"
15973 +#include "R600MachineFunctionInfo.h"
15974 +#include "llvm/Argument.h"
15975 +#include "llvm/Function.h"
15976 +#include "llvm/CodeGen/MachineInstrBuilder.h"
15977 +#include "llvm/CodeGen/MachineRegisterInfo.h"
15978 +#include "llvm/CodeGen/SelectionDAG.h"
15980 +using namespace llvm;
15982 +R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
15983 + AMDGPUTargetLowering(TM),
15984 + TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
15985 + setOperationAction(ISD::MUL, MVT::i64, Expand);
15986 + addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
15987 + addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
15988 + addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
15989 + addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
15990 + computeRegisterProperties();
15992 + setOperationAction(ISD::FADD, MVT::v4f32, Expand);
15993 + setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
15994 + setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
15995 + setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
15997 + setOperationAction(ISD::ADD, MVT::v4i32, Expand);
15998 + setOperationAction(ISD::AND, MVT::v4i32, Expand);
15999 + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
16000 + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
16001 + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
16002 + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
16003 + setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
16004 + setOperationAction(ISD::UREM, MVT::v4i32, Expand);
16005 + setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
16007 + setOperationAction(ISD::BR_CC, MVT::i32, Custom);
16008 + setOperationAction(ISD::BR_CC, MVT::f32, Custom);
16010 + setOperationAction(ISD::FSUB, MVT::f32, Expand);
16012 + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
16013 + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
16014 + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
16015 + setOperationAction(ISD::FPOW, MVT::f32, Custom);
16017 + setOperationAction(ISD::ROTL, MVT::i32, Custom);
16019 + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
16020 + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
16022 + setOperationAction(ISD::SETCC, MVT::i32, Custom);
16023 + setOperationAction(ISD::SETCC, MVT::f32, Custom);
16024 + setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
16026 + setOperationAction(ISD::SELECT, MVT::i32, Custom);
16027 + setOperationAction(ISD::SELECT, MVT::f32, Custom);
16029 + setOperationAction(ISD::STORE, MVT::i32, Custom);
16030 + setOperationAction(ISD::STORE, MVT::v4i32, Custom);
16032 + setOperationAction(ISD::LOAD, MVT::i32, Custom);
16033 + setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
16034 + setTargetDAGCombine(ISD::FP_ROUND);
16035 + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
16037 + setSchedulingPreference(Sched::VLIW);
16040 +MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
16041 + MachineInstr * MI, MachineBasicBlock * BB) const {
16042 + MachineFunction * MF = BB->getParent();
16043 + MachineRegisterInfo &MRI = MF->getRegInfo();
16044 + MachineBasicBlock::iterator I = *MI;
16046 + switch (MI->getOpcode()) {
16047 + default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
16048 + case AMDGPU::SHADER_TYPE: break;
16049 + case AMDGPU::CLAMP_R600: {
16050 + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
16052 + MI->getOperand(0).getReg(),
16053 + MI->getOperand(1).getReg());
16054 + TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
16058 + case AMDGPU::FABS_R600: {
16059 + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
16061 + MI->getOperand(0).getReg(),
16062 + MI->getOperand(1).getReg());
16063 + TII->addFlag(NewMI, 0, MO_FLAG_ABS);
16067 + case AMDGPU::FNEG_R600: {
16068 + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
16070 + MI->getOperand(0).getReg(),
16071 + MI->getOperand(1).getReg());
16072 + TII->addFlag(NewMI, 0, MO_FLAG_NEG);
16076 + case AMDGPU::MASK_WRITE: {
16077 + unsigned maskedRegister = MI->getOperand(0).getReg();
16078 + assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
16079 + MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
16080 + TII->addFlag(defInstr, 0, MO_FLAG_MASK);
16084 + case AMDGPU::MOV_IMM_F32:
16085 + TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
16086 + MI->getOperand(1).getFPImm()->getValueAPF()
16087 + .bitcastToAPInt().getZExtValue());
16089 + case AMDGPU::MOV_IMM_I32:
16090 + TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
16091 + MI->getOperand(1).getImm());
16095 + case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
16096 + case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
16097 + unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
16099 + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
16100 + .addOperand(MI->getOperand(0))
16101 + .addOperand(MI->getOperand(1))
16102 + .addImm(EOP); // Set End of program bit
16106 + case AMDGPU::RESERVE_REG: {
16107 + R600MachineFunctionInfo * MFI = MF->getInfo<R600MachineFunctionInfo>();
16108 + int64_t ReservedIndex = MI->getOperand(0).getImm();
16109 + unsigned ReservedReg =
16110 + AMDGPU::R600_TReg32RegClass.getRegister(ReservedIndex);
16111 + MFI->ReservedRegs.push_back(ReservedReg);
16112 + unsigned SuperReg =
16113 + AMDGPU::R600_Reg128RegClass.getRegister(ReservedIndex / 4);
16114 + MFI->ReservedRegs.push_back(SuperReg);
16118 + case AMDGPU::TXD: {
16119 + unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
16120 + unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
16122 + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
16123 + .addOperand(MI->getOperand(3))
16124 + .addOperand(MI->getOperand(4))
16125 + .addOperand(MI->getOperand(5))
16126 + .addOperand(MI->getOperand(6));
16127 + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
16128 + .addOperand(MI->getOperand(2))
16129 + .addOperand(MI->getOperand(4))
16130 + .addOperand(MI->getOperand(5))
16131 + .addOperand(MI->getOperand(6));
16132 + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
16133 + .addOperand(MI->getOperand(0))
16134 + .addOperand(MI->getOperand(1))
16135 + .addOperand(MI->getOperand(4))
16136 + .addOperand(MI->getOperand(5))
16137 + .addOperand(MI->getOperand(6))
16138 + .addReg(T0, RegState::Implicit)
16139 + .addReg(T1, RegState::Implicit);
16143 + case AMDGPU::TXD_SHADOW: {
16144 + unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
16145 + unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
16147 + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
16148 + .addOperand(MI->getOperand(3))
16149 + .addOperand(MI->getOperand(4))
16150 + .addOperand(MI->getOperand(5))
16151 + .addOperand(MI->getOperand(6));
16152 + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
16153 + .addOperand(MI->getOperand(2))
16154 + .addOperand(MI->getOperand(4))
16155 + .addOperand(MI->getOperand(5))
16156 + .addOperand(MI->getOperand(6));
16157 + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
16158 + .addOperand(MI->getOperand(0))
16159 + .addOperand(MI->getOperand(1))
16160 + .addOperand(MI->getOperand(4))
16161 + .addOperand(MI->getOperand(5))
16162 + .addOperand(MI->getOperand(6))
16163 + .addReg(T0, RegState::Implicit)
16164 + .addReg(T1, RegState::Implicit);
16168 + case AMDGPU::BRANCH:
16169 + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
16170 + .addOperand(MI->getOperand(0))
16174 + case AMDGPU::BRANCH_COND_f32: {
16175 + MachineInstr *NewMI =
16176 + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
16177 + AMDGPU::PREDICATE_BIT)
16178 + .addOperand(MI->getOperand(1))
16179 + .addImm(OPCODE_IS_NOT_ZERO)
16180 + .addImm(0); // Flags
16181 + TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
16182 + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
16183 + .addOperand(MI->getOperand(0))
16184 + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
16188 + case AMDGPU::BRANCH_COND_i32: {
16189 + MachineInstr *NewMI =
16190 + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
16191 + AMDGPU::PREDICATE_BIT)
16192 + .addOperand(MI->getOperand(1))
16193 + .addImm(OPCODE_IS_NOT_ZERO_INT)
16194 + .addImm(0); // Flags
16195 + TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
16196 + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
16197 + .addOperand(MI->getOperand(0))
16198 + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
16202 + case AMDGPU::input_perspective: {
16203 + R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
16205 +    // XXX Be more fine-grained about register reservation
16206 + for (unsigned i = 0; i < 4; i ++) {
16207 + unsigned ReservedReg = AMDGPU::R600_TReg32RegClass.getRegister(i);
16208 + MFI->ReservedRegs.push_back(ReservedReg);
16211 + switch (MI->getOperand(1).getImm()) {
16212 + case 0:// Perspective
16213 + MFI->HasPerspectiveInterpolation = true;
16216 + MFI->HasLinearInterpolation = true;
16219 + assert(0 && "Unknow ij index");
16225 + case AMDGPU::EG_ExportSwz:
16226 + case AMDGPU::R600_ExportSwz: {
16227 +    // Instruction is left unmodified if it's not the last one of its type
16228 + bool isLastInstructionOfItsType = true;
16229 + unsigned InstExportType = MI->getOperand(1).getImm();
16230 + for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
16231 + EndBlock = BB->end(); NextExportInst != EndBlock;
16232 + NextExportInst = llvm::next(NextExportInst)) {
16233 + if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
16234 + NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
16235 + unsigned CurrentInstExportType = NextExportInst->getOperand(1)
16237 + if (CurrentInstExportType == InstExportType) {
16238 + isLastInstructionOfItsType = false;
16243 + bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
16244 + if (!EOP && !isLastInstructionOfItsType)
16246 + unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
16247 + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
16248 + .addOperand(MI->getOperand(0))
16249 + .addOperand(MI->getOperand(1))
16250 + .addOperand(MI->getOperand(2))
16251 + .addOperand(MI->getOperand(3))
16252 + .addOperand(MI->getOperand(4))
16253 + .addOperand(MI->getOperand(5))
16254 + .addOperand(MI->getOperand(6))
16261 + MI->eraseFromParent();
16265 +//===----------------------------------------------------------------------===//
16266 +// Custom DAG Lowering Operations
16267 +//===----------------------------------------------------------------------===//
16269 +using namespace llvm::Intrinsic;
16270 +using namespace llvm::AMDGPUIntrinsic;
16273 +InsertScalarToRegisterExport(SelectionDAG &DAG, DebugLoc DL, SDNode **ExportMap,
16274 + unsigned Slot, unsigned Channel, unsigned Inst, unsigned Type,
16275 + SDValue Scalar, SDValue Chain) {
16276 + if (!ExportMap[Slot]) {
16277 + SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
16279 + DAG.getUNDEF(MVT::v4f32),
16281 + DAG.getConstant(Channel, MVT::i32));
16283 + unsigned Mask = 1 << Channel;
16285 + const SDValue Ops[] = {Chain, Vector, DAG.getConstant(Inst, MVT::i32),
16286 + DAG.getConstant(Type, MVT::i32), DAG.getConstant(Slot, MVT::i32),
16287 + DAG.getConstant(Mask, MVT::i32)};
16289 + SDValue Res = DAG.getNode(
16290 + AMDGPUISD::EXPORT,
16294 + ExportMap[Slot] = Res.getNode();
16298 + SDNode *ExportInstruction = (SDNode *) ExportMap[Slot] ;
16299 + SDValue PreviousVector = ExportInstruction->getOperand(1);
16300 + SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
16304 + DAG.getConstant(Channel, MVT::i32));
16306 + unsigned Mask = dyn_cast<ConstantSDNode>(ExportInstruction->getOperand(5))
16307 + ->getZExtValue();
16308 + Mask |= (1 << Channel);
16310 + const SDValue Ops[] = {ExportInstruction->getOperand(0), Vector,
16311 + DAG.getConstant(Inst, MVT::i32),
16312 + DAG.getConstant(Type, MVT::i32),
16313 + DAG.getConstant(Slot, MVT::i32),
16314 + DAG.getConstant(Mask, MVT::i32)};
16316 + DAG.UpdateNodeOperands(ExportInstruction,
16323 +SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
16324 + switch (Op.getOpcode()) {
16325 + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
16326 + case ISD::BR_CC: return LowerBR_CC(Op, DAG);
16327 + case ISD::ROTL: return LowerROTL(Op, DAG);
16328 + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
16329 + case ISD::SELECT: return LowerSELECT(Op, DAG);
16330 + case ISD::SETCC: return LowerSETCC(Op, DAG);
16331 + case ISD::STORE: return LowerSTORE(Op, DAG);
16332 + case ISD::LOAD: return LowerLOAD(Op, DAG);
16333 + case ISD::FPOW: return LowerFPOW(Op, DAG);
16334 + case ISD::INTRINSIC_VOID: {
16335 + SDValue Chain = Op.getOperand(0);
16336 + unsigned IntrinsicID =
16337 + cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
16338 + switch (IntrinsicID) {
16339 + case AMDGPUIntrinsic::AMDGPU_store_output: {
16340 + MachineFunction &MF = DAG.getMachineFunction();
16341 + MachineRegisterInfo &MRI = MF.getRegInfo();
16342 + int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
16343 + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
16344 + if (!MRI.isLiveOut(Reg)) {
16345 + MRI.addLiveOut(Reg);
16347 + return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
16349 + case AMDGPUIntrinsic::R600_store_pixel_color: {
16350 + MachineFunction &MF = DAG.getMachineFunction();
16351 + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
16352 + int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
16354 + SDNode **OutputsMap = MFI->Outputs;
16355 + return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
16356 + RegIndex / 4, RegIndex % 4, 0, 0, Op.getOperand(2),
16361 + // default for switch(IntrinsicID)
16364 + // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
16367 + case ISD::INTRINSIC_WO_CHAIN: {
16368 + unsigned IntrinsicID =
16369 + cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
16370 + EVT VT = Op.getValueType();
16371 + DebugLoc DL = Op.getDebugLoc();
16372 + switch(IntrinsicID) {
16373 + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
16374 + case AMDGPUIntrinsic::R600_load_input: {
16375 + int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
16376 + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
16377 + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
16379 + case AMDGPUIntrinsic::R600_load_input_perspective: {
16380 + int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
16382 + return DAG.getUNDEF(MVT::f32);
16383 + SDValue FullVector = DAG.getNode(
16384 + AMDGPUISD::INTERP,
16386 + DAG.getConstant(0, MVT::i32), DAG.getConstant(slot / 4 , MVT::i32));
16387 + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
16388 + DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32));
16390 + case AMDGPUIntrinsic::R600_load_input_linear: {
16391 + int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
16393 + return DAG.getUNDEF(MVT::f32);
16394 + SDValue FullVector = DAG.getNode(
16395 + AMDGPUISD::INTERP,
16397 + DAG.getConstant(1, MVT::i32), DAG.getConstant(slot / 4 , MVT::i32));
16398 + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
16399 + DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32));
16401 + case AMDGPUIntrinsic::R600_load_input_constant: {
16402 + int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
16404 + return DAG.getUNDEF(MVT::f32);
16405 + SDValue FullVector = DAG.getNode(
16406 + AMDGPUISD::INTERP_P0,
16408 + DAG.getConstant(slot / 4 , MVT::i32));
16409 + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
16410 + DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32));
16413 + case r600_read_ngroups_x:
16414 + return LowerImplicitParameter(DAG, VT, DL, 0);
16415 + case r600_read_ngroups_y:
16416 + return LowerImplicitParameter(DAG, VT, DL, 1);
16417 + case r600_read_ngroups_z:
16418 + return LowerImplicitParameter(DAG, VT, DL, 2);
16419 + case r600_read_global_size_x:
16420 + return LowerImplicitParameter(DAG, VT, DL, 3);
16421 + case r600_read_global_size_y:
16422 + return LowerImplicitParameter(DAG, VT, DL, 4);
16423 + case r600_read_global_size_z:
16424 + return LowerImplicitParameter(DAG, VT, DL, 5);
16425 + case r600_read_local_size_x:
16426 + return LowerImplicitParameter(DAG, VT, DL, 6);
16427 + case r600_read_local_size_y:
16428 + return LowerImplicitParameter(DAG, VT, DL, 7);
16429 + case r600_read_local_size_z:
16430 + return LowerImplicitParameter(DAG, VT, DL, 8);
16432 + case r600_read_tgid_x:
16433 + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
16434 + AMDGPU::T1_X, VT);
16435 + case r600_read_tgid_y:
16436 + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
16437 + AMDGPU::T1_Y, VT);
16438 + case r600_read_tgid_z:
16439 + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
16440 + AMDGPU::T1_Z, VT);
16441 + case r600_read_tidig_x:
16442 + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
16443 + AMDGPU::T0_X, VT);
16444 + case r600_read_tidig_y:
16445 + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
16446 + AMDGPU::T0_Y, VT);
16447 + case r600_read_tidig_z:
16448 + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
16449 + AMDGPU::T0_Z, VT);
16451 + // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
16454 + } // end switch(Op.getOpcode())
16455 + return SDValue();
16458 +void R600TargetLowering::ReplaceNodeResults(SDNode *N,
16459 + SmallVectorImpl<SDValue> &Results,
16460 + SelectionDAG &DAG) const {
16461 + switch (N->getOpcode()) {
16463 + case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
16465 + case ISD::LOAD: {
16466 + SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
16467 + Results.push_back(SDValue(Node, 0));
16468 + Results.push_back(SDValue(Node, 1));
16469 + // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
16471 + DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
16477 +SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
16478 + return DAG.getNode(
16480 + Op.getDebugLoc(),
16482 + Op, DAG.getConstantFP(0.0f, MVT::f32),
16483 + DAG.getCondCode(ISD::SETNE)
16487 +SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
16488 + SDValue Chain = Op.getOperand(0);
16489 + SDValue CC = Op.getOperand(1);
16490 + SDValue LHS = Op.getOperand(2);
16491 + SDValue RHS = Op.getOperand(3);
16492 + SDValue JumpT = Op.getOperand(4);
16493 + SDValue CmpValue;
16496 + if (LHS.getValueType() == MVT::i32) {
16497 + CmpValue = DAG.getNode(
16499 + Op.getDebugLoc(),
16502 + DAG.getConstant(-1, MVT::i32),
16503 + DAG.getConstant(0, MVT::i32),
16505 + } else if (LHS.getValueType() == MVT::f32) {
16506 + CmpValue = DAG.getNode(
16508 + Op.getDebugLoc(),
16511 + DAG.getConstantFP(1.0f, MVT::f32),
16512 + DAG.getConstantFP(0.0f, MVT::f32),
16515 + assert(0 && "Not valid type for br_cc");
16517 + Result = DAG.getNode(
16518 + AMDGPUISD::BRANCH_COND,
16519 + CmpValue.getDebugLoc(),
16520 + MVT::Other, Chain,
16521 + JumpT, CmpValue);
16525 +SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
16527 + unsigned DwordOffset) const {
16528 + unsigned ByteOffset = DwordOffset * 4;
16529 + PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
16530 + AMDGPUAS::PARAM_I_ADDRESS);
16532 + // We shouldn't be using an offset wider than 16-bits for implicit parameters.
16533 + assert(isInt<16>(ByteOffset));
16535 + return DAG.getLoad(VT, DL, DAG.getEntryNode(),
16536 + DAG.getConstant(ByteOffset, MVT::i32), // PTR
16537 + MachinePointerInfo(ConstantPointerNull::get(PtrType)),
16538 + false, false, false, 0);
16541 +SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
16542 + DebugLoc DL = Op.getDebugLoc();
16543 + EVT VT = Op.getValueType();
16545 + return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
16546 + Op.getOperand(0),
16547 + Op.getOperand(0),
16548 + DAG.getNode(ISD::SUB, DL, VT,
16549 + DAG.getConstant(32, MVT::i32),
16550 + Op.getOperand(1)));
16553 +bool R600TargetLowering::isZero(SDValue Op) const {
16554 + if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
16555 + return Cst->isNullValue();
16556 + } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
16557 + return CstFP->isZero();
16563 +SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
16564 + DebugLoc DL = Op.getDebugLoc();
16565 + EVT VT = Op.getValueType();
16567 + SDValue LHS = Op.getOperand(0);
16568 + SDValue RHS = Op.getOperand(1);
16569 + SDValue True = Op.getOperand(2);
16570 + SDValue False = Op.getOperand(3);
16571 + SDValue CC = Op.getOperand(4);
16574 + // LHS and RHS are guaranteed to be the same value type
16575 + EVT CompareVT = LHS.getValueType();
16577 + // Check if we can lower this to a native operation.
16579 + // Try to lower to a CND* instruction:
16580 + // CND* instructions requires RHS to be zero. Some SELECT_CC nodes that
16581 + // can be lowered to CND* instructions can also be lowered to SET*
16582 +    // instructions. CND* instructions are cheaper, because they don't
16583 + // require additional instructions to convert their result to the correct
16584 + // value type, so this check should be first.
16585 + if (isZero(LHS) || isZero(RHS)) {
16586 + SDValue Cond = (isZero(LHS) ? RHS : LHS);
16587 + SDValue Zero = (isZero(LHS) ? LHS : RHS);
16588 + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
16589 + if (CompareVT != VT) {
16590 + // Bitcast True / False to the correct types. This will end up being
16591 + // a nop, but it allows us to define only a single pattern in the
16592 + // .TD files for each CND* instruction rather than having to have
16593 + // one pattern for integer True/False and one for fp True/False
16594 + True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
16595 + False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
16597 + if (isZero(LHS)) {
16598 + CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
16601 + switch (CCOpcode) {
16602 + case ISD::SETONE:
16603 + case ISD::SETUNE:
16605 + case ISD::SETULE:
16606 + case ISD::SETULT:
16607 + case ISD::SETOLE:
16608 + case ISD::SETOLT:
16611 + CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
16619 + SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
16622 + DAG.getCondCode(CCOpcode));
16623 + return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
16626 + // Try to lower to a SET* instruction:
16627 + // We need all the operands of SELECT_CC to have the same value type, so if
16628 + // necessary we need to change True and False to be the same type as LHS and
16629 + // RHS, and then convert the result of the select_cc back to the correct type.
16631 + // Move hardware True/False values to the correct operand.
16632 + if (isHWTrueValue(False) && isHWFalseValue(True)) {
16633 + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
16634 + std::swap(False, True);
16635 + CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
16638 + if (isHWTrueValue(True) && isHWFalseValue(False)) {
16639 + if (CompareVT != VT) {
16640 + if (VT == MVT::f32 && CompareVT == MVT::i32) {
16641 + SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
16643 + DAG.getConstant(-1, MVT::i32),
16644 + DAG.getConstant(0, MVT::i32),
16646 + // Convert integer values of true (-1) and false (0) to fp values of
16647 + // true (1.0f) and false (0.0f).
16648 + SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean,
16649 + DAG.getConstant(1, MVT::i32));
16650 + return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB);
16651 + } else if (VT == MVT::i32 && CompareVT == MVT::f32) {
16652 + SDValue BoolAsFlt = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
16654 + DAG.getConstantFP(1.0f, MVT::f32),
16655 + DAG.getConstantFP(0.0f, MVT::f32),
16657 + // Convert fp values of true (1.0f) and false (0.0f) to integer values
16658 + // of true (-1) and false (0).
16659 + SDValue Neg = DAG.getNode(ISD::FNEG, DL, MVT::f32, BoolAsFlt);
16660 + return DAG.getNode(ISD::FP_TO_SINT, DL, VT, Neg);
16662 + // I don't think there will be any other type pairings.
16663 + assert(!"Unhandled operand type parings in SELECT_CC");
16666 + // This SELECT_CC is already legal.
16667 + return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
16671 + // Possible Min/Max pattern
16672 + SDValue MinMax = LowerMinMax(Op, DAG);
16673 + if (MinMax.getNode()) {
16677 +  // If we make it this far it means we have no native instructions to handle
16678 + // this SELECT_CC, so we must lower it.
16679 + SDValue HWTrue, HWFalse;
16681 + if (CompareVT == MVT::f32) {
16682 + HWTrue = DAG.getConstantFP(1.0f, CompareVT);
16683 + HWFalse = DAG.getConstantFP(0.0f, CompareVT);
16684 + } else if (CompareVT == MVT::i32) {
16685 + HWTrue = DAG.getConstant(-1, CompareVT);
16686 + HWFalse = DAG.getConstant(0, CompareVT);
16689 + assert(!"Unhandled value type in LowerSELECT_CC");
16692 + // Lower this unsupported SELECT_CC into a combination of two supported
16693 + // SELECT_CC operations.
16694 + SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
16696 + return DAG.getNode(ISD::SELECT_CC, DL, VT,
16699 + DAG.getCondCode(ISD::SETNE));
16702 +SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
16703 + return DAG.getNode(ISD::SELECT_CC,
16704 + Op.getDebugLoc(),
16705 + Op.getValueType(),
16706 + Op.getOperand(0),
16707 + DAG.getConstant(0, MVT::i32),
16708 + Op.getOperand(1),
16709 + Op.getOperand(2),
16710 + DAG.getCondCode(ISD::SETNE));
16713 +SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
16715 + SDValue LHS = Op.getOperand(0);
16716 + SDValue RHS = Op.getOperand(1);
16717 + SDValue CC = Op.getOperand(2);
16718 + DebugLoc DL = Op.getDebugLoc();
16719 + assert(Op.getValueType() == MVT::i32);
16720 + if (LHS.getValueType() == MVT::i32) {
16721 + Cond = DAG.getNode(
16723 + Op.getDebugLoc(),
16726 + DAG.getConstant(-1, MVT::i32),
16727 + DAG.getConstant(0, MVT::i32),
16729 + } else if (LHS.getValueType() == MVT::f32) {
16730 + Cond = DAG.getNode(
16732 + Op.getDebugLoc(),
16735 + DAG.getConstantFP(1.0f, MVT::f32),
16736 + DAG.getConstantFP(0.0f, MVT::f32),
16738 + Cond = DAG.getNode(
16744 + assert(0 && "Not valid type for set_cc");
16746 + Cond = DAG.getNode(
16750 + DAG.getConstant(1, MVT::i32),
16755 +SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
16756 + DebugLoc DL = Op.getDebugLoc();
16757 + StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
16758 + SDValue Chain = Op.getOperand(0);
16759 + SDValue Value = Op.getOperand(1);
16760 + SDValue Ptr = Op.getOperand(2);
16762 + if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
16763 + Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
16764 + // Convert pointer from byte address to dword address.
16765 + Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
16766 + DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
16767 + Ptr, DAG.getConstant(2, MVT::i32)));
16769 + if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
16770 + assert(!"Truncated and indexed stores not supported yet");
16772 + Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
16776 + return SDValue();
16779 +// return (512 + (kc_bank << 12)
16781 +ConstantAddressBlock(unsigned AddressSpace) {
16782 + switch (AddressSpace) {
16783 + case AMDGPUAS::CONSTANT_BUFFER_0:
16785 + case AMDGPUAS::CONSTANT_BUFFER_1:
16786 + return 512 + 4096;
16787 + case AMDGPUAS::CONSTANT_BUFFER_2:
16788 + return 512 + 4096 * 2;
16789 + case AMDGPUAS::CONSTANT_BUFFER_3:
16790 + return 512 + 4096 * 3;
16791 + case AMDGPUAS::CONSTANT_BUFFER_4:
16792 + return 512 + 4096 * 4;
16793 + case AMDGPUAS::CONSTANT_BUFFER_5:
16794 + return 512 + 4096 * 5;
16795 + case AMDGPUAS::CONSTANT_BUFFER_6:
16796 + return 512 + 4096 * 6;
16797 + case AMDGPUAS::CONSTANT_BUFFER_7:
16798 + return 512 + 4096 * 7;
16799 + case AMDGPUAS::CONSTANT_BUFFER_8:
16800 + return 512 + 4096 * 8;
16801 + case AMDGPUAS::CONSTANT_BUFFER_9:
16802 + return 512 + 4096 * 9;
16803 + case AMDGPUAS::CONSTANT_BUFFER_10:
16804 + return 512 + 4096 * 10;
16805 + case AMDGPUAS::CONSTANT_BUFFER_11:
16806 + return 512 + 4096 * 11;
16807 + case AMDGPUAS::CONSTANT_BUFFER_12:
16808 + return 512 + 4096 * 12;
16809 + case AMDGPUAS::CONSTANT_BUFFER_13:
16810 + return 512 + 4096 * 13;
16811 + case AMDGPUAS::CONSTANT_BUFFER_14:
16812 + return 512 + 4096 * 14;
16813 + case AMDGPUAS::CONSTANT_BUFFER_15:
16814 + return 512 + 4096 * 15;
16820 +SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
16822 + EVT VT = Op.getValueType();
16823 + DebugLoc DL = Op.getDebugLoc();
16824 + LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
16825 + SDValue Chain = Op.getOperand(0);
16826 + SDValue Ptr = Op.getOperand(1);
16827 + SDValue LoweredLoad;
16829 + int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
16830 + if (ConstantBlock > -1) {
16832 + if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
16833 + dyn_cast<Constant>(LoadNode->getSrcValue())) {
16834 + SDValue Slots[4];
16835 + for (unsigned i = 0; i < 4; i++) {
16836 + // We want Const position encoded with the following formula :
16837 + // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
16838 + // const_index is Ptr computed by llvm using an alignment of 16.
16839 + // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
16840 + // then div by 4 at the ISel step
16841 + SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
16842 + DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
16843 + Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
16845 + Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
16847 +      // Non-constant ptr can't be folded; keep it as a v4f32 load
16848 + Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
16849 + DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32))
16853 + if (!VT.isVector()) {
16854 + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
16855 + DAG.getConstant(0, MVT::i32));
16858 + SDValue MergedValues[2] = {
16862 + return DAG.getMergeValues(MergedValues, 2, DL);
16865 + return SDValue();
16868 +SDValue R600TargetLowering::LowerFPOW(SDValue Op,
16869 + SelectionDAG &DAG) const {
16870 + DebugLoc DL = Op.getDebugLoc();
16871 + EVT VT = Op.getValueType();
16872 + SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0));
16873 + SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase);
16874 + return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase);
16877 +/// XXX Only kernel functions are supported, so we can assume for now that
16878 +/// every function is a kernel function, but in the future we should use
16879 +/// separate calling conventions for kernel and non-kernel functions.
16880 +SDValue R600TargetLowering::LowerFormalArguments(
16882 + CallingConv::ID CallConv,
16884 + const SmallVectorImpl<ISD::InputArg> &Ins,
16885 + DebugLoc DL, SelectionDAG &DAG,
16886 + SmallVectorImpl<SDValue> &InVals) const {
16887 + unsigned ParamOffsetBytes = 36;
16888 + Function::const_arg_iterator FuncArg =
16889 + DAG.getMachineFunction().getFunction()->arg_begin();
16890 + for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
16891 + EVT VT = Ins[i].VT;
16892 + Type *ArgType = FuncArg->getType();
16893 + unsigned ArgSizeInBits = ArgType->isPointerTy() ?
16894 + 32 : ArgType->getPrimitiveSizeInBits();
16895 + unsigned ArgBytes = ArgSizeInBits >> 3;
16897 + if (ArgSizeInBits < VT.getSizeInBits()) {
16898 + assert(!ArgType->isFloatTy() &&
16899 + "Extending floating point arguments not supported yet");
16900 + ArgVT = MVT::getIntegerVT(ArgSizeInBits);
16904 + PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
16905 + AMDGPUAS::PARAM_I_ADDRESS);
16906 + SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
16907 + DAG.getConstant(ParamOffsetBytes, MVT::i32),
16908 + MachinePointerInfo(new Argument(PtrTy)),
16909 + ArgVT, false, false, ArgBytes);
16910 + InVals.push_back(Arg);
16911 + ParamOffsetBytes += ArgBytes;
16916 +EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
16917 + if (!VT.isVector()) return MVT::i32;
16918 + return VT.changeVectorElementTypeToInteger();
16921 +//===----------------------------------------------------------------------===//
16922 +// Custom DAG Optimizations
16923 +//===----------------------------------------------------------------------===//
16925 +SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
16926 + DAGCombinerInfo &DCI) const {
16927 + SelectionDAG &DAG = DCI.DAG;
16929 + switch (N->getOpcode()) {
16930 + // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
16931 + case ISD::FP_ROUND: {
16932 + SDValue Arg = N->getOperand(0);
16933 + if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
16934 + return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
16935 + Arg.getOperand(0));
16939 + // Extract_vec (Build_vector) generated by custom lowering
16940 + // also needs to be combined here with a custom combine.
16941 + case ISD::EXTRACT_VECTOR_ELT: {
16942 + SDValue Arg = N->getOperand(0);
16943 + if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
16944 + if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
16945 + unsigned Element = Const->getZExtValue();
16946 + return Arg->getOperand(Element);
16951 + return SDValue();
16953 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600ISelLowering.h llvm-r600/lib/Target/R600/R600ISelLowering.h
16954 --- llvm-3.2.src/lib/Target/R600/R600ISelLowering.h 1970-01-01 01:00:00.000000000 +0100
16955 +++ llvm-r600/lib/Target/R600/R600ISelLowering.h 2013-01-25 19:43:57.463383054 +0100
16957 +//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===//
16959 +// The LLVM Compiler Infrastructure
16961 +// This file is distributed under the University of Illinois Open Source
16962 +// License. See LICENSE.TXT for details.
16964 +//===----------------------------------------------------------------------===//
16967 +/// \brief R600 DAG Lowering interface definition
16969 +//===----------------------------------------------------------------------===//
16971 +#ifndef R600ISELLOWERING_H
16972 +#define R600ISELLOWERING_H
16974 +#include "AMDGPUISelLowering.h"
16978 +class R600InstrInfo;
16980 +class R600TargetLowering : public AMDGPUTargetLowering {
16982 + R600TargetLowering(TargetMachine &TM);
16983 + virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
16984 + MachineBasicBlock * BB) const;
16985 + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
16986 + virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
16987 + void ReplaceNodeResults(SDNode * N,
16988 + SmallVectorImpl<SDValue> &Results,
16989 + SelectionDAG &DAG) const;
16990 + virtual SDValue LowerFormalArguments(
16992 + CallingConv::ID CallConv,
16994 + const SmallVectorImpl<ISD::InputArg> &Ins,
16995 + DebugLoc DL, SelectionDAG &DAG,
16996 + SmallVectorImpl<SDValue> &InVals) const;
16997 + virtual EVT getSetCCResultType(EVT VT) const;
16999 + const R600InstrInfo * TII;
17001 + /// Each OpenCL kernel has nine implicit parameters that are stored in the
17002 + /// first nine dwords of a Vertex Buffer. These implicit parameters are
17003 + /// lowered to load instructions which retrieve the values from the Vertex
17005 + SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
17006 + DebugLoc DL, unsigned DwordOffset) const;
17008 + void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
17009 + MachineRegisterInfo & MRI, unsigned dword_offset) const;
17011 + SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
17013 + /// \brief Lower ROTL opcode to BITALIGN
17014 + SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
17016 + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
17017 + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
17018 + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
17019 + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
17020 + SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
17021 + SDValue LowerFPOW(SDValue Op, SelectionDAG &DAG) const;
17022 + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
17024 + bool isZero(SDValue Op) const;
17027 +} // End namespace llvm;
17029 +#endif // R600ISELLOWERING_H
17030 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600LowerConstCopy.cpp llvm-r600/lib/Target/R600/R600LowerConstCopy.cpp
17031 --- llvm-3.2.src/lib/Target/R600/R600LowerConstCopy.cpp 1970-01-01 01:00:00.000000000 +0100
17032 +++ llvm-r600/lib/Target/R600/R600LowerConstCopy.cpp 2013-01-25 19:43:57.466716387 +0100
17034 +//===-- R600LowerConstCopy.cpp - Propagate ConstCopy / lower them to MOV---===//
17036 +// The LLVM Compiler Infrastructure
17038 +// This file is distributed under the University of Illinois Open Source
17039 +// License. See LICENSE.TXT for details.
17041 +//===----------------------------------------------------------------------===//
17044 +/// This pass is intended to handle remaining ConstCopy pseudo MachineInstr.
17045 +/// ISel will fold each Const Buffer read inside scalar ALU. However it cannot
17046 + /// fold them inside vector instructions, like DOT4 or Cube; ISel emits
17047 +/// ConstCopy instead. This pass (executed after ExpandingSpecialInstr) will try
17048 +/// to fold them if possible or replace them by MOV otherwise.
17049 +/// TODO : Implement the folding part, using Copy Propagation algorithm.
17051 +//===----------------------------------------------------------------------===//
17053 +#include "AMDGPU.h"
17054 +#include "llvm/CodeGen/MachineFunction.h"
17055 +#include "llvm/CodeGen/MachineFunctionPass.h"
17056 +#include "R600InstrInfo.h"
17057 +#include "llvm/GlobalValue.h"
17058 +#include "llvm/CodeGen/MachineInstrBuilder.h"
17062 +class R600LowerConstCopy : public MachineFunctionPass {
17065 + const R600InstrInfo *TII;
17067 + R600LowerConstCopy(TargetMachine &tm);
17068 + virtual bool runOnMachineFunction(MachineFunction &MF);
17070 + const char *getPassName() const { return "R600 Eliminate Symbolic Operand"; }
17073 +char R600LowerConstCopy::ID = 0;
17076 +R600LowerConstCopy::R600LowerConstCopy(TargetMachine &tm) :
17077 + MachineFunctionPass(ID),
17078 + TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo()))
17082 +bool R600LowerConstCopy::runOnMachineFunction(MachineFunction &MF) {
17083 + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
17084 + BB != BB_E; ++BB) {
17085 + MachineBasicBlock &MBB = *BB;
17086 + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
17088 + MachineInstr &MI = *I;
17089 + I = llvm::next(I);
17090 + if (MI.getOpcode() != AMDGPU::CONST_COPY)
17092 + MachineInstr *NewMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::MOV,
17093 + MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
17094 + NewMI->getOperand(9).setImm(MI.getOperand(1).getImm());
17095 + MI.eraseFromParent();
17101 +FunctionPass *createR600LowerConstCopy(TargetMachine &tm) {
17102 + return new R600LowerConstCopy(tm);
17108 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600MachineFunctionInfo.cpp llvm-r600/lib/Target/R600/R600MachineFunctionInfo.cpp
17109 --- llvm-3.2.src/lib/Target/R600/R600MachineFunctionInfo.cpp 1970-01-01 01:00:00.000000000 +0100
17110 +++ llvm-r600/lib/Target/R600/R600MachineFunctionInfo.cpp 2013-01-25 19:43:57.470049720 +0100
17112 +//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===//
17114 +// The LLVM Compiler Infrastructure
17116 +// This file is distributed under the University of Illinois Open Source
17117 +// License. See LICENSE.TXT for details.
17120 +//===----------------------------------------------------------------------===//
17122 +#include "R600MachineFunctionInfo.h"
17124 +using namespace llvm;
17126 +R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
17127 + : MachineFunctionInfo(),
17128 + HasLinearInterpolation(false),
17129 + HasPerspectiveInterpolation(false) {
17130 + memset(Outputs, 0, sizeof(Outputs));
17133 +unsigned R600MachineFunctionInfo::GetIJPerspectiveIndex() const {
17134 + assert(HasPerspectiveInterpolation);
17138 +unsigned R600MachineFunctionInfo::GetIJLinearIndex() const {
17139 + assert(HasLinearInterpolation);
17140 + if (HasPerspectiveInterpolation)
17145 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600MachineFunctionInfo.h llvm-r600/lib/Target/R600/R600MachineFunctionInfo.h
17146 --- llvm-3.2.src/lib/Target/R600/R600MachineFunctionInfo.h 1970-01-01 01:00:00.000000000 +0100
17147 +++ llvm-r600/lib/Target/R600/R600MachineFunctionInfo.h 2013-01-25 19:43:57.470049720 +0100
17149 +//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=//
17151 +// The LLVM Compiler Infrastructure
17153 +// This file is distributed under the University of Illinois Open Source
17154 +// License. See LICENSE.TXT for details.
17156 +//===----------------------------------------------------------------------===//
17159 +//===----------------------------------------------------------------------===//
17161 +#ifndef R600MACHINEFUNCTIONINFO_H
17162 +#define R600MACHINEFUNCTIONINFO_H
17164 +#include "llvm/CodeGen/MachineFunction.h"
17165 +#include "llvm/CodeGen/SelectionDAG.h"
17170 +class R600MachineFunctionInfo : public MachineFunctionInfo {
17173 + R600MachineFunctionInfo(const MachineFunction &MF);
17174 + std::vector<unsigned> ReservedRegs;
17175 + SDNode *Outputs[16];
17176 + bool HasLinearInterpolation;
17177 + bool HasPerspectiveInterpolation;
17179 + unsigned GetIJLinearIndex() const;
17180 + unsigned GetIJPerspectiveIndex() const;
17184 +} // End llvm namespace
17186 +#endif //R600MACHINEFUNCTIONINFO_H
17187 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600RegisterInfo.cpp llvm-r600/lib/Target/R600/R600RegisterInfo.cpp
17188 --- llvm-3.2.src/lib/Target/R600/R600RegisterInfo.cpp 1970-01-01 01:00:00.000000000 +0100
17189 +++ llvm-r600/lib/Target/R600/R600RegisterInfo.cpp 2013-01-25 19:43:57.470049720 +0100
17191 +//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===//
17193 +// The LLVM Compiler Infrastructure
17195 +// This file is distributed under the University of Illinois Open Source
17196 +// License. See LICENSE.TXT for details.
17198 +//===----------------------------------------------------------------------===//
17201 +/// \brief R600 implementation of the TargetRegisterInfo class.
17203 +//===----------------------------------------------------------------------===//
17205 +#include "R600RegisterInfo.h"
17206 +#include "AMDGPUTargetMachine.h"
17207 +#include "R600Defines.h"
17208 +#include "R600MachineFunctionInfo.h"
17210 +using namespace llvm;
17212 +R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm,
17213 + const TargetInstrInfo &tii)
17214 +: AMDGPURegisterInfo(tm, tii),
17219 +BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
17220 + BitVector Reserved(getNumRegs());
17221 + const R600MachineFunctionInfo * MFI = MF.getInfo<R600MachineFunctionInfo>();
17223 + Reserved.set(AMDGPU::ZERO);
17224 + Reserved.set(AMDGPU::HALF);
17225 + Reserved.set(AMDGPU::ONE);
17226 + Reserved.set(AMDGPU::ONE_INT);
17227 + Reserved.set(AMDGPU::NEG_HALF);
17228 + Reserved.set(AMDGPU::NEG_ONE);
17229 + Reserved.set(AMDGPU::PV_X);
17230 + Reserved.set(AMDGPU::ALU_LITERAL_X);
17231 + Reserved.set(AMDGPU::ALU_CONST);
17232 + Reserved.set(AMDGPU::PREDICATE_BIT);
17233 + Reserved.set(AMDGPU::PRED_SEL_OFF);
17234 + Reserved.set(AMDGPU::PRED_SEL_ZERO);
17235 + Reserved.set(AMDGPU::PRED_SEL_ONE);
17237 + for (std::vector<unsigned>::const_iterator I = MFI->ReservedRegs.begin(),
17238 + E = MFI->ReservedRegs.end(); I != E; ++I) {
17239 + Reserved.set(*I);
17245 +const TargetRegisterClass *
17246 +R600RegisterInfo::getISARegClass(const TargetRegisterClass * rc) const {
17247 + switch (rc->getID()) {
17248 + case AMDGPU::GPRF32RegClassID:
17249 + case AMDGPU::GPRI32RegClassID:
17250 + return &AMDGPU::R600_Reg32RegClass;
17251 + default: return rc;
17255 +unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
17256 + return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
17259 +const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
17261 + switch(VT.SimpleTy) {
17263 + case MVT::i32: return &AMDGPU::R600_TReg32RegClass;
17267 +unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) const {
17268 + switch (Channel) {
17269 + default: assert(!"Invalid channel index"); return 0;
17270 + case 0: return AMDGPU::sel_x;
17271 + case 1: return AMDGPU::sel_y;
17272 + case 2: return AMDGPU::sel_z;
17273 + case 3: return AMDGPU::sel_w;
17276 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600RegisterInfo.h llvm-r600/lib/Target/R600/R600RegisterInfo.h
17277 --- llvm-3.2.src/lib/Target/R600/R600RegisterInfo.h 1970-01-01 01:00:00.000000000 +0100
17278 +++ llvm-r600/lib/Target/R600/R600RegisterInfo.h 2013-01-25 19:43:57.470049720 +0100
17280 +//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===//
17282 +// The LLVM Compiler Infrastructure
17284 +// This file is distributed under the University of Illinois Open Source
17285 +// License. See LICENSE.TXT for details.
17287 +//===----------------------------------------------------------------------===//
17290 +/// \brief Interface definition for R600RegisterInfo
17292 +//===----------------------------------------------------------------------===//
17294 +#ifndef R600REGISTERINFO_H_
17295 +#define R600REGISTERINFO_H_
17297 +#include "AMDGPUTargetMachine.h"
17298 +#include "AMDGPURegisterInfo.h"
17302 +class R600TargetMachine;
17303 +class TargetInstrInfo;
17305 +struct R600RegisterInfo : public AMDGPURegisterInfo {
17306 + AMDGPUTargetMachine &TM;
17307 + const TargetInstrInfo &TII;
17309 + R600RegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
17311 + virtual BitVector getReservedRegs(const MachineFunction &MF) const;
17313 + /// \param RC is an AMDIL reg class.
17315 + /// \returns the R600 reg class that is equivalent to \p RC.
17316 + virtual const TargetRegisterClass *getISARegClass(
17317 + const TargetRegisterClass *RC) const;
17319 + /// \brief get the HW encoding for a register's channel.
17320 + unsigned getHWRegChan(unsigned reg) const;
17322 + /// \brief get the register class of the specified type to use in the
17323 + /// CFGStructurizer
17324 + virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
17326 + /// \returns the sub reg enum value for the given \p Channel
17327 + /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sel_x)
17328 + unsigned getSubRegFromChannel(unsigned Channel) const;
17332 +} // End namespace llvm
17334 +#endif // R600REGISTERINFO_H_
17335 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600RegisterInfo.td llvm-r600/lib/Target/R600/R600RegisterInfo.td
17336 --- llvm-3.2.src/lib/Target/R600/R600RegisterInfo.td 1970-01-01 01:00:00.000000000 +0100
17337 +++ llvm-r600/lib/Target/R600/R600RegisterInfo.td 2013-01-25 19:43:57.470049720 +0100
17340 +class R600Reg <string name, bits<16> encoding> : Register<name> {
17341 + let Namespace = "AMDGPU";
17342 + let HWEncoding = encoding;
17345 +class R600RegWithChan <string name, bits<9> sel, string chan> :
17346 + Register <name> {
17348 + field bits<2> chan_encoding = !if(!eq(chan, "X"), 0,
17349 + !if(!eq(chan, "Y"), 1,
17350 + !if(!eq(chan, "Z"), 2,
17351 + !if(!eq(chan, "W"), 3, 0))));
17352 + let HWEncoding{8-0} = sel;
17353 + let HWEncoding{10-9} = chan_encoding;
17354 + let Namespace = "AMDGPU";
17357 +class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
17358 + RegisterWithSubRegs<n, subregs> {
17359 + let Namespace = "AMDGPU";
17360 + let SubRegIndices = [sel_x, sel_y, sel_z, sel_w];
17361 + let HWEncoding = encoding;
17364 +foreach Index = 0-127 in {
17365 + foreach Chan = [ "X", "Y", "Z", "W" ] in {
17366 + // 32-bit Temporary Registers
17367 + def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>;
17369 + // 128-bit Temporary Registers
17370 + def T#Index#_XYZW : R600Reg_128 <"T"#Index#".XYZW",
17371 + [!cast<Register>("T"#Index#"_X"),
17372 + !cast<Register>("T"#Index#"_Y"),
17373 + !cast<Register>("T"#Index#"_Z"),
17374 + !cast<Register>("T"#Index#"_W")],
17378 +// Array Base Register holding input in FS
17379 +foreach Index = 448-464 in {
17380 + def ArrayBase#Index : R600Reg<"ARRAY_BASE", Index>;
17384 +// Special Registers
17386 +def ZERO : R600Reg<"0.0", 248>;
17387 +def ONE : R600Reg<"1.0", 249>;
17388 +def NEG_ONE : R600Reg<"-1.0", 249>;
17389 +def ONE_INT : R600Reg<"1", 250>;
17390 +def HALF : R600Reg<"0.5", 252>;
17391 +def NEG_HALF : R600Reg<"-0.5", 252>;
17392 +def ALU_LITERAL_X : R600Reg<"literal.x", 253>;
17393 +def PV_X : R600Reg<"pv.x", 254>;
17394 +def PREDICATE_BIT : R600Reg<"PredicateBit", 0>;
17395 +def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
17396 +def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
17397 +def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>;
17399 +def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
17400 + (add (sequence "ArrayBase%u", 448, 464))>;
17401 +// special registers for ALU src operands
17402 +// const buffer reference, SRCx_SEL contains index
17403 +def ALU_CONST : R600Reg<"CBuf", 0>;
17404 +// interpolation param reference, SRCx_SEL contains index
17405 +def ALU_PARAM : R600Reg<"Param", 0>;
17407 +def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32,
17408 + (add (sequence "T%u_X", 0, 127))>;
17410 +def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32,
17411 + (add (sequence "T%u_Y", 0, 127))>;
17413 +def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32,
17414 + (add (sequence "T%u_Z", 0, 127))>;
17416 +def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32,
17417 + (add (sequence "T%u_W", 0, 127))>;
17419 +def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
17420 + (interleave R600_TReg32_X, R600_TReg32_Y,
17421 + R600_TReg32_Z, R600_TReg32_W)>;
17423 +def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
17426 + ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
17427 + ALU_CONST, ALU_PARAM
17430 +def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
17431 + PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>;
17433 +def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add
17436 +def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
17437 + (add (sequence "T%u_XYZW", 0, 127))> {
17438 + let CopyCost = -1;
17440 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/R600Schedule.td llvm-r600/lib/Target/R600/R600Schedule.td
17441 --- llvm-3.2.src/lib/Target/R600/R600Schedule.td 1970-01-01 01:00:00.000000000 +0100
17442 +++ llvm-r600/lib/Target/R600/R600Schedule.td 2013-01-25 19:43:57.470049720 +0100
17444 +//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===//
17446 +// The LLVM Compiler Infrastructure
17448 +// This file is distributed under the University of Illinois Open Source
17449 +// License. See LICENSE.TXT for details.
17451 +//===----------------------------------------------------------------------===//
17453 +// R600 has a VLIW architecture. On pre-cayman cards there are 5 instruction
17454 +// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For cayman cards, the TRANS
17455 +// slot has been removed.
17457 +//===----------------------------------------------------------------------===//
17460 +def ALU_X : FuncUnit;
17461 +def ALU_Y : FuncUnit;
17462 +def ALU_Z : FuncUnit;
17463 +def ALU_W : FuncUnit;
17464 +def TRANS : FuncUnit;
17466 +def AnyALU : InstrItinClass;
17467 +def VecALU : InstrItinClass;
17468 +def TransALU : InstrItinClass;
17470 +def R600_EG_Itin : ProcessorItineraries <
17471 + [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL],
17474 + InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>,
17475 + InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_X, ALU_W]>]>,
17476 + InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>,
17477 + InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
17480 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIAnnotateControlFlow.cpp llvm-r600/lib/Target/R600/SIAnnotateControlFlow.cpp
17481 --- llvm-3.2.src/lib/Target/R600/SIAnnotateControlFlow.cpp 1970-01-01 01:00:00.000000000 +0100
17482 +++ llvm-r600/lib/Target/R600/SIAnnotateControlFlow.cpp 2013-01-25 19:43:57.470049720 +0100
17484 +//===-- SIAnnotateControlFlow.cpp - Annotate control flow -----------------===//
17486 +// The LLVM Compiler Infrastructure
17488 +// This file is distributed under the University of Illinois Open Source
17489 +// License. See LICENSE.TXT for details.
17491 +//===----------------------------------------------------------------------===//
17494 +/// Annotates the control flow with hardware specific intrinsics.
17496 +//===----------------------------------------------------------------------===//
17498 +#include "AMDGPU.h"
17500 +#include "llvm/Pass.h"
17501 +#include "llvm/Module.h"
17502 +#include "llvm/Analysis/Dominators.h"
17503 +#include "llvm/Transforms/Utils/BasicBlockUtils.h"
17504 +#include "llvm/ADT/DepthFirstIterator.h"
17505 +#include "llvm/Transforms/Utils/SSAUpdater.h"
17507 +using namespace llvm;
17511 +// Complex types used in this pass
17512 +typedef std::pair<BasicBlock *, Value *> StackEntry;
17513 +typedef SmallVector<StackEntry, 16> StackVector;
17515 +// Intrinsic names the control flow is annotated with
17516 +static const char *IfIntrinsic = "llvm.SI.if";
17517 +static const char *ElseIntrinsic = "llvm.SI.else";
17518 +static const char *BreakIntrinsic = "llvm.SI.break";
17519 +static const char *IfBreakIntrinsic = "llvm.SI.if.break";
17520 +static const char *ElseBreakIntrinsic = "llvm.SI.else.break";
17521 +static const char *LoopIntrinsic = "llvm.SI.loop";
17522 +static const char *EndCfIntrinsic = "llvm.SI.end.cf";
17524 +class SIAnnotateControlFlow : public FunctionPass {
17531 + Type *ReturnStruct;
17533 + ConstantInt *BoolTrue;
17534 + ConstantInt *BoolFalse;
17535 + UndefValue *BoolUndef;
17536 + Constant *Int64Zero;
17541 + Constant *IfBreak;
17542 + Constant *ElseBreak;
17546 + DominatorTree *DT;
17547 + StackVector Stack;
17548 + SSAUpdater PhiInserter;
17550 + bool isTopOfStack(BasicBlock *BB);
17552 + Value *popSaved();
17554 + void push(BasicBlock *BB, Value *Saved);
17556 + bool isElse(PHINode *Phi);
17558 + void eraseIfUnused(PHINode *Phi);
17560 + void openIf(BranchInst *Term);
17562 + void insertElse(BranchInst *Term);
17564 + void handleLoopCondition(Value *Cond);
17566 + void handleLoop(BranchInst *Term);
17568 + void closeControlFlow(BasicBlock *BB);
17571 + SIAnnotateControlFlow():
17572 + FunctionPass(ID) { }
17574 + virtual bool doInitialization(Module &M);
17576 + virtual bool runOnFunction(Function &F);
17578 + virtual const char *getPassName() const {
17579 + return "SI annotate control flow";
17582 + virtual void getAnalysisUsage(AnalysisUsage &AU) const {
17583 + AU.addRequired<DominatorTree>();
17584 + AU.addPreserved<DominatorTree>();
17585 + FunctionPass::getAnalysisUsage(AU);
17590 +} // end anonymous namespace
17592 +char SIAnnotateControlFlow::ID = 0;
17594 +/// \brief Initialize all the types and constants used in the pass
17595 +bool SIAnnotateControlFlow::doInitialization(Module &M) {
17596 + LLVMContext &Context = M.getContext();
17598 + Void = Type::getVoidTy(Context);
17599 + Boolean = Type::getInt1Ty(Context);
17600 + Int64 = Type::getInt64Ty(Context);
17601 + ReturnStruct = StructType::get(Boolean, Int64, (Type *)0);
17603 + BoolTrue = ConstantInt::getTrue(Context);
17604 + BoolFalse = ConstantInt::getFalse(Context);
17605 + BoolUndef = UndefValue::get(Boolean);
17606 + Int64Zero = ConstantInt::get(Int64, 0);
17608 + If = M.getOrInsertFunction(
17609 + IfIntrinsic, ReturnStruct, Boolean, (Type *)0);
17611 + Else = M.getOrInsertFunction(
17612 + ElseIntrinsic, ReturnStruct, Int64, (Type *)0);
17614 + Break = M.getOrInsertFunction(
17615 + BreakIntrinsic, Int64, Int64, (Type *)0);
17617 + IfBreak = M.getOrInsertFunction(
17618 + IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)0);
17620 + ElseBreak = M.getOrInsertFunction(
17621 + ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)0);
17623 + Loop = M.getOrInsertFunction(
17624 + LoopIntrinsic, Boolean, Int64, (Type *)0);
17626 + EndCf = M.getOrInsertFunction(
17627 + EndCfIntrinsic, Void, Int64, (Type *)0);
17632 +/// \brief Is BB the last block saved on the stack ?
17633 +bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) {
17634 + return Stack.back().first == BB;
17637 +/// \brief Pop the last saved value from the control flow stack
17638 +Value *SIAnnotateControlFlow::popSaved() {
17639 + return Stack.pop_back_val().second;
17642 +/// \brief Push a BB and saved value to the control flow stack
17643 +void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) {
17644 + Stack.push_back(std::make_pair(BB, Saved));
17647 +/// \brief Can the condition represented by this PHI node be treated like
17648 +/// an "Else" block?
17649 +bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
17650 + BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock();
17651 + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
17652 + if (Phi->getIncomingBlock(i) == IDom) {
17654 + if (Phi->getIncomingValue(i) != BoolTrue)
17658 + if (Phi->getIncomingValue(i) != BoolFalse)
17666 +// \brief Erase "Phi" if it is not used any more
17667 +void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
17668 + if (!Phi->hasNUsesOrMore(1))
17669 + Phi->eraseFromParent();
17672 +/// \brief Open a new "If" block
17673 +void SIAnnotateControlFlow::openIf(BranchInst *Term) {
17674 + Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term);
17675 + Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
17676 + push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
17679 +/// \brief Close the last "If" block and open a new "Else" block
17680 +void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
17681 + Value *Ret = CallInst::Create(Else, popSaved(), "", Term);
17682 + Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
17683 + push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
17686 +/// \brief Recursively handle the condition leading to a loop
17687 +void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) {
17688 + if (PHINode *Phi = dyn_cast<PHINode>(Cond)) {
17690 + // Handle all non constant incoming values first
17691 + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
17692 + Value *Incoming = Phi->getIncomingValue(i);
17693 + if (isa<ConstantInt>(Incoming))
17696 + Phi->setIncomingValue(i, BoolFalse);
17697 + handleLoopCondition(Incoming);
17700 + BasicBlock *Parent = Phi->getParent();
17701 + BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
17703 + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
17705 + Value *Incoming = Phi->getIncomingValue(i);
17706 + if (Incoming != BoolTrue)
17709 + BasicBlock *From = Phi->getIncomingBlock(i);
17710 + if (From == IDom) {
17711 + CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
17712 + if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
17713 + Value *Args[] = {
17714 + OldEnd->getArgOperand(0),
17715 + PhiInserter.GetValueAtEndOfBlock(Parent)
17717 + Value *Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
17718 + PhiInserter.AddAvailableValue(Parent, Ret);
17723 + TerminatorInst *Insert = From->getTerminator();
17724 + Value *Arg = PhiInserter.GetValueAtEndOfBlock(From);
17725 + Value *Ret = CallInst::Create(Break, Arg, "", Insert);
17726 + PhiInserter.AddAvailableValue(From, Ret);
17728 + eraseIfUnused(Phi);
17730 + } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
17731 + BasicBlock *Parent = Inst->getParent();
17732 + TerminatorInst *Insert = Parent->getTerminator();
17733 + Value *Args[] = { Cond, PhiInserter.GetValueAtEndOfBlock(Parent) };
17734 + Value *Ret = CallInst::Create(IfBreak, Args, "", Insert);
17735 + PhiInserter.AddAvailableValue(Parent, Ret);
17738 + assert(0 && "Unhandled loop condition!");
17742 +/// \brief Handle a back edge (loop)
17743 +void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
17744 + BasicBlock *Target = Term->getSuccessor(1);
17745 + PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front());
17747 + PhiInserter.Initialize(Int64, "");
17748 + PhiInserter.AddAvailableValue(Target, Broken);
17750 + Value *Cond = Term->getCondition();
17751 + Term->setCondition(BoolTrue);
17752 + handleLoopCondition(Cond);
17754 + BasicBlock *BB = Term->getParent();
17755 + Value *Arg = PhiInserter.GetValueAtEndOfBlock(BB);
17756 + for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
17757 + PI != PE; ++PI) {
17759 + Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI);
17762 + Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
17763 + push(Term->getSuccessor(0), Arg);
17766 +/// \brief Close the last opened control flow
17767 +void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
17768 + CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt());
17771 +/// \brief Annotate the control flow with intrinsics so the backend can
17772 +/// recognize if/then/else and loops.
17773 +bool SIAnnotateControlFlow::runOnFunction(Function &F) {
17774 + DT = &getAnalysis<DominatorTree>();
17776 + for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
17777 + E = df_end(&F.getEntryBlock()); I != E; ++I) {
17779 + BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator());
17781 + if (!Term || Term->isUnconditional()) {
17782 + if (isTopOfStack(*I))
17783 + closeControlFlow(*I);
17787 + if (I.nodeVisited(Term->getSuccessor(1))) {
17788 + if (isTopOfStack(*I))
17789 + closeControlFlow(*I);
17790 + handleLoop(Term);
17794 + if (isTopOfStack(*I)) {
17795 + PHINode *Phi = dyn_cast<PHINode>(Term->getCondition());
17796 + if (Phi && Phi->getParent() == *I && isElse(Phi)) {
17797 + insertElse(Term);
17798 + eraseIfUnused(Phi);
17801 + closeControlFlow(*I);
17806 + assert(Stack.empty());
17810 +/// \brief Create the annotation pass
17811 +FunctionPass *llvm::createSIAnnotateControlFlowPass() {
17812 + return new SIAnnotateControlFlow();
17814 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIAssignInterpRegs.cpp llvm-r600/lib/Target/R600/SIAssignInterpRegs.cpp
17815 --- llvm-3.2.src/lib/Target/R600/SIAssignInterpRegs.cpp 1970-01-01 01:00:00.000000000 +0100
17816 +++ llvm-r600/lib/Target/R600/SIAssignInterpRegs.cpp 2013-01-25 19:43:57.470049720 +0100
17818 +//===-- SIAssignInterpRegs.cpp - Assign interpolation registers -----------===//
17820 +// The LLVM Compiler Infrastructure
17822 +// This file is distributed under the University of Illinois Open Source
17823 +// License. See LICENSE.TXT for details.
17825 +//===----------------------------------------------------------------------===//
17828 +/// \brief This pass maps the pseudo interpolation registers to the correct physical
17831 +/// Prior to executing a fragment shader, the GPU loads interpolation
17832 +/// parameters into physical registers. The specific physical register that each
17833 +/// interpolation parameter ends up in depends on the type of the interpolation
17834 +/// parameter as well as how many interpolation parameters are used by the
17837 +//===----------------------------------------------------------------------===//
17841 +#include "AMDGPU.h"
17842 +#include "AMDIL.h"
17843 +#include "SIMachineFunctionInfo.h"
17844 +#include "llvm/CodeGen/MachineFunctionPass.h"
17845 +#include "llvm/CodeGen/MachineInstrBuilder.h"
17846 +#include "llvm/CodeGen/MachineRegisterInfo.h"
17848 +using namespace llvm;
17852 +class SIAssignInterpRegsPass : public MachineFunctionPass {
17856 + TargetMachine &TM;
17858 + void addLiveIn(MachineFunction * MF, MachineRegisterInfo & MRI,
17859 + unsigned physReg, unsigned virtReg);
17862 + SIAssignInterpRegsPass(TargetMachine &tm) :
17863 + MachineFunctionPass(ID), TM(tm) { }
17865 + virtual bool runOnMachineFunction(MachineFunction &MF);
17867 + const char *getPassName() const { return "SI Assign intrpolation registers"; }
17870 +} // End anonymous namespace
17872 +char SIAssignInterpRegsPass::ID = 0;
17874 +#define INTERP_VALUES 16
17875 +#define REQUIRED_VALUE_MAX_INDEX 7
17877 +struct InterpInfo {
17879 + unsigned Regs[3];
17880 + unsigned RegCount;
17884 +FunctionPass *llvm::createSIAssignInterpRegsPass(TargetMachine &tm) {
17885 + return new SIAssignInterpRegsPass(tm);
17888 +bool SIAssignInterpRegsPass::runOnMachineFunction(MachineFunction &MF) {
17890 + struct InterpInfo InterpUse[INTERP_VALUES] = {
17891 + {false, {AMDGPU::PERSP_SAMPLE_I, AMDGPU::PERSP_SAMPLE_J}, 2},
17892 + {false, {AMDGPU::PERSP_CENTER_I, AMDGPU::PERSP_CENTER_J}, 2},
17893 + {false, {AMDGPU::PERSP_CENTROID_I, AMDGPU::PERSP_CENTROID_J}, 2},
17894 + {false, {AMDGPU::PERSP_I_W, AMDGPU::PERSP_J_W, AMDGPU::PERSP_1_W}, 3},
17895 + {false, {AMDGPU::LINEAR_SAMPLE_I, AMDGPU::LINEAR_SAMPLE_J}, 2},
17896 + {false, {AMDGPU::LINEAR_CENTER_I, AMDGPU::LINEAR_CENTER_J}, 2},
17897 + {false, {AMDGPU::LINEAR_CENTROID_I, AMDGPU::LINEAR_CENTROID_J}, 2},
17898 + {false, {AMDGPU::LINE_STIPPLE_TEX_COORD}, 1},
17899 + {false, {AMDGPU::POS_X_FLOAT}, 1},
17900 + {false, {AMDGPU::POS_Y_FLOAT}, 1},
17901 + {false, {AMDGPU::POS_Z_FLOAT}, 1},
17902 + {false, {AMDGPU::POS_W_FLOAT}, 1},
17903 + {false, {AMDGPU::FRONT_FACE}, 1},
17904 + {false, {AMDGPU::ANCILLARY}, 1},
17905 + {false, {AMDGPU::SAMPLE_COVERAGE}, 1},
17906 + {false, {AMDGPU::POS_FIXED_PT}, 1}
17909 + SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
17910 + // This pass is only needed for pixel shaders.
17911 + if (MFI->ShaderType != ShaderType::PIXEL) {
17914 + MachineRegisterInfo &MRI = MF.getRegInfo();
17915 + bool ForceEnable = true;
17917 + // First pass, mark the interpolation values that are used.
17918 + for (unsigned InterpIdx = 0; InterpIdx < INTERP_VALUES; InterpIdx++) {
17919 + for (unsigned RegIdx = 0; RegIdx < InterpUse[InterpIdx].RegCount;
17921 + InterpUse[InterpIdx].Enabled = InterpUse[InterpIdx].Enabled ||
17922 + !MRI.use_empty(InterpUse[InterpIdx].Regs[RegIdx]);
17923 + if (InterpUse[InterpIdx].Enabled &&
17924 + InterpIdx <= REQUIRED_VALUE_MAX_INDEX) {
17925 + ForceEnable = false;
17930 + // At least one interpolation mode must be enabled or else the GPU will hang.
17931 + if (ForceEnable) {
17932 + InterpUse[0].Enabled = true;
17935 + unsigned UsedVgprs = 0;
17937 + // Second pass, replace with VGPRs.
17938 + for (unsigned InterpIdx = 0; InterpIdx < INTERP_VALUES; InterpIdx++) {
17939 + if (!InterpUse[InterpIdx].Enabled) {
17942 + MFI->SPIPSInputAddr |= (1 << InterpIdx);
17944 + for (unsigned RegIdx = 0; RegIdx < InterpUse[InterpIdx].RegCount;
17945 + RegIdx++, UsedVgprs++) {
17946 + unsigned NewReg = AMDGPU::VReg_32RegClass.getRegister(UsedVgprs);
17947 + unsigned VirtReg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
17948 + MRI.replaceRegWith(InterpUse[InterpIdx].Regs[RegIdx], VirtReg);
17949 + addLiveIn(&MF, MRI, NewReg, VirtReg);
17956 +void SIAssignInterpRegsPass::addLiveIn(MachineFunction * MF,
17957 + MachineRegisterInfo & MRI,
17958 + unsigned physReg, unsigned virtReg) {
17959 + const TargetInstrInfo * TII = TM.getInstrInfo();
17960 + if (!MRI.isLiveIn(physReg)) {
17961 + MRI.addLiveIn(physReg, virtReg);
17962 + MF->front().addLiveIn(physReg);
17963 + BuildMI(MF->front(), MF->front().begin(), DebugLoc(),
17964 + TII->get(TargetOpcode::COPY), virtReg)
17965 + .addReg(physReg);
17967 + MRI.replaceRegWith(virtReg, MRI.getLiveInVirtReg(physReg));
17970 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInsertWaits.cpp llvm-r600/lib/Target/R600/SIInsertWaits.cpp
17971 --- llvm-3.2.src/lib/Target/R600/SIInsertWaits.cpp 1970-01-01 01:00:00.000000000 +0100
17972 +++ llvm-r600/lib/Target/R600/SIInsertWaits.cpp 2013-01-25 19:43:57.473383054 +0100
17974 +//===-- SIInsertWaits.cpp - Insert waits for memory reads and writes -----===//
17976 +// The LLVM Compiler Infrastructure
17978 +// This file is distributed under the University of Illinois Open Source
17979 +// License. See LICENSE.TXT for details.
17981 +//===----------------------------------------------------------------------===//
17984 +/// \brief Insert wait instructions for memory reads and writes.
17986 +/// Memory reads and writes are issued asynchronously, so we need to insert
17987 +/// S_WAITCNT instructions when we want to access any of their results or
17988 +/// overwrite any register that's used asynchronously.
17990 +//===----------------------------------------------------------------------===//
17992 +#include "AMDGPU.h"
17993 +#include "SIInstrInfo.h"
17994 +#include "SIMachineFunctionInfo.h"
17995 +#include "llvm/CodeGen/MachineFunction.h"
17996 +#include "llvm/CodeGen/MachineFunctionPass.h"
17997 +#include "llvm/CodeGen/MachineInstrBuilder.h"
17998 +#include "llvm/CodeGen/MachineRegisterInfo.h"
18000 +using namespace llvm;
18004 +/// \brief One variable for each of the hardware counters
18011 + unsigned Array[3];
18015 +typedef Counters RegCounters[512];
18016 +typedef std::pair<unsigned, unsigned> RegInterval;
18018 +class SIInsertWaits : public MachineFunctionPass {
18022 + const SIInstrInfo *TII;
18023 + const SIRegisterInfo &TRI;
18024 + const MachineRegisterInfo *MRI;
18026 + /// \brief Constant hardware limits
18027 + static const Counters WaitCounts;
18029 + /// \brief Constant zero value
18030 + static const Counters ZeroCounts;
18032 + /// \brief Counter values we have already waited on.
18033 + Counters WaitedOn;
18035 + /// \brief Counter values for last instruction issued.
18036 + Counters LastIssued;
18038 + /// \brief Registers used by async instructions.
18039 + RegCounters UsedRegs;
18041 + /// \brief Registers defined by async instructions.
18042 + RegCounters DefinedRegs;
18044 + /// \brief Different export instruction types seen since last wait.
18045 + unsigned ExpInstrTypesSeen;
18047 + /// \brief Get increment/decrement amount for this instruction.
18048 + Counters getHwCounts(MachineInstr &MI);
18050 + /// \brief Is operand relevant for async execution?
18051 + bool isOpRelevant(MachineOperand &Op);
18053 + /// \brief Get register interval an operand affects.
18054 + RegInterval getRegInterval(MachineOperand &Op);
18056 +  /// \brief Handle an instruction's async components
18057 + void pushInstruction(MachineInstr &MI);
18059 + /// \brief Insert the actual wait instruction
18060 + bool insertWait(MachineBasicBlock &MBB,
18061 + MachineBasicBlock::iterator I,
18062 + const Counters &Counts);
18064 + /// \brief Resolve all operand dependencies to counter requirements
18065 + Counters handleOperands(MachineInstr &MI);
18068 + SIInsertWaits(TargetMachine &tm) :
18069 + MachineFunctionPass(ID),
18070 + TII(static_cast<const SIInstrInfo*>(tm.getInstrInfo())),
18071 + TRI(TII->getRegisterInfo()) { }
18073 + virtual bool runOnMachineFunction(MachineFunction &MF);
18075 + const char *getPassName() const {
18076 + return "SI insert wait instructions";
18081 +} // End anonymous namespace
18083 +char SIInsertWaits::ID = 0;
18085 +const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };
18086 +const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
18088 +FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
18089 + return new SIInsertWaits(tm);
18092 +Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
18094 + uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags;
18097 + Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
18099 + // Only consider stores or EXP for EXP_CNT
18100 + Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
18101 + (MI.getOpcode() == AMDGPU::EXP || !MI.getDesc().mayStore()));
18103 +  // LGKM may use larger values
18104 + if (TSFlags & SIInstrFlags::LGKM_CNT) {
18106 + MachineOperand &Op = MI.getOperand(0);
18107 + assert(Op.isReg() && "First LGKM operand must be a register!");
18109 + unsigned Reg = Op.getReg();
18110 + unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
18111 + Result.Named.LGKM = Size > 4 ? 2 : 1;
18114 + Result.Named.LGKM = 0;
18120 +bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
18122 + // Constants are always irrelevant
18126 + // Defines are always relevant
18130 + // For exports all registers are relevant
18131 + MachineInstr &MI = *Op.getParent();
18132 + if (MI.getOpcode() == AMDGPU::EXP)
18135 + // For stores the stored value is also relevant
18136 + if (!MI.getDesc().mayStore())
18139 + for (MachineInstr::mop_iterator I = MI.operands_begin(),
18140 + E = MI.operands_end(); I != E; ++I) {
18142 + if (I->isReg() && I->isUse())
18143 + return Op.isIdenticalTo(*I);
18149 +RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
18152 + return std::make_pair(0, 0);
18154 + unsigned Reg = Op.getReg();
18155 + unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
18157 + assert(Size >= 4);
18159 + RegInterval Result;
18160 + Result.first = TRI.getEncodingValue(Reg);
18161 + Result.second = Result.first + Size / 4;
18166 +void SIInsertWaits::pushInstruction(MachineInstr &MI) {
18168 + // Get the hardware counter increments and sum them up
18169 + Counters Increment = getHwCounts(MI);
18170 + unsigned Sum = 0;
18172 + for (unsigned i = 0; i < 3; ++i) {
18173 + LastIssued.Array[i] += Increment.Array[i];
18174 + Sum += Increment.Array[i];
18177 + // If we don't increase anything then that's it
18181 + // Remember which export instructions we have seen
18182 + if (Increment.Named.EXP) {
18183 + ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2;
18186 + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
18188 + MachineOperand &Op = MI.getOperand(i);
18189 + if (!isOpRelevant(Op))
18192 + RegInterval Interval = getRegInterval(Op);
18193 + for (unsigned j = Interval.first; j < Interval.second; ++j) {
18195 + // Remember which registers we define
18197 + DefinedRegs[j] = LastIssued;
18199 + // and which one we are using
18201 + UsedRegs[j] = LastIssued;
18206 +bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
18207 + MachineBasicBlock::iterator I,
18208 + const Counters &Required) {
18210 + // End of program? No need to wait on anything
18211 + if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
18214 + // Figure out if the async instructions execute in order
18217 + // VM_CNT is always ordered
18218 + Ordered[0] = true;
18220 + // EXP_CNT is unordered if we have both EXP & VM-writes
18221 + Ordered[1] = ExpInstrTypesSeen == 3;
18223 + // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
18224 + Ordered[2] = false;
18226 + // The values we are going to put into the S_WAITCNT instruction
18227 + Counters Counts = WaitCounts;
18229 + // Do we really need to wait?
18230 + bool NeedWait = false;
18232 + for (unsigned i = 0; i < 3; ++i) {
18234 + if (Required.Array[i] <= WaitedOn.Array[i])
18239 + if (Ordered[i]) {
18240 + unsigned Value = LastIssued.Array[i] - Required.Array[i];
18242 +      // adjust the value to the real hardware possibilities
18243 + Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);
18246 + Counts.Array[i] = 0;
18248 +    // Remember what we have waited on
18249 + WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
18255 + // Reset EXP_CNT instruction types
18256 + if (Counts.Named.EXP == 0)
18257 + ExpInstrTypesSeen = 0;
18259 + // Build the wait instruction
18260 + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
18261 + .addImm((Counts.Named.VM & 0xF) |
18262 + ((Counts.Named.EXP & 0x7) << 4) |
18263 + ((Counts.Named.LGKM & 0x7) << 8));
18268 +/// \brief helper function for handleOperands
18269 +static void increaseCounters(Counters &Dst, const Counters &Src) {
18271 + for (unsigned i = 0; i < 3; ++i)
18272 + Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
18275 +Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
18277 + Counters Result = ZeroCounts;
18279 + // For each register affected by this
18280 + // instruction increase the result sequence
18281 + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
18283 + MachineOperand &Op = MI.getOperand(i);
18284 + RegInterval Interval = getRegInterval(Op);
18285 + for (unsigned j = Interval.first; j < Interval.second; ++j) {
18288 + increaseCounters(Result, UsedRegs[j]);
18291 + increaseCounters(Result, DefinedRegs[j]);
18298 +bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
18300 + bool Changes = false;
18302 + MRI = &MF.getRegInfo();
18304 + WaitedOn = ZeroCounts;
18305 + LastIssued = ZeroCounts;
18307 + memset(&UsedRegs, 0, sizeof(UsedRegs));
18308 + memset(&DefinedRegs, 0, sizeof(DefinedRegs));
18310 + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
18311 + BI != BE; ++BI) {
18313 + MachineBasicBlock &MBB = *BI;
18314 + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
18317 + Changes |= insertWait(MBB, I, handleOperands(*I));
18318 + pushInstruction(*I);
18321 + // Wait for everything at the end of the MBB
18322 + Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
18327 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInstrFormats.td llvm-r600/lib/Target/R600/SIInstrFormats.td
18328 --- llvm-3.2.src/lib/Target/R600/SIInstrFormats.td 1970-01-01 01:00:00.000000000 +0100
18329 +++ llvm-r600/lib/Target/R600/SIInstrFormats.td 2013-01-25 19:43:57.473383054 +0100
18331 +//===-- SIInstrFormats.td - SI Instruction Formats ------------------------===//
18333 +// The LLVM Compiler Infrastructure
18335 +// This file is distributed under the University of Illinois Open Source
18336 +// License. See LICENSE.TXT for details.
18338 +//===----------------------------------------------------------------------===//
18340 +// SI Instruction format definitions.
18342 +// Instructions with _32 take 32-bit operands.
18343 +// Instructions with _64 take 64-bit operands.
18345 +// VOP_* instructions can use either a 32-bit or 64-bit encoding. The 32-bit
18346 +// encoding is the standard encoding, but instructions that make use of
18347 +// any of the instruction modifiers must use the 64-bit encoding.
18349 +// Instructions with _e32 use the 32-bit encoding.
18350 +// Instructions with _e64 use the 64-bit encoding.
18352 +//===----------------------------------------------------------------------===//
18354 +class VOP3b_2IN <bits<9> op, string opName, RegisterClass dstClass,
18355 + RegisterClass src0Class, RegisterClass src1Class,
18356 + list<dag> pattern>
18357 + : VOP3b <op, (outs dstClass:$vdst),
18358 + (ins src0Class:$src0, src1Class:$src1, InstFlag:$src2, InstFlag:$sdst,
18359 + InstFlag:$omod, InstFlag:$neg),
18364 +class VOP3_1_32 <bits<9> op, string opName, list<dag> pattern>
18365 + : VOP3b_2IN <op, opName, SReg_1, AllReg_32, VReg_32, pattern>;
18367 +class VOP3_32 <bits<9> op, string opName, list<dag> pattern>
18368 + : VOP3 <op, (outs VReg_32:$dst), (ins AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2, i32imm:$src3, i32imm:$src4, i32imm:$src5, i32imm:$src6), opName, pattern>;
18370 +class VOP3_64 <bits<9> op, string opName, list<dag> pattern>
18371 + : VOP3 <op, (outs VReg_64:$dst), (ins AllReg_64:$src0, VReg_64:$src1, VReg_64:$src2, i32imm:$src3, i32imm:$src4, i32imm:$src5, i32imm:$src6), opName, pattern>;
18374 +class SOP1_32 <bits<8> op, string opName, list<dag> pattern>
18375 + : SOP1 <op, (outs SReg_32:$dst), (ins SReg_32:$src0), opName, pattern>;
18377 +class SOP1_64 <bits<8> op, string opName, list<dag> pattern>
18378 + : SOP1 <op, (outs SReg_64:$dst), (ins SReg_64:$src0), opName, pattern>;
18380 +class SOP2_32 <bits<7> op, string opName, list<dag> pattern>
18381 + : SOP2 <op, (outs SReg_32:$dst), (ins SReg_32:$src0, SReg_32:$src1), opName, pattern>;
18383 +class SOP2_64 <bits<7> op, string opName, list<dag> pattern>
18384 + : SOP2 <op, (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), opName, pattern>;
18386 +class SOP2_VCC <bits<7> op, string opName, list<dag> pattern>
18387 + : SOP2 <op, (outs SReg_1:$vcc), (ins SReg_64:$src0, SReg_64:$src1), opName, pattern>;
18389 +class VOP1_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
18390 + string opName, list<dag> pattern> :
18392 + op, (outs vrc:$dst), (ins arc:$src0), opName, pattern
18395 +multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern> {
18396 + def _e32: VOP1_Helper <op, VReg_32, AllReg_32, opName, pattern>;
18397 + def _e64 : VOP3_32 <{1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
18402 +multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern> {
18404 + def _e32 : VOP1_Helper <op, VReg_64, AllReg_64, opName, pattern>;
18406 + def _e64 : VOP3_64 <
18407 + {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
18412 +class VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc,
18413 + string opName, list<dag> pattern> :
18415 + op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1), opName, pattern
18418 +multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern> {
18420 + def _e32 : VOP2_Helper <op, VReg_32, AllReg_32, opName, pattern>;
18422 + def _e64 : VOP3_32 <{1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
18427 +multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern> {
18428 + def _e32: VOP2_Helper <op, VReg_64, AllReg_64, opName, pattern>;
18430 + def _e64 : VOP3_64 <
18431 + {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
18436 +class SOPK_32 <bits<5> op, string opName, list<dag> pattern>
18437 + : SOPK <op, (outs SReg_32:$dst), (ins i16imm:$src0), opName, pattern>;
18439 +class SOPK_64 <bits<5> op, string opName, list<dag> pattern>
18440 + : SOPK <op, (outs SReg_64:$dst), (ins i16imm:$src0), opName, pattern>;
18442 +class VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
18443 + string opName, list<dag> pattern> :
18445 + op, (ins arc:$src0, vrc:$src1), opName, pattern
18448 +multiclass VOPC_32 <bits<9> op, string opName, list<dag> pattern> {
18450 + def _e32 : VOPC_Helper <
18451 + {op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
18452 + VReg_32, AllReg_32, opName, pattern
18455 + def _e64 : VOP3_1_32 <
18461 +multiclass VOPC_64 <bits<8> op, string opName, list<dag> pattern> {
18463 + def _e32 : VOPC_Helper <op, VReg_64, AllReg_64, opName, pattern>;
18465 + def _e64 : VOP3_64 <
18466 + {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
18471 +class SOPC_32 <bits<7> op, string opName, list<dag> pattern>
18472 + : SOPC <op, (outs SCCReg:$dst), (ins SReg_32:$src0, SReg_32:$src1), opName, pattern>;
18474 +class SOPC_64 <bits<7> op, string opName, list<dag> pattern>
18475 + : SOPC <op, (outs SCCReg:$dst), (ins SReg_64:$src0, SReg_64:$src1), opName, pattern>;
18477 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInstrInfo.cpp llvm-r600/lib/Target/R600/SIInstrInfo.cpp
18478 --- llvm-3.2.src/lib/Target/R600/SIInstrInfo.cpp 1970-01-01 01:00:00.000000000 +0100
18479 +++ llvm-r600/lib/Target/R600/SIInstrInfo.cpp 2013-01-25 19:43:57.473383054 +0100
18481 +//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
18483 +// The LLVM Compiler Infrastructure
18485 +// This file is distributed under the University of Illinois Open Source
18486 +// License. See LICENSE.TXT for details.
18488 +//===----------------------------------------------------------------------===//
18491 +/// \brief SI Implementation of TargetInstrInfo.
18493 +//===----------------------------------------------------------------------===//
18496 +#include "SIInstrInfo.h"
18497 +#include "AMDGPUTargetMachine.h"
18498 +#include "llvm/CodeGen/MachineInstrBuilder.h"
18499 +#include "llvm/CodeGen/MachineRegisterInfo.h"
18500 +#include "llvm/MC/MCInstrDesc.h"
18502 +#include <stdio.h>
18504 +using namespace llvm;
18506 +SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm)
18507 + : AMDGPUInstrInfo(tm),
18511 +const SIRegisterInfo &SIInstrInfo::getRegisterInfo() const {
18516 +SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
18517 + MachineBasicBlock::iterator MI, DebugLoc DL,
18518 + unsigned DestReg, unsigned SrcReg,
18519 + bool KillSrc) const {
18520 + // If we are trying to copy to or from SCC, there is a bug somewhere else in
18521 + // the backend. While it may be theoretically possible to do this, it should
18522 + // never be necessary.
18523 + assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);
18525 + if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
18526 + assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
18527 + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
18528 + .addReg(SrcReg, getKillRegState(KillSrc));
18529 + } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) {
18530 + assert(AMDGPU::VReg_32RegClass.contains(SrcReg) ||
18531 + AMDGPU::SReg_32RegClass.contains(SrcReg));
18532 + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
18533 + .addReg(SrcReg, getKillRegState(KillSrc));
18535 + assert(AMDGPU::SReg_32RegClass.contains(DestReg));
18536 + assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
18537 + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
18538 + .addReg(SrcReg, getKillRegState(KillSrc));
18542 +MachineInstr * SIInstrInfo::getMovImmInstr(MachineFunction *MF, unsigned DstReg,
18543 + int64_t Imm) const {
18544 + MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::V_MOV_IMM_I32), DebugLoc());
18545 + MachineInstrBuilder(MI).addReg(DstReg, RegState::Define);
18546 + MachineInstrBuilder(MI).addImm(Imm);
18552 +bool SIInstrInfo::isMov(unsigned Opcode) const {
18554 + default: return false;
18555 + case AMDGPU::S_MOV_B32:
18556 + case AMDGPU::S_MOV_B64:
18557 + case AMDGPU::V_MOV_B32_e32:
18558 + case AMDGPU::V_MOV_B32_e64:
18559 + case AMDGPU::V_MOV_IMM_F32:
18560 + case AMDGPU::V_MOV_IMM_I32:
18561 + case AMDGPU::S_MOV_IMM_I32:
18567 +SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
18568 + return RC != &AMDGPU::EXECRegRegClass;
18570 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInstrInfo.h llvm-r600/lib/Target/R600/SIInstrInfo.h
18571 --- llvm-3.2.src/lib/Target/R600/SIInstrInfo.h 1970-01-01 01:00:00.000000000 +0100
18572 +++ llvm-r600/lib/Target/R600/SIInstrInfo.h 2013-01-25 19:43:57.476716387 +0100
18574 +//===-- SIInstrInfo.h - SI Instruction Info Interface ---------------------===//
18576 +// The LLVM Compiler Infrastructure
18578 +// This file is distributed under the University of Illinois Open Source
18579 +// License. See LICENSE.TXT for details.
18581 +//===----------------------------------------------------------------------===//
18584 +/// \brief Interface definition for SIInstrInfo.
18586 +//===----------------------------------------------------------------------===//
18589 +#ifndef SIINSTRINFO_H
18590 +#define SIINSTRINFO_H
18592 +#include "AMDGPUInstrInfo.h"
18593 +#include "SIRegisterInfo.h"
18597 +class SIInstrInfo : public AMDGPUInstrInfo {
18599 + const SIRegisterInfo RI;
18602 + explicit SIInstrInfo(AMDGPUTargetMachine &tm);
18604 + const SIRegisterInfo &getRegisterInfo() const;
18606 + virtual void copyPhysReg(MachineBasicBlock &MBB,
18607 + MachineBasicBlock::iterator MI, DebugLoc DL,
18608 + unsigned DestReg, unsigned SrcReg,
18609 + bool KillSrc) const;
18611 + /// \returns the encoding type of this instruction.
18612 + unsigned getEncodingType(const MachineInstr &MI) const;
18614 + /// \returns the size of this instructions encoding in number of bytes.
18615 + unsigned getEncodingBytes(const MachineInstr &MI) const;
18617 + virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg,
18618 + int64_t Imm) const;
18620 + virtual unsigned getIEQOpcode() const { assert(!"Implement"); return 0;}
18621 + virtual bool isMov(unsigned Opcode) const;
18623 + virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
18626 +} // End namespace llvm
18628 +namespace SIInstrFlags {
18630 + // First 4 bits are the instruction encoding
18632 + EXP_CNT = 1 << 5,
18633 + LGKM_CNT = 1 << 6
18637 +#endif //SIINSTRINFO_H
18638 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInstrInfo.td llvm-r600/lib/Target/R600/SIInstrInfo.td
18639 --- llvm-3.2.src/lib/Target/R600/SIInstrInfo.td 1970-01-01 01:00:00.000000000 +0100
18640 +++ llvm-r600/lib/Target/R600/SIInstrInfo.td 2013-01-25 19:43:57.476716387 +0100
18642 +//===-- SIInstrInfo.td - SI Instruction Encodings ---------*- tablegen -*--===//
18644 +// The LLVM Compiler Infrastructure
18646 +// This file is distributed under the University of Illinois Open Source
18647 +// License. See LICENSE.TXT for details.
18649 +//===----------------------------------------------------------------------===//
18651 +//===----------------------------------------------------------------------===//
18652 +// SI DAG Profiles
18653 +//===----------------------------------------------------------------------===//
18654 +def SDTVCCBinaryOp : SDTypeProfile<1, 2, [
18655 + SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>
18658 +//===----------------------------------------------------------------------===//
18660 +//===----------------------------------------------------------------------===//
18662 +// and operation on 64-bit wide vcc
18663 +def SIsreg1_and : SDNode<"SIISD::VCC_AND", SDTVCCBinaryOp,
18664 + [SDNPCommutative, SDNPAssociative]
18667 +// Special bitcast node for sharing VCC register between VALU and SALU
18668 +def SIsreg1_bitcast : SDNode<"SIISD::VCC_BITCAST",
18669 + SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>
18672 +// and operation on 64-bit wide vcc
18673 +def SIvcc_and : SDNode<"SIISD::VCC_AND", SDTVCCBinaryOp,
18674 + [SDNPCommutative, SDNPAssociative]
18677 +// Special bitcast node for sharing VCC register between VALU and SALU
18678 +def SIvcc_bitcast : SDNode<"SIISD::VCC_BITCAST",
18679 + SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>
18682 +class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
18683 + AMDGPUInst<outs, ins, asm, pattern> {
18685 + field bits<4> EncodingType = 0;
18686 + field bits<1> VM_CNT = 0;
18687 + field bits<1> EXP_CNT = 0;
18688 + field bits<1> LGKM_CNT = 0;
18690 + let TSFlags{3-0} = EncodingType;
18691 + let TSFlags{4} = VM_CNT;
18692 + let TSFlags{5} = EXP_CNT;
18693 + let TSFlags{6} = LGKM_CNT;
18696 +class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> :
18697 + InstSI <outs, ins, asm, pattern> {
18699 + field bits<32> Inst;
18702 +class Enc64 <dag outs, dag ins, string asm, list<dag> pattern> :
18703 + InstSI <outs, ins, asm, pattern> {
18705 + field bits<64> Inst;
18708 +class SIOperand <ValueType vt, dag opInfo>: Operand <vt> {
18709 + let EncoderMethod = "encodeOperand";
18710 + let MIOperandInfo = opInfo;
18713 +def IMM16bit : ImmLeaf <
18715 + [{return isInt<16>(Imm);}]
18718 +def IMM8bit : ImmLeaf <
18720 + [{return (int32_t)Imm >= 0 && (int32_t)Imm <= 0xff;}]
18723 +def IMM12bit : ImmLeaf <
18725 + [{return (int16_t)Imm >= 0 && (int16_t)Imm <= 0xfff;}]
18728 +def IMM32bitIn64bit : ImmLeaf <
18730 + [{return isInt<32>(Imm);}]
18733 +class GPR4Align <RegisterClass rc> : Operand <vAny> {
18734 + let EncoderMethod = "GPR4AlignEncode";
18735 + let MIOperandInfo = (ops rc:$reg);
18738 +class GPR2Align <RegisterClass rc, ValueType vt> : Operand <vt> {
18739 + let EncoderMethod = "GPR2AlignEncode";
18740 + let MIOperandInfo = (ops rc:$reg);
18743 +def SMRDmemrr : Operand<iPTR> {
18744 + let MIOperandInfo = (ops SReg_64, SReg_32);
18745 + let EncoderMethod = "GPR2AlignEncode";
18748 +def SMRDmemri : Operand<iPTR> {
18749 + let MIOperandInfo = (ops SReg_64, i32imm);
18750 + let EncoderMethod = "SMRDmemriEncode";
18753 +def ADDR_Reg : ComplexPattern<i64, 2, "SelectADDRReg", [], []>;
18754 +def ADDR_Offset8 : ComplexPattern<i64, 2, "SelectADDR8BitOffset", [], []>;
18756 +let Uses = [EXEC] in {
18760 + (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
18761 + VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
18762 + "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
18775 + let Inst{3-0} = EN;
18776 + let Inst{9-4} = TGT;
18777 + let Inst{10} = COMPR;
18778 + let Inst{11} = DONE;
18779 + let Inst{12} = VM;
18780 + let Inst{31-26} = 0x3e;
18781 + let Inst{39-32} = VSRC0;
18782 + let Inst{47-40} = VSRC1;
18783 + let Inst{55-48} = VSRC2;
18784 + let Inst{63-56} = VSRC3;
18785 + let EncodingType = 0; //SIInstrEncodingType::EXP
18790 +class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
18791 + Enc64 <outs, ins, asm, pattern> {
18806 + let Inst{11-8} = DMASK;
18807 + let Inst{12} = UNORM;
18808 + let Inst{13} = GLC;
18809 + let Inst{14} = DA;
18810 + let Inst{15} = R128;
18811 + let Inst{16} = TFE;
18812 + let Inst{17} = LWE;
18813 + let Inst{24-18} = op;
18814 + let Inst{25} = SLC;
18815 + let Inst{31-26} = 0x3c;
18816 + let Inst{39-32} = VADDR;
18817 + let Inst{47-40} = VDATA;
18818 + let Inst{52-48} = SRSRC;
18819 + let Inst{57-53} = SSAMP;
18820 + let EncodingType = 2; //SIInstrEncodingType::MIMG
18826 +class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
18827 + Enc64<outs, ins, asm, pattern> {
18843 + let Inst{11-0} = OFFSET;
18844 + let Inst{12} = OFFEN;
18845 + let Inst{13} = IDXEN;
18846 + let Inst{14} = GLC;
18847 + let Inst{15} = ADDR64;
18848 + let Inst{18-16} = op;
18849 + let Inst{22-19} = DFMT;
18850 + let Inst{25-23} = NFMT;
18851 + let Inst{31-26} = 0x3a; //encoding
18852 + let Inst{39-32} = VADDR;
18853 + let Inst{47-40} = VDATA;
18854 + let Inst{52-48} = SRSRC;
18855 + let Inst{54} = SLC;
18856 + let Inst{55} = TFE;
18857 + let Inst{63-56} = SOFFSET;
18858 + let EncodingType = 3; //SIInstrEncodingType::MTBUF
18863 + let neverHasSideEffects = 1;
18866 +class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
18867 + Enc64<outs, ins, asm, pattern> {
18882 + let Inst{11-0} = OFFSET;
18883 + let Inst{12} = OFFEN;
18884 + let Inst{13} = IDXEN;
18885 + let Inst{14} = GLC;
18886 + let Inst{15} = ADDR64;
18887 + let Inst{16} = LDS;
18888 + let Inst{24-18} = op;
18889 + let Inst{31-26} = 0x38; //encoding
18890 + let Inst{39-32} = VADDR;
18891 + let Inst{47-40} = VDATA;
18892 + let Inst{52-48} = SRSRC;
18893 + let Inst{54} = SLC;
18894 + let Inst{55} = TFE;
18895 + let Inst{63-56} = SOFFSET;
18896 + let EncodingType = 4; //SIInstrEncodingType::MUBUF
18901 + let neverHasSideEffects = 1;
18904 +} // End Uses = [EXEC]
18906 +class SMRD <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
18907 + Enc32<outs, ins, asm, pattern> {
18911 + bits<8> OFFSET = PTR{7-0};
18912 + bits<1> IMM = PTR{8};
18913 + bits<6> SBASE = PTR{14-9};
18915 + let Inst{7-0} = OFFSET;
18916 + let Inst{8} = IMM;
18917 + let Inst{14-9} = SBASE;
18918 + let Inst{21-15} = SDST;
18919 + let Inst{26-22} = op;
18920 + let Inst{31-27} = 0x18; //encoding
18921 + let EncodingType = 5; //SIInstrEncodingType::SMRD
18923 + let LGKM_CNT = 1;
18926 +class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
18927 + Enc32<outs, ins, asm, pattern> {
18932 + let Inst{7-0} = SSRC0;
18933 + let Inst{15-8} = op;
18934 + let Inst{22-16} = SDST;
18935 +  let Inst{31-23} = 0x17d; // encoding
18936 + let EncodingType = 6; //SIInstrEncodingType::SOP1
18939 + let mayStore = 0;
18940 + let hasSideEffects = 0;
18943 +class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
18944 + Enc32 <outs, ins, asm, pattern> {
18950 + let Inst{7-0} = SSRC0;
18951 + let Inst{15-8} = SSRC1;
18952 + let Inst{22-16} = SDST;
18953 + let Inst{29-23} = op;
18954 + let Inst{31-30} = 0x2; // encoding
18955 + let EncodingType = 7; // SIInstrEncodingType::SOP2
18958 + let mayStore = 0;
18959 + let hasSideEffects = 0;
18962 +class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
18963 + Enc32<outs, ins, asm, pattern> {
18968 + let Inst{7-0} = SSRC0;
18969 + let Inst{15-8} = SSRC1;
18970 + let Inst{22-16} = op;
18971 + let Inst{31-23} = 0x17e;
18972 + let EncodingType = 8; // SIInstrEncodingType::SOPC
18974 + let DisableEncoding = "$dst";
18976 + let mayStore = 0;
18977 + let hasSideEffects = 0;
18980 +class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
18981 + Enc32 <outs, ins , asm, pattern> {
18984 + bits <16> SIMM16;
18986 + let Inst{15-0} = SIMM16;
18987 + let Inst{22-16} = SDST;
18988 + let Inst{27-23} = op;
18989 + let Inst{31-28} = 0xb; //encoding
18990 + let EncodingType = 9; // SIInstrEncodingType::SOPK
18993 + let mayStore = 0;
18994 + let hasSideEffects = 0;
18997 +class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : Enc32 <
19003 + bits <16> SIMM16;
19005 + let Inst{15-0} = SIMM16;
19006 + let Inst{22-16} = op;
19007 + let Inst{31-23} = 0x17f; // encoding
19008 + let EncodingType = 10; // SIInstrEncodingType::SOPP
19011 + let mayStore = 0;
19012 + let hasSideEffects = 0;
19015 +let Uses = [EXEC] in {
19017 +class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
19018 + Enc32 <outs, ins, asm, pattern> {
19022 + bits<2> ATTRCHAN;
19025 + let Inst{7-0} = VSRC;
19026 + let Inst{9-8} = ATTRCHAN;
19027 + let Inst{15-10} = ATTR;
19028 + let Inst{17-16} = op;
19029 + let Inst{25-18} = VDST;
19030 + let Inst{31-26} = 0x32; // encoding
19031 + let EncodingType = 11; // SIInstrEncodingType::VINTRP
19033 + let neverHasSideEffects = 1;
19035 + let mayStore = 0;
19038 +class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
19039 + Enc32 <outs, ins, asm, pattern> {
19044 + let Inst{8-0} = SRC0;
19045 + let Inst{16-9} = op;
19046 + let Inst{24-17} = VDST;
19047 + let Inst{31-25} = 0x3f; //encoding
19049 + let EncodingType = 12; // SIInstrEncodingType::VOP1
19050 + let PostEncoderMethod = "VOPPostEncode";
19053 + let mayStore = 0;
19054 + let hasSideEffects = 0;
19057 +class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
19058 + Enc32 <outs, ins, asm, pattern> {
19064 + let Inst{8-0} = SRC0;
19065 + let Inst{16-9} = VSRC1;
19066 + let Inst{24-17} = VDST;
19067 + let Inst{30-25} = op;
19068 + let Inst{31} = 0x0; //encoding
19070 + let EncodingType = 13; // SIInstrEncodingType::VOP2
19071 + let PostEncoderMethod = "VOPPostEncode";
19074 + let mayStore = 0;
19075 + let hasSideEffects = 0;
19078 +class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
19079 + Enc64 <outs, ins, asm, pattern> {
19090 + let Inst{7-0} = VDST;
19091 + let Inst{10-8} = ABS;
19092 + let Inst{11} = CLAMP;
19093 + let Inst{25-17} = op;
19094 + let Inst{31-26} = 0x34; //encoding
19095 + let Inst{40-32} = SRC0;
19096 + let Inst{49-41} = SRC1;
19097 + let Inst{58-50} = SRC2;
19098 + let Inst{60-59} = OMOD;
19099 + let Inst{63-61} = NEG;
19101 + let EncodingType = 14; // SIInstrEncodingType::VOP3
19102 + let PostEncoderMethod = "VOPPostEncode";
19105 + let mayStore = 0;
19106 + let hasSideEffects = 0;
19109 +class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
19110 + Enc64 <outs, ins, asm, pattern> {
19120 + let Inst{7-0} = VDST;
19121 + let Inst{14-8} = SDST;
19122 + let Inst{25-17} = op;
19123 + let Inst{31-26} = 0x34; //encoding
19124 + let Inst{40-32} = SRC0;
19125 + let Inst{49-41} = SRC1;
19126 + let Inst{58-50} = SRC2;
19127 + let Inst{60-59} = OMOD;
19128 + let Inst{63-61} = NEG;
19130 + let EncodingType = 14; // SIInstrEncodingType::VOP3
19131 + let PostEncoderMethod = "VOPPostEncode";
19134 + let mayStore = 0;
19135 + let hasSideEffects = 0;
19138 +class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
19139 + Enc32 <(outs VCCReg:$dst), ins, asm, pattern> {
19144 + let Inst{8-0} = SRC0;
19145 + let Inst{16-9} = VSRC1;
19146 + let Inst{24-17} = op;
19147 + let Inst{31-25} = 0x3e;
19149 + let EncodingType = 15; //SIInstrEncodingType::VOPC
19150 + let PostEncoderMethod = "VOPPostEncode";
19151 + let DisableEncoding = "$dst";
19153 + let mayStore = 0;
19154 + let hasSideEffects = 0;
19157 +} // End Uses = [EXEC]
19159 +class MIMG_Load_Helper <bits<7> op, string asm> : MIMG <
19161 + (outs VReg_128:$vdata),
19162 + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
19163 + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_128:$vaddr,
19164 + GPR4Align<SReg_256>:$srsrc, GPR4Align<SReg_128>:$ssamp),
19168 + let mayStore = 0;
19171 +class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF <
19173 + (outs regClass:$dst),
19174 + (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
19175 + i1imm:$lds, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc, i1imm:$slc,
19176 + i1imm:$tfe, SReg_32:$soffset),
19180 + let mayStore = 0;
19183 +class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
19185 + (outs regClass:$dst),
19186 + (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
19187 + i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc,
19188 + i1imm:$slc, i1imm:$tfe, SReg_32:$soffset),
19192 + let mayStore = 0;
19195 +class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
19198 + (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
19199 + i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr,
19200 + GPR4Align<SReg_128>:$srsrc, i1imm:$slc, i1imm:$tfe, SReg_32:$soffset),
19203 + let mayStore = 1;
19207 +multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass dstClass,
19209 + def _IMM : SMRD <
19211 + (outs dstClass:$dst),
19212 + (ins SMRDmemri:$src0),
19214 + [(set (vt dstClass:$dst), (constant_load ADDR_Offset8:$src0))]
19217 + def _SGPR : SMRD <
19219 + (outs dstClass:$dst),
19220 + (ins SMRDmemrr:$src0),
19222 + [(set (vt dstClass:$dst), (constant_load ADDR_Reg:$src0))]
19226 +multiclass SMRD_32 <bits<5> op, string asm, RegisterClass dstClass> {
19227 + defm _F32 : SMRD_Helper <op, asm, dstClass, f32>;
19228 + defm _I32 : SMRD_Helper <op, asm, dstClass, i32>;
19231 +include "SIInstrFormats.td"
19232 +include "SIInstructions.td"
19233 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIInstructions.td llvm-r600/lib/Target/R600/SIInstructions.td
19234 --- llvm-3.2.src/lib/Target/R600/SIInstructions.td 1970-01-01 01:00:00.000000000 +0100
19235 +++ llvm-r600/lib/Target/R600/SIInstructions.td 2013-01-25 19:43:57.480049720 +0100
19237 +//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
19239 +// The LLVM Compiler Infrastructure
19241 +// This file is distributed under the University of Illinois Open Source
19242 +// License. See LICENSE.TXT for details.
19244 +//===----------------------------------------------------------------------===//
19245 +// This file was originally auto-generated from a GPU register header file and
19246 +// all the instruction definitions were originally commented out. Instructions
19247 +// that are not yet supported remain commented out.
19248 +//===----------------------------------------------------------------------===//
19250 +def isSI : Predicate<"Subtarget.device()"
19251 + "->getGeneration() == AMDGPUDeviceInfo::HD7XXX">;
19253 +let Predicates = [isSI] in {
19255 +let neverHasSideEffects = 1 in {
19256 +def S_MOV_B32 : SOP1_32 <0x00000003, "S_MOV_B32", []>;
19257 +def S_MOV_B64 : SOP1_64 <0x00000004, "S_MOV_B64", []>;
19258 +def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>;
19259 +def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>;
19260 +def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", []>;
19261 +def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>;
19262 +def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>;
19263 +def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>;
19264 +def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", []>;
19265 +def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>;
19266 +} // End neverHasSideEffects = 1
19267 +////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>;
19268 +////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>;
19269 +////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>;
19270 +////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>;
19271 +////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>;
19272 +////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>;
19273 +////def S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>;
19274 +////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>;
19275 +//def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", []>;
19276 +//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>;
19277 +def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>;
19278 +//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>;
19279 +//def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", []>;
19280 +//def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", []>;
19281 +////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>;
19282 +////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>;
19283 +////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>;
19284 +////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>;
19285 +def S_GETPC_B64 : SOP1_64 <0x0000001f, "S_GETPC_B64", []>;
19286 +def S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>;
19287 +def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>;
19288 +def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>;
19290 +let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC] in {
19292 +def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "S_AND_SAVEEXEC_B64", []>;
19293 +def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "S_OR_SAVEEXEC_B64", []>;
19294 +def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "S_XOR_SAVEEXEC_B64", []>;
19295 +def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "S_ANDN2_SAVEEXEC_B64", []>;
19296 +def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "S_ORN2_SAVEEXEC_B64", []>;
19297 +def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "S_NAND_SAVEEXEC_B64", []>;
19298 +def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "S_NOR_SAVEEXEC_B64", []>;
19299 +def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "S_XNOR_SAVEEXEC_B64", []>;
19301 +} // End hasSideEffects = 1
19303 +def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "S_QUADMASK_B32", []>;
19304 +def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "S_QUADMASK_B64", []>;
19305 +def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "S_MOVRELS_B32", []>;
19306 +def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "S_MOVRELS_B64", []>;
19307 +def S_MOVRELD_B32 : SOP1_32 <0x00000030, "S_MOVRELD_B32", []>;
19308 +def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>;
19309 +//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "S_CBRANCH_JOIN", []>;
19310 +def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>;
19311 +def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>;
19312 +def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>;
19313 +def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>;
19314 +def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>;
19317 +This instruction is disabled for now until we can figure out how to teach
19318 +the instruction selector to correctly use the S_CMP* vs V_CMP*
19321 +When this instruction is enabled the code generator sometimes produces this
19324 +SCC = S_CMPK_EQ_I32 SGPR0, imm
19326 +VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1
19328 +def S_CMPK_EQ_I32 : SOPK <
19329 + 0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1),
19331 + [(set SCCReg:$dst, (setcc SReg_32:$src0, imm:$src1, SETEQ))]
19335 +def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>;
19336 +def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>;
19337 +def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>;
19338 +def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "S_CMPK_LT_I32", []>;
19339 +def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "S_CMPK_LE_I32", []>;
19340 +def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "S_CMPK_EQ_U32", []>;
19341 +def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "S_CMPK_LG_U32", []>;
19342 +def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>;
19343 +def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>;
19344 +def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>;
19345 +def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>;
19346 +def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
19347 +def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>;
19348 +//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>;
19349 +def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>;
19350 +def S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>;
19351 +def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>;
19352 +//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>;
19353 +//def EXP : EXP_ <0x00000000, "EXP", []>;
19355 +defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32", []>;
19356 +defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", []>;
19358 + (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_LT)),
19359 + (V_CMP_LT_F32_e64 AllReg_32:$src0, VReg_32:$src1)
19361 +defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", []>;
19363 + (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_EQ)),
19364 + (V_CMP_EQ_F32_e64 AllReg_32:$src0, VReg_32:$src1)
19366 +defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", []>;
19368 + (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_LE)),
19369 + (V_CMP_LE_F32_e64 AllReg_32:$src0, VReg_32:$src1)
19371 +defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", []>;
19373 + (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_GT)),
19374 + (V_CMP_GT_F32_e64 AllReg_32:$src0, VReg_32:$src1)
19376 +defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32", []>;
19378 + (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_NE)),
19379 + (V_CMP_LG_F32_e64 AllReg_32:$src0, VReg_32:$src1)
19381 +defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", []>;
19383 + (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_GE)),
19384 + (V_CMP_GE_F32_e64 AllReg_32:$src0, VReg_32:$src1)
19386 +defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", []>;
19387 +defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", []>;
19388 +defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32", []>;
19389 +defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32", []>;
19390 +defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32", []>;
19391 +defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32", []>;
19392 +defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", []>;
19394 + (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_NE)),
19395 + (V_CMP_NEQ_F32_e64 AllReg_32:$src0, VReg_32:$src1)
19397 +defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32", []>;
19398 +defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32", []>;
19400 +//Side effect is writing to EXEC
19401 +let hasSideEffects = 1 in {
19403 +defm V_CMPX_F_F32 : VOPC_32 <0x00000010, "V_CMPX_F_F32", []>;
19404 +defm V_CMPX_LT_F32 : VOPC_32 <0x00000011, "V_CMPX_LT_F32", []>;
19405 +defm V_CMPX_EQ_F32 : VOPC_32 <0x00000012, "V_CMPX_EQ_F32", []>;
19406 +defm V_CMPX_LE_F32 : VOPC_32 <0x00000013, "V_CMPX_LE_F32", []>;
19407 +defm V_CMPX_GT_F32 : VOPC_32 <0x00000014, "V_CMPX_GT_F32", []>;
19408 +defm V_CMPX_LG_F32 : VOPC_32 <0x00000015, "V_CMPX_LG_F32", []>;
19409 +defm V_CMPX_GE_F32 : VOPC_32 <0x00000016, "V_CMPX_GE_F32", []>;
19410 +defm V_CMPX_O_F32 : VOPC_32 <0x00000017, "V_CMPX_O_F32", []>;
19411 +defm V_CMPX_U_F32 : VOPC_32 <0x00000018, "V_CMPX_U_F32", []>;
19412 +defm V_CMPX_NGE_F32 : VOPC_32 <0x00000019, "V_CMPX_NGE_F32", []>;
19413 +defm V_CMPX_NLG_F32 : VOPC_32 <0x0000001a, "V_CMPX_NLG_F32", []>;
19414 +defm V_CMPX_NGT_F32 : VOPC_32 <0x0000001b, "V_CMPX_NGT_F32", []>;
19415 +defm V_CMPX_NLE_F32 : VOPC_32 <0x0000001c, "V_CMPX_NLE_F32", []>;
19416 +defm V_CMPX_NEQ_F32 : VOPC_32 <0x0000001d, "V_CMPX_NEQ_F32", []>;
19417 +defm V_CMPX_NLT_F32 : VOPC_32 <0x0000001e, "V_CMPX_NLT_F32", []>;
19418 +defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32", []>;
19420 +} // End hasSideEffects = 1
19422 +defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64", []>;
19423 +defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", []>;
19424 +defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", []>;
19425 +defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", []>;
19426 +defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", []>;
19427 +defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64", []>;
19428 +defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", []>;
19429 +defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64", []>;
19430 +defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64", []>;
19431 +defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64", []>;
19432 +defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64", []>;
19433 +defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64", []>;
19434 +defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64", []>;
19435 +defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", []>;
19436 +defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64", []>;
19437 +defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64", []>;
19439 +//Side effect is writing to EXEC
19440 +let hasSideEffects = 1 in {
19442 +defm V_CMPX_F_F64 : VOPC_64 <0x00000030, "V_CMPX_F_F64", []>;
19443 +defm V_CMPX_LT_F64 : VOPC_64 <0x00000031, "V_CMPX_LT_F64", []>;
19444 +defm V_CMPX_EQ_F64 : VOPC_64 <0x00000032, "V_CMPX_EQ_F64", []>;
19445 +defm V_CMPX_LE_F64 : VOPC_64 <0x00000033, "V_CMPX_LE_F64", []>;
19446 +defm V_CMPX_GT_F64 : VOPC_64 <0x00000034, "V_CMPX_GT_F64", []>;
19447 +defm V_CMPX_LG_F64 : VOPC_64 <0x00000035, "V_CMPX_LG_F64", []>;
19448 +defm V_CMPX_GE_F64 : VOPC_64 <0x00000036, "V_CMPX_GE_F64", []>;
19449 +defm V_CMPX_O_F64 : VOPC_64 <0x00000037, "V_CMPX_O_F64", []>;
19450 +defm V_CMPX_U_F64 : VOPC_64 <0x00000038, "V_CMPX_U_F64", []>;
19451 +defm V_CMPX_NGE_F64 : VOPC_64 <0x00000039, "V_CMPX_NGE_F64", []>;
19452 +defm V_CMPX_NLG_F64 : VOPC_64 <0x0000003a, "V_CMPX_NLG_F64", []>;
19453 +defm V_CMPX_NGT_F64 : VOPC_64 <0x0000003b, "V_CMPX_NGT_F64", []>;
19454 +defm V_CMPX_NLE_F64 : VOPC_64 <0x0000003c, "V_CMPX_NLE_F64", []>;
19455 +defm V_CMPX_NEQ_F64 : VOPC_64 <0x0000003d, "V_CMPX_NEQ_F64", []>;
19456 +defm V_CMPX_NLT_F64 : VOPC_64 <0x0000003e, "V_CMPX_NLT_F64", []>;
19457 +defm V_CMPX_TRU_F64 : VOPC_64 <0x0000003f, "V_CMPX_TRU_F64", []>;
19459 +} // End hasSideEffects = 1
19461 +defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32", []>;
19462 +defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32", []>;
19463 +defm V_CMPS_EQ_F32 : VOPC_32 <0x00000042, "V_CMPS_EQ_F32", []>;
19464 +defm V_CMPS_LE_F32 : VOPC_32 <0x00000043, "V_CMPS_LE_F32", []>;
19465 +defm V_CMPS_GT_F32 : VOPC_32 <0x00000044, "V_CMPS_GT_F32", []>;
19466 +defm V_CMPS_LG_F32 : VOPC_32 <0x00000045, "V_CMPS_LG_F32", []>;
19467 +defm V_CMPS_GE_F32 : VOPC_32 <0x00000046, "V_CMPS_GE_F32", []>;
19468 +defm V_CMPS_O_F32 : VOPC_32 <0x00000047, "V_CMPS_O_F32", []>;
19469 +defm V_CMPS_U_F32 : VOPC_32 <0x00000048, "V_CMPS_U_F32", []>;
19470 +defm V_CMPS_NGE_F32 : VOPC_32 <0x00000049, "V_CMPS_NGE_F32", []>;
19471 +defm V_CMPS_NLG_F32 : VOPC_32 <0x0000004a, "V_CMPS_NLG_F32", []>;
19472 +defm V_CMPS_NGT_F32 : VOPC_32 <0x0000004b, "V_CMPS_NGT_F32", []>;
19473 +defm V_CMPS_NLE_F32 : VOPC_32 <0x0000004c, "V_CMPS_NLE_F32", []>;
19474 +defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32", []>;
19475 +defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32", []>;
19476 +defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32", []>;
19477 +defm V_CMPSX_F_F32 : VOPC_32 <0x00000050, "V_CMPSX_F_F32", []>;
19478 +defm V_CMPSX_LT_F32 : VOPC_32 <0x00000051, "V_CMPSX_LT_F32", []>;
19479 +defm V_CMPSX_EQ_F32 : VOPC_32 <0x00000052, "V_CMPSX_EQ_F32", []>;
19480 +defm V_CMPSX_LE_F32 : VOPC_32 <0x00000053, "V_CMPSX_LE_F32", []>;
19481 +defm V_CMPSX_GT_F32 : VOPC_32 <0x00000054, "V_CMPSX_GT_F32", []>;
19482 +defm V_CMPSX_LG_F32 : VOPC_32 <0x00000055, "V_CMPSX_LG_F32", []>;
19483 +defm V_CMPSX_GE_F32 : VOPC_32 <0x00000056, "V_CMPSX_GE_F32", []>;
19484 +defm V_CMPSX_O_F32 : VOPC_32 <0x00000057, "V_CMPSX_O_F32", []>;
19485 +defm V_CMPSX_U_F32 : VOPC_32 <0x00000058, "V_CMPSX_U_F32", []>;
19486 +defm V_CMPSX_NGE_F32 : VOPC_32 <0x00000059, "V_CMPSX_NGE_F32", []>;
19487 +defm V_CMPSX_NLG_F32 : VOPC_32 <0x0000005a, "V_CMPSX_NLG_F32", []>;
19488 +defm V_CMPSX_NGT_F32 : VOPC_32 <0x0000005b, "V_CMPSX_NGT_F32", []>;
19489 +defm V_CMPSX_NLE_F32 : VOPC_32 <0x0000005c, "V_CMPSX_NLE_F32", []>;
19490 +defm V_CMPSX_NEQ_F32 : VOPC_32 <0x0000005d, "V_CMPSX_NEQ_F32", []>;
19491 +defm V_CMPSX_NLT_F32 : VOPC_32 <0x0000005e, "V_CMPSX_NLT_F32", []>;
19492 +defm V_CMPSX_TRU_F32 : VOPC_32 <0x0000005f, "V_CMPSX_TRU_F32", []>;
19493 +defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64", []>;
19494 +defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64", []>;
19495 +defm V_CMPS_EQ_F64 : VOPC_64 <0x00000062, "V_CMPS_EQ_F64", []>;
19496 +defm V_CMPS_LE_F64 : VOPC_64 <0x00000063, "V_CMPS_LE_F64", []>;
19497 +defm V_CMPS_GT_F64 : VOPC_64 <0x00000064, "V_CMPS_GT_F64", []>;
19498 +defm V_CMPS_LG_F64 : VOPC_64 <0x00000065, "V_CMPS_LG_F64", []>;
19499 +defm V_CMPS_GE_F64 : VOPC_64 <0x00000066, "V_CMPS_GE_F64", []>;
19500 +defm V_CMPS_O_F64 : VOPC_64 <0x00000067, "V_CMPS_O_F64", []>;
19501 +defm V_CMPS_U_F64 : VOPC_64 <0x00000068, "V_CMPS_U_F64", []>;
19502 +defm V_CMPS_NGE_F64 : VOPC_64 <0x00000069, "V_CMPS_NGE_F64", []>;
19503 +defm V_CMPS_NLG_F64 : VOPC_64 <0x0000006a, "V_CMPS_NLG_F64", []>;
19504 +defm V_CMPS_NGT_F64 : VOPC_64 <0x0000006b, "V_CMPS_NGT_F64", []>;
19505 +defm V_CMPS_NLE_F64 : VOPC_64 <0x0000006c, "V_CMPS_NLE_F64", []>;
19506 +defm V_CMPS_NEQ_F64 : VOPC_64 <0x0000006d, "V_CMPS_NEQ_F64", []>;
19507 +defm V_CMPS_NLT_F64 : VOPC_64 <0x0000006e, "V_CMPS_NLT_F64", []>;
19508 +defm V_CMPS_TRU_F64 : VOPC_64 <0x0000006f, "V_CMPS_TRU_F64", []>;
19509 +defm V_CMPSX_F_F64 : VOPC_64 <0x00000070, "V_CMPSX_F_F64", []>;
19510 +defm V_CMPSX_LT_F64 : VOPC_64 <0x00000071, "V_CMPSX_LT_F64", []>;
19511 +defm V_CMPSX_EQ_F64 : VOPC_64 <0x00000072, "V_CMPSX_EQ_F64", []>;
19512 +defm V_CMPSX_LE_F64 : VOPC_64 <0x00000073, "V_CMPSX_LE_F64", []>;
19513 +defm V_CMPSX_GT_F64 : VOPC_64 <0x00000074, "V_CMPSX_GT_F64", []>;
19514 +defm V_CMPSX_LG_F64 : VOPC_64 <0x00000075, "V_CMPSX_LG_F64", []>;
19515 +defm V_CMPSX_GE_F64 : VOPC_64 <0x00000076, "V_CMPSX_GE_F64", []>;
19516 +defm V_CMPSX_O_F64 : VOPC_64 <0x00000077, "V_CMPSX_O_F64", []>;
19517 +defm V_CMPSX_U_F64 : VOPC_64 <0x00000078, "V_CMPSX_U_F64", []>;
19518 +defm V_CMPSX_NGE_F64 : VOPC_64 <0x00000079, "V_CMPSX_NGE_F64", []>;
19519 +defm V_CMPSX_NLG_F64 : VOPC_64 <0x0000007a, "V_CMPSX_NLG_F64", []>;
19520 +defm V_CMPSX_NGT_F64 : VOPC_64 <0x0000007b, "V_CMPSX_NGT_F64", []>;
19521 +defm V_CMPSX_NLE_F64 : VOPC_64 <0x0000007c, "V_CMPSX_NLE_F64", []>;
19522 +defm V_CMPSX_NEQ_F64 : VOPC_64 <0x0000007d, "V_CMPSX_NEQ_F64", []>;
19523 +defm V_CMPSX_NLT_F64 : VOPC_64 <0x0000007e, "V_CMPSX_NLT_F64", []>;
19524 +defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64", []>;
19525 +defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32", []>;
19526 +defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", []>;
19528 + (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_LT)),
19529 + (V_CMP_LT_I32_e64 AllReg_32:$src0, VReg_32:$src1)
19531 +defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", []>;
19533 + (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_EQ)),
19534 + (V_CMP_EQ_I32_e64 AllReg_32:$src0, VReg_32:$src1)
19536 +defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", []>;
19538 + (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_LE)),
19539 + (V_CMP_LE_I32_e64 AllReg_32:$src0, VReg_32:$src1)
19541 +defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", []>;
19543 + (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_GT)),
19544 + (V_CMP_GT_I32_e64 AllReg_32:$src0, VReg_32:$src1)
19546 +defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", []>;
19548 + (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_NE)),
19549 + (V_CMP_NE_I32_e64 AllReg_32:$src0, VReg_32:$src1)
19551 +defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", []>;
19553 + (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_GE)),
19554 + (V_CMP_GE_I32_e64 AllReg_32:$src0, VReg_32:$src1)
19556 +defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32", []>;
19558 +let hasSideEffects = 1 in {
19560 +defm V_CMPX_F_I32 : VOPC_32 <0x00000090, "V_CMPX_F_I32", []>;
19561 +defm V_CMPX_LT_I32 : VOPC_32 <0x00000091, "V_CMPX_LT_I32", []>;
19562 +defm V_CMPX_EQ_I32 : VOPC_32 <0x00000092, "V_CMPX_EQ_I32", []>;
19563 +defm V_CMPX_LE_I32 : VOPC_32 <0x00000093, "V_CMPX_LE_I32", []>;
19564 +defm V_CMPX_GT_I32 : VOPC_32 <0x00000094, "V_CMPX_GT_I32", []>;
19565 +defm V_CMPX_NE_I32 : VOPC_32 <0x00000095, "V_CMPX_NE_I32", []>;
19566 +defm V_CMPX_GE_I32 : VOPC_32 <0x00000096, "V_CMPX_GE_I32", []>;
19567 +defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32", []>;
19569 +} // End hasSideEffects
19571 +defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64", []>;
19572 +defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", []>;
19573 +defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64", []>;
19574 +defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64", []>;
19575 +defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64", []>;
19576 +defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", []>;
19577 +defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", []>;
19578 +defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64", []>;
19580 +let hasSideEffects = 1 in {
19582 +defm V_CMPX_F_I64 : VOPC_64 <0x000000b0, "V_CMPX_F_I64", []>;
19583 +defm V_CMPX_LT_I64 : VOPC_64 <0x000000b1, "V_CMPX_LT_I64", []>;
19584 +defm V_CMPX_EQ_I64 : VOPC_64 <0x000000b2, "V_CMPX_EQ_I64", []>;
19585 +defm V_CMPX_LE_I64 : VOPC_64 <0x000000b3, "V_CMPX_LE_I64", []>;
19586 +defm V_CMPX_GT_I64 : VOPC_64 <0x000000b4, "V_CMPX_GT_I64", []>;
19587 +defm V_CMPX_NE_I64 : VOPC_64 <0x000000b5, "V_CMPX_NE_I64", []>;
19588 +defm V_CMPX_GE_I64 : VOPC_64 <0x000000b6, "V_CMPX_GE_I64", []>;
19589 +defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64", []>;
19591 +} // End hasSideEffects
19593 +defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32", []>;
19594 +defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", []>;
19595 +defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32", []>;
19596 +defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32", []>;
19597 +defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32", []>;
19598 +defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", []>;
19599 +defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", []>;
19600 +defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32", []>;
19602 +let hasSideEffects = 1 in {
19604 +defm V_CMPX_F_U32 : VOPC_32 <0x000000d0, "V_CMPX_F_U32", []>;
19605 +defm V_CMPX_LT_U32 : VOPC_32 <0x000000d1, "V_CMPX_LT_U32", []>;
19606 +defm V_CMPX_EQ_U32 : VOPC_32 <0x000000d2, "V_CMPX_EQ_U32", []>;
19607 +defm V_CMPX_LE_U32 : VOPC_32 <0x000000d3, "V_CMPX_LE_U32", []>;
19608 +defm V_CMPX_GT_U32 : VOPC_32 <0x000000d4, "V_CMPX_GT_U32", []>;
19609 +defm V_CMPX_NE_U32 : VOPC_32 <0x000000d5, "V_CMPX_NE_U32", []>;
19610 +defm V_CMPX_GE_U32 : VOPC_32 <0x000000d6, "V_CMPX_GE_U32", []>;
19611 +defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32", []>;
19613 +} // End hasSideEffects
19615 +defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64", []>;
19616 +defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", []>;
19617 +defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64", []>;
19618 +defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64", []>;
19619 +defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64", []>;
19620 +defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", []>;
19621 +defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", []>;
19622 +defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64", []>;
19623 +defm V_CMPX_F_U64 : VOPC_64 <0x000000f0, "V_CMPX_F_U64", []>;
19624 +defm V_CMPX_LT_U64 : VOPC_64 <0x000000f1, "V_CMPX_LT_U64", []>;
19625 +defm V_CMPX_EQ_U64 : VOPC_64 <0x000000f2, "V_CMPX_EQ_U64", []>;
19626 +defm V_CMPX_LE_U64 : VOPC_64 <0x000000f3, "V_CMPX_LE_U64", []>;
19627 +defm V_CMPX_GT_U64 : VOPC_64 <0x000000f4, "V_CMPX_GT_U64", []>;
19628 +defm V_CMPX_NE_U64 : VOPC_64 <0x000000f5, "V_CMPX_NE_U64", []>;
19629 +defm V_CMPX_GE_U64 : VOPC_64 <0x000000f6, "V_CMPX_GE_U64", []>;
19630 +defm V_CMPX_T_U64 : VOPC_64 <0x000000f7, "V_CMPX_T_U64", []>;
19631 +defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32", []>;
19632 +defm V_CMPX_CLASS_F32 : VOPC_32 <0x00000098, "V_CMPX_CLASS_F32", []>;
19633 +defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64", []>;
19634 +defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64", []>;
19635 +//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
19636 +//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
19637 +//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>;
19638 +def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
19639 +//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>;
19640 +//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
19641 +//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>;
19642 +//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>;
19643 +//def BUFFER_LOAD_UBYTE : MUBUF_ <0x00000008, "BUFFER_LOAD_UBYTE", []>;
19644 +//def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>;
19645 +//def BUFFER_LOAD_USHORT : MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>;
19646 +//def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>;
19647 +//def BUFFER_LOAD_DWORD : MUBUF_ <0x0000000c, "BUFFER_LOAD_DWORD", []>;
19648 +//def BUFFER_LOAD_DWORDX2 : MUBUF_DWORDX2 <0x0000000d, "BUFFER_LOAD_DWORDX2", []>;
19649 +//def BUFFER_LOAD_DWORDX4 : MUBUF_DWORDX4 <0x0000000e, "BUFFER_LOAD_DWORDX4", []>;
19650 +//def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>;
19651 +//def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>;
19652 +//def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>;
19653 +//def BUFFER_STORE_DWORDX2 : MUBUF_DWORDX2 <0x0000001d, "BUFFER_STORE_DWORDX2", []>;
19654 +//def BUFFER_STORE_DWORDX4 : MUBUF_DWORDX4 <0x0000001e, "BUFFER_STORE_DWORDX4", []>;
19655 +//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
19656 +//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
19657 +//def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>;
19658 +//def BUFFER_ATOMIC_SUB : MUBUF_ <0x00000033, "BUFFER_ATOMIC_SUB", []>;
19659 +//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "BUFFER_ATOMIC_RSUB", []>;
19660 +//def BUFFER_ATOMIC_SMIN : MUBUF_ <0x00000035, "BUFFER_ATOMIC_SMIN", []>;
19661 +//def BUFFER_ATOMIC_UMIN : MUBUF_ <0x00000036, "BUFFER_ATOMIC_UMIN", []>;
19662 +//def BUFFER_ATOMIC_SMAX : MUBUF_ <0x00000037, "BUFFER_ATOMIC_SMAX", []>;
19663 +//def BUFFER_ATOMIC_UMAX : MUBUF_ <0x00000038, "BUFFER_ATOMIC_UMAX", []>;
19664 +//def BUFFER_ATOMIC_AND : MUBUF_ <0x00000039, "BUFFER_ATOMIC_AND", []>;
19665 +//def BUFFER_ATOMIC_OR : MUBUF_ <0x0000003a, "BUFFER_ATOMIC_OR", []>;
19666 +//def BUFFER_ATOMIC_XOR : MUBUF_ <0x0000003b, "BUFFER_ATOMIC_XOR", []>;
19667 +//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "BUFFER_ATOMIC_INC", []>;
19668 +//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "BUFFER_ATOMIC_DEC", []>;
19669 +//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "BUFFER_ATOMIC_FCMPSWAP", []>;
19670 +//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "BUFFER_ATOMIC_FMIN", []>;
19671 +//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "BUFFER_ATOMIC_FMAX", []>;
19672 +//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "BUFFER_ATOMIC_SWAP_X2", []>;
19673 +//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "BUFFER_ATOMIC_CMPSWAP_X2", []>;
19674 +//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "BUFFER_ATOMIC_ADD_X2", []>;
19675 +//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "BUFFER_ATOMIC_SUB_X2", []>;
19676 +//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "BUFFER_ATOMIC_RSUB_X2", []>;
19677 +//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "BUFFER_ATOMIC_SMIN_X2", []>;
19678 +//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "BUFFER_ATOMIC_UMIN_X2", []>;
19679 +//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "BUFFER_ATOMIC_SMAX_X2", []>;
19680 +//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "BUFFER_ATOMIC_UMAX_X2", []>;
19681 +//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "BUFFER_ATOMIC_AND_X2", []>;
19682 +//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "BUFFER_ATOMIC_OR_X2", []>;
19683 +//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "BUFFER_ATOMIC_XOR_X2", []>;
19684 +//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "BUFFER_ATOMIC_INC_X2", []>;
19685 +//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "BUFFER_ATOMIC_DEC_X2", []>;
19686 +//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "BUFFER_ATOMIC_FCMPSWAP_X2", []>;
19687 +//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "BUFFER_ATOMIC_FMIN_X2", []>;
19688 +//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>;
19689 +//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>;
19690 +//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>;
19691 +//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>;
19692 +//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>;
19693 +//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>;
19694 +def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", VReg_128>;
19695 +//def TBUFFER_STORE_FORMAT_X : MTBUF_ <0x00000004, "TBUFFER_STORE_FORMAT_X", []>;
19696 +//def TBUFFER_STORE_FORMAT_XY : MTBUF_ <0x00000005, "TBUFFER_STORE_FORMAT_XY", []>;
19697 +//def TBUFFER_STORE_FORMAT_XYZ : MTBUF_ <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", []>;
19698 +//def TBUFFER_STORE_FORMAT_XYZW : MTBUF_ <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", []>;
19700 +defm S_LOAD_DWORD : SMRD_32 <0x00000000, "S_LOAD_DWORD", SReg_32>;
19702 +//def S_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000001, "S_LOAD_DWORDX2", []>;
19703 +defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128, v4i32>;
19704 +defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256, v8i32>;
19705 +//def S_LOAD_DWORDX16 : SMRD_DWORDX16 <0x00000004, "S_LOAD_DWORDX16", []>;
19706 +//def S_BUFFER_LOAD_DWORD : SMRD_ <0x00000008, "S_BUFFER_LOAD_DWORD", []>;
19707 +//def S_BUFFER_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000009, "S_BUFFER_LOAD_DWORDX2", []>;
19708 +//def S_BUFFER_LOAD_DWORDX4 : SMRD_DWORDX4 <0x0000000a, "S_BUFFER_LOAD_DWORDX4", []>;
19709 +//def S_BUFFER_LOAD_DWORDX8 : SMRD_DWORDX8 <0x0000000b, "S_BUFFER_LOAD_DWORDX8", []>;
19710 +//def S_BUFFER_LOAD_DWORDX16 : SMRD_DWORDX16 <0x0000000c, "S_BUFFER_LOAD_DWORDX16", []>;
19712 +//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
19713 +//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
19714 +//def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>;
19715 +//def IMAGE_LOAD_MIP : MIMG_NoPattern_ <"IMAGE_LOAD_MIP", 0x00000001>;
19716 +//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>;
19717 +//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>;
19718 +//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>;
19719 +//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK_SGN", 0x00000005>;
19720 +//def IMAGE_STORE : MIMG_NoPattern_ <"IMAGE_STORE", 0x00000008>;
19721 +//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>;
19722 +//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>;
19723 +//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>;
19724 +//def IMAGE_GET_RESINFO : MIMG_NoPattern_ <"IMAGE_GET_RESINFO", 0x0000000e>;
19725 +//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>;
19726 +//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>;
19727 +//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>;
19728 +//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_SUB", 0x00000012>;
19729 +//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_RSUB", 0x00000013>;
19730 +//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMIN", 0x00000014>;
19731 +//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMIN", 0x00000015>;
19732 +//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMAX", 0x00000016>;
19733 +//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMAX", 0x00000017>;
19734 +//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"IMAGE_ATOMIC_AND", 0x00000018>;
19735 +//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"IMAGE_ATOMIC_OR", 0x00000019>;
19736 +//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"IMAGE_ATOMIC_XOR", 0x0000001a>;
19737 +//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"IMAGE_ATOMIC_INC", 0x0000001b>;
19738 +//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"IMAGE_ATOMIC_DEC", 0x0000001c>;
19739 +//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>;
19740 +//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>;
19741 +//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>;
19742 +def IMAGE_SAMPLE : MIMG_Load_Helper <0x00000020, "IMAGE_SAMPLE">;
19743 +//def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>;
19744 +def IMAGE_SAMPLE_D : MIMG_Load_Helper <0x00000022, "IMAGE_SAMPLE_D">;
19745 +//def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>;
19746 +def IMAGE_SAMPLE_L : MIMG_Load_Helper <0x00000024, "IMAGE_SAMPLE_L">;
19747 +def IMAGE_SAMPLE_B : MIMG_Load_Helper <0x00000025, "IMAGE_SAMPLE_B">;
19748 +//def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>;
19749 +//def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>;
19750 +//def IMAGE_SAMPLE_C : MIMG_NoPattern_ <"IMAGE_SAMPLE_C", 0x00000028>;
19751 +//def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>;
19752 +//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D", 0x0000002a>;
19753 +//def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>;
19754 +//def IMAGE_SAMPLE_C_L : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L", 0x0000002c>;
19755 +//def IMAGE_SAMPLE_C_B : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B", 0x0000002d>;
19756 +//def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>;
19757 +//def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>;
19758 +//def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>;
19759 +//def IMAGE_SAMPLE_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL_O", 0x00000031>;
19760 +//def IMAGE_SAMPLE_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_O", 0x00000032>;
19761 +//def IMAGE_SAMPLE_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL_O", 0x00000033>;
19762 +//def IMAGE_SAMPLE_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_L_O", 0x00000034>;
19763 +//def IMAGE_SAMPLE_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_O", 0x00000035>;
19764 +//def IMAGE_SAMPLE_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL_O", 0x00000036>;
19765 +//def IMAGE_SAMPLE_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ_O", 0x00000037>;
19766 +//def IMAGE_SAMPLE_C_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_O", 0x00000038>;
19767 +//def IMAGE_SAMPLE_C_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL_O", 0x00000039>;
19768 +//def IMAGE_SAMPLE_C_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_O", 0x0000003a>;
19769 +//def IMAGE_SAMPLE_C_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL_O", 0x0000003b>;
19770 +//def IMAGE_SAMPLE_C_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L_O", 0x0000003c>;
19771 +//def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>;
19772 +//def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>;
19773 +//def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 0x0000003f>;
19774 +//def IMAGE_GATHER4 : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4", 0x00000040>;
19775 +//def IMAGE_GATHER4_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL", 0x00000041>;
19776 +//def IMAGE_GATHER4_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L", 0x00000044>;
19777 +//def IMAGE_GATHER4_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B", 0x00000045>;
19778 +//def IMAGE_GATHER4_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL", 0x00000046>;
19779 +//def IMAGE_GATHER4_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ", 0x00000047>;
19780 +//def IMAGE_GATHER4_C : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C", 0x00000048>;
19781 +//def IMAGE_GATHER4_C_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL", 0x00000049>;
19782 +//def IMAGE_GATHER4_C_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L", 0x0000004c>;
19783 +//def IMAGE_GATHER4_C_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B", 0x0000004d>;
19784 +//def IMAGE_GATHER4_C_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL", 0x0000004e>;
19785 +//def IMAGE_GATHER4_C_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ", 0x0000004f>;
19786 +//def IMAGE_GATHER4_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_O", 0x00000050>;
19787 +//def IMAGE_GATHER4_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL_O", 0x00000051>;
19788 +//def IMAGE_GATHER4_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L_O", 0x00000054>;
19789 +//def IMAGE_GATHER4_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_O", 0x00000055>;
19790 +//def IMAGE_GATHER4_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL_O", 0x00000056>;
19791 +//def IMAGE_GATHER4_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ_O", 0x00000057>;
19792 +//def IMAGE_GATHER4_C_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_O", 0x00000058>;
19793 +//def IMAGE_GATHER4_C_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL_O", 0x00000059>;
19794 +//def IMAGE_GATHER4_C_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L_O", 0x0000005c>;
19795 +//def IMAGE_GATHER4_C_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_O", 0x0000005d>;
19796 +//def IMAGE_GATHER4_C_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL_O", 0x0000005e>;
19797 +//def IMAGE_GATHER4_C_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ_O", 0x0000005f>;
19798 +//def IMAGE_GET_LOD : MIMG_NoPattern_ <"IMAGE_GET_LOD", 0x00000060>;
19799 +//def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>;
19800 +//def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>;
19801 +//def IMAGE_SAMPLE_C_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD", 0x0000006a>;
19802 +//def IMAGE_SAMPLE_C_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL", 0x0000006b>;
19803 +//def IMAGE_SAMPLE_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_O", 0x0000006c>;
19804 +//def IMAGE_SAMPLE_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL_O", 0x0000006d>;
19805 +//def IMAGE_SAMPLE_C_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_O", 0x0000006e>;
19806 +//def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>;
19807 +//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>;
19808 +//def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>;
19809 +//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>;
19811 +let neverHasSideEffects = 1 in {
19812 +defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>;
19813 +} // End neverHasSideEffects
19814 +defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>;
19815 +//defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>;
19816 +//defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>;
19817 +defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32",
19818 + [(set VReg_32:$dst, (sint_to_fp AllReg_32:$src0))]
19820 +//defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>;
19821 +//defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>;
19822 +defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32",
19823 + [(set VReg_32:$dst, (fp_to_sint AllReg_32:$src0))]
19825 +defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
19826 +////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>;
19827 +//defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", []>;
19828 +//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
19829 +//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
19830 +//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
19831 +//defm V_CVT_F32_F64 : VOP1_32 <0x0000000f, "V_CVT_F32_F64", []>;
19832 +//defm V_CVT_F64_F32 : VOP1_64 <0x00000010, "V_CVT_F64_F32", []>;
19833 +//defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>;
19834 +//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>;
19835 +//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>;
19836 +//defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>;
19837 +//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>;
19838 +//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>;
19839 +defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32",
19840 + [(set VReg_32:$dst, (AMDGPUfract AllReg_32:$src0))]
19842 +defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", []>;
19843 +defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", []>;
19844 +defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32",
19845 + [(set VReg_32:$dst, (frint AllReg_32:$src0))]
19847 +defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32",
19848 + [(set VReg_32:$dst, (ffloor AllReg_32:$src0))]
19850 +defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32",
19851 + [(set VReg_32:$dst, (fexp2 AllReg_32:$src0))]
19853 +defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>;
19854 +defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", []>;
19855 +defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>;
19856 +defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>;
19857 +defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32",
19858 + [(set VReg_32:$dst, (fdiv FP_ONE, AllReg_32:$src0))]
19860 +defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>;
19861 +defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>;
19862 +defm V_RSQ_LEGACY_F32 : VOP1_32 <
19863 + 0x0000002d, "V_RSQ_LEGACY_F32",
19864 + [(set VReg_32:$dst, (int_AMDGPU_rsq AllReg_32:$src0))]
19866 +defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>;
19867 +defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", []>;
19868 +defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>;
19869 +defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", []>;
19870 +defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>;
19871 +defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32", []>;
19872 +defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64", []>;
19873 +defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32", []>;
19874 +defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32", []>;
19875 +defm V_NOT_B32 : VOP1_32 <0x00000037, "V_NOT_B32", []>;
19876 +defm V_BFREV_B32 : VOP1_32 <0x00000038, "V_BFREV_B32", []>;
19877 +defm V_FFBH_U32 : VOP1_32 <0x00000039, "V_FFBH_U32", []>;
19878 +defm V_FFBL_B32 : VOP1_32 <0x0000003a, "V_FFBL_B32", []>;
19879 +defm V_FFBH_I32 : VOP1_32 <0x0000003b, "V_FFBH_I32", []>;
19880 +//defm V_FREXP_EXP_I32_F64 : VOP1_32 <0x0000003c, "V_FREXP_EXP_I32_F64", []>;
19881 +defm V_FREXP_MANT_F64 : VOP1_64 <0x0000003d, "V_FREXP_MANT_F64", []>;
19882 +defm V_FRACT_F64 : VOP1_64 <0x0000003e, "V_FRACT_F64", []>;
19883 +//defm V_FREXP_EXP_I32_F32 : VOP1_32 <0x0000003f, "V_FREXP_EXP_I32_F32", []>;
19884 +defm V_FREXP_MANT_F32 : VOP1_32 <0x00000040, "V_FREXP_MANT_F32", []>;
19885 +//def V_CLREXCP : VOP1_ <0x00000041, "V_CLREXCP", []>;
19886 +defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>;
19887 +defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>;
19888 +defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>;
19890 +def V_INTERP_P1_F32 : VINTRP <
19892 + (outs VReg_32:$dst),
19893 + (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
19894 + "V_INTERP_P1_F32",
19896 + let DisableEncoding = "$m0";
19899 +def V_INTERP_P2_F32 : VINTRP <
19901 + (outs VReg_32:$dst),
19902 + (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
19903 + "V_INTERP_P2_F32",
19906 + let Constraints = "$src0 = $dst";
19907 + let DisableEncoding = "$src0,$m0";
19911 +def V_INTERP_MOV_F32 : VINTRP <
19913 + (outs VReg_32:$dst),
19914 + (ins i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
19915 + "V_INTERP_MOV_F32",
19918 + let DisableEncoding = "$m0";
19921 +//def S_NOP : SOPP_ <0x00000000, "S_NOP", []>;
19923 +let isTerminator = 1 in {
19925 +def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM",
19926 + [(IL_retflag)]> {
19928 + let isBarrier = 1;
19929 + let hasCtrlDep = 1;
19932 +let isBranch = 1 in {
19933 +def S_BRANCH : SOPP <
19934 + 0x00000002, (ins brtarget:$target), "S_BRANCH",
19935 + [(br bb:$target)]> {
19936 + let isBarrier = 1;
19939 +let DisableEncoding = "$scc" in {
19940 +def S_CBRANCH_SCC0 : SOPP <
19941 + 0x00000004, (ins brtarget:$target, SCCReg:$scc),
19942 + "S_CBRANCH_SCC0", []
19944 +def S_CBRANCH_SCC1 : SOPP <
19945 + 0x00000005, (ins brtarget:$target, SCCReg:$scc),
19946 + "S_CBRANCH_SCC1",
19949 +} // End DisableEncoding = "$scc"
19951 +def S_CBRANCH_VCCZ : SOPP <
19952 + 0x00000006, (ins brtarget:$target, VCCReg:$vcc),
19953 + "S_CBRANCH_VCCZ",
19956 +def S_CBRANCH_VCCNZ : SOPP <
19957 + 0x00000007, (ins brtarget:$target, VCCReg:$vcc),
19958 + "S_CBRANCH_VCCNZ",
19962 +let DisableEncoding = "$exec" in {
19963 +def S_CBRANCH_EXECZ : SOPP <
19964 + 0x00000008, (ins brtarget:$target, EXECReg:$exec),
19965 + "S_CBRANCH_EXECZ",
19968 +def S_CBRANCH_EXECNZ : SOPP <
19969 + 0x00000009, (ins brtarget:$target, EXECReg:$exec),
19970 + "S_CBRANCH_EXECNZ",
19973 +} // End DisableEncoding = "$exec"
19976 +} // End isBranch = 1
19977 +} // End isTerminator = 1
19979 +//def S_BARRIER : SOPP_ <0x0000000a, "S_BARRIER", []>;
19980 +let hasSideEffects = 1 in {
19981 +def S_WAITCNT : SOPP <0x0000000c, (ins i32imm:$simm16), "S_WAITCNT $simm16",
19984 +} // End hasSideEffects
19985 +//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>;
19986 +//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>;
19987 +//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>;
19988 +//def S_SENDMSG : SOPP_ <0x00000010, "S_SENDMSG", []>;
19989 +//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>;
19990 +//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>;
19991 +//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>;
19992 +//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>;
19993 +//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>;
19994 +//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>;
19996 +def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst),
19997 + (ins AllReg_32:$src0, VReg_32:$src1, VCCReg:$vcc), "V_CNDMASK_B32_e32",
20000 + let DisableEncoding = "$vcc";
20003 +def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst),
20004 + (ins VReg_32:$src0, VReg_32:$src1, SReg_1:$src2, InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
20005 + "V_CNDMASK_B32_e64",
20006 + [(set (i32 VReg_32:$dst), (select SReg_1:$src2, VReg_32:$src1, VReg_32:$src0))]
20009 +//f32 pattern for V_CNDMASK_B32_e64
20011 + (f32 (select SReg_1:$src2, VReg_32:$src1, VReg_32:$src0)),
20012 + (V_CNDMASK_B32_e64 VReg_32:$src0, VReg_32:$src1, SReg_1:$src2)
20015 +defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>;
20016 +defm V_WRITELANE_B32 : VOP2_32 <0x00000002, "V_WRITELANE_B32", []>;
20018 +defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32", []>;
20020 + (f32 (fadd AllReg_32:$src0, VReg_32:$src1)),
20021 + (V_ADD_F32_e32 AllReg_32:$src0, VReg_32:$src1)
20024 +defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32", []>;
20026 + (f32 (fsub AllReg_32:$src0, VReg_32:$src1)),
20027 + (V_SUB_F32_e32 AllReg_32:$src0, VReg_32:$src1)
20029 +defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", []>;
20030 +defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>;
20031 +defm V_MUL_LEGACY_F32 : VOP2_32 <
20032 + 0x00000007, "V_MUL_LEGACY_F32",
20033 + [(set VReg_32:$dst, (int_AMDGPU_mul AllReg_32:$src0, VReg_32:$src1))]
20036 +defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32",
20037 + [(set VReg_32:$dst, (fmul AllReg_32:$src0, VReg_32:$src1))]
20039 +//defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", []>;
20040 +//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>;
20041 +//defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", []>;
20042 +//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>;
20043 +defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32",
20044 + [(set VReg_32:$dst, (AMDGPUfmin AllReg_32:$src0, VReg_32:$src1))]
20047 +defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32",
20048 + [(set VReg_32:$dst, (AMDGPUfmax AllReg_32:$src0, VReg_32:$src1))]
20050 +defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>;
20051 +defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>;
20052 +defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>;
20053 +defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>;
20054 +defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>;
20055 +defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>;
20056 +defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>;
20057 +defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", []>;
20058 +defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>;
20059 +defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", []>;
20060 +defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>;
20061 +defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", []>;
20062 +defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32",
20063 + [(set VReg_32:$dst, (and AllReg_32:$src0, VReg_32:$src1))]
20065 +defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32",
20066 + [(set VReg_32:$dst, (or AllReg_32:$src0, VReg_32:$src1))]
20068 +defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32",
20069 + [(set VReg_32:$dst, (xor AllReg_32:$src0, VReg_32:$src1))]
20071 +defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", []>;
20072 +defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
20073 +defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
20074 +defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
20075 +//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
20076 +//defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
20077 +//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
20078 +let Defs = [VCC] in { // Carry-out goes to VCC
20079 +defm V_ADD_I32 : VOP2_32 <0x00000025, "V_ADD_I32",
20080 + [(set VReg_32:$dst, (add (i32 AllReg_32:$src0), (i32 VReg_32:$src1)))]
20082 +defm V_SUB_I32 : VOP2_32 <0x00000026, "V_SUB_I32",
20083 + [(set VReg_32:$dst, (sub (i32 AllReg_32:$src0), (i32 VReg_32:$src1)))]
20085 +} // End Defs = [VCC]
20086 +defm V_SUBREV_I32 : VOP2_32 <0x00000027, "V_SUBREV_I32", []>;
20087 +defm V_ADDC_U32 : VOP2_32 <0x00000028, "V_ADDC_U32", []>;
20088 +defm V_SUBB_U32 : VOP2_32 <0x00000029, "V_SUBB_U32", []>;
20089 +defm V_SUBBREV_U32 : VOP2_32 <0x0000002a, "V_SUBBREV_U32", []>;
20090 +defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>;
20091 +////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "V_CVT_PKACCUM_U8_F32", []>;
20092 +////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>;
20093 +////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>;
20094 +defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32",
20095 + [(set VReg_32:$dst, (int_SI_packf16 AllReg_32:$src0, VReg_32:$src1))]
20097 +////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>;
20098 +////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>;
20099 +def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32", []>;
20100 +def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32", []>;
20101 +def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32", []>;
20102 +def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32", []>;
20103 +def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32", []>;
20104 +def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32", []>;
20105 +def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32", []>;
20106 +def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32", []>;
20107 +def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32", []>;
20108 +def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32", []>;
20109 +def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32", []>;
20110 +def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32", []>;
20111 +////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>;
20112 +////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>;
20113 +////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>;
20114 +////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>;
20115 +//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>;
20117 +let neverHasSideEffects = 1 in {
20119 +def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>;
20120 +def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>;
20121 +//def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", []>;
20122 +//def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", []>;
20124 +} // End neverHasSideEffects
20125 +def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>;
20126 +def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>;
20127 +def V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>;
20128 +def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
20129 +def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>;
20130 +def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>;
20131 +def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>;
20132 +def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>;
20133 +def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>;
20134 +//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
20135 +def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
20136 +def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>;
20137 +def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
20138 +////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>;
20139 +////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>;
20140 +////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>;
20141 +////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>;
20142 +////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>;
20143 +////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>;
20144 +////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>;
20145 +////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>;
20146 +////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>;
20147 +//def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>;
20148 +//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>;
20149 +//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
20150 +def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
20151 +////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
20152 +def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
20153 +def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>;
20154 +def V_LSHL_B64 : VOP3_64 <0x00000161, "V_LSHL_B64", []>;
20155 +def V_LSHR_B64 : VOP3_64 <0x00000162, "V_LSHR_B64", []>;
20156 +def V_ASHR_I64 : VOP3_64 <0x00000163, "V_ASHR_I64", []>;
20157 +def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>;
20158 +def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>;
20159 +def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>;
20160 +def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>;
20161 +def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>;
20162 +def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>;
20163 +def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>;
20164 +def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>;
20165 +def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
20166 +def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
20167 +def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
20168 +def V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>;
20169 +def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>;
20170 +//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
20171 +//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
20172 +//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
20173 +def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>;
20174 +def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>;
20175 +def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>;
20176 +def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", []>;
20177 +def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", []>;
20178 +def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", []>;
20179 +def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", []>;
20180 +def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", []>;
20181 +def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", []>;
20182 +def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", []>;
20183 +def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", []>;
20185 +def S_CSELECT_B32 : SOP2 <
20186 + 0x0000000a, (outs SReg_32:$dst),
20187 + (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32",
20188 + [(set (i32 SReg_32:$dst), (select SCCReg:$scc, SReg_32:$src0, SReg_32:$src1))]
20191 +def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>;
20193 +// f32 pattern for S_CSELECT_B32
20195 + (f32 (select SCCReg:$scc, SReg_32:$src0, SReg_32:$src1)),
20196 + (S_CSELECT_B32 SReg_32:$src0, SReg_32:$src1, SCCReg:$scc)
20199 +def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", []>;
20201 +def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64",
20202 + [(set SReg_64:$dst, (and SReg_64:$src0, SReg_64:$src1))]
20204 +def S_AND_VCC : SOP2_VCC <0x0000000f, "S_AND_B64",
20205 + [(set SReg_1:$vcc, (SIvcc_and SReg_64:$src0, SReg_64:$src1))]
20207 +def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", []>;
20208 +def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", []>;
20209 +def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>;
20210 +def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>;
20211 +def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>;
20212 +def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>;
20213 +def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>;
20214 +def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>;
20215 +def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>;
20216 +def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>;
20217 +def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>;
20218 +def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>;
20219 +def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>;
20220 +def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>;
20221 +def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", []>;
20222 +def S_LSHL_B64 : SOP2_64 <0x0000001f, "S_LSHL_B64", []>;
20223 +def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", []>;
20224 +def S_LSHR_B64 : SOP2_64 <0x00000021, "S_LSHR_B64", []>;
20225 +def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", []>;
20226 +def S_ASHR_I64 : SOP2_64 <0x00000023, "S_ASHR_I64", []>;
20227 +def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>;
20228 +def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>;
20229 +def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>;
20230 +def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>;
20231 +def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>;
20232 +def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>;
20233 +def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>;
20234 +//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>;
20235 +def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
20237 +class V_MOV_IMM <Operand immType, SDNode immNode> : InstSI <
20238 + (outs VReg_32:$dst),
20239 + (ins immType:$src0),
20241 + [(set VReg_32:$dst, (immNode:$src0))]
20244 +let isCodeGenOnly = 1, isPseudo = 1 in {
20246 +def V_MOV_IMM_I32 : V_MOV_IMM<i32imm, imm>;
20247 +def V_MOV_IMM_F32 : V_MOV_IMM<f32imm, fpimm>;
20249 +def S_MOV_IMM_I32 : InstSI <
20250 + (outs SReg_32:$dst),
20251 + (ins i32imm:$src0),
20253 + [(set SReg_32:$dst, (imm:$src0))]
20256 +// i64 immediates aren't really supported in hardware, but LLVM will use the i64
20257 +// type for indices on load and store instructions. The pattern for
20258 +// S_MOV_IMM_I64 will only match i64 immediates that can fit into 32-bits,
20259 +// which the hardware can handle.
20260 +def S_MOV_IMM_I64 : InstSI <
20261 + (outs SReg_64:$dst),
20262 + (ins i64imm:$src0),
20263 + "S_MOV_IMM_I64 $dst, $src0",
20264 + [(set SReg_64:$dst, (IMM32bitIn64bit:$src0))]
20267 +} // End isCodeGenOnly, isPseudo = 1
20269 +class SI_LOAD_LITERAL<Operand ImmType> :
20270 + Enc32 <(outs), (ins ImmType:$imm), "LOAD_LITERAL $imm", []> {
20273 + let Inst{31-0} = imm;
20276 +def SI_LOAD_LITERAL_I32 : SI_LOAD_LITERAL<i32imm>;
20277 +def SI_LOAD_LITERAL_F32 : SI_LOAD_LITERAL<f32imm>;
20279 +let isCodeGenOnly = 1, isPseudo = 1 in {
20281 +def SET_M0 : InstSI <
20282 + (outs SReg_32:$dst),
20283 + (ins i32imm:$src0),
20285 + [(set SReg_32:$dst, (int_SI_set_M0 imm:$src0))]
20288 +def LOAD_CONST : AMDGPUShaderInst <
20289 + (outs GPRF32:$dst),
20290 + (ins i32imm:$src),
20291 + "LOAD_CONST $dst, $src",
20292 + [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
20295 +let usesCustomInserter = 1 in {
20297 +def SI_V_CNDLT : InstSI <
20298 + (outs VReg_32:$dst),
20299 + (ins VReg_32:$src0, VReg_32:$src1, VReg_32:$src2),
20300 + "SI_V_CNDLT $dst, $src0, $src1, $src2",
20301 + [(set VReg_32:$dst, (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2))]
20304 +def SI_INTERP : InstSI <
20305 + (outs VReg_32:$dst),
20306 + (ins VReg_32:$i, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, SReg_32:$params),
20307 + "SI_INTERP $dst, $i, $j, $attr_chan, $attr, $params",
20311 +def SI_INTERP_CONST : InstSI <
20312 + (outs VReg_32:$dst),
20313 + (ins i32imm:$attr_chan, i32imm:$attr, SReg_32:$params),
20314 + "SI_INTERP_CONST $dst, $attr_chan, $attr, $params",
20315 + [(set VReg_32:$dst, (int_SI_fs_interp_constant imm:$attr_chan,
20316 + imm:$attr, SReg_32:$params))]
20319 +def SI_WQM : InstSI <
20326 +} // end usesCustomInserter
20328 +// SI Pseudo instructions. These are used by the CFG structurizer pass
20329 +// and should be lowered to ISA instructions prior to codegen.
20331 +let mayLoad = 1, mayStore = 1, hasSideEffects = 1,
20332 + Uses = [EXEC], Defs = [EXEC] in {
20334 +let isBranch = 1, isTerminator = 1 in {
20336 +def SI_IF : InstSI <
20337 + (outs SReg_64:$dst),
20338 + (ins SReg_1:$vcc, brtarget:$target),
20340 + [(set SReg_64:$dst, (int_SI_if SReg_1:$vcc, bb:$target))]
20343 +def SI_ELSE : InstSI <
20344 + (outs SReg_64:$dst),
20345 + (ins SReg_64:$src, brtarget:$target),
20347 + [(set SReg_64:$dst, (int_SI_else SReg_64:$src, bb:$target))]> {
20349 + let Constraints = "$src = $dst";
20352 +def SI_LOOP : InstSI <
20354 + (ins SReg_64:$saved, brtarget:$target),
20356 + [(int_SI_loop SReg_64:$saved, bb:$target)]
20359 +} // end isBranch = 1, isTerminator = 1
20361 +def SI_BREAK : InstSI <
20362 + (outs SReg_64:$dst),
20363 + (ins SReg_64:$src),
20365 + [(set SReg_64:$dst, (int_SI_break SReg_64:$src))]
20368 +def SI_IF_BREAK : InstSI <
20369 + (outs SReg_64:$dst),
20370 + (ins SReg_1:$vcc, SReg_64:$src),
20372 + [(set SReg_64:$dst, (int_SI_if_break SReg_1:$vcc, SReg_64:$src))]
20375 +def SI_ELSE_BREAK : InstSI <
20376 + (outs SReg_64:$dst),
20377 + (ins SReg_64:$src0, SReg_64:$src1),
20379 + [(set SReg_64:$dst, (int_SI_else_break SReg_64:$src0, SReg_64:$src1))]
20382 +def SI_END_CF : InstSI <
20384 + (ins SReg_64:$saved),
20386 + [(int_SI_end_cf SReg_64:$saved)]
20389 +def SI_KILL : InstSI <
20391 + (ins VReg_32:$src),
20393 + [(int_AMDGPU_kill VReg_32:$src)]
20396 +} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
20397 + // Uses = [EXEC], Defs = [EXEC]
20399 +} // end isCodeGenOnly, isPseudo
20402 + (int_AMDGPU_kilp),
20403 + (SI_KILL (V_MOV_IMM_I32 0xbf800000))
20406 +/* int_SI_vs_load_input */
20408 + (int_SI_vs_load_input SReg_128:$tlst, IMM12bit:$attr_offset,
20409 + VReg_32:$buf_idx_vgpr),
20410 + (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0,
20411 + VReg_32:$buf_idx_vgpr, SReg_128:$tlst,
20412 + 0, 0, (i32 SREG_LIT_0))
20415 +/* int_SI_export */
20417 + (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
20418 + VReg_32:$src0,VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
20419 + (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm,
20420 + VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3)
20423 +/* int_SI_sample */
20425 + (int_SI_sample imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler, imm),
20426 + (IMAGE_SAMPLE imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord,
20427 + SReg_256:$rsrc, SReg_128:$sampler)
20431 + (int_SI_sample imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler, TEX_RECT),
20432 + (IMAGE_SAMPLE imm:$writemask, 1, 0, 0, 0, 0, 0, 0, VReg_128:$coord,
20433 + SReg_256:$rsrc, SReg_128:$sampler)
20436 +/* int_SI_sample_lod */
20438 + (int_SI_sample_lod imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler, imm),
20439 + (IMAGE_SAMPLE_L imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord,
20440 + SReg_256:$rsrc, SReg_128:$sampler)
20443 +/* int_SI_sample_bias */
20445 + (int_SI_sample_bias imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler, imm),
20446 + (IMAGE_SAMPLE_B imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord,
20447 + SReg_256:$rsrc, SReg_128:$sampler)
20450 +def CLAMP_SI : CLAMP<VReg_32>;
20451 +def FABS_SI : FABS<VReg_32>;
20452 +def FNEG_SI : FNEG<VReg_32>;
20454 +def : Extract_Element <f32, v4f32, VReg_128, 0, sel_x>;
20455 +def : Extract_Element <f32, v4f32, VReg_128, 1, sel_y>;
20456 +def : Extract_Element <f32, v4f32, VReg_128, 2, sel_z>;
20457 +def : Extract_Element <f32, v4f32, VReg_128, 3, sel_w>;
20459 +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 4, sel_x>;
20460 +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 5, sel_y>;
20461 +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 6, sel_z>;
20462 +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 7, sel_w>;
20464 +def : Vector_Build <v4f32, VReg_128, f32, VReg_32>;
20465 +def : Vector_Build <v4i32, SReg_128, i32, SReg_32>;
20467 +def : BitConvert <i32, f32, SReg_32>;
20468 +def : BitConvert <i32, f32, VReg_32>;
20470 +def : BitConvert <f32, i32, SReg_32>;
20471 +def : BitConvert <f32, i32, VReg_32>;
20474 + (i64 (SIsreg1_bitcast SReg_1:$vcc)),
20475 + (S_MOV_B64 (COPY_TO_REGCLASS SReg_1:$vcc, SReg_64))
20479 + (i1 (SIsreg1_bitcast SReg_64:$vcc)),
20480 + (COPY_TO_REGCLASS SReg_64:$vcc, SReg_1)
20484 + (i64 (SIvcc_bitcast VCCReg:$vcc)),
20485 + (S_MOV_B64 (COPY_TO_REGCLASS VCCReg:$vcc, SReg_64))
20489 + (i1 (SIvcc_bitcast SReg_64:$vcc)),
20490 + (COPY_TO_REGCLASS SReg_64:$vcc, VCCReg)
20493 +/********** ====================== **********/
20494 +/********** Interpolation Patterns **********/
20495 +/********** ====================== **********/
20498 + (int_SI_fs_interp_linear_center imm:$attr_chan, imm:$attr, SReg_32:$params),
20499 + (SI_INTERP (f32 LINEAR_CENTER_I), (f32 LINEAR_CENTER_J), imm:$attr_chan,
20500 + imm:$attr, SReg_32:$params)
20504 + (int_SI_fs_interp_linear_centroid imm:$attr_chan, imm:$attr, SReg_32:$params),
20505 + (SI_INTERP (f32 LINEAR_CENTROID_I), (f32 LINEAR_CENTROID_J), imm:$attr_chan,
20506 + imm:$attr, SReg_32:$params)
20510 + (int_SI_fs_interp_persp_center imm:$attr_chan, imm:$attr, SReg_32:$params),
20511 + (SI_INTERP (f32 PERSP_CENTER_I), (f32 PERSP_CENTER_J), imm:$attr_chan,
20512 + imm:$attr, SReg_32:$params)
20516 + (int_SI_fs_interp_persp_centroid imm:$attr_chan, imm:$attr, SReg_32:$params),
20517 + (SI_INTERP (f32 PERSP_CENTROID_I), (f32 PERSP_CENTROID_J), imm:$attr_chan,
20518 + imm:$attr, SReg_32:$params)
20522 + (int_SI_fs_read_face),
20527 + (int_SI_fs_read_pos 0),
20528 + (f32 POS_X_FLOAT)
20532 + (int_SI_fs_read_pos 1),
20533 + (f32 POS_Y_FLOAT)
20537 + (int_SI_fs_read_pos 2),
20538 + (f32 POS_Z_FLOAT)
20542 + (int_SI_fs_read_pos 3),
20543 + (f32 POS_W_FLOAT)
20546 +/********** ================== **********/
20547 +/********** Intrinsic Patterns **********/
20548 +/********** ================== **********/
20550 +/* llvm.AMDGPU.pow */
20551 +/* XXX: We are using IEEE MUL, not the 0 * anything = 0 MUL, is this correct? */
20552 +def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_F32_e32, VReg_32>;
20555 + (int_AMDGPU_div AllReg_32:$src0, AllReg_32:$src1),
20556 + (V_MUL_LEGACY_F32_e32 AllReg_32:$src0, (V_RCP_LEGACY_F32_e32 AllReg_32:$src1))
20560 + (fdiv AllReg_32:$src0, AllReg_32:$src1),
20561 + (V_MUL_F32_e32 AllReg_32:$src0, (V_RCP_F32_e32 AllReg_32:$src1))
20565 + (int_AMDGPU_cube VReg_128:$src),
20566 + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
20567 + (V_CUBETC_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x),
20568 + (EXTRACT_SUBREG VReg_128:$src, sel_y),
20569 + (EXTRACT_SUBREG VReg_128:$src, sel_z),
20570 + 0, 0, 0, 0), sel_x),
20571 + (V_CUBESC_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x),
20572 + (EXTRACT_SUBREG VReg_128:$src, sel_y),
20573 + (EXTRACT_SUBREG VReg_128:$src, sel_z),
20574 + 0, 0, 0, 0), sel_y),
20575 + (V_CUBEMA_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x),
20576 + (EXTRACT_SUBREG VReg_128:$src, sel_y),
20577 + (EXTRACT_SUBREG VReg_128:$src, sel_z),
20578 + 0, 0, 0, 0), sel_z),
20579 + (V_CUBEID_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x),
20580 + (EXTRACT_SUBREG VReg_128:$src, sel_y),
20581 + (EXTRACT_SUBREG VReg_128:$src, sel_z),
20582 + 0, 0, 0, 0), sel_w)
20585 +/********** ================== **********/
20586 +/********** VOP3 Patterns **********/
20587 +/********** ================== **********/
20589 +def : Pat <(f32 (IL_mad AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2)),
20590 + (V_MAD_LEGACY_F32 AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2,
20593 +} // End isSI predicate
20594 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIIntrinsics.td llvm-r600/lib/Target/R600/SIIntrinsics.td
20595 --- llvm-3.2.src/lib/Target/R600/SIIntrinsics.td 1970-01-01 01:00:00.000000000 +0100
20596 +++ llvm-r600/lib/Target/R600/SIIntrinsics.td 2013-01-25 19:43:57.480049720 +0100
20598 +//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===//
20600 +// The LLVM Compiler Infrastructure
20602 +// This file is distributed under the University of Illinois Open Source
20603 +// License. See LICENSE.TXT for details.
20605 +//===----------------------------------------------------------------------===//
20607 +// SI Intrinsic Definitions
20609 +//===----------------------------------------------------------------------===//
20612 +let TargetPrefix = "SI", isTarget = 1 in {
20614 + def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
20615 + def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
20616 + /* XXX: We may need a separate intrinsic here for loading integer values */
20617 + def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_i64_ty, llvm_i32_ty], []>;
20618 + def int_SI_vs_load_buffer_index : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>;
20619 + def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v4i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem]> ;
20620 + def int_SI_wqm : Intrinsic <[], [], []>;
20622 + class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_v4f32_ty, llvm_v8i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrReadMem]>;
20624 + def int_SI_sample : Sample;
20625 + def int_SI_sample_bias : Sample;
20626 + def int_SI_sample_lod : Sample;
20628 + /* Interpolation Intrinsics */
20630 + def int_SI_set_M0 : Intrinsic <[llvm_i32_ty], [llvm_i32_ty]>;
20631 + class Interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>;
20633 + def int_SI_fs_interp_linear_center : Interp;
20634 + def int_SI_fs_interp_linear_centroid : Interp;
20635 + def int_SI_fs_interp_persp_center : Interp;
20636 + def int_SI_fs_interp_persp_centroid : Interp;
20637 + def int_SI_fs_interp_constant : Interp;
20639 + def int_SI_fs_read_face : Intrinsic <[llvm_float_ty], [], [IntrNoMem]>;
20640 + def int_SI_fs_read_pos : Intrinsic <[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
20642 + /* Control flow Intrinsics */
20644 + def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
20645 + def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
20646 + def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
20647 + def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
20648 + def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
20649 + def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
20650 + def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
20652 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIISelLowering.cpp llvm-r600/lib/Target/R600/SIISelLowering.cpp
20653 --- llvm-3.2.src/lib/Target/R600/SIISelLowering.cpp 1970-01-01 01:00:00.000000000 +0100
20654 +++ llvm-r600/lib/Target/R600/SIISelLowering.cpp 2013-01-25 19:43:57.470049720 +0100
20656 +//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
20658 +// The LLVM Compiler Infrastructure
20660 +// This file is distributed under the University of Illinois Open Source
20661 +// License. See LICENSE.TXT for details.
20663 +//===----------------------------------------------------------------------===//
20666 +/// \brief Custom DAG lowering for SI
20668 +//===----------------------------------------------------------------------===//
20670 +#include "SIISelLowering.h"
20671 +#include "AMDIL.h"
20672 +#include "AMDILIntrinsicInfo.h"
20673 +#include "SIInstrInfo.h"
20674 +#include "SIMachineFunctionInfo.h"
20675 +#include "SIRegisterInfo.h"
20676 +#include "llvm/CodeGen/MachineInstrBuilder.h"
20677 +#include "llvm/CodeGen/MachineRegisterInfo.h"
20678 +#include "llvm/CodeGen/SelectionDAG.h"
20680 +using namespace llvm;
20682 +SITargetLowering::SITargetLowering(TargetMachine &TM) :
20683 + AMDGPUTargetLowering(TM),
20684 + TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())) {
20685 + addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
20686 + addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
20687 + addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass);
20688 + addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
20689 + addRegisterClass(MVT::i1, &AMDGPU::SCCRegRegClass);
20690 + addRegisterClass(MVT::i1, &AMDGPU::VCCRegRegClass);
20692 + addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
20693 + addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
20695 + computeRegisterProperties();
20697 + setOperationAction(ISD::AND, MVT::i1, Custom);
20699 + setOperationAction(ISD::ADD, MVT::i64, Legal);
20700 + setOperationAction(ISD::ADD, MVT::i32, Legal);
20702 + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
20704 + // We need to custom lower loads from the USER_SGPR address space, so we can
20705 + // add the SGPRs as livein registers.
20706 + setOperationAction(ISD::LOAD, MVT::i32, Custom);
20707 + setOperationAction(ISD::LOAD, MVT::i64, Custom);
20709 + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
20710 + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
20712 + setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
20713 + setTargetDAGCombine(ISD::SELECT_CC);
20715 + setTargetDAGCombine(ISD::SETCC);
20718 +MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
20719 + MachineInstr * MI, MachineBasicBlock * BB) const {
20720 + const TargetInstrInfo * TII = getTargetMachine().getInstrInfo();
20721 + MachineRegisterInfo & MRI = BB->getParent()->getRegInfo();
20722 + MachineBasicBlock::iterator I = MI;
20724 + switch (MI->getOpcode()) {
20726 + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
20727 + case AMDGPU::BRANCH: return BB;
20728 + case AMDGPU::CLAMP_SI:
20729 + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
20730 + .addOperand(MI->getOperand(0))
20731 + .addOperand(MI->getOperand(1))
20732 + // VSRC1-2 are unused, but we still need to fill all the
20733 + // operand slots, so we just reuse the VSRC0 operand
20734 + .addOperand(MI->getOperand(1))
20735 + .addOperand(MI->getOperand(1))
20736 + .addImm(0) // ABS
20737 + .addImm(1) // CLAMP
20738 + .addImm(0) // OMOD
20739 + .addImm(0); // NEG
20740 + MI->eraseFromParent();
20743 + case AMDGPU::FABS_SI:
20744 + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
20745 + .addOperand(MI->getOperand(0))
20746 + .addOperand(MI->getOperand(1))
20747 + // VSRC1-2 are unused, but we still need to fill all the
20748 + // operand slots, so we just reuse the VSRC0 operand
20749 + .addOperand(MI->getOperand(1))
20750 + .addOperand(MI->getOperand(1))
20751 + .addImm(1) // ABS
20752 + .addImm(0) // CLAMP
20753 + .addImm(0) // OMOD
20754 + .addImm(0); // NEG
20755 + MI->eraseFromParent();
20758 + case AMDGPU::FNEG_SI:
20759 + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
20760 + .addOperand(MI->getOperand(0))
20761 + .addOperand(MI->getOperand(1))
20762 + // VSRC1-2 are unused, but we still need to fill all the
20763 + // operand slots, so we just reuse the VSRC0 operand
20764 + .addOperand(MI->getOperand(1))
20765 + .addOperand(MI->getOperand(1))
20766 + .addImm(0) // ABS
20767 + .addImm(0) // CLAMP
20768 + .addImm(0) // OMOD
20769 + .addImm(1); // NEG
20770 + MI->eraseFromParent();
20772 + case AMDGPU::SHADER_TYPE:
20773 + BB->getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType =
20774 + MI->getOperand(0).getImm();
20775 + MI->eraseFromParent();
20778 + case AMDGPU::SI_INTERP:
20779 + LowerSI_INTERP(MI, *BB, I, MRI);
20781 + case AMDGPU::SI_INTERP_CONST:
20782 + LowerSI_INTERP_CONST(MI, *BB, I, MRI);
20784 + case AMDGPU::SI_WQM:
20785 + LowerSI_WQM(MI, *BB, I, MRI);
20787 + case AMDGPU::SI_V_CNDLT:
20788 + LowerSI_V_CNDLT(MI, *BB, I, MRI);
20794 +void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
20795 + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
20796 + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC)
20797 + .addReg(AMDGPU::EXEC);
20799 + MI->eraseFromParent();
20802 +void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
20803 + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
20804 + unsigned tmp = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
20805 + unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass);
20806 + MachineOperand dst = MI->getOperand(0);
20807 + MachineOperand iReg = MI->getOperand(1);
20808 + MachineOperand jReg = MI->getOperand(2);
20809 + MachineOperand attr_chan = MI->getOperand(3);
20810 + MachineOperand attr = MI->getOperand(4);
20811 + MachineOperand params = MI->getOperand(5);
20813 + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0)
20814 + .addOperand(params);
20816 + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P1_F32), tmp)
20817 + .addOperand(iReg)
20818 + .addOperand(attr_chan)
20819 + .addOperand(attr)
20822 + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P2_F32))
20825 + .addOperand(jReg)
20826 + .addOperand(attr_chan)
20827 + .addOperand(attr)
20830 + MI->eraseFromParent();
20833 +void SITargetLowering::LowerSI_INTERP_CONST(MachineInstr *MI,
20834 + MachineBasicBlock &BB, MachineBasicBlock::iterator I,
20835 + MachineRegisterInfo &MRI) const {
20836 + MachineOperand dst = MI->getOperand(0);
20837 + MachineOperand attr_chan = MI->getOperand(1);
20838 + MachineOperand attr = MI->getOperand(2);
20839 + MachineOperand params = MI->getOperand(3);
20840 + unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass);
20842 + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0)
20843 + .addOperand(params);
20845 + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_MOV_F32))
20847 + .addOperand(attr_chan)
20848 + .addOperand(attr)
20851 + MI->eraseFromParent();
20854 +void SITargetLowering::LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB,
20855 + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
20856 + unsigned VCC = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
20858 + BuildMI(BB, I, BB.findDebugLoc(I),
20859 + TII->get(AMDGPU::V_CMP_GT_F32_e32),
20861 + .addReg(AMDGPU::SREG_LIT_0)
20862 + .addOperand(MI->getOperand(1));
20864 + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CNDMASK_B32_e32))
20865 + .addOperand(MI->getOperand(0))
20866 + .addOperand(MI->getOperand(3))
20867 + .addOperand(MI->getOperand(2))
20870 + MI->eraseFromParent();
20873 +EVT SITargetLowering::getSetCCResultType(EVT VT) const {
20877 +//===----------------------------------------------------------------------===//
20878 +// Custom DAG Lowering Operations
20879 +//===----------------------------------------------------------------------===//
20881 +SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
20882 + switch (Op.getOpcode()) {
20883 + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
20884 + case ISD::BRCOND: return LowerBRCOND(Op, DAG);
20885 + case ISD::LOAD: return LowerLOAD(Op, DAG);
20886 + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
20887 + case ISD::AND: return Loweri1ContextSwitch(Op, DAG, ISD::AND);
20888 + case ISD::INTRINSIC_WO_CHAIN: {
20889 + unsigned IntrinsicID =
20890 + cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20891 + EVT VT = Op.getValueType();
20892 + switch (IntrinsicID) {
20893 + case AMDGPUIntrinsic::SI_vs_load_buffer_index:
20894 + return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
20895 + AMDGPU::VGPR0, VT);
20896 + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
20901 + return SDValue();
20904 +/// \brief The function is for lowering i1 operations on the
20907 +/// In the VALU context, VCC is a one bit register, but in the
20908 +/// SALU context the VCC is a 64-bit register (1-bit per thread). Since only
20909 +/// the SALU can perform operations on the VCC register, we need to promote
20910 +/// the operand types from i1 to i64 in order for tablegen to be able to match
20911 +/// this operation to the correct SALU instruction. We do this promotion by
20912 +/// wrapping the operands in a CopyToReg node.
20914 +SDValue SITargetLowering::Loweri1ContextSwitch(SDValue Op,
20915 + SelectionDAG &DAG,
20916 + unsigned VCCNode) const {
20917 + DebugLoc DL = Op.getDebugLoc();
20919 + SDValue OpNode = DAG.getNode(VCCNode, DL, MVT::i64,
20920 + DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64,
20921 + Op.getOperand(0)),
20922 + DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64,
20923 + Op.getOperand(1)));
20925 + return DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i1, OpNode);
20928 +/// \brief Helper function for LowerBRCOND
20929 +static SDNode *findUser(SDValue Value, unsigned Opcode) {
20931 + SDNode *Parent = Value.getNode();
20932 + for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
20935 + if (I.getUse().get() != Value)
20938 + if (I->getOpcode() == Opcode)
20944 +/// This transforms the control flow intrinsics to get the branch destination as
20945 +/// last parameter, also switches branch target with BR if the need arise
20946 +SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
20947 + SelectionDAG &DAG) const {
20949 + DebugLoc DL = BRCOND.getDebugLoc();
20951 + SDNode *Intr = BRCOND.getOperand(1).getNode();
20952 + SDValue Target = BRCOND.getOperand(2);
20955 + if (Intr->getOpcode() == ISD::SETCC) {
20956 + // As long as we negate the condition everything is fine
20957 + SDNode *SetCC = Intr;
20958 + assert(SetCC->getConstantOperandVal(1) == 1);
20960 + CondCodeSDNode *CC = cast<CondCodeSDNode>(SetCC->getOperand(2).getNode());
20961 + assert(CC->get() == ISD::SETNE);
20962 + Intr = SetCC->getOperand(0).getNode();
20965 + // Get the target from BR if we don't negate the condition
20966 + BR = findUser(BRCOND, ISD::BR);
20967 + Target = BR->getOperand(1);
20970 + assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
20972 + // Build the result and
20973 + SmallVector<EVT, 4> Res;
20974 + for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
20975 + Res.push_back(Intr->getValueType(i));
20977 + // operands of the new intrinsic call
20978 + SmallVector<SDValue, 4> Ops;
20979 + Ops.push_back(BRCOND.getOperand(0));
20980 + for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
20981 + Ops.push_back(Intr->getOperand(i));
20982 + Ops.push_back(Target);
20984 + // build the new intrinsic call
20985 + SDNode *Result = DAG.getNode(
20986 + Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
20987 + DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode();
20990 + // Give the branch instruction our target
20991 + SDValue Ops[] = {
20992 + BR->getOperand(0),
20993 + BRCOND.getOperand(2)
20995 + DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2);
20998 + SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
21000 + // Copy the intrinsic results to registers
21001 + for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
21002 + SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
21006 + Chain = DAG.getCopyToReg(
21008 + CopyToReg->getOperand(1),
21009 + SDValue(Result, i - 1),
21012 + DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
21015 + // Remove the old intrinsic from the chain
21016 + DAG.ReplaceAllUsesOfValueWith(
21017 + SDValue(Intr, Intr->getNumValues() - 1),
21018 + Intr->getOperand(0));
21023 +SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
21024 + EVT VT = Op.getValueType();
21025 + LoadSDNode *Ptr = dyn_cast<LoadSDNode>(Op);
21029 + unsigned AddrSpace = Ptr->getPointerInfo().getAddrSpace();
21031 + // We only need to lower USER_SGPR address space loads
21032 + if (AddrSpace != AMDGPUAS::USER_SGPR_ADDRESS) {
21033 + return SDValue();
21036 + // Loads from the USER_SGPR address space can only have constant value
21038 + ConstantSDNode *BasePtr = dyn_cast<ConstantSDNode>(Ptr->getBasePtr());
21041 + unsigned TypeDwordWidth = VT.getSizeInBits() / 32;
21042 + const TargetRegisterClass * dstClass;
21043 + switch (TypeDwordWidth) {
21045 + assert(!"USER_SGPR value size not implemented");
21046 + return SDValue();
21048 + dstClass = &AMDGPU::SReg_32RegClass;
21051 + dstClass = &AMDGPU::SReg_64RegClass;
21054 + uint64_t Index = BasePtr->getZExtValue();
21055 + assert(Index % TypeDwordWidth == 0 && "USER_SGPR not properly aligned");
21056 + unsigned SGPRIndex = Index / TypeDwordWidth;
21057 + unsigned Reg = dstClass->getRegister(SGPRIndex);
21059 + DAG.ReplaceAllUsesOfValueWith(Op, CreateLiveInRegister(DAG, dstClass, Reg,
21061 + return SDValue();
21064 +SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
21065 + SDValue LHS = Op.getOperand(0);
21066 + SDValue RHS = Op.getOperand(1);
21067 + SDValue True = Op.getOperand(2);
21068 + SDValue False = Op.getOperand(3);
21069 + SDValue CC = Op.getOperand(4);
21070 + EVT VT = Op.getValueType();
21071 + DebugLoc DL = Op.getDebugLoc();
21073 + // Possible Min/Max pattern
21074 + SDValue MinMax = LowerMinMax(Op, DAG);
21075 + if (MinMax.getNode()) {
21079 + SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
21080 + return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
21083 +//===----------------------------------------------------------------------===//
21084 +// Custom DAG optimizations
21085 +//===----------------------------------------------------------------------===//
21087 +SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
21088 + DAGCombinerInfo &DCI) const {
21089 + SelectionDAG &DAG = DCI.DAG;
21090 + DebugLoc DL = N->getDebugLoc();
21091 + EVT VT = N->getValueType(0);
21093 + switch (N->getOpcode()) {
21095 + case ISD::SELECT_CC: {
21097 + ConstantSDNode *True, *False;
21098 + // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
21099 + if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
21100 + && (False = dyn_cast<ConstantSDNode>(N->getOperand(3)))
21101 + && True->isAllOnesValue()
21102 + && False->isNullValue()
21103 + && VT == MVT::i1) {
21104 + return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0),
21105 + N->getOperand(1), N->getOperand(4));
21110 + case ISD::SETCC: {
21111 + SDValue Arg0 = N->getOperand(0);
21112 + SDValue Arg1 = N->getOperand(1);
21113 + SDValue CC = N->getOperand(2);
21114 + ConstantSDNode * C = NULL;
21115 + ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get();
21117 + // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
21118 + if (VT == MVT::i1
21119 + && Arg0.getOpcode() == ISD::SIGN_EXTEND
21120 + && Arg0.getOperand(0).getValueType() == MVT::i1
21121 + && (C = dyn_cast<ConstantSDNode>(Arg1))
21122 + && C->isNullValue()
21123 + && CCOp == ISD::SETNE) {
21124 + return SimplifySetCC(VT, Arg0.getOperand(0),
21125 + DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
21130 + return SDValue();
21133 +#define NODE_NAME_CASE(node) case SIISD::node: return #node;
21135 +const char* SITargetLowering::getTargetNodeName(unsigned Opcode) const {
21136 + switch (Opcode) {
21137 + default: return AMDGPUTargetLowering::getTargetNodeName(Opcode);
21138 + NODE_NAME_CASE(VCC_AND)
21139 + NODE_NAME_CASE(VCC_BITCAST)
21142 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIISelLowering.h llvm-r600/lib/Target/R600/SIISelLowering.h
21143 --- llvm-3.2.src/lib/Target/R600/SIISelLowering.h 1970-01-01 01:00:00.000000000 +0100
21144 +++ llvm-r600/lib/Target/R600/SIISelLowering.h 2013-01-25 19:43:57.473383054 +0100
21146 +//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===//
21148 +// The LLVM Compiler Infrastructure
21150 +// This file is distributed under the University of Illinois Open Source
21151 +// License. See LICENSE.TXT for details.
21153 +//===----------------------------------------------------------------------===//
21156 +/// \brief SI DAG Lowering interface definition
21158 +//===----------------------------------------------------------------------===//
21160 +#ifndef SIISELLOWERING_H
21161 +#define SIISELLOWERING_H
21163 +#include "AMDGPUISelLowering.h"
21164 +#include "SIInstrInfo.h"
21168 +class SITargetLowering : public AMDGPUTargetLowering {
21169 + const SIInstrInfo * TII;
21171 + void LowerMOV_IMM(MachineInstr *MI, MachineBasicBlock &BB,
21172 + MachineBasicBlock::iterator I, unsigned Opocde) const;
21173 + void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
21174 + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
21175 + void LowerSI_INTERP_CONST(MachineInstr *MI, MachineBasicBlock &BB,
21176 + MachineBasicBlock::iterator I, MachineRegisterInfo &MRI) const;
21177 + void LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
21178 + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
21179 + void LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB,
21180 + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
21182 + SDValue Loweri1ContextSwitch(SDValue Op, SelectionDAG &DAG,
21183 + unsigned VCCNode) const;
21184 + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
21185 + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
21186 + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
21189 + SITargetLowering(TargetMachine &tm);
21190 + virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
21191 + MachineBasicBlock * BB) const;
21192 + virtual EVT getSetCCResultType(EVT VT) const;
21193 + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
21194 + virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
21195 + virtual const char* getTargetNodeName(unsigned Opcode) const;
21198 +} // End namespace llvm
21200 +#endif //SIISELLOWERING_H
21201 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SILowerControlFlow.cpp llvm-r600/lib/Target/R600/SILowerControlFlow.cpp
21202 --- llvm-3.2.src/lib/Target/R600/SILowerControlFlow.cpp 1970-01-01 01:00:00.000000000 +0100
21203 +++ llvm-r600/lib/Target/R600/SILowerControlFlow.cpp 2013-01-25 19:43:57.480049720 +0100
21205 +//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
21207 +// The LLVM Compiler Infrastructure
21209 +// This file is distributed under the University of Illinois Open Source
21210 +// License. See LICENSE.TXT for details.
21212 +//===----------------------------------------------------------------------===//
21215 +/// \brief This pass lowers the pseudo control flow instructions to real
21216 +/// machine instructions.
21218 +/// All control flow is handled using predicated instructions and
21219 +/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
21220 +/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
21221 +/// by writing to the 64-bit EXEC register (each bit corresponds to a
21222 +/// single vector ALU). Typically, for predicates, a vector ALU will write
21223 +/// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
21224 +/// Vector ALU) and then the ScalarALU will AND the VCC register with the
21225 +/// EXEC to update the predicates.
21228 +/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
21229 +/// %SGPR0 = SI_IF %VCC
21230 +/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
21231 +/// %SGPR0 = SI_ELSE %SGPR0
21232 +/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
21233 +/// SI_END_CF %SGPR0
21237 +/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask
21238 +/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
21239 +/// S_CBRANCH_EXECZ label0 // This instruction is an optional
21240 +/// // optimization which allows us to
21241 +/// // branch if all the bits of
21242 +/// // EXEC are zero.
21243 +/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
21246 +/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC // Restore the exec mask for the Then block
21247 +/// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
21248 +/// S_BRANCH_EXECZ label1 // Use our branch optimization
21249 +/// // instruction again.
21250 +/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block
21252 +/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits
21253 +//===----------------------------------------------------------------------===//
21255 +#include "AMDGPU.h"
21256 +#include "SIInstrInfo.h"
21257 +#include "SIMachineFunctionInfo.h"
21258 +#include "llvm/CodeGen/MachineFunction.h"
21259 +#include "llvm/CodeGen/MachineFunctionPass.h"
21260 +#include "llvm/CodeGen/MachineInstrBuilder.h"
21261 +#include "llvm/CodeGen/MachineRegisterInfo.h"
21263 +using namespace llvm;
21267 +class SILowerControlFlowPass : public MachineFunctionPass {
21270 + static const unsigned SkipThreshold = 12;
21273 + const TargetInstrInfo *TII;
21275 + bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
21277 + void Skip(MachineInstr &From, MachineOperand &To);
21278 + void SkipIfDead(MachineInstr &MI);
21280 + void If(MachineInstr &MI);
21281 + void Else(MachineInstr &MI);
21282 + void Break(MachineInstr &MI);
21283 + void IfBreak(MachineInstr &MI);
21284 + void ElseBreak(MachineInstr &MI);
21285 + void Loop(MachineInstr &MI);
21286 + void EndCf(MachineInstr &MI);
21288 + void Kill(MachineInstr &MI);
21289 + void Branch(MachineInstr &MI);
21292 + SILowerControlFlowPass(TargetMachine &tm) :
21293 + MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
21295 + virtual bool runOnMachineFunction(MachineFunction &MF);
21297 + const char *getPassName() const {
21298 + return "SI Lower control flow instructions";
21303 +} // End anonymous namespace
21305 +char SILowerControlFlowPass::ID = 0;
21307 +FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
21308 + return new SILowerControlFlowPass(tm);
21311 +bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
21312 + MachineBasicBlock *To) {
21314 + unsigned NumInstr = 0;
21316 + for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
21317 + MBB = *MBB->succ_begin()) {
21319 + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
21320 + NumInstr < SkipThreshold && I != E; ++I) {
21322 + if (I->isBundle() || !I->isBundled())
21323 + if (++NumInstr >= SkipThreshold)
21331 +void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
21333 + if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
21336 + DebugLoc DL = From.getDebugLoc();
21337 + BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
21339 + .addReg(AMDGPU::EXEC);
21342 +void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
21344 + MachineBasicBlock &MBB = *MI.getParent();
21345 + DebugLoc DL = MI.getDebugLoc();
21347 + if (!shouldSkip(&MBB, &MBB.getParent()->back()))
21350 + MachineBasicBlock::iterator Insert = &MI;
21353 + // If the exec mask is non-zero, skip the next two instructions
21354 + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
21356 + .addReg(AMDGPU::EXEC);
21358 + // Exec mask is zero: Export to NULL target...
21359 + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
21361 + .addImm(0x09) // V_008DFC_SQ_EXP_NULL
21365 + .addReg(AMDGPU::SREG_LIT_0)
21366 + .addReg(AMDGPU::SREG_LIT_0)
21367 + .addReg(AMDGPU::SREG_LIT_0)
21368 + .addReg(AMDGPU::SREG_LIT_0);
21370 + // ... and terminate wavefront
21371 + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
21374 +void SILowerControlFlowPass::If(MachineInstr &MI) {
21375 + MachineBasicBlock &MBB = *MI.getParent();
21376 + DebugLoc DL = MI.getDebugLoc();
21377 + unsigned Reg = MI.getOperand(0).getReg();
21378 + unsigned Vcc = MI.getOperand(1).getReg();
21380 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
21383 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
21384 + .addReg(AMDGPU::EXEC)
21387 + Skip(MI, MI.getOperand(2));
21389 + MI.eraseFromParent();
21392 +void SILowerControlFlowPass::Else(MachineInstr &MI) {
21393 + MachineBasicBlock &MBB = *MI.getParent();
21394 + DebugLoc DL = MI.getDebugLoc();
21395 + unsigned Dst = MI.getOperand(0).getReg();
21396 + unsigned Src = MI.getOperand(1).getReg();
21398 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
21399 + .addReg(Src); // Saved EXEC
21401 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
21402 + .addReg(AMDGPU::EXEC)
21405 + Skip(MI, MI.getOperand(2));
21407 + MI.eraseFromParent();
21410 +void SILowerControlFlowPass::Break(MachineInstr &MI) {
21411 + MachineBasicBlock &MBB = *MI.getParent();
21412 + DebugLoc DL = MI.getDebugLoc();
21414 + unsigned Dst = MI.getOperand(0).getReg();
21415 + unsigned Src = MI.getOperand(1).getReg();
21417 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
21418 + .addReg(AMDGPU::EXEC)
21421 + MI.eraseFromParent();
21424 +void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
21425 + MachineBasicBlock &MBB = *MI.getParent();
21426 + DebugLoc DL = MI.getDebugLoc();
21428 + unsigned Dst = MI.getOperand(0).getReg();
21429 + unsigned Vcc = MI.getOperand(1).getReg();
21430 + unsigned Src = MI.getOperand(2).getReg();
21432 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
21436 + MI.eraseFromParent();
21439 +void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
21440 + MachineBasicBlock &MBB = *MI.getParent();
21441 + DebugLoc DL = MI.getDebugLoc();
21443 + unsigned Dst = MI.getOperand(0).getReg();
21444 + unsigned Saved = MI.getOperand(1).getReg();
21445 + unsigned Src = MI.getOperand(2).getReg();
21447 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
21451 + MI.eraseFromParent();
21454 +void SILowerControlFlowPass::Loop(MachineInstr &MI) {
21455 + MachineBasicBlock &MBB = *MI.getParent();
21456 + DebugLoc DL = MI.getDebugLoc();
21457 + unsigned Src = MI.getOperand(0).getReg();
21459 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
21460 + .addReg(AMDGPU::EXEC)
21463 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
21464 + .addOperand(MI.getOperand(1))
21465 + .addReg(AMDGPU::EXEC);
21467 + MI.eraseFromParent();
21470 +void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
21471 + MachineBasicBlock &MBB = *MI.getParent();
21472 + DebugLoc DL = MI.getDebugLoc();
21473 + unsigned Reg = MI.getOperand(0).getReg();
21475 + BuildMI(MBB, MBB.getFirstNonPHI(), DL,
21476 + TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
21477 + .addReg(AMDGPU::EXEC)
21480 + MI.eraseFromParent();
21483 +void SILowerControlFlowPass::Branch(MachineInstr &MI) {
21484 + MachineBasicBlock *Next = MI.getParent()->getNextNode();
21485 + MachineBasicBlock *Target = MI.getOperand(0).getMBB();
21486 + if (Target == Next)
21487 + MI.eraseFromParent();
21492 +void SILowerControlFlowPass::Kill(MachineInstr &MI) {
21494 + MachineBasicBlock &MBB = *MI.getParent();
21495 + DebugLoc DL = MI.getDebugLoc();
21497 + // Kill is only allowed in pixel shaders
21498 + MachineFunction &MF = *MBB.getParent();
21499 + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
21500 + assert(Info->ShaderType == ShaderType::PIXEL);
21502 + // Clear this pixel from the exec mask if the operand is negative
21503 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
21504 + .addReg(AMDGPU::SREG_LIT_0)
21505 + .addOperand(MI.getOperand(0));
21507 + MI.eraseFromParent();
21510 +bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
21512 + bool HaveKill = false;
21513 + unsigned Depth = 0;
21515 + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
21516 + BI != BE; ++BI) {
21518 + MachineBasicBlock &MBB = *BI;
21519 + for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
21520 + I != MBB.end(); I = Next) {
21522 + Next = llvm::next(I);
21523 + MachineInstr &MI = *I;
21524 + switch (MI.getOpcode()) {
21526 + case AMDGPU::SI_IF:
21531 + case AMDGPU::SI_ELSE:
21535 + case AMDGPU::SI_BREAK:
21539 + case AMDGPU::SI_IF_BREAK:
21543 + case AMDGPU::SI_ELSE_BREAK:
21547 + case AMDGPU::SI_LOOP:
21552 + case AMDGPU::SI_END_CF:
21553 + if (--Depth == 0 && HaveKill) {
21555 + HaveKill = false;
21560 + case AMDGPU::SI_KILL:
21568 + case AMDGPU::S_BRANCH:
21577 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SILowerLiteralConstants.cpp llvm-r600/lib/Target/R600/SILowerLiteralConstants.cpp
21578 --- llvm-3.2.src/lib/Target/R600/SILowerLiteralConstants.cpp 1970-01-01 01:00:00.000000000 +0100
21579 +++ llvm-r600/lib/Target/R600/SILowerLiteralConstants.cpp 2013-01-25 19:43:57.480049720 +0100
21581 +//===-- SILowerLiteralConstants.cpp - Lower intrs using literal constants--===//
21583 +// The LLVM Compiler Infrastructure
21585 +// This file is distributed under the University of Illinois Open Source
21586 +// License. See LICENSE.TXT for details.
21588 +//===----------------------------------------------------------------------===//
21591 +/// \brief This pass performs the following transformation on instructions with
21592 +/// literal constants:
21594 +/// %VGPR0 = V_MOV_IMM_I32 1
21599 +/// * %VGPR = V_MOV_B32_32 SI_LITERAL_CONSTANT
21600 +/// * SI_LOAD_LITERAL 1
21602 +/// The resulting sequence matches exactly how the hardware handles immediate
21603 +/// operands, so this transformation greatly simplifies the code generator.
21605 +/// Only the *_MOV_IMM_* support immediate operands at the moment, but when
21606 +/// support for immediate operands is added to other instructions, they
21607 +/// will be lowered here as well.
21608 +//===----------------------------------------------------------------------===//
21610 +#include "AMDGPU.h"
21611 +#include "llvm/CodeGen/MachineFunction.h"
21612 +#include "llvm/CodeGen/MachineFunctionPass.h"
21613 +#include "llvm/CodeGen/MachineInstrBuilder.h"
21614 +#include "llvm/CodeGen/MachineInstrBundle.h"
21616 +using namespace llvm;
21620 +class SILowerLiteralConstantsPass : public MachineFunctionPass {
21624 + const TargetInstrInfo *TII;
21627 + SILowerLiteralConstantsPass(TargetMachine &tm) :
21628 + MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
21630 + virtual bool runOnMachineFunction(MachineFunction &MF);
21632 + const char *getPassName() const {
21633 + return "SI Lower literal constants pass";
21637 +} // End anonymous namespace
21639 +char SILowerLiteralConstantsPass::ID = 0;
21641 +FunctionPass *llvm::createSILowerLiteralConstantsPass(TargetMachine &tm) {
21642 + return new SILowerLiteralConstantsPass(tm);
21645 +bool SILowerLiteralConstantsPass::runOnMachineFunction(MachineFunction &MF) {
21646 + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
21647 + BB != BB_E; ++BB) {
21648 + MachineBasicBlock &MBB = *BB;
21649 + for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
21650 + I != MBB.end(); I = Next) {
21651 + Next = llvm::next(I);
21652 + MachineInstr &MI = *I;
21653 + switch (MI.getOpcode()) {
21655 + case AMDGPU::S_MOV_IMM_I32:
21656 + case AMDGPU::S_MOV_IMM_I64:
21657 + case AMDGPU::V_MOV_IMM_F32:
21658 + case AMDGPU::V_MOV_IMM_I32: {
21659 + unsigned MovOpcode;
21660 + unsigned LoadLiteralOpcode;
21661 + MachineOperand LiteralOp = MI.getOperand(1);
21662 + if (AMDGPU::VReg_32RegClass.contains(MI.getOperand(0).getReg())) {
21663 + MovOpcode = AMDGPU::V_MOV_B32_e32;
21665 + MovOpcode = AMDGPU::S_MOV_B32;
21667 + if (LiteralOp.isImm()) {
21668 + LoadLiteralOpcode = AMDGPU::SI_LOAD_LITERAL_I32;
21670 + LoadLiteralOpcode = AMDGPU::SI_LOAD_LITERAL_F32;
21672 + MachineInstr *First =
21673 + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(MovOpcode),
21674 + MI.getOperand(0).getReg())
21675 + .addReg(AMDGPU::SI_LITERAL_CONSTANT);
21676 + MachineInstr *Last =
21677 + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(LoadLiteralOpcode))
21678 + .addOperand(MI.getOperand(1));
21679 + Last->setIsInsideBundle();
21680 + llvm::finalizeBundle(MBB, First, Last);
21681 + MI.eraseFromParent();
21689 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIMachineFunctionInfo.cpp llvm-r600/lib/Target/R600/SIMachineFunctionInfo.cpp
21690 --- llvm-3.2.src/lib/Target/R600/SIMachineFunctionInfo.cpp 1970-01-01 01:00:00.000000000 +0100
21691 +++ llvm-r600/lib/Target/R600/SIMachineFunctionInfo.cpp 2013-01-25 19:43:57.480049720 +0100
21693 +//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===//
21695 +// The LLVM Compiler Infrastructure
21697 +// This file is distributed under the University of Illinois Open Source
21698 +// License. See LICENSE.TXT for details.
21701 +//===----------------------------------------------------------------------===//
21704 +#include "SIMachineFunctionInfo.h"
21706 +using namespace llvm;
21708 +SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
21709 + : MachineFunctionInfo(),
21710 + SPIPSInputAddr(0),
21713 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIMachineFunctionInfo.h llvm-r600/lib/Target/R600/SIMachineFunctionInfo.h
21714 --- llvm-3.2.src/lib/Target/R600/SIMachineFunctionInfo.h 1970-01-01 01:00:00.000000000 +0100
21715 +++ llvm-r600/lib/Target/R600/SIMachineFunctionInfo.h 2013-01-25 19:43:57.480049720 +0100
21717 +//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==//
21719 +// The LLVM Compiler Infrastructure
21721 +// This file is distributed under the University of Illinois Open Source
21722 +// License. See LICENSE.TXT for details.
21724 +//===----------------------------------------------------------------------===//
21728 +//===----------------------------------------------------------------------===//
21731 +#ifndef SIMACHINEFUNCTIONINFO_H_
21732 +#define SIMACHINEFUNCTIONINFO_H_
21734 +#include "llvm/CodeGen/MachineFunction.h"
21738 +/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
21739 +/// tells the hardware which interpolation parameters to load.
21740 +class SIMachineFunctionInfo : public MachineFunctionInfo {
21742 + SIMachineFunctionInfo(const MachineFunction &MF);
21743 + unsigned SPIPSInputAddr;
21744 + unsigned ShaderType;
21747 +} // End namespace llvm
21750 +#endif // SIMACHINEFUNCTIONINFO_H_
21751 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIRegisterInfo.cpp llvm-r600/lib/Target/R600/SIRegisterInfo.cpp
21752 --- llvm-3.2.src/lib/Target/R600/SIRegisterInfo.cpp 1970-01-01 01:00:00.000000000 +0100
21753 +++ llvm-r600/lib/Target/R600/SIRegisterInfo.cpp 2013-01-25 19:43:57.480049720 +0100
21755 +//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
21757 +// The LLVM Compiler Infrastructure
21759 +// This file is distributed under the University of Illinois Open Source
21760 +// License. See LICENSE.TXT for details.
21762 +//===----------------------------------------------------------------------===//
21765 +/// \brief SI implementation of the TargetRegisterInfo class.
21767 +//===----------------------------------------------------------------------===//
21770 +#include "SIRegisterInfo.h"
21771 +#include "AMDGPUTargetMachine.h"
21773 +using namespace llvm;
21775 +SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm,
21776 + const TargetInstrInfo &tii)
21777 +: AMDGPURegisterInfo(tm, tii),
21782 +BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
21783 + BitVector Reserved(getNumRegs());
21787 +const TargetRegisterClass *
21788 +SIRegisterInfo::getISARegClass(const TargetRegisterClass * rc) const {
21789 + switch (rc->getID()) {
21790 + case AMDGPU::GPRF32RegClassID:
21791 + return &AMDGPU::VReg_32RegClass;
21792 + default: return rc;
21796 +const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
21798 + switch(VT.SimpleTy) {
21800 + case MVT::i32: return &AMDGPU::VReg_32RegClass;
21803 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIRegisterInfo.h llvm-r600/lib/Target/R600/SIRegisterInfo.h
21804 --- llvm-3.2.src/lib/Target/R600/SIRegisterInfo.h 1970-01-01 01:00:00.000000000 +0100
21805 +++ llvm-r600/lib/Target/R600/SIRegisterInfo.h 2013-01-25 19:43:57.483383054 +0100
21807 +//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===//
21809 +// The LLVM Compiler Infrastructure
21811 +// This file is distributed under the University of Illinois Open Source
21812 +// License. See LICENSE.TXT for details.
21814 +//===----------------------------------------------------------------------===//
21817 +/// \brief Interface definition for SIRegisterInfo
21819 +//===----------------------------------------------------------------------===//
21822 +#ifndef SIREGISTERINFO_H_
21823 +#define SIREGISTERINFO_H_
21825 +#include "AMDGPURegisterInfo.h"
21829 +class AMDGPUTargetMachine;
21830 +class TargetInstrInfo;
21832 +struct SIRegisterInfo : public AMDGPURegisterInfo {
21833 + AMDGPUTargetMachine &TM;
21834 + const TargetInstrInfo &TII;
21836 + SIRegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
21838 + virtual BitVector getReservedRegs(const MachineFunction &MF) const;
21840 + /// \param RC is an AMDIL reg class.
21842 + /// \returns the SI register class that is equivalent to \p RC.
21843 + virtual const TargetRegisterClass *
21844 + getISARegClass(const TargetRegisterClass *RC) const;
21846 + /// \brief get the register class of the specified type to use in the
21847 + /// CFGStructurizer
21848 + virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
21851 +} // End namespace llvm
21853 +#endif // SIREGISTERINFO_H_
21854 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SIRegisterInfo.td llvm-r600/lib/Target/R600/SIRegisterInfo.td
21855 --- llvm-3.2.src/lib/Target/R600/SIRegisterInfo.td 1970-01-01 01:00:00.000000000 +0100
21856 +++ llvm-r600/lib/Target/R600/SIRegisterInfo.td 2013-01-25 19:43:57.483383054 +0100
21859 +let Namespace = "AMDGPU" in {
21860 + def low : SubRegIndex;
21861 + def high : SubRegIndex;
21863 + def sub0 : SubRegIndex;
21864 + def sub1 : SubRegIndex;
21865 + def sub2 : SubRegIndex;
21866 + def sub3 : SubRegIndex;
21867 + def sub4 : SubRegIndex;
21868 + def sub5 : SubRegIndex;
21869 + def sub6 : SubRegIndex;
21870 + def sub7 : SubRegIndex;
21873 +class SIReg <string n, bits<16> encoding = 0> : Register<n> {
21874 + let Namespace = "AMDGPU";
21875 + let HWEncoding = encoding;
21878 +class SI_64 <string n, list<Register> subregs, bits<16> encoding> : RegisterWithSubRegs<n, subregs> {
21879 + let Namespace = "AMDGPU";
21880 + let SubRegIndices = [low, high];
21881 + let HWEncoding = encoding;
21884 +class SGPR_32 <bits<16> num, string name> : SIReg<name, num>;
21886 +class VGPR_32 <bits<16> num, string name> : SIReg<name, num>;
21888 +// Special Registers
21889 +def VCC : SIReg<"VCC", 106>;
21890 +def EXEC_LO : SIReg <"EXEC LO", 126>;
21891 +def EXEC_HI : SIReg <"EXEC HI", 127>;
21892 +def EXEC : SI_64<"EXEC", [EXEC_LO, EXEC_HI], 126>;
21893 +def SCC : SIReg<"SCC", 253>;
21894 +def SREG_LIT_0 : SIReg <"S LIT 0", 128>;
21895 +def SI_LITERAL_CONSTANT : SIReg<"LITERAL CONSTANT", 255>;
21896 +def M0 : SIReg <"M0", 124>;
21898 +//Interpolation registers
21899 +def PERSP_SAMPLE_I : SIReg <"PERSP_SAMPLE_I">;
21900 +def PERSP_SAMPLE_J : SIReg <"PERSP_SAMPLE_J">;
21901 +def PERSP_CENTER_I : SIReg <"PERSP_CENTER_I">;
21902 +def PERSP_CENTER_J : SIReg <"PERSP_CENTER_J">;
21903 +def PERSP_CENTROID_I : SIReg <"PERSP_CENTROID_I">;
21904 +def PERSP_CENTROID_J : SIReg <"PERP_CENTROID_J">;
21905 +def PERSP_I_W : SIReg <"PERSP_I_W">;
21906 +def PERSP_J_W : SIReg <"PERSP_J_W">;
21907 +def PERSP_1_W : SIReg <"PERSP_1_W">;
21908 +def LINEAR_SAMPLE_I : SIReg <"LINEAR_SAMPLE_I">;
21909 +def LINEAR_SAMPLE_J : SIReg <"LINEAR_SAMPLE_J">;
21910 +def LINEAR_CENTER_I : SIReg <"LINEAR_CENTER_I">;
21911 +def LINEAR_CENTER_J : SIReg <"LINEAR_CENTER_J">;
21912 +def LINEAR_CENTROID_I : SIReg <"LINEAR_CENTROID_I">;
21913 +def LINEAR_CENTROID_J : SIReg <"LINEAR_CENTROID_J">;
21914 +def LINE_STIPPLE_TEX_COORD : SIReg <"LINE_STIPPLE_TEX_COORD">;
21915 +def POS_X_FLOAT : SIReg <"POS_X_FLOAT">;
21916 +def POS_Y_FLOAT : SIReg <"POS_Y_FLOAT">;
21917 +def POS_Z_FLOAT : SIReg <"POS_Z_FLOAT">;
21918 +def POS_W_FLOAT : SIReg <"POS_W_FLOAT">;
21919 +def FRONT_FACE : SIReg <"FRONT_FACE">;
21920 +def ANCILLARY : SIReg <"ANCILLARY">;
21921 +def SAMPLE_COVERAGE : SIReg <"SAMPLE_COVERAGE">;
21922 +def POS_FIXED_PT : SIReg <"POS_FIXED_PT">;
21924 +// SGPR 32-bit registers
21925 +foreach Index = 0-101 in {
21926 + def SGPR#Index : SGPR_32 <Index, "SGPR"#Index>;
21929 +def SGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
21930 + (add (sequence "SGPR%u", 0, 101))>;
21932 +// SGPR 64-bit registers
21933 +def SGPR_64 : RegisterTuples<[low, high],
21934 + [(add (decimate SGPR_32, 2)),
21935 + (add(decimate (rotl SGPR_32, 1), 2))]>;
21937 +// SGPR 128-bit registers
21938 +def SGPR_128 : RegisterTuples<[sel_x, sel_y, sel_z, sel_w],
21939 + [(add (decimate SGPR_32, 4)),
21940 + (add (decimate (rotl SGPR_32, 1), 4)),
21941 + (add (decimate (rotl SGPR_32, 2), 4)),
21942 + (add (decimate (rotl SGPR_32, 3), 4))]>;
21944 +// SGPR 256-bit registers
21945 +def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
21946 + [(add (decimate SGPR_32, 8)),
21947 + (add (decimate (rotl SGPR_32, 1), 8)),
21948 + (add (decimate (rotl SGPR_32, 2), 8)),
21949 + (add (decimate (rotl SGPR_32, 3), 8)),
21950 + (add (decimate (rotl SGPR_32, 4), 8)),
21951 + (add (decimate (rotl SGPR_32, 5), 8)),
21952 + (add (decimate (rotl SGPR_32, 6), 8)),
21953 + (add (decimate (rotl SGPR_32, 7), 8))]>;
21955 +// VGPR 32-bit registers
21956 +foreach Index = 0-255 in {
21957 + def VGPR#Index : VGPR_32 <Index, "VGPR"#Index>;
21960 +def VGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
21961 + (add (sequence "VGPR%u", 0, 255))>;
21963 +// VGPR 64-bit registers
21964 +def VGPR_64 : RegisterTuples<[low, high],
21966 + (add (rotl VGPR_32, 1))]>;
21968 +// VGPR 128-bit registers
21969 +def VGPR_128 : RegisterTuples<[sel_x, sel_y, sel_z, sel_w],
21971 + (add (rotl VGPR_32, 1)),
21972 + (add (rotl VGPR_32, 2)),
21973 + (add (rotl VGPR_32, 3))]>;
21975 +// Register class for all scalar registers (SGPRs + Special Registers)
21976 +def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
21977 + (add SGPR_32, SREG_LIT_0, M0, EXEC_LO, EXEC_HI)
21980 +def SReg_64 : RegisterClass<"AMDGPU", [i64], 64, (add SGPR_64, VCC, EXEC)>;
21982 +def SReg_1 : RegisterClass<"AMDGPU", [i1], 1, (add VCC, SGPR_64, EXEC)>;
21984 +def SReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add SGPR_128)>;
21986 +def SReg_256 : RegisterClass<"AMDGPU", [v8i32], 256, (add SGPR_256)>;
21988 +// Register class for all vector registers (VGPRs + Interpolation Registers)
21989 +def VReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
21991 + PERSP_SAMPLE_I, PERSP_SAMPLE_J,
21992 + PERSP_CENTER_I, PERSP_CENTER_J,
21993 + PERSP_CENTROID_I, PERSP_CENTROID_J,
21994 + PERSP_I_W, PERSP_J_W, PERSP_1_W,
21995 + LINEAR_SAMPLE_I, LINEAR_SAMPLE_J,
21996 + LINEAR_CENTER_I, LINEAR_CENTER_J,
21997 + LINEAR_CENTROID_I, LINEAR_CENTROID_J,
21998 + LINE_STIPPLE_TEX_COORD,
22010 +def VReg_64 : RegisterClass<"AMDGPU", [i64], 64, (add VGPR_64)>;
22012 +def VReg_128 : RegisterClass<"AMDGPU", [v4f32], 128, (add VGPR_128)>;
22014 +// AllReg_* - A set of all scalar and vector registers of a given width.
22015 +def AllReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, (add VReg_32, SReg_32)>;
22017 +def AllReg_64 : RegisterClass<"AMDGPU", [f64, i64], 64, (add SReg_64, VReg_64)>;
22019 +// Special register classes for predicates and the M0 register
22020 +def SCCReg : RegisterClass<"AMDGPU", [i1], 1, (add SCC)>;
22021 +def VCCReg : RegisterClass<"AMDGPU", [i1], 1, (add VCC)>;
22022 +def EXECReg : RegisterClass<"AMDGPU", [i1], 1, (add EXEC)>;
22023 +def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>;
22025 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/SISchedule.td llvm-r600/lib/Target/R600/SISchedule.td
22026 --- llvm-3.2.src/lib/Target/R600/SISchedule.td 1970-01-01 01:00:00.000000000 +0100
22027 +++ llvm-r600/lib/Target/R600/SISchedule.td 2013-01-25 19:43:57.483383054 +0100
22029 +//===-- SISchedule.td - SI Scheduling definitions ------------------------===//
22031 +// The LLVM Compiler Infrastructure
22033 +// This file is distributed under the University of Illinois Open Source
22034 +// License. See LICENSE.TXT for details.
22036 +//===----------------------------------------------------------------------===//
22038 +// TODO: This is just a placeholder for now.
22040 +//===----------------------------------------------------------------------===//
22043 +def SI_Itin : ProcessorItineraries <[], [], []>;
22044 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp llvm-r600/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
22045 --- llvm-3.2.src/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp 1970-01-01 01:00:00.000000000 +0100
22046 +++ llvm-r600/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp 2013-01-25 19:43:57.483383054 +0100
22048 +//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===//
22050 +// The LLVM Compiler Infrastructure
22052 +// This file is distributed under the University of Illinois Open Source
22053 +// License. See LICENSE.TXT for details.
22055 +//===----------------------------------------------------------------------===//
22059 +//===----------------------------------------------------------------------===//
22061 +#include "AMDGPU.h"
22062 +#include "llvm/Support/TargetRegistry.h"
22064 +using namespace llvm;
22066 +/// \brief The target for the AMDGPU backend
22067 +Target llvm::TheAMDGPUTarget;
22069 +/// \brief Extern function to initialize the targets for the AMDGPU backend
22070 +extern "C" void LLVMInitializeR600TargetInfo() {
22071 + RegisterTarget<Triple::r600, false>
22072 + R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX");
22074 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/TargetInfo/CMakeLists.txt llvm-r600/lib/Target/R600/TargetInfo/CMakeLists.txt
22075 --- llvm-3.2.src/lib/Target/R600/TargetInfo/CMakeLists.txt 1970-01-01 01:00:00.000000000 +0100
22076 +++ llvm-r600/lib/Target/R600/TargetInfo/CMakeLists.txt 2013-01-25 19:43:57.483383054 +0100
22078 +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
22080 +add_llvm_library(LLVMR600Info
22081 + AMDGPUTargetInfo.cpp
22084 +add_dependencies(LLVMR600Info AMDGPUCommonTableGen intrinsics_gen)
22085 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/TargetInfo/LLVMBuild.txt llvm-r600/lib/Target/R600/TargetInfo/LLVMBuild.txt
22086 --- llvm-3.2.src/lib/Target/R600/TargetInfo/LLVMBuild.txt 1970-01-01 01:00:00.000000000 +0100
22087 +++ llvm-r600/lib/Target/R600/TargetInfo/LLVMBuild.txt 2013-01-25 19:43:57.483383054 +0100
22089 +;===- ./lib/Target/R600/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===;
22091 +; The LLVM Compiler Infrastructure
22093 +; This file is distributed under the University of Illinois Open Source
22094 +; License. See LICENSE.TXT for details.
22096 +;===------------------------------------------------------------------------===;
22098 +; This is an LLVMBuild description file for the components in this subdirectory.
22100 +; For more information on the LLVMBuild system, please see:
22102 +; http://llvm.org/docs/LLVMBuild.html
22104 +;===------------------------------------------------------------------------===;
22110 +required_libraries = MC Support
22111 +add_to_library_groups = R600
22112 diff -Nur -x .git llvm-3.2.src/lib/Target/R600/TargetInfo/Makefile llvm-r600/lib/Target/R600/TargetInfo/Makefile
22113 --- llvm-3.2.src/lib/Target/R600/TargetInfo/Makefile 1970-01-01 01:00:00.000000000 +0100
22114 +++ llvm-r600/lib/Target/R600/TargetInfo/Makefile 2013-01-25 19:43:57.483383054 +0100
22116 +##===- lib/Target/AMDGPU/TargetInfo/Makefile ----------------*- Makefile -*-===##
22118 +# The LLVM Compiler Infrastructure
22120 +# This file is distributed under the University of Illinois Open Source
22121 +# License. See LICENSE.TXT for details.
22123 +##===----------------------------------------------------------------------===##
22124 +LEVEL = ../../../..
22125 +LIBRARYNAME = LLVMR600Info
22127 +# Hack: we need to include 'main' target directory to grab private headers
22128 +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
22130 +include $(LEVEL)/Makefile.common
22131 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/add.v4i32.ll llvm-r600/test/CodeGen/R600/add.v4i32.ll
22132 --- llvm-3.2.src/test/CodeGen/R600/add.v4i32.ll 1970-01-01 01:00:00.000000000 +0100
22133 +++ llvm-r600/test/CodeGen/R600/add.v4i32.ll 2013-01-25 19:43:58.460049700 +0100
22135 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22137 +;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22138 +;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22139 +;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22140 +;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22142 +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
22143 + %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
22144 + %a = load <4 x i32> addrspace(1) * %in
22145 + %b = load <4 x i32> addrspace(1) * %b_ptr
22146 + %result = add <4 x i32> %a, %b
22147 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out
22150 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/and.v4i32.ll llvm-r600/test/CodeGen/R600/and.v4i32.ll
22151 --- llvm-3.2.src/test/CodeGen/R600/and.v4i32.ll 1970-01-01 01:00:00.000000000 +0100
22152 +++ llvm-r600/test/CodeGen/R600/and.v4i32.ll 2013-01-25 19:43:58.460049700 +0100
22154 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22156 +;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22157 +;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22158 +;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22159 +;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22161 +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
22162 + %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
22163 + %a = load <4 x i32> addrspace(1) * %in
22164 + %b = load <4 x i32> addrspace(1) * %b_ptr
22165 + %result = and <4 x i32> %a, %b
22166 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out
22169 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll llvm-r600/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll
22170 --- llvm-3.2.src/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll 1970-01-01 01:00:00.000000000 +0100
22171 +++ llvm-r600/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll 2013-01-25 19:43:58.460049700 +0100
22173 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22175 +;CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22177 +; This test is for a bug in
22178 +; DAGCombiner::reduceBuildVecConvertToConvertBuildVec() where
22179 +; the wrong type was being passed to
22180 +; TargetLowering::getOperationAction() when checking the legality of
22181 +; ISD::UINT_TO_FP and ISD::SINT_TO_FP opcodes.
22183 +define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
22185 + %ptr = getelementptr i32 addrspace(1)* %in, i32 1
22186 + %sint = load i32 addrspace(1) * %in
22187 + %conv = sitofp i32 %sint to float
22188 + %0 = insertelement <4 x float> undef, float %conv, i32 0
22189 + %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
22190 + store <4 x float> %splat, <4 x float> addrspace(1)* %out
22194 +;CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22196 +define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
22198 + %ptr = getelementptr i32 addrspace(1)* %in, i32 1
22199 + %uint = load i32 addrspace(1) * %in
22200 + %conv = uitofp i32 %uint to float
22201 + %0 = insertelement <4 x float> undef, float %conv, i32 0
22202 + %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
22203 + store <4 x float> %splat, <4 x float> addrspace(1)* %out
22206 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fabs.ll llvm-r600/test/CodeGen/R600/fabs.ll
22207 --- llvm-3.2.src/test/CodeGen/R600/fabs.ll 1970-01-01 01:00:00.000000000 +0100
22208 +++ llvm-r600/test/CodeGen/R600/fabs.ll 2013-01-25 19:43:58.460049700 +0100
22210 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22212 +;CHECK: MOV T{{[0-9]+\.[XYZW], \|T[0-9]+\.[XYZW]\|}}
22214 +define void @test() {
22215 + %r0 = call float @llvm.R600.load.input(i32 0)
22216 + %r1 = call float @fabs( float %r0)
22217 + call void @llvm.AMDGPU.store.output(float %r1, i32 0)
22221 +declare float @llvm.R600.load.input(i32) readnone
22223 +declare void @llvm.AMDGPU.store.output(float, i32)
22225 +declare float @fabs(float ) readnone
22226 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fadd.ll llvm-r600/test/CodeGen/R600/fadd.ll
22227 --- llvm-3.2.src/test/CodeGen/R600/fadd.ll 1970-01-01 01:00:00.000000000 +0100
22228 +++ llvm-r600/test/CodeGen/R600/fadd.ll 2013-01-25 19:43:58.460049700 +0100
22230 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22232 +; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22234 +define void @test() {
22235 + %r0 = call float @llvm.R600.load.input(i32 0)
22236 + %r1 = call float @llvm.R600.load.input(i32 1)
22237 + %r2 = fadd float %r0, %r1
22238 + call void @llvm.AMDGPU.store.output(float %r2, i32 0)
22242 +declare float @llvm.R600.load.input(i32) readnone
22244 +declare void @llvm.AMDGPU.store.output(float, i32)
22246 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fadd.v4f32.ll llvm-r600/test/CodeGen/R600/fadd.v4f32.ll
22247 --- llvm-3.2.src/test/CodeGen/R600/fadd.v4f32.ll 1970-01-01 01:00:00.000000000 +0100
22248 +++ llvm-r600/test/CodeGen/R600/fadd.v4f32.ll 2013-01-25 19:43:58.460049700 +0100
22250 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22252 +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22253 +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22254 +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22255 +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22257 +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
22258 + %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
22259 + %a = load <4 x float> addrspace(1) * %in
22260 + %b = load <4 x float> addrspace(1) * %b_ptr
22261 + %result = fadd <4 x float> %a, %b
22262 + store <4 x float> %result, <4 x float> addrspace(1)* %out
22265 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fcmp-cnde-int-args.ll llvm-r600/test/CodeGen/R600/fcmp-cnde-int-args.ll
22266 --- llvm-3.2.src/test/CodeGen/R600/fcmp-cnde-int-args.ll 1970-01-01 01:00:00.000000000 +0100
22267 +++ llvm-r600/test/CodeGen/R600/fcmp-cnde-int-args.ll 2013-01-25 19:43:58.460049700 +0100
22269 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22271 +; This test checks a bug in R600TargetLowering::LowerSELECT_CC where the
22272 +; chance to optimize the fcmp + select instructions to CNDE was missed
22273 +; due to the fact that the operands to fcmp and select had different types
22275 +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, 0.0, -1}}
22277 +define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
22279 + %0 = load float addrspace(1)* %in
22280 + %cmp = fcmp oeq float %0, 0.000000e+00
22281 + %value = select i1 %cmp, i32 -1, i32 0
22282 + store i32 %value, i32 addrspace(1)* %out
22285 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fcmp-cnd.ll llvm-r600/test/CodeGen/R600/fcmp-cnd.ll
22286 --- llvm-3.2.src/test/CodeGen/R600/fcmp-cnd.ll 1970-01-01 01:00:00.000000000 +0100
22287 +++ llvm-r600/test/CodeGen/R600/fcmp-cnd.ll 2013-01-25 19:43:58.460049700 +0100
22289 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22291 +;Not checking arguments 2 and 3 to CNDE, because they may change between
22292 +;registers and literal.x depending on what the optimizer does.
22293 +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22295 +define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
22297 + %0 = load float addrspace(1)* %in
22298 + %cmp = fcmp oeq float %0, 0.000000e+00
22299 + %value = select i1 %cmp, i32 2, i32 3
22300 + store i32 %value, i32 addrspace(1)* %out
22303 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fcmp.ll llvm-r600/test/CodeGen/R600/fcmp.ll
22304 --- llvm-3.2.src/test/CodeGen/R600/fcmp.ll 1970-01-01 01:00:00.000000000 +0100
22305 +++ llvm-r600/test/CodeGen/R600/fcmp.ll 2013-01-25 19:43:58.460049700 +0100
22307 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22309 +;CHECK: SETE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22310 +;CHECK: MOV T{{[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
22311 +;CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22313 +define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
22315 + %0 = load float addrspace(1)* %in
22316 + %arrayidx1 = getelementptr inbounds float addrspace(1)* %in, i32 1
22317 + %1 = load float addrspace(1)* %arrayidx1
22318 + %cmp = fcmp oeq float %0, %1
22319 + %sext = sext i1 %cmp to i32
22320 + store i32 %sext, i32 addrspace(1)* %out
22323 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fdiv.v4f32.ll llvm-r600/test/CodeGen/R600/fdiv.v4f32.ll
22324 --- llvm-3.2.src/test/CodeGen/R600/fdiv.v4f32.ll 1970-01-01 01:00:00.000000000 +0100
22325 +++ llvm-r600/test/CodeGen/R600/fdiv.v4f32.ll 2013-01-25 19:43:58.460049700 +0100
22327 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22329 +;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22330 +;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22331 +;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22332 +;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22333 +;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22334 +;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22335 +;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22336 +;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22338 +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
22339 + %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
22340 + %a = load <4 x float> addrspace(1) * %in
22341 + %b = load <4 x float> addrspace(1) * %b_ptr
22342 + %result = fdiv <4 x float> %a, %b
22343 + store <4 x float> %result, <4 x float> addrspace(1)* %out
22346 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/floor.ll llvm-r600/test/CodeGen/R600/floor.ll
22347 --- llvm-3.2.src/test/CodeGen/R600/floor.ll 1970-01-01 01:00:00.000000000 +0100
22348 +++ llvm-r600/test/CodeGen/R600/floor.ll 2013-01-25 19:43:58.463383033 +0100
22350 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22352 +;CHECK: FLOOR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22354 +define void @test() {
22355 + %r0 = call float @llvm.R600.load.input(i32 0)
22356 + %r1 = call float @floor(float %r0)
22357 + call void @llvm.AMDGPU.store.output(float %r1, i32 0)
22361 +declare float @llvm.R600.load.input(i32) readnone
22363 +declare void @llvm.AMDGPU.store.output(float, i32)
22365 +declare float @floor(float) readonly
22366 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fmax.ll llvm-r600/test/CodeGen/R600/fmax.ll
22367 --- llvm-3.2.src/test/CodeGen/R600/fmax.ll 1970-01-01 01:00:00.000000000 +0100
22368 +++ llvm-r600/test/CodeGen/R600/fmax.ll 2013-01-25 19:43:58.463383033 +0100
22370 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22372 +;CHECK: MAX T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22374 +define void @test() {
22375 + %r0 = call float @llvm.R600.load.input(i32 0)
22376 + %r1 = call float @llvm.R600.load.input(i32 1)
22377 + %r2 = fcmp uge float %r0, %r1
22378 + %r3 = select i1 %r2, float %r0, float %r1
22379 + call void @llvm.AMDGPU.store.output(float %r3, i32 0)
22383 +declare float @llvm.R600.load.input(i32) readnone
22385 +declare void @llvm.AMDGPU.store.output(float, i32)
22386 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fmin.ll llvm-r600/test/CodeGen/R600/fmin.ll
22387 --- llvm-3.2.src/test/CodeGen/R600/fmin.ll 1970-01-01 01:00:00.000000000 +0100
22388 +++ llvm-r600/test/CodeGen/R600/fmin.ll 2013-01-25 19:43:58.463383033 +0100
22390 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22392 +;CHECK: MIN T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22394 +define void @test() {
22395 + %r0 = call float @llvm.R600.load.input(i32 0)
22396 + %r1 = call float @llvm.R600.load.input(i32 1)
22397 + %r2 = fcmp uge float %r0, %r1
22398 + %r3 = select i1 %r2, float %r1, float %r0
22399 + call void @llvm.AMDGPU.store.output(float %r3, i32 0)
22403 +declare float @llvm.R600.load.input(i32) readnone
22405 +declare void @llvm.AMDGPU.store.output(float, i32)
22406 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fmul.ll llvm-r600/test/CodeGen/R600/fmul.ll
22407 --- llvm-3.2.src/test/CodeGen/R600/fmul.ll 1970-01-01 01:00:00.000000000 +0100
22408 +++ llvm-r600/test/CodeGen/R600/fmul.ll 2013-01-25 19:43:58.463383033 +0100
22410 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22412 +; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22414 +define void @test() {
22415 + %r0 = call float @llvm.R600.load.input(i32 0)
22416 + %r1 = call float @llvm.R600.load.input(i32 1)
22417 + %r2 = fmul float %r0, %r1
22418 + call void @llvm.AMDGPU.store.output(float %r2, i32 0)
22422 +declare float @llvm.R600.load.input(i32) readnone
22424 +declare void @llvm.AMDGPU.store.output(float, i32)
22426 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fmul.v4f32.ll llvm-r600/test/CodeGen/R600/fmul.v4f32.ll
22427 --- llvm-3.2.src/test/CodeGen/R600/fmul.v4f32.ll 1970-01-01 01:00:00.000000000 +0100
22428 +++ llvm-r600/test/CodeGen/R600/fmul.v4f32.ll 2013-01-25 19:43:58.463383033 +0100
22430 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22432 +;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22433 +;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22434 +;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22435 +;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22437 +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
22438 + %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
22439 + %a = load <4 x float> addrspace(1) * %in
22440 + %b = load <4 x float> addrspace(1) * %b_ptr
22441 + %result = fmul <4 x float> %a, %b
22442 + store <4 x float> %result, <4 x float> addrspace(1)* %out
22445 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fsub.ll llvm-r600/test/CodeGen/R600/fsub.ll
22446 --- llvm-3.2.src/test/CodeGen/R600/fsub.ll 1970-01-01 01:00:00.000000000 +0100
22447 +++ llvm-r600/test/CodeGen/R600/fsub.ll 2013-01-25 19:43:58.463383033 +0100
22449 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22451 +; CHECK: MOV T{{[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
22452 +; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22454 +define void @test() {
22455 + %r0 = call float @llvm.R600.load.input(i32 0)
22456 + %r1 = call float @llvm.R600.load.input(i32 1)
22457 + %r2 = fsub float %r0, %r1
22458 + call void @llvm.AMDGPU.store.output(float %r2, i32 0)
22462 +declare float @llvm.R600.load.input(i32) readnone
22464 +declare void @llvm.AMDGPU.store.output(float, i32)
22466 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/fsub.v4f32.ll llvm-r600/test/CodeGen/R600/fsub.v4f32.ll
22467 --- llvm-3.2.src/test/CodeGen/R600/fsub.v4f32.ll 1970-01-01 01:00:00.000000000 +0100
22468 +++ llvm-r600/test/CodeGen/R600/fsub.v4f32.ll 2013-01-25 19:43:58.463383033 +0100
22470 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22472 +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22473 +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22474 +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22475 +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22477 +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
22478 + %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
22479 + %a = load <4 x float> addrspace(1) * %in
22480 + %b = load <4 x float> addrspace(1) * %b_ptr
22481 + %result = fsub <4 x float> %a, %b
22482 + store <4 x float> %result, <4 x float> addrspace(1)* %out
22485 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/i8_to_double_to_float.ll llvm-r600/test/CodeGen/R600/i8_to_double_to_float.ll
22486 --- llvm-3.2.src/test/CodeGen/R600/i8_to_double_to_float.ll 1970-01-01 01:00:00.000000000 +0100
22487 +++ llvm-r600/test/CodeGen/R600/i8_to_double_to_float.ll 2013-01-25 19:43:58.463383033 +0100
22489 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22491 +;CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22493 +define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) {
22494 + %1 = load i8 addrspace(1)* %in
22495 + %2 = uitofp i8 %1 to double
22496 + %3 = fptrunc double %2 to float
22497 + store float %3, float addrspace(1)* %out
22500 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/icmp-select-sete-reverse-args.ll llvm-r600/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
22501 --- llvm-3.2.src/test/CodeGen/R600/icmp-select-sete-reverse-args.ll 1970-01-01 01:00:00.000000000 +0100
22502 +++ llvm-r600/test/CodeGen/R600/icmp-select-sete-reverse-args.ll 2013-01-25 19:43:58.463383033 +0100
22504 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22506 +;Test that a select with reversed True/False values is correctly lowered
22507 +;to a SETNE_INT. There should only be one SETNE_INT instruction.
22509 +;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22510 +;CHECK_NOT: SETNE_INT
22512 +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
22514 + %0 = load i32 addrspace(1)* %in
22515 + %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %in, i32 1
22516 + %1 = load i32 addrspace(1)* %arrayidx1
22517 + %cmp = icmp eq i32 %0, %1
22518 + %value = select i1 %cmp, i32 0, i32 -1
22519 + store i32 %value, i32 addrspace(1)* %out
22522 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/literals.ll llvm-r600/test/CodeGen/R600/literals.ll
22523 --- llvm-3.2.src/test/CodeGen/R600/literals.ll 1970-01-01 01:00:00.000000000 +0100
22524 +++ llvm-r600/test/CodeGen/R600/literals.ll 2013-01-25 19:43:58.463383033 +0100
22526 +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22528 +; Test using an integer literal constant.
22529 +; Generated ASM should be:
22530 +; ADD_INT REG literal.x, 5
22532 +; ADD_INT literal.x REG, 5
22534 +; CHECK: ADD_INT {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} 5
22535 +define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
22537 + %0 = add i32 5, %in
22538 + store i32 %0, i32 addrspace(1)* %out
22542 +; Test using a float literal constant.
22543 +; Generated ASM should be:
22544 +; ADD REG literal.x, 5.0
22546 +; ADD literal.x REG, 5.0
22548 +; CHECK: ADD {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} {{[0-9]+}}(5.0
22549 +define void @float_literal(float addrspace(1)* %out, float %in) {
22551 + %0 = fadd float 5.0, %in
22552 + store float %0, float addrspace(1)* %out
22556 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/lit.local.cfg llvm-r600/test/CodeGen/R600/lit.local.cfg
22557 --- llvm-3.2.src/test/CodeGen/R600/lit.local.cfg 1970-01-01 01:00:00.000000000 +0100
22558 +++ llvm-r600/test/CodeGen/R600/lit.local.cfg 2013-01-25 19:43:58.463383033 +0100
22560 +config.suffixes = ['.ll', '.c', '.cpp']
22562 +def getRoot(config):
22563 + if not config.parent:
22565 + return getRoot(config.parent)
22567 +root = getRoot(config)
22569 +targets = set(root.targets_to_build.split())
22570 +if not 'R600' in targets:
22571 + config.unsupported = True
22573 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/llvm.AMDGPU.mul.ll llvm-r600/test/CodeGen/R600/llvm.AMDGPU.mul.ll
22574 --- llvm-3.2.src/test/CodeGen/R600/llvm.AMDGPU.mul.ll 1970-01-01 01:00:00.000000000 +0100
22575 +++ llvm-r600/test/CodeGen/R600/llvm.AMDGPU.mul.ll 2013-01-25 19:43:58.463383033 +0100
22577 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22579 +;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22581 +define void @test() {
22582 + %r0 = call float @llvm.R600.load.input(i32 0)
22583 + %r1 = call float @llvm.R600.load.input(i32 1)
22584 + %r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1)
22585 + call void @llvm.AMDGPU.store.output(float %r2, i32 0)
22589 +declare float @llvm.R600.load.input(i32) readnone
22591 +declare void @llvm.AMDGPU.store.output(float, i32)
22593 +declare float @llvm.AMDGPU.mul(float ,float ) readnone
22594 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/llvm.AMDGPU.trunc.ll llvm-r600/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
22595 --- llvm-3.2.src/test/CodeGen/R600/llvm.AMDGPU.trunc.ll 1970-01-01 01:00:00.000000000 +0100
22596 +++ llvm-r600/test/CodeGen/R600/llvm.AMDGPU.trunc.ll 2013-01-25 19:43:58.463383033 +0100
22598 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22600 +;CHECK: TRUNC T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22602 +define void @test() {
22603 + %r0 = call float @llvm.R600.load.input(i32 0)
22604 + %r1 = call float @llvm.AMDGPU.trunc( float %r0)
22605 + call void @llvm.AMDGPU.store.output(float %r1, i32 0)
22609 +declare float @llvm.R600.load.input(i32) readnone
22611 +declare void @llvm.AMDGPU.store.output(float, i32)
22613 +declare float @llvm.AMDGPU.trunc(float ) readnone
22614 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/llvm.cos.ll llvm-r600/test/CodeGen/R600/llvm.cos.ll
22615 --- llvm-3.2.src/test/CodeGen/R600/llvm.cos.ll 1970-01-01 01:00:00.000000000 +0100
22616 +++ llvm-r600/test/CodeGen/R600/llvm.cos.ll 2013-01-25 19:43:58.463383033 +0100
22618 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22620 +;CHECK: COS T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22622 +define void @test() {
22623 + %r0 = call float @llvm.R600.load.input(i32 0)
22624 + %r1 = call float @llvm.cos.f32(float %r0)
22625 + call void @llvm.AMDGPU.store.output(float %r1, i32 0)
22629 +declare float @llvm.cos.f32(float) readnone
22631 +declare float @llvm.R600.load.input(i32) readnone
22633 +declare void @llvm.AMDGPU.store.output(float, i32)
22634 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/llvm.pow.ll llvm-r600/test/CodeGen/R600/llvm.pow.ll
22635 --- llvm-3.2.src/test/CodeGen/R600/llvm.pow.ll 1970-01-01 01:00:00.000000000 +0100
22636 +++ llvm-r600/test/CodeGen/R600/llvm.pow.ll 2013-01-25 19:43:58.466716366 +0100
22638 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22640 +;CHECK: LOG_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22641 +;CHECK-NEXT: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22642 +;CHECK-NEXT: EXP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22644 +define void @test() {
22645 + %r0 = call float @llvm.R600.load.input(i32 0)
22646 + %r1 = call float @llvm.R600.load.input(i32 1)
22647 + %r2 = call float @llvm.pow.f32( float %r0, float %r1)
22648 + call void @llvm.AMDGPU.store.output(float %r2, i32 0)
22652 +declare float @llvm.R600.load.input(i32) readnone
22654 +declare void @llvm.AMDGPU.store.output(float, i32)
22656 +declare float @llvm.pow.f32(float ,float ) readonly
22657 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/llvm.sin.ll llvm-r600/test/CodeGen/R600/llvm.sin.ll
22658 --- llvm-3.2.src/test/CodeGen/R600/llvm.sin.ll 1970-01-01 01:00:00.000000000 +0100
22659 +++ llvm-r600/test/CodeGen/R600/llvm.sin.ll 2013-01-25 19:43:58.466716366 +0100
22661 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22663 +;CHECK: SIN T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22665 +define void @test() {
22666 + %r0 = call float @llvm.R600.load.input(i32 0)
22667 + %r1 = call float @llvm.sin.f32( float %r0)
22668 + call void @llvm.AMDGPU.store.output(float %r1, i32 0)
22672 +declare float @llvm.sin.f32(float) readnone
22674 +declare float @llvm.R600.load.input(i32) readnone
22676 +declare void @llvm.AMDGPU.store.output(float, i32)
22677 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/load.constant_addrspace.f32.ll llvm-r600/test/CodeGen/R600/load.constant_addrspace.f32.ll
22678 --- llvm-3.2.src/test/CodeGen/R600/load.constant_addrspace.f32.ll 1970-01-01 01:00:00.000000000 +0100
22679 +++ llvm-r600/test/CodeGen/R600/load.constant_addrspace.f32.ll 2013-01-25 19:43:58.466716366 +0100
22681 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22683 +;CHECK: VTX_READ_32 T{{[0-9]+\.X, T[0-9]+\.X}}
22685 +define void @test(float addrspace(1)* %out, float addrspace(2)* %in) {
22686 + %1 = load float addrspace(2)* %in
22687 + store float %1, float addrspace(1)* %out
22690 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/load.i8.ll llvm-r600/test/CodeGen/R600/load.i8.ll
22691 --- llvm-3.2.src/test/CodeGen/R600/load.i8.ll 1970-01-01 01:00:00.000000000 +0100
22692 +++ llvm-r600/test/CodeGen/R600/load.i8.ll 2013-01-25 19:43:58.466716366 +0100
22694 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22696 +;CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
22698 +define void @test(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
22699 + %1 = load i8 addrspace(1)* %in
22700 + %2 = zext i8 %1 to i32
22701 + store i32 %2, i32 addrspace(1)* %out
22704 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/reciprocal.ll llvm-r600/test/CodeGen/R600/reciprocal.ll
22705 --- llvm-3.2.src/test/CodeGen/R600/reciprocal.ll 1970-01-01 01:00:00.000000000 +0100
22706 +++ llvm-r600/test/CodeGen/R600/reciprocal.ll 2013-01-25 19:43:58.466716366 +0100
22708 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22710 +;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22712 +define void @test() {
22713 + %r0 = call float @llvm.R600.load.input(i32 0)
22714 + %r1 = fdiv float 1.0, %r0
22715 + call void @llvm.AMDGPU.store.output(float %r1, i32 0)
22719 +declare float @llvm.R600.load.input(i32) readnone
22721 +declare void @llvm.AMDGPU.store.output(float, i32)
22723 +declare float @llvm.AMDGPU.rcp(float ) readnone
22724 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/sdiv.ll llvm-r600/test/CodeGen/R600/sdiv.ll
22725 --- llvm-3.2.src/test/CodeGen/R600/sdiv.ll 1970-01-01 01:00:00.000000000 +0100
22726 +++ llvm-r600/test/CodeGen/R600/sdiv.ll 2013-01-25 19:43:58.466716366 +0100
22728 +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22730 +; The code generated by sdiv is long and complex and may frequently change.
22731 +; The goal of this test is to make sure the ISel doesn't fail.
22733 +; This program was previously failing to compile when one of the selectcc
22734 +; opcodes generated by the sdiv lowering was being legalized and optimized to:
22735 +; selectcc Remainder -1, 0, -1, SETGT
22736 +; This was fixed by adding an additional pattern in R600Instructions.td to
22737 +; match this pattern with a CNDGE_INT.
22741 +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
22742 + %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
22743 + %num = load i32 addrspace(1) * %in
22744 + %den = load i32 addrspace(1) * %den_ptr
22745 + %result = sdiv i32 %num, %den
22746 + store i32 %result, i32 addrspace(1)* %out
22749 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/selectcc_cnde_int.ll llvm-r600/test/CodeGen/R600/selectcc_cnde_int.ll
22750 --- llvm-3.2.src/test/CodeGen/R600/selectcc_cnde_int.ll 1970-01-01 01:00:00.000000000 +0100
22751 +++ llvm-r600/test/CodeGen/R600/selectcc_cnde_int.ll 2013-01-25 19:43:58.466716366 +0100
22753 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22755 +;CHECK-NOT: SETE_INT
22756 +;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1, literal.x, 2}}
22757 +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
22758 + %1 = load i32 addrspace(1)* %in
22759 + %2 = icmp eq i32 %1, 0
22760 + %3 = select i1 %2, i32 1, i32 2
22761 + store i32 %3, i32 addrspace(1)* %out
22764 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/selectcc_cnde.ll llvm-r600/test/CodeGen/R600/selectcc_cnde.ll
22765 --- llvm-3.2.src/test/CodeGen/R600/selectcc_cnde.ll 1970-01-01 01:00:00.000000000 +0100
22766 +++ llvm-r600/test/CodeGen/R600/selectcc_cnde.ll 2013-01-25 19:43:58.466716366 +0100
22768 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22771 +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1.0, literal.x, [-0-9]+\(2.0}}
22772 +define void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
22773 + %1 = load float addrspace(1)* %in
22774 + %2 = fcmp oeq float %1, 0.0
22775 + %3 = select i1 %2, float 1.0, float 2.0
22776 + store float %3, float addrspace(1)* %out
22779 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/selectcc-icmp-select-float.ll llvm-r600/test/CodeGen/R600/selectcc-icmp-select-float.ll
22780 --- llvm-3.2.src/test/CodeGen/R600/selectcc-icmp-select-float.ll 1970-01-01 01:00:00.000000000 +0100
22781 +++ llvm-r600/test/CodeGen/R600/selectcc-icmp-select-float.ll 2013-01-25 19:43:58.466716366 +0100
22783 +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22785 +; Note additional optimizations may cause this SGT to be replaced with a
22786 +; CND* instruction.
22787 +; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, -1}}
22788 +; Test a selectcc with i32 LHS/RHS and float True/False
22790 +define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
22792 + %0 = load i32 addrspace(1)* %in
22793 + %1 = icmp sge i32 %0, 0
22794 + %2 = select i1 %1, float 1.0, float 0.0
22795 + store float %2, float addrspace(1)* %out
22798 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/setcc.v4i32.ll llvm-r600/test/CodeGen/R600/setcc.v4i32.ll
22799 --- llvm-3.2.src/test/CodeGen/R600/setcc.v4i32.ll 1970-01-01 01:00:00.000000000 +0100
22800 +++ llvm-r600/test/CodeGen/R600/setcc.v4i32.ll 2013-01-25 19:43:58.466716366 +0100
22802 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22803 +;CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22805 +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
22806 + %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
22807 + %a = load <4 x i32> addrspace(1) * %in
22808 + %b = load <4 x i32> addrspace(1) * %b_ptr
22809 + %result = icmp eq <4 x i32> %a, %b
22810 + %sext = sext <4 x i1> %result to <4 x i32>
22811 + store <4 x i32> %sext, <4 x i32> addrspace(1)* %out
22814 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/short-args.ll llvm-r600/test/CodeGen/R600/short-args.ll
22815 --- llvm-3.2.src/test/CodeGen/R600/short-args.ll 1970-01-01 01:00:00.000000000 +0100
22816 +++ llvm-r600/test/CodeGen/R600/short-args.ll 2013-01-25 19:43:58.466716366 +0100
22818 +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22820 +; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
22822 +define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
22824 + %0 = zext i8 %in to i32
22825 + store i32 %0, i32 addrspace(1)* %out, align 4
22829 +; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
22831 +define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
22833 + %0 = zext i8 %in to i32
22834 + store i32 %0, i32 addrspace(1)* %out, align 4
22838 +; CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
22840 +define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
22842 + %0 = zext i16 %in to i32
22843 + store i32 %0, i32 addrspace(1)* %out, align 4
22847 +; CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
22849 +define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
22851 + %0 = zext i16 %in to i32
22852 + store i32 %0, i32 addrspace(1)* %out, align 4
22855 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/store.v4f32.ll llvm-r600/test/CodeGen/R600/store.v4f32.ll
22856 --- llvm-3.2.src/test/CodeGen/R600/store.v4f32.ll 1970-01-01 01:00:00.000000000 +0100
22857 +++ llvm-r600/test/CodeGen/R600/store.v4f32.ll 2013-01-25 19:43:58.466716366 +0100
22859 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22861 +;CHECK: RAT_WRITE_CACHELESS_128 T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
22863 +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
22864 + %1 = load <4 x float> addrspace(1) * %in
22865 + store <4 x float> %1, <4 x float> addrspace(1)* %out
22868 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/store.v4i32.ll llvm-r600/test/CodeGen/R600/store.v4i32.ll
22869 --- llvm-3.2.src/test/CodeGen/R600/store.v4i32.ll 1970-01-01 01:00:00.000000000 +0100
22870 +++ llvm-r600/test/CodeGen/R600/store.v4i32.ll 2013-01-25 19:43:58.466716366 +0100
22872 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22874 +;CHECK: RAT_WRITE_CACHELESS_128 T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
22876 +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
22877 + %1 = load <4 x i32> addrspace(1) * %in
22878 + store <4 x i32> %1, <4 x i32> addrspace(1)* %out
22881 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/udiv.v4i32.ll llvm-r600/test/CodeGen/R600/udiv.v4i32.ll
22882 --- llvm-3.2.src/test/CodeGen/R600/udiv.v4i32.ll 1970-01-01 01:00:00.000000000 +0100
22883 +++ llvm-r600/test/CodeGen/R600/udiv.v4i32.ll 2013-01-25 19:43:58.466716366 +0100
22885 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22887 +;The code generated by udiv is long and complex and may frequently change.
22888 +;The goal of this test is to make sure the ISel doesn't fail when it gets
22892 +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
22893 + %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
22894 + %a = load <4 x i32> addrspace(1) * %in
22895 + %b = load <4 x i32> addrspace(1) * %b_ptr
22896 + %result = udiv <4 x i32> %a, %b
22897 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out
22900 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/urem.v4i32.ll llvm-r600/test/CodeGen/R600/urem.v4i32.ll
22901 --- llvm-3.2.src/test/CodeGen/R600/urem.v4i32.ll 1970-01-01 01:00:00.000000000 +0100
22902 +++ llvm-r600/test/CodeGen/R600/urem.v4i32.ll 2013-01-25 19:43:58.470049700 +0100
22904 +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22906 +;The code generated by urem is long and complex and may frequently change.
22907 +;The goal of this test is to make sure the ISel doesn't fail when it gets
22911 +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
22912 + %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
22913 + %a = load <4 x i32> addrspace(1) * %in
22914 + %b = load <4 x i32> addrspace(1) * %b_ptr
22915 + %result = urem <4 x i32> %a, %b
22916 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out
22919 diff -Nur -x .git llvm-3.2.src/test/CodeGen/R600/vec4-expand.ll llvm-r600/test/CodeGen/R600/vec4-expand.ll
22920 --- llvm-3.2.src/test/CodeGen/R600/vec4-expand.ll 1970-01-01 01:00:00.000000000 +0100
22921 +++ llvm-r600/test/CodeGen/R600/vec4-expand.ll 2013-01-25 19:43:58.470049700 +0100
22923 +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
22925 +; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22926 +; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22927 +; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22928 +; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22930 +define void @fp_to_sint(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
22931 + %value = load <4 x float> addrspace(1) * %in
22932 + %result = fptosi <4 x float> %value to <4 x i32>
22933 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out
22937 +; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22938 +; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22939 +; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22940 +; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22942 +define void @fp_to_uint(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
22943 + %value = load <4 x float> addrspace(1) * %in
22944 + %result = fptoui <4 x float> %value to <4 x i32>
22945 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out
22949 +; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22950 +; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22951 +; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22952 +; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22954 +define void @sint_to_fp(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
22955 + %value = load <4 x i32> addrspace(1) * %in
22956 + %result = sitofp <4 x i32> %value to <4 x float>
22957 + store <4 x float> %result, <4 x float> addrspace(1)* %out
22961 +; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22962 +; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22963 +; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22964 +; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
22966 +define void @uint_to_fp(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
22967 + %value = load <4 x i32> addrspace(1) * %in
22968 + %result = uitofp <4 x i32> %value to <4 x float>
22969 + store <4 x float> %result, <4 x float> addrspace(1)* %out
22972 diff -Nur -x .git llvm-3.2.src/test/CodeGen/SI/sanity.ll llvm-r600/test/CodeGen/SI/sanity.ll
22973 --- llvm-3.2.src/test/CodeGen/SI/sanity.ll 1970-01-01 01:00:00.000000000 +0100
22974 +++ llvm-r600/test/CodeGen/SI/sanity.ll 2013-01-25 19:43:58.470049700 +0100
22976 +;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
22980 +define void @main() {
22982 + call void @llvm.AMDGPU.shader.type(i32 1)
22983 + %0 = load <4 x i32> addrspace(2)* addrspace(8)* inttoptr (i32 6 to <4 x i32> addrspace(2)* addrspace(8)*)
22984 + %1 = getelementptr <4 x i32> addrspace(2)* %0, i32 0
22985 + %2 = load <4 x i32> addrspace(2)* %1
22986 + %3 = call i32 @llvm.SI.vs.load.buffer.index()
22987 + %4 = call <4 x float> @llvm.SI.vs.load.input(<4 x i32> %2, i32 0, i32 %3)
22988 + %5 = extractelement <4 x float> %4, i32 0
22989 + %6 = extractelement <4 x float> %4, i32 1
22990 + %7 = extractelement <4 x float> %4, i32 2
22991 + %8 = extractelement <4 x float> %4, i32 3
22992 + %9 = load <4 x i32> addrspace(2)* addrspace(8)* inttoptr (i32 6 to <4 x i32> addrspace(2)* addrspace(8)*)
22993 + %10 = getelementptr <4 x i32> addrspace(2)* %9, i32 1
22994 + %11 = load <4 x i32> addrspace(2)* %10
22995 + %12 = call i32 @llvm.SI.vs.load.buffer.index()
22996 + %13 = call <4 x float> @llvm.SI.vs.load.input(<4 x i32> %11, i32 0, i32 %12)
22997 + %14 = extractelement <4 x float> %13, i32 0
22998 + %15 = extractelement <4 x float> %13, i32 1
22999 + %16 = extractelement <4 x float> %13, i32 2
23000 + %17 = extractelement <4 x float> %13, i32 3
23001 + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %14, float %15, float %16, float %17)
23002 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %5, float %6, float %7, float %8)
23006 +declare void @llvm.AMDGPU.shader.type(i32)
23008 +declare i32 @llvm.SI.vs.load.buffer.index() readnone
23010 +declare <4 x float> @llvm.SI.vs.load.input(<4 x i32>, i32, i32)
23012 +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
23013 diff -Nur -x .git llvm-3.2.src/test/CodeGen/X86/cvtv2f32.ll llvm-r600/test/CodeGen/X86/cvtv2f32.ll
23014 --- llvm-3.2.src/test/CodeGen/X86/cvtv2f32.ll 2012-10-24 06:14:18.000000000 +0200
23015 +++ llvm-r600/test/CodeGen/X86/cvtv2f32.ll 2013-01-25 19:43:58.856716358 +0100
23017 +; A bug fix in the DAGCombiner made this test fail, so marking as xfail
23018 +; until this can be investigated further.
23021 ; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s
23023 define <2 x float> @foo(i32 %x, i32 %y, <2 x float> %v) {