1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
|
//=- X86ScheduleZnver1.td - X86 Znver1 Scheduling -------------*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for Znver1 to support instruction
// scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//
def Znver1Model : SchedMachineModel {
// Zen can decode 4 instructions per cycle.
let IssueWidth = 4;
// Based on the reorder buffer we define MicroOpBufferSize
let MicroOpBufferSize = 192;
let LoadLatency = 4;
let MispredictPenalty = 17;
let HighLatency = 25;
let PostRAScheduler = 1;
// FIXME: This variable is required for incomplete model.
// We haven't catered all instructions.
// So, we reset the value of this variable so as to
// say that the model is incomplete.
let CompleteModel = 0;
}
let SchedModel = Znver1Model in {
// Zen can issue micro-ops to 10 different units in one cycle.
// These are
// * Four integer ALU units (ZALU0, ZALU1, ZALU2, ZALU3)
// * Two AGU units (ZAGU0, ZAGU1)
// * Four FPU units (ZFPU0, ZFPU1, ZFPU2, ZFPU3)
// AGUs feed load store queues @two loads and 1 store per cycle.
// Four ALU units are defined below
def ZnALU0 : ProcResource<1>;
def ZnALU1 : ProcResource<1>;
def ZnALU2 : ProcResource<1>;
def ZnALU3 : ProcResource<1>;
// Two AGU units are defined below
def ZnAGU0 : ProcResource<1>;
def ZnAGU1 : ProcResource<1>;
// Four FPU units are defined below
def ZnFPU0 : ProcResource<1>;
def ZnFPU1 : ProcResource<1>;
def ZnFPU2 : ProcResource<1>;
def ZnFPU3 : ProcResource<1>;
// FPU grouping
def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]>;
def ZnFPU013 : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU3]>;
def ZnFPU01 : ProcResGroup<[ZnFPU0, ZnFPU1]>;
def ZnFPU12 : ProcResGroup<[ZnFPU1, ZnFPU2]>;
def ZnFPU13 : ProcResGroup<[ZnFPU1, ZnFPU3]>;
def ZnFPU23 : ProcResGroup<[ZnFPU2, ZnFPU3]>;
def ZnFPU02 : ProcResGroup<[ZnFPU0, ZnFPU2]>;
def ZnFPU03 : ProcResGroup<[ZnFPU0, ZnFPU3]>;
// Below are the grouping of the units.
// Micro-ops to be issued to multiple units are tackled this way.
// ALU grouping
// ZnALU03 - 0,3 grouping
def ZnALU03: ProcResGroup<[ZnALU0, ZnALU3]>;
// 56 Entry (14x4 entries) Int Scheduler
def ZnALU : ProcResGroup<[ZnALU0, ZnALU1, ZnALU2, ZnALU3]> {
let BufferSize=56;
}
// 28 Entry (14x2) AGU group. AGUs can't be used for all ALU operations
// but are relevant for some instructions
def ZnAGU : ProcResGroup<[ZnAGU0, ZnAGU1]> {
let BufferSize=28;
}
// Integer Multiplication issued on ALU1.
def ZnMultiplier : ProcResource<1>;
// Integer division issued on ALU2.
def ZnDivider : ProcResource<1>;
// 4 Cycles load-to use Latency is captured
def : ReadAdvance<ReadAfterLd, 4>;
// (a folded load is an instruction that loads and does some operation)
// Ex: ADDPD xmm,[mem]-> This instruction has two micro-ops
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops.
// a. load and
// b. addpd
// This multiclass is for folded loads for integer units.
multiclass ZnWriteResPair<X86FoldableSchedWrite SchedRW,
ProcResourceKind ExePort,
int Lat> {
// Register variant takes 1-cycle on Execution Port.
def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
// Memory variant also uses a cycle on ZnAGU
// adds 4 cycles to the latency.
def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> {
let Latency = !add(Lat, 4);
}
}
// This multiclass is for folded loads for floating point units.
multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW,
ProcResourceKind ExePort,
int Lat> {
// Register variant takes 1-cycle on Execution Port.
def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
// Memory variant also uses a cycle on ZnAGU
// adds 7 cycles to the latency.
def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> {
let Latency = !add(Lat, 7);
}
}
// WriteRMW is set for instructions with Memory write
// operation in codegen
def : WriteRes<WriteRMW, [ZnAGU]>;
def : WriteRes<WriteStore, [ZnAGU]>;
def : WriteRes<WriteMove, [ZnALU]>;
def : WriteRes<WriteLoad, [ZnAGU]> { let Latency = 8; }
def : WriteRes<WriteZero, []>;
def : WriteRes<WriteLEA, [ZnALU]>;
defm : ZnWriteResPair<WriteALU, ZnALU, 1>;
defm : ZnWriteResPair<WriteShift, ZnALU, 1>;
defm : ZnWriteResPair<WriteJump, ZnALU, 1>;
// IDIV
def : WriteRes<WriteIDiv, [ZnALU2, ZnDivider]> {
let Latency = 41;
let ResourceCycles = [1, 41];
}
def : WriteRes<WriteIDivLd, [ZnALU2, ZnAGU, ZnDivider]> {
let Latency = 45;
let ResourceCycles = [1, 4, 41];
}
// IMUL
def : WriteRes<WriteIMulH, [ZnALU1, ZnMultiplier]>{
let Latency = 4;
}
def : WriteRes<WriteIMul, [ZnALU1, ZnMultiplier]> {
let Latency = 4;
}
def : WriteRes<WriteIMulLd,[ZnALU1, ZnMultiplier]> {
let Latency = 8;
}
// Floating point operations
defm : ZnWriteResFpuPair<WriteFHAdd, ZnFPU0, 3>;
defm : ZnWriteResFpuPair<WriteFAdd, ZnFPU0, 3>;
defm : ZnWriteResFpuPair<WriteFBlend, ZnFPU01, 1>;
defm : ZnWriteResFpuPair<WriteFVarBlend, ZnFPU01, 1>;
defm : ZnWriteResFpuPair<WriteVarBlend, ZnFPU0, 1>;
defm : ZnWriteResFpuPair<WriteCvtI2F, ZnFPU3, 5>;
defm : ZnWriteResFpuPair<WriteCvtF2F, ZnFPU3, 5>;
defm : ZnWriteResFpuPair<WriteCvtF2I, ZnFPU3, 5>;
defm : ZnWriteResFpuPair<WriteFDiv, ZnFPU3, 15>;
defm : ZnWriteResFpuPair<WriteFShuffle, ZnFPU12, 1>;
defm : ZnWriteResFpuPair<WriteFMul, ZnFPU0, 5>;
defm : ZnWriteResFpuPair<WriteFRcp, ZnFPU01, 5>;
defm : ZnWriteResFpuPair<WriteFRsqrt, ZnFPU01, 5>;
defm : ZnWriteResFpuPair<WriteFSqrt, ZnFPU3, 20>;
// Vector integer operations which uses FPU units
defm : ZnWriteResFpuPair<WriteVecShift, ZnFPU, 1>;
defm : ZnWriteResFpuPair<WriteVecLogic, ZnFPU, 1>;
defm : ZnWriteResFpuPair<WritePHAdd, ZnFPU, 1>;
defm : ZnWriteResFpuPair<WriteVecALU, ZnFPU, 1>;
defm : ZnWriteResFpuPair<WriteVecIMul, ZnFPU0, 4>;
defm : ZnWriteResFpuPair<WriteShuffle, ZnFPU, 1>;
defm : ZnWriteResFpuPair<WriteBlend, ZnFPU01, 1>;
defm : ZnWriteResFpuPair<WriteShuffle256, ZnFPU, 2>;
// Vector Shift Operations
defm : ZnWriteResFpuPair<WriteVarVecShift, ZnFPU12, 1>;
// AES Instructions.
defm : ZnWriteResFpuPair<WriteAESDecEnc, ZnFPU01, 4>;
defm : ZnWriteResFpuPair<WriteAESIMC, ZnFPU01, 4>;
defm : ZnWriteResFpuPair<WriteAESKeyGen, ZnFPU01, 4>;
def : WriteRes<WriteFence, [ZnAGU]>;
def : WriteRes<WriteNop, []>;
// Following instructions with latency=100 are microcoded.
// We set long latency so as to block the entire pipeline.
defm : ZnWriteResFpuPair<WriteFShuffle256, ZnFPU, 100>;
//Microcoded Instructions
let Latency = 100 in {
def : WriteRes<WriteMicrocoded, []>;
def : WriteRes<WriteSystem, []>;
def : WriteRes<WriteMPSAD, []>;
def : WriteRes<WriteMPSADLd, []>;
def : WriteRes<WriteCLMul, []>;
def : WriteRes<WriteCLMulLd, []>;
def : WriteRes<WritePCmpIStrM, []>;
def : WriteRes<WritePCmpIStrMLd, []>;
def : WriteRes<WritePCmpEStrI, []>;
def : WriteRes<WritePCmpEStrILd, []>;
def : WriteRes<WritePCmpEStrM, []>;
def : WriteRes<WritePCmpEStrMLd, []>;
def : WriteRes<WritePCmpIStrI, []>;
def : WriteRes<WritePCmpIStrILd, []>;
}
}
|