contrib/llvm/patches/patch-08-llvm-r230348-arm-fix-bad-ha.diff


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419

Pull in r230348 from upstream llvm trunk (by Tim Northover):

  ARM: treat [N x i32] and [N x i64] as AAPCS composite types

  The logic is almost there already, with our special homogeneous
  aggregate handling. Tweaking it like this allows front-ends to emit
  AAPCS compliant code without ever having to count registers or add
  discarded padding arguments.

  Only arrays of i32 and i64 are needed to model AAPCS rules, but I
  decided to apply the logic to all integer arrays for more consistency.

This fixes a possible "Unexpected member type for HA" error when
compiling lib/msun/bsdsrc/b_tgamma.c for armv6.

Reported by:	Jakub Palider <jpa@semihalf.com>

Introduced here: https://svnweb.freebsd.org/changeset/base/280400

Index: include/llvm/CodeGen/CallingConvLower.h
===================================================================
--- include/llvm/CodeGen/CallingConvLower.h
+++ include/llvm/CodeGen/CallingConvLower.h
@@ -122,8 +122,8 @@ class CCValAssign {
   // There is no need to differentiate between a pending CCValAssign and other
   // kinds, as they are stored in a different list.
   static CCValAssign getPending(unsigned ValNo, MVT ValVT, MVT LocVT,
-                                LocInfo HTP) {
-    return getReg(ValNo, ValVT, 0, LocVT, HTP);
+                                LocInfo HTP, unsigned ExtraInfo = 0) {
+    return getReg(ValNo, ValVT, ExtraInfo, LocVT, HTP);
   }
 
   void convertToReg(unsigned RegNo) {
@@ -146,6 +146,7 @@ class CCValAssign {
 
   unsigned getLocReg() const { assert(isRegLoc()); return Loc; }
   unsigned getLocMemOffset() const { assert(isMemLoc()); return Loc; }
+  unsigned getExtraInfo() const { return Loc; }
   MVT getLocVT() const { return LocVT; }
 
   LocInfo getLocInfo() const { return HTP; }
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7429,11 +7429,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLo
       }
       if (Args[i].isNest)
         Flags.setNest();
-      if (NeedsRegBlock) {
+      if (NeedsRegBlock)
         Flags.setInConsecutiveRegs();
-        if (Value == NumValues - 1)
-          Flags.setInConsecutiveRegsLast();
-      }
       Flags.setOrigAlign(OriginalAlignment);
 
       MVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT);
@@ -7482,6 +7479,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLo
         CLI.Outs.push_back(MyFlags);
         CLI.OutVals.push_back(Parts[j]);
       }
+
+      if (NeedsRegBlock && Value == NumValues - 1)
+        CLI.Outs[CLI.Outs.size() - 1].Flags.setInConsecutiveRegsLast();
     }
   }
 
@@ -7697,11 +7697,8 @@ void SelectionDAGISel::LowerArguments(const Functi
       }
       if (F.getAttributes().hasAttribute(Idx, Attribute::Nest))
         Flags.setNest();
-      if (NeedsRegBlock) {
+      if (NeedsRegBlock)
         Flags.setInConsecutiveRegs();
-        if (Value == NumValues - 1)
-          Flags.setInConsecutiveRegsLast();
-      }
       Flags.setOrigAlign(OriginalAlignment);
 
       MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
@@ -7716,6 +7713,8 @@ void SelectionDAGISel::LowerArguments(const Functi
           MyFlags.Flags.setOrigAlign(1);
         Ins.push_back(MyFlags);
       }
+      if (NeedsRegBlock && Value == NumValues - 1)
+        Ins[Ins.size() - 1].Flags.setInConsecutiveRegsLast();
       PartBase += VT.getStoreSize();
     }
   }
Index: lib/Target/ARM/ARMCallingConv.h
===================================================================
--- lib/Target/ARM/ARMCallingConv.h
+++ lib/Target/ARM/ARMCallingConv.h
@@ -160,6 +160,8 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &V
                                    State);
 }
 
+static const uint16_t RRegList[] = { ARM::R0,  ARM::R1,  ARM::R2,  ARM::R3 };
+
 static const uint16_t SRegList[] = { ARM::S0,  ARM::S1,  ARM::S2,  ARM::S3,
                                      ARM::S4,  ARM::S5,  ARM::S6,  ARM::S7,
                                      ARM::S8,  ARM::S9,  ARM::S10, ARM::S11,
@@ -168,81 +170,114 @@ static const uint16_t DRegList[] = { ARM::D0, ARM:
                                      ARM::D4, ARM::D5, ARM::D6, ARM::D7 };
 static const uint16_t QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 };
 
+
 // Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA
 // has InConsecutiveRegs set, and that the last member also has
 // InConsecutiveRegsLast set. We must process all members of the HA before
 // we can allocate it, as we need to know the total number of registers that
 // will be needed in order to (attempt to) allocate a contiguous block.
-static bool CC_ARM_AAPCS_Custom_HA(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
-                                   CCValAssign::LocInfo &LocInfo,
-                                   ISD::ArgFlagsTy &ArgFlags, CCState &State) {
-  SmallVectorImpl<CCValAssign> &PendingHAMembers = State.getPendingLocs();
+static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
+                                          MVT &LocVT,
+                                          CCValAssign::LocInfo &LocInfo,
+                                          ISD::ArgFlagsTy &ArgFlags,
+                                          CCState &State) {
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
 
   // AAPCS HFAs must have 1-4 elements, all of the same type
-  assert(PendingHAMembers.size() < 4);
-  if (PendingHAMembers.size() > 0)
-    assert(PendingHAMembers[0].getLocVT() == LocVT);
+  if (PendingMembers.size() > 0)
+    assert(PendingMembers[0].getLocVT() == LocVT);
 
   // Add the argument to the list to be allocated once we know the size of the
-  // HA
-  PendingHAMembers.push_back(
-      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+  // aggregate. Store the type's required alignmnent as extra info for later: in
+  // the [N x i64] case all trace has been removed by the time we actually get
+  // to do allocation.
+  PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo,
+                                                   ArgFlags.getOrigAlign()));
 
-  if (ArgFlags.isInConsecutiveRegsLast()) {
-    assert(PendingHAMembers.size() > 0 && PendingHAMembers.size() <= 4 &&
-           "Homogeneous aggregates must have between 1 and 4 members");
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
 
-    // Try to allocate a contiguous block of registers, each of the correct
-    // size to hold one member.
-    ArrayRef<uint16_t> RegList;
-    switch (LocVT.SimpleTy) {
-    case MVT::f32:
-      RegList = SRegList;
-      break;
-    case MVT::f64:
-      RegList = DRegList;
-      break;
-    case MVT::v2f64:
-      RegList = QRegList;
-      break;
-    default:
-      llvm_unreachable("Unexpected member type for HA");
-      break;
-    }
+  // Try to allocate a contiguous block of registers, each of the correct
+  // size to hold one member.
+  unsigned Align = std::min(PendingMembers[0].getExtraInfo(), 8U);
 
-    unsigned RegResult =
-        State.AllocateRegBlock(RegList, PendingHAMembers.size());
+  ArrayRef<uint16_t> RegList;
+  switch (LocVT.SimpleTy) {
+  case MVT::i32: {
+    RegList = RRegList;
+    unsigned RegIdx = State.getFirstUnallocated(RegList.data(), RegList.size());
 
-    if (RegResult) {
-      for (SmallVectorImpl<CCValAssign>::iterator It = PendingHAMembers.begin();
-           It != PendingHAMembers.end(); ++It) {
-        It->convertToReg(RegResult);
-        State.addLoc(*It);
-        ++RegResult;
-      }
-      PendingHAMembers.clear();
-      return true;
-    }
+    // First consume all registers that would give an unaligned object. Whether
+    // we go on stack or in regs, no-one will be using them in future.
+    unsigned RegAlign = RoundUpToAlignment(Align, 4) / 4;
+    while (RegIdx % RegAlign != 0 && RegIdx < RegList.size())
+      State.AllocateReg(RegList[RegIdx++]);
 
-    // Register allocation failed, fall back to the stack
+    break;
+  }
+  case MVT::f32:
+    RegList = SRegList;
+    break;
+  case MVT::f64:
+    RegList = DRegList;
+    break;
+  case MVT::v2f64:
+    RegList = QRegList;
+    break;
+  default:
+    llvm_unreachable("Unexpected member type for block aggregate");
+    break;
+  }
 
-    // Mark all VFP regs as unavailable (AAPCS rule C.2.vfp)
-    for (unsigned regNo = 0; regNo < 16; ++regNo)
-      State.AllocateReg(SRegList[regNo]);
+  unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
+  if (RegResult) {
+    for (SmallVectorImpl<CCValAssign>::iterator It = PendingMembers.begin();
+         It != PendingMembers.end(); ++It) {
+      It->convertToReg(RegResult);
+      State.addLoc(*It);
+      ++RegResult;
+    }
+    PendingMembers.clear();
+    return true;
+  }
 
-    unsigned Size = LocVT.getSizeInBits() / 8;
-    unsigned Align = std::min(Size, 8U);
+  // Register allocation failed, we'll be needing the stack
+  unsigned Size = LocVT.getSizeInBits() / 8;
+  if (LocVT == MVT::i32 && State.getNextStackOffset() == 0) {
+    // If nothing else has used the stack until this point, a non-HFA aggregate
+    // can be split between regs and stack.
+    unsigned RegIdx = State.getFirstUnallocated(RegList.data(), RegList.size());
+    for (auto &It : PendingMembers) {
+      if (RegIdx >= RegList.size())
+        It.convertToMem(State.AllocateStack(Size, Size));
+      else
+        It.convertToReg(State.AllocateReg(RegList[RegIdx++]));
 
-    for (auto It : PendingHAMembers) {
-      It.convertToMem(State.AllocateStack(Size, Align));
       State.addLoc(It);
     }
+    PendingMembers.clear();
+    return true;
+  } else if (LocVT != MVT::i32)
+    RegList = SRegList;
 
-    // All pending members have now been allocated
-    PendingHAMembers.clear();
+  // Mark all regs as unavailable (AAPCS rule C.2.vfp for VFP, C.6 for core)
+  for (auto Reg : RegList)
+    State.AllocateReg(Reg);
+
+  for (auto &It : PendingMembers) {
+    It.convertToMem(State.AllocateStack(Size, Align));
+    State.addLoc(It);
+
+    // After the first item has been allocated, the rest are packed as tightly
+    // as possible. (E.g. an incoming i64 would have starting Align of 8, but
+    // we'll be allocating a bunch of i32 slots).
+    Align = Size;
   }
 
-  // This will be allocated by the last member of the HA
+  // All pending members have now been allocated
+  PendingMembers.clear();
+
+  // This will be allocated by the last member of the aggregate
   return true;
 }
 
Index: lib/Target/ARM/ARMCallingConv.td
===================================================================
--- lib/Target/ARM/ARMCallingConv.td
+++ lib/Target/ARM/ARMCallingConv.td
@@ -175,7 +175,7 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
   CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
 
   // HFAs are passed in a contiguous block of registers, or on the stack
-  CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_HA">>,
+  CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_Aggregate">>,
 
   CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -11285,7 +11285,9 @@ static bool isHomogeneousAggregate(Type *Ty, HABas
   return (Members > 0 && Members <= 4);
 }
 
-/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate.
+/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
+/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
+/// passing according to AAPCS rules.
 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
   if (getEffectiveCallingConv(CallConv, isVarArg) !=
@@ -11294,7 +11296,9 @@ bool ARMTargetLowering::functionArgumentNeedsConse
 
   HABaseType Base = HA_UNKNOWN;
   uint64_t Members = 0;
-  bool result = isHomogeneousAggregate(Ty, Base, Members);
-  DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump());
-  return result;
+  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
+  DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
+
+  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
+  return IsHA || IsIntArray;
 }
Index: test/CodeGen/ARM/aggregate-padding.ll
===================================================================
--- test/CodeGen/ARM/aggregate-padding.ll
+++ test/CodeGen/ARM/aggregate-padding.ll
@@ -0,0 +1,101 @@
+; RUN: llc -mtriple=armv7-linux-gnueabihf %s -o - | FileCheck %s
+
+; [2 x i64] should be contiguous when split (e.g. we shouldn't try to align all
+; i32 components to 64 bits). Also makes sure i64 based types are properly
+; aligned on the stack.
+define i64 @test_i64_contiguous_on_stack([8 x double], float, i32 %in, [2 x i64] %arg) nounwind {
+; CHECK-LABEL: test_i64_contiguous_on_stack:
+; CHECK-DAG: ldr [[LO0:r[0-9]+]], [sp, #8]
+; CHECK-DAG: ldr [[HI0:r[0-9]+]], [sp, #12]
+; CHECK-DAG: ldr [[LO1:r[0-9]+]], [sp, #16]
+; CHECK-DAG: ldr [[HI1:r[0-9]+]], [sp, #20]
+; CHECK: adds r0, [[LO0]], [[LO1]]
+; CHECK: adc r1, [[HI0]], [[HI1]]
+
+  %val1 = extractvalue [2 x i64] %arg, 0
+  %val2 = extractvalue [2 x i64] %arg, 1
+  %sum = add i64 %val1, %val2
+  ret i64 %sum
+}
+
+; [2 x i64] should try to use looks for 4 regs, not 8 (which might happen if the
+; i64 -> i32, i32 split wasn't handled correctly).
+define i64 @test_2xi64_uses_4_regs([8 x double], float, [2 x i64] %arg) nounwind {
+; CHECK-LABEL: test_2xi64_uses_4_regs:
+; CHECK-DAG: mov r0, r2
+; CHECK-DAG: mov r1, r3
+
+  %val = extractvalue [2 x i64] %arg, 1
+  ret i64 %val
+}
+
+; An aggregate should be able to split between registers and stack if there is
+; nothing else on the stack.
+define i32 @test_aggregates_split([8 x double], i32, [4 x i32] %arg) nounwind {
+; CHECK-LABEL: test_aggregates_split:
+; CHECK: ldr [[VAL3:r[0-9]+]], [sp]
+; CHECK: add r0, r1, [[VAL3]]
+
+  %val0 = extractvalue [4 x i32] %arg, 0
+  %val3 = extractvalue [4 x i32] %arg, 3
+  %sum = add i32 %val0, %val3
+  ret i32 %sum
+}
+
+; If an aggregate has to be moved entirely onto the stack, nothing should be
+; able to use r0-r3 any more. Also checks that [2 x i64] properly aligned when
+; it uses regs.
+define i32 @test_no_int_backfilling([8 x double], float, i32, [2 x i64], i32 %arg) nounwind {
+; CHECK-LABEL: test_no_int_backfilling:
+; CHECK: ldr r0, [sp, #24]
+  ret i32 %arg
+}
+
+; Even if the argument was successfully allocated as reg block, there should be
+; no backfillig to r1.
+define i32 @test_no_int_backfilling_regsonly(i32, [1 x i64], i32 %arg) {
+; CHECK-LABEL: test_no_int_backfilling_regsonly:
+; CHECK: ldr r0, [sp]
+  ret i32 %arg
+}
+
+; If an aggregate has to be moved entirely onto the stack, nothing should be
+; able to use r0-r3 any more.
+define float @test_no_float_backfilling([7 x double], [4 x i32], i32, [4 x double], float %arg) nounwind {
+; CHECK-LABEL: test_no_float_backfilling:
+; CHECK: vldr s0, [sp, #40]
+  ret float %arg
+}
+
+; They're a bit pointless, but types like [N x i8] should work as well.
+define i8 @test_i8_in_regs(i32, [3 x i8] %arg) {
+; CHECK-LABEL: test_i8_in_regs:
+; CHECK: add r0, r1, r3
+  %val0 = extractvalue [3 x i8] %arg, 0
+  %val2 = extractvalue [3 x i8] %arg, 2
+  %sum = add i8 %val0, %val2
+  ret i8 %sum
+}
+
+define i16 @test_i16_split(i32, i32, [3 x i16] %arg) {
+; CHECK-LABEL: test_i16_split:
+; CHECK: ldrh [[VAL2:r[0-9]+]], [sp]
+; CHECK: add r0, r2, [[VAL2]]
+  %val0 = extractvalue [3 x i16] %arg, 0
+  %val2 = extractvalue [3 x i16] %arg, 2
+  %sum = add i16 %val0, %val2
+  ret i16 %sum
+}
+
+; Beware: on the stack each i16 still gets a 32-bit slot, the array is not
+; packed.
+define i16 @test_i16_forced_stack([8 x double], double, i32, i32, [3 x i16] %arg) {
+; CHECK-LABEL: test_i16_forced_stack:
+; CHECK-DAG: ldrh [[VAL0:r[0-9]+]], [sp, #8]
+; CHECK-DAG: ldrh [[VAL2:r[0-9]+]], [sp, #16]
+; CHECK: add r0, [[VAL0]], [[VAL2]]
+  %val0 = extractvalue [3 x i16] %arg, 0
+  %val2 = extractvalue [3 x i16] %arg, 2
+  %sum = add i16 %val0, %val2
+  ret i16 %sum
+}