diff options
Diffstat (limited to 'contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp')
-rw-r--r-- | contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp | 114 |
1 files changed, 91 insertions, 23 deletions
diff --git a/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 3f11119..ec35b3f 100644 --- a/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -29,6 +29,9 @@ // It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2 // intrinsic in ARM backend. // +// In X86, this can be further optimized into a set of target +// specific loads followed by an optimized sequence of shuffles. +// // E.g. An interleaved store (Factor = 3): // %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> @@ -37,6 +40,8 @@ // It could be transformed into a st3 intrinsic in AArch64 backend or a vst3 // intrinsic in ARM backend. // +// Similarly, a set of interleaved stores can be transformed into an optimized +// sequence of shuffles followed by a set of target specific stores for X86. //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" @@ -57,8 +62,6 @@ static cl::opt<bool> LowerInterleavedAccesses( cl::desc("Enable lowering interleaved accesses to intrinsics"), cl::init(true), cl::Hidden); -static unsigned MaxFactor; // The maximum supported interleave factor. - namespace { class InterleavedAccess : public FunctionPass { @@ -70,7 +73,7 @@ public: initializeInterleavedAccessPass(*PassRegistry::getPassRegistry()); } - const char *getPassName() const override { return "Interleaved Access Pass"; } + StringRef getPassName() const override { return "Interleaved Access Pass"; } bool runOnFunction(Function &F) override; @@ -84,6 +87,9 @@ private: const TargetMachine *TM; const TargetLowering *TLI; + /// The maximum supported interleave factor. + unsigned MaxFactor; + /// \brief Transform an interleaved load into target specific intrinsics. bool lowerInterleavedLoad(LoadInst *LI, SmallVector<Instruction *, 32> &DeadInsts); @@ -144,7 +150,7 @@ static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor, /// <0, 2, 4, 6> (mask of index 0 to extract even elements) /// <1, 3, 5, 7> (mask of index 1 to extract odd elements) static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor, - unsigned &Index) { + unsigned &Index, unsigned MaxFactor) { if (Mask.size() < 2) return false; @@ -156,13 +162,19 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor, return false; } -/// \brief Check if the mask is RE-interleave mask for an interleaved store. -/// -/// I.e. <0, NumSubElts, ... , NumSubElts*(Factor - 1), 1, NumSubElts + 1, ...> +/// \brief Check if the mask can be used in an interleaved store. +// +/// It checks for a more general pattern than the RE-interleave mask. +/// I.e. <x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...> +/// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35> +/// E.g. For a Factor of 3 (LaneLen=4): <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> +/// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5> /// -/// E.g. The RE-interleave mask (Factor = 2) could be: -/// <0, 4, 1, 5, 2, 6, 3, 7> -static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor) { +/// The particular case of an RE-interleave mask is: +/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...> +/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7> +static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor, + unsigned MaxFactor, unsigned OpNumElts) { unsigned NumElts = Mask.size(); if (NumElts < 4) return false; @@ -172,21 +184,75 @@ static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor) { if (NumElts % Factor) continue; - unsigned NumSubElts = NumElts / Factor; - if (!isPowerOf2_32(NumSubElts)) + unsigned LaneLen = NumElts / Factor; + if (!isPowerOf2_32(LaneLen)) continue; - // Check whether each element matchs the RE-interleaved rule. Ignore undef - // elements. - unsigned i = 0; - for (; i < NumElts; i++) - if (Mask[i] >= 0 && - static_cast<unsigned>(Mask[i]) != - (i % Factor) * NumSubElts + i / Factor) + // Check whether each element matches the general interleaved rule. + // Ignore undef elements, as long as the defined elements match the rule. + // Outer loop processes all factors (x, y, z in the above example) + unsigned I = 0, J; + for (; I < Factor; I++) { + unsigned SavedLaneValue; + unsigned SavedNoUndefs = 0; + + // Inner loop processes consecutive accesses (x, x+1... in the example) + for (J = 0; J < LaneLen - 1; J++) { + // Lane computes x's position in the Mask + unsigned Lane = J * Factor + I; + unsigned NextLane = Lane + Factor; + int LaneValue = Mask[Lane]; + int NextLaneValue = Mask[NextLane]; + + // If both are defined, values must be sequential + if (LaneValue >= 0 && NextLaneValue >= 0 && + LaneValue + 1 != NextLaneValue) + break; + + // If the next value is undef, save the current one as reference + if (LaneValue >= 0 && NextLaneValue < 0) { + SavedLaneValue = LaneValue; + SavedNoUndefs = 1; + } + + // Undefs are allowed, but defined elements must still be consecutive: + // i.e.: x,..., undef,..., x + 2,..., undef,..., undef,..., x + 5, .... + // Verify this by storing the last non-undef followed by an undef + // Check that following non-undef masks are incremented with the + // corresponding distance. + if (SavedNoUndefs > 0 && LaneValue < 0) { + SavedNoUndefs++; + if (NextLaneValue >= 0 && + SavedLaneValue + SavedNoUndefs != (unsigned)NextLaneValue) + break; + } + } + + if (J < LaneLen - 1) break; - // Find a RE-interleaved mask of current factor. - if (i == NumElts) + int StartMask = 0; + if (Mask[I] >= 0) { + // Check that the start of the I range (J=0) is greater than 0 + StartMask = Mask[I]; + } else if (Mask[(LaneLen - 1) * Factor + I] >= 0) { + // StartMask defined by the last value in lane + StartMask = Mask[(LaneLen - 1) * Factor + I] - J; + } else if (SavedNoUndefs > 0) { + // StartMask defined by some non-zero value in the j loop + StartMask = SavedLaneValue - (LaneLen - 1 - SavedNoUndefs); + } + // else StartMask remains set to 0, i.e. all elements are undefs + + if (StartMask < 0) + break; + // We must stay within the vectors; This case can happen with undefs. + if (StartMask + LaneLen > OpNumElts*2) + break; + } + + // Found an interleaved mask of current factor. + if (I == Factor) return true; } @@ -224,7 +290,8 @@ bool InterleavedAccess::lowerInterleavedLoad( unsigned Factor, Index; // Check if the first shufflevector is DE-interleave shuffle. - if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index)) + if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index, + MaxFactor)) return false; // Holds the corresponding index for each DE-interleave shuffle. @@ -342,7 +409,8 @@ bool InterleavedAccess::lowerInterleavedStore( // Check if the shufflevector is RE-interleave shuffle. unsigned Factor; - if (!isReInterleaveMask(SVI->getShuffleMask(), Factor)) + unsigned OpNumElts = SVI->getOperand(0)->getType()->getVectorNumElements(); + if (!isReInterleaveMask(SVI->getShuffleMask(), Factor, MaxFactor, OpNumElts)) return false; DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n"); |