Implement Fast Paths for most A32 SIMD instructions (#952)
* Begin work on A32 SIMD Intrinsics * More instructions, some cleanup. * Intrinsics for Move instructions (zip etc). These pass the existing tests. * Intrinsics for some of Cvt. While doing this I noticed that the conversion for int/fp was incorrect in the slow path. I'll fix this in the original repo. * Intrinsics for more Arithmetic instructions. * Intrinsics for Vext * Fix VEXT Intrinsic for double words. * Use InsertPs to move scalar values. * Cleanup, fix VPADD.f32 and VMIN signed integer. * Cleanup, add SSE2 support for scalar insert. Works similarly to the IR scalar insert, but obviously this one works directly on V128. * Minor cleanup. * Enable intrinsic for FP64 to integer conversion. * Address feedback apart from splitting out intrinsic float abs. Also: treat bad VREV encodings as undefined rather than throwing in translation. * Move float abs to helper, fix bug with cvt * Rename opc2 & 3 to match A32 docs, use ArgumentOutOfRangeException appropriately. * Get name of variable at compilation rather than string literal. * Use correct double sign mask.
This commit is contained in:
parent
d9ed827696
commit
68e15c1a74
12 changed files with 2077 additions and 400 deletions
|
@ -31,7 +31,7 @@ namespace ARMeilleure.Instructions
|
|||
15L << 56 | 14L << 48 | 13L << 40 | 12L << 32 | 07L << 24 | 06L << 16 | 05L << 8 | 04L << 0 // S
|
||||
};
|
||||
|
||||
private static readonly long _zeroMask = 128L << 56 | 128L << 48 | 128L << 40 | 128L << 32 | 128L << 24 | 128L << 16 | 128L << 8 | 128L << 0;
|
||||
public static readonly long ZeroMask = 128L << 56 | 128L << 48 | 128L << 40 | 128L << 32 | 128L << 24 | 128L << 16 | 128L << 8 | 128L << 0;
|
||||
#endregion
|
||||
|
||||
#region "X86 SSE Intrinsics"
|
||||
|
@ -1026,8 +1026,8 @@ namespace ARMeilleure.Instructions
|
|||
|
||||
if (op.RegisterSize == RegisterSize.Simd64)
|
||||
{
|
||||
Operand zeroEvenMask = X86GetElements(context, _zeroMask, EvenMasks[op.Size]);
|
||||
Operand zeroOddMask = X86GetElements(context, _zeroMask, OddMasks [op.Size]);
|
||||
Operand zeroEvenMask = X86GetElements(context, ZeroMask, EvenMasks[op.Size]);
|
||||
Operand zeroOddMask = X86GetElements(context, ZeroMask, OddMasks [op.Size]);
|
||||
|
||||
Operand mN = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m); // m:n
|
||||
|
||||
|
@ -1467,6 +1467,21 @@ namespace ARMeilleure.Instructions
|
|||
return context.Call(dlg, op1, op2);
|
||||
}
|
||||
|
||||
public static Operand EmitFloatAbs(ArmEmitterContext context, Operand value, bool single, bool vector)
|
||||
{
|
||||
Operand mask;
|
||||
if (single)
|
||||
{
|
||||
mask = vector ? X86GetAllElements(context, -0f) : X86GetScalar(context, -0f);
|
||||
}
|
||||
else
|
||||
{
|
||||
mask = vector ? X86GetAllElements(context, -0d) : X86GetScalar(context, -0d);
|
||||
}
|
||||
|
||||
return context.AddIntrinsic(single ? Intrinsic.X86Andnps : Intrinsic.X86Andnpd, mask, value);
|
||||
}
|
||||
|
||||
public static Operand EmitVectorExtractSx(ArmEmitterContext context, int reg, int index, int size)
|
||||
{
|
||||
return EmitVectorExtract(context, reg, index, size, true);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue