Optimize BIC, BSL, BIT, BIF, XTN, ZIP, DUP (Gp), FMADD (Scalar) and FCVT (Scalar) using SSE intrinsics (#405)
* Optimize BIC, BSL, BIT, BIF, XTN, ZIP, DUP (Gp), FMADD (Scalar) and FCVT (Scalar) using SSE intrinsics, some CQ improvements
* Remove useless space
* Address PR feedback
* Revert EmitVectorZero32_128 changes
This commit is contained in:
parent
40282da93a
commit
0b52ee6627
8 changed files with 896 additions and 276 deletions
|
@ -227,7 +227,16 @@ namespace ChocolArm64.Instruction
|
|||
/// <summary>
/// Extracts the 64-bit element at <paramref name="Index"/> from <paramref name="Vector"/>
/// and returns those bits interpreted as a <see cref="double"/>.
/// </summary>
/// <param name="Vector">Source vector, typed as float lanes and cast to 64-bit lanes for the extraction.</param>
/// <param name="Index">Index of the 64-bit element to extract.</param>
/// <returns>The selected element's raw bits converted with <see cref="BitConverter.Int64BitsToDouble"/>.</returns>
/// <exception cref="PlatformNotSupportedException">Neither SSE4.1 nor SSE2 is supported.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static double VectorExtractDouble(Vector128<float> Vector, byte Index)
{
    //The former unconditional return through VectorExtractIntSx was dropped:
    //it made the hardware dispatch below unreachable.
    if (Sse41.IsSupported)
    {
        //SSE4.1 can extract the 64-bit lane directly.
        return BitConverter.Int64BitsToDouble(Sse41.Extract(Sse.StaticCast<float, long>(Vector), Index));
    }
    else if (Sse2.IsSupported)
    {
        //SSE2 fallback: reuse the zero-extending integer extract (Size 3 selects 64-bit elements).
        return BitConverter.Int64BitsToDouble((long)VectorExtractIntZx(Vector, Index, 3));
    }

    throw new PlatformNotSupportedException();
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
|
@ -235,41 +244,49 @@ namespace ChocolArm64.Instruction
|
|||
{
|
||||
if (Sse41.IsSupported)
|
||||
{
|
||||
switch (Size)
|
||||
if (Size == 0)
|
||||
{
|
||||
case 0:
|
||||
return (sbyte)Sse41.Extract(Sse.StaticCast<float, byte>(Vector), Index);
|
||||
|
||||
case 1:
|
||||
return (short)Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), Index);
|
||||
|
||||
case 2:
|
||||
return Sse41.Extract(Sse.StaticCast<float, int>(Vector), Index);
|
||||
|
||||
case 3:
|
||||
return Sse41.Extract(Sse.StaticCast<float, long>(Vector), Index);
|
||||
return (sbyte)Sse41.Extract(Sse.StaticCast<float, byte>(Vector), Index);
|
||||
}
|
||||
else if (Size == 1)
|
||||
{
|
||||
return (short)Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), Index);
|
||||
}
|
||||
else if (Size == 2)
|
||||
{
|
||||
return Sse41.Extract(Sse.StaticCast<float, int>(Vector), Index);
|
||||
}
|
||||
else if (Size == 3)
|
||||
{
|
||||
return Sse41.Extract(Sse.StaticCast<float, long>(Vector), Index);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(Size));
|
||||
}
|
||||
|
||||
throw new ArgumentOutOfRangeException(nameof(Size));
|
||||
}
|
||||
else if (Sse2.IsSupported)
|
||||
{
|
||||
switch (Size)
|
||||
if (Size == 0)
|
||||
{
|
||||
case 0:
|
||||
return (sbyte)VectorExtractIntZx(Vector, Index, Size);
|
||||
|
||||
case 1:
|
||||
return (short)VectorExtractIntZx(Vector, Index, Size);
|
||||
|
||||
case 2:
|
||||
return (int)VectorExtractIntZx(Vector, Index, Size);
|
||||
|
||||
case 3:
|
||||
return (long)VectorExtractIntZx(Vector, Index, Size);
|
||||
return (sbyte)VectorExtractIntZx(Vector, Index, Size);
|
||||
}
|
||||
else if (Size == 1)
|
||||
{
|
||||
return (short)VectorExtractIntZx(Vector, Index, Size);
|
||||
}
|
||||
else if (Size == 2)
|
||||
{
|
||||
return (int)VectorExtractIntZx(Vector, Index, Size);
|
||||
}
|
||||
else if (Size == 3)
|
||||
{
|
||||
return (long)VectorExtractIntZx(Vector, Index, Size);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(Size));
|
||||
}
|
||||
|
||||
throw new ArgumentOutOfRangeException(nameof(Size));
|
||||
}
|
||||
|
||||
throw new PlatformNotSupportedException();
|
||||
|
@ -280,22 +297,26 @@ namespace ChocolArm64.Instruction
|
|||
{
|
||||
if (Sse41.IsSupported)
|
||||
{
|
||||
switch (Size)
|
||||
if (Size == 0)
|
||||
{
|
||||
case 0:
|
||||
return Sse41.Extract(Sse.StaticCast<float, byte>(Vector), Index);
|
||||
|
||||
case 1:
|
||||
return Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), Index);
|
||||
|
||||
case 2:
|
||||
return Sse41.Extract(Sse.StaticCast<float, uint>(Vector), Index);
|
||||
|
||||
case 3:
|
||||
return Sse41.Extract(Sse.StaticCast<float, ulong>(Vector), Index);
|
||||
return Sse41.Extract(Sse.StaticCast<float, byte>(Vector), Index);
|
||||
}
|
||||
else if (Size == 1)
|
||||
{
|
||||
return Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), Index);
|
||||
}
|
||||
else if (Size == 2)
|
||||
{
|
||||
return Sse41.Extract(Sse.StaticCast<float, uint>(Vector), Index);
|
||||
}
|
||||
else if (Size == 3)
|
||||
{
|
||||
return Sse41.Extract(Sse.StaticCast<float, ulong>(Vector), Index);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(Size));
|
||||
}
|
||||
|
||||
throw new ArgumentOutOfRangeException(nameof(Size));
|
||||
}
|
||||
else if (Sse2.IsSupported)
|
||||
{
|
||||
|
@ -305,35 +326,35 @@ namespace ChocolArm64.Instruction
|
|||
|
||||
ushort Value = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)ShortIdx);
|
||||
|
||||
switch (Size)
|
||||
if (Size == 0)
|
||||
{
|
||||
case 0:
|
||||
return (byte)(Value >> (Index & 1) * 8);
|
||||
|
||||
case 1:
|
||||
return Value;
|
||||
|
||||
case 2:
|
||||
case 3:
|
||||
{
|
||||
ushort Value1 = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)(ShortIdx + 1));
|
||||
|
||||
if (Size == 2)
|
||||
{
|
||||
return (uint)(Value | (Value1 << 16));
|
||||
}
|
||||
|
||||
ushort Value2 = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)(ShortIdx + 2));
|
||||
ushort Value3 = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)(ShortIdx + 3));
|
||||
|
||||
return ((ulong)Value << 0) |
|
||||
((ulong)Value1 << 16) |
|
||||
((ulong)Value2 << 32) |
|
||||
((ulong)Value3 << 48);
|
||||
}
|
||||
return (byte)(Value >> (Index & 1) * 8);
|
||||
}
|
||||
else if (Size == 1)
|
||||
{
|
||||
return Value;
|
||||
}
|
||||
else if (Size == 2 || Size == 3)
|
||||
{
|
||||
ushort Value1 = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)(ShortIdx + 1));
|
||||
|
||||
throw new ArgumentOutOfRangeException(nameof(Size));
|
||||
if (Size == 2)
|
||||
{
|
||||
return (uint)(Value | (Value1 << 16));
|
||||
}
|
||||
|
||||
ushort Value2 = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)(ShortIdx + 2));
|
||||
ushort Value3 = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)(ShortIdx + 3));
|
||||
|
||||
return ((ulong)Value << 0) |
|
||||
((ulong)Value1 << 16) |
|
||||
((ulong)Value2 << 32) |
|
||||
((ulong)Value3 << 48);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(Size));
|
||||
}
|
||||
}
|
||||
|
||||
throw new PlatformNotSupportedException();
|
||||
|
@ -370,22 +391,26 @@ namespace ChocolArm64.Instruction
|
|||
{
|
||||
if (Sse41.IsSupported)
|
||||
{
|
||||
switch (Size)
|
||||
if (Size == 0)
|
||||
{
|
||||
case 0:
|
||||
return Sse.StaticCast<byte, float>(Sse41.Insert(Sse.StaticCast<float, byte>(Vector), (byte)Value, Index));
|
||||
|
||||
case 1:
|
||||
return Sse.StaticCast<ushort, float>(Sse2.Insert(Sse.StaticCast<float, ushort>(Vector), (ushort)Value, Index));
|
||||
|
||||
case 2:
|
||||
return Sse.StaticCast<uint, float>(Sse41.Insert(Sse.StaticCast<float, uint>(Vector), (uint)Value, Index));
|
||||
|
||||
case 3:
|
||||
return Sse.StaticCast<ulong, float>(Sse41.Insert(Sse.StaticCast<float, ulong>(Vector), Value, Index));
|
||||
return Sse.StaticCast<byte, float>(Sse41.Insert(Sse.StaticCast<float, byte>(Vector), (byte)Value, Index));
|
||||
}
|
||||
else if (Size == 1)
|
||||
{
|
||||
return Sse.StaticCast<ushort, float>(Sse2.Insert(Sse.StaticCast<float, ushort>(Vector), (ushort)Value, Index));
|
||||
}
|
||||
else if (Size == 2)
|
||||
{
|
||||
return Sse.StaticCast<uint, float>(Sse41.Insert(Sse.StaticCast<float, uint>(Vector), (uint)Value, Index));
|
||||
}
|
||||
else if (Size == 3)
|
||||
{
|
||||
return Sse.StaticCast<ulong, float>(Sse41.Insert(Sse.StaticCast<float, ulong>(Vector), Value, Index));
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(Size));
|
||||
}
|
||||
|
||||
throw new ArgumentOutOfRangeException(nameof(Size));
|
||||
}
|
||||
else if (Sse2.IsSupported)
|
||||
{
|
||||
|
@ -395,41 +420,39 @@ namespace ChocolArm64.Instruction
|
|||
? Index >> 1
|
||||
: Index << (Size - 1);
|
||||
|
||||
switch (Size)
|
||||
if (Size == 0)
|
||||
{
|
||||
case 0:
|
||||
{
|
||||
ushort ShortVal = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)ShortIdx);
|
||||
ushort ShortVal = Sse2.Extract(Sse.StaticCast<float, ushort>(Vector), (byte)ShortIdx);
|
||||
|
||||
int Shift = (Index & 1) * 8;
|
||||
int Shift = (Index & 1) * 8;
|
||||
|
||||
ShortVal &= (ushort)(0xff00 >> Shift);
|
||||
ShortVal &= (ushort)(0xff00 >> Shift);
|
||||
|
||||
ShortVal |= (ushort)((byte)Value << Shift);
|
||||
ShortVal |= (ushort)((byte)Value << Shift);
|
||||
|
||||
return Sse.StaticCast<ushort, float>(Sse2.Insert(ShortVector, ShortVal, (byte)ShortIdx));
|
||||
}
|
||||
|
||||
case 1:
|
||||
return Sse.StaticCast<ushort, float>(Sse2.Insert(Sse.StaticCast<float, ushort>(Vector), (ushort)Value, Index));
|
||||
|
||||
case 2:
|
||||
case 3:
|
||||
{
|
||||
ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 0), (byte)(ShortIdx + 0));
|
||||
ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 16), (byte)(ShortIdx + 1));
|
||||
|
||||
if (Size == 3)
|
||||
{
|
||||
ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 32), (byte)(ShortIdx + 2));
|
||||
ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 48), (byte)(ShortIdx + 3));
|
||||
}
|
||||
|
||||
return Sse.StaticCast<ushort, float>(ShortVector);
|
||||
}
|
||||
return Sse.StaticCast<ushort, float>(Sse2.Insert(ShortVector, ShortVal, (byte)ShortIdx));
|
||||
}
|
||||
else if (Size == 1)
|
||||
{
|
||||
return Sse.StaticCast<ushort, float>(Sse2.Insert(Sse.StaticCast<float, ushort>(Vector), (ushort)Value, Index));
|
||||
}
|
||||
else if (Size == 2 || Size == 3)
|
||||
{
|
||||
ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 0), (byte)(ShortIdx + 0));
|
||||
ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 16), (byte)(ShortIdx + 1));
|
||||
|
||||
throw new ArgumentOutOfRangeException(nameof(Size));
|
||||
if (Size == 3)
|
||||
{
|
||||
ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 32), (byte)(ShortIdx + 2));
|
||||
ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 48), (byte)(ShortIdx + 3));
|
||||
}
|
||||
|
||||
return Sse.StaticCast<ushort, float>(ShortVector);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(Size));
|
||||
}
|
||||
}
|
||||
|
||||
throw new PlatformNotSupportedException();
|
||||
|
@ -440,7 +463,29 @@ namespace ChocolArm64.Instruction
|
|||
{
|
||||
if (Sse41.IsSupported)
|
||||
{
|
||||
return Sse41.Insert(Vector, Value, (byte)(Index << 4));
|
||||
//Note: The if/else if is necessary to enable the JIT to
|
||||
//produce a single INSERTPS instruction instead of the
|
||||
//jump table fallback.
|
||||
if (Index == 0)
|
||||
{
|
||||
return Sse41.Insert(Vector, Value, 0x00);
|
||||
}
|
||||
else if (Index == 1)
|
||||
{
|
||||
return Sse41.Insert(Vector, Value, 0x10);
|
||||
}
|
||||
else if (Index == 2)
|
||||
{
|
||||
return Sse41.Insert(Vector, Value, 0x20);
|
||||
}
|
||||
else if (Index == 3)
|
||||
{
|
||||
return Sse41.Insert(Vector, Value, 0x30);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(Index));
|
||||
}
|
||||
}
|
||||
else if (Sse2.IsSupported)
|
||||
{
|
||||
|
@ -460,6 +505,79 @@ namespace ChocolArm64.Instruction
|
|||
throw new PlatformNotSupportedException();
|
||||
}
|
||||
|
||||
/// <summary>
/// Inserts <paramref name="Value"/> into <paramref name="Vector"/> with the SSE4.1 insert
/// intrinsic, using a control byte whose mask zeroes the upper bits of the result.
/// </summary>
/// <param name="Value">Scalar single-precision value to insert.</param>
/// <param name="Vector">Vector passed as the destination operand of the insert.</param>
/// <returns>The vector produced by <c>Sse41.Insert</c> with control byte 0b1110.</returns>
/// <remarks>No <c>Sse41.IsSupported</c> check is performed here.</remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<float> Sse41VectorInsertScalarSingle(float Value, Vector128<float> Vector)
{
    //Note: this control byte is the mask that zeroes the upper bits.
    const byte ZeroUpperMask = 0b1110;

    Vector128<float> Result = Sse41.Insert(Vector, Value, ZeroUpperMask);

    return Result;
}
|
||||
|
||||
/// <summary>
/// Returns a zeroed 128-bit vector of <see cref="sbyte"/> elements.
/// </summary>
/// <exception cref="PlatformNotSupportedException">SSE2 is not supported.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<sbyte> VectorSByteZero()
{
    //Guard first: without SSE2 there is no way to build the vector.
    if (!Sse2.IsSupported)
    {
        throw new PlatformNotSupportedException();
    }

    return Sse2.SetZeroVector128<sbyte>();
}
|
||||
|
||||
/// <summary>
/// Returns a zeroed 128-bit vector of <see cref="short"/> elements.
/// </summary>
/// <exception cref="PlatformNotSupportedException">SSE2 is not supported.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<short> VectorInt16Zero()
{
    //Guard first: without SSE2 there is no way to build the vector.
    if (!Sse2.IsSupported)
    {
        throw new PlatformNotSupportedException();
    }

    return Sse2.SetZeroVector128<short>();
}
|
||||
|
||||
/// <summary>
/// Returns a zeroed 128-bit vector of <see cref="int"/> elements.
/// </summary>
/// <exception cref="PlatformNotSupportedException">SSE2 is not supported.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<int> VectorInt32Zero()
{
    //Guard first: without SSE2 there is no way to build the vector.
    if (!Sse2.IsSupported)
    {
        throw new PlatformNotSupportedException();
    }

    return Sse2.SetZeroVector128<int>();
}
|
||||
|
||||
/// <summary>
/// Returns a zeroed 128-bit vector of <see cref="long"/> elements.
/// </summary>
/// <exception cref="PlatformNotSupportedException">SSE2 is not supported.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<long> VectorInt64Zero()
{
    //Guard first: without SSE2 there is no way to build the vector.
    if (!Sse2.IsSupported)
    {
        throw new PlatformNotSupportedException();
    }

    return Sse2.SetZeroVector128<long>();
}
|
||||
|
||||
/// <summary>
/// Returns a zeroed 128-bit vector of <see cref="float"/> elements.
/// </summary>
/// <exception cref="PlatformNotSupportedException">SSE is not supported.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<float> VectorSingleZero()
{
    //Only baseline SSE is needed for the single-precision zero vector.
    if (!Sse.IsSupported)
    {
        throw new PlatformNotSupportedException();
    }

    return Sse.SetZeroVector128();
}
|
||||
|
||||
/// <summary>
/// Returns a zeroed 128-bit vector of <see cref="double"/> elements.
/// </summary>
/// <exception cref="PlatformNotSupportedException">SSE2 is not supported.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<double> VectorDoubleZero()
{
    //Guard first: without SSE2 there is no way to build the vector.
    if (!Sse2.IsSupported)
    {
        throw new PlatformNotSupportedException();
    }

    return Sse2.SetZeroVector128<double>();
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static Vector128<float> VectorZero32_128(Vector128<float> Vector)
|
||||
{
|
||||
|
@ -515,6 +633,50 @@ namespace ChocolArm64.Instruction
|
|||
throw new PlatformNotSupportedException();
|
||||
}
|
||||
|
||||
/// <summary>
/// Casts a vector of <see cref="float"/> lanes to a vector of <see cref="byte"/> lanes
/// through <c>Sse.StaticCast</c>.
/// </summary>
/// <param name="Vector">Vector to cast.</param>
/// <returns>The result of <c>Sse.StaticCast&lt;float, byte&gt;</c> on <paramref name="Vector"/>.</returns>
/// <exception cref="PlatformNotSupportedException">SSE is not supported.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<byte> VectorSingleToByte(Vector128<float> Vector)
{
    if (!Sse.IsSupported)
    {
        throw new PlatformNotSupportedException();
    }

    return Sse.StaticCast<float, byte>(Vector);
}
|
||||
|
||||
/// <summary>
/// Casts a vector of <see cref="float"/> lanes to a vector of <see cref="ushort"/> lanes
/// through <c>Sse.StaticCast</c>.
/// </summary>
/// <param name="Vector">Vector to cast.</param>
/// <returns>The result of <c>Sse.StaticCast&lt;float, ushort&gt;</c> on <paramref name="Vector"/>.</returns>
/// <exception cref="PlatformNotSupportedException">SSE is not supported.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<ushort> VectorSingleToUInt16(Vector128<float> Vector)
{
    if (!Sse.IsSupported)
    {
        throw new PlatformNotSupportedException();
    }

    return Sse.StaticCast<float, ushort>(Vector);
}
|
||||
|
||||
/// <summary>
/// Casts a vector of <see cref="float"/> lanes to a vector of <see cref="uint"/> lanes
/// through <c>Sse.StaticCast</c>.
/// </summary>
/// <param name="Vector">Vector to cast.</param>
/// <returns>The result of <c>Sse.StaticCast&lt;float, uint&gt;</c> on <paramref name="Vector"/>.</returns>
/// <exception cref="PlatformNotSupportedException">SSE is not supported.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<uint> VectorSingleToUInt32(Vector128<float> Vector)
{
    if (!Sse.IsSupported)
    {
        throw new PlatformNotSupportedException();
    }

    return Sse.StaticCast<float, uint>(Vector);
}
|
||||
|
||||
/// <summary>
/// Casts a vector of <see cref="float"/> lanes to a vector of <see cref="ulong"/> lanes
/// through <c>Sse.StaticCast</c>.
/// </summary>
/// <param name="Vector">Vector to cast.</param>
/// <returns>The result of <c>Sse.StaticCast&lt;float, ulong&gt;</c> on <paramref name="Vector"/>.</returns>
/// <exception cref="PlatformNotSupportedException">SSE is not supported.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<ulong> VectorSingleToUInt64(Vector128<float> Vector)
{
    if (!Sse.IsSupported)
    {
        throw new PlatformNotSupportedException();
    }

    return Sse.StaticCast<float, ulong>(Vector);
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static Vector128<double> VectorSingleToDouble(Vector128<float> Vector)
|
||||
{
|
||||
|
@ -570,6 +732,50 @@ namespace ChocolArm64.Instruction
|
|||
throw new PlatformNotSupportedException();
|
||||
}
|
||||
|
||||
/// <summary>
/// Casts a vector of <see cref="byte"/> lanes to a vector of <see cref="float"/> lanes
/// through <c>Sse.StaticCast</c>.
/// </summary>
/// <param name="Vector">Vector to cast.</param>
/// <returns>The result of <c>Sse.StaticCast&lt;byte, float&gt;</c> on <paramref name="Vector"/>.</returns>
/// <exception cref="PlatformNotSupportedException">SSE is not supported.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<float> VectorByteToSingle(Vector128<byte> Vector)
{
    if (!Sse.IsSupported)
    {
        throw new PlatformNotSupportedException();
    }

    return Sse.StaticCast<byte, float>(Vector);
}
|
||||
|
||||
/// <summary>
/// Casts a vector of <see cref="ushort"/> lanes to a vector of <see cref="float"/> lanes
/// through <c>Sse.StaticCast</c>.
/// </summary>
/// <param name="Vector">Vector to cast.</param>
/// <returns>The result of <c>Sse.StaticCast&lt;ushort, float&gt;</c> on <paramref name="Vector"/>.</returns>
/// <exception cref="PlatformNotSupportedException">SSE is not supported.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<float> VectorUInt16ToSingle(Vector128<ushort> Vector)
{
    if (!Sse.IsSupported)
    {
        throw new PlatformNotSupportedException();
    }

    return Sse.StaticCast<ushort, float>(Vector);
}
|
||||
|
||||
/// <summary>
/// Casts a vector of <see cref="uint"/> lanes to a vector of <see cref="float"/> lanes
/// through <c>Sse.StaticCast</c>.
/// </summary>
/// <param name="Vector">Vector to cast.</param>
/// <returns>The result of <c>Sse.StaticCast&lt;uint, float&gt;</c> on <paramref name="Vector"/>.</returns>
/// <exception cref="PlatformNotSupportedException">SSE is not supported.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<float> VectorUInt32ToSingle(Vector128<uint> Vector)
{
    if (!Sse.IsSupported)
    {
        throw new PlatformNotSupportedException();
    }

    return Sse.StaticCast<uint, float>(Vector);
}
|
||||
|
||||
/// <summary>
/// Casts a vector of <see cref="ulong"/> lanes to a vector of <see cref="float"/> lanes
/// through <c>Sse.StaticCast</c>.
/// </summary>
/// <param name="Vector">Vector to cast.</param>
/// <returns>The result of <c>Sse.StaticCast&lt;ulong, float&gt;</c> on <paramref name="Vector"/>.</returns>
/// <exception cref="PlatformNotSupportedException">SSE is not supported.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<float> VectorUInt64ToSingle(Vector128<ulong> Vector)
{
    if (!Sse.IsSupported)
    {
        throw new PlatformNotSupportedException();
    }

    return Sse.StaticCast<ulong, float>(Vector);
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static Vector128<float> VectorDoubleToSingle(Vector128<double> Vector)
|
||||
{
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue