Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/ImageSharp/Common/Helpers/ImageMaths.cs
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,12 @@ public static int LeastCommonMultiple(int a, int b)
return (a / GreatestCommonDivisor(a, b)) * b;
}

/// <summary>
/// Returns the lowest bit of <paramref name="x"/>, which equals <paramref name="x"/> % 2 for non-negative input.
/// </summary>
/// <remarks>
/// The result is always 0 or 1. For negative odd values this differs from the <c>%</c> operator, which yields -1.
/// </remarks>
/// <param name="x">The value to reduce.</param>
/// <returns>0 when <paramref name="x"/> is even; otherwise 1.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Modulo2(int x)
{
    return x & 1;
}

/// <summary>
/// Calculates <paramref name="x"/> % 4
/// </summary>
Expand Down
86 changes: 76 additions & 10 deletions src/ImageSharp/Common/Helpers/Vector4Utilities.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif

namespace SixLabors.ImageSharp
{
Expand All @@ -13,6 +17,10 @@ namespace SixLabors.ImageSharp
/// </summary>
internal static class Vector4Utilities
{
// Control byte passed to Avx.Blend over 8 packed floats. Only bits 3 and 7 are set, which are the
// W (alpha) positions of the two Vector4 values held in one Vector256<float>; for those positions
// the blend takes the element from its second operand.
private const int BlendAlphaControl = 0b10001000;

// Raw bytes that are reinterpreted as a Vector256<int> index vector for Avx2.PermuteVar8x32.
// Read as little-endian 32-bit integers they are { 3, 3, 3, 3, 7, 7, 7, 7 }, i.e. the W element
// of each packed Vector4 is replicated across that Vector4's four positions.
private static ReadOnlySpan<byte> PermuteAlphaMask8x32 => new byte[] { 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0 };

/// <summary>
/// Restricts a vector between a minimum and a maximum value.
/// 5x Faster than <see cref="Vector4.Clamp(Vector4, Vector4, Vector4)"/>.
Expand Down Expand Up @@ -56,13 +64,42 @@ public static void UnPremultiply(ref Vector4 source)
[MethodImpl(InliningOptions.ShortMethod)]
public static void Premultiply(Span<Vector4> vectors)
{
// TODO: This method can be AVX2 optimized using Vector<float>
ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx2.IsSupported && vectors.Length >= 2)
{
ref Vector256<float> vectorsBase =
ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(vectors));

for (int i = 0; i < vectors.Length; i++)
Vector256<int> mask =
Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(PermuteAlphaMask8x32));

// Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
ref Vector256<float> vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u));

while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast))
{
Vector256<float> source = vectorsBase;
Vector256<float> multiply = Avx2.PermuteVar8x32(source, mask);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You actually don't need to permute here since you're not crossing 128-bit lanes. Avx.Shuffle(source, source, 0b_11_11_11_11) will do the same thing with lower latency while eliminating the need to load the mask register.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @saucecontrol I didn't know Shuffle had that overload. Slight speedup.

Method Mean Error StdDev Ratio Gen 0 Gen 1 Gen 2 Allocated
PremultiplyBaseline 37.64 us 1.482 us 0.081 us 1.00 - - - -
Premultiply 27.42 us 1.738 us 0.095 us 0.73 - - - -
Method Mean Error StdDev Ratio Gen 0 Gen 1 Gen 2 Allocated
UnPremultiplyBaseline 37.753 us 3.9513 us 0.2166 us 1.00 - - - -
UnPremultiply 1.322 us 0.0998 us 0.0055 us 0.04 - - - -

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still don't understand why the same method with Divide instead of Multiply results in a ~30X difference in the benchmark result though 😖

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ha, I didn't read through all the comments and missed that bit. Your baseline UnPremultiply method in the benchmark is multiplying instead of dividing, but I don't see why the vectorized version is coming out so much faster. Will have a look after sleep if you don't figure it out ;)

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I knew it! I knew there was a mistake! Thanks!

I guess my computer just doesn't like multiplying stuff. The baseline is way faster now too.

Method Mean Error StdDev Ratio Gen 0 Gen 1 Gen 2 Allocated
UnPremultiplyBaseline 2.018 us 0.1879 us 0.0103 us 1.00 - - - -
UnPremultiply 1.255 us 0.0452 us 0.0025 us 0.62 - - - -

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's strange the division is showing lower times. Is that just an iteration count difference between the Premultiply and UnPremultiply runs? BDN is too clever sometimes.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nah... Exactly the same setup. I think it's optimizing something away.

vectorsBase = Avx.Blend(Avx.Multiply(source, multiply), source, BlendAlphaControl);
vectorsBase = ref Unsafe.Add(ref vectorsBase, 1);
}

if (ImageMaths.Modulo2(vectors.Length) != 0)
{
// Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
Premultiply(ref MemoryMarshal.GetReference(vectors.Slice(vectors.Length - 1)));
}
}
else
#endif
{
ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
Premultiply(ref v);
ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);

for (int i = 0; i < vectors.Length; i++)
{
ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
Premultiply(ref v);
}
}
}

Expand All @@ -73,13 +110,42 @@ public static void Premultiply(Span<Vector4> vectors)
[MethodImpl(InliningOptions.ShortMethod)]
public static void UnPremultiply(Span<Vector4> vectors)
{
// TODO: This method can be AVX2 optimized using Vector<float>
ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
#if SUPPORTS_RUNTIME_INTRINSICS
// Vectorized path: requires AVX2 and at least two Vector4 values, since one Vector256<float>
// holds exactly two of them.
if (Avx2.IsSupported && vectors.Length >= 2)
{
ref Vector256<float> vectorsBase =
ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(vectors));

for (int i = 0; i < vectors.Length; i++)
Vector256<int> mask =
Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(PermuteAlphaMask8x32));

// Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
ref Vector256<float> vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u));

while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast))
{
// Load two packed Vector4 values, replicate each one's W across its own four positions via
// the permute mask, divide, then blend so positions 3 and 7 (W) keep their original values
// from 'source' instead of W / W.
// NOTE(review): the permute never crosses a 128-bit lane, so
// Avx.Shuffle(source, source, 0b_11_11_11_11) would presumably produce the same broadcast
// without loading the mask - confirm before changing.
Vector256<float> source = vectorsBase;
Vector256<float> multiply = Avx2.PermuteVar8x32(source, mask);
vectorsBase = Avx.Blend(Avx.Divide(source, multiply), source, BlendAlphaControl);
// Step the reference forward by one Vector256<float>, i.e. two Vector4 values.
vectorsBase = ref Unsafe.Add(ref vectorsBase, 1);
}

// An odd length leaves exactly one trailing Vector4; handle it with the scalar overload.
if (ImageMaths.Modulo2(vectors.Length) != 0)
{
// Vector4 fits neatly in pairs. Any overlap has to be equal to 1.
UnPremultiply(ref MemoryMarshal.GetReference(vectors.Slice(vectors.Length - 1)));
}
}
else
#endif
{
ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
UnPremultiply(ref v);
ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);

// Scalar fallback: un-premultiply every Vector4 individually.
for (int i = 0; i < vectors.Length; i++)
{
ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
UnPremultiply(ref v);
}
}
}

Expand Down
68 changes: 68 additions & 0 deletions tests/ImageSharp.Benchmarks/Color/Bulk/PremultiplyVector4.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using BenchmarkDotNet.Attributes;

namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
{
/// <summary>
/// Benchmarks a scalar per-element premultiply loop (the baseline) against
/// <c>Vector4Utilities.Premultiply</c> over 2048 pseudo-random vectors.
/// </summary>
/// <remarks>
/// NOTE(review): both benchmarks mutate the shared static <c>Vectors</c> array in place, so each
/// invocation starts from the previous invocation's output. With W generated between 0 and 1 the
/// X/Y/Z components shrink on every pass and presumably drift towards zero/denormal values -
/// confirm this is not skewing the timings.
/// </remarks>
[Config(typeof(Config.ShortCore31))]
public class PremultiplyVector4
{
// Input data shared by both benchmarks; created once with a fixed seed.
private static readonly Vector4[] Vectors = CreateVectors();

/// <summary>
/// Baseline: walks the array by reference and premultiplies one <see cref="Vector4"/> at a time.
/// </summary>
[Benchmark(Baseline = true)]
public void PremultiplyBaseline()
{
ref Vector4 baseRef = ref MemoryMarshal.GetReference<Vector4>(Vectors);

for (int i = 0; i < Vectors.Length; i++)
{
ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
Premultiply(ref v);
}
}

/// <summary>
/// Candidate: the bulk span overload under test.
/// </summary>
[Benchmark]
public void Premultiply()
{
Vector4Utilities.Premultiply(Vectors);
}

/// <summary>
/// Scalar reference implementation: scales the whole vector by W, then restores W.
/// </summary>
/// <param name="source">The vector to premultiply in place.</param>
[MethodImpl(InliningOptions.ShortMethod)]
private static void Premultiply(ref Vector4 source)
{
float w = source.W;
source *= w;
// The multiplication above also scaled W; put the original alpha back.
source.W = w;
}

/// <summary>
/// Builds the 2048-element input array from a fixed seed (42) so runs are repeatable.
/// </summary>
/// <returns>The generated vectors.</returns>
private static Vector4[] CreateVectors()
{
var rnd = new Random(42);
return GenerateRandomVectorArray(rnd, 2048, 0, 1);
}

/// <summary>
/// Creates an array whose X, Y, Z and W components are each drawn via <see cref="GetRandomFloat"/>.
/// </summary>
/// <param name="rnd">The random source.</param>
/// <param name="length">The number of vectors to create.</param>
/// <param name="minVal">The lower bound passed to the component generator.</param>
/// <param name="maxVal">The upper bound passed to the component generator.</param>
/// <returns>The populated array.</returns>
private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal)
{
var values = new Vector4[length];

for (int i = 0; i < length; i++)
{
ref Vector4 v = ref values[i];
v.X = GetRandomFloat(rnd, minVal, maxVal);
v.Y = GetRandomFloat(rnd, minVal, maxVal);
v.Z = GetRandomFloat(rnd, minVal, maxVal);
v.W = GetRandomFloat(rnd, minVal, maxVal);
}

return values;
}

/// <summary>
/// Maps the next value from <paramref name="rnd"/> linearly onto the range starting at
/// <paramref name="minVal"/> with width <paramref name="maxVal"/> - <paramref name="minVal"/>.
/// </summary>
private static float GetRandomFloat(Random rnd, float minVal, float maxVal)
=> ((float)rnd.NextDouble() * (maxVal - minVal)) + minVal;
}
}
68 changes: 68 additions & 0 deletions tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

using System;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using BenchmarkDotNet.Attributes;

namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
{
    /// <summary>
    /// Benchmarks a scalar per-element un-premultiply loop (the baseline) against
    /// <c>Vector4Utilities.UnPremultiply</c> over 2048 pseudo-random vectors.
    /// </summary>
    /// <remarks>
    /// NOTE(review): both benchmarks mutate the shared static <c>Vectors</c> array in place, so each
    /// invocation starts from the previous invocation's output. With W generated between 0 and 1 the
    /// X/Y/Z components grow on every pass - confirm this is not skewing the timings.
    /// </remarks>
    [Config(typeof(Config.ShortCore31))]
    public class UnPremultiplyVector4
    {
        // Input data shared by both benchmarks; created once with a fixed seed.
        private static readonly Vector4[] Vectors = CreateVectors();

        /// <summary>
        /// Baseline: walks the array by reference and un-premultiplies one <see cref="Vector4"/> at a time.
        /// </summary>
        [Benchmark(Baseline = true)]
        public void UnPremultiplyBaseline()
        {
            ref Vector4 baseRef = ref MemoryMarshal.GetReference<Vector4>(Vectors);

            for (int i = 0; i < Vectors.Length; i++)
            {
                ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
                UnPremultiply(ref v);
            }
        }

        /// <summary>
        /// Candidate: the bulk span overload under test.
        /// </summary>
        [Benchmark]
        public void UnPremultiply()
        {
            Vector4Utilities.UnPremultiply(Vectors);
        }

        /// <summary>
        /// Scalar reference implementation: divides the whole vector by W, then restores W.
        /// </summary>
        /// <param name="source">The vector to un-premultiply in place.</param>
        [MethodImpl(InliningOptions.ShortMethod)]
        private static void UnPremultiply(ref Vector4 source)
        {
            float w = source.W;

            // Un-premultiplying divides by alpha. Multiplying here would make the baseline
            // measure Premultiply and invalidate the comparison with the vectorized Divide path.
            source /= w;

            // The division above also turned W into W / W; put the original alpha back.
            source.W = w;
        }

        /// <summary>
        /// Builds the 2048-element input array from a fixed seed (42) so runs are repeatable.
        /// </summary>
        /// <returns>The generated vectors.</returns>
        private static Vector4[] CreateVectors()
        {
            var rnd = new Random(42);
            return GenerateRandomVectorArray(rnd, 2048, 0, 1);
        }

        /// <summary>
        /// Creates an array whose X, Y, Z and W components are each drawn via <see cref="GetRandomFloat"/>.
        /// </summary>
        /// <param name="rnd">The random source.</param>
        /// <param name="length">The number of vectors to create.</param>
        /// <param name="minVal">The lower bound passed to the component generator.</param>
        /// <param name="maxVal">The upper bound passed to the component generator.</param>
        /// <returns>The populated array.</returns>
        private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal)
        {
            var values = new Vector4[length];

            for (int i = 0; i < length; i++)
            {
                ref Vector4 v = ref values[i];
                v.X = GetRandomFloat(rnd, minVal, maxVal);
                v.Y = GetRandomFloat(rnd, minVal, maxVal);
                v.Z = GetRandomFloat(rnd, minVal, maxVal);
                v.W = GetRandomFloat(rnd, minVal, maxVal);
            }

            return values;
        }

        /// <summary>
        /// Maps the next value from <paramref name="rnd"/> linearly onto the range starting at
        /// <paramref name="minVal"/> with width <paramref name="maxVal"/> - <paramref name="minVal"/>.
        /// </summary>
        private static float GetRandomFloat(Random rnd, float minVal, float maxVal)
            => ((float)rnd.NextDouble() * (maxVal - minVal)) + minVal;
    }
}
15 changes: 15 additions & 0 deletions tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,21 @@ namespace SixLabors.ImageSharp.Tests.Helpers
{
public class ImageMathsTests
{
[Theory]
[InlineData(0)]
[InlineData(1)]
[InlineData(2)]
[InlineData(3)]
[InlineData(4)]
[InlineData(100)]
[InlineData(123)]
[InlineData(53436353)]
public void Modulo2(int x)
{
    // The bitwise helper must agree with the built-in remainder operator for each input above.
    int expected = x % 2;

    Assert.Equal(expected, ImageMaths.Modulo2(x));
}

[Theory]
[InlineData(0)]
[InlineData(1)]
Expand Down
2 changes: 2 additions & 0 deletions tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ public class Vector4UtilsTests
[InlineData(0)]
[InlineData(1)]
[InlineData(30)]
[InlineData(63)]
public void Premultiply_VectorSpan(int length)
{
var rnd = new Random(42);
Expand All @@ -36,6 +37,7 @@ public void Premultiply_VectorSpan(int length)
[InlineData(0)]
[InlineData(1)]
[InlineData(30)]
[InlineData(63)]
public void UnPremultiply_VectorSpan(int length)
{
var rnd = new Random(42);
Expand Down