SixLabors · JimBobSquarePants · Oct 23, 2020 · Oct 21, 2020 · Oct 21, 2020 · Oct 21, 2020
diff --git a/src/ImageSharp/Common/Helpers/ImageMaths.cs b/src/ImageSharp/Common/Helpers/ImageMaths.cs
@@ -132,6 +132,12 @@ public static int LeastCommonMultiple(int a, int b)
             return (a / GreatestCommonDivisor(a, b)) * b;
         }
 
+        /// <summary>
+        /// Calculates <paramref name="x"/> % 2 for non-negative <paramref name="x"/>; a negative odd value returns 1 (not -1) because only the lowest bit is tested.
+        /// </summary>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static int Modulo2(int x) => x & 1;
+
         /// <summary>
         /// Calculates <paramref name="x"/> % 4
         /// </summary>

diff --git a/src/ImageSharp/Common/Helpers/Vector4Utilities.cs b/src/ImageSharp/Common/Helpers/Vector4Utilities.cs
@@ -5,6 +5,10 @@
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 namespace SixLabors.ImageSharp
 {
@@ -13,6 +17,10 @@ namespace SixLabors.ImageSharp
     /// </summary>
     internal static class Vector4Utilities
     {
+        private const int BlendAlphaControl = 0b10001000; // Bits 3 and 7 are set: the W slots of two Vector4 laid out as eight floats.
+
+        private static ReadOnlySpan<byte> PermuteAlphaMask8x32 => new byte[] { 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0 }; // Read as eight little-endian Int32: 3, 3, 3, 3, 7, 7, 7, 7.
+
         /// <summary>
         /// Restricts a vector between a minimum and a maximum value.
         /// 5x Faster then <see cref="Vector4.Clamp(Vector4, Vector4, Vector4)"/>.
@@ -56,13 +64,42 @@ public static void UnPremultiply(ref Vector4 source)
         [MethodImpl(InliningOptions.ShortMethod)]
         public static void Premultiply(Span<Vector4> vectors)
         {
-            // TODO: This method can be AVX2 optimized using Vector<float>
-            ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx2.IsSupported && vectors.Length >= 2) // One Vector256<float> spans two Vector4, so this path needs at least one pair.
+            {
+                ref Vector256<float> vectorsBase =
+                    ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(vectors)); // Reinterpret the span as blocks of eight floats.
 
-            for (int i = 0; i < vectors.Length; i++)
+                Vector256<int> mask =
+                    Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(PermuteAlphaMask8x32)); // Load the 32 mask bytes as eight Int32 permute indices.
+
+                // Length / 2 full blocks exist (8 floats per Vector256<float>, 4 per Vector4); vectorsLast points one past the final full block.
+                ref Vector256<float> vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u));
+
+                while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast))
+                {
+                    Vector256<float> source = vectorsBase;
+                    Vector256<float> multiply = Avx2.PermuteVar8x32(source, mask); // Each half becomes (W, W, W, W) of its own Vector4.
+                    vectorsBase = Avx.Blend(Avx.Multiply(source, multiply), source, BlendAlphaControl); // Scale by W, then take elements 3 and 7 (W) back from source.
+                    vectorsBase = ref Unsafe.Add(ref vectorsBase, 1); // Advance by one Vector256<float>, i.e. two Vector4.
+                }
+
+                if (ImageMaths.Modulo2(vectors.Length) != 0)
+                {
+                    // An odd length leaves exactly one trailing Vector4 that the paired loop did not reach; process it with the single-vector overload.
+                    Premultiply(ref MemoryMarshal.GetReference(vectors.Slice(vectors.Length - 1)));
+                }
+            }
+            else
+#endif
             {
-                ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
-                Premultiply(ref v);
+                ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+
+                for (int i = 0; i < vectors.Length; i++)
+                {
+                    ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
+                    Premultiply(ref v);
+                }
             }
         }
 
@@ -73,13 +110,42 @@ public static void Premultiply(Span<Vector4> vectors)
         [MethodImpl(InliningOptions.ShortMethod)]
         public static void UnPremultiply(Span<Vector4> vectors)
         {
-            // TODO: This method can be AVX2 optimized using Vector<float>
-            ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx2.IsSupported && vectors.Length >= 2) // One Vector256<float> spans two Vector4, so this path needs at least one pair.
+            {
+                ref Vector256<float> vectorsBase =
+                    ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(vectors)); // Reinterpret the span as blocks of eight floats.
 
-            for (int i = 0; i < vectors.Length; i++)
+                Vector256<int> mask =
+                    Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(PermuteAlphaMask8x32)); // Load the 32 mask bytes as eight Int32 permute indices.
+
+                // Length / 2 full blocks exist (8 floats per Vector256<float>, 4 per Vector4); vectorsLast points one past the final full block.
+                ref Vector256<float> vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u));
+
+                while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast))
+                {
+                    Vector256<float> source = vectorsBase;
+                    Vector256<float> multiply = Avx2.PermuteVar8x32(source, mask); // Each half becomes (W, W, W, W) of its own Vector4.
+                    vectorsBase = Avx.Blend(Avx.Divide(source, multiply), source, BlendAlphaControl); // Divide by W, then take elements 3 and 7 (W) back from source.
+                    vectorsBase = ref Unsafe.Add(ref vectorsBase, 1); // Advance by one Vector256<float>, i.e. two Vector4.
+                }
+
+                if (ImageMaths.Modulo2(vectors.Length) != 0)
+                {
+                    // An odd length leaves exactly one trailing Vector4 that the paired loop did not reach; process it with the single-vector overload.
+                    UnPremultiply(ref MemoryMarshal.GetReference(vectors.Slice(vectors.Length - 1)));
+                }
+            }
+            else
+#endif
             {
-                ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
-                UnPremultiply(ref v);
+                ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+
+                for (int i = 0; i < vectors.Length; i++)
+                {
+                    ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
+                    UnPremultiply(ref v);
+                }
             }
         }
 

diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PremultiplyVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PremultiplyVector4.cs
@@ -0,0 +1,68 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using BenchmarkDotNet.Attributes;
+
+namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
+{
+    /// <summary>
+    /// Compares <c>Vector4Utilities.Premultiply</c> over a whole buffer with a plain per-element loop.
+    /// Both benchmarks write their results back into the same shared static array.
+    /// </summary>
+    [Config(typeof(Config.ShortCore31))]
+    public class PremultiplyVector4
+    {
+        private static readonly Vector4[] Vectors = CreateVectors();
+
+        [Benchmark(Baseline = true)]
+        public void PremultiplyBaseline()
+        {
+            ref Vector4 first = ref MemoryMarshal.GetReference<Vector4>(Vectors);
+
+            for (int index = 0; index < Vectors.Length; index++)
+            {
+                Premultiply(ref Unsafe.Add(ref first, index));
+            }
+        }
+
+        [Benchmark]
+        public void Premultiply() => Vector4Utilities.Premultiply(Vectors);
+
+        /// <summary>
+        /// Multiplies every component by W, then writes the original W back.
+        /// </summary>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private static void Premultiply(ref Vector4 source)
+        {
+            float alpha = source.W;
+            source *= alpha;
+            source.W = alpha;
+        }
+
+        // Fixed seed, so every run starts from the same 2048 vectors.
+        private static Vector4[] CreateVectors() => GenerateRandomVectorArray(new Random(42), 2048, 0, 1);
+
+        private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal)
+        {
+            var result = new Vector4[length];
+
+            for (int index = 0; index < length; index++)
+            {
+                float x = GetRandomFloat(rnd, minVal, maxVal);
+                float y = GetRandomFloat(rnd, minVal, maxVal);
+                float z = GetRandomFloat(rnd, minVal, maxVal);
+                float w = GetRandomFloat(rnd, minVal, maxVal);
+                result[index] = new Vector4(x, y, z, w);
+            }
+
+            return result;
+        }
+
+        private static float GetRandomFloat(Random rnd, float minVal, float maxVal)
+            => minVal + ((float)rnd.NextDouble() * (maxVal - minVal));
+    }
+}
diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/UnPremultiplyVector4.cs
@@ -0,0 +1,68 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using BenchmarkDotNet.Attributes;
+
+namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
+{
+    /// <summary>
+    /// Compares <c>Vector4Utilities.UnPremultiply</c> over a whole buffer with a plain per-element loop.
+    /// Both benchmarks write their results back into the same shared static array.
+    /// </summary>
+    [Config(typeof(Config.ShortCore31))]
+    public class UnPremultiplyVector4
+    {
+        private static readonly Vector4[] Vectors = CreateVectors();
+
+        [Benchmark(Baseline = true)]
+        public void UnPremultiplyBaseline()
+        {
+            ref Vector4 baseRef = ref MemoryMarshal.GetReference<Vector4>(Vectors);
+
+            for (int i = 0; i < Vectors.Length; i++)
+            {
+                ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
+                UnPremultiply(ref v);
+            }
+        }
+
+        [Benchmark]
+        public void UnPremultiply() => Vector4Utilities.UnPremultiply(Vectors);
+
+        /// <summary>
+        /// Reverses premultiplication: divides every component by W, then writes the original W back.
+        /// </summary>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private static void UnPremultiply(ref Vector4 source)
+        {
+            float w = source.W;
+            source /= w; // Must divide so the baseline does the same work as Vector4Utilities.UnPremultiply.
+            source.W = w;
+        }
+
+        private static Vector4[] CreateVectors() => GenerateRandomVectorArray(new Random(42), 2048, 0, 1);
+
+        private static Vector4[] GenerateRandomVectorArray(Random rnd, int length, float minVal, float maxVal)
+        {
+            var values = new Vector4[length];
+
+            for (int i = 0; i < length; i++)
+            {
+                ref Vector4 v = ref values[i];
+                v.X = GetRandomFloat(rnd, minVal, maxVal);
+                v.Y = GetRandomFloat(rnd, minVal, maxVal);
+                v.Z = GetRandomFloat(rnd, minVal, maxVal);
+                v.W = GetRandomFloat(rnd, minVal, maxVal);
+            }
+
+            return values;
+        }
+
+        private static float GetRandomFloat(Random rnd, float minVal, float maxVal)
+            => ((float)rnd.NextDouble() * (maxVal - minVal)) + minVal;
+    }
+}
diff --git a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs
@@ -10,6 +10,21 @@ namespace SixLabors.ImageSharp.Tests.Helpers
 {
     public class ImageMathsTests
     {
+        // Checks ImageMaths.Modulo2 against the built-in remainder operator.
+        // Every input below is non-negative; in C# a negative odd value gives -1 for % 2,
+        // so this comparison only describes the expected result for values >= 0.
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        [InlineData(2)]
+        [InlineData(3)]
+        [InlineData(4)]
+        [InlineData(100)]
+        [InlineData(123)]
+        [InlineData(53436353)]
+        public void Modulo2(int x)
+            => Assert.Equal(x % 2, ImageMaths.Modulo2(x));
+
         [Theory]
         [InlineData(0)]
         [InlineData(1)]

diff --git a/tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs b/tests/ImageSharp.Tests/Helpers/Vector4UtilsTests.cs
@@ -17,6 +17,7 @@ public class Vector4UtilsTests
         [InlineData(0)]
         [InlineData(1)]
         [InlineData(30)]
+        [InlineData(63)]
         public void Premultiply_VectorSpan(int length)
         {
             var rnd = new Random(42);
@@ -36,6 +37,7 @@ public void Premultiply_VectorSpan(int length)
         [InlineData(0)]
         [InlineData(1)]
         [InlineData(30)]
+        [InlineData(63)]
         public void UnPremultiply_VectorSpan(int length)
         {
             var rnd = new Random(42);