-
-
Notifications
You must be signed in to change notification settings - Fork 893
Add AVX2 Vector4Octet.Pack implementation #1402
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
c1e6d50
ebfd069
8872b2b
eb315fe
3ae4b02
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -4,7 +4,12 @@ | |||||
| using System; | ||||||
| using System.Collections.Generic; | ||||||
| using System.Numerics; | ||||||
|
|
||||||
| using System.Runtime.CompilerServices; | ||||||
| using System.Runtime.InteropServices; | ||||||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||||||
| using System.Runtime.Intrinsics; | ||||||
| using System.Runtime.Intrinsics.X86; | ||||||
| #endif | ||||||
| using SixLabors.ImageSharp.Memory; | ||||||
| using SixLabors.ImageSharp.Tuples; | ||||||
|
|
||||||
|
|
@@ -190,45 +195,90 @@ internal struct Vector4Octet | |||||
| /// </summary> | ||||||
| public void Pack(ref Vector4Pair r, ref Vector4Pair g, ref Vector4Pair b) | ||||||
| { | ||||||
| this.V0.X = r.A.X; | ||||||
| this.V0.Y = g.A.X; | ||||||
| this.V0.Z = b.A.X; | ||||||
| this.V0.W = 1f; | ||||||
|
|
||||||
| this.V1.X = r.A.Y; | ||||||
| this.V1.Y = g.A.Y; | ||||||
| this.V1.Z = b.A.Y; | ||||||
| this.V1.W = 1f; | ||||||
|
|
||||||
| this.V2.X = r.A.Z; | ||||||
| this.V2.Y = g.A.Z; | ||||||
| this.V2.Z = b.A.Z; | ||||||
| this.V2.W = 1f; | ||||||
|
|
||||||
| this.V3.X = r.A.W; | ||||||
| this.V3.Y = g.A.W; | ||||||
| this.V3.Z = b.A.W; | ||||||
| this.V3.W = 1f; | ||||||
|
|
||||||
| this.V4.X = r.B.X; | ||||||
| this.V4.Y = g.B.X; | ||||||
| this.V4.Z = b.B.X; | ||||||
| this.V4.W = 1f; | ||||||
|
|
||||||
| this.V5.X = r.B.Y; | ||||||
| this.V5.Y = g.B.Y; | ||||||
| this.V5.Z = b.B.Y; | ||||||
| this.V5.W = 1f; | ||||||
|
|
||||||
| this.V6.X = r.B.Z; | ||||||
| this.V6.Y = g.B.Z; | ||||||
| this.V6.Z = b.B.Z; | ||||||
| this.V6.W = 1f; | ||||||
|
|
||||||
| this.V7.X = r.B.W; | ||||||
| this.V7.Y = g.B.W; | ||||||
| this.V7.Z = b.B.W; | ||||||
| this.V7.W = 1f; | ||||||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||||||
| if (Avx2.IsSupported) | ||||||
| { | ||||||
| Vector4 vo = Vector4.One; | ||||||
| Vector128<float> valpha = Unsafe.As<Vector4, Vector128<float>>(ref vo); | ||||||
|
|
||||||
| ref byte control = ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskDeinterleave8x32); | ||||||
| Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control); | ||||||
|
|
||||||
| Vector256<float> r0 = Avx.InsertVector128( | ||||||
| Unsafe.As<Vector4, Vector128<float>>(ref r.A).ToVector256(), | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I'm not getting it wrong, we can spare
Suggested change
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Weirdly when I tried that I got access violations!
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are you sure it was on
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I’ll double check. Was super surprised to see it as it didn’t make sense.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||
| Unsafe.As<Vector4, Vector128<float>>(ref g.A), | ||||||
| 1); | ||||||
|
|
||||||
| Vector256<float> r1 = Avx.InsertVector128( | ||||||
| Unsafe.As<Vector4, Vector128<float>>(ref b.A).ToVector256(), | ||||||
| valpha, | ||||||
| 1); | ||||||
|
|
||||||
| Vector256<float> r2 = Avx.InsertVector128( | ||||||
| Unsafe.As<Vector4, Vector128<float>>(ref r.B).ToVector256(), | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. With a wider refactor, it's also possible to save the conversion here.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That'd certainly be easier if I was using pointers.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What I meant is inlining the
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, that's what I mean. It's difficult to inline at the moment because I need the 128bit offset when I'm aligning at 256bit.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm pushing the current state. |
||||||
| Unsafe.As<Vector4, Vector128<float>>(ref g.B), | ||||||
| 1); | ||||||
|
|
||||||
| Vector256<float> r3 = Avx.InsertVector128( | ||||||
| Unsafe.As<Vector4, Vector128<float>>(ref b.B).ToVector256(), | ||||||
| valpha, | ||||||
| 1); | ||||||
|
|
||||||
| Vector256<float> t0 = Avx.UnpackLow(r0, r1); | ||||||
| Vector256<float> t2 = Avx.UnpackHigh(r0, r1); | ||||||
|
|
||||||
| Unsafe.As<Vector4, Vector256<float>>(ref this.V0) = Avx2.PermuteVar8x32(t0, vcontrol); | ||||||
| Unsafe.As<Vector4, Vector256<float>>(ref this.V2) = Avx2.PermuteVar8x32(t2, vcontrol); | ||||||
|
|
||||||
| Vector256<float> t4 = Avx.UnpackLow(r2, r3); | ||||||
| Vector256<float> t6 = Avx.UnpackHigh(r2, r3); | ||||||
|
|
||||||
| Unsafe.As<Vector4, Vector256<float>>(ref this.V4) = Avx2.PermuteVar8x32(t4, vcontrol); | ||||||
| Unsafe.As<Vector4, Vector256<float>>(ref this.V6) = Avx2.PermuteVar8x32(t6, vcontrol); | ||||||
| } | ||||||
| else | ||||||
| #endif | ||||||
| { | ||||||
| this.V0.X = r.A.X; | ||||||
| this.V0.Y = g.A.X; | ||||||
| this.V0.Z = b.A.X; | ||||||
| this.V0.W = 1f; | ||||||
|
|
||||||
| this.V1.X = r.A.Y; | ||||||
| this.V1.Y = g.A.Y; | ||||||
| this.V1.Z = b.A.Y; | ||||||
| this.V1.W = 1f; | ||||||
|
|
||||||
| this.V2.X = r.A.Z; | ||||||
| this.V2.Y = g.A.Z; | ||||||
| this.V2.Z = b.A.Z; | ||||||
| this.V2.W = 1f; | ||||||
|
|
||||||
| this.V3.X = r.A.W; | ||||||
| this.V3.Y = g.A.W; | ||||||
| this.V3.Z = b.A.W; | ||||||
| this.V3.W = 1f; | ||||||
|
|
||||||
| this.V4.X = r.B.X; | ||||||
| this.V4.Y = g.B.X; | ||||||
| this.V4.Z = b.B.X; | ||||||
| this.V4.W = 1f; | ||||||
|
|
||||||
| this.V5.X = r.B.Y; | ||||||
| this.V5.Y = g.B.Y; | ||||||
| this.V5.Z = b.B.Y; | ||||||
| this.V5.W = 1f; | ||||||
|
|
||||||
| this.V6.X = r.B.Z; | ||||||
| this.V6.Y = g.B.Z; | ||||||
| this.V6.Z = b.B.Z; | ||||||
| this.V6.W = 1f; | ||||||
|
|
||||||
| this.V7.X = r.B.W; | ||||||
| this.V7.Y = g.B.W; | ||||||
| this.V7.Z = b.B.W; | ||||||
| this.V7.W = 1f; | ||||||
| } | ||||||
| } | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| // Copyright (c) Six Labors. | ||
| // Licensed under the Apache License, Version 2.0. | ||
|
|
||
| using System.Numerics; | ||
| using BenchmarkDotNet.Attributes; | ||
| using SixLabors.ImageSharp.Tuples; | ||
| using static SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters.JpegColorConverter; | ||
|
|
||
| namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg | ||
| { | ||
| [Config(typeof(Config.HwIntrinsics_SSE_AVX))] | ||
| public class Vector4OctetPack | ||
| { | ||
| private static Vector4Pair r = new Vector4Pair | ||
| { | ||
| A = new Vector4(1, 2, 3, 4), | ||
| B = new Vector4(5, 6, 7, 8) | ||
| }; | ||
|
|
||
| private static Vector4Pair g = new Vector4Pair | ||
| { | ||
| A = new Vector4(9, 10, 11, 12), | ||
| B = new Vector4(13, 14, 15, 16) | ||
| }; | ||
|
|
||
| private static Vector4Pair b = new Vector4Pair | ||
| { | ||
| A = new Vector4(17, 18, 19, 20), | ||
| B = new Vector4(21, 22, 23, 24) | ||
| }; | ||
|
|
||
| [Benchmark] | ||
| public void Pack() | ||
| { | ||
| Vector4Octet v = default; | ||
|
|
||
| v.Pack(ref r, ref g, ref b); | ||
| } | ||
| } | ||
| } |

Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
By having two versions of
Pack, (or inlining the HwIntrinsics AVX2 version) we can move these loads outside of the for loop callingPack.(Not necessarily a suggestion for this PR since it extends the scope quite a lot)