Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ internal static partial class SimdUtils
{
public static class HwIntrinsics
{
private static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };
public static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };

/// <summary>
/// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@
using System;
using System.Collections.Generic;
using System.Numerics;

using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.Tuples;

Expand Down Expand Up @@ -190,45 +195,90 @@ internal struct Vector4Octet
/// </summary>
public void Pack(ref Vector4Pair r, ref Vector4Pair g, ref Vector4Pair b)
{
this.V0.X = r.A.X;
this.V0.Y = g.A.X;
this.V0.Z = b.A.X;
this.V0.W = 1f;

this.V1.X = r.A.Y;
this.V1.Y = g.A.Y;
this.V1.Z = b.A.Y;
this.V1.W = 1f;

this.V2.X = r.A.Z;
this.V2.Y = g.A.Z;
this.V2.Z = b.A.Z;
this.V2.W = 1f;

this.V3.X = r.A.W;
this.V3.Y = g.A.W;
this.V3.Z = b.A.W;
this.V3.W = 1f;

this.V4.X = r.B.X;
this.V4.Y = g.B.X;
this.V4.Z = b.B.X;
this.V4.W = 1f;

this.V5.X = r.B.Y;
this.V5.Y = g.B.Y;
this.V5.Z = b.B.Y;
this.V5.W = 1f;

this.V6.X = r.B.Z;
this.V6.Y = g.B.Z;
this.V6.Z = b.B.Z;
this.V6.W = 1f;

this.V7.X = r.B.W;
this.V7.Y = g.B.W;
this.V7.Z = b.B.W;
this.V7.W = 1f;
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx2.IsSupported)
{
Vector4 vo = Vector4.One;
Vector128<float> valpha = Unsafe.As<Vector4, Vector128<float>>(ref vo);

ref byte control = ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskDeinterleave8x32);
Vector256<int> vcontrol = Unsafe.As<byte, Vector256<int>>(ref control);

@antonfirsov antonfirsov Oct 23, 2020

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By having two versions of Pack, (or inlining the HwIntrinsics AVX2 version) we can move these loads outside of the for loop calling Pack.

(Not necessarily a suggestion for this PR since it extends the scope quite a lot)


Vector256<float> r0 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref r.A).ToVector256(),

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I'm not getting it wrong, we can spare ToVector256 in lines dealing with r.A, g.A and b.A, since the upper 4 elements will be overwritten anyways:

Suggested change
Unsafe.As<Vector4, Vector128<float>>(ref r.A).ToVector256(),
Unsafe.As<Vector4, Vector256<float>>(ref r.A),

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Weirdly when I tried that I got access violations!

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure it was on .A stuff and not .B?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’ll double check. Was super surprised to see it as it didn’t make sense.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, crashes every time.

image

Unsafe.As<Vector4, Vector128<float>>(ref g.A),
1);

Vector256<float> r1 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref b.A).ToVector256(),
valpha,
1);

Vector256<float> r2 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref r.B).ToVector256(),

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With a wider refactor, it's also possible to save the conversion here.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That'd certainly be easier if I was using pointers.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I meant is inlining the Pack stuff into AVX2 color space conversion code. That would remove a bunch of unnecessary loads/stores.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that's what I mean. It's difficult to inline at the moment because I need the 128bit offset when I'm aligning at 256bit.
If I fixed the r, g, b inputs then I could simply load up the vector from the offset.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm pushing the current state.

Unsafe.As<Vector4, Vector128<float>>(ref g.B),
1);

Vector256<float> r3 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref b.B).ToVector256(),
valpha,
1);

Vector256<float> t0 = Avx.UnpackLow(r0, r1);
Vector256<float> t2 = Avx.UnpackHigh(r0, r1);

Unsafe.As<Vector4, Vector256<float>>(ref this.V0) = Avx2.PermuteVar8x32(t0, vcontrol);
Unsafe.As<Vector4, Vector256<float>>(ref this.V2) = Avx2.PermuteVar8x32(t2, vcontrol);

Vector256<float> t4 = Avx.UnpackLow(r2, r3);
Vector256<float> t6 = Avx.UnpackHigh(r2, r3);

Unsafe.As<Vector4, Vector256<float>>(ref this.V4) = Avx2.PermuteVar8x32(t4, vcontrol);
Unsafe.As<Vector4, Vector256<float>>(ref this.V6) = Avx2.PermuteVar8x32(t6, vcontrol);
}
else
#endif
{
this.V0.X = r.A.X;
this.V0.Y = g.A.X;
this.V0.Z = b.A.X;
this.V0.W = 1f;

this.V1.X = r.A.Y;
this.V1.Y = g.A.Y;
this.V1.Z = b.A.Y;
this.V1.W = 1f;

this.V2.X = r.A.Z;
this.V2.Y = g.A.Z;
this.V2.Z = b.A.Z;
this.V2.W = 1f;

this.V3.X = r.A.W;
this.V3.Y = g.A.W;
this.V3.Z = b.A.W;
this.V3.W = 1f;

this.V4.X = r.B.X;
this.V4.Y = g.B.X;
this.V4.Z = b.B.X;
this.V4.W = 1f;

this.V5.X = r.B.Y;
this.V5.Y = g.B.Y;
this.V5.Z = b.B.Y;
this.V5.W = 1f;

this.V6.X = r.B.Z;
this.V6.Y = g.B.Z;
this.V6.Z = b.B.Z;
this.V6.W = 1f;

this.V7.X = r.B.W;
this.V7.Y = g.B.W;
this.V7.Z = b.B.W;
this.V7.W = 1f;
}
}
}
}
Expand Down
40 changes: 40 additions & 0 deletions tests/ImageSharp.Benchmarks/Codecs/Jpeg/Vector4OctetPack.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

using System.Numerics;
using BenchmarkDotNet.Attributes;
using SixLabors.ImageSharp.Tuples;
using static SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters.JpegColorConverter;

namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
{
[Config(typeof(Config.HwIntrinsics_SSE_AVX))]
public class Vector4OctetPack
{
private static Vector4Pair r = new Vector4Pair
{
A = new Vector4(1, 2, 3, 4),
B = new Vector4(5, 6, 7, 8)
};

private static Vector4Pair g = new Vector4Pair
{
A = new Vector4(9, 10, 11, 12),
B = new Vector4(13, 14, 15, 16)
};

private static Vector4Pair b = new Vector4Pair
{
A = new Vector4(17, 18, 19, 20),
B = new Vector4(21, 22, 23, 24)
};

[Benchmark]
public void Pack()
{
Vector4Octet v = default;

v.Pack(ref r, ref g, ref b);
}
}
}
4 changes: 3 additions & 1 deletion tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,9 @@ public HwIntrinsics_SSE_AVX()
}
#endif
this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31)
.WithEnvironmentVariables(new EnvironmentVariable(EnableHWIntrinsic, Off))
.WithEnvironmentVariables(
new EnvironmentVariable(EnableHWIntrinsic, Off),
new EnvironmentVariable(FeatureSIMD, Off))
.WithId("No HwIntrinsics"));
}
}
Expand Down