Skip to content

Commit 49bd35c

Browse files
authored
Merge pull request #1804 from SixLabors/bp/selectsse2
Add sse2 version of select
2 parents d021222 + 143de22 commit 49bd35c

3 files changed

Lines changed: 90 additions & 23 deletions

File tree

src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs

Lines changed: 43 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -551,6 +551,7 @@ public static void PredictorInverseTransform(
551551
int mask = tileWidth - 1;
552552
int tilesPerRow = SubSampleSize(width, transform.Bits);
553553
int predictorModeIdxBase = (y >> transform.Bits) * tilesPerRow;
554+
Span<short> scratch = stackalloc short[8];
554555
while (y < yEnd)
555556
{
556557
int predictorModeIdx = predictorModeIdxBase;
@@ -608,7 +609,7 @@ public static void PredictorInverseTransform(
608609
PredictorAdd10(input + x, output + x - width, xEnd - x, output + x);
609610
break;
610611
case 11:
611-
PredictorAdd11(input + x, output + x - width, xEnd - x, output + x);
612+
PredictorAdd11(input + x, output + x - width, xEnd - x, output + x, scratch);
612613
break;
613614
case 12:
614615
PredictorAdd12(input + x, output + x - width, xEnd - x, output + x);
@@ -974,11 +975,11 @@ private static void PredictorAdd10(uint* input, uint* upper, int numberOfPixels,
974975
}
975976

976977
/// <summary>
/// Reconstructs <paramref name="numberOfPixels"/> pixels using predictor mode 11:
/// each output pixel is AddPixels(input[x], prediction), where the prediction comes from
/// <see cref="Predictor11"/> applied to the previously written output pixel and the row above.
/// </summary>
/// <param name="input">Pointer to the residual pixels to decode.</param>
/// <param name="upper">Pointer into the row above; Predictor11 reads upper[x] and upper[x - 1].</param>
/// <param name="numberOfPixels">Number of pixels to process.</param>
/// <param name="output">Destination pointer; output[-1] is read on the first iteration, so it must be valid.</param>
/// <param name="scratch">Scratch buffer forwarded to Predictor11 / Select (this commit allocates 8 shorts at the call sites).</param>
[MethodImpl(InliningOptions.ShortMethod)]
977-
private static void PredictorAdd11(uint* input, uint* upper, int numberOfPixels, uint* output)
978+
private static void PredictorAdd11(uint* input, uint* upper, int numberOfPixels, uint* output, Span<short> scratch)
978979
{
979980
for (int x = 0; x < numberOfPixels; x++)
980981
{
981-
uint pred = Predictor11(output[x - 1], upper + x);
982+
// The left neighbour is the pixel written by the previous iteration (output[-1] when x == 0).
uint pred = Predictor11(output[x - 1], upper + x, scratch);
982983
output[x] = AddPixels(input[x], pred);
983984
}
984985
}
@@ -1031,7 +1032,7 @@ private static void PredictorAdd13(uint* input, uint* upper, int numberOfPixels,
10311032
public static uint Predictor10(uint left, uint* top) => Average4(left, top[-1], top[0], top[1]);
10321033

10331034
/// <summary>
/// Predictor mode 11: forwards to Select with a = top[0] (pixel above), b = left and
/// c = top[-1] (pixel above-left). Select returns either a or b, so the prediction is
/// always the top pixel or the left pixel.
/// </summary>
/// <param name="left">The pixel to the left of the current position.</param>
/// <param name="top">Pointer to the pixel above; top[-1] is also read and must be valid.</param>
/// <param name="scratch">Scratch buffer handed to Select; its SSE2 path stores 128 bits (8 shorts) into it.</param>
/// <returns>The predicted pixel value.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
1034-
public static uint Predictor11(uint left, uint* top) => Select(top[0], left, top[-1]);
1035+
public static uint Predictor11(uint left, uint* top, Span<short> scratch) => Select(top[0], left, top[-1], scratch);
10351036

10361037
[MethodImpl(InliningOptions.ShortMethod)]
10371038
public static uint Predictor12(uint left, uint* top) => ClampedAddSubtractFull(left, top[0], top[-1]);
@@ -1148,11 +1149,11 @@ public static void PredictorSub10(uint* input, uint* upper, int numPixels, uint*
11481149
}
11491150

11501151
/// <summary>
/// Encoder-side counterpart of predictor mode 11: for each pixel, computes the Predictor11
/// prediction from the left input pixel and the row above, and writes
/// SubPixels(input[x], prediction) to <paramref name="output"/>.
/// </summary>
/// <param name="input">Pointer to the source pixels; input[-1] is read on the first iteration, so it must be valid.</param>
/// <param name="upper">Pointer into the row above; Predictor11 reads upper[x] and upper[x - 1].</param>
/// <param name="numPixels">Number of pixels to process.</param>
/// <param name="output">Destination pointer for the residuals.</param>
/// <param name="scratch">Scratch buffer forwarded to Predictor11 / Select (this commit allocates 8 shorts at the call sites).</param>
[MethodImpl(InliningOptions.ShortMethod)]
1151-
public static void PredictorSub11(uint* input, uint* upper, int numPixels, uint* output)
1152+
public static void PredictorSub11(uint* input, uint* upper, int numPixels, uint* output, Span<short> scratch)
11521153
{
11531154
for (int x = 0; x < numPixels; x++)
11541155
{
1155-
uint pred = Predictor11(input[x - 1], upper + x);
1156+
// Unlike PredictorAdd11, the left neighbour comes from the input row, not from the output.
uint pred = Predictor11(input[x - 1], upper + x, scratch);
11561157
output[x] = SubPixels(input[x], pred);
11571158
}
11581159
}
@@ -1240,14 +1241,43 @@ private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)
12401241
private static Vector128<int> MkCst16(int hi, int lo) => Vector128.Create((hi << 16) | (lo & 0xffff));
12411242
#endif
12421243

1243-
/// <summary>
/// Returns either <paramref name="a"/> or <paramref name="b"/>, decided by comparing their
/// per-channel distances to <paramref name="c"/>. In the SSE2 path the decision value is the
/// sum over the four byte channels of (|b - c| - |a - c|); a is returned when that sum is
/// less than or equal to zero, otherwise b.
/// </summary>
/// <param name="a">First candidate pixel (packed 4 x 8-bit channels).</param>
/// <param name="b">Second candidate pixel.</param>
/// <param name="c">Reference pixel both candidates are compared against.</param>
/// <param name="scratch">
/// Scratch buffer used only by the SSE2 path. A full 128-bit vector (8 shorts) is stored
/// through a raw pointer, so the span must hold at least 8 elements; nothing here checks that.
/// </param>
/// <returns>Either a or b, unmodified.</returns>
private static uint Select(uint a, uint b, uint c)
1244+
private static uint Select(uint a, uint b, uint c, Span<short> scratch)
12441245
{
1245-
int paMinusPb =
1246-
Sub3((int)(a >> 24), (int)(b >> 24), (int)(c >> 24)) +
1247-
Sub3((int)((a >> 16) & 0xff), (int)((b >> 16) & 0xff), (int)((c >> 16) & 0xff)) +
1248-
Sub3((int)((a >> 8) & 0xff), (int)((b >> 8) & 0xff), (int)((c >> 8) & 0xff)) +
1249-
Sub3((int)(a & 0xff), (int)(b & 0xff), (int)(c & 0xff));
1250-
return paMinusPb <= 0 ? a : b;
1246+
#if SUPPORTS_RUNTIME_INTRINSICS
1247+
if (Sse2.IsSupported)
1248+
{
1249+
Span<short> output = scratch;
1250+
// NOTE(review): the store below writes 16 bytes through this pointer without a bounds
// check; callers must pass a scratch span of at least 8 shorts.
fixed (short* p = output)
1251+
{
1252+
// Each pixel goes into the low 32 bits of a vector; the remaining bytes are zero.
Vector128<byte> a0 = Sse2.ConvertScalarToVector128UInt32(a).AsByte();
1253+
Vector128<byte> b0 = Sse2.ConvertScalarToVector128UInt32(b).AsByte();
1254+
Vector128<byte> c0 = Sse2.ConvertScalarToVector128UInt32(c).AsByte();
1255+
// Unsigned saturating subtraction clamps negative results to 0, so for every byte one of
// (x - c) / (c - x) is zero and OR-ing the pair yields the absolute difference |x - c|.
Vector128<byte> ac0 = Sse2.SubtractSaturate(a0, c0);
1256+
Vector128<byte> ca0 = Sse2.SubtractSaturate(c0, a0);
1257+
Vector128<byte> bc0 = Sse2.SubtractSaturate(b0, c0);
1258+
Vector128<byte> cb0 = Sse2.SubtractSaturate(c0, b0);
1259+
Vector128<byte> ac = Sse2.Or(ac0, ca0);
1260+
Vector128<byte> bc = Sse2.Or(bc0, cb0);
1261+
// Interleaving with zero widens the low 8 bytes to 16-bit lanes; lanes 0..3 carry the four channels.
Vector128<byte> pa = Sse2.UnpackLow(ac, Vector128<byte>.Zero); // |a - c|
1262+
Vector128<byte> pb = Sse2.UnpackLow(bc, Vector128<byte>.Zero); // |b - c|
1263+
// 16-bit lane-wise pb - pa; negative results wrap and are read back as signed shorts below.
Vector128<ushort> diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16());
1264+
Sse2.Store((ushort*)p, diff);
1265+
}
1266+
1267+
// Despite the name, this is the sum of (pb - pa) over the four channels (see diff above).
int paMinusPb = output[0] + output[1] + output[2] + output[3];
1268+
1269+
return (paMinusPb <= 0) ? a : b;
1270+
}
1271+
else
1272+
#endif
1273+
{
// Scalar fallback: one Sub3 term per 8-bit channel, highest byte first.
// Sub3 is defined outside this view; presumably it yields |b - c| - |a - c| per channel,
// matching the SSE2 path - confirm against its definition.
1274+
int paMinusPb =
1275+
Sub3((int)(a >> 24), (int)(b >> 24), (int)(c >> 24)) +
1276+
Sub3((int)((a >> 16) & 0xff), (int)((b >> 16) & 0xff), (int)((c >> 16) & 0xff)) +
1277+
Sub3((int)((a >> 8) & 0xff), (int)((b >> 8) & 0xff), (int)((c >> 8) & 0xff)) +
1278+
Sub3((int)(a & 0xff), (int)(b & 0xff), (int)(c & 0xff));
1279+
return paMinusPb <= 0 ? a : b;
1280+
}
12511281
}
12521282

12531283
[MethodImpl(InliningOptions.ShortMethod)]

src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ public static void ResidualImage(
5050
int tilesPerRow = LosslessUtils.SubSampleSize(width, bits);
5151
int tilesPerCol = LosslessUtils.SubSampleSize(height, bits);
5252
int maxQuantization = 1 << LosslessUtils.NearLosslessBits(nearLosslessQuality);
53+
Span<short> scratch = stackalloc short[8];
5354

5455
// TODO: Can we optimize this?
5556
int[][] histo = new int[4][];
@@ -84,7 +85,8 @@ public static void ResidualImage(
8485
transparentColorMode,
8586
usedSubtractGreen,
8687
nearLossless,
87-
image);
88+
image,
89+
scratch);
8890

8991
image[(tileY * tilesPerRow) + tileX] = (uint)(WebpConstants.ArgbBlack | (pred << 8));
9092
}
@@ -192,7 +194,8 @@ private static int GetBestPredictorForTile(
192194
WebpTransparentColorMode transparentColorMode,
193195
bool usedSubtractGreen,
194196
bool nearLossless,
195-
Span<uint> modes)
197+
Span<uint> modes,
198+
Span<short> scratch)
196199
{
197200
const int numPredModes = 14;
198201
int startX = tileX << bits;
@@ -272,7 +275,7 @@ private static int GetBestPredictorForTile(
272275
}
273276
}
274277

275-
GetResidual(width, height, upperRow, currentRow, maxDiffs, mode, startX, startX + maxX, y, maxQuantization, transparentColorMode, usedSubtractGreen, nearLossless, residuals);
278+
GetResidual(width, height, upperRow, currentRow, maxDiffs, mode, startX, startX + maxX, y, maxQuantization, transparentColorMode, usedSubtractGreen, nearLossless, residuals, scratch);
276279
for (int relativeX = 0; relativeX < maxX; ++relativeX)
277280
{
278281
UpdateHisto(histoArgb, residuals[relativeX]);
@@ -333,11 +336,12 @@ private static void GetResidual(
333336
WebpTransparentColorMode transparentColorMode,
334337
bool usedSubtractGreen,
335338
bool nearLossless,
336-
Span<uint> output)
339+
Span<uint> output,
340+
Span<short> scratch)
337341
{
338342
if (transparentColorMode == WebpTransparentColorMode.Preserve)
339343
{
340-
PredictBatch(mode, xStart, y, xEnd - xStart, currentRowSpan, upperRowSpan, output);
344+
PredictBatch(mode, xStart, y, xEnd - xStart, currentRowSpan, upperRowSpan, output, scratch);
341345
}
342346
else
343347
{
@@ -395,7 +399,7 @@ private static void GetResidual(
395399
predict = LosslessUtils.Predictor10(currentRow[x - 1], upperRow + x);
396400
break;
397401
case 11:
398-
predict = LosslessUtils.Predictor11(currentRow[x - 1], upperRow + x);
402+
predict = LosslessUtils.Predictor11(currentRow[x - 1], upperRow + x, scratch);
399403
break;
400404
case 12:
401405
predict = LosslessUtils.Predictor12(currentRow[x - 1], upperRow + x);
@@ -583,6 +587,7 @@ private static void CopyImageWithPrediction(
583587
Span<byte> currentMaxDiffs = MemoryMarshal.Cast<uint, byte>(currentRow.Slice(width + 1));
584588

585589
Span<byte> lowerMaxDiffs = currentMaxDiffs.Slice(width);
590+
Span<short> scratch = stackalloc short[8];
586591
for (int y = 0; y < height; y++)
587592
{
588593
Span<uint> tmp32 = upperRow;
@@ -593,7 +598,7 @@ private static void CopyImageWithPrediction(
593598

594599
if (lowEffort)
595600
{
596-
PredictBatch(PredLowEffort, 0, y, width, currentRow, upperRow, argb.Slice(y * width));
601+
PredictBatch(PredLowEffort, 0, y, width, currentRow, upperRow, argb.Slice(y * width), scratch);
597602
}
598603
else
599604
{
@@ -634,7 +639,8 @@ private static void CopyImageWithPrediction(
634639
transparentColorMode,
635640
usedSubtractGreen,
636641
nearLossless,
637-
argb.Slice((y * width) + x));
642+
argb.Slice((y * width) + x),
643+
scratch);
638644

639645
x = xEnd;
640646
}
@@ -649,7 +655,8 @@ private static void PredictBatch(
649655
int numPixels,
650656
Span<uint> currentSpan,
651657
Span<uint> upperSpan,
652-
Span<uint> outputSpan)
658+
Span<uint> outputSpan,
659+
Span<short> scratch)
653660
{
654661
#pragma warning disable SA1503 // Braces should not be omitted
655662
fixed (uint* current = currentSpan)
@@ -718,7 +725,7 @@ private static void PredictBatch(
718725
LosslessUtils.PredictorSub10(current + xStart, upper + xStart, numPixels, output);
719726
break;
720727
case 11:
721-
LosslessUtils.PredictorSub11(current + xStart, upper + xStart, numPixels, output);
728+
LosslessUtils.PredictorSub11(current + xStart, upper + xStart, numPixels, output, scratch);
722729
break;
723730
case 12:
724731
LosslessUtils.PredictorSub12(current + xStart, upper + xStart, numPixels, output);

tests/ImageSharp.Tests/Formats/WebP/LosslessUtilsTests.cs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,30 @@ private static void RunTransformColorInverseTest()
132132
Assert.Equal(expectedOutput, pixelData);
133133
}
134134

135+
// Shared test body for Predictor11: calls it with fixed pixel values and checks the result.
// It is invoked directly and through FeatureTestRunner with different intrinsics settings,
// so the same expectation is checked with and without SSE2.
private static void RunPredictor11Test()
136+
{
137+
// arrange
138+
// Two identical pixels: they serve as top[-1] and top[0] once the pointer is pinned at index 1.
uint[] topData = { 4278258949, 4278258949 };
139+
uint left = 4294839812;
140+
// 8 shorts: the SSE2 path of Select stores a full 128-bit vector into the scratch buffer.
short[] scratch = new short[8];
141+
// Same value as left: top[0] equals top[-1], so the test expects the left pixel to be selected.
uint expectedResult = 4294839812;
142+
143+
// act
144+
unsafe
145+
{
146+
// Pin at element 1 so that the top[-1] read inside Predictor11 stays within the array.
fixed (uint* top = &topData[1])
147+
{
148+
uint actual = LosslessUtils.Predictor11(left, top, scratch);
149+
150+
// assert
151+
Assert.Equal(expectedResult, actual);
152+
}
153+
}
154+
}
155+
156+
[Fact]
157+
public void Predictor11_Works() => RunPredictor11Test();
158+
135159
[Fact]
136160
public void SubtractGreen_Works() => RunSubtractGreenTest();
137161

@@ -145,6 +169,12 @@ private static void RunTransformColorInverseTest()
145169
public void TransformColorInverse_Works() => RunTransformColorInverseTest();
146170

147171
#if SUPPORTS_RUNTIME_INTRINSICS
172+
[Fact]
173+
public void Predictor11_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.AllowAll);
174+
175+
[Fact]
176+
public void Predictor11_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunPredictor11Test, HwIntrinsics.DisableSSE2);
177+
148178
[Fact]
149179
public void SubtractGreen_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunSubtractGreenTest, HwIntrinsics.AllowAll);
150180

0 commit comments

Comments
 (0)