@@ -551,6 +551,7 @@ public static void PredictorInverseTransform(
551551 int mask = tileWidth - 1 ;
552552 int tilesPerRow = SubSampleSize ( width , transform . Bits ) ;
553553 int predictorModeIdxBase = ( y >> transform . Bits ) * tilesPerRow ;
554+ Span < short > scratch = stackalloc short [ 8 ] ;
554555 while ( y < yEnd )
555556 {
556557 int predictorModeIdx = predictorModeIdxBase ;
@@ -608,7 +609,7 @@ public static void PredictorInverseTransform(
608609 PredictorAdd10 ( input + x , output + x - width , xEnd - x , output + x ) ;
609610 break ;
610611 case 11 :
611- PredictorAdd11 ( input + x , output + x - width , xEnd - x , output + x ) ;
612+ PredictorAdd11 ( input + x , output + x - width , xEnd - x , output + x , scratch ) ;
612613 break ;
613614 case 12 :
614615 PredictorAdd12 ( input + x , output + x - width , xEnd - x , output + x ) ;
@@ -974,11 +975,11 @@ private static void PredictorAdd10(uint* input, uint* upper, int numberOfPixels,
974975 }
975976
// PredictorAdd11: for each of numberOfPixels pixels, computes a prediction with Predictor11 and
// stores AddPixels(input[x], pred) into output[x].
// The prediction reads output[x - 1] (the pixel written by the previous iteration; for x == 0 this
// is output[-1], so the caller must pass a pointer with a valid element in front of it) and
// upper[x] / upper[x - 1] via the `upper + x` pointer.
// This diff adds the `scratch` parameter, which is only forwarded to Predictor11.
976977 [ MethodImpl ( InliningOptions . ShortMethod ) ]
977- private static void PredictorAdd11 ( uint * input , uint * upper , int numberOfPixels , uint * output )
978+ private static void PredictorAdd11 ( uint * input , uint * upper , int numberOfPixels , uint * output , Span < short > scratch )
978979 {
979980 for ( int x = 0 ; x < numberOfPixels ; x ++ )
980981 {
// Each iteration depends on the output of the previous one, so the loop order matters.
981- uint pred = Predictor11 ( output [ x - 1 ] , upper + x ) ;
982+ uint pred = Predictor11 ( output [ x - 1 ] , upper + x , scratch ) ;
982983 output [ x ] = AddPixels ( input [ x ] , pred ) ;
983984 }
984985 }
@@ -1031,7 +1032,7 @@ private static void PredictorAdd13(uint* input, uint* upper, int numberOfPixels,
// Predictor10: returns Average4(left, top[-1], top[0], top[1]).
// Reads one element before and one element after `top`, so both neighbours must be addressable.
10311032 public static uint Predictor10 ( uint left , uint * top ) => Average4 ( left , top [ - 1 ] , top [ 0 ] , top [ 1 ] ) ;
10321033
// Predictor11: returns Select(top[0], left, top[-1], scratch), i.e. it picks between top[0] and
// `left` using top[-1] as the reference value (see Select for the decision rule).
// Reads top[-1], so the element in front of `top` must be addressable.
// `scratch` is not used here; it is passed straight through to Select.
// NOTE(review): this method is public and the diff adds a required parameter, so any caller
// outside this view must be updated as well - confirm.
10331034 [ MethodImpl ( InliningOptions . ShortMethod ) ]
1034- public static uint Predictor11 ( uint left , uint * top ) => Select ( top [ 0 ] , left , top [ - 1 ] ) ;
1035+ public static uint Predictor11 ( uint left , uint * top , Span < short > scratch ) => Select ( top [ 0 ] , left , top [ - 1 ] , scratch ) ;
10351036
// Predictor12: returns ClampedAddSubtractFull(left, top[0], top[-1]).
// Reads top[-1], so the element in front of `top` must be addressable.
10361037 [ MethodImpl ( InliningOptions . ShortMethod ) ]
10371038 public static uint Predictor12 ( uint left , uint * top ) => ClampedAddSubtractFull ( left , top [ 0 ] , top [ - 1 ] ) ;
@@ -1148,11 +1149,11 @@ public static void PredictorSub10(uint* input, uint* upper, int numPixels, uint*
11481149 }
11491150
// PredictorSub11: for each of numPixels pixels, computes a prediction with Predictor11 and stores
// SubPixels(input[x], pred) into output[x].
// The prediction is taken from the input row itself (input[x - 1]; for x == 0 this is input[-1],
// so the caller must pass a pointer with a valid element in front of it) and from upper[x] /
// upper[x - 1] via the `upper + x` pointer. Nothing is read back from `output`.
// This diff adds the `scratch` parameter, which is only forwarded to Predictor11.
// NOTE(review): this method is public and the diff adds a required parameter, so any caller
// outside this view must be updated as well - confirm.
11501151 [ MethodImpl ( InliningOptions . ShortMethod ) ]
1151- public static void PredictorSub11 ( uint * input , uint * upper , int numPixels , uint * output )
1152+ public static void PredictorSub11 ( uint * input , uint * upper , int numPixels , uint * output , Span < short > scratch )
11521153 {
11531154 for ( int x = 0 ; x < numPixels ; x ++ )
11541155 {
1155- uint pred = Predictor11 ( input [ x - 1 ] , upper + x ) ;
1156+ uint pred = Predictor11 ( input [ x - 1 ] , upper + x , scratch ) ;
11561157 output [ x ] = SubPixels ( input [ x ] , pred ) ;
11571158 }
11581159 }
@@ -1240,14 +1241,43 @@ private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)
// MkCst16: packs `hi` into the upper 16 bits and the low 16 bits of `lo` into the lower 16 bits
// of one int, then passes that value to Vector128.Create.
12401241 private static Vector128 < int > MkCst16 ( int hi , int lo ) => Vector128 . Create ( ( hi << 16 ) | ( lo & 0xffff ) ) ;
12411242#endif
12421243
// Select: returns either `a` or `b`, using `c` as the reference pixel.
// On the SSE2 path the four 8-bit channels of the packed 32-bit values are compared: the method
// sums (|b - c| - |a - c|) over the channels and returns `a` when that sum is <= 0, else `b`.
// The scalar path builds the same kind of sum from four Sub3 calls, one per channel; Sub3 is
// defined outside this view - presumably it yields the same per-channel term, confirm.
// `scratch` is only used by the SSE2 path as a temporary store target; it must hold at least
// 8 shorts (16 bytes) because a whole Vector128 is written into it. Its length is not checked here.
1243- private static uint Select ( uint a , uint b , uint c )
1244+ private static uint Select ( uint a , uint b , uint c , Span < short > scratch )
12441245 {
// The five removed lines below are the old scalar body; the diff re-adds them unchanged
// inside the else branch further down.
1245- int paMinusPb =
1246- Sub3 ( ( int ) ( a >> 24 ) , ( int ) ( b >> 24 ) , ( int ) ( c >> 24 ) ) +
1247- Sub3 ( ( int ) ( ( a >> 16 ) & 0xff ) , ( int ) ( ( b >> 16 ) & 0xff ) , ( int ) ( ( c >> 16 ) & 0xff ) ) +
1248- Sub3 ( ( int ) ( ( a >> 8 ) & 0xff ) , ( int ) ( ( b >> 8 ) & 0xff ) , ( int ) ( ( c >> 8 ) & 0xff ) ) +
1249- Sub3 ( ( int ) ( a & 0xff ) , ( int ) ( b & 0xff ) , ( int ) ( c & 0xff ) ) ;
1250- return paMinusPb <= 0 ? a : b ;
1246+ #if SUPPORTS_RUNTIME_INTRINSICS
1247+ if ( Sse2 . IsSupported )
1248+ {
// `output` is just another name for `scratch`; no copy is made.
1249+ Span < short > output = scratch ;
1250+ fixed ( short * p = output )
1251+ {
// Each pixel goes into the low 32 bits of a vector; the remaining bytes are zero.
1252+ Vector128 < byte > a0 = Sse2 . ConvertScalarToVector128UInt32 ( a ) . AsByte ( ) ;
1253+ Vector128 < byte > b0 = Sse2 . ConvertScalarToVector128UInt32 ( b ) . AsByte ( ) ;
1254+ Vector128 < byte > c0 = Sse2 . ConvertScalarToVector128UInt32 ( c ) . AsByte ( ) ;
// Unsigned saturating subtraction clamps negative results to 0, so for every byte one of
// (x - c) and (c - x) is 0 and the other is |x - c|; OR-ing the pair gives the absolute difference.
1255+ Vector128 < byte > ac0 = Sse2 . SubtractSaturate ( a0 , c0 ) ;
1256+ Vector128 < byte > ca0 = Sse2 . SubtractSaturate ( c0 , a0 ) ;
1257+ Vector128 < byte > bc0 = Sse2 . SubtractSaturate ( b0 , c0 ) ;
1258+ Vector128 < byte > cb0 = Sse2 . SubtractSaturate ( c0 , b0 ) ;
1259+ Vector128 < byte > ac = Sse2 . Or ( ac0 , ca0 ) ;
1260+ Vector128 < byte > bc = Sse2 . Or ( bc0 , cb0 ) ;
// Interleaving with zero bytes widens each 8-bit difference to a 16-bit lane.
// Lanes 0..3 carry the four channels; lanes 4..7 come from the zero bytes and stay 0.
1261+ Vector128 < byte > pa = Sse2 . UnpackLow ( ac , Vector128 < byte > . Zero ) ; // |a - c|
1262+ Vector128 < byte > pb = Sse2 . UnpackLow ( bc , Vector128 < byte > . Zero ) ; // |b - c|
// 16-bit wrap-around subtraction: a lane where |b - c| < |a - c| holds the two's complement
// of the negative difference.
1263+ Vector128 < ushort > diff = Sse2 . Subtract ( pb . AsUInt16 ( ) , pa . AsUInt16 ( ) ) ;
// Writes all 8 lanes (16 bytes) through the pinned pointer. The data is stored as ushort but
// read back below through the Span<short>, which is what makes negative lanes come out negative.
1264+ Sse2 . Store ( ( ushort * ) p , diff ) ;
1265+ }
1266+
// Only the first four lanes are meaningful (one per channel).
// NOTE(review): despite its name, this local holds the sum of (pb - pa), i.e.
// sum(|b - c| - |a - c|), as computed by the Subtract(pb, pa) call above.
1267+ int paMinusPb = output [ 0 ] + output [ 1 ] + output [ 2 ] + output [ 3 ] ;
1268+
1269+ return ( paMinusPb <= 0 ) ? a : b ;
1270+ }
1271+ else
1272+ #endif
1273+ {
// Scalar fallback: taken when SUPPORTS_RUNTIME_INTRINSICS is not defined or SSE2 is not
// supported. One Sub3 call per 8-bit channel, from the top byte (>> 24) down to the lowest byte.
1274+ int paMinusPb =
1275+ Sub3 ( ( int ) ( a >> 24 ) , ( int ) ( b >> 24 ) , ( int ) ( c >> 24 ) ) +
1276+ Sub3 ( ( int ) ( ( a >> 16 ) & 0xff ) , ( int ) ( ( b >> 16 ) & 0xff ) , ( int ) ( ( c >> 16 ) & 0xff ) ) +
1277+ Sub3 ( ( int ) ( ( a >> 8 ) & 0xff ) , ( int ) ( ( b >> 8 ) & 0xff ) , ( int ) ( ( c >> 8 ) & 0xff ) ) +
1278+ Sub3 ( ( int ) ( a & 0xff ) , ( int ) ( b & 0xff ) , ( int ) ( c & 0xff ) ) ;
1279+ return paMinusPb <= 0 ? a : b ;
1280+ }
12511281 }
12521282
12531283 [ MethodImpl ( InliningOptions . ShortMethod ) ]
0 commit comments