6060 static void defilter_avg4 (size_t rowbytes , unsigned char * row , const unsigned char * prev );
6161 static void defilter_paeth3 (size_t rowbytes , unsigned char * row , const unsigned char * prev );
6262 static void defilter_paeth4 (size_t rowbytes , unsigned char * row , const unsigned char * prev );
63+
64+ #if defined(SPNG_ARM )
65+ static uint32_t expand_palette_rgba8_neon (unsigned char * row , const unsigned char * scanline , const unsigned char * plte , uint32_t width );
66+ static uint32_t expand_palette_rgb8_neon (unsigned char * row , const unsigned char * scanline , const unsigned char * plte , uint32_t width );
67+ #endif
6368 #endif
6469#endif
6570
@@ -174,6 +179,14 @@ struct spng__iter
174179 const unsigned char * samples ;
175180};
176181
182+ union spng__decode_plte /* Decode-time palette storage; every member overlays the same bytes, so only one layout is valid at a time. */
183+ {
184+ struct spng_plte_entry rgba [256 ]; /* One struct entry per palette index (red/green/blue/alpha fields); the default layout filled by the palette pre-processing loop. */
185+ unsigned char rgb [256 * 3 ]; /* Packed layout, 3 bytes per index, written as rgb[i*3 + 0..2] on the ARM RGB8 8-bit path. */
186+ unsigned char raw [256 * 4 ]; /* Untyped byte view; this is what expand_row() hands to the NEON expanders and indexes in its packed-RGB scalar tail. */
187+ uint32_t align_this ; /* Never accessed in the visible code; presumably present to force 32-bit alignment, since the RGBA8 NEON expander reads raw through a uint32_t pointer - confirm. */
188+ };
189+
177190typedef void spng__undo (spng_ctx * ctx );
178191
179192struct spng_ctx
@@ -286,7 +299,7 @@ struct spng_ctx
286299 uint16_t * gamma_lut16 ;
287300 uint16_t gamma_lut8 [256 ];
288301 unsigned char trns_px [8 ];
289- struct spng_plte_entry decode_plte [ 256 ] ;
302+ union spng__decode_plte decode_plte ;
290303 struct spng_sbit decode_sb ;
291304 struct decode_flags decode_flags ;
292305 struct spng_row_info row_info ;
@@ -1294,14 +1307,38 @@ static inline void scale_row(unsigned char *row, uint32_t pixels, int fmt, unsig
12941307}
12951308
12961309/* Expand to *row using 8-bit palette indices from *scanline */
1297- void expand_row (unsigned char * row , const unsigned char * scanline , const struct spng_plte_entry * plte , uint32_t width , int fmt )
1310+ void expand_row (unsigned char * row ,
1311+ const unsigned char * scanline ,
1312+ const union spng__decode_plte * decode_plte ,
1313+ uint32_t width ,
1314+ int fmt )
12981315{
1299- uint32_t i ;
1316+ uint32_t i = 0 ;
13001317 unsigned char * px ;
13011318 unsigned char entry ;
1319+ const struct spng_plte_entry * plte = decode_plte -> rgba ;
1320+
1321+ #if defined(SPNG_ARM )
1322+ if (fmt == SPNG_FMT_RGBA8 ) i = expand_palette_rgba8_neon (row , scanline , decode_plte -> raw , width );
1323+ else if (fmt == SPNG_FMT_RGB8 )
1324+ {
1325+ i = expand_palette_rgb8_neon (row , scanline , decode_plte -> raw , width );
1326+
1327+ for (; i < width ; i ++ )
1328+ {/* In this case the LUT is 3 bytes packed */
1329+ px = row + i * 3 ;
1330+ entry = scanline [i ];
1331+ px [0 ] = decode_plte -> raw [entry * 3 + 0 ];
1332+ px [1 ] = decode_plte -> raw [entry * 3 + 1 ];
1333+ px [2 ] = decode_plte -> raw [entry * 3 + 2 ];
1334+ }
1335+ return ;
1336+ }
1337+ #endif
1338+
13021339 if (fmt == SPNG_FMT_RGBA8 )
13031340 {
1304- for (i = 0 ; i < width ; i ++ )
1341+ for (; i < width ; i ++ )
13051342 {
13061343 px = row + i * 4 ;
13071344 entry = scanline [i ];
@@ -1313,7 +1350,7 @@ void expand_row(unsigned char *row, const unsigned char *scanline, const struct
13131350 }
13141351 else if (fmt == SPNG_FMT_RGB8 )
13151352 {
1316- for (i = 0 ; i < width ; i ++ )
1353+ for (; i < width ; i ++ )
13171354 {
13181355 px = row + i * 3 ;
13191356 entry = scanline [i ];
@@ -2627,7 +2664,7 @@ int spng_decode_scanline(spng_ctx *ctx, void *out, size_t len)
26272664 const uint16_t * gamma_lut = ctx -> gamma_lut ;
26282665 unsigned char * trns_px = ctx -> trns_px ;
26292666 const struct spng_sbit * sb = & ctx -> decode_sb ;
2630- const struct spng_plte_entry * plte = ctx -> decode_plte ;
2667+ const struct spng_plte_entry * plte = ctx -> decode_plte . rgba ;
26312668 struct spng__iter iter = (ihdr -> bit_depth < 16 ) ? spng__iter_init (ihdr -> bit_depth , ctx -> scanline ) : (struct spng__iter ){0 };
26322669
26332670 const unsigned char * scanline ;
@@ -2711,7 +2748,7 @@ int spng_decode_scanline(spng_ctx *ctx, void *out, size_t len)
27112748 {
27122749 if (fmt & (SPNG_FMT_RGBA8 | SPNG_FMT_RGB8 ))
27132750 {
2714- expand_row (out , scanline , plte , width , fmt );
2751+ expand_row (out , scanline , & ctx -> decode_plte , width , fmt );
27152752 break ;
27162753 }
27172754
@@ -3182,11 +3219,13 @@ int spng_decode_image(spng_ctx *ctx, void *out, size_t len, int fmt, int flags)
31823219 sb -> alpha_bits == processing_depth &&
31833220 processing_depth == depth_target ) f .do_scaling = 0 ;
31843221
3185- struct spng_plte_entry * plte = ctx -> decode_plte ;
3222+ struct spng_plte_entry * plte = ctx -> decode_plte . rgba ;
31863223
31873224 /* Pre-process palette entries */
31883225 if (f .indexed )
31893226 {
3227+ uint8_t red , green , blue , alpha ;
3228+
31903229 uint32_t i ;
31913230 for (i = 0 ; i < 256 ; i ++ )
31923231 {
@@ -3195,10 +3234,24 @@ int spng_decode_image(spng_ctx *ctx, void *out, size_t len, int fmt, int flags)
31953234 else
31963235 ctx -> plte .entries [i ].alpha = 255 ;
31973236
3198- plte [i ].red = sample_to_target (ctx -> plte .entries [i ].red , 8 , sb -> red_bits , 8 );
3199- plte [i ].green = sample_to_target (ctx -> plte .entries [i ].green , 8 , sb -> green_bits , 8 );
3200- plte [i ].blue = sample_to_target (ctx -> plte .entries [i ].blue , 8 , sb -> blue_bits , 8 );
3201- plte [i ].alpha = sample_to_target (ctx -> plte .entries [i ].alpha , 8 , sb -> alpha_bits , 8 );
3237+ red = sample_to_target (ctx -> plte .entries [i ].red , 8 , sb -> red_bits , 8 );
3238+ green = sample_to_target (ctx -> plte .entries [i ].green , 8 , sb -> green_bits , 8 );
3239+ blue = sample_to_target (ctx -> plte .entries [i ].blue , 8 , sb -> blue_bits , 8 );
3240+ alpha = sample_to_target (ctx -> plte .entries [i ].alpha , 8 , sb -> alpha_bits , 8 );
3241+
3242+ #if defined(SPNG_ARM )
3243+ if (fmt == SPNG_FMT_RGB8 && ihdr -> bit_depth == 8 )
3244+ {/* Working with 3 bytes at a time is more of an ARM thing */
3245+ ctx -> decode_plte .rgb [i * 3 + 0 ] = red ;
3246+ ctx -> decode_plte .rgb [i * 3 + 1 ] = green ;
3247+ ctx -> decode_plte .rgb [i * 3 + 2 ] = blue ;
3248+ continue ;
3249+ }
3250+ #endif
3251+ plte [i ].red = red ;
3252+ plte [i ].green = green ;
3253+ plte [i ].blue = blue ;
3254+ plte [i ].alpha = alpha ;
32023255 }
32033256
32043257 f .apply_trns = 0 ;
@@ -5043,4 +5096,74 @@ static void defilter_paeth4(size_t rowbytes, unsigned char *row, const unsigned
50435096 }
50445097}
50455098
5099+ /* NEON optimised palette expansion functions
5100+ * Derived from palette_neon_intrinsics.c
5101+ *
5102+ * Copyright (c) 2018-2019 Cosmin Truta
5103+ * Copyright (c) 2017-2018 Arm Holdings. All rights reserved.
5104+ * Written by Richard Townsend <Richard.Townsend@arm.com>, February 2017.
5105+ *
5106+ * This code is derived from libpng source code.
5107+ * For conditions of distribution and use, see the disclaimer
5108+ * and license in this file.
5109+ *
5110+ * Related: https://developer.arm.com/documentation/101964/latest/Color-palette-expansion
5111+ *
5112+ * The functions were refactored to iterate forward.
5113+ *
5114+ */
5115+
5116+ /* Expands a palettized row into RGBA8. */
5117+ static uint32_t expand_palette_rgba8_neon (unsigned char * row , const unsigned char * scanline , const unsigned char * plte , uint32_t width )
5118+ {
5119+ const uint32_t stride = 4 ;
5120+ const uint32_t * palette = (const uint32_t * )plte ;
5121+
5122+ if (width < stride ) return 0 ;
5123+
5124+ uint32_t i ;
5125+ for (i = 0 ; i < width ; i += stride , scanline += stride , row += stride * 4 )
5126+ {
5127+ uint32x4_t cur ;
5128+ cur = vld1q_dup_u32 (palette + scanline [0 ]);
5129+ cur = vld1q_lane_u32 (palette + scanline [1 ], cur , 1 );
5130+ cur = vld1q_lane_u32 (palette + scanline [2 ], cur , 2 );
5131+ cur = vld1q_lane_u32 (palette + scanline [3 ], cur , 3 );
5132+ vst1q_u32 ((void * )row , cur );
5133+ }
5134+
5135+ /* Remove the amount that wasn't processed. */
5136+ if (i != width ) i -= stride ;
5137+
5138+ return i ;
5139+ }
5140+
5141+ /* Expands a palettized row into RGB8. */
5142+ static uint32_t expand_palette_rgb8_neon (unsigned char * row , const unsigned char * scanline , const unsigned char * plte , uint32_t width )
5143+ {
5144+ const uint32_t stride = 8 ;
5145+
5146+ if (width <= stride ) return 0 ;
5147+
5148+ uint32_t i ;
5149+ for (i = 0 ; i < width ; i += stride , scanline += stride , row += stride * 3 )
5150+ {
5151+ uint8x8x3_t cur ;
5152+ cur = vld3_dup_u8 (plte + 3 * scanline [0 ]);
5153+ cur = vld3_lane_u8 (plte + 3 * scanline [1 ], cur , 1 );
5154+ cur = vld3_lane_u8 (plte + 3 * scanline [2 ], cur , 2 );
5155+ cur = vld3_lane_u8 (plte + 3 * scanline [3 ], cur , 3 );
5156+ cur = vld3_lane_u8 (plte + 3 * scanline [4 ], cur , 4 );
5157+ cur = vld3_lane_u8 (plte + 3 * scanline [5 ], cur , 5 );
5158+ cur = vld3_lane_u8 (plte + 3 * scanline [6 ], cur , 6 );
5159+ cur = vld3_lane_u8 (plte + 3 * scanline [7 ], cur , 7 );
5160+ vst3_u8 ((void * )row , cur );
5161+ }
5162+
5163+ /* Remove the amount that wasn't processed. */
5164+ if (i != width ) i -= stride ;
5165+
5166+ return i ;
5167+ }
5168+
50465169#endif /* SPNG_ARM */
0 commit comments