Skip to content

Commit dacbb7a

Browse files
committed
adapt NEON filter optimizations
1 parent 1f653cd commit dacbb7a

1 file changed

Lines changed: 135 additions & 12 deletions

File tree

spng/spng.c

Lines changed: 135 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,11 @@
6060
static void defilter_avg4(size_t rowbytes, unsigned char *row, const unsigned char *prev);
6161
static void defilter_paeth3(size_t rowbytes, unsigned char *row, const unsigned char *prev);
6262
static void defilter_paeth4(size_t rowbytes, unsigned char *row, const unsigned char *prev);
63+
64+
#if defined(SPNG_ARM)
65+
static uint32_t expand_palette_rgba8_neon(unsigned char *row, const unsigned char *scanline, const unsigned char *plte, uint32_t width);
66+
static uint32_t expand_palette_rgb8_neon(unsigned char *row, const unsigned char *scanline, const unsigned char *plte, uint32_t width);
67+
#endif
6368
#endif
6469
#endif
6570

@@ -174,6 +179,14 @@ struct spng__iter
174179
const unsigned char *samples;
175180
};
176181

182+
/* Decode-time palette table.
 * All members overlay the same storage, so the one table can be addressed as
 * RGBA entries, as packed RGB triplets, or as plain bytes. */
union spng__decode_plte
183+
{
184+
/* One struct spng_plte_entry per palette index; the layout used by the scalar expand loops. */
struct spng_plte_entry rgba[256];
185+
/* Packed 3-byte entries (index * 3 + channel); filled instead of rgba[] on SPNG_ARM
 * builds when the output format is SPNG_FMT_RGB8 and the image bit depth is 8. */
unsigned char rgb[256 * 3];
186+
/* Byte view of the table, this is what gets passed to the NEON expansion routines. */
unsigned char raw[256 * 4];
187+
/* Not read anywhere in the visible code; presumably exists only to give the union
 * uint32_t alignment, since the RGBA8 NEON routine loads raw[] through a uint32_t pointer. */
uint32_t align_this;
188+
};
189+
177190
typedef void spng__undo(spng_ctx *ctx);
178191

179192
struct spng_ctx
@@ -286,7 +299,7 @@ struct spng_ctx
286299
uint16_t *gamma_lut16;
287300
uint16_t gamma_lut8[256];
288301
unsigned char trns_px[8];
289-
struct spng_plte_entry decode_plte[256];
302+
union spng__decode_plte decode_plte;
290303
struct spng_sbit decode_sb;
291304
struct decode_flags decode_flags;
292305
struct spng_row_info row_info;
@@ -1294,14 +1307,38 @@ static inline void scale_row(unsigned char *row, uint32_t pixels, int fmt, unsig
12941307
}
12951308

12961309
/* Expand to *row using 8-bit palette indices from *scanline */
1297-
void expand_row(unsigned char *row, const unsigned char *scanline, const struct spng_plte_entry *plte, uint32_t width, int fmt)
1310+
void expand_row(unsigned char *row,
1311+
const unsigned char *scanline,
1312+
const union spng__decode_plte *decode_plte,
1313+
uint32_t width,
1314+
int fmt)
12981315
{
1299-
uint32_t i;
1316+
uint32_t i = 0;
13001317
unsigned char *px;
13011318
unsigned char entry;
1319+
const struct spng_plte_entry *plte = decode_plte->rgba;
1320+
1321+
#if defined(SPNG_ARM)
1322+
if(fmt == SPNG_FMT_RGBA8) i = expand_palette_rgba8_neon(row, scanline, decode_plte->raw, width);
1323+
else if(fmt == SPNG_FMT_RGB8)
1324+
{
1325+
i = expand_palette_rgb8_neon(row, scanline, decode_plte->raw, width);
1326+
1327+
for(; i < width; i++)
1328+
{/* In this case the LUT is 3 bytes packed */
1329+
px = row + i * 3;
1330+
entry = scanline[i];
1331+
px[0] = decode_plte->raw[entry * 3 + 0];
1332+
px[1] = decode_plte->raw[entry * 3 + 1];
1333+
px[2] = decode_plte->raw[entry * 3 + 2];
1334+
}
1335+
return;
1336+
}
1337+
#endif
1338+
13021339
if(fmt == SPNG_FMT_RGBA8)
13031340
{
1304-
for(i=0; i < width; i++)
1341+
for(; i < width; i++)
13051342
{
13061343
px = row + i * 4;
13071344
entry = scanline[i];
@@ -1313,7 +1350,7 @@ void expand_row(unsigned char *row, const unsigned char *scanline, const struct
13131350
}
13141351
else if(fmt == SPNG_FMT_RGB8)
13151352
{
1316-
for(i=0; i < width; i++)
1353+
for(; i < width; i++)
13171354
{
13181355
px = row + i * 3;
13191356
entry = scanline[i];
@@ -2627,7 +2664,7 @@ int spng_decode_scanline(spng_ctx *ctx, void *out, size_t len)
26272664
const uint16_t *gamma_lut = ctx->gamma_lut;
26282665
unsigned char *trns_px = ctx->trns_px;
26292666
const struct spng_sbit *sb = &ctx->decode_sb;
2630-
const struct spng_plte_entry *plte = ctx->decode_plte;
2667+
const struct spng_plte_entry *plte = ctx->decode_plte.rgba;
26312668
struct spng__iter iter = (ihdr->bit_depth < 16) ? spng__iter_init(ihdr->bit_depth, ctx->scanline) : (struct spng__iter){0};
26322669

26332670
const unsigned char *scanline;
@@ -2711,7 +2748,7 @@ int spng_decode_scanline(spng_ctx *ctx, void *out, size_t len)
27112748
{
27122749
if(fmt & (SPNG_FMT_RGBA8 | SPNG_FMT_RGB8))
27132750
{
2714-
expand_row(out, scanline, plte, width, fmt);
2751+
expand_row(out, scanline, &ctx->decode_plte, width, fmt);
27152752
break;
27162753
}
27172754

@@ -3182,11 +3219,13 @@ int spng_decode_image(spng_ctx *ctx, void *out, size_t len, int fmt, int flags)
31823219
sb->alpha_bits == processing_depth &&
31833220
processing_depth == depth_target) f.do_scaling = 0;
31843221

3185-
struct spng_plte_entry *plte = ctx->decode_plte;
3222+
struct spng_plte_entry *plte = ctx->decode_plte.rgba;
31863223

31873224
/* Pre-process palette entries */
31883225
if(f.indexed)
31893226
{
3227+
uint8_t red, green, blue, alpha;
3228+
31903229
uint32_t i;
31913230
for(i=0; i < 256; i++)
31923231
{
@@ -3195,10 +3234,24 @@ int spng_decode_image(spng_ctx *ctx, void *out, size_t len, int fmt, int flags)
31953234
else
31963235
ctx->plte.entries[i].alpha = 255;
31973236

3198-
plte[i].red = sample_to_target(ctx->plte.entries[i].red, 8, sb->red_bits, 8);
3199-
plte[i].green = sample_to_target(ctx->plte.entries[i].green, 8, sb->green_bits, 8);
3200-
plte[i].blue = sample_to_target(ctx->plte.entries[i].blue, 8, sb->blue_bits, 8);
3201-
plte[i].alpha = sample_to_target(ctx->plte.entries[i].alpha, 8, sb->alpha_bits, 8);
3237+
red = sample_to_target(ctx->plte.entries[i].red, 8, sb->red_bits, 8);
3238+
green = sample_to_target(ctx->plte.entries[i].green, 8, sb->green_bits, 8);
3239+
blue = sample_to_target(ctx->plte.entries[i].blue, 8, sb->blue_bits, 8);
3240+
alpha = sample_to_target(ctx->plte.entries[i].alpha, 8, sb->alpha_bits, 8);
3241+
3242+
#if defined(SPNG_ARM)
3243+
if(fmt == SPNG_FMT_RGB8 && ihdr->bit_depth == 8)
3244+
{/* Working with 3 bytes at a time is more of an ARM thing */
3245+
ctx->decode_plte.rgb[i * 3 + 0] = red;
3246+
ctx->decode_plte.rgb[i * 3 + 1] = green;
3247+
ctx->decode_plte.rgb[i * 3 + 2] = blue;
3248+
continue;
3249+
}
3250+
#endif
3251+
plte[i].red = red;
3252+
plte[i].green = green;
3253+
plte[i].blue = blue;
3254+
plte[i].alpha = alpha;
32023255
}
32033256

32043257
f.apply_trns = 0;
@@ -5043,4 +5096,74 @@ static void defilter_paeth4(size_t rowbytes, unsigned char *row, const unsigned
50435096
}
50445097
}
50455098

5099+
/* NEON optimised palette expansion functions
5100+
* Derived from palette_neon_intrinsics.c
5101+
*
5102+
* Copyright (c) 2018-2019 Cosmin Truta
5103+
* Copyright (c) 2017-2018 Arm Holdings. All rights reserved.
5104+
* Written by Richard Townsend <Richard.Townsend@arm.com>, February 2017.
5105+
*
5106+
* This code is derived from libpng source code.
5107+
* For conditions of distribution and use, see the disclaimer
5108+
* and license in this file.
5109+
*
5110+
* Related: https://developer.arm.com/documentation/101964/latest/Color-palette-expansion
5111+
*
5112+
* The functions were refactored to iterate forward.
5113+
*
5114+
*/
5115+
5116+
/* Expands a palettized row into RGBA8. */
5117+
static uint32_t expand_palette_rgba8_neon(unsigned char *row, const unsigned char *scanline, const unsigned char *plte, uint32_t width)
5118+
{
5119+
const uint32_t stride = 4;
5120+
const uint32_t *palette = (const uint32_t*)plte;
5121+
5122+
if(width < stride) return 0;
5123+
5124+
uint32_t i;
5125+
for(i=0; i < width; i += stride, scanline += stride, row += stride * 4)
5126+
{
5127+
uint32x4_t cur;
5128+
cur = vld1q_dup_u32 (palette + scanline[0]);
5129+
cur = vld1q_lane_u32(palette + scanline[1], cur, 1);
5130+
cur = vld1q_lane_u32(palette + scanline[2], cur, 2);
5131+
cur = vld1q_lane_u32(palette + scanline[3], cur, 3);
5132+
vst1q_u32((void*)row, cur);
5133+
}
5134+
5135+
/* Remove the amount that wasn't processed. */
5136+
if(i != width) i -= stride;
5137+
5138+
return i;
5139+
}
5140+
5141+
/* Expands a palettized row into RGB8. */
5142+
static uint32_t expand_palette_rgb8_neon(unsigned char *row, const unsigned char *scanline, const unsigned char *plte, uint32_t width)
5143+
{
5144+
const uint32_t stride = 8;
5145+
5146+
if(width <= stride) return 0;
5147+
5148+
uint32_t i;
5149+
for(i=0; i < width; i += stride, scanline += stride, row += stride * 3)
5150+
{
5151+
uint8x8x3_t cur;
5152+
cur = vld3_dup_u8 (plte + 3 * scanline[0]);
5153+
cur = vld3_lane_u8(plte + 3 * scanline[1], cur, 1);
5154+
cur = vld3_lane_u8(plte + 3 * scanline[2], cur, 2);
5155+
cur = vld3_lane_u8(plte + 3 * scanline[3], cur, 3);
5156+
cur = vld3_lane_u8(plte + 3 * scanline[4], cur, 4);
5157+
cur = vld3_lane_u8(plte + 3 * scanline[5], cur, 5);
5158+
cur = vld3_lane_u8(plte + 3 * scanline[6], cur, 6);
5159+
cur = vld3_lane_u8(plte + 3 * scanline[7], cur, 7);
5160+
vst3_u8((void*)row, cur);
5161+
}
5162+
5163+
/* Remove the amount that wasn't processed. */
5164+
if(i != width) i -= stride;
5165+
5166+
return i;
5167+
}
5168+
50465169
#endif /* SPNG_ARM */

0 commit comments

Comments
 (0)