/*
 * Converts one vector of sixteen packed 32-bit pixels into sixteen 32-bit
 * gray levels in the range [0, 255].
 *
 * Within each 32-bit lane, bits 0-7 are treated as red, bits 8-15 as green
 * and bits 16-23 as blue; bits 24-31 are ignored. Each channel is scaled by
 * its broadcast coefficient, the three products are summed, and the result
 * is rounded to the nearest integer and clamped.
 */
static inline __m512i
Rgb32ToGrayVec16_(__m512i pixel_vals, __m512 coefR, __m512 coefG, __m512 coefB)
{
    const __m512i Mask = _mm512_set1_epi32(0x000000ff);
    const __m512 packed_0_5 = _mm512_set1_ps(0.5f);
    const __m512 packed_255 = _mm512_set1_ps(255.0f);
    const __m512 packed_0 = _mm512_setzero_ps();

    /* Isolate each 8-bit channel in the low byte of its 32-bit lane. */
    __m512i upixelR = _mm512_and_si512(pixel_vals, Mask);
    __m512i upixelG = _mm512_and_si512(_mm512_srli_epi32(pixel_vals, 8), Mask);
    __m512i upixelB = _mm512_and_si512(_mm512_srli_epi32(pixel_vals, 16), Mask);

    __m512 fpixelR = _mm512_mul_ps(_mm512_cvtepi32_ps(upixelR), coefR);
    __m512 fpixelG = _mm512_mul_ps(_mm512_cvtepi32_ps(upixelG), coefG);
    __m512 fpixelB = _mm512_mul_ps(_mm512_cvtepi32_ps(upixelB), coefB);

    __m512 ps = _mm512_add_ps(fpixelR, fpixelG);
    __m512 vgs = _mm512_add_ps(fpixelB, ps);

    /*
     * Round half up: add 0.5 and then TRUNCATE. Converting with the
     * round-to-nearest instruction after the +0.5 bias would round twice
     * (e.g. 100.3 -> 100.8 -> 101).
     */
    vgs = _mm512_add_ps(vgs, packed_0_5);
    vgs = _mm512_min_ps(vgs, packed_255);
    /* Clamp below as well: a negative value would otherwise be seen as a huge
       unsigned number by the saturating narrow and come out as 255. */
    vgs = _mm512_max_ps(vgs, packed_0);

    return _mm512_cvttps_epi32(vgs);
}

/*
 * Converts an array of 32-bit RGB pixels to 8-bit grayscale using AVX-512F.
 *
 * For every pixel: gray = R * coef[0] + G * coef[1] + B * coef[2], rounded to
 * the nearest integer and clamped to [0, 255].
 *
 * pRgbSrcPixels  Source pixels. Sixteen pixels are read per 512-bit load, so
 *                this assumes sizeof(RGB_PIXEL) == 4 with red in the lowest
 *                byte -- TODO confirm against the RGB_PIXEL definition.
 * pDest          Receives nPixels gray bytes.
 * coef           Channel weights; only coef[0], coef[1] and coef[2] are read.
 * nPixels        Number of pixels to convert; any value is accepted,
 *                including 0 and counts that are not a multiple of 16.
 *
 * Neither buffer needs any particular alignment.
 */
void
Rgb32ToGray(const RGB_PIXEL* pRgbSrcPixels, uint8_t* pDest, const float coef[4], size_t nPixels)
{
    const __m512 coefR = _mm512_set1_ps(coef[0]);
    const __m512 coefG = _mm512_set1_ps(coef[1]);
    const __m512 coefB = _mm512_set1_ps(coef[2]);

    const size_t nElements = 16;
    const size_t nTail = nPixels % nElements;
    const size_t nFull = nPixels - nTail;

    /* Whole vectors: sixteen pixels in, sixteen gray bytes out. */
    for (size_t i = 0; i < nFull; i += nElements) {
        __m512i pixel_vals = _mm512_loadu_si512(&pRgbSrcPixels[i]);
        __m512i pixels32 = Rgb32ToGrayVec16_(pixel_vals, coefR, coefG, coefB);
        __m128i pixels8 = _mm512_cvtusepi32_epi8(pixels32);
        _mm_storeu_si128((__m128i*)&pDest[i], pixels8);
    }

    /*
     * Remaining 1..15 pixels: masked load/store touch only the lanes that
     * exist, so nothing is read or written past the end of either buffer.
     */
    if (nTail != 0) {
        const __mmask16 tailMask = (__mmask16)((1u << nTail) - 1u);
        __m512i pixel_vals = _mm512_maskz_loadu_epi32(tailMask, &pRgbSrcPixels[nFull]);
        __m512i pixels32 = Rgb32ToGrayVec16_(pixel_vals, coefR, coefG, coefB);
        _mm512_mask_cvtusepi32_storeu_epi8(&pDest[nFull], tailMask, pixels32);
    }
}