15 #include "visiontransfer/bitconversions.h"
16 #include "visiontransfer/exceptions.h"
20 # include <immintrin.h>
22 # include <smmintrin.h>
24 # include <emmintrin.h>
31 using namespace visiontransfer;
32 using namespace visiontransfer::internal;
34 namespace visiontransfer {
37 void BitConversions::decode12BitPacked(
int startRow,
int stopRow,
const unsigned char* src,
38 unsigned char* dst,
int srcStride,
int dstStride,
int rowWidth) {
40 const unsigned char* dispStart = src;
43 if(rowWidth % 32 == 0) {
44 if(srcStride % 16 == 0 &&
reinterpret_cast<size_t>(src) % 16 == 0) {
45 decode12BitPackedSSE4<true>(startRow, stopRow, dispStart,
46 rowWidth,
reinterpret_cast<unsigned short*
>(dst), srcStride, dstStride);
48 decode12BitPackedSSE4<false>(startRow, stopRow, dispStart,
49 rowWidth,
reinterpret_cast<unsigned short*
>(dst), srcStride, dstStride);
54 # if defined(__ARM_NEON) && defined(__ARM_ARCH_ISA_A64)
55 if(rowWidth % 32 == 0) {
56 if(srcStride % 16 == 0 &&
reinterpret_cast<size_t>(src) % 16 == 0) {
57 decode12BitPackedNEON<true>(startRow, stopRow, dispStart,
58 rowWidth,
reinterpret_cast<unsigned short*
>(dst), srcStride, dstStride);
60 decode12BitPackedNEON<false>(startRow, stopRow, dispStart,
61 rowWidth,
reinterpret_cast<unsigned short*
>(dst), srcStride, dstStride);
67 decode12BitPackedFallback(startRow, stopRow, dispStart, rowWidth,
68 reinterpret_cast<unsigned short*
>(dst), srcStride, dstStride);
#ifdef __SSE4_1__
/// SSE4.1 implementation of the 12-bit packed decoder. Processes 32 pixels
/// (48 source bytes) per inner-loop iteration.
///
/// @tparam alignedLoad Use aligned 128-bit loads; the caller must guarantee
///         16-byte alignment of the source pointer and stride.
template <bool alignedLoad>
void BitConversions::decode12BitPackedSSE4(int startRow, int stopRow, const unsigned char* dispStart,
        int width, unsigned short* dst, int srcStride, int dstStride) {
    // The shuffle/blend scheme below only works for whole 32-pixel groups.
    if(width % 32 != 0) {
        throw ProtocolException("Image width must be a multiple of 32!");
    }

    // Output position in bytes so dstStride (a byte stride) applies directly.
    unsigned char* outPos = &reinterpret_cast<unsigned char*>(dst)[startRow*dstStride];
    int outRowPadding = dstStride - 2*width;

    constexpr char ff = (char)0xff;

    // Shuffle masks that duplicate the source bytes shared by two adjacent
    // 12-bit values, producing one 16-bit lane per output pixel. Lanes set
    // to 0xff are zeroed and filled from the neighboring register via blend.
    const __m128i shuffleMask1a = _mm_set_epi8(11, 10, 10, 9, 8, 7, 7, 6, 5, 4, 4, 3, 2, 1, 1, 0);
    const __m128i shuffleMask1b = _mm_set_epi8(ff, ff, ff, ff, ff, ff, ff, ff, ff, ff, ff, 15, 14, 13, 13, 12);

    const __m128i shuffleMask2a = _mm_set_epi8(7, 6, 6, 5, 4, 3, 3, 2, 1, 0, 0, ff, ff, ff, ff, ff);
    const __m128i shuffleMask2b = _mm_set_epi8(ff, ff, ff, ff, ff, 15, 15, 14, 13, 12, 12, 11, 10, 9, 9, 8);

    const __m128i shuffleMask3a = _mm_set_epi8(3, 2, 2, 1, 0, ff, ff, ff, ff, ff, ff, ff, ff, ff, ff, ff);
    const __m128i shuffleMask3b = _mm_set_epi8(15, 14, 14, 13, 12, 11, 11, 10, 9, 8, 8, 7, 6, 5, 5, 4);

    // Multiplying by 16 shifts every second 16-bit lane left by 4 bits, so
    // that a uniform right shift by 4 afterwards aligns all 12-bit values.
    const __m128i shiftMultiplyMask = _mm_set_epi16(1, 16, 1, 16, 1, 16, 1, 16);

    // Blend masks for merging the partial shuffles of adjacent registers.
    const __m128i blendMask1 = _mm_set_epi8(ff, ff, ff, ff, ff, ff, ff, ff, ff, ff, ff, 0, 0, 0, 0, 0);
    const __m128i blendMask2 = _mm_set_epi8(ff, ff, ff, ff, ff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

    // Packed bytes per row: 1.5 bytes per pixel.
    int dispRowWidth = width * 3/2;

    for(int y = startRow; y<stopRow; y++) {
        const unsigned char* rowPos = &dispStart[y*srcStride];
        const unsigned char* rowEnd = &dispStart[y*srcStride + dispRowWidth];

        while(rowPos < rowEnd) {
            // Load 48 packed bytes (= 32 pixels) into three registers.
            __m128i rowPixels1, rowPixels2, rowPixels3;
            if(alignedLoad) {
                rowPixels1 = _mm_load_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;
                rowPixels2 = _mm_load_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;
                rowPixels3 = _mm_load_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;
            } else {
                rowPixels1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;
                rowPixels2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;
                rowPixels3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;
            }

            // Distribute the packed bytes into one 16-bit lane per pixel.
            __m128i part1 = _mm_shuffle_epi8(rowPixels1, shuffleMask1a);
            __m128i part2a = _mm_shuffle_epi8(rowPixels1, shuffleMask1b);
            __m128i part2b = _mm_shuffle_epi8(rowPixels2, shuffleMask2a);
            __m128i part3a = _mm_shuffle_epi8(rowPixels2, shuffleMask2b);
            __m128i part3b = _mm_shuffle_epi8(rowPixels3, shuffleMask3a);
            __m128i part4 = _mm_shuffle_epi8(rowPixels3, shuffleMask3b);

            // Merge lanes that straddle two source registers.
            __m128i part2 = _mm_blendv_epi8(part2a, part2b, blendMask1);
            __m128i part3 = _mm_blendv_epi8(part3a, part3b, blendMask2);

            // Left-shift every second lane by 4 (via multiply) ...
            __m128i shift1a = _mm_mullo_epi16(part1, shiftMultiplyMask);
            __m128i shift2a = _mm_mullo_epi16(part2, shiftMultiplyMask);
            __m128i shift3a = _mm_mullo_epi16(part3, shiftMultiplyMask);
            __m128i shift4a = _mm_mullo_epi16(part4, shiftMultiplyMask);

            // ... then right-shift all lanes by 4 to align the 12-bit values.
            __m128i shift1b = _mm_srli_epi16(shift1a, 4);
            __m128i shift2b = _mm_srli_epi16(shift2a, 4);
            __m128i shift3b = _mm_srli_epi16(shift3a, 4);
            __m128i shift4b = _mm_srli_epi16(shift4a, 4);

            // Store 4 x 8 decoded pixels; destination may be unaligned.
            _mm_storeu_si128(reinterpret_cast<__m128i*>(outPos), shift1b);
            outPos += 16;
            _mm_storeu_si128(reinterpret_cast<__m128i*>(outPos), shift2b);
            outPos += 16;
            _mm_storeu_si128(reinterpret_cast<__m128i*>(outPos), shift3b);
            outPos += 16;
            _mm_storeu_si128(reinterpret_cast<__m128i*>(outPos), shift4b);
            outPos += 16;
        }

        // Skip any destination padding beyond the decoded row.
        outPos += outRowPadding;
    }
}
#endif
#if defined(__ARM_NEON) && defined(__ARM_ARCH_ISA_A64)
// Maps a logical packed-byte position (register y, byte x) to its location
// after vld3q_u8 has de-interleaved the 48 source bytes into three registers.
#define TX(y,x) ((x + y*16)/3 + ((x + y*16)%3)*16)

/// A64 NEON implementation of the 12-bit packed decoder. Processes 32 pixels
/// (48 source bytes) per inner-loop iteration.
///
/// @tparam alignedLoad Assume a 16-byte aligned source pointer and stride.
template <bool alignedLoad>
void BitConversions::decode12BitPackedNEON(int startRow, int stopRow, const unsigned char* dispStart,
        int width, unsigned short* dst, int srcStride, int dstStride) {
    // The table-lookup scheme below only works for whole 32-pixel groups.
    if(width % 32 != 0) {
        throw ProtocolException("Image width must be a multiple of 32!");
    }

    // Output position in bytes so dstStride (a byte stride) applies directly.
    unsigned char* outPos = &reinterpret_cast<unsigned char*>(dst)[startRow*dstStride];
    int outRowPadding = dstStride - 2*width;

    // Table-lookup masks that duplicate the source bytes shared by two
    // adjacent 12-bit values, producing one 16-bit lane per output pixel.
    // TX() compensates for the de-interleaved register layout of vld3q_u8.
    const uint8x16_t shuffleMask1 = {TX(0,0), TX(0,1), TX(0,1), TX(0,2), TX(0,3), TX(0,4),
        TX(0,4), TX(0,5), TX(0,6), TX(0,7), TX(0,7), TX(0,8), TX(0,9), TX(0,10), TX(0,10), TX(0,11)};
    const uint8x16_t shuffleMask2 = {TX(0,12), TX(0,13), TX(0,13), TX(0,14), TX(0,15), TX(1,0),
        TX(1,0), TX(1,1), TX(1,2), TX(1,3), TX(1,3), TX(1,4), TX(1,5), TX(1,6), TX(1,6), TX(1,7)};
    const uint8x16_t shuffleMask3 = {TX(1,8), TX(1,9), TX(1,9), TX(1,10), TX(1,11), TX(1,12),
        TX(1,12), TX(1,13), TX(1,14), TX(1,15), TX(1,15), TX(2,0), TX(2,1), TX(2,2), TX(2,2), TX(2,3)};
    const uint8x16_t shuffleMask4 = {TX(2,4), TX(2,5), TX(2,5), TX(2,6), TX(2,7), TX(2,8),
        TX(2,8), TX(2,9), TX(2,10), TX(2,11), TX(2,11), TX(2,12), TX(2,13), TX(2,14), TX(2,14), TX(2,15)};

    // Left-shift every second 16-bit lane by 4, so that a uniform right
    // shift by 4 afterwards aligns all 12-bit values.
    const int16x8_t shiftMask = {4, 0, 4, 0, 4, 0, 4, 0};

    // Packed bytes per row: 1.5 bytes per pixel.
    int dispRowWidth = width * 3/2;

    for(int y = startRow; y<stopRow; y++) {
        const unsigned char* rowPos = &dispStart[y*srcStride];
        const unsigned char* rowEnd = &dispStart[y*srcStride + dispRowWidth];

        while(rowPos < rowEnd) {
            // Load 48 packed bytes (= 32 pixels), de-interleaved into three
            // registers by vld3q_u8.
            uint8x16x3_t rowPixels;
            if(alignedLoad) {
                rowPixels = vld3q_u8(reinterpret_cast<const uint8_t*>(
                    __builtin_assume_aligned(rowPos, 16)));
            } else {
                rowPixels = vld3q_u8(reinterpret_cast<const uint8_t*>(rowPos));
            }
            rowPos += 48;

            // Distribute the packed bytes into one 16-bit lane per pixel.
            uint8x16_t part1 = vqtbl3q_u8(rowPixels, shuffleMask1);
            uint8x16_t part2 = vqtbl3q_u8(rowPixels, shuffleMask2);
            uint8x16_t part3 = vqtbl3q_u8(rowPixels, shuffleMask3);
            uint8x16_t part4 = vqtbl3q_u8(rowPixels, shuffleMask4);

            // Left-shift every second lane by 4 ...
            uint16x8_t shift1a = vshlq_u16(vreinterpretq_u16_u8(part1), shiftMask);
            uint16x8_t shift2a = vshlq_u16(vreinterpretq_u16_u8(part2), shiftMask);
            uint16x8_t shift3a = vshlq_u16(vreinterpretq_u16_u8(part3), shiftMask);
            uint16x8_t shift4a = vshlq_u16(vreinterpretq_u16_u8(part4), shiftMask);

            // ... then right-shift all lanes by 4 to align the 12-bit values.
            uint16x8_t shift1b = vshrq_n_u16(shift1a, 4);
            uint16x8_t shift2b = vshrq_n_u16(shift2a, 4);
            uint16x8_t shift3b = vshrq_n_u16(shift3a, 4);
            uint16x8_t shift4b = vshrq_n_u16(shift4a, 4);

            // Store 4 x 8 decoded pixels.
            vst1q_u16(reinterpret_cast<uint16_t*>(outPos), shift1b);
            outPos += 16;
            vst1q_u16(reinterpret_cast<uint16_t*>(outPos), shift2b);
            outPos += 16;
            vst1q_u16(reinterpret_cast<uint16_t*>(outPos), shift3b);
            outPos += 16;
            vst1q_u16(reinterpret_cast<uint16_t*>(outPos), shift4b);
            outPos += 16;
        }

        // Skip any destination padding beyond the decoded row.
        outPos += outRowPadding;
    }
}
#endif
250 void BitConversions::decode12BitPackedFallback(
int startRow,
int stopRow,
const unsigned char* dispStart,
251 int width,
unsigned short* dst,
int srcStride,
int dstStride) {
253 int dstStrideShort = dstStride/2;
256 for(
int y = startRow; y < stopRow; y++) {
257 const unsigned char* srcPtr = &dispStart[y*srcStride];
258 unsigned short* dstPtr = &dst[y*dstStrideShort];
259 unsigned short* dstEndPtr = dstPtr + width;
261 while(dstPtr != dstEndPtr) {
262 *dstPtr =
static_cast<unsigned short>(*srcPtr);
264 *dstPtr |=
static_cast<unsigned short>(*srcPtr & 0x0f) << 8;
267 *dstPtr =
static_cast<unsigned short>(*srcPtr) >> 4;
269 *dstPtr |=
static_cast<unsigned short>(*srcPtr) << 4;
276 void BitConversions::encode12BitPacked(
int startRow,
int stopRow,
const unsigned char* src,
277 unsigned char* dst,
int srcStride,
int dstStride,
int rowWidth) {
278 const unsigned short* srcShort =
reinterpret_cast<const unsigned short*
>(src);
279 int srcStrideShort = srcStride/2;
282 for(
int y = startRow; y < stopRow; y++) {
283 const unsigned short* srcPtr = &srcShort[y*srcStrideShort];
284 const unsigned short* srcEndPtr = srcPtr + rowWidth;
285 unsigned char* dstPtr = &dst[y*dstStride];
287 while(srcPtr != srcEndPtr) {
288 *dstPtr =
static_cast<unsigned char>(*srcPtr);
290 *dstPtr =
static_cast<unsigned char>(*srcPtr >> 8) & 0x0f;
293 *dstPtr |=
static_cast<unsigned char>(*srcPtr) << 4;
295 *dstPtr =
static_cast<unsigned char>(*srcPtr >> 4);