libvisiontransfer  10.6.0
bitconversions.cpp
/*******************************************************************************
 * Copyright (c) 2023 Allied Vision Technologies GmbH
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *******************************************************************************/

#include "visiontransfer/bitconversions.h"
#include "visiontransfer/exceptions.h"

// SIMD Headers
#ifdef __AVX2__
#   include <immintrin.h>
#elif defined(__SSE4_1__)
#   include <smmintrin.h>
#elif defined(__SSE2__)
#   include <emmintrin.h>
#endif

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif

using namespace visiontransfer;
using namespace visiontransfer::internal;

namespace visiontransfer {
namespace internal {

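// In the 12-bit packed format, two pixels occupy three consecutive bytes:
// byte 0 holds the low 8 bits of the first pixel and the low nibble of byte 1
// its high 4 bits; the high nibble of byte 1 holds the low 4 bits of the
// second pixel and byte 2 its high 8 bits (see decode12BitPackedFallback()
// for the scalar arithmetic).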
void BitConversions::decode12BitPacked(int startRow, int stopRow, const unsigned char* src,
        unsigned char* dst, int srcStride, int dstStride, int rowWidth) {

    const unsigned char* dispStart = src;

#   ifdef __SSE4_1__
    if(rowWidth % 32 == 0) {
        if(srcStride % 16 == 0 && reinterpret_cast<size_t>(src) % 16 == 0) {
            decode12BitPackedSSE4<true>(startRow, stopRow, dispStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        } else {
            decode12BitPackedSSE4<false>(startRow, stopRow, dispStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        }

    } else // We use the fallback implementation if the image width is not divisible by 32
#   endif
#   if defined(__ARM_NEON) && defined(__ARM_ARCH_ISA_A64)
    if(rowWidth % 32 == 0) {
        if(srcStride % 16 == 0 && reinterpret_cast<size_t>(src) % 16 == 0) {
            decode12BitPackedNEON<true>(startRow, stopRow, dispStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        } else {
            decode12BitPackedNEON<false>(startRow, stopRow, dispStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        }

    } else // We use the fallback implementation if the image width is not divisible by 32
#   endif
    {
        decode12BitPackedFallback(startRow, stopRow, dispStart, rowWidth,
            reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
    }
}

#ifdef __SSE4_1__
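// Each loop iteration decodes 32 packed pixels from 48 source bytes: byte
// shuffles expand the data so that every 16-bit lane holds all bits of one
// pixel, then a per-lane left shift (done via multiplication) followed by a
// right shift by 4 aligns each 12-bit value within its lane.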
template <bool alignedLoad>
void BitConversions::decode12BitPackedSSE4(int startRow, int stopRow, const unsigned char* dispStart,
        int width, unsigned short* dst, int srcStride, int dstStride) {
    if(width % 32 != 0) {
        throw ProtocolException("Image width must be a multiple of 32!");
    }

    // SSE optimized code
    unsigned char* outPos = &reinterpret_cast<unsigned char*>(dst)[startRow*dstStride];
    int outRowPadding = dstStride - 2*width;

    constexpr char ff = (char)0xff; // to prevent warnings
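    // Note: _mm_set_epi8() lists bytes from the most significant position
    // down. A 0xff mask entry makes _mm_shuffle_epi8() zero that output byte,
    // and selects the second source operand in _mm_blendv_epi8().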
    const __m128i shuffleMask1a = _mm_set_epi8(11, 10, 10, 9, 8, 7, 7, 6, 5, 4, 4, 3, 2, 1, 1, 0);
    const __m128i shuffleMask1b = _mm_set_epi8(ff, ff, ff, ff, ff, ff, ff, ff, ff, ff, ff, 15, 14, 13, 13, 12);

    const __m128i shuffleMask2a = _mm_set_epi8(7, 6, 6, 5, 4, 3, 3, 2, 1, 0, 0, ff, ff, ff, ff, ff);
    const __m128i shuffleMask2b = _mm_set_epi8(ff, ff, ff, ff, ff, 15, 15, 14, 13, 12, 12, 11, 10, 9, 9, 8);

    const __m128i shuffleMask3a = _mm_set_epi8(3, 2, 2, 1, 0, ff, ff, ff, ff, ff, ff, ff, ff, ff, ff, ff);
    const __m128i shuffleMask3b = _mm_set_epi8(15, 14, 14, 13, 12, 11, 11, 10, 9, 8, 8, 7, 6, 5, 5, 4);

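    // Multiplying a 16-bit lane by 16 is equivalent to shifting it left by 4
    // bits; lanes multiplied by 1 remain unchanged.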
    const __m128i shiftMultiplyMask = _mm_set_epi16(1, 16, 1, 16, 1, 16, 1, 16);

    const __m128i blendMask1 = _mm_set_epi8(ff, ff, ff, ff, ff, ff, ff, ff, ff, ff, ff, 0, 0, 0, 0, 0);
    const __m128i blendMask2 = _mm_set_epi8(ff, ff, ff, ff, ff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

    int dispRowWidth = width * 3/2;

    for(int y = startRow; y<stopRow; y++) {
        const unsigned char* rowPos = &dispStart[y*srcStride];
        const unsigned char* rowEnd = &dispStart[y*srcStride + dispRowWidth];

        while(rowPos < rowEnd) {
            // Load 32 packed pixels (48 bytes)
            // AA BA BB CC DC DD EE FE FF ...
            __m128i rowPixels1, rowPixels2, rowPixels3;
            if(alignedLoad) {
                rowPixels1 = _mm_load_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;

                rowPixels2 = _mm_load_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;

                rowPixels3 = _mm_load_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;
            } else {
                rowPixels1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;

                rowPixels2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;

                rowPixels3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;
            }

            // Duplicate bytes with shared data
            // BAAA BBBA DCCC DDDC FEEE FFFE (example without endianness swap!)
            __m128i part1 = _mm_shuffle_epi8(rowPixels1, shuffleMask1a);
            __m128i part2a = _mm_shuffle_epi8(rowPixels1, shuffleMask1b);
            __m128i part2b = _mm_shuffle_epi8(rowPixels2, shuffleMask2a);
            __m128i part3a = _mm_shuffle_epi8(rowPixels2, shuffleMask2b);
            __m128i part3b = _mm_shuffle_epi8(rowPixels3, shuffleMask3a);
            __m128i part4 = _mm_shuffle_epi8(rowPixels3, shuffleMask3b);

            __m128i part2 = _mm_blendv_epi8(part2a, part2b, blendMask1);
            __m128i part3 = _mm_blendv_epi8(part3a, part3b, blendMask2);

            // Shift left through multiplication
            // AAA0 BBBA CCC0 DDDC EEE0 FFFE
            __m128i shift1a = _mm_mullo_epi16(part1, shiftMultiplyMask);
            __m128i shift2a = _mm_mullo_epi16(part2, shiftMultiplyMask);
            __m128i shift3a = _mm_mullo_epi16(part3, shiftMultiplyMask);
            __m128i shift4a = _mm_mullo_epi16(part4, shiftMultiplyMask);

            // Shift right again
            // 0AAA 0BBB 0CCC 0DDD 0EEE 0FFF ...
            __m128i shift1b = _mm_srli_epi16(shift1a, 4);
            __m128i shift2b = _mm_srli_epi16(shift2a, 4);
            __m128i shift3b = _mm_srli_epi16(shift3a, 4);
            __m128i shift4b = _mm_srli_epi16(shift4a, 4);

            _mm_storeu_si128(reinterpret_cast<__m128i*>(outPos), shift1b);
            outPos += 16;
            _mm_storeu_si128(reinterpret_cast<__m128i*>(outPos), shift2b);
            outPos += 16;
            _mm_storeu_si128(reinterpret_cast<__m128i*>(outPos), shift3b);
            outPos += 16;
            _mm_storeu_si128(reinterpret_cast<__m128i*>(outPos), shift4b);
            outPos += 16;
        }

        outPos += outRowPadding;
    }
}
#endif

#if defined(__ARM_NEON) && defined(__ARM_ARCH_ISA_A64)
#define TX(y,x) ((x + y*16)/3 + ((x + y*16)%3)*16)
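// vld3q_u8() de-interleaves 48 source bytes across three registers (byte n
// ends up in register n%3, lane n/3), and vqtbl3q_u8() indexes those three
// registers as a single 48-byte table. TX(y,x) translates the linear byte
// offset x + y*16 into the corresponding table index.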

template <bool alignedLoad>
void BitConversions::decode12BitPackedNEON(int startRow, int stopRow, const unsigned char* dispStart,
        int width, unsigned short* dst, int srcStride, int dstStride) {
    if(width % 32 != 0) {
        throw ProtocolException("Image width must be a multiple of 32!");
    }

    // ARM NEON A64 optimized code
    unsigned char* outPos = &reinterpret_cast<unsigned char*>(dst)[startRow*dstStride];
    int outRowPadding = dstStride - 2*width;

    // Shuffle mask already performs endianness swapping
    const uint8x16_t shuffleMask1 = {TX(0,0), TX(0,1), TX(0,1), TX(0,2), TX(0,3), TX(0,4),
        TX(0,4), TX(0,5), TX(0,6), TX(0,7), TX(0,7), TX(0,8), TX(0,9), TX(0,10), TX(0,10), TX(0,11)};
    const uint8x16_t shuffleMask2 = {TX(0,12), TX(0,13), TX(0,13), TX(0,14), TX(0,15), TX(1,0),
        TX(1,0), TX(1,1), TX(1,2), TX(1,3), TX(1,3), TX(1,4), TX(1,5), TX(1,6), TX(1,6), TX(1,7)};
    const uint8x16_t shuffleMask3 = {TX(1,8), TX(1,9), TX(1,9), TX(1,10), TX(1,11), TX(1,12),
        TX(1,12), TX(1,13), TX(1,14), TX(1,15), TX(1,15), TX(2,0), TX(2,1), TX(2,2), TX(2,2), TX(2,3)};
    const uint8x16_t shuffleMask4 = {TX(2,4), TX(2,5), TX(2,5), TX(2,6), TX(2,7), TX(2,8),
        TX(2,8), TX(2,9), TX(2,10), TX(2,11), TX(2,11), TX(2,12), TX(2,13), TX(2,14), TX(2,14), TX(2,15)};

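    // Per-lane shift amounts for vshlq_u16(): even lanes are shifted left by 4
    // bits, odd lanes stay unchanged (the NEON counterpart of the
    // multiply-by-16 step in the SSE4 variant).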
    const int16x8_t shiftMask = {4, 0, 4, 0, 4, 0, 4, 0};

    int dispRowWidth = width * 3/2;

    for(int y = startRow; y<stopRow; y++) {
        const unsigned char* rowPos = &dispStart[y*srcStride];
        const unsigned char* rowEnd = &dispStart[y*srcStride + dispRowWidth];

        while(rowPos < rowEnd) {
            // Load 32 packed pixels (48 bytes)
            // AA BA BB CC DC DD EE FE FF
            uint8x16x3_t rowPixels;
            if(alignedLoad) {
                rowPixels = vld3q_u8(reinterpret_cast<const uint8_t*>(
                    __builtin_assume_aligned(rowPos, 16)));
            } else {
                rowPixels = vld3q_u8(reinterpret_cast<const uint8_t*>(rowPos));
            }
            rowPos += 48;

            // Duplicate bytes with shared data
            // BAAA BBBA DCCC DDDC FEEE FFFE (example without endianness swap!)
            uint8x16_t part1 = vqtbl3q_u8(rowPixels, shuffleMask1);
            uint8x16_t part2 = vqtbl3q_u8(rowPixels, shuffleMask2);
            uint8x16_t part3 = vqtbl3q_u8(rowPixels, shuffleMask3);
            uint8x16_t part4 = vqtbl3q_u8(rowPixels, shuffleMask4);

            // Shift left
            // AAA0 BBBA CCC0 DDDC EEE0 FFFE
            uint16x8_t shift1a = vshlq_u16(vreinterpretq_u16_u8(part1), shiftMask);
            uint16x8_t shift2a = vshlq_u16(vreinterpretq_u16_u8(part2), shiftMask);
            uint16x8_t shift3a = vshlq_u16(vreinterpretq_u16_u8(part3), shiftMask);
            uint16x8_t shift4a = vshlq_u16(vreinterpretq_u16_u8(part4), shiftMask);

            // Shift right again
            // 0AAA 0BBB 0CCC 0DDD 0EEE 0FFF ...
            uint16x8_t shift1b = vshrq_n_u16(shift1a, 4);
            uint16x8_t shift2b = vshrq_n_u16(shift2a, 4);
            uint16x8_t shift3b = vshrq_n_u16(shift3a, 4);
            uint16x8_t shift4b = vshrq_n_u16(shift4a, 4);

            vst1q_u16(reinterpret_cast<uint16_t*>(outPos), shift1b);
            outPos += 16;
            vst1q_u16(reinterpret_cast<uint16_t*>(outPos), shift2b);
            outPos += 16;
            vst1q_u16(reinterpret_cast<uint16_t*>(outPos), shift3b);
            outPos += 16;
            vst1q_u16(reinterpret_cast<uint16_t*>(outPos), shift4b);
            outPos += 16;
        }

        outPos += outRowPadding;
    }
}
#endif

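// Scalar fallback: each iteration consumes three source bytes and emits two
// 16-bit pixels:
//   dst[0] = src[0] | ((src[1] & 0x0f) << 8)
//   dst[1] = (src[1] >> 4) | (src[2] << 4)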
void BitConversions::decode12BitPackedFallback(int startRow, int stopRow, const unsigned char* dispStart,
        int width, unsigned short* dst, int srcStride, int dstStride) {

    int dstStrideShort = dstStride/2;

    // Non-SSE version
    for(int y = startRow; y < stopRow; y++) {
        const unsigned char* srcPtr = &dispStart[y*srcStride];
        unsigned short* dstPtr = &dst[y*dstStrideShort];
        unsigned short* dstEndPtr = dstPtr + width;

        while(dstPtr != dstEndPtr) {
            *dstPtr = static_cast<unsigned short>(*srcPtr);
            srcPtr++;
            *dstPtr |= static_cast<unsigned short>(*srcPtr & 0x0f) << 8;
            dstPtr++;

            *dstPtr = static_cast<unsigned short>(*srcPtr) >> 4;
            srcPtr++;
            *dstPtr |= static_cast<unsigned short>(*srcPtr) << 4;
            srcPtr++;
            dstPtr++;
        }
    }
}

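// Inverse of decode12BitPacked(): packs the lower 12 bits of each pair of
// 16-bit input values into three consecutive bytes.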
void BitConversions::encode12BitPacked(int startRow, int stopRow, const unsigned char* src,
        unsigned char* dst, int srcStride, int dstStride, int rowWidth) {
    const unsigned short* srcShort = reinterpret_cast<const unsigned short*>(src);
    int srcStrideShort = srcStride/2;

    // SSE/NEON optimization is not yet available
    for(int y = startRow; y < stopRow; y++) {
        const unsigned short* srcPtr = &srcShort[y*srcStrideShort];
        const unsigned short* srcEndPtr = srcPtr + rowWidth;
        unsigned char* dstPtr = &dst[y*dstStride];

        while(srcPtr != srcEndPtr) {
            *dstPtr = static_cast<unsigned char>(*srcPtr);
            dstPtr++;
            *dstPtr = static_cast<unsigned char>(*srcPtr >> 8) & 0x0f;
            srcPtr++;

            *dstPtr |= static_cast<unsigned char>(*srcPtr) << 4;
            dstPtr++;
            *dstPtr = static_cast<unsigned char>(*srcPtr >> 4);
            srcPtr++;
            dstPtr++;
        }
    }
}

}} // namespace
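// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the original source): decoding a
// full 12-bit packed frame into 16-bit output. The image dimensions, the
// buffer handling, and the static call into BitConversions are assumptions.
//
// #include <vector>
// #include "visiontransfer/bitconversions.h"
//
// void decodeFrameExample() {
//     const int width = 640, height = 480;  // width chosen as a multiple of 32
//     const int srcStride = width * 3 / 2;  // 12 bits per packed pixel
//     const int dstStride = width * 2;      // 16 bits per output pixel
//     std::vector<unsigned char> packed(srcStride * height);   // filled by the caller
//     std::vector<unsigned char> unpacked(dstStride * height);
//     visiontransfer::internal::BitConversions::decode12BitPacked(
//         0, height, packed.data(), unpacked.data(), srcStride, dstStride, width);
// }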