libvisiontransfer  5.2.0
bitconversions.cpp
/*******************************************************************************
 * Copyright (c) 2018 Nerian Vision Technologies
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *******************************************************************************/

#include "visiontransfer/bitconversions.h"
#include "visiontransfer/exceptions.h"

// SIMD Headers
#ifdef __AVX2__
# include <immintrin.h>
#elif __SSE4_1__
# include <smmintrin.h>
#elif __SSE2__
# include <emmintrin.h>
#endif

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif

void BitConversions::decode12BitSplit(int startRow, int stopRow, const unsigned char* src,
        unsigned char* dst, int srcStride, int dstStride, int rowWidth) {

    const unsigned char* dispStart = src;
    const unsigned char* subpixStart = &src[rowWidth];

# ifdef __AVX2__
    if(rowWidth % 32 == 0) {
        if(srcStride % 32 == 0 && reinterpret_cast<size_t>(src) % 32 == 0) {
            decode12BitSplitAVX2<true>(startRow, stopRow, dispStart, subpixStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        } else {
            decode12BitSplitAVX2<false>(startRow, stopRow, dispStart, subpixStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        }
    } else // Fall back to the SSE2 or generic implementation if the image
           // width is not divisible by 32
# endif
# ifdef __SSE2__
    if(rowWidth % 16 == 0) {
        if(srcStride % 16 == 0 && reinterpret_cast<size_t>(src) % 16 == 0) {
            decode12BitSplitSSE2<true>(startRow, stopRow, dispStart, subpixStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        } else {
            decode12BitSplitSSE2<false>(startRow, stopRow, dispStart, subpixStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        }
    } else // Fall back to the generic implementation if the image width is
           // not divisible by 16
# endif
    {
        decode12BitSplitFallback(startRow, stopRow, dispStart, subpixStart, rowWidth,
            reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
    }
}
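
// Illustrative usage sketch, not part of the library source: how decode12BitSplit
// might be called for one frame in the split format, where each source row holds
// rowWidth integer disparity bytes followed by rowWidth/2 bytes of packed 4-bit
// sub-pixel offsets. The 640x480 geometry, the buffer layout and the assumption
// that the decode methods are callable as static functions are made for this
// example only.
#if 0
#include <vector>

void exampleDecode12BitSplit(const unsigned char* received) {
    const int width = 640, height = 480;
    const int srcStride = width + width/2; // integer bytes + sub-pixel nibbles per row
    const int dstStride = width * 2;       // one unsigned short per output pixel
    std::vector<unsigned char> decoded(height * dstStride);
    BitConversions::decode12BitSplit(0, height, received, decoded.data(),
        srcStride, dstStride, width);
}
#endif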

#ifdef __SSE2__
template <bool alignedLoad>
void BitConversions::decode12BitSplitSSE2(int startRow, int stopRow, const unsigned char* dispStart,
        const unsigned char* subpixStart, int width, unsigned short* dst, int srcStride, int dstStride) {
    if(width % 16 != 0) {
        throw ProtocolException("Image width must be a multiple of 16!");
    }

    // SSE optimized code
    __m128i zero = _mm_set1_epi8(0x00);
    __m128i subpixMask = _mm_set1_epi8(0x0f);
    unsigned char* outPos = &reinterpret_cast<unsigned char*>(dst)[startRow*dstStride];
    int outRowPadding = dstStride - 2*width;

    for(int y = startRow; y<stopRow; y++) {
        const unsigned char* intPos = &dispStart[y*srcStride];
        const unsigned char* intEndPos = &dispStart[y*srcStride + width];
        const unsigned char* subpixPos = &subpixStart[y*srcStride];

        for(; intPos < intEndPos;) {
            // Get subpix offsets
            __m128i subpixOffsets;
            if(alignedLoad) {
                subpixOffsets = _mm_load_si128(reinterpret_cast<const __m128i*>(subpixPos));
            } else {
                subpixOffsets = _mm_loadu_si128(reinterpret_cast<const __m128i*>(subpixPos));
            }
            subpixPos += 16;

            __m128i offsetsEven = _mm_and_si128(subpixOffsets, subpixMask);
            __m128i offsetsUneven = _mm_and_si128(_mm_srli_epi16(subpixOffsets, 4), subpixMask);

            for(int i=0; i<2; i++) {
                // Load integer disparities
                __m128i intDisps;
                if(alignedLoad) {
                    intDisps = _mm_load_si128(reinterpret_cast<const __m128i*>(intPos));
                } else {
                    intDisps = _mm_loadu_si128(reinterpret_cast<const __m128i*>(intPos));
                }

                intPos += 16;

                // Get integer disparities shifted by 4
                __m128i disps1 = _mm_slli_epi16(_mm_unpacklo_epi8(intDisps, zero), 4);
                __m128i disps2 = _mm_slli_epi16(_mm_unpackhi_epi8(intDisps, zero), 4);

                // Unpack subpixel offsets for selected disparities
                __m128i offsets;
                if(i == 0) {
                    offsets = _mm_unpacklo_epi8(offsetsEven, offsetsUneven);
                } else {
                    offsets = _mm_unpackhi_epi8(offsetsEven, offsetsUneven);
                }

                // Add subpixel offsets to integer disparities
                disps1 = _mm_or_si128(disps1, _mm_unpacklo_epi8(offsets, zero));
                disps2 = _mm_or_si128(disps2, _mm_unpackhi_epi8(offsets, zero));

                // Store result
                _mm_store_si128(reinterpret_cast<__m128i*>(outPos), disps1);
                outPos += 16;
                _mm_store_si128(reinterpret_cast<__m128i*>(outPos), disps2);
                outPos += 16;

                if(intPos >= intEndPos) {
                    // For the last pixel we might need one iteration less
                    break;
                }
            }
        }

        outPos += outRowPadding;
    }
}
#endif

# ifdef __AVX2__
template <bool alignedLoad>
void BitConversions::decode12BitSplitAVX2(int startRow, int stopRow, const unsigned char* dispStart,
        const unsigned char* subpixStart, int width, unsigned short* dst, int srcStride, int dstStride) {
    if(width % 32 != 0) {
        // The dispatcher is expected to fall back to the SSE2 or generic
        // implementation in this case
        throw ProtocolException("Image width must be a multiple of 32!");
    }

    // AVX2 optimized code
    __m256i zero = _mm256_set1_epi8(0x00);
    __m256i subpixMask = _mm256_set1_epi8(0x0f);
    unsigned char* outPos = &reinterpret_cast<unsigned char*>(dst)[startRow*dstStride];
    int outRowPadding = dstStride - 2*width;

    for(int y = startRow; y<stopRow; y++) {
        const unsigned char* intPos = &dispStart[y*srcStride];
        const unsigned char* intEndPos = &dispStart[y*srcStride + width];
        const unsigned char* subpixPos = &subpixStart[y*srcStride];

        for(; intPos < intEndPos;) {
            // Get subpix offsets
            __m256i subpixOffsets;

            if(alignedLoad) {
                subpixOffsets = _mm256_load_si256(reinterpret_cast<const __m256i*>(subpixPos));
            } else {
                subpixOffsets = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(subpixPos));
            }
            subpixPos += 32;

            __m256i offsetsEven = _mm256_and_si256(subpixOffsets, subpixMask);
            __m256i offsetsUneven = _mm256_and_si256(_mm256_srli_epi16(subpixOffsets, 4), subpixMask);

            for(int i=0; i<2; i++) {
                // Load integer disparities
                __m256i intDisps;
                if(alignedLoad) {
                    intDisps = _mm256_load_si256(reinterpret_cast<const __m256i*>(intPos));
                } else {
                    intDisps = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(intPos));
                }
                intPos += 32;

                // Stupid AVX2 unpack mixes everything up! Let's swap the register beforehand.
                __m256i intDispsMixup = _mm256_permute4x64_epi64(intDisps, 0xd8);

                // Get integer disparities shifted by 4
                __m256i disps1 = _mm256_slli_epi16(_mm256_unpacklo_epi8(intDispsMixup, zero), 4);
                __m256i disps2 = _mm256_slli_epi16(_mm256_unpackhi_epi8(intDispsMixup, zero), 4);

                // Unpack swap again :-(
                __m256i offsetsEvenMixup = _mm256_permute4x64_epi64(offsetsEven, 0xd8);
                __m256i offsetsUnevenMixup = _mm256_permute4x64_epi64(offsetsUneven, 0xd8);

                // Unpack subpixel offsets for selected disparities
                __m256i offsets;
                if(i == 0) {
                    offsets = _mm256_unpacklo_epi8(offsetsEvenMixup, offsetsUnevenMixup);
                } else {
                    offsets = _mm256_unpackhi_epi8(offsetsEvenMixup, offsetsUnevenMixup);
                }

                // And again!!
                __m256i offsetsMixup = _mm256_permute4x64_epi64(offsets, 0xd8);

                // Add subpixel offsets to integer disparities
                disps1 = _mm256_or_si256(disps1, _mm256_unpacklo_epi8(offsetsMixup, zero));
                disps2 = _mm256_or_si256(disps2, _mm256_unpackhi_epi8(offsetsMixup, zero));

                // Store result
                _mm256_store_si256(reinterpret_cast<__m256i*>(outPos), disps1);
                outPos += 32;
                _mm256_store_si256(reinterpret_cast<__m256i*>(outPos), disps2);
                outPos += 32;

                if(intPos >= intEndPos) {
                    // For the last pixel we might need one iteration less
                    break;
                }
            }
        }

        outPos += outRowPadding;
    }
}
# endif

void BitConversions::decode12BitSplitFallback(int startRow, int stopRow, const unsigned char* dispStart,
        const unsigned char* subpixStart, int width, unsigned short* dst, int srcStride, int dstStride) {

    int dstStrideShort = dstStride/2;

    // Non-SSE version
    for(int y = startRow; y < stopRow; y++) {
        for(int x = 0; x < width; x++) {

            unsigned short subpix = 0;
            if(x % 2 == 0) {
                subpix = subpixStart[y*srcStride + x/2] & 0x0F;
            } else {
                subpix = subpixStart[y*srcStride + x/2] >> 4;
            }

            dst[y*dstStrideShort + x] = (static_cast<unsigned short>(dispStart[y*srcStride + x]) << 4) | subpix;
        }
    }
}
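
// Worked example for the split format (illustration only, arbitrary values):
// for pixel x = 5 with dispStart[y*srcStride + 5] == 0x2A and
// subpixStart[y*srcStride + 2] == 0x7C, x is odd, so the high nibble is used:
//     subpix = 0x7C >> 4 = 0x07
//     dst    = (0x2A << 4) | 0x07 = 0x2A7
// i.e. an integer disparity of 42 pixels plus a sub-pixel offset of 7/16.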

void BitConversions::decode12BitPacked(int startRow, int stopRow, const unsigned char* src,
        unsigned char* dst, int srcStride, int dstStride, int rowWidth) {

    const unsigned char* dispStart = src;

# ifdef __SSE4_1__
    if(rowWidth % 32 == 0) {
        if(srcStride % 16 == 0 && reinterpret_cast<size_t>(src) % 16 == 0) {
            decode12BitPackedSSE4<true>(startRow, stopRow, dispStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        } else {
            decode12BitPackedSSE4<false>(startRow, stopRow, dispStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        }

    } else // Fall back to the generic implementation if the image width is not divisible by 32
# endif
# if defined(__ARM_NEON) && defined(__ARM_ARCH_ISA_A64)
    if(rowWidth % 32 == 0) {
        if(srcStride % 16 == 0 && reinterpret_cast<size_t>(src) % 16 == 0) {
            decode12BitPackedNEON<true>(startRow, stopRow, dispStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        } else {
            decode12BitPackedNEON<false>(startRow, stopRow, dispStart,
                rowWidth, reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
        }

    } else // Fall back to the generic implementation if the image width is not divisible by 32
# endif
    {
        decode12BitPackedFallback(startRow, stopRow, dispStart, rowWidth,
            reinterpret_cast<unsigned short*>(dst), srcStride, dstStride);
    }
}
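
// Illustrative usage sketch, not part of the library source: in the packed format
// two 12-bit pixels share three consecutive bytes, so a source row occupies
// rowWidth*3/2 bytes and the decoded output needs 2*rowWidth bytes per row. The
// geometry below and the assumption of static methods are for this example only.
#if 0
#include <vector>

void exampleDecode12BitPacked(const unsigned char* received) {
    const int width = 640, height = 480;
    std::vector<unsigned char> decoded(height * width * 2);
    BitConversions::decode12BitPacked(0, height, received, decoded.data(),
        width*3/2 /*srcStride*/, width*2 /*dstStride*/, width);
}
#endif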

#ifdef __SSE4_1__
template <bool alignedLoad>
void BitConversions::decode12BitPackedSSE4(int startRow, int stopRow, const unsigned char* dispStart,
        int width, unsigned short* dst, int srcStride, int dstStride) {
    if(width % 32 != 0) {
        throw ProtocolException("Image width must be a multiple of 32!");
    }

    // SSE optimized code
    unsigned char* outPos = &reinterpret_cast<unsigned char*>(dst)[startRow*dstStride];
    int outRowPadding = dstStride - 2*width;

    const __m128i shuffleMask1a = _mm_set_epi8(11, 10, 10, 9, 8, 7, 7, 6, 5, 4, 4, 3, 2, 1, 1, 0);
    const __m128i shuffleMask1b = _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 15, 14, 13, 13, 12);

    const __m128i shuffleMask2a = _mm_set_epi8(7, 6, 6, 5, 4, 3, 3, 2, 1, 0, 0, 0xff, 0xff, 0xff, 0xff, 0xff);
    const __m128i shuffleMask2b = _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 15, 15, 14, 13, 12, 12, 11, 10, 9, 9, 8);

    const __m128i shuffleMask3a = _mm_set_epi8(3, 2, 2, 1, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff);
    const __m128i shuffleMask3b = _mm_set_epi8(15, 14, 14, 13, 12, 11, 11, 10, 9, 8, 8, 7, 6, 5, 5, 4);

    const __m128i shiftMultiplyMask = _mm_set_epi16(1, 16, 1, 16, 1, 16, 1, 16);

    const __m128i blendMask1 = _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0);
    const __m128i blendMask2 = _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

    int dispRowWidth = width * 3/2;

    for(int y = startRow; y<stopRow; y++) {
        const unsigned char* rowPos = &dispStart[y*srcStride];
        const unsigned char* rowEnd = &dispStart[y*srcStride + dispRowWidth];

        while(rowPos < rowEnd) {
            // Load 48 bytes (32 packed pixels)
            // AA BA BB CC DC DD EE FE FF ...
            __m128i rowPixels1, rowPixels2, rowPixels3;
            if(alignedLoad) {
                rowPixels1 = _mm_load_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;

                rowPixels2 = _mm_load_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;

                rowPixels3 = _mm_load_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;
            } else {
                rowPixels1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;

                rowPixels2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;

                rowPixels3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowPos));
                rowPos += 16;
            }

            // Duplicate bytes with shared data
            // BAAA BBBA DCCC DDDC FEEE FFFE (example without endianness swap!)
            __m128i part1 = _mm_shuffle_epi8(rowPixels1, shuffleMask1a);
            __m128i part2a = _mm_shuffle_epi8(rowPixels1, shuffleMask1b);
            __m128i part2b = _mm_shuffle_epi8(rowPixels2, shuffleMask2a);
            __m128i part3a = _mm_shuffle_epi8(rowPixels2, shuffleMask2b);
            __m128i part3b = _mm_shuffle_epi8(rowPixels3, shuffleMask3a);
            __m128i part4 = _mm_shuffle_epi8(rowPixels3, shuffleMask3b);

            __m128i part2 = _mm_blendv_epi8(part2a, part2b, blendMask1);
            __m128i part3 = _mm_blendv_epi8(part3a, part3b, blendMask2);

            // Shift left through multiplication
            // AAA0 BBBA CCC0 DDDC EEE0 FFFE
            __m128i shift1a = _mm_mullo_epi16(part1, shiftMultiplyMask);
            __m128i shift2a = _mm_mullo_epi16(part2, shiftMultiplyMask);
            __m128i shift3a = _mm_mullo_epi16(part3, shiftMultiplyMask);
            __m128i shift4a = _mm_mullo_epi16(part4, shiftMultiplyMask);

            // Shift right again
            // 0AAA 0BBB 0CCC 0DDD 0EEE 0FFF ...
            __m128i shift1b = _mm_srli_epi16(shift1a, 4);
            __m128i shift2b = _mm_srli_epi16(shift2a, 4);
            __m128i shift3b = _mm_srli_epi16(shift3a, 4);
            __m128i shift4b = _mm_srli_epi16(shift4a, 4);

            _mm_storeu_si128(reinterpret_cast<__m128i*>(outPos), shift1b);
            outPos += 16;
            _mm_storeu_si128(reinterpret_cast<__m128i*>(outPos), shift2b);
            outPos += 16;
            _mm_storeu_si128(reinterpret_cast<__m128i*>(outPos), shift3b);
            outPos += 16;
            _mm_storeu_si128(reinterpret_cast<__m128i*>(outPos), shift4b);
            outPos += 16;
        }

        outPos += outRowPadding;
    }
}
#endif

#if defined(__ARM_NEON) && defined(__ARM_ARCH_ISA_A64)
#define TX(y,x) ((x + y*16)/3 + ((x + y*16)%3)*16)
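// TX(y,x) yields the table index of original byte number (y*16 + x) after
// vld3q_u8 has de-interleaved a 48-byte block: byte i lands in register i%3 at
// lane i/3, which vqtbl3q_u8 addresses as index (i%3)*16 + i/3.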

template <bool alignedLoad>
void BitConversions::decode12BitPackedNEON(int startRow, int stopRow, const unsigned char* dispStart,
        int width, unsigned short* dst, int srcStride, int dstStride) {
    if(width % 32 != 0) {
        throw ProtocolException("Image width must be a multiple of 32!");
    }

    // ARM NEON A64 optimized code
    unsigned char* outPos = &reinterpret_cast<unsigned char*>(dst)[startRow*dstStride];
    int outRowPadding = dstStride - 2*width;

    // Shuffle mask already performs endianness swapping
    const uint8x16_t shuffleMask1 = {TX(0,0), TX(0,1), TX(0,1), TX(0,2), TX(0,3), TX(0,4),
        TX(0,4), TX(0,5), TX(0,6), TX(0,7), TX(0,7), TX(0,8), TX(0,9), TX(0,10), TX(0,10), TX(0,11)};
    const uint8x16_t shuffleMask2 = {TX(0,12), TX(0,13), TX(0,13), TX(0,14), TX(0,15), TX(1,0),
        TX(1,0), TX(1,1), TX(1,2), TX(1,3), TX(1,3), TX(1,4), TX(1,5), TX(1,6), TX(1,6), TX(1,7)};
    const uint8x16_t shuffleMask3 = {TX(1,8), TX(1,9), TX(1,9), TX(1,10), TX(1,11), TX(1,12),
        TX(1,12), TX(1,13), TX(1,14), TX(1,15), TX(1,15), TX(2,0), TX(2,1), TX(2,2), TX(2,2), TX(2,3)};
    const uint8x16_t shuffleMask4 = {TX(2,4), TX(2,5), TX(2,5), TX(2,6), TX(2,7), TX(2,8),
        TX(2,8), TX(2,9), TX(2,10), TX(2,11), TX(2,11), TX(2,12), TX(2,13), TX(2,14), TX(2,14), TX(2,15)};

    const int16x8_t shiftMask = {4, 0, 4, 0, 4, 0, 4, 0};

    int dispRowWidth = width * 3/2;

    for(int y = startRow; y<stopRow; y++) {
        const unsigned char* rowPos = &dispStart[y*srcStride];
        const unsigned char* rowEnd = &dispStart[y*srcStride + dispRowWidth];

        while(rowPos < rowEnd) {
            // Load 48 bytes (32 packed pixels), de-interleaved into three registers
            // AA BA BB CC DC DD EE FE FF
            uint8x16x3_t rowPixels;
            if(alignedLoad) {
                rowPixels = vld3q_u8(reinterpret_cast<const uint8_t*>(
                    __builtin_assume_aligned(rowPos, 16)));
            } else {
                rowPixels = vld3q_u8(reinterpret_cast<const uint8_t*>(rowPos));
            }
            rowPos += 48;

            // Duplicate bytes with shared data
            // BAAA BBBA DCCC DDDC FEEE FFFE (example without endianness swap!)
            uint8x16_t part1 = vqtbl3q_u8(rowPixels, shuffleMask1);
            uint8x16_t part2 = vqtbl3q_u8(rowPixels, shuffleMask2);
            uint8x16_t part3 = vqtbl3q_u8(rowPixels, shuffleMask3);
            uint8x16_t part4 = vqtbl3q_u8(rowPixels, shuffleMask4);

            // Shift left
            // AAA0 BBBA CCC0 DDDC EEE0 FFFE
            uint16x8_t shift1a = vshlq_u16(vreinterpretq_u16_u8(part1), shiftMask);
            uint16x8_t shift2a = vshlq_u16(vreinterpretq_u16_u8(part2), shiftMask);
            uint16x8_t shift3a = vshlq_u16(vreinterpretq_u16_u8(part3), shiftMask);
            uint16x8_t shift4a = vshlq_u16(vreinterpretq_u16_u8(part4), shiftMask);

            // Shift right again
            // 0AAA 0BBB 0CCC 0DDD 0EEE 0FFF ...
            uint16x8_t shift1b = vshrq_n_u16(shift1a, 4);
            uint16x8_t shift2b = vshrq_n_u16(shift2a, 4);
            uint16x8_t shift3b = vshrq_n_u16(shift3a, 4);
            uint16x8_t shift4b = vshrq_n_u16(shift4a, 4);

            vst1q_u16(reinterpret_cast<uint16_t*>(outPos), shift1b);
            outPos += 16;
            vst1q_u16(reinterpret_cast<uint16_t*>(outPos), shift2b);
            outPos += 16;
            vst1q_u16(reinterpret_cast<uint16_t*>(outPos), shift3b);
            outPos += 16;
            vst1q_u16(reinterpret_cast<uint16_t*>(outPos), shift4b);
            outPos += 16;
        }

        outPos += outRowPadding;
    }
}
#endif

void BitConversions::decode12BitPackedFallback(int startRow, int stopRow, const unsigned char* dispStart,
        int width, unsigned short* dst, int srcStride, int dstStride) {

    int dstStrideShort = dstStride/2;

    // Non-SSE version
    for(int y = startRow; y < stopRow; y++) {
        const unsigned char* srcPtr = &dispStart[y*srcStride];
        unsigned short* dstPtr = &dst[y*dstStrideShort];
        unsigned short* dstEndPtr = dstPtr + width;

        while(dstPtr != dstEndPtr) {
            *dstPtr = static_cast<unsigned short>(*srcPtr);
            srcPtr++;
            *dstPtr |= static_cast<unsigned short>(*srcPtr & 0x0f) << 8;
            dstPtr++;

            *dstPtr = static_cast<unsigned short>(*srcPtr) >> 4;
            srcPtr++;
            *dstPtr |= static_cast<unsigned short>(*srcPtr) << 4;
            srcPtr++;
            dstPtr++;
        }
    }
}
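
// Worked example for the packed format (illustration only, arbitrary values):
// the three packed bytes 0xAB, 0xCD, 0xEF decode to two 12-bit pixels:
//     pixel0 = 0xAB | ((0xCD & 0x0f) << 8)  = 0x0DAB
//     pixel1 = (0xCD >> 4) | (0xEF << 4)    = 0x0EFC
// so the first byte plus the low nibble of the middle byte form the first pixel,
// while the high nibble of the middle byte plus the last byte form the second.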

void BitConversions::encode12BitPacked(int startRow, int stopRow, const unsigned char* src,
        unsigned char* dst, int srcStride, int dstStride, int rowWidth) {
    const unsigned short* srcShort = reinterpret_cast<const unsigned short*>(src);
    int srcStrideShort = srcStride/2;

    // SSE/NEON optimization is not yet available
    for(int y = startRow; y < stopRow; y++) {
        const unsigned short* srcPtr = &srcShort[y*srcStrideShort];
        const unsigned short* srcEndPtr = srcPtr + rowWidth;
        unsigned char* dstPtr = &dst[y*dstStride];

        while(srcPtr != srcEndPtr) {
            *dstPtr = static_cast<unsigned char>(*srcPtr);
            dstPtr++;
            *dstPtr = static_cast<unsigned char>(*srcPtr >> 8) & 0x0f;
            srcPtr++;

            *dstPtr |= static_cast<unsigned char>(*srcPtr) << 4;
            dstPtr++;
            *dstPtr = static_cast<unsigned char>(*srcPtr >> 4);
            srcPtr++;
            dstPtr++;
        }
    }
}
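
// Illustrative round-trip sketch, not part of the library source: encoding 16-bit
// values (only the lower 12 bits are significant) and decoding them again should
// reproduce the input. The row geometry and the assumption of static methods are
// made for this example only.
#if 0
#include <cassert>
#include <vector>

void exampleRoundTrip12BitPacked() {
    const int width = 32, height = 2;
    std::vector<unsigned short> input(width * height);
    for(size_t i = 0; i < input.size(); i++) {
        input[i] = static_cast<unsigned short>((i * 37) & 0xfff); // arbitrary 12-bit values
    }

    std::vector<unsigned char> packed(width * height * 3 / 2);
    BitConversions::encode12BitPacked(0, height, reinterpret_cast<const unsigned char*>(input.data()),
        packed.data(), width*2 /*srcStride*/, width*3/2 /*dstStride*/, width);

    std::vector<unsigned short> output(width * height);
    BitConversions::decode12BitPacked(0, height, packed.data(),
        reinterpret_cast<unsigned char*>(output.data()), width*3/2 /*srcStride*/, width*2 /*dstStride*/, width);

    assert(input == output);
}
#endif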