DFace SDK 1.8.5
mat_pixel.h
1 #ifndef DFACE_MATPIXEL_H
2 #define DFACE_MATPIXEL_H
3 
4 #include "dface/mat.h"
5 #include <limits.h>
6 #include <algorithm>
7 #if __ARM_NEON
8 #include <arm_neon.h>
9 #endif // __ARM_NEON
10 
11 
12 namespace dface {
13 
14 static Mat from_rgb(const unsigned char* rgb, int w, int h)
15 {
16  Mat m(w, h, 3);
17  if (m.empty())
18  return m;
19 
20  float* ptr0 = m.channel(0);
21  float* ptr1 = m.channel(1);
22  float* ptr2 = m.channel(2);
23 
24  int size = w * h;
25 
26 #if __ARM_NEON
27  int nn = size >> 3;
28  int remain = size - (nn << 3);
29 #else
30  int remain = size;
31 #endif // __ARM_NEON
32 
33 #if __ARM_NEON
34 #if __aarch64__
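 // The NEON fast path handles 8 pixels per iteration: vld3_u8 deinterleaves
 // packed RGB into three 8-lane registers, each lane is widened
 // u8 -> u16 -> u32 and converted to float, filling 8 floats of every
 // channel plane per loop. All from_* converters below follow this pattern.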
35  for (; nn>0; nn--)
36  {
37  uint8x8x3_t _rgb = vld3_u8(rgb);
38  uint16x8_t _r16 = vmovl_u8(_rgb.val[0]);
39  uint16x8_t _g16 = vmovl_u8(_rgb.val[1]);
40  uint16x8_t _b16 = vmovl_u8(_rgb.val[2]);
41 
42  float32x4_t _rlow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_r16)));
43  float32x4_t _rhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_r16)));
44  float32x4_t _glow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_g16)));
45  float32x4_t _ghigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_g16)));
46  float32x4_t _blow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_b16)));
47  float32x4_t _bhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_b16)));
48 
49  vst1q_f32(ptr0, _rlow);
50  vst1q_f32(ptr0+4, _rhigh);
51  vst1q_f32(ptr1, _glow);
52  vst1q_f32(ptr1+4, _ghigh);
53  vst1q_f32(ptr2, _blow);
54  vst1q_f32(ptr2+4, _bhigh);
55 
56  rgb += 3*8;
57  ptr0 += 8;
58  ptr1 += 8;
59  ptr2 += 8;
60  }
61 #else
62  if (nn > 0)
63  {
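 // The ARMv7 path mirrors the intrinsics above in inline asm: %0-%4 are
 // read-write registers (each "N"(x) input aliases its "=r"(x) output),
 // pld prefetches the next cache lines, and the :128 hints on vst1 assume
 // the Mat channel pointers are 16-byte aligned.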
64  asm volatile(
65  "0: \n"
66  "pld [%1, #256] \n"
67  "vld3.u8 {d0-d2}, [%1]! \n"
68  "vmovl.u8 q8, d0 \n"
69  "vmovl.u8 q9, d1 \n"
70  "vmovl.u8 q10, d2 \n"
71  "vmovl.u16 q0, d16 \n"
72  "vmovl.u16 q1, d17 \n"
73  "vmovl.u16 q2, d18 \n"
74  "vmovl.u16 q3, d19 \n"
75  "vmovl.u16 q8, d20 \n"
76  "vmovl.u16 q9, d21 \n"
77  "vcvt.f32.u32 q0, q0 \n"
78  "vcvt.f32.u32 q1, q1 \n"
79  "vcvt.f32.u32 q2, q2 \n"
80  "vcvt.f32.u32 q3, q3 \n"
81  "vcvt.f32.u32 q8, q8 \n"
82  "subs %0, #1 \n"
83  "vst1.f32 {d0-d3}, [%2 :128]! \n"
84  "vcvt.f32.u32 q9, q9 \n"
85  "vst1.f32 {d4-d7}, [%3 :128]! \n"
86  "vst1.f32 {d16-d19}, [%4 :128]!\n"
87  "bne 0b \n"
88  : "=r"(nn), // %0
89  "=r"(rgb), // %1
90  "=r"(ptr0), // %2
91  "=r"(ptr1), // %3
92  "=r"(ptr2) // %4
93  : "0"(nn),
94  "1"(rgb),
95  "2"(ptr0),
96  "3"(ptr1),
97  "4"(ptr2)
98  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
99  );
100  }
101 #endif // __aarch64__
102 #endif // __ARM_NEON
103  for (; remain>0; remain--)
104  {
105  *ptr0 = rgb[0];
106  *ptr1 = rgb[1];
107  *ptr2 = rgb[2];
108 
109  rgb += 3;
110  ptr0++;
111  ptr1++;
112  ptr2++;
113  }
114 
115  return m;
116 }
117 
118 static void to_rgb(const Mat& m, unsigned char* rgb)
119 {
120  const float* ptr0 = m.channel(0);
121  const float* ptr1 = m.channel(1);
122  const float* ptr2 = m.channel(2);
123 
124  int size = m.w * m.h;
125 
126 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255)
127 //#define SATURATE_CAST_UCHAR(X) (unsigned char)(int)(X);
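 // The clamp matters: after float-domain processing a channel value can fall
 // outside [0,255], and the plain-cast variant above would wrap rather than
 // saturate.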
128 
129  int remain = size;
130 
131  for (; remain>0; remain--)
132  {
133  rgb[0] = SATURATE_CAST_UCHAR(*ptr0);
134  rgb[1] = SATURATE_CAST_UCHAR(*ptr1);
135  rgb[2] = SATURATE_CAST_UCHAR(*ptr2);
136 
137  rgb += 3;
138  ptr0++;
139  ptr1++;
140  ptr2++;
141  }
142 
143 #undef SATURATE_CAST_UCHAR
144 }
145 
146 static Mat from_gray(const unsigned char* gray, int w, int h)
147 {
148  Mat m(w, h, 1);
149  if (m.empty())
150  return m;
151 
152  float* ptr = m;
153 
154  int size = w * h;
155 
156 #if __ARM_NEON
157  int nn = size >> 4;
158  int remain = size - (nn << 4);
159 #else
160  int remain = size;
161 #endif // __ARM_NEON
162 
163 #if __ARM_NEON
164 #if __aarch64__
165  for (; nn>0; nn--)
166  {
167  uint8x16_t _gray = vld1q_u8(gray);
168  uint16x8_t _gray16_0 = vmovl_u8(vget_low_u8(_gray));
169  uint16x8_t _gray16_1 = vmovl_u8(vget_high_u8(_gray));
170 
171  float32x4_t _graylow_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_0)));
172  float32x4_t _grayhigh_0 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_0)));
173  float32x4_t _graylow_1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_1)));
174  float32x4_t _grayhigh_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_1)));
175 
176  vst1q_f32(ptr, _graylow_0);
177  vst1q_f32(ptr+4, _grayhigh_0);
178  vst1q_f32(ptr+8, _graylow_1);
179  vst1q_f32(ptr+12, _grayhigh_1);
180 
181  gray += 16;
182  ptr += 16;
183  }
184 #else
185  if (nn > 0)
186  {
187  asm volatile(
188  "0: \n"
189  "pld [%1, #128] \n"
190  "vld1.u8 {d0,d1}, [%1]! \n"
191  "vmovl.u8 q8, d0 \n"
192  "vmovl.u8 q9, d1 \n"
193  "vmovl.u16 q0, d16 \n"
194  "vmovl.u16 q1, d17 \n"
195  "vmovl.u16 q2, d18 \n"
196  "vmovl.u16 q3, d19 \n"
197  "vcvt.f32.u32 q0, q0 \n"
198  "vcvt.f32.u32 q1, q1 \n"
199  "vcvt.f32.u32 q2, q2 \n"
200  "vcvt.f32.u32 q3, q3 \n"
201  "subs %0, #1 \n"
202  "vst1.f32 {d0-d3}, [%2 :128]! \n"
203  "vst1.f32 {d4-d7}, [%2 :128]! \n"
204  "bne 0b \n"
205  : "=r"(nn), // %0
206  "=r"(gray), // %1
207  "=r"(ptr) // %2
208  : "0"(nn),
209  "1"(gray),
210  "2"(ptr)
211  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
212  );
213  }
214 #endif // __aarch64__
215 #endif // __ARM_NEON
216  for (; remain>0; remain--)
217  {
218  *ptr = *gray;
219 
220  gray++;
221  ptr++;
222  }
223 
224  return m;
225 }
226 
227 static void to_gray(const Mat& m, unsigned char* gray)
228 {
229  const float* ptr = m;
230 
231  int size = m.w * m.h;
232 
233 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255)
234 
235  int remain = size;
236 
237  for (; remain>0; remain--)
238  {
239  *gray = SATURATE_CAST_UCHAR(*ptr);
240 
241  gray++;
242  ptr++;
243  }
244 
245 #undef SATURATE_CAST_UCHAR
246 }
247 
248 static Mat from_rgba(const unsigned char* rgba, int w, int h)
249 {
250  Mat m(w, h, 4);
251  if (m.empty())
252  return m;
253 
254  float* ptr0 = m.channel(0);
255  float* ptr1 = m.channel(1);
256  float* ptr2 = m.channel(2);
257  float* ptr3 = m.channel(3);
258 
259  int size = w * h;
260 
261 #if __ARM_NEON
262  int nn = size >> 3;
263  int remain = size - (nn << 3);
264 #else
265  int remain = size;
266 #endif // __ARM_NEON
267 
268 #if __ARM_NEON
269 #if __aarch64__
270  for (; nn>0; nn--)
271  {
272  uint8x8x4_t _rgba = vld4_u8(rgba);
273  int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
274  int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
275  int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
276  int16x8_t _a16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[3]));
277 
278  float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
279  float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
280  float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
281  float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
282  float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
283  float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
284  float32x4_t _alow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_a16)));
285  float32x4_t _ahigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_a16)));
286 
287  vst1q_f32(ptr0, _rlow);
288  vst1q_f32(ptr0+4, _rhigh);
289  vst1q_f32(ptr1, _glow);
290  vst1q_f32(ptr1+4, _ghigh);
291  vst1q_f32(ptr2, _blow);
292  vst1q_f32(ptr2+4, _bhigh);
293  vst1q_f32(ptr3, _alow);
294  vst1q_f32(ptr3+4, _ahigh);
295 
296  rgba += 4*8;
297  ptr0 += 8;
298  ptr1 += 8;
299  ptr2 += 8;
300  ptr3 += 8;
301  }
302 #else
303  if (nn > 0)
304  {
305  asm volatile(
306  "0: \n"
307  "pld [%1, #256] \n"
308  "vld4.u8 {d0-d3}, [%1]! \n"
309  "vmovl.u8 q8, d0 \n"
310  "vmovl.u8 q9, d1 \n"
311  "vmovl.u8 q10, d2 \n"
312  "vmovl.u8 q11, d3 \n"
313  "vmovl.u16 q0, d16 \n"
314  "vmovl.u16 q1, d17 \n"
315  "vmovl.u16 q2, d18 \n"
316  "vmovl.u16 q3, d19 \n"
317  "vmovl.u16 q8, d20 \n"
318  "vmovl.u16 q9, d21 \n"
319  "vmovl.u16 q10, d22 \n"
320  "vmovl.u16 q11, d23 \n"
321  "vcvt.f32.u32 q0, q0 \n"
322  "vcvt.f32.u32 q1, q1 \n"
323  "vcvt.f32.u32 q2, q2 \n"
324  "vcvt.f32.u32 q3, q3 \n"
325  "vcvt.f32.u32 q8, q8 \n"
326  "vcvt.f32.u32 q9, q9 \n"
327  "subs %0, #1 \n"
328  "vst1.f32 {d0-d3}, [%2 :128]! \n"
329  "vcvt.f32.u32 q10, q10 \n"
330  "vcvt.f32.u32 q11, q11 \n"
331  "vst1.f32 {d4-d7}, [%3 :128]! \n"
332  "vst1.f32 {d16-d19}, [%4 :128]!\n"
333  "vst1.f32 {d20-d23}, [%5 :128]!\n"
334  "bne 0b \n"
335  : "=r"(nn), // %0
336  "=r"(rgba), // %1
337  "=r"(ptr0), // %2
338  "=r"(ptr1), // %3
339  "=r"(ptr2), // %4
340  "=r"(ptr3) // %5
341  : "0"(nn),
342  "1"(rgba),
343  "2"(ptr0),
344  "3"(ptr1),
345  "4"(ptr2),
346  "5"(ptr3)
347  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
348  );
349  }
350 #endif // __aarch64__
351 #endif // __ARM_NEON
352  for (; remain>0; remain--)
353  {
354  *ptr0 = rgba[0];
355  *ptr1 = rgba[1];
356  *ptr2 = rgba[2];
357  *ptr3 = rgba[3];
358 
359  rgba += 4;
360  ptr0++;
361  ptr1++;
362  ptr2++;
363  ptr3++;
364  }
365 
366  return m;
367 }
368 
369 static void to_rgba(const Mat& m, unsigned char* rgba)
370 {
371  const float* ptr0 = m.channel(0);
372  const float* ptr1 = m.channel(1);
373  const float* ptr2 = m.channel(2);
374  const float* ptr3 = m.channel(3);
375 
376  int size = m.w * m.h;
377 
378 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255)
379 
380  int remain = size;
381 
382  for (; remain>0; remain--)
383  {
384  rgba[0] = SATURATE_CAST_UCHAR(*ptr0);
385  rgba[1] = SATURATE_CAST_UCHAR(*ptr1);
386  rgba[2] = SATURATE_CAST_UCHAR(*ptr2);
387  rgba[3] = SATURATE_CAST_UCHAR(*ptr3);
388 
389  rgba += 4;
390  ptr0++;
391  ptr1++;
392  ptr2++;
393  ptr3++;
394  }
395 
396 #undef SATURATE_CAST_UCHAR
397 }
398 
399 static Mat from_rgb2bgr(const unsigned char* rgb, int w, int h)
400 {
401  Mat m(w, h, 3);
402  if (m.empty())
403  return m;
404 
405  float* ptr0 = m.channel(0);
406  float* ptr1 = m.channel(1);
407  float* ptr2 = m.channel(2);
408 
409  int size = w * h;
410 
411 #if __ARM_NEON
412  int nn = size >> 3;
413  int remain = size - (nn << 3);
414 #else
415  int remain = size;
416 #endif // __ARM_NEON
417 
418 #if __ARM_NEON
419 #if __aarch64__
420  for (; nn>0; nn--)
421  {
422  uint8x8x3_t _rgb = vld3_u8(rgb);
423  uint16x8_t _r16 = vmovl_u8(_rgb.val[0]);
424  uint16x8_t _g16 = vmovl_u8(_rgb.val[1]);
425  uint16x8_t _b16 = vmovl_u8(_rgb.val[2]);
426 
427  float32x4_t _rlow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_r16)));
428  float32x4_t _rhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_r16)));
429  float32x4_t _glow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_g16)));
430  float32x4_t _ghigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_g16)));
431  float32x4_t _blow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_b16)));
432  float32x4_t _bhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_b16)));
433 
434  vst1q_f32(ptr2, _rlow);
435  vst1q_f32(ptr2+4, _rhigh);
436  vst1q_f32(ptr1, _glow);
437  vst1q_f32(ptr1+4, _ghigh);
438  vst1q_f32(ptr0, _blow);
439  vst1q_f32(ptr0+4, _bhigh);
440 
441  rgb += 3*8;
442  ptr0 += 8;
443  ptr1 += 8;
444  ptr2 += 8;
445  }
446 #else
447  if (nn > 0)
448  {
449  asm volatile(
450  "0: \n"
451  "pld [%1, #256] \n"
452  "vld3.u8 {d0-d2}, [%1]! \n"
453  "vmovl.u8 q8, d0 \n"
454  "vmovl.u8 q9, d1 \n"
455  "vmovl.u8 q10, d2 \n"
456  "vmovl.u16 q0, d16 \n"
457  "vmovl.u16 q1, d17 \n"
458  "vmovl.u16 q2, d18 \n"
459  "vmovl.u16 q3, d19 \n"
460  "vmovl.u16 q8, d20 \n"
461  "vmovl.u16 q9, d21 \n"
462  "vcvt.f32.u32 q0, q0 \n"
463  "vcvt.f32.u32 q1, q1 \n"
464  "vcvt.f32.u32 q2, q2 \n"
465  "vcvt.f32.u32 q3, q3 \n"
466  "vcvt.f32.u32 q8, q8 \n"
467  "subs %0, #1 \n"
468  "vst1.f32 {d0-d3}, [%4 :128]! \n"
469  "vcvt.f32.u32 q9, q9 \n"
470  "vst1.f32 {d4-d7}, [%3 :128]! \n"
471  "vst1.f32 {d16-d19}, [%2 :128]!\n"
472  "bne 0b \n"
473  : "=r"(nn), // %0
474  "=r"(rgb), // %1
475  "=r"(ptr0), // %2
476  "=r"(ptr1), // %3
477  "=r"(ptr2) // %4
478  : "0"(nn),
479  "1"(rgb),
480  "2"(ptr0),
481  "3"(ptr1),
482  "4"(ptr2)
483  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
484  );
485  }
486 #endif // __aarch64__
487 #endif // __ARM_NEON
488  for (; remain>0; remain--)
489  {
490  *ptr0 = rgb[2];
491  *ptr1 = rgb[1];
492  *ptr2 = rgb[0];
493 
494  rgb += 3;
495  ptr0++;
496  ptr1++;
497  ptr2++;
498  }
499 
500  return m;
501 }
502 
503 static void to_bgr2rgb(const Mat& m, unsigned char* rgb)
504 {
505  const float* ptr0 = m.channel(0);
506  const float* ptr1 = m.channel(1);
507  const float* ptr2 = m.channel(2);
508 
509  int size = m.w * m.h;
510 
511 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255)
512 
513  int remain = size;
514 
515  for (; remain>0; remain--)
516  {
517  rgb[2] = SATURATE_CAST_UCHAR(*ptr0);
518  rgb[1] = SATURATE_CAST_UCHAR(*ptr1);
519  rgb[0] = SATURATE_CAST_UCHAR(*ptr2);
520 
521  rgb += 3;
522  ptr0++;
523  ptr1++;
524  ptr2++;
525  }
526 
527 #undef SATURATE_CAST_UCHAR
528 }
529 
530 static Mat from_rgb2gray(const unsigned char* rgb, int w, int h)
531 {
532  // coeffs for r g b = 0.299f, 0.587f, 0.114f
533  const unsigned char Y_shift = 8;//14
534  const unsigned char R2Y = 77;
535  const unsigned char G2Y = 150;
536  const unsigned char B2Y = 29;
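 // 77/256, 150/256 and 29/256 approximate 0.299/0.587/0.114, and
 // 77 + 150 + 29 = 256, so the largest possible sum is 255*256 = 65280.
 // That fits in 16 bits, which is what lets the NEON path accumulate with
 // vmull.u8/vmlal.u8 before shifting right by Y_shift.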
537 
538  Mat m(w, h, 1);
539  if (m.empty())
540  return m;
541 
542  float* ptr = m;
543 
544  int size = w * h;
545 
546 #if __ARM_NEON
547  int nn = size >> 3;
548  int remain = size - (nn << 3);
549 #else
550  int remain = size;
551 #endif // __ARM_NEON
552 
553 #if __ARM_NEON
554 #if __aarch64__
555  uint8x8_t _R2Y = vdup_n_u8(R2Y);
556  uint8x8_t _G2Y = vdup_n_u8(G2Y);
557  uint8x8_t _B2Y = vdup_n_u8(B2Y);
558  for (; nn>0; nn--)
559  {
560  uint8x8x3_t _rgb = vld3_u8(rgb);
561 
562  uint16x8_t _y16 = vmull_u8(_rgb.val[0], _R2Y);
563  _y16 = vmlal_u8(_y16, _rgb.val[1], _G2Y);
564  _y16 = vmlal_u8(_y16, _rgb.val[2], _B2Y);
565  _y16 = vshrq_n_u16(_y16, Y_shift);
566 
567  float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
568  float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
569 
570  vst1q_f32(ptr, _ylow);
571  vst1q_f32(ptr+4, _yhigh);
572 
573  rgb += 3*8;
574  ptr += 8;
575  }
576 #else
577  if (nn > 0)
578  {
579  asm volatile(
580  "vdup.u8 d16, %6 \n"
581  "vdup.u8 d17, %7 \n"
582  "vdup.u8 d18, %8 \n"
583  "0: \n"
584  "pld [%1, #256] \n"
585  "vld3.u8 {d0-d2}, [%1]! \n"
586  "vmull.u8 q2, d0, d16 \n"
587  "vmlal.u8 q2, d1, d17 \n"
588  "vmlal.u8 q2, d2, d18 \n"
589  "vshr.u16 q2, q2, #8 \n" // Y_shift
590  "vmovl.u16 q0, d4 \n"
591  "vmovl.u16 q1, d5 \n"
592  "vcvt.f32.u32 q0, q0 \n"
593  "vcvt.f32.u32 q1, q1 \n"
594  "subs %0, #1 \n"
595  "vst1.f32 {d0-d3}, [%2 :128]! \n"
596  "bne 0b \n"
597  : "=r"(nn), // %0
598  "=r"(rgb), // %1
599  "=r"(ptr) // %2
600  : "0"(nn),
601  "1"(rgb),
602  "2"(ptr),
603  "r"(R2Y), // %6
604  "r"(G2Y), // %7
605  "r"(B2Y) // %8
606  : "cc", "memory", "q0", "q1", "q2", "q8", "q9"
607  );
608  }
609 #endif // __aarch64__
610 #endif // __ARM_NEON
611  for (; remain>0; remain--)
612  {
613  *ptr = (rgb[0] * R2Y + rgb[1] * G2Y + rgb[2] * B2Y) >> Y_shift;
614 
615  rgb += 3;
616  ptr++;
617  }
618 
619  return m;
620 }
621 
622 static Mat from_bgr2gray(const unsigned char* bgr, int w, int h)
623 {
624  // coeffs for r g b = 0.299f, 0.587f, 0.114f
625  const unsigned char Y_shift = 8;//14
626  const unsigned char R2Y = 77;
627  const unsigned char G2Y = 150;
628  const unsigned char B2Y = 29;
629 
630  Mat m(w, h, 1);
631  if (m.empty())
632  return m;
633 
634  float* ptr = m;
635 
636  int size = w * h;
637 
638 #if __ARM_NEON
639  int nn = size >> 3;
640  int remain = size - (nn << 3);
641 #else
642  int remain = size;
643 #endif // __ARM_NEON
644 
645 #if __ARM_NEON
646 #if __aarch64__
647  uint8x8_t _R2Y = vdup_n_u8(R2Y);
648  uint8x8_t _G2Y = vdup_n_u8(G2Y);
649  uint8x8_t _B2Y = vdup_n_u8(B2Y);
650  for (; nn>0; nn--)
651  {
652  uint8x8x3_t _rgb = vld3_u8(bgr);
653 
654  uint16x8_t _y16 = vmull_u8(_rgb.val[2], _R2Y);
655  _y16 = vmlal_u8(_y16, _rgb.val[1], _G2Y);
656  _y16 = vmlal_u8(_y16, _rgb.val[0], _B2Y);
657  _y16 = vshrq_n_u16(_y16, Y_shift);
658 
659  float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
660  float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
661 
662  vst1q_f32(ptr, _ylow);
663  vst1q_f32(ptr+4, _yhigh);
664 
665  bgr += 3*8;
666  ptr += 8;
667  }
668 #else
669  if (nn > 0)
670  {
671  asm volatile(
672  "vdup.u8 d16, %6 \n"
673  "vdup.u8 d17, %7 \n"
674  "vdup.u8 d18, %8 \n"
675  "0: \n"
676  "pld [%1, #256] \n"
677  "vld3.u8 {d0-d2}, [%1]! \n"
678  "vmull.u8 q2, d2, d16 \n"
679  "vmlal.u8 q2, d1, d17 \n"
680  "vmlal.u8 q2, d0, d18 \n"
681  "vshr.u16 q2, q2, #8 \n" // Y_shift
682  "vmovl.u16 q0, d4 \n"
683  "vmovl.u16 q1, d5 \n"
684  "vcvt.f32.u32 q0, q0 \n"
685  "vcvt.f32.u32 q1, q1 \n"
686  "subs %0, #1 \n"
687  "vst1.f32 {d0-d3}, [%2 :128]! \n"
688  "bne 0b \n"
689  : "=r"(nn), // %0
690  "=r"(bgr), // %1
691  "=r"(ptr) // %2
692  : "0"(nn),
693  "1"(bgr),
694  "2"(ptr),
695  "r"(R2Y), // %6
696  "r"(G2Y), // %7
697  "r"(B2Y) // %8
698  : "cc", "memory", "q0", "q1", "q2", "q8", "q9"
699  );
700  }
701 #endif // __aarch64__
702 #endif // __ARM_NEON
703  for (; remain>0; remain--)
704  {
705  *ptr = (bgr[2] * R2Y + bgr[1] * G2Y + bgr[0] * B2Y) >> Y_shift;
706 
707  bgr += 3;
708  ptr++;
709  }
710 
711  return m;
712 }
713 
714 static Mat from_gray2rgb(const unsigned char* gray, int w, int h)
715 {
716  Mat m(w, h, 3);
717  if (m.empty())
718  return m;
719 
720  float* ptr0 = m.channel(0);
721  float* ptr1 = m.channel(1);
722  float* ptr2 = m.channel(2);
723 
724  int size = w * h;
725 
726 #if __ARM_NEON
727  int nn = size >> 4;
728  int remain = size - (nn << 4);
729 #else
730  int remain = size;
731 #endif // __ARM_NEON
732 
733 #if __ARM_NEON
734 #if __aarch64__
735  for (; nn>0; nn--)
736  {
737  uint8x16_t _gray = vld1q_u8(gray);
738  uint16x8_t _gray16_0 = vmovl_u8(vget_low_u8(_gray));
739  uint16x8_t _gray16_1 = vmovl_u8(vget_high_u8(_gray));
740 
741  float32x4_t _graylow_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_0)));
742  float32x4_t _grayhigh_0 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_0)));
743  float32x4_t _graylow_1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_1)));
744  float32x4_t _grayhigh_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_1)));
745 
746  vst1q_f32(ptr0, _graylow_0);
747  vst1q_f32(ptr0+4, _grayhigh_0);
748  vst1q_f32(ptr0+8, _graylow_1);
749  vst1q_f32(ptr0+12, _grayhigh_1);
750 
751  vst1q_f32(ptr1, _graylow_0);
752  vst1q_f32(ptr1+4, _grayhigh_0);
753  vst1q_f32(ptr1+8, _graylow_1);
754  vst1q_f32(ptr1+12, _grayhigh_1);
755 
756  vst1q_f32(ptr2, _graylow_0);
757  vst1q_f32(ptr2+4, _grayhigh_0);
758  vst1q_f32(ptr2+8, _graylow_1);
759  vst1q_f32(ptr2+12, _grayhigh_1);
760 
761  gray += 16;
762  ptr0 += 16;
763  ptr1 += 16;
764  ptr2 += 16;
765  }
766 #else
767  if (nn > 0)
768  {
769  asm volatile(
770  "0: \n"
771  "pld [%1, #128] \n"
772  "vld1.u8 {d0,d1}, [%1]! \n"
773  "vmovl.u8 q8, d0 \n"
774  "vmovl.u8 q9, d1 \n"
775  "vmovl.u16 q0, d16 \n"
776  "vmovl.u16 q1, d17 \n"
777  "vmovl.u16 q2, d18 \n"
778  "vmovl.u16 q3, d19 \n"
779  "vcvt.f32.u32 q0, q0 \n"
780  "vcvt.f32.u32 q1, q1 \n"
781  "vcvt.f32.u32 q2, q2 \n"
782  "vcvt.f32.u32 q3, q3 \n"
783  "subs %0, #1 \n"
784  "vst1.f32 {d0-d3}, [%2 :128]! \n"
785  "vst1.f32 {d4-d7}, [%2 :128]! \n"
786  "vst1.f32 {d0-d3}, [%3 :128]! \n"
787  "vst1.f32 {d4-d7}, [%3 :128]! \n"
788  "vst1.f32 {d0-d3}, [%4 :128]! \n"
789  "vst1.f32 {d4-d7}, [%4 :128]! \n"
790  "bne 0b \n"
791  : "=r"(nn), // %0
792  "=r"(gray), // %1
793  "=r"(ptr0), // %2
794  "=r"(ptr1), // %3
795  "=r"(ptr2) // %4
796  : "0"(nn),
797  "1"(gray),
798  "2"(ptr0),
799  "3"(ptr1),
800  "4"(ptr2)
801  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
802  );
803  }
804 #endif // __aarch64__
805 #endif // __ARM_NEON
806  for (; remain>0; remain--)
807  {
808  *ptr0 = *gray;
809  *ptr1 = *gray;
810  *ptr2 = *gray;
811 
812  gray++;
813  ptr0++;
814  ptr1++;
815  ptr2++;
816  }
817 
818  return m;
819 }
820 
821 static Mat from_rgba2rgb(const unsigned char* rgba, int w, int h)
822 {
823  Mat m(w, h, 3);
824  if (m.empty())
825  return m;
826 
827  float* ptr0 = m.channel(0);
828  float* ptr1 = m.channel(1);
829  float* ptr2 = m.channel(2);
830 
831  int size = w * h;
832 
833 #if __ARM_NEON
834  int nn = size >> 3;
835  int remain = size - (nn << 3);
836 #else
837  int remain = size;
838 #endif // __ARM_NEON
839 
840 #if __ARM_NEON
841 #if __aarch64__
842  for (; nn>0; nn--)
843  {
844  uint8x8x4_t _rgba = vld4_u8(rgba);
845  int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
846  int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
847  int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
848 
849  float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
850  float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
851  float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
852  float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
853  float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
854  float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
855 
856  vst1q_f32(ptr0, _rlow);
857  vst1q_f32(ptr0+4, _rhigh);
858  vst1q_f32(ptr1, _glow);
859  vst1q_f32(ptr1+4, _ghigh);
860  vst1q_f32(ptr2, _blow);
861  vst1q_f32(ptr2+4, _bhigh);
862 
863  rgba += 4*8;
864  ptr0 += 8;
865  ptr1 += 8;
866  ptr2 += 8;
867  }
868 #else
869  if (nn > 0)
870  {
871  asm volatile(
872  "0: \n"
873  "pld [%1, #256] \n"
874  "vld4.u8 {d0-d3}, [%1]! \n"
875  "vmovl.u8 q8, d0 \n"
876  "vmovl.u8 q9, d1 \n"
877  "vmovl.u8 q10, d2 \n"
878  "vmovl.u16 q0, d16 \n"
879  "vmovl.u16 q1, d17 \n"
880  "vmovl.u16 q2, d18 \n"
881  "vmovl.u16 q3, d19 \n"
882  "vmovl.u16 q8, d20 \n"
883  "vmovl.u16 q9, d21 \n"
884  "vcvt.f32.u32 q0, q0 \n"
885  "vcvt.f32.u32 q1, q1 \n"
886  "vcvt.f32.u32 q2, q2 \n"
887  "vcvt.f32.u32 q3, q3 \n"
888  "vcvt.f32.u32 q8, q8 \n"
889  "subs %0, #1 \n"
890  "vst1.f32 {d0-d3}, [%2 :128]! \n"
891  "vcvt.f32.u32 q9, q9 \n"
892  "vst1.f32 {d4-d7}, [%3 :128]! \n"
893  "vst1.f32 {d16-d19}, [%4 :128]!\n"
894  "bne 0b \n"
895  : "=r"(nn), // %0
896  "=r"(rgba), // %1
897  "=r"(ptr0), // %2
898  "=r"(ptr1), // %3
899  "=r"(ptr2) // %4
900  : "0"(nn),
901  "1"(rgba),
902  "2"(ptr0),
903  "3"(ptr1),
904  "4"(ptr2)
905  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
906  );
907  }
908 #endif // __aarch64__
909 #endif // __ARM_NEON
910  for (; remain>0; remain--)
911  {
912  *ptr0 = rgba[0];
913  *ptr1 = rgba[1];
914  *ptr2 = rgba[2];
915 
916  rgba += 4;
917  ptr0++;
918  ptr1++;
919  ptr2++;
920  }
921 
922  return m;
923 }
924 
925 static Mat from_rgba2bgr(const unsigned char* rgba, int w, int h)
926 {
927  Mat m(w, h, 3);
928  if (m.empty())
929  return m;
930 
931  float* ptr0 = m.channel(0);
932  float* ptr1 = m.channel(1);
933  float* ptr2 = m.channel(2);
934 
935  int size = w * h;
936 
937 #if __ARM_NEON
938  int nn = size >> 3;
939  int remain = size - (nn << 3);
940 #else
941  int remain = size;
942 #endif // __ARM_NEON
943 
944 #if __ARM_NEON
945 #if __aarch64__
946  for (; nn>0; nn--)
947  {
948  uint8x8x4_t _rgba = vld4_u8(rgba);
949  int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
950  int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
951  int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
952 
953  float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
954  float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
955  float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
956  float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
957  float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
958  float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
959 
960  vst1q_f32(ptr2, _rlow);
961  vst1q_f32(ptr2+4, _rhigh);
962  vst1q_f32(ptr1, _glow);
963  vst1q_f32(ptr1+4, _ghigh);
964  vst1q_f32(ptr0, _blow);
965  vst1q_f32(ptr0+4, _bhigh);
966 
967  rgba += 4*8;
968  ptr0 += 8;
969  ptr1 += 8;
970  ptr2 += 8;
971  }
972 #else
973  if (nn > 0)
974  {
975  asm volatile(
976  "0: \n"
977  "pld [%1, #256] \n"
978  "vld4.u8 {d0-d3}, [%1]! \n"
979  "vmovl.u8 q8, d0 \n"
980  "vmovl.u8 q9, d1 \n"
981  "vmovl.u8 q10, d2 \n"
982  "vmovl.u16 q0, d16 \n"
983  "vmovl.u16 q1, d17 \n"
984  "vmovl.u16 q2, d18 \n"
985  "vmovl.u16 q3, d19 \n"
986  "vmovl.u16 q8, d20 \n"
987  "vmovl.u16 q9, d21 \n"
988  "vcvt.f32.u32 q0, q0 \n"
989  "vcvt.f32.u32 q1, q1 \n"
990  "vcvt.f32.u32 q2, q2 \n"
991  "vcvt.f32.u32 q3, q3 \n"
992  "vcvt.f32.u32 q8, q8 \n"
993  "subs %0, #1 \n"
994  "vst1.f32 {d0-d3}, [%4 :128]! \n"
995  "vcvt.f32.u32 q9, q9 \n"
996  "vst1.f32 {d4-d7}, [%3 :128]! \n"
997  "vst1.f32 {d16-d19}, [%2 :128]!\n"
998  "bne 0b \n"
999  : "=r"(nn), // %0
1000  "=r"(rgba), // %1
1001  "=r"(ptr0), // %2
1002  "=r"(ptr1), // %3
1003  "=r"(ptr2) // %4
1004  : "0"(nn),
1005  "1"(rgba),
1006  "2"(ptr0),
1007  "3"(ptr1),
1008  "4"(ptr2)
1009  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
1010  );
1011  }
1012 #endif // __aarch64__
1013 #endif // __ARM_NEON
1014  for (; remain>0; remain--)
1015  {
1016  *ptr0 = rgba[2];
1017  *ptr1 = rgba[1];
1018  *ptr2 = rgba[0];
1019 
1020  rgba += 4;
1021  ptr0++;
1022  ptr1++;
1023  ptr2++;
1024  }
1025 
1026  return m;
1027 }
1028 
1029 static Mat from_rgba2gray(const unsigned char* rgba, int w, int h)
1030 {
1031  // coeffs for r g b = 0.299f, 0.587f, 0.114f
1032  const unsigned char Y_shift = 8;//14
1033  const unsigned char R2Y = 77;
1034  const unsigned char G2Y = 150;
1035  const unsigned char B2Y = 29;
1036 
1037  Mat m(w, h, 1);
1038  if (m.empty())
1039  return m;
1040 
1041  float* ptr = m;
1042 
1043  int size = w * h;
1044 
1045 #if __ARM_NEON
1046  int nn = size >> 3;
1047  int remain = size - (nn << 3);
1048 #else
1049  int remain = size;
1050 #endif // __ARM_NEON
1051 
1052 #if __ARM_NEON
1053 #if __aarch64__
1054  uint8x8_t _R2Y = vdup_n_u8(R2Y);
1055  uint8x8_t _G2Y = vdup_n_u8(G2Y);
1056  uint8x8_t _B2Y = vdup_n_u8(B2Y);
1057  for (; nn>0; nn--)
1058  {
1059  uint8x8x4_t _rgba = vld4_u8(rgba);
1060 
1061  uint16x8_t _y16 = vmull_u8(_rgba.val[0], _R2Y);
1062  _y16 = vmlal_u8(_y16, _rgba.val[1], _G2Y);
1063  _y16 = vmlal_u8(_y16, _rgba.val[2], _B2Y);
1064  _y16 = vshrq_n_u16(_y16, Y_shift);
1065 
1066  float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
1067  float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));
1068 
1069  vst1q_f32(ptr, _ylow);
1070  vst1q_f32(ptr+4, _yhigh);
1071 
1072  rgba += 4*8;
1073  ptr += 8;
1074  }
1075 #else
1076  if (nn > 0)
1077  {
1078  asm volatile(
1079  "vdup.u8 d16, %6 \n"
1080  "vdup.u8 d17, %7 \n"
1081  "vdup.u8 d18, %8 \n"
1082  "0: \n"
1083  "pld [%1, #256] \n"
1084  "vld4.u8 {d0-d3}, [%1]! \n"
1085  "vmull.u8 q2, d0, d16 \n"
1086  "vmlal.u8 q2, d1, d17 \n"
1087  "vmlal.u8 q2, d2, d18 \n"
1088  "vshr.u16 q2, q2, #8 \n" // Y_shift
1089  "vmovl.u16 q0, d4 \n"
1090  "vmovl.u16 q1, d5 \n"
1091  "vcvt.f32.u32 q0, q0 \n"
1092  "vcvt.f32.u32 q1, q1 \n"
1093  "subs %0, #1 \n"
1094  "vst1.f32 {d0-d3}, [%2 :128]! \n"
1095  "bne 0b \n"
1096  : "=r"(nn), // %0
1097  "=r"(rgba), // %1
1098  "=r"(ptr) // %2
1099  : "0"(nn),
1100  "1"(rgba),
1101  "2"(ptr),
1102  "r"(R2Y), // %6
1103  "r"(G2Y), // %7
1104  "r"(B2Y) // %8
1105  : "cc", "memory", "q0", "q1", "q2", "q8", "q9"
1106  );
1107  }
1108 #endif // __aarch64__
1109 #endif // __ARM_NEON
1110  for (; remain>0; remain--)
1111  {
1112  *ptr = (rgba[0] * R2Y + rgba[1] * G2Y + rgba[2] * B2Y) >> Y_shift;
1113 
1114  rgba += 4;
1115  ptr++;
1116  }
1117 
1118  return m;
1119 }
1120 
1121 
1122 
1123 //void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
1124 //void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
1125 //void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
1126 
1127 
1128 static void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
1129 {
1130  const int INTER_RESIZE_COEF_BITS=11;
1131  const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
1132 // const int ONE=INTER_RESIZE_COEF_SCALE;
1133 
1134  double scale_x = (double)srcw / w;
1135  double scale_y = (double)srch / h;
1136 
1137  int* buf = new int[w + h + w + h];
1138 
1139  int* xofs = buf;//new int[w];
1140  int* yofs = buf + w;//new int[h];
1141 
1142  short* ialpha = (short*)(buf + w + h);//new short[w * 2];
1143  short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
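 // One allocation, four views. Layout in ints:
 // [xofs: w][yofs: h][ialpha: w*2 shorts = w ints][ibeta: h*2 shorts = h ints]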
1144 
1145  float fx;
1146  float fy;
1147  int sx;
1148  int sy;
1149 
1150 #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX)
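 // Bit budget of the fixed-point filtering: the 11-bit weights (<= 2048)
 // times 8-bit pixels stay under 2^19, and the horizontal pass shifts >> 4
 // so each row entry fits a 15-bit short. The vertical pass multiplies by
 // another 11-bit weight inside int32, shifts each term >> 16, and the
 // final (+2) >> 2 rounds away the remaining scale (11 + 11 - 4 - 16 = 2).
 // rowsbuf0/rowsbuf1 further down are float Mats reused as raw storage for
 // w*3 shorts, hence the ((w*3 >> 1) + 3) sizing.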
1151 
1152  for (int dx = 0; dx < w; dx++)
1153  {
1154  fx = (float)((dx + 0.5) * scale_x - 0.5);
1155  sx = fx;//cvFloor(fx);
1156  fx -= sx; if (sx < 0) { sx = 0; fx = 0.f; } // fx can be slightly negative at the left edge; clamp to the first column
1157 
1158  if (sx >= srcw - 1)
1159  {
1160  sx = srcw - 2;
1161  fx = 1.f;
1162  }
1163 
1164  xofs[dx] = sx*3;
1165 
1166  float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
1167  float a1 = fx * INTER_RESIZE_COEF_SCALE;
1168 
1169  ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
1170  ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
1171  }
1172 
1173  for (int dy = 0; dy < h; dy++)
1174  {
1175  fy = (float)((dy + 0.5) * scale_y - 0.5);
1176  sy = fy;//cvFloor(fy);
1177  fy -= sy; if (sy < 0) { sy = 0; fy = 0.f; } // fy can be slightly negative at the top edge; clamp to the first row
1178 
1179  if (sy >= srch - 1)
1180  {
1181  sy = srch - 2;
1182  fy = 1.f;
1183  }
1184 
1185  yofs[dy] = sy*3;
1186 
1187  float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
1188  float b1 = fy * INTER_RESIZE_COEF_SCALE;
1189 
1190  ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
1191  ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
1192  }
1193 
1194 #undef SATURATE_CAST_SHORT
1195 
1196  // loop body
1197  Mat rowsbuf0((w*3 >> 1) + 3);
1198  Mat rowsbuf1((w*3 >> 1) + 3);
1199  short* rows0 = (short*)rowsbuf0.data;
1200  short* rows1 = (short*)rowsbuf1.data;
1201 
1202  int prev_sy1 = -1;
1203 
1204  for (int dy = 0; dy < h; dy++ )
1205  {
1206  int sy = yofs[dy];
1207 
1208  if (sy == prev_sy1)
1209  {
1210  // hresize one row
1211  short* rows0_old = rows0;
1212  rows0 = rows1;
1213  rows1 = rows0_old;
1214  const unsigned char *S1 = src + srcw * (sy+3);
1215 
1216  const short* ialphap = ialpha;
1217  short* rows1p = rows1;
1218  for ( int dx = 0; dx < w; dx++ )
1219  {
1220  int sx = xofs[dx];
1221  short a0 = ialphap[0];
1222  short a1 = ialphap[1];
1223 
1224  const unsigned char* S1p = S1 + sx;
1225 #if __ARM_NEON
1226  int16x4_t _a0 = vdup_n_s16(a0);
1227  int16x4_t _a1 = vdup_n_s16(a1);
1228  uint8x8_t _S1 = vld1_u8(S1p);
1229  int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
1230  int16x4_t _S1low = vget_low_s16(_S116);
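 // _S116 holds bytes S1p[0..7]; vext picks lanes 3..6, i.e. the next
 // pixel's R,G,B, so the mull/mlal pair blends each channel of the two
 // adjacent pixels (lane 3 of the result is scratch and is overwritten on
 // the next iteration). vld1_u8 reads 8 bytes though only 6 are used, so
 // the final source row needs 2 bytes of slack.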
1231  int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
1232  int32x4_t _rows1 = vmull_s16(_S1low, _a0);
1233  _rows1 = vmlal_s16(_rows1, _S1high, _a1);
1234  int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
1235  vst1_s16(rows1p, _rows1_sr4);
1236 #else
1237  rows1p[0] = (S1p[0]*a0 + S1p[3]*a1) >> 4;
1238  rows1p[1] = (S1p[1]*a0 + S1p[4]*a1) >> 4;
1239  rows1p[2] = (S1p[2]*a0 + S1p[5]*a1) >> 4;
1240 #endif // __ARM_NEON
1241 
1242  ialphap += 2;
1243  rows1p += 3;
1244  }
1245  }
1246  else
1247  {
1248  // hresize two rows
1249  const unsigned char *S0 = src + srcw * (sy);
1250  const unsigned char *S1 = src + srcw * (sy+3);
1251 
1252  const short* ialphap = ialpha;
1253  short* rows0p = rows0;
1254  short* rows1p = rows1;
1255  for ( int dx = 0; dx < w; dx++ )
1256  {
1257  int sx = xofs[dx];
1258  short a0 = ialphap[0];
1259  short a1 = ialphap[1];
1260 
1261  const unsigned char* S0p = S0 + sx;
1262  const unsigned char* S1p = S1 + sx;
1263 #if __ARM_NEON
1264  int16x4_t _a0 = vdup_n_s16(a0);
1265  int16x4_t _a1 = vdup_n_s16(a1);
1266  uint8x8_t _S0 = vld1_u8(S0p);
1267  uint8x8_t _S1 = vld1_u8(S1p);
1268  int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
1269  int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
1270  int16x4_t _S0low = vget_low_s16(_S016);
1271  int16x4_t _S1low = vget_low_s16(_S116);
1272  int16x4_t _S0high = vext_s16(_S0low, vget_high_s16(_S016), 3);
1273  int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
1274  int32x4_t _rows0 = vmull_s16(_S0low, _a0);
1275  int32x4_t _rows1 = vmull_s16(_S1low, _a0);
1276  _rows0 = vmlal_s16(_rows0, _S0high, _a1);
1277  _rows1 = vmlal_s16(_rows1, _S1high, _a1);
1278  int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4);
1279  int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
1280  vst1_s16(rows0p, _rows0_sr4);
1281  vst1_s16(rows1p, _rows1_sr4);
1282 #else
1283  rows0p[0] = (S0p[0]*a0 + S0p[3]*a1) >> 4;
1284  rows0p[1] = (S0p[1]*a0 + S0p[4]*a1) >> 4;
1285  rows0p[2] = (S0p[2]*a0 + S0p[5]*a1) >> 4;
1286  rows1p[0] = (S1p[0]*a0 + S1p[3]*a1) >> 4;
1287  rows1p[1] = (S1p[1]*a0 + S1p[4]*a1) >> 4;
1288  rows1p[2] = (S1p[2]*a0 + S1p[5]*a1) >> 4;
1289 #endif // __ARM_NEON
1290 
1291  ialphap += 2;
1292  rows0p += 3;
1293  rows1p += 3;
1294  }
1295  }
1296 
1297  prev_sy1 = sy + 3; // yofs entries are pre-scaled by 3, so the row below sy is sy + 3 (with + 1 the row-reuse branch above could never fire)
1298 
1299  // vresize
1300  short b0 = ibeta[0];
1301  short b1 = ibeta[1];
1302 
1303  short* rows0p = rows0;
1304  short* rows1p = rows1;
1305  unsigned char* Dp = dst + w * 3 * (dy);
1306 
1307 #if __ARM_NEON
1308  int nn = (w * 3) >> 3;
1309 #else
1310  int nn = 0;
1311 #endif
1312  int remain = (w * 3) - (nn << 3);
1313 
1314 #if __ARM_NEON
1315 #if __aarch64__
1316  int16x4_t _b0 = vdup_n_s16(b0);
1317  int16x4_t _b1 = vdup_n_s16(b1);
1318  int32x4_t _v2 = vdupq_n_s32(2);
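 // _v2 preloads the +2 rounding bias; vsraq_n_s32 is a fused
 // shift-right-and-accumulate, so _acc becomes
 // 2 + (rows0*b0 >> 16) + (rows1*b1 >> 16) before the narrowing shift by 2.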
1319  for (; nn>0; nn--)
1320  {
1321  int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
1322  int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
1323  int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
1324  int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
1325 
1326  int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
1327  int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
1328  int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
1329  int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
1330 
1331  int32x4_t _acc = _v2;
1332  _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
1333  _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
1334 
1335  int32x4_t _acc_1 = _v2;
1336  _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
1337  _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
1338 
1339  int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
1340  int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
1341 
1342  uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
1343 
1344  vst1_u8(Dp, _D);
1345 
1346  Dp += 8;
1347  rows0p += 8;
1348  rows1p += 8;
1349  }
1350 #else
1351  if (nn > 0)
1352  {
1353  asm volatile(
1354  "vdup.s16 d16, %8 \n"
1355  "mov r4, #2 \n"
1356  "vdup.s16 d17, %9 \n"
1357  "vdup.s32 q12, r4 \n"
1358  "pld [%0, #128] \n"
1359  "vld1.s16 {d2-d3}, [%0 :128]!\n"
1360  "pld [%1, #128] \n"
1361  "vld1.s16 {d6-d7}, [%1 :128]!\n"
1362  "0: \n"
1363  "vmull.s16 q0, d2, d16 \n"
1364  "vmull.s16 q1, d3, d16 \n"
1365  "vorr.s32 q10, q12, q12 \n"
1366  "vorr.s32 q11, q12, q12 \n"
1367  "vmull.s16 q2, d6, d17 \n"
1368  "vmull.s16 q3, d7, d17 \n"
1369  "vsra.s32 q10, q0, #16 \n"
1370  "vsra.s32 q11, q1, #16 \n"
1371  "pld [%0, #128] \n"
1372  "vld1.s16 {d2-d3}, [%0 :128]!\n"
1373  "vsra.s32 q10, q2, #16 \n"
1374  "vsra.s32 q11, q3, #16 \n"
1375  "pld [%1, #128] \n"
1376  "vld1.s16 {d6-d7}, [%1 :128]!\n"
1377  "vshrn.s32 d20, q10, #2 \n"
1378  "vshrn.s32 d21, q11, #2 \n"
1379  "vqmovun.s16 d20, q10 \n"
1380  "vst1.8 {d20}, [%2]! \n"
1381  "subs %3, #1 \n"
1382  "bne 0b \n"
1383  "sub %0, #16 \n"
1384  "sub %1, #16 \n"
1385  : "=r"(rows0p), // %0
1386  "=r"(rows1p), // %1
1387  "=r"(Dp), // %2
1388  "=r"(nn) // %3
1389  : "0"(rows0p),
1390  "1"(rows1p),
1391  "2"(Dp),
1392  "3"(nn),
1393  "r"(b0), // %8
1394  "r"(b1) // %9
1395  : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
1396  );
1397  }
1398 #endif // __aarch64__
1399 #endif // __ARM_NEON
1400  for ( ; remain; --remain )
1401  {
1402 // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
1403  *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
1404  }
1405 
1406  ibeta += 2;
1407  }
1408 
1409  delete[] buf;
1410 }
1411 
1412 static void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
1413 {
1414  const int INTER_RESIZE_COEF_BITS=11;
1415  const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
1416 // const int ONE=INTER_RESIZE_COEF_SCALE;
1417 
1418  double scale_x = (double)srcw / w;
1419  double scale_y = (double)srch / h;
1420 
1421  int* buf = new int[w + h + w + h];
1422 
1423  int* xofs = buf;//new int[w];
1424  int* yofs = buf + w;//new int[h];
1425 
1426  short* ialpha = (short*)(buf + w + h);//new short[w * 2];
1427  short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
1428 
1429  float fx;
1430  float fy;
1431  int sx;
1432  int sy;
1433 
1434 #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX)
1435 
1436  for (int dx = 0; dx < w; dx++)
1437  {
1438  fx = (float)((dx + 0.5) * scale_x - 0.5);
1439  sx = fx;//cvFloor(fx);
1440  fx -= sx; if (sx < 0) { sx = 0; fx = 0.f; } // fx can be slightly negative at the left edge; clamp to the first column
1441 
1442  if (sx >= srcw - 1)
1443  {
1444  sx = srcw - 2;
1445  fx = 1.f;
1446  }
1447 
1448  xofs[dx] = sx;
1449 
1450  float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
1451  float a1 = fx * INTER_RESIZE_COEF_SCALE;
1452 
1453  ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
1454  ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
1455  }
1456 
1457  for (int dy = 0; dy < h; dy++)
1458  {
1459  fy = (float)((dy + 0.5) * scale_y - 0.5);
1460  sy = fy;//cvFloor(fy);
1461  fy -= sy; if (sy < 0) { sy = 0; fy = 0.f; } // fy can be slightly negative at the top edge; clamp to the first row
1462 
1463  if (sy >= srch - 1)
1464  {
1465  sy = srch - 2;
1466  fy = 1.f;
1467  }
1468 
1469  yofs[dy] = sy;
1470 
1471  float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
1472  float b1 = fy * INTER_RESIZE_COEF_SCALE;
1473 
1474  ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
1475  ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
1476  }
1477 
1478 #undef SATURATE_CAST_SHORT
1479 
1480  // loop body
1481  Mat rowsbuf0((w >> 1) + 1);
1482  Mat rowsbuf1((w >> 1) + 1);
1483  short* rows0 = (short*)rowsbuf0.data;
1484  short* rows1 = (short*)rowsbuf1.data;
1485 
1486  int prev_sy1 = -1;
1487 
1488  for (int dy = 0; dy < h; dy++ )
1489  {
1490  int sy = yofs[dy];
1491 
1492  if (sy == prev_sy1)
1493  {
1494  // hresize one row
1495  short* rows0_old = rows0;
1496  rows0 = rows1;
1497  rows1 = rows0_old;
1498  const unsigned char *S1 = src + srcw * (sy+1);
1499 
1500  const short* ialphap = ialpha;
1501  short* rows1p = rows1;
1502  for ( int dx = 0; dx < w; dx++ )
1503  {
1504  int sx = xofs[dx];
1505  short a0 = ialphap[0];
1506  short a1 = ialphap[1];
1507 
1508  const unsigned char* S1p = S1 + sx;
1509  rows1p[dx] = (S1p[0]*a0 + S1p[1]*a1) >> 4;
1510 
1511  ialphap += 2;
1512  }
1513  }
1514  else
1515  {
1516  // hresize two rows
1517  const unsigned char *S0 = src + srcw * (sy);
1518  const unsigned char *S1 = src + srcw * (sy+1);
1519 
1520  const short* ialphap = ialpha;
1521  short* rows0p = rows0;
1522  short* rows1p = rows1;
1523  for ( int dx = 0; dx < w; dx++ )
1524  {
1525  int sx = xofs[dx];
1526  short a0 = ialphap[0];
1527  short a1 = ialphap[1];
1528 
1529  const unsigned char* S0p = S0 + sx;
1530  const unsigned char* S1p = S1 + sx;
1531  rows0p[dx] = (S0p[0]*a0 + S0p[1]*a1) >> 4;
1532  rows1p[dx] = (S1p[0]*a0 + S1p[1]*a1) >> 4;
1533 
1534  ialphap += 2;
1535  }
1536  }
1537 
1538  prev_sy1 = sy + 1;
1539 
1540  // vresize
1541  short b0 = ibeta[0];
1542  short b1 = ibeta[1];
1543 
1544  short* rows0p = rows0;
1545  short* rows1p = rows1;
1546  unsigned char* Dp = dst + w * (dy);
1547 
1548 #if __ARM_NEON
1549  int nn = w >> 3;
1550 #else
1551  int nn = 0;
1552 #endif
1553  int remain = w - (nn << 3);
1554 
1555 #if __ARM_NEON
1556 #if __aarch64__
1557  int16x4_t _b0 = vdup_n_s16(b0);
1558  int16x4_t _b1 = vdup_n_s16(b1);
1559  int32x4_t _v2 = vdupq_n_s32(2);
1560  for (; nn>0; nn--)
1561  {
1562  int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
1563  int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
1564  int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
1565  int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
1566 
1567  int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
1568  int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
1569  int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
1570  int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
1571 
1572  int32x4_t _acc = _v2;
1573  _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
1574  _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
1575 
1576  int32x4_t _acc_1 = _v2;
1577  _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
1578  _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
1579 
1580  int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
1581  int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
1582 
1583  uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
1584 
1585  vst1_u8(Dp, _D);
1586 
1587  Dp += 8;
1588  rows0p += 8;
1589  rows1p += 8;
1590  }
1591 #else
1592  if (nn > 0)
1593  {
1594  asm volatile(
1595  "vdup.s16 d16, %8 \n"
1596  "mov r4, #2 \n"
1597  "vdup.s16 d17, %9 \n"
1598  "vdup.s32 q12, r4 \n"
1599  "pld [%0, #128] \n"
1600  "vld1.s16 {d2-d3}, [%0 :128]!\n"
1601  "pld [%1, #128] \n"
1602  "vld1.s16 {d6-d7}, [%1 :128]!\n"
1603  "0: \n"
1604  "vmull.s16 q0, d2, d16 \n"
1605  "vmull.s16 q1, d3, d16 \n"
1606  "vorr.s32 q10, q12, q12 \n"
1607  "vorr.s32 q11, q12, q12 \n"
1608  "vmull.s16 q2, d6, d17 \n"
1609  "vmull.s16 q3, d7, d17 \n"
1610  "vsra.s32 q10, q0, #16 \n"
1611  "vsra.s32 q11, q1, #16 \n"
1612  "pld [%0, #128] \n"
1613  "vld1.s32 {d2-d3}, [%0 :128]!\n"
1614  "vsra.s32 q10, q2, #16 \n"
1615  "vsra.s32 q11, q3, #16 \n"
1616  "pld [%1, #128] \n"
1617  "vld1.s32 {d6-d7}, [%1 :128]!\n"
1618  "vshrn.s32 d20, q10, #2 \n"
1619  "vshrn.s32 d21, q11, #2 \n"
1620  "vqmovun.s16 d20, q10 \n"
1621  "vst1.8 {d20}, [%2]! \n"
1622  "subs %3, #1 \n"
1623  "bne 0b \n"
1624  "sub %0, #16 \n"
1625  "sub %1, #16 \n"
1626  : "=r"(rows0p), // %0
1627  "=r"(rows1p), // %1
1628  "=r"(Dp), // %2
1629  "=r"(nn) // %3
1630  : "0"(rows0p),
1631  "1"(rows1p),
1632  "2"(Dp),
1633  "3"(nn),
1634  "r"(b0), // %8
1635  "r"(b1) // %9
1636  : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
1637  );
1638  }
1639 #endif // __aarch64__
1640 #endif // __ARM_NEON
1641  for ( ; remain; --remain )
1642  {
1643 // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
1644  *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
1645  }
1646 
1647  ibeta += 2;
1648  }
1649 
1650  delete[] buf;
1651 }
1652 
1653 static void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
1654 {
1655  const int INTER_RESIZE_COEF_BITS=11;
1656  const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
1657 // const int ONE=INTER_RESIZE_COEF_SCALE;
1658 
1659  double scale_x = (double)srcw / w;
1660  double scale_y = (double)srch / h;
1661 
1662  int* buf = new int[w + h + w + h];
1663 
1664  int* xofs = buf;//new int[w];
1665  int* yofs = buf + w;//new int[h];
1666 
1667  short* ialpha = (short*)(buf + w + h);//new short[w * 2];
1668  short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
1669 
1670  float fx;
1671  float fy;
1672  int sx;
1673  int sy;
1674 
1675 #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX)
1676 
1677  for (int dx = 0; dx < w; dx++)
1678  {
1679  fx = (float)((dx + 0.5) * scale_x - 0.5);
1680  sx = fx;//cvFloor(fx);
1681  fx -= sx; if (sx < 0) { sx = 0; fx = 0.f; } // fx can be slightly negative at the left edge; clamp to the first column
1682 
1683  if (sx >= srcw - 1)
1684  {
1685  sx = srcw - 2;
1686  fx = 1.f;
1687  }
1688 
1689  xofs[dx] = sx*4;
1690 
1691  float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
1692  float a1 = fx * INTER_RESIZE_COEF_SCALE;
1693 
1694  ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
1695  ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
1696  }
1697 
1698  for (int dy = 0; dy < h; dy++)
1699  {
1700  fy = (float)((dy + 0.5) * scale_y - 0.5);
1701  sy = fy;//cvFloor(fy);
1702  fy -= sy; if (sy < 0) { sy = 0; fy = 0.f; } // fy can be slightly negative at the top edge; clamp to the first row
1703 
1704  if (sy >= srch - 1)
1705  {
1706  sy = srch - 2;
1707  fy = 1.f;
1708  }
1709 
1710  yofs[dy] = sy*4;
1711 
1712  float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
1713  float b1 = fy * INTER_RESIZE_COEF_SCALE;
1714 
1715  ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
1716  ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
1717  }
1718 
1719 #undef SATURATE_CAST_SHORT
1720 
1721  // loop body
1722  Mat rowsbuf0((w*4 >> 1) + 4);
1723  Mat rowsbuf1((w*4 >> 1) + 4);
1724  short* rows0 = (short*)rowsbuf0.data;
1725  short* rows1 = (short*)rowsbuf1.data;
1726 
1727  int prev_sy1 = -1;
1728 
1729  for (int dy = 0; dy < h; dy++ )
1730  {
1731  int sy = yofs[dy];
1732 
1733  if (sy == prev_sy1)
1734  {
1735  // hresize one row
1736  short* rows0_old = rows0;
1737  rows0 = rows1;
1738  rows1 = rows0_old;
1739  const unsigned char *S1 = src + srcw * (sy+4);
1740 
1741  const short* ialphap = ialpha;
1742  short* rows1p = rows1;
1743  for ( int dx = 0; dx < w; dx++ )
1744  {
1745  int sx = xofs[dx];
1746  short a0 = ialphap[0];
1747  short a1 = ialphap[1];
1748 
1749  const unsigned char* S1p = S1 + sx;
1750 #if __ARM_NEON
1751  int16x4_t _a0 = vdup_n_s16(a0);
1752  int16x4_t _a1 = vdup_n_s16(a1);
1753  uint8x8_t _S1 = vld1_u8(S1p);
1754  int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
1755  int16x4_t _S1low = vget_low_s16(_S116);
1756  int16x4_t _S1high = vget_high_s16(_S116);
1757  int32x4_t _rows1 = vmull_s16(_S1low, _a0);
1758  _rows1 = vmlal_s16(_rows1, _S1high, _a1);
1759  int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
1760  vst1_s16(rows1p, _rows1_sr4);
1761 #else
1762  rows1p[0] = (S1p[0]*a0 + S1p[4]*a1) >> 4;
1763  rows1p[1] = (S1p[1]*a0 + S1p[5]*a1) >> 4;
1764  rows1p[2] = (S1p[2]*a0 + S1p[6]*a1) >> 4;
1765  rows1p[3] = (S1p[3]*a0 + S1p[7]*a1) >> 4;
1766 #endif // __ARM_NEON
1767 
1768  ialphap += 2;
1769  rows1p += 4;
1770  }
1771  }
1772  else
1773  {
1774  // hresize two rows
1775  const unsigned char *S0 = src + srcw * (sy);
1776  const unsigned char *S1 = src + srcw * (sy+4);
1777 
1778  const short* ialphap = ialpha;
1779  short* rows0p = rows0;
1780  short* rows1p = rows1;
1781  for ( int dx = 0; dx < w; dx++ )
1782  {
1783  int sx = xofs[dx];
1784  short a0 = ialphap[0];
1785  short a1 = ialphap[1];
1786 
1787  const unsigned char* S0p = S0 + sx;
1788  const unsigned char* S1p = S1 + sx;
1789 #if __ARM_NEON
1790  int16x4_t _a0 = vdup_n_s16(a0);
1791  int16x4_t _a1 = vdup_n_s16(a1);
1792  uint8x8_t _S0 = vld1_u8(S0p);
1793  uint8x8_t _S1 = vld1_u8(S1p);
1794  int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
1795  int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
1796  int16x4_t _S0low = vget_low_s16(_S016);
1797  int16x4_t _S1low = vget_low_s16(_S116);
1798  int16x4_t _S0high = vget_high_s16(_S016);
1799  int16x4_t _S1high = vget_high_s16(_S116);
1800  int32x4_t _rows0 = vmull_s16(_S0low, _a0);
1801  int32x4_t _rows1 = vmull_s16(_S1low, _a0);
1802  _rows0 = vmlal_s16(_rows0, _S0high, _a1);
1803  _rows1 = vmlal_s16(_rows1, _S1high, _a1);
1804  int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4);
1805  int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
1806  vst1_s16(rows0p, _rows0_sr4);
1807  vst1_s16(rows1p, _rows1_sr4);
1808 #else
1809  rows0p[0] = (S0p[0]*a0 + S0p[4]*a1) >> 4;
1810  rows0p[1] = (S0p[1]*a0 + S0p[5]*a1) >> 4;
1811  rows0p[2] = (S0p[2]*a0 + S0p[6]*a1) >> 4;
1812  rows0p[3] = (S0p[3]*a0 + S0p[7]*a1) >> 4;
1813  rows1p[0] = (S1p[0]*a0 + S1p[4]*a1) >> 4;
1814  rows1p[1] = (S1p[1]*a0 + S1p[5]*a1) >> 4;
1815  rows1p[2] = (S1p[2]*a0 + S1p[6]*a1) >> 4;
1816  rows1p[3] = (S1p[3]*a0 + S1p[7]*a1) >> 4;
1817 #endif // __ARM_NEON
1818 
1819  ialphap += 2;
1820  rows0p += 4;
1821  rows1p += 4;
1822  }
1823  }
1824 
1825  prev_sy1 = sy + 4; // yofs entries are pre-scaled by 4, so the row below sy is sy + 4 (with + 1 the row-reuse branch above could never fire)
1826 
1827  // vresize
1828  short b0 = ibeta[0];
1829  short b1 = ibeta[1];
1830 
1831  short* rows0p = rows0;
1832  short* rows1p = rows1;
1833  unsigned char* Dp = dst + w * 4 * (dy);
1834 
1835 #if __ARM_NEON
1836  int nn = (w * 4) >> 3;
1837 #else
1838  int nn = 0;
1839 #endif
1840  int remain = (w * 4) - (nn << 3);
1841 
1842 #if __ARM_NEON
1843 #if __aarch64__
1844  int16x4_t _b0 = vdup_n_s16(b0);
1845  int16x4_t _b1 = vdup_n_s16(b1);
1846  int32x4_t _v2 = vdupq_n_s32(2);
1847  for (; nn>0; nn--)
1848  {
1849  int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
1850  int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
1851  int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
1852  int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
1853 
1854  int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
1855  int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
1856  int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
1857  int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
1858 
1859  int32x4_t _acc = _v2;
1860  _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
1861  _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
1862 
1863  int32x4_t _acc_1 = _v2;
1864  _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
1865  _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
1866 
1867  int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
1868  int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
1869 
1870  uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
1871 
1872  vst1_u8(Dp, _D);
1873 
1874  Dp += 8;
1875  rows0p += 8;
1876  rows1p += 8;
1877  }
1878 #else
1879  if (nn > 0)
1880  {
1881  asm volatile(
1882  "vdup.s16 d16, %8 \n"
1883  "mov r4, #2 \n"
1884  "vdup.s16 d17, %9 \n"
1885  "vdup.s32 q12, r4 \n"
1886  "pld [%0, #128] \n"
1887  "vld1.s16 {d2-d3}, [%0 :128]!\n"
1888  "pld [%1, #128] \n"
1889  "vld1.s16 {d6-d7}, [%1 :128]!\n"
1890  "0: \n"
1891  "vmull.s16 q0, d2, d16 \n"
1892  "vmull.s16 q1, d3, d16 \n"
1893  "vorr.s32 q10, q12, q12 \n"
1894  "vorr.s32 q11, q12, q12 \n"
1895  "vmull.s16 q2, d6, d17 \n"
1896  "vmull.s16 q3, d7, d17 \n"
1897  "vsra.s32 q10, q0, #16 \n"
1898  "vsra.s32 q11, q1, #16 \n"
1899  "pld [%0, #128] \n"
1900  "vld1.s32 {d2-d3}, [%0 :128]!\n"
1901  "vsra.s32 q10, q2, #16 \n"
1902  "vsra.s32 q11, q3, #16 \n"
1903  "pld [%1, #128] \n"
1904  "vld1.s32 {d6-d7}, [%1 :128]!\n"
1905  "vshrn.s32 d20, q10, #2 \n"
1906  "vshrn.s32 d21, q11, #2 \n"
1907  "vqmovun.s16 d20, q10 \n"
1908  "vst1.8 {d20}, [%2]! \n"
1909  "subs %3, #1 \n"
1910  "bne 0b \n"
1911  "sub %0, #16 \n"
1912  "sub %1, #16 \n"
1913  : "=r"(rows0p), // %0
1914  "=r"(rows1p), // %1
1915  "=r"(Dp), // %2
1916  "=r"(nn) // %3
1917  : "0"(rows0p),
1918  "1"(rows1p),
1919  "2"(Dp),
1920  "3"(nn),
1921  "r"(b0), // %8
1922  "r"(b1) // %9
1923  : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
1924  );
1925  }
1926 #endif // __aarch64__
1927 #endif // __ARM_NEON
1928  for ( ; remain; --remain )
1929  {
1930 // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
1931  *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
1932  }
1933 
1934  ibeta += 2;
1935  }
1936 
1937  delete[] buf;
1938 }
1939 
1940 inline Mat Mat::from_pixels(const unsigned char* pixels, int type, int w, int h)
1941 {
1942  if (type & PIXEL_CONVERT_MASK)
1943  {
1944  if (type == PIXEL_RGB2BGR || type == PIXEL_BGR2RGB)
1945  return from_rgb2bgr(pixels, w, h);
1946 
1947  if (type == PIXEL_RGB2GRAY)
1948  return from_rgb2gray(pixels, w, h);
1949 
1950  if (type == PIXEL_BGR2GRAY)
1951  return from_bgr2gray(pixels, w, h);
1952 
1953  if (type == PIXEL_GRAY2RGB || type == PIXEL_GRAY2BGR)
1954  return from_gray2rgb(pixels, w, h);
1955 
1956  if (type == PIXEL_RGBA2RGB)
1957  return from_rgba2rgb(pixels, w, h);
1958 
1959  if (type == PIXEL_RGBA2BGR)
1960  return from_rgba2bgr(pixels, w, h);
1961 
1962  if (type == PIXEL_RGBA2GRAY)
1963  return from_rgba2gray(pixels, w, h);
1964  }
1965  else
1966  {
1967  if (type == PIXEL_RGB || type == PIXEL_BGR)
1968  return from_rgb(pixels, w, h);
1969 
1970  if (type == PIXEL_GRAY)
1971  return from_gray(pixels, w, h);
1972 
1973  if (type == PIXEL_RGBA)
1974  return from_rgba(pixels, w, h);
1975  }
1976 
1977  return Mat();
1978 }
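 // Usage sketch (dimensions are illustrative; qualify the PIXEL_* constants
 // as mat.h declares them, e.g. dface::PIXEL_RGB or dface::Mat::PIXEL_RGB):
 //
 //   const unsigned char* data = ...; // 640*480 packed RGB bytes
 //   dface::Mat in  = dface::Mat::from_pixels(data, PIXEL_RGB, 640, 480);
 //   dface::Mat bgr = dface::Mat::from_pixels(data, PIXEL_RGB2BGR, 640, 480);
 //
 // The conversion types perform the channel swap or gray reduction during
 // the copy itself, so no separate pass over the image is needed.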
1979 
1980 inline Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height)
1981 {
1982  if (w == target_width && h == target_height)
1983  return Mat::from_pixels(pixels, type, w, h);
1984 
1985  Mat m;
1986 
1987  int type_from = type & PIXEL_FORMAT_MASK;
1988 
1989  if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
1990  {
1991  unsigned char* dst = new unsigned char[target_width * target_height * 3];
1992 
1993  resize_bilinear_c3(pixels, w, h, dst, target_width, target_height);
1994 
1995  m = Mat::from_pixels(dst, type, target_width, target_height);
1996 
1997  delete[] dst;
1998  }
1999  else if (type_from == PIXEL_GRAY)
2000  {
2001  unsigned char* dst = new unsigned char[target_width * target_height];
2002 
2003  resize_bilinear_c1(pixels, w, h, dst, target_width, target_height);
2004 
2005  m = Mat::from_pixels(dst, type, target_width, target_height);
2006 
2007  delete[] dst;
2008  }
2009  else if (type_from == PIXEL_RGBA)
2010  {
2011  unsigned char* dst = new unsigned char[target_width * target_height * 4];
2012 
2013  resize_bilinear_c4(pixels, w, h, dst, target_width, target_height);
2014 
2015  m = Mat::from_pixels(dst, type, target_width, target_height);
2016 
2017  delete[] dst;
2018  }
2019 
2020  return m;
2021 }
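 // Note the order: the bilinear resize runs on the raw 8-bit pixels in the
 // source format, and any RGB/BGR/GRAY conversion happens afterwards inside
 // from_pixels; type_from keeps only the format bits so it can pick the
 // channel count for the resize.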
2022 
2023 inline void Mat::to_pixels(unsigned char* pixels, int type) const
2024 {
2025  if (type & PIXEL_CONVERT_MASK)
2026  {
2027  if (type == PIXEL_RGB2BGR || type == PIXEL_BGR2RGB)
2028  return to_bgr2rgb(*this, pixels);
2029  }
2030  else
2031  {
2032  if (type == PIXEL_RGB || type == PIXEL_BGR)
2033  return to_rgb(*this, pixels);
2034 
2035  if (type == PIXEL_GRAY)
2036  return to_gray(*this, pixels);
2037 
2038  if (type == PIXEL_RGBA)
2039  return to_rgba(*this, pixels);
2040  }
2041 }
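 // Only the RGB<->BGR swap is implemented among the convert types here; any
 // other PIXEL_*2* value falls through and leaves the output buffer
 // untouched.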
2042 
2043 inline void Mat::to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height) const
2044 {
2045  if (w == target_width && h == target_height)
2046  return to_pixels(pixels, type);
2047 
2048  int type_to = (type & PIXEL_CONVERT_MASK) ? (type >> PIXEL_CONVERT_SHIFT) : (type & PIXEL_FORMAT_MASK);
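 // type_to is the destination format: for convert types the target lives in
 // the high bits (hence >> PIXEL_CONVERT_SHIFT); otherwise the type itself
 // is the format. This follows the encoding implied by PIXEL_CONVERT_MASK
 // and PIXEL_FORMAT_MASK in mat.h.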
2049 
2050  if (type_to == PIXEL_RGB || type_to == PIXEL_BGR)
2051  {
2052  unsigned char* src = new unsigned char[w * h * 3];
2053 
2054  to_pixels(src, type);
2055 
2056  resize_bilinear_c3(src, w, h, pixels, target_width, target_height);
2057 
2058  delete[] src;
2059  }
2060  else if (type_to == PIXEL_GRAY)
2061  {
2062  unsigned char* src = new unsigned char[w * h];
2063 
2064  to_pixels(src, type);
2065 
2066  resize_bilinear_c1(src, w, h, pixels, target_width, target_height);
2067 
2068  delete[] src;
2069  }
2070  else if (type_to == PIXEL_RGBA)
2071  {
2072  unsigned char* src = new unsigned char[w * h * 4];
2073 
2074  to_pixels(src, type);
2075 
2076  resize_bilinear_c4(src, w, h, pixels, target_width, target_height);
2077 
2078  delete[] src;
2079  }
2080 }
2081 
2082 } // namespace dface
2083 
2084 #endif // DFACE_MATPIXEL_H