// DFace SDK 2.0.0
// LTM
// mat_pixel.h
1 #ifndef DFACE_MAT_PIXEL_H
2 #define DFACE_MAT_PIXEL_H
3 
4 #include "mat.h"
5 #include <limits.h>
6 #include <math.h>
7 #include <algorithm>
8 #if __ARM_NEON
9 #include <arm_neon.h>
10 #endif // __ARM_NEON
11 #include "platform.h"
12 
13 namespace dface {
14 
15 static inline float get_pixel(dface::Mat m, int x, int y, int c)
16 {
17  if(!m.data){
18  return 0.0f;
19  }
20  float* fptr = (float*)m.data + c*m.h*m.w + y * m.w + x;
21  return *fptr;
22  //return (m.data ? (float)(m.data[c*m.h*m.w + y * m.w + x]) : 0.0f);
23 }
24 
25 static inline void set_pixel(dface::Mat m, int x, int y, int c, float val)
26 {
27  /* x, y, c are already validated by upper layers */
28  if (m.data) {
29  float *fptr = (float *) m.data + c * m.h * m.w + y * m.w + x;
30  *fptr = val;
31  }
32 
33 }
34 
// Largest of three floats, computed with two pairwise comparisons.
static inline float three_way_max(float a, float b, float c)
{
    float ab = (a > b) ? a : b;
    return (ab > c) ? ab : c;
}
39 
// Smallest of three floats, computed with two pairwise comparisons.
static inline float three_way_min(float a, float b, float c)
{
    float ab = (a < b) ? a : b;
    return (ab < c) ? ab : c;
}
44 
45 
// Convert a packed RGB image (3 interleaved bytes per pixel, w*h pixels) into
// a planar float Mat with channels (R, G, B); each byte is converted to float
// unchanged (0..255 range). Returns an empty Mat when allocation fails.
static Mat from_rgb(const unsigned char* rgb, int w, int h, Allocator* allocator)
{
    Mat m(w, h, 3, 4u, allocator); // 4u = element size in bytes (float)
    if (m.empty())
        return m;

    float* ptr0 = m.channel(0); // R plane
    float* ptr1 = m.channel(1); // G plane
    float* ptr2 = m.channel(2); // B plane

    int size = w * h;

#if __ARM_NEON
    int nn = size >> 3;            // 8-pixel vectorized iterations
    int remain = size - (nn << 3); // scalar tail
#else
    int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
    // AArch64 intrinsics: vld3 de-interleaves 8 RGB pixels, then each channel
    // is widened u8 -> u16 -> u32 and converted to f32 before planar stores.
    for (; nn>0; nn--)
    {
        uint8x8x3_t _rgb = vld3_u8(rgb);
        uint16x8_t _r16 = vmovl_u8(_rgb.val[0]);
        uint16x8_t _g16 = vmovl_u8(_rgb.val[1]);
        uint16x8_t _b16 = vmovl_u8(_rgb.val[2]);

        float32x4_t _rlow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_r16)));
        float32x4_t _rhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_r16)));
        float32x4_t _glow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_g16)));
        float32x4_t _ghigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_g16)));
        float32x4_t _blow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_b16)));
        float32x4_t _bhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_b16)));

        vst1q_f32(ptr0, _rlow);
        vst1q_f32(ptr0+4, _rhigh);
        vst1q_f32(ptr1, _glow);
        vst1q_f32(ptr1+4, _ghigh);
        vst1q_f32(ptr2, _blow);
        vst1q_f32(ptr2+4, _bhigh);

        rgb += 3*8;
        ptr0 += 8;
        ptr1 += 8;
        ptr2 += 8;
    }
#else
    // ARMv7 inline assembly: same 8-pixel de-interleave + widen + u32->f32
    // convert, hand-scheduled (prefetch, interleaved convert/store).
    if (nn > 0)
    {
        asm volatile(
        "0: \n"
        "pld [%1, #256] \n"
        "vld3.u8 {d0-d2}, [%1]! \n"
        "vmovl.u8 q8, d0 \n"
        "vmovl.u8 q9, d1 \n"
        "vmovl.u8 q10, d2 \n"
        "vmovl.u16 q0, d16 \n"
        "vmovl.u16 q1, d17 \n"
        "vmovl.u16 q2, d18 \n"
        "vmovl.u16 q3, d19 \n"
        "vmovl.u16 q8, d20 \n"
        "vmovl.u16 q9, d21 \n"
        "vcvt.f32.u32 q0, q0 \n"
        "vcvt.f32.u32 q1, q1 \n"
        "vcvt.f32.u32 q2, q2 \n"
        "vcvt.f32.u32 q3, q3 \n"
        "vcvt.f32.u32 q8, q8 \n"
        "subs %0, #1 \n"
        "vst1.f32 {d0-d3}, [%2 :128]! \n"
        "vcvt.f32.u32 q9, q9 \n"
        "vst1.f32 {d4-d7}, [%3 :128]! \n"
        "vst1.f32 {d16-d19}, [%4 :128]!\n"
        "bne 0b \n"
        : "=r"(nn), // %0
        "=r"(rgb), // %1
        "=r"(ptr0), // %2
        "=r"(ptr1), // %3
        "=r"(ptr2) // %4
        : "0"(nn),
        "1"(rgb),
        "2"(ptr0),
        "3"(ptr1),
        "4"(ptr2)
        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
        );
    }
#endif // __aarch64__
#endif // __ARM_NEON
    // Scalar tail (and the whole image when NEON is unavailable).
    for (; remain>0; remain--)
    {
        *ptr0 = rgb[0];
        *ptr1 = rgb[1];
        *ptr2 = rgb[2];

        rgb += 3;
        ptr0++;
        ptr1++;
        ptr2++;
    }

    return m;
}
149 
150 static void to_rgb(const Mat& m, unsigned char* rgb)
151 {
152  const float* ptr0 = m.channel(0);
153  const float* ptr1 = m.channel(1);
154  const float* ptr2 = m.channel(2);
155 
156  int size = m.w * m.h;
157 
158 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
159 
160  int remain = size;
161 
162  for (; remain>0; remain--)
163  {
164  rgb[0] = SATURATE_CAST_UCHAR(*ptr0);
165  rgb[1] = SATURATE_CAST_UCHAR(*ptr1);
166  rgb[2] = SATURATE_CAST_UCHAR(*ptr2);
167 
168  rgb += 3;
169  ptr0++;
170  ptr1++;
171  ptr2++;
172  }
173 
174 #undef SATURATE_CAST_UCHAR
175 }
176 
// Convert an 8-bit grayscale image (w*h bytes) into a single-channel float
// Mat; each byte is converted to float unchanged (0..255 range).
// Returns an empty Mat when allocation fails.
static Mat from_gray(const unsigned char* gray, int w, int h, Allocator* allocator)
{
    Mat m(w, h, 1, 4u, allocator); // 4u = element size in bytes (float)
    if (m.empty())
        return m;

    float* ptr = m;

    int size = w * h;

#if __ARM_NEON
    int nn = size >> 4;            // 16-pixel vectorized iterations
    int remain = size - (nn << 4); // scalar tail
#else
    int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
    // AArch64 intrinsics: load 16 bytes, widen u8 -> u16 -> u32 -> f32 in two
    // halves of 8, store 16 floats.
    for (; nn>0; nn--)
    {
        uint8x16_t _gray = vld1q_u8(gray);
        uint16x8_t _gray16_0 = vmovl_u8(vget_low_u8(_gray));
        uint16x8_t _gray16_1 = vmovl_u8(vget_high_u8(_gray));

        float32x4_t _graylow_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_0)));
        float32x4_t _grayhigh_0 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_0)));
        float32x4_t _graylow_1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_1)));
        float32x4_t _grayhigh_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_1)));

        vst1q_f32(ptr, _graylow_0);
        vst1q_f32(ptr+4, _grayhigh_0);
        vst1q_f32(ptr+8, _graylow_1);
        vst1q_f32(ptr+12, _grayhigh_1);

        gray += 16;
        ptr += 16;
    }
#else
    // ARMv7 inline assembly: same 16-pixel widen + u32->f32 conversion.
    if (nn > 0)
    {
        asm volatile(
        "0: \n"
        "pld [%1, #128] \n"
        "vld1.u8 {d0,d1}, [%1]! \n"
        "vmovl.u8 q8, d0 \n"
        "vmovl.u8 q9, d1 \n"
        "vmovl.u16 q0, d16 \n"
        "vmovl.u16 q1, d17 \n"
        "vmovl.u16 q2, d18 \n"
        "vmovl.u16 q3, d19 \n"
        "vcvt.f32.u32 q0, q0 \n"
        "vcvt.f32.u32 q1, q1 \n"
        "vcvt.f32.u32 q2, q2 \n"
        "vcvt.f32.u32 q3, q3 \n"
        "subs %0, #1 \n"
        "vst1.f32 {d0-d3}, [%2 :128]! \n"
        "vst1.f32 {d4-d7}, [%2 :128]! \n"
        "bne 0b \n"
        : "=r"(nn), // %0
        "=r"(gray), // %1
        "=r"(ptr) // %2
        : "0"(nn),
        "1"(gray),
        "2"(ptr)
        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
        );
    }
#endif // __aarch64__
#endif // __ARM_NEON
    // Scalar tail (and the whole image when NEON is unavailable).
    for (; remain>0; remain--)
    {
        *ptr = *gray;

        gray++;
        ptr++;
    }

    return m;
}
257 
258 static void to_gray(const Mat& m, unsigned char* gray)
259 {
260  const float* ptr = m;
261 
262  int size = m.w * m.h;
263 
264 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
265 
266  int remain = size;
267 
268  for (; remain>0; remain--)
269  {
270  *gray = SATURATE_CAST_UCHAR(*ptr);
271 
272  gray++;
273  ptr++;
274  }
275 
276 #undef SATURATE_CAST_UCHAR
277 }
278 
// Convert a packed RGBA image (4 interleaved bytes per pixel) into a planar
// float Mat with channels (R, G, B, A); bytes convert to float unchanged.
// Returns an empty Mat when allocation fails.
static Mat from_rgba(const unsigned char* rgba, int w, int h, Allocator* allocator)
{
    Mat m(w, h, 4, 4u, allocator); // 4 channels, 4-byte (float) elements
    if (m.empty())
        return m;

    float* ptr0 = m.channel(0); // R plane
    float* ptr1 = m.channel(1); // G plane
    float* ptr2 = m.channel(2); // B plane
    float* ptr3 = m.channel(3); // A plane

    int size = w * h;

#if __ARM_NEON
    int nn = size >> 3;            // 8-pixel vectorized iterations
    int remain = size - (nn << 3); // scalar tail
#else
    int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
    // AArch64 intrinsics: vld4 de-interleaves 8 RGBA pixels. Widened u16
    // values are reinterpreted as s16 — safe because they are <= 255 — then
    // widened to s32 and converted to f32.
    for (; nn>0; nn--)
    {
        uint8x8x4_t _rgba = vld4_u8(rgba);
        int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
        int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
        int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));
        int16x8_t _a16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[3]));

        float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
        float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
        float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
        float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
        float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
        float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));
        float32x4_t _alow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_a16)));
        float32x4_t _ahigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_a16)));

        vst1q_f32(ptr0, _rlow);
        vst1q_f32(ptr0+4, _rhigh);
        vst1q_f32(ptr1, _glow);
        vst1q_f32(ptr1+4, _ghigh);
        vst1q_f32(ptr2, _blow);
        vst1q_f32(ptr2+4, _bhigh);
        vst1q_f32(ptr3, _alow);
        vst1q_f32(ptr3+4, _ahigh);

        rgba += 4*8;
        ptr0 += 8;
        ptr1 += 8;
        ptr2 += 8;
        ptr3 += 8;
    }
#else
    // ARMv7 inline assembly: same 8-pixel de-interleave + widen + convert.
    if (nn > 0)
    {
        asm volatile(
        "0: \n"
        "pld [%1, #256] \n"
        "vld4.u8 {d0-d3}, [%1]! \n"
        "vmovl.u8 q8, d0 \n"
        "vmovl.u8 q9, d1 \n"
        "vmovl.u8 q10, d2 \n"
        "vmovl.u8 q11, d3 \n"
        "vmovl.u16 q0, d16 \n"
        "vmovl.u16 q1, d17 \n"
        "vmovl.u16 q2, d18 \n"
        "vmovl.u16 q3, d19 \n"
        "vmovl.u16 q8, d20 \n"
        "vmovl.u16 q9, d21 \n"
        "vmovl.u16 q10, d22 \n"
        "vmovl.u16 q11, d23 \n"
        "vcvt.f32.u32 q0, q0 \n"
        "vcvt.f32.u32 q1, q1 \n"
        "vcvt.f32.u32 q2, q2 \n"
        "vcvt.f32.u32 q3, q3 \n"
        "vcvt.f32.u32 q8, q8 \n"
        "vcvt.f32.u32 q9, q9 \n"
        "subs %0, #1 \n"
        "vst1.f32 {d0-d3}, [%2 :128]! \n"
        "vcvt.f32.u32 q10, q10 \n"
        "vcvt.f32.u32 q11, q11 \n"
        "vst1.f32 {d4-d7}, [%3 :128]! \n"
        "vst1.f32 {d16-d19}, [%4 :128]!\n"
        "vst1.f32 {d20-d23}, [%5 :128]!\n"
        "bne 0b \n"
        : "=r"(nn), // %0
        "=r"(rgba), // %1
        "=r"(ptr0), // %2
        "=r"(ptr1), // %3
        "=r"(ptr2), // %4
        "=r"(ptr3) // %5
        : "0"(nn),
        "1"(rgba),
        "2"(ptr0),
        "3"(ptr1),
        "4"(ptr2),
        "5"(ptr3)
        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
        );
    }
#endif // __aarch64__
#endif // __ARM_NEON
    // Scalar tail (and the whole image when NEON is unavailable).
    for (; remain>0; remain--)
    {
        *ptr0 = rgba[0];
        *ptr1 = rgba[1];
        *ptr2 = rgba[2];
        *ptr3 = rgba[3];

        rgba += 4;
        ptr0++;
        ptr1++;
        ptr2++;
        ptr3++;
    }

    return m;
}
399 
400 static void to_rgba(const Mat& m, unsigned char* rgba)
401 {
402  const float* ptr0 = m.channel(0);
403  const float* ptr1 = m.channel(1);
404  const float* ptr2 = m.channel(2);
405  const float* ptr3 = m.channel(3);
406 
407  int size = m.w * m.h;
408 
409 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
410 
411  int remain = size;
412 
413  for (; remain>0; remain--)
414  {
415  rgba[0] = SATURATE_CAST_UCHAR(*ptr0);
416  rgba[1] = SATURATE_CAST_UCHAR(*ptr1);
417  rgba[2] = SATURATE_CAST_UCHAR(*ptr2);
418  rgba[3] = SATURATE_CAST_UCHAR(*ptr3);
419 
420  rgba += 4;
421  ptr0++;
422  ptr1++;
423  ptr2++;
424  ptr3++;
425  }
426 
427 #undef SATURATE_CAST_UCHAR
428 }
429 
// Convert a packed RGB image into a planar float Mat with the channel order
// swapped to (B, G, R): R bytes go to channel 2 and B bytes to channel 0.
// Identical to from_rgb except for the destination-plane swap.
// Returns an empty Mat when allocation fails.
static Mat from_rgb2bgr(const unsigned char* rgb, int w, int h, Allocator* allocator)
{
    Mat m(w, h, 3, 4u, allocator); // 4u = element size in bytes (float)
    if (m.empty())
        return m;

    float* ptr0 = m.channel(0); // B plane (receives source R->B swap)
    float* ptr1 = m.channel(1); // G plane
    float* ptr2 = m.channel(2); // R plane

    int size = w * h;

#if __ARM_NEON
    int nn = size >> 3;            // 8-pixel vectorized iterations
    int remain = size - (nn << 3); // scalar tail
#else
    int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
    // AArch64 intrinsics: same widen/convert as from_rgb, but R is stored to
    // ptr2 and B to ptr0 (channel swap happens at the store).
    for (; nn>0; nn--)
    {
        uint8x8x3_t _rgb = vld3_u8(rgb);
        uint16x8_t _r16 = vmovl_u8(_rgb.val[0]);
        uint16x8_t _g16 = vmovl_u8(_rgb.val[1]);
        uint16x8_t _b16 = vmovl_u8(_rgb.val[2]);

        float32x4_t _rlow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_r16)));
        float32x4_t _rhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_r16)));
        float32x4_t _glow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_g16)));
        float32x4_t _ghigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_g16)));
        float32x4_t _blow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_b16)));
        float32x4_t _bhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_b16)));

        vst1q_f32(ptr2, _rlow);
        vst1q_f32(ptr2+4, _rhigh);
        vst1q_f32(ptr1, _glow);
        vst1q_f32(ptr1+4, _ghigh);
        vst1q_f32(ptr0, _blow);
        vst1q_f32(ptr0+4, _bhigh);

        rgb += 3*8;
        ptr0 += 8;
        ptr1 += 8;
        ptr2 += 8;
    }
#else
    // ARMv7 inline assembly: note the stores target %4 / %3 / %2
    // (ptr2 / ptr1 / ptr0) to realize the RGB -> BGR swap.
    if (nn > 0)
    {
        asm volatile(
        "0: \n"
        "pld [%1, #256] \n"
        "vld3.u8 {d0-d2}, [%1]! \n"
        "vmovl.u8 q8, d0 \n"
        "vmovl.u8 q9, d1 \n"
        "vmovl.u8 q10, d2 \n"
        "vmovl.u16 q0, d16 \n"
        "vmovl.u16 q1, d17 \n"
        "vmovl.u16 q2, d18 \n"
        "vmovl.u16 q3, d19 \n"
        "vmovl.u16 q8, d20 \n"
        "vmovl.u16 q9, d21 \n"
        "vcvt.f32.u32 q0, q0 \n"
        "vcvt.f32.u32 q1, q1 \n"
        "vcvt.f32.u32 q2, q2 \n"
        "vcvt.f32.u32 q3, q3 \n"
        "vcvt.f32.u32 q8, q8 \n"
        "subs %0, #1 \n"
        "vst1.f32 {d0-d3}, [%4 :128]! \n"
        "vcvt.f32.u32 q9, q9 \n"
        "vst1.f32 {d4-d7}, [%3 :128]! \n"
        "vst1.f32 {d16-d19}, [%2 :128]!\n"
        "bne 0b \n"
        : "=r"(nn), // %0
        "=r"(rgb), // %1
        "=r"(ptr0), // %2
        "=r"(ptr1), // %3
        "=r"(ptr2) // %4
        : "0"(nn),
        "1"(rgb),
        "2"(ptr0),
        "3"(ptr1),
        "4"(ptr2)
        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
        );
    }
#endif // __aarch64__
#endif // __ARM_NEON
    // Scalar tail (and the whole image when NEON is unavailable).
    for (; remain>0; remain--)
    {
        *ptr0 = rgb[2];
        *ptr1 = rgb[1];
        *ptr2 = rgb[0];

        rgb += 3;
        ptr0++;
        ptr1++;
        ptr2++;
    }

    return m;
}
533 
534 static void to_bgr2rgb(const Mat& m, unsigned char* rgb)
535 {
536  const float* ptr0 = m.channel(0);
537  const float* ptr1 = m.channel(1);
538  const float* ptr2 = m.channel(2);
539 
540  int size = m.w * m.h;
541 
542 #define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
543 
544  int remain = size;
545 
546  for (; remain>0; remain--)
547  {
548  rgb[2] = SATURATE_CAST_UCHAR(*ptr0);
549  rgb[1] = SATURATE_CAST_UCHAR(*ptr1);
550  rgb[0] = SATURATE_CAST_UCHAR(*ptr2);
551 
552  rgb += 3;
553  ptr0++;
554  ptr1++;
555  ptr2++;
556  }
557 
558 #undef SATURATE_CAST_UCHAR
559 }
560 
// Convert a packed RGB image into a single-channel float luma Mat using the
// fixed-point approximation Y = (77*R + 150*G + 29*B) >> 8, i.e.
// Y ≈ 0.299R + 0.587G + 0.114B. Returns an empty Mat when allocation fails.
static Mat from_rgb2gray(const unsigned char* rgb, int w, int h, Allocator* allocator)
{
    // coeffs for r g b = 0.299f, 0.587f, 0.114f
    // 8-bit fixed point: coefficient * 256, summed products fit in 16 bits.
    const unsigned char Y_shift = 8;//14
    const unsigned char R2Y = 77;
    const unsigned char G2Y = 150;
    const unsigned char B2Y = 29;

    Mat m(w, h, 1, 4u, allocator); // single float channel
    if (m.empty())
        return m;

    float* ptr = m;

    int size = w * h;

#if __ARM_NEON
    int nn = size >> 3;            // 8-pixel vectorized iterations
    int remain = size - (nn << 3); // scalar tail
#else
    int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
    // AArch64 intrinsics: widening multiply-accumulate of the three planes
    // into a u16 luma, shift down by Y_shift, then convert to f32.
    uint8x8_t _R2Y = vdup_n_u8(R2Y);
    uint8x8_t _G2Y = vdup_n_u8(G2Y);
    uint8x8_t _B2Y = vdup_n_u8(B2Y);
    for (; nn>0; nn--)
    {
        uint8x8x3_t _rgb = vld3_u8(rgb);

        uint16x8_t _y16 = vmull_u8(_rgb.val[0], _R2Y);
        _y16 = vmlal_u8(_y16, _rgb.val[1], _G2Y);
        _y16 = vmlal_u8(_y16, _rgb.val[2], _B2Y);
        _y16 = vshrq_n_u16(_y16, Y_shift);

        float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
        float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));

        vst1q_f32(ptr, _ylow);
        vst1q_f32(ptr+4, _yhigh);

        rgb += 3*8;
        ptr += 8;
    }
#else
    // ARMv7 inline assembly: coefficients are broadcast into d16-d18 once
    // before the loop (operands %6-%8), then the same MAC + shift + convert.
    if (nn > 0)
    {
        asm volatile(
        "vdup.u8 d16, %6 \n"
        "vdup.u8 d17, %7 \n"
        "vdup.u8 d18, %8 \n"
        "0: \n"
        "pld [%1, #256] \n"
        "vld3.u8 {d0-d2}, [%1]! \n"
        "vmull.u8 q2, d0, d16 \n"
        "vmlal.u8 q2, d1, d17 \n"
        "vmlal.u8 q2, d2, d18 \n"
        "vshr.u16 q2, q2, #8 \n" // Y_shift
        "vmovl.u16 q0, d4 \n"
        "vmovl.u16 q1, d5 \n"
        "vcvt.f32.u32 q0, q0 \n"
        "vcvt.f32.u32 q1, q1 \n"
        "subs %0, #1 \n"
        "vst1.f32 {d0-d3}, [%2 :128]! \n"
        "bne 0b \n"
        : "=r"(nn), // %0
        "=r"(rgb), // %1
        "=r"(ptr) // %2
        : "0"(nn),
        "1"(rgb),
        "2"(ptr),
        "r"(R2Y), // %6
        "r"(G2Y), // %7
        "r"(B2Y) // %8
        : "cc", "memory", "q0", "q1", "q2", "q8", "q9"
        );
    }
#endif // __aarch64__
#endif // __ARM_NEON
    // Scalar tail (and the whole image when NEON is unavailable).
    for (; remain>0; remain--)
    {
        *ptr = (rgb[0] * R2Y + rgb[1] * G2Y + rgb[2] * B2Y) >> Y_shift;

        rgb += 3;
        ptr++;
    }

    return m;
}
652 
// Convert a packed BGR image into a single-channel float luma Mat with the
// same fixed-point weights as from_rgb2gray (R coefficient applied to byte 2,
// B coefficient to byte 0). Returns an empty Mat when allocation fails.
static Mat from_bgr2gray(const unsigned char* bgr, int w, int h, Allocator* allocator)
{
    // coeffs for r g b = 0.299f, 0.587f, 0.114f
    // 8-bit fixed point: coefficient * 256, summed products fit in 16 bits.
    const unsigned char Y_shift = 8;//14
    const unsigned char R2Y = 77;
    const unsigned char G2Y = 150;
    const unsigned char B2Y = 29;

    Mat m(w, h, 1, 4u, allocator); // single float channel
    if (m.empty())
        return m;

    float* ptr = m;

    int size = w * h;

#if __ARM_NEON
    int nn = size >> 3;            // 8-pixel vectorized iterations
    int remain = size - (nn << 3); // scalar tail
#else
    int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
    // AArch64 intrinsics: val[2] (R) gets the R weight, val[0] (B) the B
    // weight — the only difference from the RGB variant.
    uint8x8_t _R2Y = vdup_n_u8(R2Y);
    uint8x8_t _G2Y = vdup_n_u8(G2Y);
    uint8x8_t _B2Y = vdup_n_u8(B2Y);
    for (; nn>0; nn--)
    {
        uint8x8x3_t _rgb = vld3_u8(bgr);

        uint16x8_t _y16 = vmull_u8(_rgb.val[2], _R2Y);
        _y16 = vmlal_u8(_y16, _rgb.val[1], _G2Y);
        _y16 = vmlal_u8(_y16, _rgb.val[0], _B2Y);
        _y16 = vshrq_n_u16(_y16, Y_shift);

        float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
        float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));

        vst1q_f32(ptr, _ylow);
        vst1q_f32(ptr+4, _yhigh);

        bgr += 3*8;
        ptr += 8;
    }
#else
    // ARMv7 inline assembly: d2 (R) is multiplied by the R weight and d0 (B)
    // by the B weight, mirroring the intrinsic path above.
    if (nn > 0)
    {
        asm volatile(
        "vdup.u8 d16, %6 \n"
        "vdup.u8 d17, %7 \n"
        "vdup.u8 d18, %8 \n"
        "0: \n"
        "pld [%1, #256] \n"
        "vld3.u8 {d0-d2}, [%1]! \n"
        "vmull.u8 q2, d2, d16 \n"
        "vmlal.u8 q2, d1, d17 \n"
        "vmlal.u8 q2, d0, d18 \n"
        "vshr.u16 q2, q2, #8 \n" // Y_shift
        "vmovl.u16 q0, d4 \n"
        "vmovl.u16 q1, d5 \n"
        "vcvt.f32.u32 q0, q0 \n"
        "vcvt.f32.u32 q1, q1 \n"
        "subs %0, #1 \n"
        "vst1.f32 {d0-d3}, [%2 :128]! \n"
        "bne 0b \n"
        : "=r"(nn), // %0
        "=r"(bgr), // %1
        "=r"(ptr) // %2
        : "0"(nn),
        "1"(bgr),
        "2"(ptr),
        "r"(R2Y), // %6
        "r"(G2Y), // %7
        "r"(B2Y) // %8
        : "cc", "memory", "q0", "q1", "q2", "q8", "q9"
        );
    }
#endif // __aarch64__
#endif // __ARM_NEON
    // Scalar tail (and the whole image when NEON is unavailable).
    for (; remain>0; remain--)
    {
        *ptr = (bgr[2] * R2Y + bgr[1] * G2Y + bgr[0] * B2Y) >> Y_shift;

        bgr += 3;
        ptr++;
    }

    return m;
}
744 
// Convert an 8-bit grayscale image into a 3-channel planar float Mat by
// replicating the gray value into all three channels.
// Returns an empty Mat when allocation fails.
static Mat from_gray2rgb(const unsigned char* gray, int w, int h, Allocator* allocator)
{
    Mat m(w, h, 3, 4u, allocator); // 4u = element size in bytes (float)
    if (m.empty())
        return m;

    float* ptr0 = m.channel(0);
    float* ptr1 = m.channel(1);
    float* ptr2 = m.channel(2);

    int size = w * h;

#if __ARM_NEON
    int nn = size >> 4;            // 16-pixel vectorized iterations
    int remain = size - (nn << 4); // scalar tail
#else
    int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
    // AArch64 intrinsics: widen 16 gray bytes to f32 once, then store the
    // same four vectors into each of the three planes.
    for (; nn>0; nn--)
    {
        uint8x16_t _gray = vld1q_u8(gray);
        uint16x8_t _gray16_0 = vmovl_u8(vget_low_u8(_gray));
        uint16x8_t _gray16_1 = vmovl_u8(vget_high_u8(_gray));

        float32x4_t _graylow_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_0)));
        float32x4_t _grayhigh_0 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_0)));
        float32x4_t _graylow_1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_gray16_1)));
        float32x4_t _grayhigh_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_gray16_1)));

        vst1q_f32(ptr0, _graylow_0);
        vst1q_f32(ptr0+4, _grayhigh_0);
        vst1q_f32(ptr0+8, _graylow_1);
        vst1q_f32(ptr0+12, _grayhigh_1);

        vst1q_f32(ptr1, _graylow_0);
        vst1q_f32(ptr1+4, _grayhigh_0);
        vst1q_f32(ptr1+8, _graylow_1);
        vst1q_f32(ptr1+12, _grayhigh_1);

        vst1q_f32(ptr2, _graylow_0);
        vst1q_f32(ptr2+4, _grayhigh_0);
        vst1q_f32(ptr2+8, _graylow_1);
        vst1q_f32(ptr2+12, _grayhigh_1);

        gray += 16;
        ptr0 += 16;
        ptr1 += 16;
        ptr2 += 16;
    }
#else
    // ARMv7 inline assembly: one widen/convert, three stores per plane.
    if (nn > 0)
    {
        asm volatile(
        "0: \n"
        "pld [%1, #128] \n"
        "vld1.u8 {d0,d1}, [%1]! \n"
        "vmovl.u8 q8, d0 \n"
        "vmovl.u8 q9, d1 \n"
        "vmovl.u16 q0, d16 \n"
        "vmovl.u16 q1, d17 \n"
        "vmovl.u16 q2, d18 \n"
        "vmovl.u16 q3, d19 \n"
        "vcvt.f32.u32 q0, q0 \n"
        "vcvt.f32.u32 q1, q1 \n"
        "vcvt.f32.u32 q2, q2 \n"
        "vcvt.f32.u32 q3, q3 \n"
        "subs %0, #1 \n"
        "vst1.f32 {d0-d3}, [%2 :128]! \n"
        "vst1.f32 {d4-d7}, [%2 :128]! \n"
        "vst1.f32 {d0-d3}, [%3 :128]! \n"
        "vst1.f32 {d4-d7}, [%3 :128]! \n"
        "vst1.f32 {d0-d3}, [%4 :128]! \n"
        "vst1.f32 {d4-d7}, [%4 :128]! \n"
        "bne 0b \n"
        : "=r"(nn), // %0
        "=r"(gray), // %1
        "=r"(ptr0), // %2
        "=r"(ptr1), // %3
        "=r"(ptr2) // %4
        : "0"(nn),
        "1"(gray),
        "2"(ptr0),
        "3"(ptr1),
        "4"(ptr2)
        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
        );
    }
#endif // __aarch64__
#endif // __ARM_NEON
    // Scalar tail (and the whole image when NEON is unavailable).
    for (; remain>0; remain--)
    {
        *ptr0 = *gray;
        *ptr1 = *gray;
        *ptr2 = *gray;

        gray++;
        ptr0++;
        ptr1++;
        ptr2++;
    }

    return m;
}
851 
// Convert a packed RGBA image into a planar 3-channel float Mat (R, G, B),
// discarding the alpha byte. Returns an empty Mat when allocation fails.
static Mat from_rgba2rgb(const unsigned char* rgba, int w, int h, Allocator* allocator)
{
    Mat m(w, h, 3, 4u, allocator); // 4u = element size in bytes (float)
    if (m.empty())
        return m;

    float* ptr0 = m.channel(0); // R plane
    float* ptr1 = m.channel(1); // G plane
    float* ptr2 = m.channel(2); // B plane

    int size = w * h;

#if __ARM_NEON
    int nn = size >> 3;            // 8-pixel vectorized iterations
    int remain = size - (nn << 3); // scalar tail
#else
    int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
    // AArch64 intrinsics: vld4 de-interleaves 8 RGBA pixels; only R, G, B are
    // widened (values <= 255, so the s16 reinterpret is safe) and stored.
    for (; nn>0; nn--)
    {
        uint8x8x4_t _rgba = vld4_u8(rgba);
        int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
        int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
        int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));

        float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
        float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
        float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
        float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
        float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
        float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));

        vst1q_f32(ptr0, _rlow);
        vst1q_f32(ptr0+4, _rhigh);
        vst1q_f32(ptr1, _glow);
        vst1q_f32(ptr1+4, _ghigh);
        vst1q_f32(ptr2, _blow);
        vst1q_f32(ptr2+4, _bhigh);

        rgba += 4*8;
        ptr0 += 8;
        ptr1 += 8;
        ptr2 += 8;
    }
#else
    // ARMv7 inline assembly: loads 4 de-interleaved planes (d0-d3) but only
    // converts and stores R, G, B; alpha in d3 is ignored.
    if (nn > 0)
    {
        asm volatile(
        "0: \n"
        "pld [%1, #256] \n"
        "vld4.u8 {d0-d3}, [%1]! \n"
        "vmovl.u8 q8, d0 \n"
        "vmovl.u8 q9, d1 \n"
        "vmovl.u8 q10, d2 \n"
        "vmovl.u16 q0, d16 \n"
        "vmovl.u16 q1, d17 \n"
        "vmovl.u16 q2, d18 \n"
        "vmovl.u16 q3, d19 \n"
        "vmovl.u16 q8, d20 \n"
        "vmovl.u16 q9, d21 \n"
        "vcvt.f32.u32 q0, q0 \n"
        "vcvt.f32.u32 q1, q1 \n"
        "vcvt.f32.u32 q2, q2 \n"
        "vcvt.f32.u32 q3, q3 \n"
        "vcvt.f32.u32 q8, q8 \n"
        "subs %0, #1 \n"
        "vst1.f32 {d0-d3}, [%2 :128]! \n"
        "vcvt.f32.u32 q9, q9 \n"
        "vst1.f32 {d4-d7}, [%3 :128]! \n"
        "vst1.f32 {d16-d19}, [%4 :128]!\n"
        "bne 0b \n"
        : "=r"(nn), // %0
        "=r"(rgba), // %1
        "=r"(ptr0), // %2
        "=r"(ptr1), // %3
        "=r"(ptr2) // %4
        : "0"(nn),
        "1"(rgba),
        "2"(ptr0),
        "3"(ptr1),
        "4"(ptr2)
        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9"
        );
    }
#endif // __aarch64__
#endif // __ARM_NEON
    // Scalar tail (and the whole image when NEON is unavailable).
    for (; remain>0; remain--)
    {
        *ptr0 = rgba[0];
        *ptr1 = rgba[1];
        *ptr2 = rgba[2];

        rgba += 4;
        ptr0++;
        ptr1++;
        ptr2++;
    }

    return m;
}
955 
// Convert a packed RGBA image into a planar 3-channel float Mat in BGR order:
// alpha is discarded, R is stored in channel 2 and B in channel 0.
// Returns an empty Mat when allocation fails.
static Mat from_rgba2bgr(const unsigned char* rgba, int w, int h, Allocator* allocator)
{
    Mat m(w, h, 3, 4u, allocator); // 4u = element size in bytes (float)
    if (m.empty())
        return m;

    float* ptr0 = m.channel(0); // B plane
    float* ptr1 = m.channel(1); // G plane
    float* ptr2 = m.channel(2); // R plane

    int size = w * h;

#if __ARM_NEON
    int nn = size >> 3;            // 8-pixel vectorized iterations
    int remain = size - (nn << 3); // scalar tail
#else
    int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
    // AArch64 intrinsics: like from_rgba2rgb, but R is stored to ptr2 and B
    // to ptr0 (channel swap at the store; alpha ignored).
    for (; nn>0; nn--)
    {
        uint8x8x4_t _rgba = vld4_u8(rgba);
        int16x8_t _r16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[0]));
        int16x8_t _g16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[1]));
        int16x8_t _b16 = vreinterpretq_s16_u16(vmovl_u8(_rgba.val[2]));

        float32x4_t _rlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_r16)));
        float32x4_t _rhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_r16)));
        float32x4_t _glow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_g16)));
        float32x4_t _ghigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_g16)));
        float32x4_t _blow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(_b16)));
        float32x4_t _bhigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(_b16)));

        vst1q_f32(ptr2, _rlow);
        vst1q_f32(ptr2+4, _rhigh);
        vst1q_f32(ptr1, _glow);
        vst1q_f32(ptr1+4, _ghigh);
        vst1q_f32(ptr0, _blow);
        vst1q_f32(ptr0+4, _bhigh);

        rgba += 4*8;
        ptr0 += 8;
        ptr1 += 8;
        ptr2 += 8;
    }
#else
    // ARMv7 inline assembly: stores target %4 / %3 / %2 (ptr2 / ptr1 / ptr0)
    // to realize the RGBA -> BGR swap; alpha in d3 is ignored.
    if (nn > 0)
    {
        asm volatile(
        "0: \n"
        "pld [%1, #256] \n"
        "vld4.u8 {d0-d3}, [%1]! \n"
        "vmovl.u8 q8, d0 \n"
        "vmovl.u8 q9, d1 \n"
        "vmovl.u8 q10, d2 \n"
        "vmovl.u16 q0, d16 \n"
        "vmovl.u16 q1, d17 \n"
        "vmovl.u16 q2, d18 \n"
        "vmovl.u16 q3, d19 \n"
        "vmovl.u16 q8, d20 \n"
        "vmovl.u16 q9, d21 \n"
        "vcvt.f32.u32 q0, q0 \n"
        "vcvt.f32.u32 q1, q1 \n"
        "vcvt.f32.u32 q2, q2 \n"
        "vcvt.f32.u32 q3, q3 \n"
        "vcvt.f32.u32 q8, q8 \n"
        "subs %0, #1 \n"
        "vst1.f32 {d0-d3}, [%4 :128]! \n"
        "vcvt.f32.u32 q9, q9 \n"
        "vst1.f32 {d4-d7}, [%3 :128]! \n"
        "vst1.f32 {d16-d19}, [%2 :128]!\n"
        "bne 0b \n"
        : "=r"(nn), // %0
        "=r"(rgba), // %1
        "=r"(ptr0), // %2
        "=r"(ptr1), // %3
        "=r"(ptr2) // %4
        : "0"(nn),
        "1"(rgba),
        "2"(ptr0),
        "3"(ptr1),
        "4"(ptr2)
        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
        );
    }
#endif // __aarch64__
#endif // __ARM_NEON
    // Scalar tail (and the whole image when NEON is unavailable).
    for (; remain>0; remain--)
    {
        *ptr0 = rgba[2];
        *ptr1 = rgba[1];
        *ptr2 = rgba[0];

        rgba += 4;
        ptr0++;
        ptr1++;
        ptr2++;
    }

    return m;
}
1059 
// Convert a packed RGBA image into a single-channel float luma Mat using the
// fixed-point approximation Y = (77*R + 150*G + 29*B) >> 8 ≈
// 0.299R + 0.587G + 0.114B; the alpha byte is ignored.
// Returns an empty Mat when allocation fails.
static Mat from_rgba2gray(const unsigned char* rgba, int w, int h, Allocator* allocator)
{
    // coeffs for r g b = 0.299f, 0.587f, 0.114f
    // 8-bit fixed point: coefficient * 256, summed products fit in 16 bits.
    const unsigned char Y_shift = 8;//14
    const unsigned char R2Y = 77;
    const unsigned char G2Y = 150;
    const unsigned char B2Y = 29;

    Mat m(w, h, 1, 4u, allocator); // single float channel
    if (m.empty())
        return m;

    float* ptr = m;

    int size = w * h;

#if __ARM_NEON
    int nn = size >> 3;            // 8-pixel vectorized iterations
    int remain = size - (nn << 3); // scalar tail
#else
    int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
    // AArch64 intrinsics: widening multiply-accumulate of R/G/B into a u16
    // luma, shift by Y_shift, convert to f32; alpha (val[3]) unused.
    uint8x8_t _R2Y = vdup_n_u8(R2Y);
    uint8x8_t _G2Y = vdup_n_u8(G2Y);
    uint8x8_t _B2Y = vdup_n_u8(B2Y);
    for (; nn>0; nn--)
    {
        uint8x8x4_t _rgba = vld4_u8(rgba);

        uint16x8_t _y16 = vmull_u8(_rgba.val[0], _R2Y);
        _y16 = vmlal_u8(_y16, _rgba.val[1], _G2Y);
        _y16 = vmlal_u8(_y16, _rgba.val[2], _B2Y);
        _y16 = vshrq_n_u16(_y16, Y_shift);

        float32x4_t _ylow = vcvtq_f32_u32(vmovl_u16(vget_low_u16(_y16)));
        float32x4_t _yhigh = vcvtq_f32_u32(vmovl_u16(vget_high_u16(_y16)));

        vst1q_f32(ptr, _ylow);
        vst1q_f32(ptr+4, _yhigh);

        rgba += 4*8;
        ptr += 8;
    }
#else
    // ARMv7 inline assembly: coefficients broadcast once into d16-d18
    // (operands %6-%8); alpha in d3 is loaded but never used.
    if (nn > 0)
    {
        asm volatile(
        "vdup.u8 d16, %6 \n"
        "vdup.u8 d17, %7 \n"
        "vdup.u8 d18, %8 \n"
        "0: \n"
        "pld [%1, #256] \n"
        "vld4.u8 {d0-d3}, [%1]! \n"
        "vmull.u8 q2, d0, d16 \n"
        "vmlal.u8 q2, d1, d17 \n"
        "vmlal.u8 q2, d2, d18 \n"
        "vshr.u16 q2, q2, #8 \n" // Y_shift
        "vmovl.u16 q0, d4 \n"
        "vmovl.u16 q1, d5 \n"
        "vcvt.f32.u32 q0, q0 \n"
        "vcvt.f32.u32 q1, q1 \n"
        "subs %0, #1 \n"
        "vst1.f32 {d0-d3}, [%2 :128]! \n"
        "bne 0b \n"
        : "=r"(nn), // %0
        "=r"(rgba), // %1
        "=r"(ptr) // %2
        : "0"(nn),
        "1"(rgba),
        "2"(ptr),
        "r"(R2Y), // %6
        "r"(G2Y), // %7
        "r"(B2Y) // %8
        : "cc", "memory", "q0", "q1", "q2", "q8", "q9"
        );
    }
#endif // __aarch64__
#endif // __ARM_NEON
    // Scalar tail (and the whole image when NEON is unavailable).
    for (; remain>0; remain--)
    {
        *ptr = (rgba[0] * R2Y + rgba[1] * G2Y + rgba[2] * B2Y) >> Y_shift;

        rgba += 4;
        ptr++;
    }

    return m;
}
1151 
1152 inline void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
1153 {
1154  const int INTER_RESIZE_COEF_BITS=11;
1155  const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
1156 // const int ONE=INTER_RESIZE_COEF_SCALE;
1157 
1158  double scale_x = (double)srcw / w;
1159  double scale_y = (double)srch / h;
1160 
1161  int* buf = new int[w + h + w + h];
1162 
1163  int* xofs = buf;//new int[w];
1164  int* yofs = buf + w;//new int[h];
1165 
1166  short* ialpha = (short*)(buf + w + h);//new short[w * 2];
1167  short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
1168 
1169  float fx;
1170  float fy;
1171  int sx;
1172  int sy;
1173 
1174 #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
1175 
1176  for (int dx = 0; dx < w; dx++)
1177  {
1178  fx = (float)((dx + 0.5) * scale_x - 0.5);
1179  sx = floor(fx);
1180  fx -= sx;
1181 
1182  if (sx < 0)
1183  {
1184  sx = 0;
1185  fx = 0.f;
1186  }
1187  if (sx >= srcw - 1)
1188  {
1189  sx = srcw - 2;
1190  fx = 1.f;
1191  }
1192 
1193  xofs[dx] = sx*3;
1194 
1195  float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
1196  float a1 = fx * INTER_RESIZE_COEF_SCALE;
1197 
1198  ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
1199  ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
1200  }
1201 
1202  for (int dy = 0; dy < h; dy++)
1203  {
1204  fy = (float)((dy + 0.5) * scale_y - 0.5);
1205  sy = floor(fy);
1206  fy -= sy;
1207 
1208  if (sy < 0)
1209  {
1210  sy = 0;
1211  fy = 0.f;
1212  }
1213  if (sy >= srch - 1)
1214  {
1215  sy = srch - 2;
1216  fy = 1.f;
1217  }
1218 
1219  yofs[dy] = sy*3;
1220 
1221  float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
1222  float b1 = fy * INTER_RESIZE_COEF_SCALE;
1223 
1224  ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
1225  ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
1226  }
1227 
1228 #undef SATURATE_CAST_SHORT
1229 
1230  // loop body
1231  Mat rowsbuf0((w*3 >> 1) + 3);
1232  Mat rowsbuf1((w*3 >> 1) + 3);
1233  short* rows0 = (short*)rowsbuf0.data;
1234  short* rows1 = (short*)rowsbuf1.data;
1235 
1236  int prev_sy1 = -1;
1237 
1238  for (int dy = 0; dy < h; dy++ )
1239  {
1240  int sy = yofs[dy];
1241 
1242  if (sy == prev_sy1)
1243  {
1244  // hresize one row
1245  short* rows0_old = rows0;
1246  rows0 = rows1;
1247  rows1 = rows0_old;
1248  const unsigned char *S1 = src + srcw * (sy+3);
1249 
1250  const short* ialphap = ialpha;
1251  short* rows1p = rows1;
1252  for ( int dx = 0; dx < w; dx++ )
1253  {
1254  int sx = xofs[dx];
1255  short a0 = ialphap[0];
1256  short a1 = ialphap[1];
1257 
1258  const unsigned char* S1p = S1 + sx;
1259 #if __ARM_NEON
1260  int16x4_t _a0 = vdup_n_s16(a0);
1261  int16x4_t _a1 = vdup_n_s16(a1);
1262  uint8x8_t _S1 = vld1_u8(S1p);
1263  int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
1264  int16x4_t _S1low = vget_low_s16(_S116);
1265  int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
1266  int32x4_t _rows1 = vmull_s16(_S1low, _a0);
1267  _rows1 = vmlal_s16(_rows1, _S1high, _a1);
1268  int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
1269  vst1_s16(rows1p, _rows1_sr4);
1270 #else
1271  rows1p[0] = (S1p[0]*a0 + S1p[3]*a1) >> 4;
1272  rows1p[1] = (S1p[1]*a0 + S1p[4]*a1) >> 4;
1273  rows1p[2] = (S1p[2]*a0 + S1p[5]*a1) >> 4;
1274 #endif // __ARM_NEON
1275 
1276  ialphap += 2;
1277  rows1p += 3;
1278  }
1279  }
1280  else
1281  {
1282  // hresize two rows
1283  const unsigned char *S0 = src + srcw * (sy);
1284  const unsigned char *S1 = src + srcw * (sy+3);
1285 
1286  const short* ialphap = ialpha;
1287  short* rows0p = rows0;
1288  short* rows1p = rows1;
1289  for ( int dx = 0; dx < w; dx++ )
1290  {
1291  int sx = xofs[dx];
1292  short a0 = ialphap[0];
1293  short a1 = ialphap[1];
1294 
1295  const unsigned char* S0p = S0 + sx;
1296  const unsigned char* S1p = S1 + sx;
1297 #if __ARM_NEON
1298  int16x4_t _a0 = vdup_n_s16(a0);
1299  int16x4_t _a1 = vdup_n_s16(a1);
1300  uint8x8_t _S0 = vld1_u8(S0p);
1301  uint8x8_t _S1 = vld1_u8(S1p);
1302  int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
1303  int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
1304  int16x4_t _S0low = vget_low_s16(_S016);
1305  int16x4_t _S1low = vget_low_s16(_S116);
1306  int16x4_t _S0high = vext_s16(_S0low, vget_high_s16(_S016), 3);
1307  int16x4_t _S1high = vext_s16(_S1low, vget_high_s16(_S116), 3);
1308  int32x4_t _rows0 = vmull_s16(_S0low, _a0);
1309  int32x4_t _rows1 = vmull_s16(_S1low, _a0);
1310  _rows0 = vmlal_s16(_rows0, _S0high, _a1);
1311  _rows1 = vmlal_s16(_rows1, _S1high, _a1);
1312  int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4);
1313  int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
1314  vst1_s16(rows0p, _rows0_sr4);
1315  vst1_s16(rows1p, _rows1_sr4);
1316 #else
1317  rows0p[0] = (S0p[0]*a0 + S0p[3]*a1) >> 4;
1318  rows0p[1] = (S0p[1]*a0 + S0p[4]*a1) >> 4;
1319  rows0p[2] = (S0p[2]*a0 + S0p[5]*a1) >> 4;
1320  rows1p[0] = (S1p[0]*a0 + S1p[3]*a1) >> 4;
1321  rows1p[1] = (S1p[1]*a0 + S1p[4]*a1) >> 4;
1322  rows1p[2] = (S1p[2]*a0 + S1p[5]*a1) >> 4;
1323 #endif // __ARM_NEON
1324 
1325  ialphap += 2;
1326  rows0p += 3;
1327  rows1p += 3;
1328  }
1329  }
1330 
1331  prev_sy1 = sy + 1;
1332 
1333  // vresize
1334  short b0 = ibeta[0];
1335  short b1 = ibeta[1];
1336 
1337  short* rows0p = rows0;
1338  short* rows1p = rows1;
1339  unsigned char* Dp = dst + w * 3 * (dy);
1340 
1341 #if __ARM_NEON
1342  int nn = (w * 3) >> 3;
1343 #else
1344  int nn = 0;
1345 #endif
1346  int remain = (w * 3) - (nn << 3);
1347 
1348 #if __ARM_NEON
1349 #if __aarch64__
1350  int16x4_t _b0 = vdup_n_s16(b0);
1351  int16x4_t _b1 = vdup_n_s16(b1);
1352  int32x4_t _v2 = vdupq_n_s32(2);
1353  for (; nn>0; nn--)
1354  {
1355  int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
1356  int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
1357  int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
1358  int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
1359 
1360  int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
1361  int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
1362  int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
1363  int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
1364 
1365  int32x4_t _acc = _v2;
1366  _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
1367  _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
1368 
1369  int32x4_t _acc_1 = _v2;
1370  _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
1371  _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
1372 
1373  int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
1374  int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
1375 
1376  uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
1377 
1378  vst1_u8(Dp, _D);
1379 
1380  Dp += 8;
1381  rows0p += 8;
1382  rows1p += 8;
1383  }
1384 #else
1385  if (nn > 0)
1386  {
1387  asm volatile(
1388  "vdup.s16 d16, %8 \n"
1389  "mov r4, #2 \n"
1390  "vdup.s16 d17, %9 \n"
1391  "vdup.s32 q12, r4 \n"
1392  "pld [%0, #128] \n"
1393  "vld1.s16 {d2-d3}, [%0 :128]!\n"
1394  "pld [%1, #128] \n"
1395  "vld1.s16 {d6-d7}, [%1 :128]!\n"
1396  "0: \n"
1397  "vmull.s16 q0, d2, d16 \n"
1398  "vmull.s16 q1, d3, d16 \n"
1399  "vorr.s32 q10, q12, q12 \n"
1400  "vorr.s32 q11, q12, q12 \n"
1401  "vmull.s16 q2, d6, d17 \n"
1402  "vmull.s16 q3, d7, d17 \n"
1403  "vsra.s32 q10, q0, #16 \n"
1404  "vsra.s32 q11, q1, #16 \n"
1405  "pld [%0, #128] \n"
1406  "vld1.s16 {d2-d3}, [%0 :128]!\n"
1407  "vsra.s32 q10, q2, #16 \n"
1408  "vsra.s32 q11, q3, #16 \n"
1409  "pld [%1, #128] \n"
1410  "vld1.s16 {d6-d7}, [%1 :128]!\n"
1411  "vshrn.s32 d20, q10, #2 \n"
1412  "vshrn.s32 d21, q11, #2 \n"
1413  "vqmovun.s16 d20, q10 \n"
1414  "vst1.8 {d20}, [%2]! \n"
1415  "subs %3, #1 \n"
1416  "bne 0b \n"
1417  "sub %0, #16 \n"
1418  "sub %1, #16 \n"
1419  : "=r"(rows0p), // %0
1420  "=r"(rows1p), // %1
1421  "=r"(Dp), // %2
1422  "=r"(nn) // %3
1423  : "0"(rows0p),
1424  "1"(rows1p),
1425  "2"(Dp),
1426  "3"(nn),
1427  "r"(b0), // %8
1428  "r"(b1) // %9
1429  : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
1430  );
1431  }
1432 #endif // __aarch64__
1433 #endif // __ARM_NEON
1434  for ( ; remain; --remain )
1435  {
1436 // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
1437  *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
1438  }
1439 
1440  ibeta += 2;
1441  }
1442 
1443  delete[] buf;
1444 }
1445 
// Bilinear resize of a 1-channel (grayscale) 8-bit image.
// src: srcw x srch input pixels; dst: caller-allocated w x h output.
//
// Fixed-point scheme: interpolation weights are scaled to 11 bits
// (INTER_RESIZE_COEF_BITS). The horizontal pass pre-shifts its result
// right by 4 so a horizontally-interpolated sample fits in a signed
// 16-bit short (max 255 * 2048 >> 4 = 32640); the vertical pass folds
// away the remaining fraction bits with >>16 plus a rounded >>2.
inline void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
{
    const int INTER_RESIZE_COEF_BITS=11;
    const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
//    const int ONE=INTER_RESIZE_COEF_SCALE;

    double scale_x = (double)srcw / w;
    double scale_y = (double)srch / h;

    // One arena for all four lookup tables:
    // xofs[w], yofs[h], ialpha[w*2] shorts (in w ints), ibeta[h*2] shorts (in h ints).
    int* buf = new int[w + h + w + h];

    int* xofs = buf;//new int[w];
    int* yofs = buf + w;//new int[h];

    short* ialpha = (short*)(buf + w + h);//new short[w * 2];
    short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];

    float fx;
    float fy;
    int sx;
    int sy;

// Round-to-nearest float -> short with saturation at SHRT_MIN/SHRT_MAX.
#define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);

    // Horizontal table: for each output column dx, the left source column
    // xofs[dx] and the 11-bit weight pair (a0, a1) of columns sx and sx+1.
    for (int dx = 0; dx < w; dx++)
    {
        fx = (float)((dx + 0.5) * scale_x - 0.5);  // pixel-center mapping
        sx = floor(fx);
        fx -= sx;

        if (sx < 0)
        {
            sx = 0;
            fx = 0.f;
        }
        if (sx >= srcw - 1)
        {
            // Clamp so sx+1 stays inside the row.
            sx = srcw - 2;
            fx = 1.f;
        }

        xofs[dx] = sx;

        float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
        float a1 = fx * INTER_RESIZE_COEF_SCALE;

        ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
        ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
    }

    // Vertical table: same construction per output row (yofs, b0/b1 weights).
    for (int dy = 0; dy < h; dy++)
    {
        fy = (float)((dy + 0.5) * scale_y - 0.5);
        sy = floor(fy);
        fy -= sy;

        if (sy < 0)
        {
            sy = 0;
            fy = 0.f;
        }
        if (sy >= srch - 1)
        {
            sy = srch - 2;
            fy = 1.f;
        }

        yofs[dy] = sy;

        float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
        float b1 = fy * INTER_RESIZE_COEF_SCALE;

        ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
        ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
    }

#undef SATURATE_CAST_SHORT

    // loop body
    // Two scratch rows of horizontally-resized 16-bit samples. The float Mat
    // is used purely as raw storage: (w>>1)+1 floats >= w shorts.
    Mat rowsbuf0((w >> 1) + 1);
    Mat rowsbuf1((w >> 1) + 1);
    short* rows0 = (short*)rowsbuf0.data;
    short* rows1 = (short*)rowsbuf1.data;

    int prev_sy1 = -1;

    for (int dy = 0; dy < h; dy++ )
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // hresize one row: this output row's top source row equals the
            // previous output row's bottom source row, so swap the two
            // buffers (old rows1 becomes new rows0) and only resize the
            // new bottom row sy+1.
            short* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows0_old;
            const unsigned char *S1 = src + srcw * (sy+1);

            const short* ialphap = ialpha;
            short* rows1p = rows1;
            for ( int dx = 0; dx < w; dx++ )
            {
                int sx = xofs[dx];
                short a0 = ialphap[0];
                short a1 = ialphap[1];

                const unsigned char* S1p = S1 + sx;
                rows1p[dx] = (S1p[0]*a0 + S1p[1]*a1) >> 4;  // 11-bit weights, pre-shifted by 4

                ialphap += 2;
            }
        }
        else
        {
            // hresize two rows: no cached row available; resize source rows
            // sy and sy+1 from scratch.
            const unsigned char *S0 = src + srcw * (sy);
            const unsigned char *S1 = src + srcw * (sy+1);

            const short* ialphap = ialpha;
            short* rows0p = rows0;
            short* rows1p = rows1;
            for ( int dx = 0; dx < w; dx++ )
            {
                int sx = xofs[dx];
                short a0 = ialphap[0];
                short a1 = ialphap[1];

                const unsigned char* S0p = S0 + sx;
                const unsigned char* S1p = S1 + sx;
                rows0p[dx] = (S0p[0]*a0 + S0p[1]*a1) >> 4;
                rows1p[dx] = (S1p[0]*a0 + S1p[1]*a1) >> 4;

                ialphap += 2;
            }
        }

        prev_sy1 = sy + 1;  // rows1 currently holds source row sy+1

        // vresize: blend rows0/rows1 with this output row's weights (b0, b1).
        short b0 = ibeta[0];
        short b1 = ibeta[1];

        short* rows0p = rows0;
        short* rows1p = rows1;
        unsigned char* Dp = dst + w * (dy);

#if __ARM_NEON
        int nn = w >> 3;  // 8 output pixels per NEON iteration
#else
        int nn = 0;
#endif
        int remain = w - (nn << 3);

#if __ARM_NEON
#if __aarch64__
        // Intrinsics path: acc = 2 + (rows0*b0 >> 16) + (rows1*b1 >> 16),
        // then a narrowing >>2 and saturate to u8 — identical math to the
        // scalar tail below.
        int16x4_t _b0 = vdup_n_s16(b0);
        int16x4_t _b1 = vdup_n_s16(b1);
        int32x4_t _v2 = vdupq_n_s32(2);
        for (; nn>0; nn--)
        {
            int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
            int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
            int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
            int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);

            int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
            int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
            int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
            int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);

            int32x4_t _acc = _v2;
            _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
            _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);

            int32x4_t _acc_1 = _v2;
            _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
            _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);

            int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
            int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);

            uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));

            vst1_u8(Dp, _D);

            Dp += 8;
            rows0p += 8;
            rows1p += 8;
        }
#else
        if (nn > 0)
        {
            // ARMv7 path: same computation as the aarch64 intrinsics,
            // software-pipelined — the loads run one iteration ahead of the
            // arithmetic, hence the two "sub #16" pointer fix-ups after the
            // loop to undo the final over-advance.
        asm volatile(
            "vdup.s16   d16, %8         \n"
            "mov        r4, #2          \n"
            "vdup.s16   d17, %9         \n"
            "vdup.s32   q12, r4         \n"
            "pld        [%0, #128]      \n"
            "vld1.s16   {d2-d3}, [%0 :128]!\n"
            "pld        [%1, #128]      \n"
            "vld1.s16   {d6-d7}, [%1 :128]!\n"
            "0:                         \n"
            "vmull.s16  q0, d2, d16     \n"
            "vmull.s16  q1, d3, d16     \n"
            "vorr.s32   q10, q12, q12   \n"
            "vorr.s32   q11, q12, q12   \n"
            "vmull.s16  q2, d6, d17     \n"
            "vmull.s16  q3, d7, d17     \n"
            "vsra.s32   q10, q0, #16    \n"
            "vsra.s32   q11, q1, #16    \n"
            "pld        [%0, #128]      \n"
            "vld1.s32   {d2-d3}, [%0 :128]!\n"
            "vsra.s32   q10, q2, #16    \n"
            "vsra.s32   q11, q3, #16    \n"
            "pld        [%1, #128]      \n"
            "vld1.s32   {d6-d7}, [%1 :128]!\n"
            "vshrn.s32  d20, q10, #2    \n"
            "vshrn.s32  d21, q11, #2    \n"
            "vqmovun.s16 d20, q10       \n"
            "vst1.8     {d20}, [%2]!    \n"
            "subs       %3, #1          \n"
            "bne        0b              \n"
            "sub        %0, #16         \n"
            "sub        %1, #16         \n"
            : "=r"(rows0p), // %0
              "=r"(rows1p), // %1
              "=r"(Dp),     // %2
              "=r"(nn)      // %3
            : "0"(rows0p),
              "1"(rows1p),
              "2"(Dp),
              "3"(nn),
              "r"(b0),      // %8
              "r"(b1)       // %9
            : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
        );
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // Scalar tail (and the whole row when NEON is unavailable).
        for ( ; remain; --remain )
        {
//            D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
            *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
        }

        ibeta += 2;  // advance to the next row's (b0, b1) pair
    }

    delete[] buf;
}
1696 
1697 inline void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h)
1698 {
1699  const int INTER_RESIZE_COEF_BITS=11;
1700  const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;
1701 // const int ONE=INTER_RESIZE_COEF_SCALE;
1702 
1703  double scale_x = (double)srcw / w;
1704  double scale_y = (double)srch / h;
1705 
1706  int* buf = new int[w + h + w + h];
1707 
1708  int* xofs = buf;//new int[w];
1709  int* yofs = buf + w;//new int[h];
1710 
1711  short* ialpha = (short*)(buf + w + h);//new short[w * 2];
1712  short* ibeta = (short*)(buf + w + h + w);//new short[h * 2];
1713 
1714  float fx;
1715  float fy;
1716  int sx;
1717  int sy;
1718 
1719 #define SATURATE_CAST_SHORT(X) (short)::std::min(::std::max((int)(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), SHRT_MAX);
1720 
1721  for (int dx = 0; dx < w; dx++)
1722  {
1723  fx = (float)((dx + 0.5) * scale_x - 0.5);
1724  sx = floor(fx);
1725  fx -= sx;
1726 
1727  if (sx < 0)
1728  {
1729  sx = 0;
1730  fx = 0.f;
1731  }
1732  if (sx >= srcw - 1)
1733  {
1734  sx = srcw - 2;
1735  fx = 1.f;
1736  }
1737 
1738  xofs[dx] = sx*4;
1739 
1740  float a0 = (1.f - fx) * INTER_RESIZE_COEF_SCALE;
1741  float a1 = fx * INTER_RESIZE_COEF_SCALE;
1742 
1743  ialpha[dx*2 ] = SATURATE_CAST_SHORT(a0);
1744  ialpha[dx*2 + 1] = SATURATE_CAST_SHORT(a1);
1745  }
1746 
1747  for (int dy = 0; dy < h; dy++)
1748  {
1749  fy = (float)((dy + 0.5) * scale_y - 0.5);
1750  sy = floor(fy);
1751  fy -= sy;
1752 
1753  if (sy < 0)
1754  {
1755  sy = 0;
1756  fy = 0.f;
1757  }
1758  if (sy >= srch - 1)
1759  {
1760  sy = srch - 2;
1761  fy = 1.f;
1762  }
1763 
1764  yofs[dy] = sy*4;
1765 
1766  float b0 = (1.f - fy) * INTER_RESIZE_COEF_SCALE;
1767  float b1 = fy * INTER_RESIZE_COEF_SCALE;
1768 
1769  ibeta[dy*2 ] = SATURATE_CAST_SHORT(b0);
1770  ibeta[dy*2 + 1] = SATURATE_CAST_SHORT(b1);
1771  }
1772 
1773 #undef SATURATE_CAST_SHORT
1774 
1775  // loop body
1776  Mat rowsbuf0((w*4 >> 1) + 4);
1777  Mat rowsbuf1((w*4 >> 1) + 4);
1778  short* rows0 = (short*)rowsbuf0.data;
1779  short* rows1 = (short*)rowsbuf1.data;
1780 
1781  int prev_sy1 = -1;
1782 
1783  for (int dy = 0; dy < h; dy++ )
1784  {
1785  int sy = yofs[dy];
1786 
1787  if (sy == prev_sy1)
1788  {
1789  // hresize one row
1790  short* rows0_old = rows0;
1791  rows0 = rows1;
1792  rows1 = rows0_old;
1793  const unsigned char *S1 = src + srcw * (sy+4);
1794 
1795  const short* ialphap = ialpha;
1796  short* rows1p = rows1;
1797  for ( int dx = 0; dx < w; dx++ )
1798  {
1799  int sx = xofs[dx];
1800  short a0 = ialphap[0];
1801  short a1 = ialphap[1];
1802 
1803  const unsigned char* S1p = S1 + sx;
1804 #if __ARM_NEON
1805  int16x4_t _a0 = vdup_n_s16(a0);
1806  int16x4_t _a1 = vdup_n_s16(a1);
1807  uint8x8_t _S1 = vld1_u8(S1p);
1808  int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
1809  int16x4_t _S1low = vget_low_s16(_S116);
1810  int16x4_t _S1high = vget_high_s16(_S116);
1811  int32x4_t _rows1 = vmull_s16(_S1low, _a0);
1812  _rows1 = vmlal_s16(_rows1, _S1high, _a1);
1813  int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
1814  vst1_s16(rows1p, _rows1_sr4);
1815 #else
1816  rows1p[0] = (S1p[0]*a0 + S1p[4]*a1) >> 4;
1817  rows1p[1] = (S1p[1]*a0 + S1p[5]*a1) >> 4;
1818  rows1p[2] = (S1p[2]*a0 + S1p[6]*a1) >> 4;
1819  rows1p[3] = (S1p[3]*a0 + S1p[7]*a1) >> 4;
1820 #endif // __ARM_NEON
1821 
1822  ialphap += 2;
1823  rows1p += 4;
1824  }
1825  }
1826  else
1827  {
1828  // hresize two rows
1829  const unsigned char *S0 = src + srcw * (sy);
1830  const unsigned char *S1 = src + srcw * (sy+4);
1831 
1832  const short* ialphap = ialpha;
1833  short* rows0p = rows0;
1834  short* rows1p = rows1;
1835  for ( int dx = 0; dx < w; dx++ )
1836  {
1837  int sx = xofs[dx];
1838  short a0 = ialphap[0];
1839  short a1 = ialphap[1];
1840 
1841  const unsigned char* S0p = S0 + sx;
1842  const unsigned char* S1p = S1 + sx;
1843 #if __ARM_NEON
1844  int16x4_t _a0 = vdup_n_s16(a0);
1845  int16x4_t _a1 = vdup_n_s16(a1);
1846  uint8x8_t _S0 = vld1_u8(S0p);
1847  uint8x8_t _S1 = vld1_u8(S1p);
1848  int16x8_t _S016 = vreinterpretq_s16_u16(vmovl_u8(_S0));
1849  int16x8_t _S116 = vreinterpretq_s16_u16(vmovl_u8(_S1));
1850  int16x4_t _S0low = vget_low_s16(_S016);
1851  int16x4_t _S1low = vget_low_s16(_S116);
1852  int16x4_t _S0high = vget_high_s16(_S016);
1853  int16x4_t _S1high = vget_high_s16(_S116);
1854  int32x4_t _rows0 = vmull_s16(_S0low, _a0);
1855  int32x4_t _rows1 = vmull_s16(_S1low, _a0);
1856  _rows0 = vmlal_s16(_rows0, _S0high, _a1);
1857  _rows1 = vmlal_s16(_rows1, _S1high, _a1);
1858  int16x4_t _rows0_sr4 = vshrn_n_s32(_rows0, 4);
1859  int16x4_t _rows1_sr4 = vshrn_n_s32(_rows1, 4);
1860  vst1_s16(rows0p, _rows0_sr4);
1861  vst1_s16(rows1p, _rows1_sr4);
1862 #else
1863  rows0p[0] = (S0p[0]*a0 + S0p[4]*a1) >> 4;
1864  rows0p[1] = (S0p[1]*a0 + S0p[5]*a1) >> 4;
1865  rows0p[2] = (S0p[2]*a0 + S0p[6]*a1) >> 4;
1866  rows0p[3] = (S0p[3]*a0 + S0p[7]*a1) >> 4;
1867  rows1p[0] = (S1p[0]*a0 + S1p[4]*a1) >> 4;
1868  rows1p[1] = (S1p[1]*a0 + S1p[5]*a1) >> 4;
1869  rows1p[2] = (S1p[2]*a0 + S1p[6]*a1) >> 4;
1870  rows1p[3] = (S1p[3]*a0 + S1p[7]*a1) >> 4;
1871 #endif // __ARM_NEON
1872 
1873  ialphap += 2;
1874  rows0p += 4;
1875  rows1p += 4;
1876  }
1877  }
1878 
1879  prev_sy1 = sy + 1;
1880 
1881  // vresize
1882  short b0 = ibeta[0];
1883  short b1 = ibeta[1];
1884 
1885  short* rows0p = rows0;
1886  short* rows1p = rows1;
1887  unsigned char* Dp = dst + w * 4 * (dy);
1888 
1889 #if __ARM_NEON
1890  int nn = (w * 4) >> 3;
1891 #else
1892  int nn = 0;
1893 #endif
1894  int remain = (w * 4) - (nn << 3);
1895 
1896 #if __ARM_NEON
1897 #if __aarch64__
1898  int16x4_t _b0 = vdup_n_s16(b0);
1899  int16x4_t _b1 = vdup_n_s16(b1);
1900  int32x4_t _v2 = vdupq_n_s32(2);
1901  for (; nn>0; nn--)
1902  {
1903  int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
1904  int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
1905  int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p+4);
1906  int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p+4);
1907 
1908  int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
1909  int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
1910  int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
1911  int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
1912 
1913  int32x4_t _acc = _v2;
1914  _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
1915  _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
1916 
1917  int32x4_t _acc_1 = _v2;
1918  _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
1919  _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
1920 
1921  int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
1922  int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
1923 
1924  uint8x8_t _D = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
1925 
1926  vst1_u8(Dp, _D);
1927 
1928  Dp += 8;
1929  rows0p += 8;
1930  rows1p += 8;
1931  }
1932 #else
1933  if (nn > 0)
1934  {
1935  asm volatile(
1936  "vdup.s16 d16, %8 \n"
1937  "mov r4, #2 \n"
1938  "vdup.s16 d17, %9 \n"
1939  "vdup.s32 q12, r4 \n"
1940  "pld [%0, #128] \n"
1941  "vld1.s16 {d2-d3}, [%0 :128]!\n"
1942  "pld [%1, #128] \n"
1943  "vld1.s16 {d6-d7}, [%1 :128]!\n"
1944  "0: \n"
1945  "vmull.s16 q0, d2, d16 \n"
1946  "vmull.s16 q1, d3, d16 \n"
1947  "vorr.s32 q10, q12, q12 \n"
1948  "vorr.s32 q11, q12, q12 \n"
1949  "vmull.s16 q2, d6, d17 \n"
1950  "vmull.s16 q3, d7, d17 \n"
1951  "vsra.s32 q10, q0, #16 \n"
1952  "vsra.s32 q11, q1, #16 \n"
1953  "pld [%0, #128] \n"
1954  "vld1.s32 {d2-d3}, [%0 :128]!\n"
1955  "vsra.s32 q10, q2, #16 \n"
1956  "vsra.s32 q11, q3, #16 \n"
1957  "pld [%1, #128] \n"
1958  "vld1.s32 {d6-d7}, [%1 :128]!\n"
1959  "vshrn.s32 d20, q10, #2 \n"
1960  "vshrn.s32 d21, q11, #2 \n"
1961  "vqmovun.s16 d20, q10 \n"
1962  "vst1.8 {d20}, [%2]! \n"
1963  "subs %3, #1 \n"
1964  "bne 0b \n"
1965  "sub %0, #16 \n"
1966  "sub %1, #16 \n"
1967  : "=r"(rows0p), // %0
1968  "=r"(rows1p), // %1
1969  "=r"(Dp), // %2
1970  "=r"(nn) // %3
1971  : "0"(rows0p),
1972  "1"(rows1p),
1973  "2"(Dp),
1974  "3"(nn),
1975  "r"(b0), // %8
1976  "r"(b1) // %9
1977  : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
1978  );
1979  }
1980 #endif // __aarch64__
1981 #endif // __ARM_NEON
1982  for ( ; remain; --remain )
1983  {
1984 // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
1985  *Dp++ = (unsigned char)(( (short)((b0 * (short)(*rows0p++)) >> 16) + (short)((b1 * (short)(*rows1p++)) >> 16) + 2)>>2);
1986  }
1987 
1988  ibeta += 2;
1989  }
1990 
1991  delete[] buf;
1992 }
1993 
1994 inline Mat Mat::from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator)
1995 {
1996  if (type & PIXEL_CONVERT_MASK)
1997  {
1998  if (type == PIXEL_RGB2BGR || type == PIXEL_BGR2RGB)
1999  return from_rgb2bgr(pixels, w, h, allocator);
2000 
2001  if (type == PIXEL_RGB2GRAY)
2002  return from_rgb2gray(pixels, w, h, allocator);
2003 
2004  if (type == PIXEL_BGR2GRAY)
2005  return from_bgr2gray(pixels, w, h, allocator);
2006 
2007  if (type == PIXEL_GRAY2RGB || type == PIXEL_GRAY2BGR)
2008  return from_gray2rgb(pixels, w, h, allocator);
2009 
2010  if (type == PIXEL_RGBA2RGB)
2011  return from_rgba2rgb(pixels, w, h, allocator);
2012 
2013  if (type == PIXEL_RGBA2BGR)
2014  return from_rgba2bgr(pixels, w, h, allocator);
2015 
2016  if (type == PIXEL_RGBA2GRAY)
2017  return from_rgba2gray(pixels, w, h, allocator);
2018 
2019  }
2020  else
2021  {
2022  if (type == PIXEL_RGB || type == PIXEL_BGR)
2023  return from_rgb(pixels, w, h, allocator);
2024 
2025  if (type == PIXEL_GRAY)
2026  return from_gray(pixels, w, h, allocator);
2027 
2028  if (type == PIXEL_RGBA)
2029  return from_rgba(pixels, w, h, allocator);
2030 
2031  }
2032 
2033  return Mat();
2034 }
2035 
2036 inline Mat Mat::from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator)
2037 {
2038  if (w == target_width && h == target_height)
2039  return Mat::from_pixels(pixels, type, w, h);
2040 
2041  Mat m;
2042 
2043  int type_from = type & PIXEL_FORMAT_MASK;
2044 
2045  if (type_from == PIXEL_RGB || type_from == PIXEL_BGR)
2046  {
2047  unsigned char* dst = new unsigned char[target_width * target_height * 3];
2048 
2049  resize_bilinear_c3(pixels, w, h, dst, target_width, target_height);
2050 
2051  m = Mat::from_pixels(dst, type, target_width, target_height, allocator);
2052 
2053  delete[] dst;
2054  }
2055  else if (type_from == PIXEL_GRAY)
2056  {
2057  unsigned char* dst = new unsigned char[target_width * target_height];
2058 
2059  resize_bilinear_c1(pixels, w, h, dst, target_width, target_height);
2060 
2061  m = Mat::from_pixels(dst, type, target_width, target_height, allocator);
2062 
2063  delete[] dst;
2064  }
2065  else if (type_from == PIXEL_RGBA)
2066  {
2067  unsigned char* dst = new unsigned char[target_width * target_height * 4];
2068 
2069  resize_bilinear_c4(pixels, w, h, dst, target_width, target_height);
2070 
2071  m = Mat::from_pixels(dst, type, target_width, target_height, allocator);
2072 
2073  delete[] dst;
2074  }
2075 
2076  return m;
2077 }
2078 
2079 inline void Mat::to_pixels(unsigned char* pixels, int type) const
2080 {
2081  if (type & PIXEL_CONVERT_MASK)
2082  {
2083  if (type == PIXEL_RGB2BGR || type == PIXEL_BGR2RGB)
2084  return to_bgr2rgb(*this, pixels);
2085  }
2086  else
2087  {
2088  if (type == PIXEL_RGB || type == PIXEL_BGR)
2089  return to_rgb(*this, pixels);
2090 
2091  if (type == PIXEL_GRAY)
2092  return to_gray(*this, pixels);
2093 
2094  if (type == PIXEL_RGBA)
2095  return to_rgba(*this, pixels);
2096  }
2097 }
2098 
2099 inline void Mat::to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height) const
2100 {
2101  if (w == target_width && h == target_height)
2102  return to_pixels(pixels, type);
2103 
2104  int type_to = (type & PIXEL_CONVERT_MASK) ? (type >> PIXEL_CONVERT_SHIFT) : (type & PIXEL_FORMAT_MASK);
2105 
2106  if (type_to == PIXEL_RGB || type_to == PIXEL_BGR)
2107  {
2108  unsigned char* src = new unsigned char[w * h * 3];
2109 
2110  to_pixels(src, type);
2111 
2112  resize_bilinear_c3(src, w, h, pixels, target_width, target_height);
2113 
2114  delete[] src;
2115  }
2116  else if (type_to == PIXEL_GRAY)
2117  {
2118  unsigned char* src = new unsigned char[w * h];
2119 
2120  to_pixels(src, type);
2121 
2122  resize_bilinear_c1(src, w, h, pixels, target_width, target_height);
2123 
2124  delete[] src;
2125  }
2126  else if (type_to == PIXEL_RGBA)
2127  {
2128  unsigned char* src = new unsigned char[w * h * 4];
2129 
2130  to_pixels(src, type);
2131 
2132  resize_bilinear_c4(src, w, h, pixels, target_width, target_height);
2133 
2134  delete[] src;
2135  }
2136 }
2137 
2138 
2139 inline void draw_box_grayscale(Mat& im, int x1, int y1, int x2, int y2, float g)
2140 {
2141  if (im.data) {
2142  int i;
2143  if (x1 < 0) x1 = 0;
2144  if (x1 >= im.w) x1 = im.w - 1;
2145  if (x2 < 0) x2 = 0;
2146  if (x2 >= im.w) x2 = im.w - 1;
2147 
2148  if (y1 < 0) y1 = 0;
2149  if (y1 >= im.h) y1 = im.h - 1;
2150  if (y2 < 0) y2 = 0;
2151  if (y2 >= im.h) y2 = im.h - 1;
2152 
2153  for (i = x1; i <= x2; ++i) {
2154  ((float*)im.data)[i + y1 * im.w] = g;
2155  ((float*)im.data)[i + y2 * im.w] = g;
2156  }
2157  for (i = y1; i <= y2; ++i) {
2158  ((float*)im.data)[x1 + i * im.w] = g;
2159  ((float*)im.data)[x2 + i * im.w] = g;
2160  }
2161  }
2162 }
2163 
2164 
2165 inline void draw_box(Mat& im, int x1, int y1, int x2, int y2, float r, float g, float b)
2166 {
2167  r = r / 255.;
2168  g = g / 255.;
2169  b = b / 255.;
2170  if (im.c == 1) {
2171  /* Draw on grayscale image */
2172  draw_box_grayscale(im, x1, y1, x2, y2, b * 0.114 + g * 0.587 + r * 0.299);
2173  return;
2174  }
2175  if (im.data) {
2176  int i;
2177  if (x1 < 0) x1 = 0;
2178  if (x1 >= im.w) x1 = im.w - 1;
2179  if (x2 < 0) x2 = 0;
2180  if (x2 >= im.w) x2 = im.w - 1;
2181 
2182  if (y1 < 0) y1 = 0;
2183  if (y1 >= im.h) y1 = im.h - 1;
2184  if (y2 < 0) y2 = 0;
2185  if (y2 >= im.h) y2 = im.h - 1;
2186 
2187  for (i = x1; i <= x2; ++i) {
2188  ((float*)im.data)[i + y1 * im.w + 0 * im.w*im.h] = r;
2189  ((float*)im.data)[i + y2 * im.w + 0 * im.w*im.h] = r;
2190 
2191  ((float*)im.data)[i + y1 * im.w + 1 * im.w*im.h] = g;
2192  ((float*)im.data)[i + y2 * im.w + 1 * im.w*im.h] = g;
2193 
2194  ((float*)im.data)[i + y1 * im.w + 2 * im.w*im.h] = b;
2195  ((float*)im.data)[i + y2 * im.w + 2 * im.w*im.h] = b;
2196  }
2197  for (i = y1; i <= y2; ++i) {
2198  ((float*)im.data)[x1 + i * im.w + 0 * im.w*im.h] = r;
2199  ((float*)im.data)[x2 + i * im.w + 0 * im.w*im.h] = r;
2200 
2201  ((float*)im.data)[x1 + i * im.w + 1 * im.w*im.h] = g;
2202  ((float*)im.data)[x2 + i * im.w + 1 * im.w*im.h] = g;
2203 
2204  ((float*)im.data)[x1 + i * im.w + 2 * im.w*im.h] = b;
2205  ((float*)im.data)[x2 + i * im.w + 2 * im.w*im.h] = b;
2206  }
2207  }
2208 }
2209 
2210 
2211 
2212 
// Draw a detection box as a 1-pixel rectangle outline.
// bbox carries top-left (x, y) plus width/height; colors r,g,b are 0..255.
inline void draw_bbox(Mat &im, Box bbox, float r, float g, float b)
{
    draw_box(im, bbox.x, bbox.y, bbox.x + bbox.w, bbox.y + bbox.h, r, g, b);
}
2217 
2218 
2219 inline void draw_bbox_width(Mat &im, Box bbox, int width, float r, float g, float b){
2220  draw_box(im, bbox.x, bbox.y, bbox.x + bbox.w, bbox.y + bbox.h, r, g, b);
2221 }
2222 
2223 
2224 inline void flip_mat(Mat &input){
2225  int i, j, k;
2226  for (k = 0; k < input.c; ++k) {
2227  for (i = 0; i < input.h; ++i) {
2228  for (j = 0; j < input.w / 2; ++j) {
2229  int index = j + input.w*(i + input.h*(k));
2230  int flip = (input.w - j - 1) + input.w*(i + input.h*(k));
2231  float swap = ((float*)input.data)[flip];
2232  ((float*)input.data)[flip] = ((float*)input.data)[index];
2233  ((float*)input.data)[index] = swap;
2234  }
2235  }
2236  }
2237 }
2238 
2239 
2240 
2241 inline void draw_circle(Mat& im, int x0, int y0, int radius, float r, float g, float b){
2242 #define plot(x, y) set_pixel(im,x,y,0,r);set_pixel(im,x,y,1,g);set_pixel(im,x,y,2,b);
2243  int f, ddF_x, ddF_y, x, y;
2244  if (im.data) {
2245  r = r / 255.;
2246  g = g / 255.;
2247  b = b / 255.;
2248  if (im.c == 1) {
2249  /* Draw on grayscale image */
2250  r = b * 0.114 + g * 0.587 + r * 0.299;
2251  }
2252  f = 1 - radius;
2253  ddF_x = 0;
2254  ddF_y = -2 * radius;
2255  x = 0;
2256  y = radius;
2257 
2258  plot(x0, y0 + radius);
2259  plot(x0, y0 - radius);
2260  plot(x0 + radius, y0);
2261  plot(x0 - radius, y0);
2262 
2263  while (x < y)
2264  {
2265  if (f >= 0)
2266  {
2267  y--;
2268  ddF_y += 2;
2269  f += ddF_y;
2270  }
2271  x++;
2272  ddF_x += 2;
2273  f += ddF_x + 1;
2274  plot(x0 + x, y0 + y);
2275  plot(x0 - x, y0 + y);
2276  plot(x0 + x, y0 - y);
2277  plot(x0 - x, y0 - y);
2278  plot(x0 + y, y0 + x);
2279  plot(x0 - y, y0 + x);
2280  plot(x0 + y, y0 - x);
2281  plot(x0 - y, y0 - x);
2282  }
2283  }
2284 #undef plot
2285 }
2286 
2287 
2288 
2289 inline void draw_circle_thickness(Mat& im, int x0, int y0, int radius, int width, float r, float g, float b){
2290  int i;
2291  for (i = 0; i < width; i++) {
2292  draw_circle(im, x0, y0, radius - i, r, g, b);
2293  }
2294 }
2295 
2296 
// Draw a straight line from `start` to `end` with Bresenham's algorithm.
// Colors r,g,b are 0..255 and normalized to the image's float scale; on a
// 1-channel image they collapse to BT.601 luma and only channel 0 is
// written. Endpoints are clamped to the image bounds before rasterizing
// (note: clamping changes the line's geometry when an endpoint is outside).
inline void draw_line(Mat& im, Point start, Point end, float r, float g, float b){
    int x1, x2, y1, y2, dx, dy, err, sx, sy, e2;
    r = r / 255.;
    g = g / 255.;
    b = b / 255.;
    if (im.c == 1) {
        /* Draw on grayscale image */
        r = b * 0.114 + g * 0.587 + r * 0.299;
    }
    x1 = start.x;
    x2 = end.x;
    y1 = start.y;
    y2 = end.y;
    // Clamp both endpoints into [0, w-1] x [0, h-1].
    if (x1 < 0) x1 = 0;
    if (x1 >= im.w) x1 = im.w - 1;
    if (x2 < 0) x2 = 0;
    if (x2 >= im.w) x2 = im.w - 1;

    if (y1 < 0) y1 = 0;
    if (y1 >= im.h) y1 = im.h - 1;
    if (y2 < 0) y2 = 0;
    if (y2 >= im.h) y2 = im.h - 1;

    // Bresenham setup: per-axis step direction and combined error term.
    dx = abs(x2 - x1), sx = x1 < x2 ? 1 : -1;
    dy = abs(y2 - y1), sy = y1 < y2 ? 1 : -1;
    err = (dx > dy ? dx : -dy) / 2;

    for (;;) {
        set_pixel(im, x1, y1, 0, r);
        if (im.c == 3) {
            // NOTE(review): channels 1/2 are written only when c == 3; a
            // 4-channel image would get channel 0 only — confirm intended.
            set_pixel(im, x1, y1, 1, g);
            set_pixel(im, x1, y1, 2, b);
        }
        if (x1 == x2 && y1 == y2) break;
        // Step along whichever axis the error term indicates (or both).
        e2 = err;
        if (e2 > -dx) { err -= dy; x1 += sx; }
        if (e2 < dy) { err += dx; y1 += sy; }
    }
}
2336 
2337 
2338 
2339 
2340 } // namespace dface
2341 
2342 #endif
int y
Definition: types.h:34
Definition: mat.h:217
Definition: allocator.h:14
static void draw_circle(Mat &im, int x0, int y0, int radius, float r, float g, float b)
static Mat from_pixels(const unsigned char *pixels, int type, int w, int h, Allocator *allocator=0)
Definition: mat_pixel.h:1994
Definition: mat.h:220
Definition: mat.h:216
Definition: mat.h:219
Definition: mat.h:227
Definition: types.h:32
void * data
Definition: mat.h:399
Definition: mat.h:225
static void draw_circle_thickness(Mat &im, int x0, int y0, int radius, int width, float r, float g, float b)
Definition: mat.h:213
Mat()
Definition: mat.h:811
int x
Definition: types.h:17
bool empty() const
Definition: mat.h:1272
static void draw_bbox_width(Mat &im, Box bbox, int width, float r, float g, float b)
static Mat from_pixels_resize(const unsigned char *pixels, int type, int w, int h, int target_width, int target_height, Allocator *allocator=0)
Definition: mat_pixel.h:2036
边框信息(人脸,行人,物体等)
Definition: types.h:16
static void flip_mat(Mat &input)
Definition: mat.h:226
static void draw_box(Mat &im, int x1, int y1, int x2, int y2, float r, float g, float b)
dface内置的图像数据 dface的所有接口只支持dface::Mat(RGB格式)
Definition: mat.h:23
int y
Definition: types.h:18
static void draw_bbox(Mat &im, Box bbox, float r, float g, float b)
static void draw_box_grayscale(Mat &im, int x1, int y1, int x2, int y2, float g)
int w
Definition: types.h:19
static void draw_line(Mat &im, Point start, Point end, float r, float g, float b)
int x
Definition: types.h:33
Definition: mat.h:211
void to_pixels_resize(unsigned char *pixels, int type, int target_width, int target_height) const
Definition: mat_pixel.h:2099
Definition: allocator.h:107
Definition: mat.h:223
Definition: mat.h:212
void to_pixels(unsigned char *pixels, int type) const
Definition: mat_pixel.h:2079
Definition: mat.h:222
int h
Definition: types.h:20
Definition: mat.h:214