DFace SDK 1.8.5
cpu.h

#ifndef DFACE_CPU_H
#define DFACE_CPU_H

#include <stdio.h>
#include <string.h>
#include <vector>

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef __ANDROID__
#include <sys/syscall.h>
#include <unistd.h>
#endif

#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <sys/types.h>
#include <sys/sysctl.h>
#include <mach/machine.h>
#define __IOS__ 1
#endif
#endif


namespace dface {

// test optional cpu features
// neon = armv7 neon or aarch64 asimd
int cpu_support_arm_neon();
// vfpv4 = armv7 fp16 + fma
int cpu_support_arm_vfpv4();
// asimdhp = aarch64 asimd half precision
int cpu_support_arm_asimdhp();

// cpu info
int get_cpu_count();

// bind all threads to the little clusters when powersave is enabled
// affects HMP (heterogeneous multi-processing) CPUs such as ARM big.LITTLE
// only implemented on Android at the moment
// switching powersave is expensive and not thread-safe
// 0 = all cores enabled (default)
// 1 = only little clusters enabled
// 2 = only big clusters enabled
// the setter returns 0 on success
int get_cpu_powersave();
int set_cpu_powersave(int powersave);

// misc wrappers for OpenMP runtime routines
int get_omp_num_threads();
void set_omp_num_threads(int num_threads);

int get_omp_dynamic();
void set_omp_dynamic(int dynamic);

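// Usage sketch (editorial illustration, not part of the original header): a
// hypothetical setup_inference() caller would typically probe the optional
// CPU features once, size the thread pool, and then pin heavy work to the
// big cluster:
//
//   #include "cpu.h"
//
//   void setup_inference()
//   {
//       if (dface::cpu_support_arm_neon())
//       {
//           // select the NEON/ASIMD optimized kernels
//       }
//
//       // optional: size the OpenMP pool to the visible cores
//       dface::set_omp_num_threads(dface::get_cpu_count());
//
//       // keep heavy work on the big cluster (Android only); returns 0 on success
//       if (dface::set_cpu_powersave(2) != 0)
//       {
//           // SMP or unsupported platform: keep the default affinity
//       }
//   }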

#ifdef __ANDROID__

// extract the ELF HW capabilities bitmap from /proc/self/auxv
static unsigned int get_elf_hwcap_from_proc_self_auxv()
{
    FILE* fp = fopen("/proc/self/auxv", "rb");
    if (!fp)
    {
        return 0;
    }

#define AT_HWCAP 16
#define AT_HWCAP2 26

    // auxv entries are pairs of unsigned long (4 bytes each on 32-bit ARM,
    // 8 bytes each on aarch64), matching Elf32_auxv_t / Elf64_auxv_t
    struct { unsigned long tag; unsigned long value; } entry;

    unsigned int result = 0;
    while (!feof(fp))
    {
        int nread = fread((char*)&entry, sizeof(entry), 1, fp);
        if (nread != 1)
            break;

        if (entry.tag == 0 && entry.value == 0)
            break;

        if (entry.tag == AT_HWCAP)
        {
            result = (unsigned int)entry.value;
            break;
        }
    }

    fclose(fp);

    return result;
}

static unsigned int g_hwcaps = get_elf_hwcap_from_proc_self_auxv();

#if __aarch64__
// from arch/arm64/include/uapi/asm/hwcap.h
#define HWCAP_ASIMD (1 << 1)
#define HWCAP_ASIMDHP (1 << 10)
#else
// from arch/arm/include/uapi/asm/hwcap.h
#define HWCAP_NEON (1 << 12)
#define HWCAP_VFPv4 (1 << 16)
#endif

#endif // __ANDROID__

#if __IOS__
static cpu_type_t get_hw_cputype()
{
    cpu_type_t value = 0;
    size_t len = sizeof(value);
    sysctlbyname("hw.cputype", &value, &len, NULL, 0);
    return value;
}

static cpu_subtype_t get_hw_cpusubtype()
{
    cpu_subtype_t value = 0;
    size_t len = sizeof(value);
    sysctlbyname("hw.cpusubtype", &value, &len, NULL, 0);
    return value;
}

static cpu_type_t g_hw_cputype = get_hw_cputype();
static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype();
#endif // __IOS__

inline int cpu_support_arm_neon()
{
#ifdef __ANDROID__
#if __aarch64__
    return g_hwcaps & HWCAP_ASIMD;
#else
    return g_hwcaps & HWCAP_NEON;
#endif
#elif __IOS__
#if __aarch64__
    return g_hw_cputype == CPU_TYPE_ARM64;
#else
    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
#endif
#else
    return 0;
#endif
}

inline int cpu_support_arm_vfpv4()
{
#ifdef __ANDROID__
#if __aarch64__
    // aarch64 asimd always implies fma and fp16 support
    return g_hwcaps & HWCAP_ASIMD;
#else
    return g_hwcaps & HWCAP_VFPv4;
#endif
#elif __IOS__
#if __aarch64__
    return g_hw_cputype == CPU_TYPE_ARM64;
#else
    return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
#endif
#else
    return 0;
#endif
}

inline int cpu_support_arm_asimdhp()
{
#ifdef __ANDROID__
#if __aarch64__
    return g_hwcaps & HWCAP_ASIMDHP;
#else
    return 0;
#endif
#elif __IOS__
    // half-precision asimd is not detected on ios
    return 0;
#else
    return 0;
#endif
}

static int get_cpucount()
{
#ifdef __ANDROID__
    // get cpu count from /proc/cpuinfo
    FILE* fp = fopen("/proc/cpuinfo", "rb");
    if (!fp)
        return 1;

    int count = 0;
    char line[1024];
    while (!feof(fp))
    {
        char* s = fgets(line, 1024, fp);
        if (!s)
            break;

        if (memcmp(line, "processor", 9) == 0)
        {
            count++;
        }
    }

    fclose(fp);

    if (count < 1)
        count = 1;

    return count;
#elif __IOS__
    int count = 0;
    size_t len = sizeof(count);
    sysctlbyname("hw.ncpu", &count, &len, NULL, 0);

    if (count < 1)
        count = 1;

    return count;
#else
    return 1;
#endif
}

static int g_cpucount = get_cpucount();

inline int get_cpu_count()
{
    return g_cpucount;
}

#ifdef __ANDROID__
static int get_max_freq_khz(int cpuid)
{
    // first try, for all possible cpu
    char path[256];
    sprintf(path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);

    FILE* fp = fopen(path, "rb");

    if (!fp)
    {
        // second try, for online cpu
        sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid);
        fp = fopen(path, "rb");

        if (!fp)
        {
            // third try, for online cpu
            sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
            fp = fopen(path, "rb");

            if (!fp)
                return -1;

            int max_freq_khz = -1;
            fscanf(fp, "%d", &max_freq_khz);

            fclose(fp);

            return max_freq_khz;
        }
    }

    int max_freq_khz = 0;
    while (!feof(fp))
    {
        int freq_khz = 0;
        int nscan = fscanf(fp, "%d %*d", &freq_khz);
        if (nscan != 1)
            break;

        if (freq_khz > max_freq_khz)
            max_freq_khz = freq_khz;
    }

    fclose(fp);

    return max_freq_khz;
}
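// Illustrative note (editorial): each line of time_in_state is
// "<frequency in kHz> <time spent at that frequency>", so a hypothetical file
// containing
//     500000 13421
//     2016000 8710
// makes get_max_freq_khz() return 2016000 for that core.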

static int set_sched_affinity(const std::vector<int>& cpuids)
{
    // cpu_set_t definition
    // ref http://stackoverflow.com/questions/16319725/android-set-thread-affinity
#define CPU_SETSIZE 1024
#define __NCPUBITS (8 * sizeof (unsigned long))
typedef struct
{
    unsigned long __bits[CPU_SETSIZE / __NCPUBITS];
} cpu_set_t;

#define CPU_SET(cpu, cpusetp) \
    ((cpusetp)->__bits[(cpu)/__NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS)))

#define CPU_ZERO(cpusetp) \
    memset((cpusetp), 0, sizeof(cpu_set_t))

    // set affinity for the calling thread
    pid_t pid = gettid();

    cpu_set_t mask;
    CPU_ZERO(&mask);
    for (int i = 0; i < (int)cpuids.size(); i++)
    {
        CPU_SET(cpuids[i], &mask);
    }

    int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
    if (syscallret)
    {
        fprintf(stderr, "syscall error %d\n", syscallret);
        return -1;
    }

    return 0;
}

static int sort_cpuid_by_max_frequency(std::vector<int>& cpuids, int* little_cluster_offset)
{
    const int cpu_count = cpuids.size();

    *little_cluster_offset = 0;

    if (cpu_count == 0)
        return 0;

    std::vector<int> cpu_max_freq_khz;
    cpu_max_freq_khz.resize(cpu_count);

    for (int i = 0; i < cpu_count; i++)
    {
        int max_freq_khz = get_max_freq_khz(i);

//         printf("%d max freq = %d khz\n", i, max_freq_khz);

        cpuids[i] = i;
        cpu_max_freq_khz[i] = max_freq_khz;
    }

    // sort cpuids so that the big cores come first
    // simple bubble sort
    for (int i = 0; i < cpu_count; i++)
    {
        for (int j = i + 1; j < cpu_count; j++)
        {
            if (cpu_max_freq_khz[i] < cpu_max_freq_khz[j])
            {
                // swap
                int tmp = cpuids[i];
                cpuids[i] = cpuids[j];
                cpuids[j] = tmp;

                tmp = cpu_max_freq_khz[i];
                cpu_max_freq_khz[i] = cpu_max_freq_khz[j];
                cpu_max_freq_khz[j] = tmp;
            }
        }
    }

    // SMP: if the midpoint frequency equals the lowest one, all cores share the
    // same max frequency and there is no little cluster to split off
    int mid_max_freq_khz = (cpu_max_freq_khz.front() + cpu_max_freq_khz.back()) / 2;
    if (mid_max_freq_khz == cpu_max_freq_khz.back())
        return 0;

    // the first core slower than the midpoint marks the start of the little cluster
    for (int i = 0; i < cpu_count; i++)
    {
        if (cpu_max_freq_khz[i] < mid_max_freq_khz)
        {
            *little_cluster_offset = i;
            break;
        }
    }

    return 0;
}
#endif // __ANDROID__
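// Worked example (editorial): on a hypothetical octa-core big.LITTLE SoC with
// four 2000000 kHz cores (ids 4-7) and four 1400000 kHz cores (ids 0-3),
// sort_cpuid_by_max_frequency() rearranges cpuids to [4, 5, 6, 7, 0, 1, 2, 3]
// and sets *little_cluster_offset = 4: the midpoint frequency is 1700000 kHz,
// and index 4 holds the first core below it.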

static int g_powersave = 0;

inline int get_cpu_powersave()
{
    return g_powersave;
}

inline int set_cpu_powersave(int powersave)
{
#ifdef __ANDROID__
    static std::vector<int> sorted_cpuids;
    static int little_cluster_offset = 0;

    if (sorted_cpuids.empty())
    {
        // 0 ~ g_cpucount-1
        sorted_cpuids.resize(g_cpucount);
        for (int i = 0; i < g_cpucount; i++)
        {
            sorted_cpuids[i] = i;
        }

        // sort descending by max frequency, big cores first
        sort_cpuid_by_max_frequency(sorted_cpuids, &little_cluster_offset);
    }

    if (little_cluster_offset == 0 && powersave != 0)
    {
        powersave = 0;
        fprintf(stderr, "SMP cpu powersave not supported\n");
    }

    // prepare affinity cpuids
    std::vector<int> cpuids;
    if (powersave == 0)
    {
        cpuids = sorted_cpuids;
    }
    else if (powersave == 1)
    {
        cpuids = std::vector<int>(sorted_cpuids.begin() + little_cluster_offset, sorted_cpuids.end());
    }
    else if (powersave == 2)
    {
        cpuids = std::vector<int>(sorted_cpuids.begin(), sorted_cpuids.begin() + little_cluster_offset);
    }
    else
    {
        fprintf(stderr, "powersave %d not supported\n", powersave);
        return -1;
    }

#ifdef _OPENMP
    // set affinity for each worker thread
    int num_threads = cpuids.size();
    omp_set_num_threads(num_threads);
    std::vector<int> ssarets(num_threads, 0);
    #pragma omp parallel for
    for (int i = 0; i < num_threads; i++)
    {
        ssarets[i] = set_sched_affinity(cpuids);
    }
    for (int i = 0; i < num_threads; i++)
    {
        if (ssarets[i] != 0)
        {
            return -1;
        }
    }
#else
    int ssaret = set_sched_affinity(cpuids);
    if (ssaret != 0)
    {
        return -1;
    }
#endif

    g_powersave = powersave;

    return 0;
#elif __IOS__
    // thread affinity not supported on ios
    (void)powersave;
    return -1;
#else
    // TODO
    (void)powersave; // avoid unused parameter warning
    return -1;
#endif
}
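// Note (editorial): on Android builds with _OPENMP, set_cpu_powersave() also
// resizes the OpenMP thread pool to match the selected cluster, overriding any
// earlier set_omp_num_threads() call. A hypothetical caller switching between
// battery and performance modes might simply do:
//
//   void on_battery_low() { dface::set_cpu_powersave(1); } // little cores only
//   void on_plugged_in()  { dface::set_cpu_powersave(0); } // all cores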

inline int get_omp_num_threads()
{
#ifdef _OPENMP
    return omp_get_num_threads();
#else
    return 1;
#endif
}

inline void set_omp_num_threads(int num_threads)
{
#ifdef _OPENMP
    omp_set_num_threads(num_threads);
#else
    (void)num_threads;
#endif
}

inline int get_omp_dynamic()
{
#ifdef _OPENMP
    return omp_get_dynamic();
#else
    return 0;
#endif
}

inline void set_omp_dynamic(int dynamic)
{
#ifdef _OPENMP
    omp_set_dynamic(dynamic);
#else
    (void)dynamic;
#endif
}

} // namespace dface

#endif // DFACE_CPU_H