本文共 9342 字,大约阅读时间需要 31 分钟。
一直以来都在DM642平台下写程序,而自己所做的大部分工作也和优化相关。为了更加清楚地理解TI CCS编译器的优化规则,做了以下试验:
分别写了7个相同功能的函数,操作相同, 但实现的方式不一样,自然运行的时间也不一样:
----------------------debug mode----------------------------------
combine1(v, &dest) Time elapse: 0.028967 ms.
combine2(v, &dest) Time elapse: 0.024751 ms. combine3(v, &dest) Time elapse: 0.023541 ms. combine4(v, &dest) Time elapse: 0.012635 ms. combine4p(v, &dest) Time elapse: 0.012090 ms. combine5p(v, &dest) Time elapse: 0.007475 ms. combine6(v, &dest) Time elapse: 0.011119 ms. combine6p(v, &dest) Time elapse: 0.006823 ms.
combine1 > combine2 > combine3 > combine4 > combine4p > combine6 > combine5p > combine6p
由于没有开优化,在debug模式下运行,这个结果和自己预期的比较一致。从combine1到combine6p, 依次减少了函数的调用和对存储器的访问,并做了循环展开等等,自然其耗时越来越短。然而当我把-o3优化打开时,结果令我费解:
------------------release mode---------------------------------------
combine1(v, &dest) Time elapse: 0.011491 ms.
combine2(v, &dest) Time elapse: 0.009616 ms. combine3(v, &dest) Time elapse: 0.009603 ms. combine4(v, &dest) Time elapse: 0.003884 ms. combine4p(v, &dest) Time elapse: 0.004096 ms. combine5p(v, &dest) Time elapse: 0.005573 ms. combine6(v, &dest) Time elapse: 0.005120 ms. combine6p(v, &dest) Time elapse: 0.004987 ms. 最值得注意的变化是combine4和combine4p:它们下降的幅度最大,并且成为了耗时最短的函数。仔细看一下combine4中的核心循环:
for (i = 0; i < len; i++) { x = x OPER data[i]; }
这里并没有任何人为的循环展开。而正是因为如此编译器对它所进行的优化程度甚至超过了人为循环展开的函数。
这使得它一下子成为了最快的函数。而且combine4与combine4p的唯一区别在于一个使用数组,一个用指针。
而在这里也证明了一点:编译器对数组的优化程度大于指针。尽管差别不是太大。
打开profile查看:
0:0x800202c0-0x80020374,combine4,125-143:test.c, function, 1,1580,1580,1430,1430,7,7,7,7,
0:0x800207b4-0x80020980,combine6,217-239:test.c, function, 1,2117,2117,2021,2021,12,12,9,9,
0:0x80020344-0x80020354,combine4,138-141:test.c, loop, 124,1102,1102,978,978,5,5,4,4, (循环124次,cache hit 5/(5+4) = 0.56) 0:0x80020880-0x80020898,combine6,227-231:test.c, loop, 60,1041,1041,981,981,6,6,4,4, (循环60次,cache hit 4/10 = 0.4)
看来编译器做循环展开比我们人为地做要好。 另外,combine6p做了很大的循环展开,但并没有像想象的那样有几何级数的提速,原因是循环展开到一定程度,内存的访问等待时间成为瓶颈。
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <math.h> 4 #include <csl.h> 5 #include <csl_cache.h> 6 #include <time.h> 7 #include "myMath.h" 8 #include "HKY_testTime.h" 9 10 #define IDENT 0 11 #define OPER + 12 #define VEC_LEN 1024 13 14 typedef int data_t; 15 16 typedef struct { 17 int len; 18 data_t *data; 19 }vec_rec, *vec_ptr; 20 21 vec_ptr new_vec(int len); 22 int get_vec_element(vec_ptr v, int index, data_t *dest); 23 int vec_length(vec_ptr v); 24 25 void combine1(vec_ptr v, data_t *dest); 26 27 vec_ptr new_vec(int len) 28 { 29 vec_ptr result = (vec_ptr)malloc(sizeof(vec_rec)); 30 if (!result) 31 { 32 return NULL; 33 } 34 35 result->len = len; 36 37 if (len > 0) 38 { 39 int i; 40 data_t *data = (data_t *)malloc(len * sizeof(data_t)); 41 if (!data) 42 { 43 free((void *)result); 44 return NULL; 45 } 46 result->data = data; 47 48 for (i = 0; i < len; i++) 49 { 50 data[i] = i; 51 } 52 } 53 else 54 { 55 result->data = NULL; 56 } 57 return result; 58 } 59 60 int get_vec_element(vec_ptr v, int index, data_t *dest) 61 { 62 if (index < 0 || index >= v->len) 63 { 64 return 0; 65 } 66 *dest = v->data[index]; 67 return 1; 68 } 69 70 int vec_length(vec_ptr v) 71 { 72 return v->len; 73 } 74 75 void combine1(vec_ptr v, data_t *dest) 76 { 77 int i; 78 79 *dest = IDENT; 80 81 for (i = 0; i < vec_length(v); i++) 82 { 83 data_t val; 84 get_vec_element(v, i, &val); 85 *dest = *dest OPER val; 86 } 87 } 88 89 void combine2(vec_ptr v, data_t *dest) 90 { 91 int i; 92 int len; 93 94 *dest = IDENT; 95 len = vec_length(v); 96 97 for (i = 0; i < len; i++) 98 { 99 data_t val; 100 get_vec_element(v, i, &val); 101 *dest = *dest OPER val; 102 } 103 } 104 105 void combine3(vec_ptr v, data_t *dest) 106 { 107 int i; 108 int len; 109 data_t x; 110 111 *dest = IDENT; 112 x = IDENT; 113 114 len = vec_length(v); 115 116 for (i = 0; i < len; i++) 117 { 118 data_t val; 119 get_vec_element(v, i, &val); 120 x = x OPER val; 121 } 122 *dest = x; 123 } 124 125 void combine4(vec_ptr v, data_t 
*dest) 126 { 127 int i; 128 int len; 129 data_t x; 130 data_t *data; 131 132 *dest = IDENT; 133 x = IDENT; 134 135 len = vec_length(v); 136 data = v->data; 137 138 for (i = 0; i < len; i++) 139 { 140 x = x OPER data[i]; 141 } 142 *dest = x; 143 } 144 145 void combine4p(vec_ptr v, data_t *dest) 146 { 147 int i; 148 int len; 149 data_t x; 150 data_t *data; 151 data_t *dend; 152 153 *dest = IDENT; 154 x = IDENT; 155 156 len = vec_length(v); 157 data = v->data; 158 dend = data + len; 159 160 for (; data < dend; data++) 161 { 162 x = x OPER (*data); 163 } 164 *dest = x; 165 } 166 void combine5(vec_ptr v, data_t *dest) 167 { 168 int i; 169 int len; 170 data_t x; 171 data_t *data; 172 173 *dest = IDENT; 174 x = IDENT; 175 176 len = vec_length(v); 177 data = v->data; 178 179 for (i = 0; i < len; i += 2) 180 { 181 x = x OPER data[i]; 182 x = x OPER data[i+1]; 183 } 184 *dest = x; 185 } 186 void combine5p(vec_ptr v, data_t *dest) 187 { 188 int i; 189 int len; 190 data_t x; 191 data_t *data; 192 data_t *dend; 193 data_t *dlimit; 194 195 *dest = IDENT; 196 x = IDENT; 197 198 len = vec_length(v); 199 data = v->data; 200 dend = data + len; 201 dlimit = dend - 7; 202 203 for (; data < dlimit; data += 8) 204 { 205 x = x OPER data[0] OPER data[1] OPER data[2] OPER data[3] 206 OPER data[4] OPER data[5] OPER data[6] OPER data[7]; 207 } 208 209 for (; data < dend; data++) 210 { 211 x = x OPER data[0]; 212 } 213 214 *dest = x; 215 } 216 217 void combine6(vec_ptr v, data_t *dest) 218 { 219 int length = vec_length(v); 220 int limit = length - 1; 221 int i; 222 223 data_t *data = v->data; 224 data_t x0 = IDENT; 225 data_t x1 = IDENT; 226 227 for (i = 0; i < limit; i += 2) 228 { 229 x0 = x0 OPER data[i]; 230 x1 = x1 OPER data[i+1]; 231 } 232 233 for (; i < length; i++) 234 { 235 x0 = x0 OPER data[i]; 236 } 237 238 *dest = x0 OPER x1; 239 } 240 241 void combine6p(vec_ptr v, data_t *dest) 242 { 243 int i; 244 int len; 245 data_t x; 246 data_t *data; 247 data_t *dend; 248 data_t *dlimit; 249 
250 *dest = IDENT; 251 x = IDENT; 252 253 len = vec_length(v); 254 data = v->data; 255 dend = data + len; 256 dlimit = dend - 15; 257 258 for (; data < dlimit; data += 16) 259 { 260 x = x OPER data[0] OPER data[1] OPER data[2] OPER data[3] OPER 261 data[4] OPER data[5] OPER data[6] OPER data[7] OPER 262 data[8] OPER data[9] OPER data[10] OPER data[11] OPER 263 data[12] OPER data[13] OPER data[14] OPER data[15]; 264 } 265 266 for (; data < dend; data++) 267 { 268 x = x OPER data[0]; 269 } 270 271 *dest = x; 272 } 273 274 #define CLIP(X,AMIN,AMAX) (((X)<(AMIN)) ? (AMIN) : ((X)>(AMAX)) ? (AMAX) : (X)) 275 276 #define CLIP1(Y, X, AMIN, AMAX) if (X < AMIN) { Y = AMIN;} / 277 else if (X > AMAX) {Y = AMAX;}/ 278 else {Y = X;} 279 280 void genCosTable() 281 { 282 #define PI (3.1415926) 283 #define ROUND(x) ((x) - floor(x)) > 0.5 ? ((int)(x) + 1) : (int)(x) 284 double temp; 285 int temp_d; 286 int i; 287 double theta; 288 289 for (i = 0; i < 360 * 8; i++) 290 { 291 theta = (double)(i) * PI / (8 * 180); 292 temp = sin(theta) * 65536; 293 temp_d = ROUND(temp); 294 printf("temp = %d /n", temp_d); 295 } 296 297 } 298 299 int test_combine() 300 { 301 vec_ptr v = new_vec(128); 302 data_t dest; 303 HKY_CSL_INIT(); 304 305 CALL_FUN_TIME(combine1(v, &dest)); 306 CALL_FUN_TIME(combine2(v, &dest)); 307 CALL_FUN_TIME(combine3(v, &dest)); 308 CALL_FUN_TIME(combine4(v, &dest)); 309 CALL_FUN_TIME(combine4p(v, &dest)); 310 CALL_FUN_TIME(combine5(v ,&dest)); 311 CALL_FUN_TIME(combine5p(v, &dest)); 312 CALL_FUN_TIME(combine6(v, &dest)); 313 CALL_FUN_TIME(combine6p(v, &dest)); 314 return 0; 315 } 316 317 318 #define N 128 319 void mm_ijk(short *c, short *b, short *a) 320 { 321 int i, j, k; 322 short sum; 323 324 short (*C)[N] = (short (*)[N])c; 325 short (*B)[N] = (short (*)[N])b; 326 short (*A)[N] = (short (*)[N])a; 327 328 for (i = 0; i < N; i++) 329 { 330 for (j = 0; j < N; j++) 331 { 332 sum = 0; 333 for (k = 0; k < N; k++) 334 { 335 sum += A[i][k] * B[k][j]; 336 } 337 C[i][j] += sum; 338 
} 339 } 340 341 } 342 343 int main() 344 { 345 test_combine(); 346 return 0; 347 }
--------------------header files-------------------------------------
#ifndef _HKY_TEST_TIME_H_
#define _HKY_TEST_TIME_H_

/*
 * Timing helpers: HKY_CSL_INIT() sets up the chip support library, cache and
 * timer; CALL_FUN_TIME(call) runs `call` once and prints its elapsed time.
 */

#include <stdio.h>
#include <csl.h>
#include <csl_timer.h>
#include <csl_cache.h>
#include <time.h>

//#define USE_CLOCK_FUNC //if you want to use clock(), open this

/*
 * NOTE(review): these objects are defined (not just declared) in the header,
 * so including it from more than one translation unit will produce duplicate
 * definitions -- confirm it is only ever included once.
 */
TIMER_Config MyConfig = {
    0x00000200, /* ctl */
    0xFFFFFFFF, /* prd */
    0x00000000  /* cnt */
};
TIMER_Handle myhTimer;
double start_time, end_time, cur_time;

/*
 * Initialise CSL, set the L2 cache mode, enable caching for the two EMIFA
 * regions named below, then open, configure, zero and start timer device 0.
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#define HKY_CSL_INIT() \
    do { \
        CSL_init(); \
        CACHE_setL2Mode(CACHE_256KCACHE); \
        CACHE_enableCaching(CACHE_EMIFA_CE00); \
        CACHE_enableCaching(CACHE_EMIFA_CE01); \
        myhTimer = TIMER_open(TIMER_DEV0, 0); \
        TIMER_config(myhTimer, &MyConfig); \
        TIMER_setCount(myhTimer, 0); \
        TIMER_start(myhTimer); \
    } while (0)

/*
 * CALL_FUN_TIME(fun_arg): issue the L2 cache clean, evaluate fun_arg once,
 * and print "<call text>\tTime elapse: <ms> ms.". The call text is passed as
 * a %s argument rather than pasted into the format string, so a '%' in the
 * call cannot be misread as a conversion.
 * NOTE(review): the scale factors 1.33e-5 (timer ticks) and 1.67e-6
 * (clock() units) presumably assume a 600 MHz CPU with the timer running at
 * CPU/8 -- confirm for the target board.
 */
#ifndef USE_CLOCK_FUNC
#define CALL_FUN_TIME(fun_arg) \
    do { \
        CACHE_clean(CACHE_L2ALL, (void *)0, 0); \
        TIMER_setCount(myhTimer, 0); \
        start_time = TIMER_getCount(myhTimer); \
        fun_arg; \
        end_time = TIMER_getCount(myhTimer); \
        cur_time = (end_time - start_time) * 1.33e-5; \
        printf("%s\tTime elapse: %f ms.\n", #fun_arg, cur_time); \
    } while (0)
#else
#define CALL_FUN_TIME(fun_arg) \
    do { \
        CACHE_clean(CACHE_L2ALL, (void *)0, 0); \
        start_time = clock(); \
        fun_arg; \
        end_time = clock(); \
        cur_time = (end_time - start_time) * 1.67e-6; \
        printf("%s\tTime elapse: %f ms.\n", #fun_arg, cur_time); \
    } while (0)
#endif //USE_CLOCK_FUNC

#endif //_HKY_TEST_TIME_H_
转载地址:http://krbmb.baihongyu.com/