@N
Size: a a a
@N
@N
IL
CC
YB
CC
YB
@N
AZ
CC
IL
@N
CC
@N
YB
@N
@N
p
@N
YB
int i,j;//, ii=0,jj=0;
// variables for vector section
int vindexm [8]={0, MAX1, MAX1*2, MAX1*3, MAX1*4, MAX1*5, MAX1*6, MAX1*7 };
__m256i vindex = _mm256_load_si256((__m256i *) &vindexm[0]);
__m256 vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;
for(i=0; i<MAX1; i+=8){
for(j=0; j<MAX2; j+=8){
//loading from columns
vec1 = _mm256_i32gather_ps (&a[i][j+0],vindex,4);
vec2 = _mm256_i32gather_ps (&a[i][j+1],vindex,4);
vec3 = _mm256_i32gather_ps (&a[i][j+2],vindex,4);
vec4 = _mm256_i32gather_ps (&a[i][j+3],vindex,4);
vec5 = _mm256_i32gather_ps (&a[i][j+4],vindex,4);
vec6 = _mm256_i32gather_ps (&a[i][j+5],vindex,4);
vec7 = _mm256_i32gather_ps (&a[i][j+6],vindex,4);
vec8 = _mm256_i32gather_ps (&a[i][j+7],vindex,4);
//storing to the rows
_mm256_store_ps(&a_tra[j+0][i], vec1);
_mm256_store_ps(&a_tra[j+1][i], vec2);
_mm256_store_ps(&a_tra[j+2][i], vec3);
_mm256_store_ps(&a_tra[j+3][i], vec4);
_mm256_store_ps(&a_tra[j+4][i], vec5);
_mm256_store_ps(&a_tra[j+5][i], vec6);
_mm256_store_ps(&a_tra[j+6][i], vec7);
_mm256_store_ps(&a_tra[j+7][i], vec8);
}
}