如果要使用 ARM11 的 VFP 功能, 在 compile 時加上 -mfpu=vfp -mfloat-abi=softfp
我是使用 debian for ARM , gcc 4.4.2
[C]# cat f.c
/* Minimal example: a single single-precision multiply, compiled only so the
 * generated floating-point instructions can be inspected with objdump. */
int main()
{
float f1=1.2, f2=1.3;
f1 = f2*f1; /* the product is stored back into f1 and never read */
}[/C]
# gcc -mfpu=vfp -mfloat-abi=softfp -c f.c
# objdump -d f.o
f.o: file format elf32-littlearm
Disassembly of section .text:
00000000 <main>:
0: e52db004 push {fp} ; (str fp, [sp, #-4]!)
4: e28db000 add fp, sp, #0
8: e24dd00c sub sp, sp, #12
c: eddf7a09 vldr s15, [pc, #36] ; 0x24
10: ed4b7a03 vstr s15, [fp, #-12]
14: eddf7a08 vldr s15, [pc, #32]
18: ed4b7a02 vstr s15, [fp, #-8]
1c: ed1b7a03 vldr s14, [fp, #-12]
20: ed5b7a02 vldr s15, [fp, #-8]
24: ee677a27 vmul.f32 s15, s14, s15
28: ed4b7a03 vstr s15, [fp, #-12]
2c: e28bd000 add sp, fp, #0
30: e8bd0800 pop {fp}
34: e12fff1e bx lr
38: 3f99999a .word 0x3f99999a
3c: 3fa66666 .word 0x3fa66666
有 vldr, vstr, vmul.f32 等 instruction .
# cat test2.c
[C]
#include <unistd.h>
#include <stdio.h>
/*
 * Load 32 consecutive floats from `arrays` into the VFP single-precision
 * registers s0-s31 with one load-multiple instruction.
 *
 * arrays: buffer of at least 32 floats; it is only read.
 *
 * The registers are deliberately not listed as clobbers: the caller relies
 * on the loaded values still being in s0-s31 after this function returns.
 * NOTE(review): this only holds as long as the compiler emits no VFP code
 * of its own between the load and the caller's next asm statement.
 */
void vfp_regs_load(float arrays[32])
{
    /* The "r" operand only passes the address; the "memory" clobber tells
       the compiler that the instruction reads the buffer behind it, so
       pending stores to the buffer are completed before the asm runs. */
    asm volatile("fldmias %0, {s0-s31}\n"
                 :
                 : "r"(arrays)
                 : "memory");
}
/*
 * Store the VFP single-precision registers s0-s31 into `arrays` with one
 * store-multiple instruction.
 *
 * arrays: buffer of at least 32 floats; all 32 elements are overwritten.
 */
void vfp_regs_save(float arrays[32])
{
    /* The "r" operand only passes the address; the "memory" clobber tells
       the compiler that the instruction writes the buffer behind it, so
       later reads of the buffer are not served from stale cached values. */
    asm volatile("fstmias %0, {s0-s31}"
                 :
                 : "r"(arrays)
                 : "memory");
}
/*
 * Print 32 floats to stdout as four rows of eight values.
 *
 * Each row is preceded by a newline, every value is printed with "%f"
 * followed by one space, and a final newline ends the output.
 *
 * array: buffer of at least 32 floats; it is only read.
 */
void print_array(float array[32])
{
    int i;

    for (i = 0; i < 32; i++) {
        /* start a new row before every eighth value */
        if (i % 8 == 0) {
            printf("\n");
        }
        /* exactly one argument for the single %f conversion; passing the
           int index as well would be undefined behaviour */
        printf("%f ", array[i]);
    }
    printf("\n");
}
/*
 * Demonstrates VFP short-vector mode with three fadds instructions:
 *   1. destination s0                      -> plain scalar add
 *   2. destination s8, second operand s0   -> vector + scalar
 *   3. destination s8, operands s16, s24   -> vector + vector
 * Before each case s0-s31 are loaded with 1.0 .. 32.0; afterwards the
 * registers are dumped and printed so the affected ones can be seen.
 */
int main()
{
    unsigned int fpscr;
    float f1 = 1.0f, f2 = 1.0f;
    float farrays[32], farrays2[32];
    int i;

    /* Build the test data 1.0 .. 32.0 while the FPU is still in its
       default mode, so this compiler-generated arithmetic is not executed
       as short-vector operations. */
    for (i = 0; i < 32; i++)
        farrays[i] = f1 + f2 * (float)i;

    /* 0x130000 sets FPSCR bits 16, 17 and 20: the LEN field (bits 18:16)
       becomes 3, i.e. vector length 4, and bit 20 lies in the STRIDE field
       (bits 21:20).
       NOTE(review): the sample output shows every second register being
       written (stride 2); confirm this STRIDE encoding against the VFP11
       TRM. */
    fpscr = 0x130000;
    asm volatile("fmxr fpscr, %0\n"
                 :
                 : "r"(fpscr));
    /* Read FPSCR back; the value is not used any further. */
    asm volatile("fmrx %0, fpscr\n"
                 : "=r"(fpscr));

    /* NOTE(review): the printf/print_array calls below still run with a
       non-zero vector length, although the ARM procedure call standard
       expects LEN and STRIDE to be zero at function calls. */
    vfp_regs_save(farrays2);
    vfp_regs_load(farrays);
    vfp_regs_save(farrays2);

    printf("\n1:ScalarA op ScalarB->ScalarD");
    vfp_regs_load(farrays);
    asm volatile("fadds s0, s1, s2");
    vfp_regs_save(farrays2);
    print_array(farrays2);

    printf("\n2:VectorA[?] op ScalarB->VectorD[?]");
    vfp_regs_load(farrays);
    asm volatile("fadds s8, s24, s0");
    vfp_regs_save(farrays2);
    print_array(farrays2);

    printf("\n3:VectorA[?] op VectorB[?]->VectorD[?]");
    vfp_regs_load(farrays);
    asm volatile("fadds s8, s16, s24");
    vfp_regs_save(farrays2);
    print_array(farrays2);

    return 0;
}[/C]
Vector Instruction 的範例
# ./a.out
1:ScalarA op ScalarB->ScalarD
5.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000
9.000000 10.000000 11.000000 12.000000 13.000000 14.000000 15.000000 16.000000
17.000000 18.000000 19.000000 20.000000 21.000000 22.000000 23.000000 24.000000
25.000000 26.000000 27.000000 28.000000 29.000000 30.000000 31.000000 32.000000
2:VectorA[?] op ScalarB->VectorD[?]
1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000
26.000000 10.000000 28.000000 12.000000 30.000000 14.000000 32.000000 16.000000
17.000000 18.000000 19.000000 20.000000 21.000000 22.000000 23.000000 24.000000
25.000000 26.000000 27.000000 28.000000 29.000000 30.000000 31.000000 32.000000
3:VectorA[?] op VectorB[?]->VectorD[?]
1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000
42.000000 10.000000 46.000000 12.000000 50.000000 14.000000 54.000000 16.000000
17.000000 18.000000 19.000000 20.000000 21.000000 22.000000 23.000000 24.000000
25.000000 26.000000 27.000000 28.000000 29.000000 30.000000 31.000000 32.000000
1:ScalarA op ScalarB->ScalarD
單純的二個浮點運算
2:VectorA[?] op ScalarB->VectorD[?]
一個 Vector + Scalar 運算 (fadds s8, s24, s0)
3:VectorA[?] op VectorB[?]->VectorD[?]
Vector + Vector 運算 (fadds s8, s16, s24)
Ref.
ARM VFP的一点体会 寫的不錯, 範例很好, 就.. 照作一次就 OK 了
VFP11™ Vector Floating-point Coprocessor Technical Reference Manual
for ARM1136JF-S processor r1p5
發佈留言