Академический Документы
Профессиональный Документы
Культура Документы
Pi19404
March 10, 2014
Contents
Contents
ARM Neon Optimization InterLeaving/De-Interleaving 3
0.1 Introduction . . . . . . . . . . . . . . . . . . . . . . . . . . 3
0.2 ARM Neon . . . . . . . . . . . . . . . . . . . . . . . . . . . . 3
0.3 Deinterleaving and Interleaving channels of Image . . . . . . . 3
0.4 De-InterLeaving . . . . . . . . . . . . . . . . . . . . . . . . 4
0.5 NDK BUILD . . . . . . . . . . . . . . . . . . . . . . . . . . . 5
0.6 InterLeaving . . . . . . . . . . . . . . . . . . . . . . . . . . 9
0.7 Code . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 10
References . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 10
2 | 10
3 | 10
0.4 De-InterLeaving
The de-interleave operation separates adjacent elements in memory into separate registers. The VLD3 instruction de-interleaves the BGR channels of the image, whose values are stored in adjacent memory locations, and separates them into 3 different registers. The result of the vld instruction is then stored to the memory locations pointed to by the destination registers.
vld3_u8 /*This instruction loads the contents of memory location with interleaving of adjacent memory locations .This results in 8 elements of memory being loaded into single 64 bit register and we have 3 such registers as a result of interleaving process. This may be used when the pointer refers to data of type 8 bit signed or unsigned integers */ vst1_u8 //This instruction is used to store contents of 64 bit register to desired memory location.8 simultaneous elements (8x8 =64) constituting the 64 bit register are written to the memory location. void neon_interlace(uint8_t * __restrict d3,uint8_t * __restrict r0,uint8_t * {
4 | 10
int i; uint8_t *s3 = (uint8_t *)d3; for(i=0;i<(width*height)/8;i++) { uint8x8x3_t loaded = vld3_u8(s3); vst1_u8(r0,loaded.val[0]); vst1_u8(r1,loaded.val[1]); vst1_u8(r2,loaded.val[2]); s3=s3+3*8; r0=r0+8; r1=r1+8; r2=r2+8; } }
5 | 10
/opt/android-ndk-r7/toolchains/arm-linux-androideabi-4.4.3/prebuilt/ linux-x86/bin/arm-linux-androideabi-gcc -MMD -MP -MF /home/pi19404/ ARM//obj/local/armeabi-v7a/objs/helloneon/helloneon-intrinsics.o.d -fpic -ffunction-sections -funwind-tables -fstack-protector -D__ARM_ARCH_5__ -D__ARM_ARCH_5T__ -D__ARM_ARCH_5E__ -D__ARM_ARCH_5TE__ -Wno-psabi -march=armv7-a -mfloat-abi=softfp -mfpu=vfp -mthumb -Os -fomit-frame-pointer -fno-strict-aliasing -finline-limit=64 -mfpu=neon -I/usr/local/include -I/media/UBUNTU/repository/OpenVisionLibrary/OpenVision/ -I/opt/android-ndk-r7/sources//android/cpufeatures -I/opt/android-ndk-r7/sources/cxx-stl/gnu-libstdc++/include -I/opt/android-ndk-r7/sources/cxx-stl/gnu-libstdc++/libs/armeabi-v7a/include -I/home/pi19404/ARM//jni -DANDROID -DHAVE_NEON -fPIC -DANDROID
6 | 10
-I/usr/local/include/opencv -I/usr/local/include -I/OpenVision -I/media/UBUNTU/repository/OpenVisionLibrary/OpenVision -fPIC -DHAVE_NEON=1 -ftree-vectorize -mfpu=neon -O3 -mfloat-abi=softfp -ffast-math -Wa,--noexecstack -O3 -DNDEBUG -I/opt/android-ndk-r7/platforms/android-8/arch-arm/usr/include /home/pi19404/ARM//jni/helloneon-intrinsics.c -S
The above command will generate the file helloneon-intrinsics.s in the present directory. A lot of unnecessary instructions can be observed in the assembly code. The assembly-level code corresponding to the functions was optimized and compiled. To compile it again, the debug build output observed from the ndk-build process was modified so that the helloneon-intrinsics.o object file is compiled from helloneon-intrinsics.s, and the helloneon binary file is compiled and linked from all source files.
/opt/android-ndk-r7/toolchains/arm-linux-androideabi-4.4.3/prebuilt/linux-x86/ -MMD -MP -MF \ -fpic -ffunction-sections -funwind-tables -fstack-protector\ -D__ARM_ARCH_5__ -D__ARM_ARCH_5T__ -D__ARM_ARCH_5E__ -D__ARM_ARCH_5TE__ \ -Wno-psabi -march=armv7-a -mfloat-abi=softfp -mfpu=vfp -mthumb -Os -fomit-fram -fno-strict-aliasing -finline-limit=64 -mfpu=neon -I/usr/local/include \ -I/media/UBUNTU/repository/OpenVisionLibrary/OpenVision/ -I/opt/android-ndk-r7/sources//android/cpufeatures -I/opt/android-ndk-r7/sources/cxx-stl/gnu-libstdc++/include -I/opt/android-ndk-r7/sources/cxx-stl/gnu-libstdc++/libs/armeabi-v7a/include \ -I/home/pi19404/ARM//jni -DANDROID -DHAVE_NEON -fPIC -DANDROID -I/usr/local/include/opencv -I/usr/local/include -I/OpenVision \ -I/media/UBUNTU/repository/OpenVisionLibrary/OpenVision -fPIC -DHAVE_NEON=1 -ftree-vectorize -mfpu=neon -O3 -mfloat-abi=softfp -ffast-math -Wa,--noexecstack -O3 -DNDEBUG -I/opt/android-ndk-r7/platforms/android-8/arch-arm/usr/include -c /home/pi19404/ARM/jni/helloneon-intrinsics.s \ -o /home/pi19404/ARM/obj/local/armeabi-v7a/objs/helloneon/helloneon-intrinsic --sysroot=/opt/android-ndk-r7/platforms/android-14/arch-arm/
7 | 10
/home/pi19404/ARM//obj/local/armeabi-v7a/libcpufeatures.a /home/pi19404/ARM//obj/local/armeabi-v7a/libgnustl_static.a /opt/android-ndk-r7/toolchains/arm-linux-androideabi-4.4.3/ prebuilt/linux-x86/bin/../lib/gcc/arm-linux-androideabi/4.4.3/libgcc.a -Wl,--fix-cortex-a8 -Wl,--no-undefined -Wl,-z,noexecstack -L/opt/android-ndk-r7/platforms/android-8/arch-arm/usr/lib -fPIC -llog -ldl -lm -lz -lm -lc -lgcc -Wl,-rpath,'libs/armeabi-v7a' -L/home/pi19404/ARM//jni/../libs/armeabi -llog -Llibs/armebi -Llibs/armeabi-v -lopencv_core -lopencv_imgproc -lopencv_highgui -lopencv_flann -lc -lm -o /home/pi19404/ARM//obj/local/armeabi-v7a/helloneon cp /home/pi19404/ARM//obj/local/armeabi-v7a/helloneon libs/armeabi-v7a
The results of the optimization process are as follows: OPENCV : 15 ms, NEON : 8 ms, NEON OPTIMIZED : 6 ms. Thus a speedup factor of roughly 1.4 over the NEON intrinsics version and a total performance improvement of 2.5x over OpenCV was observed. Thus it can be seen that at least a 2.5x improvement is observed after optimizing the assembly code. This still does not motivate the use of assembly-level coding, since the development effort may outweigh the optimization benefits.
.LCFI0: .LCFI1:
push {r4, r5, r6, r7, r8, r9, sl, fp} @store registers on stack .save {r4, r5, r6, r7, r8, r9, sl, fp} .pad #64 sub mov ldr ldr mul asr sp, sp, #64 @pointer to top of stack
.loop:
r7, r0 r4, [sp, #96] @load function arguments r4 64+8*4 r5, [sp, #100] @load function arguments r5 64+9*4 r6,r4,r5 r6, r6, #3 @divide loop count by 8
# load 8 pixels: vld3.8 {d0-d2},[r7] @load pixels vst1.8 {d0}, [r1] @store interleaved pixels vst1.8 {d1}, [r2]
8 | 10
{d2}, [r3] r7, r7, #24 @increment counter r1, r1, #8 r3, r2, #8 r3, r3, #8 r6, r6, #1 @check loop counter .loop sp, sp, #64 {r4, r5, r6, r7, r8, r9, sl, fp} lr
0.6 InterLeaving
The interleaving operation corresponds to combining 3 independent channels of an image into a multi-channel image. Corresponding elements of the independent channels are stored in adjacent locations in the multi-channel image.
void neon_interleave(uint8_t * __restrict d3,uint8_t * __restrict r0,uint8_t * { int i; uint8x8x3_t v; for(i=0;i<width*height/8;i++) { v.val[0]=vld1_u8(r0); v.val[1]=vld1_u8(r1); v.val[2]=vld1_u8(r2); vst3_u8(d3,v); d3=d3+3*8; r0=r0+8; r1=r1+8; r2=r2+8; } }
The performance is as follows : OPENCV : 9ms NEON OPTIMIZED : 3 ms The interleaving process shows a performance improvement of about 3x.
9 | 10
0.7 Code
The code for the same can be found in the git repository https://github.com/pi19404/OpenVision in the POC/ARM subdirectory. The jni subdirectory consists of the source files as well as the make files. The script generate_assembly.ksh generates the helloneon-intrinsics.s file in the ARM directory. After modifying the file, copy it to the jni sub-directory; compile_assembly.ksh compiles helloneon-intrinsics.s and also the binary file. The binary requires the OpenCV library files, which need to be transferred to the Android mobile device:
adb push libs/armeabi-v7a/ /data/local/tmp/NEON_TEST
References: http://pulsar.webshaker.net/ccc/result.php shows the number of execution cycles of ARM assembly code, which can be used to check the performance of compiler-generated code.
10 | 10