Energy aware Compilation for DSPs with SIMD Instructions
Markus Lorenz
Lars Wehmeyer
Thorsten Dräger
University of Dortmund Dept. of Computer Science 12 44221 Dortmund, Germany
University of Dortmund Dept. of Computer Science 12 44221 Dortmund, Germany
[email protected]
[email protected]
Techn. Universität Dresden Vodafone Chair For Mobile Communication Systems 01062 Dresden, Germany
ABSTRACT
! " #$#
%
!
% ! & ' !
Categories and Subject Descriptors
General Terms
Keywords
! " # 1.
INTRODUCTION
$ % &# '()#
" * *+
" + ! " ( # , ! - ) " )# . / 0 $ 0.% , 1
Permission to make digital or hard copies of all or part of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. To copy otherwise, to republish, to post on servers or to redistribute to lists, requires prior specific permission and/or a fee. LCTES’02–SCOPES’02, June 19-21, 2002, Berlin, Germany. Copyright 2002 ACM 1-58113-527-0/02/0006 ...$5.00.
2
[email protected]
" # " ! )#
3 ! # " ! " # # " ) 0 & "+ " ! " " ") (" + " " ( " $4 5 " % - , 6789: 8 " , ,:999 89 + ; "+ " ( $ 5 " % 4% " ! ! ) ) ( + " ) # ) ( ! + " , + # " -
?" ) - * " , =>4 ( # " , - ) ) )+ - $6% $%
$/% $6.% 0 " ) - # @+ " " ) " ) - " Æ $" # % " ) - # )# " A " # ! " " " # ! # " "" ! " " " (" ! " ! " " # ! )
machine independent C-Src
Lance2 front end
Lance2IR
Lance2 to GeLIR
standard optimizations
machine dependent GeLIR
! vectorization ! zero overhead hardware loops ! code generation
GeLIR to Asm
Asm
GeLIR to Sim
Sim
energy data
$.6.% + # ") # " )# ) , "" ! , ( ! " ! - # ) ! " " " (" + $ % ! " , " " # ! ) B / ! - ! A C "" 7 "" ! #
& " ) " )# :9 " * .4/ " + C )
" ) .4/ " "" + & )# )# ! .4/ " " 6 ! " 6 " + ) )# " " ! " " + ) ( &# .4/ " + ) ) ( # ) #
# " ! " " * #
2.
3. TARGET ARCHITECTURE 3.1 M3-DSP
, + $* 8% ) " ) "" ; , " " " " "" ! " + " " # " " "" ! Wide Data Memory (Group Memory) (16 x 16) bit Intermediate Register
ABCD
MAC ALU Accu 0
ABCD
MAC ALU Accu 1
... slice
Local Communication
Inter Communication Network Local Communication
! " " " ! - * : * $ 4 8 :B :% 6 " + " $/% "+ 4 8+/ "" $.4/% D ! (
+ " " "+ " " .4/ ! " " + - ) #" ) " ( " ) + ) " " # " ) " * .4/ " $ ) " $ % " % " ) ) ! " " " )# "+
" * , ) " # " + ! " & - " (" (" + ! " *# # ! " " ) # " " .4/ + " Æ C
Local Communication
COMPILER FRAMEWORK
ABCD
MAC ALU Accu 15
, ) ) " ) " + " $ 9% " "
" " + :7 " " + C " " # + " + :7+) ! " :7 ! , " ! ) " )# "" + " * + ! - , " + " " ! ) $ E * 8% 3 - # ) ! " + , " ) )# ( " 8B7 " ! " ! ) ( 8;7; 0 + + "" $4A% + ! ! " " " + 3.2 Instruction level energy cost model
" # + C & 3 ! " + !
") + , ) " " & - )# & + 1 # " ! + " ( ) # ! " 1 # # ! # " # " "" # ) ) )# " 0 ,! 8 " # ) # " $ % !
$ % ! " & - " E ,! ! - ! ) * # + ! " + " B # ! C 8F " + ! ! ( ! " (" # ! * ) # " $ 5 % " + ! # " $+B " % ) " " :7 " " 0 ( ! " # # " "" ! (" ) " # " # 0 (" " * ! ! " # " $ :7 ( :7 )% " * "
" ! " * * " # , ) ( "+ & ! ) " ) #
! ! ! ) C ! " # " )! C
" , " # " ) )# " - 4. LOOP OPTIMIZATIONS
! " " + ,# " " - " " : (" ! " + 8 ) ) ) " + .4/ ! - , # .4/ " ! ) - * " + 4.1 Vectorization
, (" " " " # -
") " + , (" ! )# ) # ) ) 6 " ) "" ,
" )# # , " # " ! - " # " ) ! , # * " !
) " ) " - ! ) " ( )# " 88 :D # )# " ) ! " 6 6 3 ! )# " " ! " ) & " " " :8 1/6 ,9 " " -
8 "+ ) " ! ) 3 ! "( & # " & ! " "( # :8 : " " " ) ) - 3 " )# " , " ) )# ) " " " + ! ) ( " -
" , "" ) :7 - ? ! + " )# ! " " " ) , "" (" "( # & , :7 )! : ! + ! ) # :7 ) ) - )# " # "( " + 3 ! ""# ) " ! ) , + " # + " # " # ) ! " + # " +" " E ! (" + " " ¯ , + " " # + "+ " :7 ! , !
$" % # " ! ) ¯ , + " 0 " ) " " $ 9% ! " > # ) "" ) " ! " ( , ! )# " ! " + ) " ! "+
* .4/ " , ! " ) , + " ) - " )# " " (" ! " * E " !
)! ! " ./ .4/ " ! + + # ! ! " , + ) " * " !# + " )! ! " " ! " # ! # : 6 " " # ) C # + # " !)# "+
* # # ) # " )# " " ) + ) #
:G 8 " * " + , - * ) ) - "
" ) ! " ) " )! ) # + #
" )# " Æ 7 " ) ! ! " ' ! + " 6 - " )! ) + #
# " ) "+ " - ) B 6 " .4/ " ! ) @ ! # 0 " ! 6 " # 7
0 % $ "* % .4/ (" " , + " " " $ - ) % 4 " ) !# ! " * " * .4/ " $ * %% ! ) + " 0 " ! " ! ) " $ % "+ ! ! - " @ " # # " 0# " * " " " ! " * % ?" " " ) ( ) " ) ! ! " ( ! .4/ ) + " " " + " 5.
ENERGY AWARE CODE GENERATION
" & " + )# " Æ ! " ( # " & + " ) 87 " " ) C " " :; + & # " " , ) & + " ) # &
! ) " " " " "
! )# " , # ! " # " # ) # ) ) "# +
" " # * ) & " " 0 " " " # !
+ 0 "+ " ) + & 3 ! " " ) " " "( " ) , - " "+ # ! ") * " " " " # . $. % " "( "+ " ) )# " +
$ : :: !% . " + " " " ) , " )# ! ) , +
) " " ) , * ) ) $ + % * " " ) # " ) " ) & + , " ) " " " 0
& + " " ) - " " " ) ! " : ) - $ + " % # ! " $ % ) ) - $.6.% ! ) :; 8 . #
& (
6.
EXPERIMENTAL RESULTS
! (" " "+ ! " $=>4% ! # ! .6. " " ! .6. .4/ ! # + rel. code size in %
250 200
unoptimized ZOL
150
vector 100
vector+ZOL vector+ZOL+GCG
50
lms
dot_product_2
dot_product_16
n_real_updates
example
0
$
, ) - - ) 8; ! ( " ) #
#
)# " + ) G ! " " * " & )# ) "+ " ! "
120 unoptimized
100
ZOL
80
vector
60
vector+ZOL
40
vector+ZOL+GCG
20
lm s do t_ pr od uc t_ do 2 t_ pr od uc t_ 16
n_ re al
ex am
_u pd at es
0
pl e
rel. #memory accesses in %
140
%
rel. #exec. cycles in %
160 140 120
unoptimized
100
ZOL
80
vector
60
vector+ZOL
40
vector+ZOL+GCG
20
lm s do t_ pr od uc t_ do 2 t_ pr od uc t_ 16
n_ re al
_u pd at es
pl e
0
ex am
) ) - )# - ( + + # ! ) " " " " ! " .6. " " * .4/ ! ) - * .4/ * " , " )+ ? # " " ) " " " 3 .6. ) +
C )? )# " # ) * ¯ ( ) )# + ) ( # )
# " ! * ¯ 4 " ! " , "" ( $8 :7% )# ) - ' ) # ) "
120 100 80
unoptimized
60
ZOL
40
ZOL+GCG
20 3
1 m at rix
m at 1x
fir
0 co nv ol ut io n
rel. #exec. cycles in %
) '
* +, '( " #
( # " , .6. $ H=>4+ H.6.% " F * G D =>4 ) + - ! ) )# " , ) ( # )# - =>4 )# :7F 8BF ) - , * ! " # " $ * D% # ! $=>4H.6.% # " # + " )# BF " )# # =>4 , )# ;F ! ) #
) - 7.
CONCLUSIONS
, ! ) # " " "" " ! , + + " # " # " # ) ! ) - (+ " ") " " " " "+ ( ! # " # "" ! " " " ("
unoptimized
60
ZOL
40
ZOL+GCG
20
1
3
0
m at rix
lm s do t_ pr od uc t_ do 2 t_ pr od uc t_ 16
ex am
n_ re al
_u pd at es
vector+ZOL+GCG
80
m at 1x
vector+ZOL
100
fir
vector
rel. energy consumption in %
ZOL
120
co nv ol ut io n
unoptimized
pl e
rel. energy consumption in %
180 160 140 120 100 80 60 40 20 0
- +, ' ! " ! ( # + " ) "" ! " + " " " $.6.% ) # .6. ") " # ! / ! " # " &# , ) - # " )# 8F ) - )# :DF 3 ! " ! ) " # ! ) - 0 ! - ! ! ( ) " ! ) + )# " & " + ! - ! ! " " & " # !# 8. ACKNOWLEDGMENTS
, ! - - 3 ! " " "+ 9. ADDITIONAL AUTHORS
/ 4" $ # 1 # , # # $)$$*+,&',"*% 10. REFERENCES
: , EI - ! " # $ >( 1 # :DD7 8 0 E 4 . > J " 6 " , 3+ 6 " " % & 87$% :DD E / 4" 6 6 0( + '( " )" * :DDD J " + &,- 1 # , J :DD; B , I . 0! ME # , !
+ / & 8 ! & & $!& 899: D . 4 ! 4 /" $.4/% :8+!!! + N NN :9 3 C , L @ . E > ) > A A- 3 # @ # "" +" * + $ % 1 " 4 !!! # " & " :G K :B 899: :: J 3 3 " 4 " & , :DD8 :8 L 6 " , &