Loop Fusion in Haskell

Loop fusion in Haskell
Roman Leshchinskiy
Programming Languages and Systems

University of New South Wales
What is this about?
What I do
Data Parallel Haskell
compiles nested data-parallel programs to flat data-parallel ones
lots of arrays and collective operations involved
What is this about?
What I do
zipWith (-)
(zipWith (*)
(zipWith (-) (replicate_s segd as1) xs)
(zipWith (-) (replicate_s segd bs1) ys))
(zipWith (*)
(zipWith (-) (replicate_s segd bs2) ys)
(zipWith (-) (replicate_s segd as2) xs))
What is this about?
What I do
What other people do
array programs with lots of collective operations
return . foldl’ hash 5381

. map toLower
. filter isAlpha =<< readFile f
What is this about?
What I do
What other people do
array programs with lots of collective operations
What everybody wants
no temporary arrays
fused loops
C-like speed
Loop fusion is easy!
foo xs = map (*5) (map (+3) xs)

RULES
"map/map" map f (map g xs) = map (f . g) xs

bar ys = filter even (filter (<42) ys)
RULES

RULES
"filter/filter" filter f (filter g xs)
= filter (λ x → f x && g x) xs

baz zs = map (+1) (filter even zs)
RULES

RULES
"map/filter" map f (filter g xs) = mapFilter f g xs

RULES
"map/filter" map f (filter g xs) = mapFilter f g xs
"map/mapFilter" map f (mapFilter g h xs)
= mapFilter (f . g) h xs
"mapFilter/filter" mapFilter f g (filter h xs)
= mapFilter f (λ x → g x && h x) xs
...

RULES
"map/filter" map f (filter g xs) = mapFilter f g xs
"map/mapFilter" map f (mapFilter g h xs)
= mapFilter (f . g) h xs
"mapFilter/filter" mapFilter f g (filter h xs)
= mapFilter f (λ x → g x && h x) xs
...
BAD IDEA
The challenge
use a constant number of rewrite rules

don’t require new rules for new combinators
make adding new combinators easy
fuse everything!
don’t require specialised compiler support
handle both sequential and parallel loops
Sequential loops
Streams
data Step s a = Yield a s

| Done
data Stream a = ∃s. Stream (s → Step s a) s
Streams
data Step s a = Yield a s

| Done
data Stream a = ∃s. Stream (s → Step s a) s
(s → Step s a) is the stepper, s is the state
stepper produces next element and state from current state

similar to an iterator
actually encodes an anamorphism (unfold)
Streams

| Done

sumS :: Num a ⇒ Stream a → a

sumS (Stream step s) = go 0 s
where go z s = case step s of
Yield x s’ → go (z+x) s’
Done → z
Streams

| Done

stream :: Array a → Stream a

stream arr = Stream step 0
where step i | i < length arr = Yield (arr ! i) (i+1)
| otherwise = Done
Streams

| Done

mapS :: (a → b) → Stream a → Stream b

mapS f (Stream step s) = Stream step’ s
where step’ s = case step s of
Yield x s’ → Yield (f x) s’
Done → Done
Streams

| Done

unstream :: Stream a → Array a

unstream (Stream step s) = <allocate, fill and freeze>
Stream fusion in three easy steps
Step 1: implement array operations in terms of streams
sum :: Num a ⇒ Array a → a

sum = sumS . stream
map :: (a → b) → Array a → Array b

map f = unstream . mapS f . stream

sum = sumS . stream

Step 2: inline them

sumsq :: Num a ⇒ Array a → a
sumsq = sum . map (λx -> x*x)

sum = sumS . stream

Step 2: inline them

= sumS . stream . unstream . mapS f . stream

sum = sumS . stream

Step 2: inline them


sum = sumS . stream

Step 2: inline them

Step 3: eliminate stream/unstream pairs
"stream/unstream" stream (unstream s) = s

sum = sumS . stream

Step 2: inline them

= sumS . mapS f . stream

sum = sumS . stream
Step 2: inline them

= sumS . mapS f . stream
Let GHC do the rest

Optimising stream operations
sumsq xs = sumS (mapS square ( stream xs))

sumsq xs = sumS (mapS square ( stream xs))
inline
stream :: Array a → Stream a

stream arr = Stream step 0
where step i | i < length arr = Yield (arr ! i) (i+1)
| otherwise = Done
sumsq xs = sumS ( mapS square (Stream step1 0))

where
step1 i = case i < length xs of
True → Yield (xs ! i) (i+1)
False → Done
sumsq xs = sumS ( mapS square (Stream step1 0))

where
step1 i = case i < length xs of
True → Yield (xs ! i) (i+1)
False → Done
inline
mapS :: (a → b) → Stream a → Stream b
mapS f (Stream step s) = Stream step’ s
where step’ s = case step s of
Yield x s’ → Yield (f x) s’
Done → Done
sumsq xs = sumS (Stream step2 0)

where
False → Done
step2 i = case step1 i of

Yield x i’ → Yield (square x) i’
Done → Done
sumsq xs = sumS (Stream step2 0)

where
step1 i = case i < length xs of
False → Done

Done → Done
inline
sumS :: Num a ⇒ Stream a → a
sumS (Stream step s) = go 0 s
where go z s = case step s of
Yield x s’ → go (z+x) s’
Done → z
sumsq xs = go 0 0
where
False → Done

Done → Done
go z i = case step2 i of
Yield x i’ → go (z+x) i’
Done → z
sumsq xs = go 0 0
where
False → Done

Done → Done
Done → z
inline
sumsq xs = go 0 0
where
False → Done
go z i = case (case step1 i of

Done → Done) of
Done → z
sumsq xs = go 0 0
where
case of case
False → Done
go z i = case (case step1 i of

Done → Done) of
Done → z
sumsq xs = go 0 0
where
False → Done
Yield x i’ → go (z + square x) i’
Done → z
sumsq xs = go 0 0
where
False → Done
Done → z
inline
sumsq xs = go 0 0
where
go z i = case (case i < length xs of
False → Done) of
Done → z
case of case
sumsq xs = go 0 0
where
go z i = case (case i < length xs of
False → Done) of
Done → z
sumsq xs = go 0 0
where
go z i = case i < length xs of
True → go (z + square (xs ! i)) (i+1)
False → z
sumsq xs = go 0 0
where
go z i = case i < length xs of
True → go (z + square (xs ! i)) (i+1)
False → z
optimal loop
no Stream or Step values ever created
only general-purpose optimisations
will be optimised further (unboxing etc.)
requires a great compiler (thanks GHC team!)
Why does it work?
sumsq xs = go 0 0
where
False → Done

Done → Done
Done → z
Why does it work?
sumsq xs = go 0 0
where
True → Yield (xs ! i) (i+1) -- non-recursive
False → Done

Done → Done
Done → z
Why does it work?
sumsq xs = go 0 0
where
True → Yield (xs ! i) (i+1) -- non-recursive
False → Done

Yield x i’ → Yield (square x) i’ -- non-recursive
Done → Done
Done → z
Why does it work?
sumsq xs = go 0 0
where
True → Yield (xs ! i) (i+1) -- non-recursive
False → Done

Yield x i’ → Yield (square x) i’ -- non-recursive
Done → Done
Yield x i’ → go (z+x) i’ -- recursive
Done → z
A slight problem
filterS :: (a → Bool) → Stream a → Stream a

filterS f (Stream step s) = Stream step’ s
where
step’ s = case step s of
Yield x s’
| f x → Yield x s’
| otherwise → step’ s’
Done → Done
A slight problem

where
Yield x s’
| f x → Yield x s’
| otherwise → step’ s’ -- recursive
Done → Done
Extending streams
Idea: allow a loop iteration not to produce an element

Extending streams

| Skip s
| Done
Extending streams

| Skip s
| Done

where
Yield x s’
| otherwise → Skip s’
Skip s’ → Skip s’
Done → Done
Extending streams

| Skip s
| Done

where
Yield x s’
| otherwise → Skip s’ -- non-recursive
Skip s’ → Skip s’
Done → Done
Stream fusion - summary
encode loops by streams

implement array operations in terms of streams
eliminate stream/unstream pairs (temporaries)
stream producers are non-recursive
standard optimisations remove overhead (loop fusion)
Stream fusion - summary
encode loops by streams

implement array operations in terms of streams
eliminate stream/unstream pairs (temporaries)
stream producers are non-recursive
standard optimisations remove overhead (loop fusion)
Standard optimisations: inlining, case-of-case, worker/wrapper

transformation, SpecConstr, LiberateCase, specialisation ...
Parallel loops
DPH on multicores
Evaluation strategy after vectorisation

operations are data parallel and flat
executed by a gang of worker threads
essentially fork-join parallelism
DPH on multicores

mapP :: (a → b) → Array a → Array b

mapP f xs = <split xs across workers>
<map f over each chunk>
<collect local results>
DPH on multicores

f is sequential

DPH on multicores


sumP :: Num a ⇒ Array a → a

sumP xs = <split xs across workers>
<sum each chunk>
<reduce local sums>
DPH on multicores

sumsqP = sumP . mapP square

DPH on multicores

sumsqP xs = <split xs across workers>

<map square over each chunk>
<split results across workers>
<sum each chunk>
<reduce local sums>
DPH on multicores


<sum each chunk>
<reduce local sums>
DPH on multicores


<sum each chunk>
<reduce local sums>
Distributed types
Idea: let’s make the evaluation strategy explicit! (Keller 1999)

Distributed types

data Dist a a is distributed across threads
Dist (Array a) each thread has a local array (chunk)
Dist Double each thread has a local Double
Distributed types

splitD distribute an array across threads

joinD collect thread-local chunks
splitD :: Array a → Dist (Array a)

joinD :: Dist (Array a) → Array a
Distributed types

splitD distribute an array across threads

mapD execute a sequential operation in each thread

sumD compute sum of local values
splitD :: Array a → Dist (Array a)

joinD :: Dist (Array a) → Array a
mapD :: (a → b) → Dist a → Dist b
sumD :: Num a ⇒ Dist a → a
Programming with distributed types

mapP f = joinD -- collect

. mapD (map f) -- map f over chunks
. splitD -- split

. splitD -- split
sumP xs = <split xs across workers>

<sum each chunk>
<reduce local sums>

. splitD -- split
sumP = sumD -- reduce

. mapD sum -- sum each chunk
. splitD -- split
Fusing distributed types
sumsqP = sumP . mapP square

sumsqP = sumD -- reduce

. splitD -- split
. joinD -- collect
. mapD (map square) -- map square over chunks
. splitD -- split

. splitD -- split
. joinD -- collect
. splitD -- split
RULES
splitD (joinD xs) = xs

. splitD -- split
RULES

. splitD -- split
RULES
mapD f (mapD g xs) = mapD (f . g) xs

. mapD (sum . map square) -- work
. splitD -- split
RULES

. mapD (sum . map square) -- work
. splitD -- split
stream fusion
RULES
Distributed types on multicores
splitD distribute xs across threads

mapD execute a sequential operation in each thread
splitD/joinD eliminate communication

mapD/mapD eliminate synchronisation
Distributed types on clusters
data Dist a a is distributed across nodes
splitD scatter
joinD gather
mapD execute operation on each node
splitD/joinD eliminate communication

mapD/mapD eliminate synchronisation
Distributed types on GPUs
data Dist a a is in GPU memory
splitD CPU −→ GPU transfer

joinD GPU −→ CPU transfer
mapD execute kernel on the GPU
splitD/joinD eliminate memory transfers (communication)

mapD/mapD fuse kernels (synchronisation)
Distributed types – summary
encode parallel loops as split/work/join

eliminate unnecessary split/join pairs
fuse sequential work (stream fusion)
very general mechanism for fusing parallel computations
applicable to a wide range of architectures
again, no specialised compiler support
1
1 2 4 8
Obligatory benchmark
sumsq, Haskell sumsq, C dotp, Haskell
dotp, C smvm, Haskell smvm, C
Runtime @ greyarea
10000
1000
100
10
1
1 2 4 8 16 32 64
sumsq, Haskell sumsq, C dotp, Haskell

dotp, C smvm, Haskell smvm, C
Parting thoughts
it’s nice, it’s easy to use, it works

high-level functional programs compiled to highly efficient code
even parallel ones!
rewrite rules + great optimiser = win
DPH doesn’t require any special-purpose optimisations
try this in an imperative language...
Parting thoughts

even parallel ones!
don’t try this in an imperative language...
Parting thoughts

even parallel ones!
don’t try this in an imperative language...
Stream fusion: dph, bytestring, vector, uvector

Distributed types: dph

Loop Fusion in Haskell

Загружено:

Сведения о документе

Авторское право

Доступные форматы

Поделиться этим документом

Поделиться или встроить документ

Параметры публикации

Этот документ был вам полезен?

Это неприемлемый материал?

Авторское право:

Доступные форматы

Loop Fusion in Haskell

Загружено:

Авторское право:

Доступные форматы

Loop fusion in Haskell

Programming Languages and Systems

return . foldl’ hash 5381

foo xs = map (*5) (map (+3) xs)

foo xs = map (*5) (map (+3) xs)

foo xs = map (*5) (map (+3) xs)

foo xs = map (*5) (map (+3) xs)

foo xs = map (*5) (map (+3) xs)

foo xs = map (*5) (map (+3) xs)

foo xs = map (*5) (map (+3) xs)

foo xs = map (*5) (map (+3) xs)

use a constant number of rewrite rules

data Step s a = Yield a s

data Step s a = Yield a s stepper

stepper produces next element and state from current state

data Step s a = Yield a s

stepper produces next element and state from current state

sumS :: Num a ⇒ Stream a → a

data Step s a = Yield a s

stepper produces next element and state from current state

stream :: Array a → Stream a

data Step s a = Yield a s

stepper produces next element and state from current state

mapS :: (a → b) → Stream a → Stream b

data Step s a = Yield a s

stepper produces next element and state from current state

unstream :: Stream a → Array a

sum :: Num a ⇒ Array a → a

map :: (a → b) → Array a → Array b

sum :: Num a ⇒ Array a → a

map :: (a → b) → Array a → Array b

Step 2: inline them

sum :: Num a ⇒ Array a → a

map :: (a → b) → Array a → Array b

Step 2: inline them

sum :: Num a ⇒ Array a → a

map :: (a → b) → Array a → Array b

Step 2: inline them

sum :: Num a ⇒ Array a → a

map :: (a → b) → Array a → Array b

Step 2: inline them

sum :: Num a ⇒ Array a → a

map :: (a → b) → Array a → Array b

Step 2: inline them

sum :: Num a ⇒ Array a → a

= sumS . mapS f . stream

sumsq xs = sumS (mapS square ( stream xs))

sumsq xs = sumS (mapS square ( stream xs))

stream :: Array a → Stream a

sumsq xs = sumS ( mapS square (Stream step1 0))

sumsq xs = sumS ( mapS square (Stream step1 0))

sumsq xs = sumS (Stream step2 0)

step2 i = case step1 i of

sumsq xs = sumS (Stream step2 0)

step2 i = case step1 i of

step2 i = case step1 i of

step2 i = case step1 i of

go z i = case (case step1 i of

go z i = case (case step1 i of

step2 i = case step1 i of

step2 i = case step1 i of

step2 i = case step1 i of

step2 i = case step1 i of

filterS :: (a → Bool) → Stream a → Stream a