\documentclass[twocolumn,9pt]{sigplanconf}
\usepackage{amsmath}
\usepackage{color}
\usepackage{graphicx}
\usepackage{listings}
\usepackage{natbib}
\usepackage{hyperref}


\lstloadlanguages{Haskell}
\lstnewenvironment{code}
    {\lstset{}%
      \csname lst@SetFirstLabel\endcsname}
    {\csname lst@SaveFirstLabel\endcsname}
    \lstset{
      basicstyle=\small\ttfamily,
      morecomment=[l][\rmfamily\slshape]--,
      fontadjust=true,
      flexiblecolumns=false,
      basewidth={0.5em,0.45em},
      literate={+}{{$+$}}1 {/}{{$/$}}1 {*}{{$*$}}1 {=}{{$=$}}1
               {>}{{$>$}}1 {<}{{$<$}}1 {\\}{{$\lambda$}}1
               {\\\\}{{\char`\\\char`\\}}1
               {->}{{$\rightarrow$}}2 {>=}{{$\geq$}}2 {<-}{{$\leftarrow$}}2
               {<=}{{$\leq$}}2 {=>}{{$\Rightarrow$}}2 
               {\ .}{{$\circ$}}2 {\ .\ }{{$\circ$}}2
               {>>}{{>>}}2 {>>=}{{>>=}}2
               {|}{{$\mid$}}1               
    }


\begin{document}

\conferenceinfo{Haskell Symposium 2010}{September 30 2010, Baltimore.} 
\copyrightyear{2010} 
\copyrightdata{[to be supplied]} 

\title{Scalable~I/O Event Handling for GHC}

\authorinfo{Bryan O'Sullivan}
           {Serpentine}
           {bos@serpentine.com}
\authorinfo{Johan Tibell}
           {Google}
           {johan.tibell@gmail.com}

\maketitle

\begin{abstract}
  We have developed a new, portable~I/O event manager for the Glasgow
  Haskell Compiler (GHC) that scales to the needs of modern server
  applications.  Our new code is transparently available to existing
  Haskell applications.  Performance at lower concurrency levels is
  comparable with the existing implementation.  In tests, we can
  support hundreds of thousands of concurrent network connections,
  with hundreds of thousands of active timeouts, from a single
  multithreaded program, levels far beyond those achievable with the
  current~I/O manager.  In addition, we provide a public API to
  developers who need to create event-driven network applications.
\end{abstract}

\category{D.3.2}{Programming Languages}{Language Classifications---Applicative (functional) languages}
\category{D.3.2}{Programming Languages}{Language Classifi\-cations---Concurrent, distributed and parallel languages}
\category{D.3.3}{Programming Languages}{Language Constructs and Features---Concurrent programming structures}
\category{D.3.4}{Programming Languages}{Processors---Runtime-environments}

\terms
Languages, Concurrency, Performance, Networking


\section{Introduction}

The concurrent computing model used by most Haskell programs has been
largely stable for almost 15 years~\cite{spj96}.  Despite the
language's many innovations in other areas, networked software is
written in Haskell using a programming model that will be familiar to
most programmers: a thread of control synchronously sends and receives
data over a network connection.  By \emph{synchronous}, we mean that
when a thread attempts to send data over a network connection, its
continued execution will be blocked if the data cannot immediately be
either sent or buffered by the underlying operating system.

The Glasgow Haskell Compiler (GHC) provides an environment with a
number of attractive features for the development of networked
applications.  It provides composable synchronization primitives that
are easy to use~\cite{harris05a}; lightweight threads; and multicore
support~\cite{harris05b}.  However, the increasing demands of
large-scale networked software have outstripped the capabilities of
crucial components of GHC's runtime system.

We have rewritten~GHC's event and timeout handling subsystems to be
dramatically more efficient.  With our changes, a modestly configured
server can easily cope with networking workloads that are several
orders of magnitude more demanding than before.  Our new code is
designed to accommodate both the thread-based programming model of
Concurrent Haskell (with no changes to existing application code) and
the needs of event-driven applications.


\section{Background}

\subsection{The GHC concurrent runtime}

GHC provides a multicore runtime system that uses a small number of
operating system (OS) threads to manage the execution of a potentially
much larger number of lightweight Haskell threads~\cite{marlow04}.
The number of operating system threads to use may be chosen at program
startup time, with typical values ranging up to the number of~CPU
cores available\footnote{GHC also supports an ``unthreaded'' runtime,
  which does not support multiple~CPU cores.  We are concerned only
  with the threaded runtime.}.

From the programmer's perspective, programming in Concurrent Haskell
is appealing due to the simplicity of the synchronous model. The fact
that Haskell threads are lightweight, and do not have a one-to-one
mapping to OS threads, complicates the implementation of the runtime
system.  When a Haskell thread must block, this cannot lead to
an~OS-level thread also being blocked, so the runtime system uses a
single~OS-level~I/O event manager thread (which \emph{is} allowed to
block) to provide an event notification mechanism.

The standard Haskell file and network~I/O libraries are written to
cooperate with the~I/O event manager thread.  When one of these
libraries acquires a resource such as a file or a network socket, it
immediately tells the~OS to access the resource in a non-blocking
fashion.  When a client attempts to access (e.g.~read or write, send
or receive) such a resource, the library performs the following
actions:

\begin{enumerate}
\item Attempt to perform the operation.  If it succeeds, resume
  immediately.
\item If the operation would need to block, the~OS will instead cause
  it to fail and indicate (via \texttt{EAGAIN} or \texttt{EWOULDBLOCK}
  in~Unix parlance) that it must be retried later.
\item The thread registers with the~I/O event manager to be awoken
  when the operation can be completed without blocking.  The sleeping
  and waking are performed using the lightweight~\lstinline{MVar}
  synchronization mechanism of Concurrent Haskell.
\item Once the~I/O event manager wakes the thread, return to step~1.
  (The operation may fail repeatedly with a would-block error,
  e.g.~due to a lost race against another thread for resources, or
  an~OS buffer filling up.)
\end{enumerate}

As this sketch indicates,~GHC provides a synchronous programming model
using a lower-level event-oriented mechanism.  It does so via a
semi-public~API that clients (e.g.~the file and networking libraries)
can use to provide blocking semantics.
\begin{code}
-- Block the current thread until data is available
-- on the given file descriptor.
threadWaitRead, threadWaitWrite :: Fd -> IO ()
\end{code}
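
Putting these steps together, a library-side read loop might look like
the following sketch, in which \lstinline{nonBlockingRead} is a
hypothetical wrapper around a non-blocking \texttt{read} that returns
\lstinline{Nothing} when the~OS reports \texttt{EAGAIN} or
\texttt{EWOULDBLOCK}.
\begin{code}
-- Sketch only: retry a non-blocking read until it
-- succeeds.  nonBlockingRead is a hypothetical wrapper
-- that returns Nothing on EAGAIN/EWOULDBLOCK.
readLoop :: Fd -> Int -> IO ByteString
readLoop fd nbytes = do
  r <- nonBlockingRead fd nbytes
  case r of
    Just bytes -> return bytes   -- step 1: succeeded
    Nothing    -> do             -- step 2: would block
      threadWaitRead fd          -- step 3: sleep until ready
      readLoop fd nbytes         -- step 4: try again
\end{code}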


\subsection{Timeout management and robust networking}

Well designed network applications make careful use of timeouts to
provide robustness in the face of a number of challenges.  At internet
scale, broken and malicious clients are widespread.  As an example, a
defensively written application will, if a newly connected client
doesn't send any data within a typically brief time window,
unilaterally close the connection and clean up its resources.  

To support this style of programming, the \lstinline{System.Timeout}
module provides a \lstinline{timeout} function:
\begin{code}
timeout :: Int -> IO a -> IO (Maybe a)
\end{code}
It initiates an \lstinline{IO} action and, if the action completes
within the specified time limit (expressed in microseconds), returns
\lstinline{Just} its result; otherwise, it aborts the action and
returns \lstinline{Nothing}.
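
For instance, a defensively written server might wrap the initial read
from a newly accepted client in a brief timeout.  The sketch below
illustrates the idea; \lstinline{recvRequest} and
\lstinline{serveRequest} are hypothetical application actions, and the
time limit is given in microseconds.
\begin{code}
-- Sketch: drop clients that send nothing within 30
-- seconds.  recvRequest and serveRequest are hypothetical.
handleClient :: Socket -> IO ()
handleClient sock = do
  mreq <- timeout (30 * 1000000) (recvRequest sock)
  case mreq of
    Nothing  -> sClose sock   -- idle client; clean up
    Just req -> serveRequest sock req
\end{code}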

Concurrent Haskell also provides a \lstinline{threadDelay} function
that blocks the execution of a thread for a specified amount of time.

Behind the scenes, the~I/O event manager thread maintains a queue of
pending timeouts.  When a timeout fires, it wakes the appropriate
application thread.


\section{Related work}

Li and Zdancewic~\cite{li07} began the push for higher concurrency in
Haskell server applications with an application-level library that
provides both event-~and thread-based interfaces.  We followed their
lead in supporting both event-based and thread-based concurrency, but
unlike their work, ours transparently benefits existing Haskell
applications.

In the context of the Java Virtual Machine, Haller and~Odersky unify
event-~and thread-based concurrency via a~Scala implementation of the
actor concurrency model~\cite{haller07}.  Much of their work is
concerned with safely implementing lightweight threads via
continuations on top of Java's platform-level threads, resulting in an
environment similar to the two-level threading of GHC's runtime, with
comparable concurrency management facilities.

For several years, C programmers concerned with client concurrency
have enjoyed the \texttt{libev} and \texttt{libevent} libraries.
These enable an event-~and callback-driven style of development that
can achieve high levels of both performance and concurrency.  Similar
frameworks are available in other languages, e.g.~Twisted for Python
and Node.js for Javascript.


\section{Shortcomings of the traditional I/O manager}

Although the~I/O manager in versions of~GHC up to~6.12 is portable,
stable, and performs well for low-concurrency applications, its
imperfections make it inapplicable to the scale of operations required
by modern networked applications.

The~I/O manager uses the venerable \texttt{select} system call for two
purposes.  It informs the~OS of the resources it wishes to track for
events and of the time until the next pending timeout should be
triggered, and it blocks until either an event occurs or the timeout
fires.

The \texttt{select} system call has well-known problems.  Most obvious
is the distressingly small fixed limit on the number of resources it
can handle even under modern operating systems, e.g.~1,024 on Linux.
In addition, the programming style enforced by \texttt{select} can be
inefficient.  The sizes of its programmer-visible data structures are
linear in the number of resources to watch.  They must be filled out,
copied twice across the user/kernel address space boundary, and
checked afresh for \emph{every} system call performed.  Since the most
common case for server-side applications on the public Internet is for
connections from clients to mostly be idle, the amount of useful work
performed per invocation of \texttt{select} dwindles as the number of
open connections increases.  This repetitious book-keeping rapidly
becomes a noticeable source of overhead.

The~I/O manager incurs further inefficiency by using ordinary Haskell
lists to manage both events and timeouts.  It has to walk the list of
timeouts once per iteration of its main loop, to figure out whether
any threads must be woken and when the next timeout expires.  It must
walk the list of events twice per iteration: once to fill out the
data structures to pass to \texttt{select}, and again after
\texttt{select} has returned to see which threads to wake.

Since \texttt{select} imposes such a small limit on the number of
resources it can manage, we cannot easily illustrate the cost of using
lists to manage events, but in section~\ref{sec:perf:timeout}, we will
demonstrate the clear importance of using a more efficient data
structure for managing timeouts.


\section{Our approach}

When we set out to improve the performance of~GHC's~I/O manager, our
primary goal was to increase the number of files, network connections,
and timeouts GHC could manage by several orders of magnitude.  We
wanted to achieve this in the framework of the existing Concurrent
Haskell model, retaining complete source-level compatibility with
existing Haskell code, and in a manner that could be integrated into
the main GHC distribution with minimal effort.

Secondarily, we wanted to sidestep the long dispute over whether
events or threads make a better programming model for high-concurrency
servers~\cite{behrens03}.  Since we needed to implement an
event-driven~I/O event manager in order to provide synchronous
semantics to application programmers, we might as well design the
event~API cleanly and expose it publicly to those programmers who wish
to use events\footnote{In our experience, even in a language with
  first-class closures and continuations, writing applications of
  anything beyond modest size in an event-driven style is painful.}.

We desired to implement as much as possible of the new I/O event
manager in Haskell, rather than delegating to a lower-level language.
This desire was partly borne out of pragmatism: we initially thought
that it might be more efficient to base our work atop a portable event
handling library such as \texttt{libev} or \texttt{libevent2}.  These
libraries are intended to be used in a callback-driven style, where
the library executes application code to handle an event.
Although~Haskell's foreign function interface supports calls from~C
code back into~Haskell, experimentation convinced us that the overhead
involved was higher than we were happy with.  With performance and
aesthetic considerations pushing us in the same direction, we were
happy to forge ahead in Haskell.

Architecturally, our new~I/O event manager consists of two components.
Our event notification library provides a clean and portable~API, and
abstracts the system-level mechanisms used to provide efficient event
notifications (\texttt{kqueue}, \texttt{epoll}, and \texttt{poll}).
We have also written a shim that implements the semi-public
\lstinline{threadWaitRead} and \lstinline{threadWaitWrite} interfaces.
This means that neither the core file and networking libraries, nor
other low-level~I/O libraries, require any changes to work with our
new code, and they transparently benefit from its performance
improvements.


\section{Interface to the~I/O event manager}

Our I/O~event manager is divided into a portable front end and a
platform-specific back end.  The interface to the back end is simple,
and is only visible to the front end; it is abstract in the public
interface.
\begin{code}
data Backend = forall a. Backend {
    -- State specific to this platform.
    _beState :: !a,

    -- Poll the back end for new events.  The callback
    -- provided is invoked once per file descriptor with
    -- new events.
    _bePoll :: a
            -> Timeout                 -- in milliseconds
            -> (Fd -> Events -> IO ()) -- I/O callback
            -> IO (),

    -- Register, modify, or unregister interest in the 
    -- given events on the specified file descriptor.
    _beModifyFd :: a
                -> Fd       -- file descriptor
                -> Events   -- old events to watch for
                -> Events   -- new events to watch for
                -> IO (),

    -- Clean up platform-specific state upon destruction.
    _beDestroy :: a -> IO ()
    }
\end{code}

A particular back end will provide a \lstinline{new} action that fills
out a \lstinline{Backend} structure.  For instance, the Mac~OS~X back
end starts out as follows:

\begin{code}
module System.Event.KQueue (new) where
new :: IO Backend
\end{code}

On a Unix-influenced platform, typically more than one back end will
be available.  For instance, on~Linux, \texttt{epoll} is the most
efficient back end, but \texttt{select} and \texttt{poll} are
available.  On~Mac~OS~X, \texttt{kqueue} is usually preferred, but
again \texttt{select} and \texttt{poll} are also available.

Our public~API thus provides a default back end, but allows a specific
back end to be used (e.g.~for testing).

\begin{code}
-- Construct the fastest back end for this platform.
newDefaultBackend :: IO Backend

newWith :: Backend -> IO EventManager

new :: IO EventManager
new = newWith =<< newDefaultBackend
\end{code}

For low-level event-driven applications, a typical event loop involves
running a single step through the~I/O event manager to check for new
events, handling them, doing some other work, and repeating.  Our
interface to the~I/O event manager supports this approach.

\begin{code}
init :: EventManager -> IO ()

-- Returns an indication of whether the I/O event manager
-- should continue, and a modified timeout queue.
step :: EventManager 
     -> TimeoutQueue  -- current pending timeouts
     -> IO (Bool, TimeoutQueue)
\end{code}
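
The sketch below shows the shape of such a loop;
\lstinline{emptyTimeouts} and \lstinline{doOtherWork} are hypothetical
stand-ins for application-specific details.
\begin{code}
-- Sketch of an event-driven main loop.
eventLoop :: EventManager -> IO ()
eventLoop mgr = do
  init mgr
  let go tq = do
        (keepGoing, tq') <- step mgr tq
        doOtherWork
        if keepGoing then go tq' else return ()
  go emptyTimeouts
\end{code}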

To register for notification of events on a file descriptor, clients
use the \lstinline{registerFd} function.
\begin{code}
-- Cookie describing an event registration.
data FdKey

-- A set of events to wait for.
newtype Events
instance Monoid Events
evtRead, evtWrite :: Events

-- A synchronous callback into the application.
type IOCallback = FdKey -> Events -> IO ()

registerFd :: EventManager 
           -> IOCallback -- callback to invoke
           -> Fd         -- file descriptor of interest
           -> Events     -- events to listen for
           -> IO FdKey
\end{code}
Because \lstinline{registerFd} may be called from threads other than
the one in which the~I/O event manager is running, it sends a wakeup
message to the~I/O event manager thread when invoked.  We provide a
variant, \lstinline{registerFd_},
which does not wake the~I/O event manager.  Clients needing a little
more efficiency can queue a series of requests to the manager without
waking it, then wake it once afterwards via a \lstinline{wakeManager}
call.

A client remains registered for notifications until it explicitly
drops its registration, and is thus called back on every step into
the~I/O event manager as long as an event remains pending.  We have
found this level-triggered approach to event notification to be much
easier for client applications to use than an edge-triggered approach,
which forces more complicated book-keeping into application code.
\begin{code}
unregisterFd :: EventManager -> FdKey -> IO ()
\end{code}
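
As a deliberately simple illustration, a client might watch a file
descriptor for readability and drop its registration from within the
callback once the event has been handled; \lstinline{onReadable} is a
hypothetical handler.
\begin{code}
-- Sketch: watch fd for readability, then unregister from
-- within the callback.  onReadable is hypothetical.
watchOnce :: EventManager -> Fd -> IO FdKey
watchOnce mgr fd = registerFd mgr callback fd evtRead
  where
    callback key _events = do
      onReadable fd
      unregisterFd mgr key
\end{code}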


\section{Implementation}

By and large, the story of our efforts revolves around appropriate
choices of data structure, with a few extra dashes of
context-sensitive and profile-driven optimization thrown in.

\subsection{Economical~I/O event management}

GHC's original I/O manager has to walk the entire list of blocked
clients once per loop before calling \texttt{select}, and mutate the
list afterwards to wake and filter out any clients that have pending
events.  A step through the~I/O manager's loop thus involves $O(n)$
traversal and mutation, where~$n$ is the number of clients.

Our new~I/O event manager registers file descriptors persistently with
the operating system, using \texttt{epoll} on Linux and
\texttt{kqueue} on Mac~OS~X, so the~I/O event manager no longer needs
to walk through all clients on each step through its loop.  Instead,
we maintain a finite map from file descriptor to client, which we can
look up for each triggered event.  This map is based on~Leijen's
implementation of~Okasaki and~Gill's purely functional~Patricia
tree~\cite{okasaki98}.  The new~I/O event manager's loop thus involves
$O(m \log n)$ traversal, and negligible mutation, where $m$ is the
number of clients with events pending.  This works well in the typical
case where $m \ll n$.
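
The dispatch path is thus a map lookup per triggered event.  The
sketch below is illustrative rather than GHC's actual representation;
it uses \lstinline{Data.IntMap} (imported qualified as
\lstinline{IntMap}) and a simplified callback type.
\begin{code}
-- Sketch of dispatch: the table maps a file descriptor to
-- the callback registered for it.
type Callbacks = IntMap.IntMap (Events -> IO ())

dispatch :: Callbacks -> Fd -> Events -> IO ()
dispatch table fd evts =
  case IntMap.lookup (fromIntegral fd) table of
    Nothing -> return ()   -- registration already removed
    Just cb -> cb evts
\end{code}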

\subsection{Cheap timeouts}

In the original~I/O manager, GHC maintains pending timeouts in an
ordered list, which it partly walks and mutates on every iteration.
Inserting a new timeout thus has $O(n)$ cost per operation, as does
each step through the~I/O manager's loop.

The~I/O event manager needs to perform two operations efficiently
during every step: remove all timeouts that have expired, and find the
next timeout to wait for.  A priority queue would suffice to fill this
need with good performance, but we also need to allow clients to add
and remove timeouts.  Since we need both efficient update by key and
efficient access to the minimum value, we use a priority search queue
instead.

Our purely functional priority search queue implementation is based on
that of Hinze~\cite{hinze01}, so insertion and deletion have $O(\log
n)$ cost, while the timeout handling in a step through the~I/O event
manager's loop has $O(m \log n)$ cost, where $m$ is the number of
expired timeouts.  Since typically $m \ll n$, we
win on performance.
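
To give a flavour of the per-step work, the sketch below assumes a
hypothetical \lstinline{atMost} operation that splits the expired
entries off from the remaining queue; the values stored in the queue
are here simply \lstinline{IO ()} wakeup actions.  It is not the exact
interface we use.
\begin{code}
-- Sketch: run every wakeup action whose expiration time
-- is at or before now, and return the remaining queue.
stepTimeouts :: Double -> TimeoutQueue -> IO TimeoutQueue
stepTimeouts now q = do
  let (expired, q') = atMost now q
  sequence_ expired        -- wake each waiting thread
  return q'
\end{code}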


\section{War stories, lessons learned, and scars earned}

Writing fast networking code is tricky business.  We have variously
encountered:
\begin{itemize}
\item Tunable kernel variables (15 at the last count) that regulate
  obscure aspects of the networking stack in ways that are important
  at scale;
\item Abstruse kernel infelicities (e.g.~Mac~OS~X lacking the
  \lstinline{NOTE_EOF} argument to \texttt{kqueue}, even though it has
  been present in other~BSD variants since 2003);
\item Performance bottlenecks in~GHC that required expert diagnosis
  (section~\ref{sec:blackhole});
\item An inability to stress the software enough, due to lack of
  10-gigabit Ethernet hardware (gigabit Ethernet is easily saturated,
  even with obsolete hardware).
\end{itemize}

In spite of these difficulties, we are satisfied with the level of
performance we have achieved to date.  To give a more nuanced flavour
of the sorts of problems we encountered, we have chosen to share a few
in more detail.


\subsection{Efficiently waking the~I/O event manager}

In a concurrent application with many threads, the~I/O event manager
thread spends much of its time blocked, waiting for the operating
system to notify it of pending events.  A thread that needs to block
until it can perform~I/O has no way to tell how long the~I/O event
manager thread may sleep for, so it must wake the~I/O event manager in
order to ensure that its~I/O request can be queued promptly.

The original implementation of event manager wakeup in~GHC uses a Unix
pipe, which clients use to transmit one of several kinds of
single-byte control message to the~I/O event manager thread.  The
delivery of a control message has the side effect of waking the~I/O
event manager if it is blocked.  Because several kinds of control
message may be sent, the original event manager reads and inspects a
single byte from the pipe at a time.  If several clients attempt to
wake the event manager thread before it can service any of their
requests, it acts as if it has been woken several times in succession,
thus performing unneeded work.

More damagingly, this design is vulnerable to the control pipe filling
up, since a Unix pipe has a fixed-size buffer.  If control messages
are lost due to a pipe overflow, an application may deadlock.  Indeed,
we wrote a microbenchmark that inadvertently provided a demonstration
of how easy it is to provoke this condition under heavy load.

As a result, we invested some effort in ameliorating the problem.  We
use substantially the same mechanism for waking our event manager
thread, but we have considerably improved both its performance and its
reliability.  Our principal observation was that under both the
original and our new design, the most common control message by far is
a simple ``wake up.''  We have accordingly special-cased the handling
of this message.

On Linux, when possible, we use the kernel's \texttt{eventfd} facility
to provide fast wakeups.  No matter how many clients send wakeup
requests in between checks by the~I/O event manager, it will receive
only one notification.

While other operating systems do not provide a comparably fast
facility, we still have a trick up our sleeves.  We dedicate a pipe to
delivering \emph{only} wakeup messages.  To issue a wakeup request, a
client writes a single byte to this pipe.  When the~I/O event
manager is notified that data is available on this pipe, it issues a
single \texttt{read} system call to gather all currently buffered
wakeups.  It does not need to inspect any of the data it has read,
since every byte must be a wakeup, and the fixed size of the pipe
buffer guarantees that it will not be subject to unnecessary wakeups,
regardless of the number of clients requesting them.  This means that we no
longer need to worry about wakeup messages that cannot be written for
want of buffer space, so the thread doing the waking can safely use a
non-blocking write.
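
The sketch below captures the essence of the pipe-based path, using
\lstinline{fdRead} and \lstinline{fdWrite} from
\lstinline{System.Posix.IO}; it elides error handling and the
\texttt{eventfd} case.
\begin{code}
-- Sketch: wakeReadFd and wakeWriteFd are the two ends of
-- a dedicated, non-blocking pipe.
wake :: Fd -> IO ()
wake wakeWriteFd = do
  _ <- fdWrite wakeWriteFd "x"   -- any byte will do
  return ()

drainWakeups :: Fd -> IO ()
drainWakeups wakeReadFd = do
  _ <- fdRead wakeReadFd 4096    -- one read gathers them all
  return ()
\end{code}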

\subsection{The great black hole pileup}
\label{sec:blackhole}

Our use of an \lstinline{IORef} to manage the timeout queue yielded a
problem that was especially difficult to diagnose, with a huge and
unpredictable performance impact.

In our \lstinline{threadDelay} benchmark, thousands of threads compete
to update the single timeout management \lstinline{IORef}
atomically. If one of these threads was pre-empted while evaluating
the thunk left in the \lstinline{IORef} by
\lstinline{atomicModifyIORef}, then the thunk would become a ``black
hole,'' i.e.~a closure that is being evaluated.  From that point on,
all the other threads would become blocked on black holes: as one
thread called \lstinline{atomicModifyIORef} and found a black hole
inside, it would deposit a new black hole inside that depended on its
predecessor.  Because a black hole is a special kind of thunk that is
invisible to applications, we could not play any of the
usual~\lstinline{seq} tricks to jolly evaluation along.

At the time we encountered this problem, the black hole queue was
implemented as a linear list, which was scanned during every~GC.  Most
of the time, this choice of data structure was not a problem, but it
became painful with thousands of threads on the list.

After several weeks of effort, Simon Marlow performed a wholesale
replacement of~GHC's black hole mechanism.  Instead of a single global
black hole queue,~GHC now queues a blocked thread against the closure
upon which it is blocking.  His work fixed our problem.

\subsection{Bunfight at the GC corral}

When a client application registers a new timeout, we must update the
data structure that we use to manage timeouts.  Originally, we stored
the priority search queue inside an \lstinline{IORef}, and each client
manipulated the queue using \lstinline{atomicModifyIORef}.  Alas, this
led to a bad interaction with~GHC's generational garbage collector.

Since our client-side use of \lstinline{atomicModifyIORef} did not
force the evaluation of the data inside the \lstinline{IORef}, the
\lstinline{IORef} would accumulate a chain of thunks.  If the~I/O
event manager thread did not evaluate those thunks promptly enough,
they would be promoted to the old generation and become roots for all
subsequent minor garbage collections~(GCs).

When the thunks eventually got evaluated, they would each create a new
intermediate queue that immediately became garbage.  Since the thunks
served as roots until the next major~GC, these intermediate queues
would get copied unnecessarily in the next minor~GC, increasing~GC
time.  We had created a classic instance of the generational
``floating garbage'' problem.

The effect on performance of the floating garbage problem was
substantial.  We wrote a microbenchmark to gauge the performance of
\lstinline{threadDelay}.  For example, with~20,000 threads sleeping,
we saw variations in performance of up to~34\%, depending on how we
tuned the~GC and whether we simply got lucky.

We addressed this issue by having clients store a list of \emph{edits}
to the queue, instead of manipulating it directly.
\begin{code}
type TimeoutEdit = TimeoutQueue -> TimeoutQueue
\end{code}
%$

While maintaining a list of edits doesn't eliminate the creation of
floating garbage, it reduces the amount of copying at each minor~GC
enough that these substantial slowdowns no longer occur.
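
The sketch below, with hypothetical names, shows the division of
labour: clients merely cons an edit onto a list held in an
\lstinline{IORef}, and only the~I/O event manager applies the
accumulated edits to the queue.
\begin{code}
-- Sketch: clients queue edits cheaply; the manager later
-- applies them, oldest first.
queueEdit :: IORef [TimeoutEdit] -> TimeoutEdit -> IO ()
queueEdit ref edit =
  atomicModifyIORef ref (\edits -> (edit : edits, ()))

applyEdits :: IORef [TimeoutEdit] -> TimeoutQueue
           -> IO TimeoutQueue
applyEdits ref q = do
  edits <- atomicModifyIORef ref (\es -> ([], es))
  return $! foldl (flip ($)) q (reverse edits)
\end{code}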


\section{Empirical results}

We gathered Linux results on commodity quad-core server-class hardware
with~4GB of~RAM, and~2.66GHz Intel\textregistered\
Xeon\textregistered~X3230 CPUs running~64-bit Debian~4.0.  We used
version~6.12.1 of~GHC for all measurements, running server
applications on three cores with~GHC's parallel garbage collector
disabled\footnote{The first release of the parallel~GC performed
  poorly on loosely coupled concurrent applications.  This problem has
  since been fixed.}.  When measuring network application performance,
we used an idle gigabit Ethernet network, connected through a switch
to an identically configured system.

\subsection{Performance of event notification}

\begin{figure}
  \centering
  \input{http-rps.tex}
  \input{http-lat.tex}
  \caption{Requests served per second (top) and latency per request
    (bottom) for two HTTP server benchmarks, with all clients busy,
    under old and new~I/O managers.}
  \label{fig:http-rps-lat}
\end{figure}

To evaluate the raw performance of event notification, we wrote
two~HTTP servers.  Each uses the usual Haskell networking libraries,
and we compiled each against both the original~I/O manager (labeled
``(old)'' in graphs) and our rewrite (labeled ``(new)'').  The first,
\texttt{pong}, simply responds immediately to any HTTP request with a
response of ``\texttt{Pong!}''.  The second, \texttt{file}, opens and
serves the contents of a file.  For our measurements, we chose a file
of 4,332 bytes in size.  We used the battle-tested ApacheBench tool to
measure performance under varying levels of client concurrency.

In figure~\ref{fig:http-rps-lat}, our setup has all client connections
active simultaneously; there are no idle connections.  The graphs
demonstrate that under these conditions of peak load, the
\texttt{epoll} back end performs with throughput and latency
comparable to the original~I/O manager.  Notably, the new~I/O event
manager handles far more concurrent connections than the~1,016 or so
that the original~I/O manager is capable of.  (We simply stopped
measuring the new~I/O event manager at~16,384 connections.)

\begin{figure}
  \centering
  \input{idle-rps.tex}
  \input{idle-lat.tex}
  \caption{Requests served per second (top) and latency per request
    (bottom), with~64 active connections and varying numbers of idle
    connections.}
  \label{fig:idle-rps-lat}
\end{figure}

To create a workload that corresponds more closely to the conditions
under which real applications must execute, we open a variable number
of idle connections to the server, then measure the performance of a
series of requests for which we always use~64 concurrently active
clients.  Figure~\ref{fig:idle-rps-lat} illustrates the effects on
throughput and latency of the \texttt{pong} microbenchmark when we
vary the number of idle clients.

For completeness, we measured the performance of both the
\texttt{epoll} and \texttt{poll} back ends.  The original and
\texttt{epoll} managers demonstrate comparable performance up to
the~1,024-connection limit imposed by \texttt{select}, but while the
performance of \texttt{poll} is erratic, the \texttt{epoll} back end
does not begin to fall off significantly until we have 50,000 idle
connections open\footnote{We tested the new event manager with as many
  as~300,000 idle client connections.  It works, though we might
  hesitate to refer to it as a paragon of speed at that level of
  concurrency.}.

In general, the small limit that \texttt{select} imposes on the number
of concurrently managed resources prevents us from seeing any
interesting changes in the behaviour of the original~I/O manager at
scale, because applications fall over long before any curves have an
opportunity to change shape.  We find this disappointing, as we were
looking forward to a fair fight.

\subsection{Performance of timeout management}
\label{sec:perf:timeout}

We developed a simple microbenchmark to measure the performance of the
\lstinline{threadDelay} function, and hence the efficiency of the
timeout management code.  We measured its execution time, with the
runtime system set to use two~OS-level threads.

As the upper graph of figure~\ref{fig:thread-delay} indicates, GHC's
traditional~I/O manager exhibits $O(n^2)$ behaviour when managing
numerous timeouts.  This would pose a severe performance problem with
many active timeouts.

In comparison, the lower graph of figure~\ref{fig:thread-delay} shows
that the new timeout management code has no problem coping with
millions of simultaneously active timeouts.  The performance of our
microbenchmark did not begin to degrade until we had three million
threads and timeouts active on a system with~4GB of~RAM.

Even for smaller numbers of threads, the new timeout management code
is far more efficient than the old, as
figure~\ref{fig:thread-delay-comp} shows.

\begin{figure}
  \centering
  \input{old.tex}
  \input{new.tex}
  \caption{Performance of the \lstinline{threadDelay} benchmark, run
    under the existing~I/O event manager (top) and our rewritten
    manager (bottom).}
  \label{fig:thread-delay}
\end{figure}

\begin{figure}
  \centering
  \input{old-new.tex}
  \caption{Comparative performance of old and new I/O managers on the
    \lstinline{threadDelay} microbenchmark. Note the logarithmic scale
    on the~$y$-axis, needed to make the numbers for the new manager
    distinguishable from zero.}
  \label{fig:thread-delay-comp}
\end{figure}


\section{Future work}

We have integrated our event management code into~GHC, and it will be
available to all applications as of~GHC~6.14.  Our future efforts will
revolve around Windows support and further performance improvements.

\subsection{Windows support}

As we are primarily Unix developers, our work to date leaves~GHC's
event management on Windows unchanged.  We believe that our design can
accommodate the Windows model of scalable event notification via~I/O
completion ports.

\subsection{Lower overhead}

We were a little surprised that \texttt{epoll} is consistently
slightly slower than \texttt{select}.  We posit that part of the
reason for this might be that we currently issue two
\texttt{epoll\_ctl} system calls per event notification: one to queue
it with the kernel, and one to dequeue it.  In contrast, the
original~I/O manager has no system call overhead.  If we used
\texttt{epoll} in edge-triggered mode, we could eliminate one call
to \texttt{epoll\_ctl} to dequeue an event\footnote{As a side note,
  the~BSD \texttt{kqueue} mechanism is cleaner than \texttt{epoll} in
  this one respect, combining queueing, dequeueing, and checking for
  multiple events into a single system call.  However, the smaller
  number of trips across the user/kernel address space boundary does
  not appear to result in better performance, and the \texttt{kqueue}
  mechanism is otherwise more cluttered and difficult to use than
  \texttt{epoll}.}.

\subsection{Improved scaling to multiple cores}

In theory, an application should be able to improve both throughput
and latency by distributing its event management load across multiple
cores.  We already support running many instances of the low-level~I/O
event manager at once, with each instance managing a disjoint set of
files or network connections.

We hope to create a benchmark that stresses the~I/O event manager in
such a way that we can either find bottlenecks in, or demonstrate a
performance improvement via, multicore scaling.

\subsection{Better performance tools}

When we were diagnosing performance problems with the~I/O event
manager, we made heavy use of existing tools, such as the~Criterion
benchmarking library~\cite{osullivan09}, GHC's profiling tools, and
the~ThreadScope event tracing and visualisation tool~\cite{jones09}.

As useful as those tools are, when we made our brief foray into
multicore event dispatching, we lacked data that could help us to pin
down any performance bottleneck.  We speculate that if we could
integrate the new~Linux \texttt{perf} analysis tools with ThreadScope,
we might gain a broader systemic perspective on where performance
problems are occurring.


\appendix
\section{Additional materials}

The source code to the original, standalone version of our event
management library is available at
\url{http://github.com/tibbe/event}.

\acks

We owe especial gratitude to Simon Marlow for his numerous detailed
conversations about performance, and for his heroic fixes to~GHC borne
of the tricky problems we encountered.

We would also like to thank Brian Lewis and Gregory Collins for their
early contributions to the new event code base.


% We recommend abbrvnat bibliography style.

\bibliographystyle{abbrvnat}

% The bibliography should be embedded for final submission.

%\begin{thebibliography}{}
%\softraggedright

\bibliography{biblio}
%\bibitem[Smith et~al.(2009)Smith, Jones]{smith02}
%P. Q. Smith, and X. Y. Jones. ...reference text...

\end{document}

%%% Local Variables: 
%%% mode: latex
%%% TeX-PDF-mode: t
%%% TeX-master: t
%%% End: 