Commits

Hakan Ardo committed edbd398

imported

Comments (0)

Files changed (28)

sqrt/benchmark.sh

+#!/bin/sh
+
+$1 time_sqrt.py None float
+$1 time_sqrt.py None int
+$1 time_sqrt.py None Fix16
+
+$1 time_sqrt.py int float
+$1 time_sqrt.py float int
+$1 time_sqrt.py float Fix16

sqrt/fix16.png

Added
New image
+[p0, p1, p2, p3, i4, p5, i6, p7, p8, p9, p10, p11, p12]
+guard_value(i4, 0, descr=<ResumeGuardDescr object at 0x7f0bf846b9c8>)
+guard_nonnull(p11, descr=<ResumeGuardDescr object at 0x7f0bf846ba88>)
+guard_value(p2, ConstPtr(ptr14), descr=<ResumeGuardDescr object at 0x7f0bf846bb88>)
+guard_class(p11, ConstClass(W_IntObject), descr=<ResumeGuardDescr object at 0x7f0bf846bc90>)
+guard_class(p11, ConstClass(W_IntObject), descr=<ResumeGuardDescr object at 0x7f0bf846bd58>)
+i17 = getfield_gc_pure(p11, descr=<W_IntObject.inst_intval>)
+i19 = int_gt(i17, 0)
+guard_true(i19, descr=<ResumeGuardDescr object at 0x7f0bf846be90>)
+guard_nonnull(p11, descr=<ResumeGuardDescr object at 0x7f0bf846c008>)
+guard_class(p11, ConstClass(W_IntObject), descr=<ResumeGuardDescr object at 0x7f0bf846c148>)
+guard_class(p11, ConstClass(W_IntObject), descr=<ResumeGuardDescr object at 0x7f0bf846c210>)
+i22 = getfield_gc_pure(p11, descr=<W_IntObject.inst_intval>)
+i24 = int_sub_ovf(i22, 1)
+guard_no_overflow(, descr=<ResumeGuardDescr object at 0x7f0bf846c340>)
+p26 = new_with_vtable(ConstClass(W_IntObject))
+setfield_gc(p26, i24, descr=<W_IntObject.inst_intval>)
+guard_nonnull(p12, descr=<ResumeGuardDescr object at 0x7f0bf846c4e8>)
+guard_nonnull(p10, descr=<ResumeGuardDescr object at 0x7f0bf846c5e0>)
+guard_nonnull(p12, descr=<ResumeGuardDescr object at 0x7f0bf846c6d8>)
+guard_class(p10, 17688456, descr=<ResumeGuardDescr object at 0x7f0bf846c7d8>)
+guard_class(p12, 17688456, descr=<ResumeGuardDescr object at 0x7f0bf846c8a0>)
+guard_class(p10, 17688456, descr=<ResumeGuardDescr object at 0x7f0bf846c968>)
+guard_class(p12, 17688456, descr=<ResumeGuardDescr object at 0x7f0bf846ca30>)
+f31 = getfield_gc_pure(p10, descr=<W_FloatObject.inst_floatval>)
+f32 = getfield_gc_pure(p12, descr=<W_FloatObject.inst_floatval>)
+i34 = float_eq(f32, 0.000000)
+guard_false(i34, descr=<ResumeGuardDescr object at 0x7f0bf846cba0>)
+f35 = float_truediv(f31, f32)
+p37 = new_with_vtable(17688456)
+setfield_gc(p37, f35, descr=<W_FloatObject.inst_floatval>)
+guard_class(p12, 17688456, descr=<ResumeGuardDescr object at 0x7f0bf846cd58>)
+guard_class(p37, 17688456, descr=<ResumeGuardDescr object at 0x7f0bf846ce20>)
+guard_class(p12, 17688456, descr=<ResumeGuardDescr object at 0x7f0bf846cee8>)
+guard_class(p37, 17688456, descr=<ResumeGuardDescr object at 0x7f0bf846cfb0>)
+f42 = getfield_gc_pure(p12, descr=<W_FloatObject.inst_floatval>)
+f43 = getfield_gc_pure(p37, descr=<W_FloatObject.inst_floatval>)
+f44 = float_add(f42, f43)
+p46 = new_with_vtable(17688456)
+setfield_gc(p46, f44, descr=<W_FloatObject.inst_floatval>)
+guard_class(p46, 17688456, descr=<ResumeGuardDescr object at 0x7f0bf846d220>)
+guard_class(p46, 17688456, descr=<ResumeGuardDescr object at 0x7f0bf846d2e8>)
+p50 = new_with_vtable(ConstClass(FailedToImplement))
+guard_class(p50, ConstClass(FailedToImplement), descr=<ResumeGuardDescr object at 0x7f0bf846d3e0>)
+guard_class(p50, ConstClass(FailedToImplement), descr=<ResumeGuardDescr object at 0x7f0bf846d4a8>)
+guard_class(p46, 17688456, descr=<ResumeGuardDescr object at 0x7f0bf846d570>)
+guard_class(p46, 17688456, descr=<ResumeGuardDescr object at 0x7f0bf846d638>)
+guard_class(p46, 17688456, descr=<ResumeGuardDescr object at 0x7f0bf846d700>)
+guard_nonnull(p46, descr=<ResumeGuardDescr object at 0x7f0bf846d7c0>)
+guard_class(p46, 17688456, descr=<ResumeGuardDescr object at 0x7f0bf846d880>)
+guard_class(p46, 17688456, descr=<ResumeGuardDescr object at 0x7f0bf846d948>)
+guard_class(p46, 17688456, descr=<ResumeGuardDescr object at 0x7f0bf846da10>)
+f59 = getfield_gc_pure(p46, descr=<W_FloatObject.inst_floatval>)
+f61 = float_truediv(f59, 2.000000)
+p63 = new_with_vtable(17688456)
+setfield_gc(p63, f61, descr=<W_FloatObject.inst_floatval>)
+guard_nonnull(p63, descr=<ResumeGuardDescr object at 0x7f0bf846dbc0>)
+i65 = ptr_eq(p63, ConstPtr(ptr64))
+guard_false(i65, descr=<ResumeGuardDescr object at 0x7f0bf846dcb8>)
+guard_nonnull(p63, descr=<ResumeGuardDescr object at 0x7f0bf846dd70>)
+i67 = getfield_raw(32555168, descr=<pypysig_long_struct.c_value>)
+i69 = int_sub(i67, 2)
+setfield_raw(32555168, i69, descr=<pypysig_long_struct.c_value>)
+i71 = int_lt(i69, 0)
+guard_false(i71, descr=<ResumeGuardDescr object at 0x7f0bf846dfa8>)
+p72 = same_as(ConstPtr(ptr14))
+i74 = same_as(0)
+i76 = same_as(13)
+p78 = same_as(ConstPtr(ptr77))
+p80 = same_as(ConstPtr(ptr79))
+p82 = same_as(ConstPtr(ptr81))
+jump(p0, p1, p72, p3, i74, p5, i76, p78, p80, p82, p10, p26, p63, descr=<Loop-1>)

sqrt/sqrt.png

Added
New image
+def sqrt(y, n=10000):
+    x = y / 2
+    while n > 0:
+        #assert y > 0 and x > 0
+        #if y > 0 and x > 0: pass
+        n -= 1
+        x = (x + y/x) / 2
+    return x
+
+def sqrt2(y, n=10000):
+    a, b = 0, y
+    while n > 0:
+        n -= 1
+        c = (a + b) / 2
+        if c*c <= y:
+            a = c
+        if c*c >= y:            
+            b = c
+    return c
+    
+class Fix16(object):
+    def __init__(self, val, scale=True):
+        if isinstance(val, Fix16):
+            self.val = val.val
+        else:
+            if scale:
+                self.val = int(val * 2**16)
+            else:
+                self.val = val
+
+    def __add__(self, other):
+        return  Fix16(self.val + Fix16(other).val, False)
+
+    def __sub__(self, other):
+        return  Fix16(self.val - Fix16(other).val, False)
+
+    def __mul__(self, other):
+        return  Fix16((self.val >> 8) * (Fix16(other).val >> 8), False)
+
+    def __div__(self, other):
+        return  Fix16((self.val << 16) / Fix16(other).val, False)
+
+
+    def __float__(self):
+        return float(self.val) / float(2**16)
+
+    def __int__(self):
+        return self.val >> 16
+
+    def __cmp__(self, other):
+        return cmp(self.val, Fix16(other).val)
+
+    def __str__(self):
+        return str(float(self))
+
+    __radd__ = __add__
+    __rmul__ = __mul__
+    def __rsub__(self, other):
+        return  Fix16(Fix16(other).val - self.val, False)
+    def __rdiv__(self, other):
+        return  Fix16((Fix16(other).val << 16) / self.val, False)
+
+if __name__ == '__main__':
+    #print sqrt(1234.0)
+    print sqrt(1234)
+    #print sqrt(1234.0, 100000000)
+    #print sqrt(1234)
+    #print sqrt(Fix16(1234))
+
+    #print sqrt(1234.0, 100000000)
+    #  pypy      0m1.209s
+    #  gcc       0m1.157s
+    #  cpython   0m28.713s
+
+    #print sqrt(1234, 100000000)
+    #  pypy      0m6.278s
+    #  gcc       0m1.817s
+    #  cpython   0m22.246s
+
+    #print sqrt(Fix16(1234), 100000000)
+    #  pypy      0m7.333s
+    #  gcc       0m1.885s
+    #  cpython   12m57.077s
+
+
+

sqrt/sqrt.pyc

Binary file added.

sqrt/sqrt_double.c

+int main() {
+  double y = 1234.0;
+  double x = y / 2.0;
+  long n = 100000000;
+  while (n>0) {
+    n -= 1;
+    x = (x + y/x) / 2.0;
+  }
+  printf("%f\n", x);
+}
+
+// gcc sqrt_double.c                     0m1.428s
+// gcc -O3 sqrt_double.c                 0m1.151s
+// gcc -O3 -mtune=native -march=native   0m1.157s

sqrt/sqrt_fix16.c

+int main() {
+  long y = 1234 << 16;
+  long x = y / 2;
+  long n = 100000000;
+  while (n>0) {
+    n -= 1;
+    x = ((x + (y << 16)/x)) / 2;
+  }
+  printf("%f\n", ((double) x) / ((double) (1<<16)));
+}
+
+// gcc                   0m2.032s         
+// gcc -O3               0m1.885s
+// gcc -O3 -mcpu=native  0m1.889s
+int main() {
+  long y = 1234;
+  long x = y / 2;
+  long n = 100000000;
+  while (n>0) {
+    n -= 1;
+    x = (x + y/x) / 2;
+  }
+  printf("%d\n", x);
+}
+
+// gcc                   0m1.930s
+// gcc -O3               0m1.817s
+// gcc -O3 -mtune=native 0m1.828s

sqrt/test_sqrt.py

+import math
+from sqrt import sqrt, sqrt2, Fix16
+
+f = Fix16(10.45)
+#print int(f), float(f), float(f/2), float(f*f)
+
+for i in range(2,10):
+    print i, sqrt(i), '%4.2f' % sqrt(float(i)), \
+          '%4.2f' % float(sqrt(Fix16(i))), '%4.2f' % math.sqrt(i)
+    print i, sqrt2(i), '%4.2f' % sqrt2(float(i)), \
+          '%4.2f' % float(sqrt2(Fix16(i))), '%4.2f' % math.sqrt(i)
+
+<H1>Loop invariant code motion</H1> 
+
+Recently, the jit-unroll-loops branch was merged. It implements the
+idea described in 
+<a href="http://morepypy.blogspot.com/2010/09/using-escape-analysis-across-loop.html">Using Escape Analysis Across Loop Boundaries for Specialization</a>.
+That post does only talk about virtuals, but the idea turned out
+to be more far reaching. After the metainterpreter produces a trace,
+several optimizations are applied to the trace before it is turned
+into binary code. Removing allocations is only one of them. There are also
+for instance
+<ul>
+<li> Heap optimizations that removes memory accesses by reusing results
+  previously read from or written to the same location.
+<li> Reusing of the results of pure operations if the same pure
+  operation is executed twice.
+<li> Removal of redundant guards.
+<li> ...
+</ul>
+A lot of these optimizations are in one way or another removing
+operations form the trace and/or reusing previous results. All of these
+optimizations could benefit from being able to operate across loop
+boundaries. Not only in the sense that operations operating on loop
+invariants could be moved out of the loop entirely. But also that
+results produced at the end of an iteration could be reused at the
+beginning of the next even if there are no loop invariants involved.
+
+<p>
+
+This is achieved by unrolling the trace into two iterations, and
+letting the optimizer work on this two-iteration-trace.
+The optimizer will now be able to optimize the second iteration more than the
+first since it can reuse results from the first iteration. The
+optimized version of the first iteration we call the <em>preamble</em> and the
+optimized version of the second iteration we call the <em>loop</em>. The
+preamble will end with a jump to the loop, while the loop will end
+with a jump to itself. This means that the preamble will be executed
+once for the first iteration, the loop will be executed for all following
+iterations.
+ 
+<p>
+
+<h2>Sqrt example</h2>
+Here is an example of a Python implementation of sqrt using a fairly
+simple algorithm
+
+<p>
+<!-- pygmentize -f html -O full -o t.html t.py -->
+  <style type="text/css">
+td.linenos { background-color: #f0f0f0; padding-right: 10px; }
+span.lineno { background-color: #f0f0f0; padding: 0 5px 0 5px; }
+pre { line-height: 125%; }
+body .hll { background-color: #ffffcc }
+body  { background: #f8f8f8; }
+body .c { color: #408080; font-style: italic } /* Comment */
+body .err { border: 1px solid #FF0000 } /* Error */
+body .k { color: #008000; font-weight: bold } /* Keyword */
+body .o { color: #666666 } /* Operator */
+body .cm { color: #408080; font-style: italic } /* Comment.Multiline */
+body .cp { color: #BC7A00 } /* Comment.Preproc */
+body .c1 { color: #408080; font-style: italic } /* Comment.Single */
+body .cs { color: #408080; font-style: italic } /* Comment.Special */
+body .gd { color: #A00000 } /* Generic.Deleted */
+body .ge { font-style: italic } /* Generic.Emph */
+body .gr { color: #FF0000 } /* Generic.Error */
+body .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+body .gi { color: #00A000 } /* Generic.Inserted */
+body .go { color: #808080 } /* Generic.Output */
+body .gp { color: #000080; font-weight: bold } /* Generic.Prompt */
+body .gs { font-weight: bold } /* Generic.Strong */
+body .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+body .gt { color: #0040D0 } /* Generic.Traceback */
+body .kc { color: #008000; font-weight: bold } /* Keyword.Constant */
+body .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */
+body .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */
+body .kp { color: #008000 } /* Keyword.Pseudo */
+body .kr { color: #008000; font-weight: bold } /* Keyword.Reserved */
+body .kt { color: #B00040 } /* Keyword.Type */
+body .m { color: #666666 } /* Literal.Number */
+body .s { color: #BA2121 } /* Literal.String */
+body .na { color: #7D9029 } /* Name.Attribute */
+body .nb { color: #008000 } /* Name.Builtin */
+body .nc { color: #0000FF; font-weight: bold } /* Name.Class */
+body .no { color: #880000 } /* Name.Constant */
+body .nd { color: #AA22FF } /* Name.Decorator */
+body .ni { color: #999999; font-weight: bold } /* Name.Entity */
+body .ne { color: #D2413A; font-weight: bold } /* Name.Exception */
+body .nf { color: #0000FF } /* Name.Function */
+body .nl { color: #A0A000 } /* Name.Label */
+body .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */
+body .nt { color: #008000; font-weight: bold } /* Name.Tag */
+body .nv { color: #19177C } /* Name.Variable */
+body .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */
+body .w { color: #bbbbbb } /* Text.Whitespace */
+body .mf { color: #666666 } /* Literal.Number.Float */
+body .mh { color: #666666 } /* Literal.Number.Hex */
+body .mi { color: #666666 } /* Literal.Number.Integer */
+body .mo { color: #666666 } /* Literal.Number.Oct */
+body .sb { color: #BA2121 } /* Literal.String.Backtick */
+body .sc { color: #BA2121 } /* Literal.String.Char */
+body .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */
+body .s2 { color: #BA2121 } /* Literal.String.Double */
+body .se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */
+body .sh { color: #BA2121 } /* Literal.String.Heredoc */
+body .si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */
+body .sx { color: #008000 } /* Literal.String.Other */
+body .sr { color: #BB6688 } /* Literal.String.Regex */
+body .s1 { color: #BA2121 } /* Literal.String.Single */
+body .ss { color: #19177C } /* Literal.String.Symbol */
+body .bp { color: #008000 } /* Name.Builtin.Pseudo */
+body .vc { color: #19177C } /* Name.Variable.Class */
+body .vg { color: #19177C } /* Name.Variable.Global */
+body .vi { color: #19177C } /* Name.Variable.Instance */
+body .il { color: #666666 } /* Literal.Number.Integer.Long */
+
+  </style>
+
+<div class="highlight"><pre><span class="k">def</span> <span class="nf">sqrt</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">n</span><span class="o">=</span><span class="mi">10000</span><span class="p">):</span>
+    <span class="n">x</span> <span class="o">=</span> <span class="n">y</span> <span class="o">/</span> <span class="mi">2</span>
+    <span class="k">while</span> <span class="n">n</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
+        <span class="n">n</span> <span class="o">-=</span> <span class="mi">1</span>
+        <span class="n">x</span> <span class="o">=</span> <span class="p">(</span><span class="n">x</span> <span class="o">+</span> <span class="n">y</span><span class="o">/</span><span class="n">x</span><span class="p">)</span> <span class="o">/</span> <span class="mi">2</span>
+    <span class="k">return</span> <span class="n">x</span>
+</pre></div>
+<p>
+
+If it is called with <tt>sqrt(1234.0)</tt>,  
+<a href="noopt.txt">a fairly long trace</a> is produced. From this trace
+the optimizer creates
+the
+following preamble (Loop 1) and loop (Loop 0) 
+
+<p><img src="trace1.png"><p>
+
+Looking at the preamble, it starts by making sure that it is not 
+currently being profiled, the guard
+on <tt>i5</tt>, and that the function object have not been changed
+since the trace was made, the guard on <tt>p3</tt>. Somewhat
+intermixed with that, the
+integer variable <tt>n</tt> is unboxed, by making sure <tt>p11</tt>
+points to an integer object and reading out the integer value from
+that object. 
+These operations are not needed in the
+loop (and have been removed from it) as emitting the same guards again
+would be redundant and <tt>n</tt> becomes a virtual before the
+end of the preamble.
+<pre>
+        guard_value(i5, 0, descr=&lt;Guard6&gt;) 
+        guard_nonnull_class(p11, ConstClass(W_IntObject), descr=&lt;Guard7&gt;) 
+        guard_value(p3, ConstPtr(ptr15), descr=&lt;Guard8&gt;) 
+        i16 = getfield_gc_pure(p11, descr=&lt;W_IntObject.inst_intval&gt;)
+</pre>
+
+Next comes a test and a guard implementing the while statement
+followed by the decrementing of <tt>n</tt>. These operation appear
+both in the preamble and in the loop
+<pre>
+        i18 = int_gt(i16, 0)
+        guard_true(i18, descr=&lt;Guard9&gt;) 
+        i20 = int_sub(i16, 1)
+</pre>
+
+After that the two floating point variables <tt>x</tt> and <tt>y</tt>
+are unboxed. Again this is only needed in the preamble. Note how the
+unboxed value of <tt>y</tt>, called <tt>f23</tt>, is passed unchanged
+from the preamble to the loop in arguments of the jump 
+to allow it to be reused. It will not become a virtual
+since it is never changed within the loop.
+<pre>
+        guard_nonnull_class(p12, 17652552, descr=&lt;Guard10&gt;) 
+        guard_nonnull_class(p10, 17652552, descr=&lt;Guard11&gt;) 
+        f23 = getfield_gc_pure(p10, descr=&lt;W_FloatObject.inst_floatval&gt;)
+        f24 = getfield_gc_pure(p12, descr=&lt;W_FloatObject.inst_floatval&gt;)
+</pre>
+
+Following that is the actual calculations performed in the loop in
+form of floating point operations (since the function was called with
+a float argument). These appear in both the loop
+and the preamble.
+<pre>
+        i26 = float_eq(f24, 0.000000)
+        guard_false(i26, descr=&lt;Guard12&gt;) 
+        f27 = float_truediv(f23, f24)
+        f28 = float_add(f24, f27)
+        f30 = float_truediv(f28, 2.000000)
+</pre>
+
+Finally there are some tests checking if a signal was received
+(such as when the user presses ctrl-C) and thus should execute some
+signal handler or if we need to hand over to another thread. This is
+implemented with a counter that is decreased once every iteration. It
+will go below zero after some specific number of iterations, tunable by
+<tt>sys.setcheckinterval</tt>. The counter is read from and written to
+some global location where it also can be made negative by a C-level
+signal handler. 
+<pre>
+        i32 = getfield_raw(32479328, descr=&lt;pypysig_long_struct.c_value&gt;)
+        i34 = int_sub(i32, 2)
+        setfield_raw(32479328, i34, descr=&lt;pypysig_long_struct.c_value&gt;)
+        i36 = int_lt(i34, 0)
+        guard_false(i36, descr=&lt;Guard13&gt;) 
+        jump(p0, p1, p2, p4, p10, i20, f30, f23, descr=&lt;Loop0&gt;)
+</pre>
+
+<H1>Bridges</H1>
+
+When a guard fails often enough, the meta-interpreter is started again
+to produce a new trace starting at the failing guard. The tracing is
+continued until a previously compiled loop is entered. This could
+either be the the same loop that contains the failing guard
+or some completely different loop. If it is the same loop, executing
+the preamble again maybe be unnecessary.
+It is preferable to end the bridge with a jump directly to 
+the loop. To achieve this the optimizer tries to produce <i>short
+  preambles</i> that are inlined at the end of bridges allowing
+them to jump directly to the loop. Inlining is better than jumping to
+a common preamble because most of the inlined short preamble can
+typically be removed again by the optimizer.
+Creating such a short
+preamble is however not always possible. Bridges jumping to loops for which
+no short preamble can be generated have to end with a jump to the
+full preamble instead.
+
+<p>
+
+The short preamble is created by comparing the operations in the
+preamble with the operations in the loop. The
+operations that are in the preamble but not in the loop 
+are moved to the short preamble whenever it is safe to move them to
+the front of the operations remaining. In other words, the full preamble
+is equivalent to the short preamble followed by one iteration of the
+loop. 
+
+<p>
+
+This much has currently been implemented. To give the full picture
+here, there are two more features that 
+hopefully will be implemented in the near future.
+The first is to replace the full preamble, used by the interpreter
+when it reaches a compiled loop, with the short preamble.
+This is currently not done and is probably not as straight forward as
+it might first seem. The problem is where to resume interpreting on a
+guard failure. However, implementing that should save some
+memory. Not only 
+because the preamble will become smaller, but mainly because the
+guards will appear either in the loop or in the preamble, but not
+in both (as they do now). That means there will only be a single bridge and 
+not potentially two copies once the guards are traced.
+
+<p>
+
+The sqrt example above would with a short preamble result in a trace
+like this
+
+<p><img src="trace2.png"><p>
+
+If it is executed long enough, the last guard will be traced to form a
+bridge. The trace will inherit the virtuals from its parent. This can
+be used to optimize away the part of the inlined short preamble
+that deals with virtuals. The resulting bridge should look
+something like
+
+<pre>
+    [p0, p1, p2, p3, p4, f5, i6]
+    i7 = force_token()
+    setfield_gc(p1, i7, descr=&lt;PyFrame.vable_token&gt;)
+    call_may_force(ConstClass(action_dispatcher), p0, p1, descr=&lt;VoidCallDescr&gt;)
+    guard_not_forced(, descr=&lt;Guard19&gt;) 
+    guard_no_exception(, descr=&lt;Guard20&gt;) 
+
+    guard_nonnull_class(p4, 17674024, descr=&lt;Guard21&gt;) 
+    f52 = getfield_gc_pure(p4, descr=&lt;W_FloatObject.inst_floatval&gt;)
+    jump(p1, p0, p2, p3, p4, i38, f53, f52, descr=&lt;Loop0&gt;)
+</pre>
+
+Here the first paragraph comes from the traced bridge and the second
+is what remains of the short preamble after optimization. The
+box <tt>p4</tt> is 
+not a virtual (it contains a pointer to <tt>y</tt> which is never
+changed), and it is only virtuals 
+that the bridge inherit from it's parents. This is why the last two
+operations currently cannot be removed.
+
+<!--
+To remove the remaining
+parts of the short preamble a lot more of the optimizer state would
+have to be saved in every guard. However it should be possible to
+recreate this state from the short preamble. To do that, the bridge
+have to know which of it's input boxes corresponds to which of the
+output boxes (arguments of the last jump) of the short preamble. One
+idea of how to store this information is to introduce some
+VFromStartValue virtuals that would be some pseudo virtuals containing
+a single
+input argument box and it's index. Using this the
+infrastructure in place for storing information about virtuals can be
+used to also store this index.
+
+XXX again, a lot of details from the code
+-->
+
+<p>
+
+Each time the short preamble is inlined, a new copy of each of the
+guards in it is generated. Typically the short preamble is inlined in
+several places and thus there will be several copies of each of those
+guards. 
+If they fail often enough bridges
+from them will be traced (as with all guards). But since there
+typically are several copies of each guard the same bridge
+will be generated in 
+several places. To prevent this, mini-bridges from the inlined guards
+are produced already during the inlining. These mini-bridges contain
+nothing but a jump to the preamble.
+
+<p>
+The mini-bridges needs the arguments of the preamble to be able
+to jump to it. These arguments contain among other things, boxed
+versions of the 
+variables <tt>x</tt> and <tt>y</tt>. Those variables are virtuals in
+the loop, and have to be allocated. Currently those allocations
+are placed in front of the inlined guard. Moving those allocations into
+the mini-bridges is the  second feature that 
+hopefully will be implemented in the near future. 
+<!--
+The current approach actually kills the entire benefit of the inlining in most
+real world cases as typically all the virtuals are forced.
+-->
+After this feature is
+implemented, the result should look something like
+<p><img src="trace3.png"><p>
+
+
+<h2>Multiple specialized versions</h2>
+
+Floating point operations were generated in the trace above
+because <tt>sqrt</tt> was called with a float argument. If it is
+instead called with an int argument, integer operations will be generated. The
+somewhat more complex situations is when both int's and float's are
+used as arguments. Then the jit need to generate multiple versions of
+the same loop, specialized in different ways. The details, given
+below, on how this is achieved is somewhat involved. For the casual
+reader it would make perfect sense to skip to the next section here.
+
+<p>
+
+Consider the case when <tt>sqrt</tt> is first called with a float
+argument (but with <tt>n</tt> small enough not to generate the
+bridge). Then the trace shown above will be
+generated. If <tt>sqrt</tt> is now called with an int argument, the
+guard in the preamble testing that the type of the input object is float
+will fail:
+<pre>
+        guard_nonnull_class(p12, 17652552, descr=&lt;Guard10&gt;) 
+</pre>
+It will fail every iteration, so soon enough a bridge will be
+generated from this guard in the preamble. This guard will end with a
+jump to the same loop, and the optimizer will try to inline
+the short preamble at the end of it. This will however fail
+since now there are two guards on <tt>p12</tt>. One that makes sure it
+is an int and and one that makes sure it is a float. The optimizer
+will detect that the second guard will always fail and mark the bridge
+as invalid. Invalid loops are not passed on to the backend for
+compilation. 
+
+<p>
+
+If a loop is detected to be invalid while inlining the short preamble,
+the metainterpreter will continue to trace for yet another 
+iteration of the loop. This new trace can be compiled as above and
+will produce a new loop with a new preamble that are now specialized
+for int arguments instead of float arguments. The bridge that
+previously became invalid will now be tried again. This time inlining
+the short preamble of the new loop instead. This will produce a set of
+traces connected like this
+
+<p>
+<a href="trace4mag.png"><img src="trace4.png"></a>
+<br>(click for some hairy details)
+<p>
+
+The height of the boxes is this figure represents how many instructions
+they contain (presuming the missing features from the previous section
+are implemented). Loop 0 is specialized for floats and it's preamble have
+been split into two boxes at the failing guard. Loop 2 is specialized
+for ints and is larger than Loop 0. This is mainly because the integer
+division in python does not map to the integer division of the
+machine, but have to be implemented with several instructions (integer
+division in python truncates its result towards minus
+infinity, while the the machine integer division truncates towards
+0). Also the height of the bridge is about the same as the height of
+Loop 2. This is because it contains a full iteration of the loop.
+
+<p>
+
+<h2>A More Advanced Example</h2>
+
+Let's conclude with an example that is a bit more advanced, where this unrolling
+approach actually outperforms the previous approach. Consider
+making a
+<a href="http://en.wikipedia.org/wiki/Fixed-point_arithmetic">fixed-point</a>
+implementation of the square root using 16 bit's of decimals. This can be
+done using the same implementation
+of <tt>sqrt</tt> but calling it with an object of a class representing
+such fixed-point real numbers:
+
+<p>
+<div class="highlight"><pre><span class="k">class</span> <span class="nc">Fix16</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span>
+    <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">val</span><span class="p">,</span> <span class="n">scale</span><span class="o">=</span><span class="bp">True</span><span class="p">):</span>
+        <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">val</span><span class="p">,</span> <span class="n">Fix16</span><span class="p">):</span>
+            <span class="bp">self</span><span class="o">.</span><span class="n">val</span> <span class="o">=</span> <span class="n">val</span><span class="o">.</span><span class="n">val</span>
+        <span class="k">else</span><span class="p">:</span>
+            <span class="k">if</span> <span class="n">scale</span><span class="p">:</span>
+                <span class="bp">self</span><span class="o">.</span><span class="n">val</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">val</span> <span class="o">*</span> <span class="mi">2</span><span class="o">**</span><span class="mi">16</span><span class="p">)</span>
+            <span class="k">else</span><span class="p">:</span>
+                <span class="bp">self</span><span class="o">.</span><span class="n">val</span> <span class="o">=</span> <span class="n">val</span>
+
+    <span class="k">def</span> <span class="nf">__add__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span>
+        <span class="k">return</span>  <span class="n">Fix16</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">val</span> <span class="o">+</span> <span class="n">Fix16</span><span class="p">(</span><span class="n">other</span><span class="p">)</span><span class="o">.</span><span class="n">val</span><span class="p">,</span> <span class="bp">False</span><span class="p">)</span>
+
+    <span class="k">def</span> <span class="nf">__sub__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span>
+        <span class="k">return</span>  <span class="n">Fix16</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">val</span> <span class="o">-</span> <span class="n">Fix16</span><span class="p">(</span><span class="n">other</span><span class="p">)</span><span class="o">.</span><span class="n">val</span><span class="p">,</span> <span class="bp">False</span><span class="p">)</span>
+
+    <span class="k">def</span> <span class="nf">__mul__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span>
+        <span class="k">return</span>  <span class="n">Fix16</span><span class="p">((</span><span class="bp">self</span><span class="o">.</span><span class="n">val</span> <span class="o">&gt;&gt;</span> <span class="mi">8</span><span class="p">)</span> <span class="o">*</span> <span class="p">(</span><span class="n">Fix16</span><span class="p">(</span><span class="n">other</span><span class="p">)</span><span class="o">.</span><span class="n">val</span> <span class="o">&gt;&gt;</span> <span class="mi">8</span><span class="p">),</span> <span class="bp">False</span><span class="p">)</span>
+
+    <span class="k">def</span> <span class="nf">__div__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span>
+        <span class="k">return</span>  <span class="n">Fix16</span><span class="p">((</span><span class="bp">self</span><span class="o">.</span><span class="n">val</span> <span class="o">&lt;&lt;</span> <span class="mi">16</span><span class="p">)</span> <span class="o">/</span> <span class="n">Fix16</span><span class="p">(</span><span class="n">other</span><span class="p">)</span><span class="o">.</span><span class="n">val</span><span class="p">,</span> <span class="bp">False</span><span class="p">)</span>
+</pre></div>
+
+<p>
+
+Below is a table comparing the runtime of the sqrt function above with
+different argument types on different python interpreters. Pypy 1.4.1
+was released before the optimizations described in this post were in place
+while they are in place in the 
+<a href="http://buildbot.pypy.org/nightly/trunk/pypy-c-jit-40390-e1ab35394b0f-linux64.tar.bz2">nightly
+  build from January 5</a>, 
+denoted pypy in the table. There are also the running time for the same
+algorithms implemented in C and compiled with "gcc -O3
+-march=native". Tests were executed on a 2.53GHz Intel Core2
+processor with <tt>n=100000000</tt> iterations.
+Comparing the integer versions with C may be considered a
+bit unfair because of the more advanced integer division operator in
+python. The left part of this table shows runtimes of <tt>sqrt</tt> in
+a program containing a single call to sqrt (i.e. only a single
+specialized version of the loop is needed). The right part shows the
+runtime of <tt>sqrt</tt> when it has been called with a different
+type of argument before.
+
+<p>
+
+<table>
+  <tr><th></th><th colspan=3>First call</th><th></th><th colspan=3>Second call</th></tr>
+  <tr><th></th><th>float</th><th>int</th><th>Fix16</th><th>&nbsp;&nbsp;</th>
+               <th>float</th><th>int</th><th>Fix16</th></tr>
+  <tr><th>cpython</th>
+    <td> 28.18 s</td>
+    <td> 22.13 s</td>
+    <td> 779.04 s</td>
+    <td></td>
+    <td> 28.07 s</td>
+    <td> 22.21 s</td>
+    <td> 767.03 s</td>    
+  </tr>
+  <tr><th>pypy 1.4.1</th>
+    <td> 1.20 s</td>
+    <td> 6.49 s</td>
+    <td> 11.31 s</td>
+    <td></td>
+    <td> 1.20 s</td>
+    <td> 6.54 s</td>
+    <td> 11.23 s</td>
+  </tr>
+  <tr><th>pypy</th>
+    <td> 1.20 s</td>
+    <td> 6.44 s</td>
+    <td> 6.78 s</td>
+    <td></td>
+    <td> 1.19 s</td>
+    <td> 6.26 s</td>
+    <td> 6.79 s</td>
+  </tr>
+  <tr><th>gcc</th>
+    <td> 1.15 s</td>
+    <td> 1.82 s</td>
+    <td> 1.89 s</td>
+    <td></td>
+    <td> 1.15 s</td>
+    <td> 1.82 s</td>
+    <td> 1.89 s</td>
+  </tr>
+</table>
+
+<p>
+
+For this to work in the last case, when Fix16 is the argument type in
+the second type, 
+the trace_limit had to be increased from its default value to prevent
+the metainterpreter from aborting while tracing the second version of
+the loop. Also sys.setcheckinterval(1000000) were used to prevent the
+bridge from being generated. With the bridge the performance of the
+last case is significantly worse. Maybe because the optimizer currently
+fails to generate a short preamble for it. But the slowdown
+seems too big for that to be the only explanation. Below are the runtimes
+numbers with checkinterval set to its default value of 100:
+
+<table>
+  <tr><th></th><th colspan=3>First call</th><th></th><th colspan=3>Second call</th></tr>
+  <tr><th></th><th>float</th><th>int</th><th>Fix16</th><th>&nbsp;&nbsp;</th>
+               <th>float</th><th>int</th><th>Fix16</th></tr>
+  <tr><th>cpython</th>
+    <td> 28.71 s</td>
+    <td> 22.09 s</td>
+    <td> 781.86 s</td>
+    <td></td>
+    <td> 28.28 s</td>
+    <td> 21.92 s</td>
+    <td> 761.59 s</td>
+  </tr>
+  <tr><th>pypy 1.4.1</th>
+    <td> 1.21 s</td>
+    <td> 6.48 s</td>
+    <td> 11.22 s</td>
+    <td></td>
+    <td> 1.72 s</td>
+    <td> 7.58 s</td>
+    <td> 12.18 s</td>
+  </tr>
+  <tr><th>pypy</th>
+    <td> 1.21 s</td>
+    <td> 6.27 s</td>
+    <td> 7.22 s</td>
+    <td></td>
+    <td> 1.20 s</td>
+    <td> 6.29 s</td>
+    <td> 90.47 s</td>
+  </tr>
+</table>
+
+
+<h2>Conclusions</h2>
+Even though we are seeing speedups in a variety of different small
+benchmarks, more complicated examples are not affected much by these
+optimizations. It might partly be because larger examples have longer
+and more complicated loops, and thus allowing optimizations to operate
+across loop boundary will have a smaller relative effect. Another problem is
+that with more complicated examples there will be more bridges, and bridges
+are currently not handled very well (most of the time all virtuals are
+forced at the end of the bridge as explained above). But moving those
+forcings into the mini bridges should fix that. 
+

sqrt/time_sqrt.py

+import sys, time
+from sqrt import sqrt, sqrt2, Fix16
+
+try:
+    import pypyjit
+    pypyjit.set_param(trace_limit=20000)
+except ImportError:
+    pass
+
+type1 = eval(sys.argv[1])
+type2 = eval(sys.argv[2])
+if len(sys.argv) > 3 and sys.argv[3] == '2':
+    sqrt = sqrt2
+    
+
+#sys.setcheckinterval(1000000)
+#sys.setcheckinterval(2)
+if type1:
+    sqrt(type1(1234))
+if type2:
+    a = time.time()
+    sqrt(type2(1234), 100000000)
+    b = time.time()
+    #print sys.argv[1:], b - a
+    print '    <td>', '%.2f' % (b-a), 's</td>'
+    

sqrt/time_sqrt2.py

+import sys, time
+from sqrt import sqrt, sqrt2, Fix16
+
+try:
+    import pypyjit
+    pypyjit.set_param(trace_limit=20000)
+except ImportError:
+    pass
+
+type1 = eval(sys.argv[1])
+type2 = eval(sys.argv[2])
+sqrt = sqrt2
+    
+
+sys.setcheckinterval(1000000)
+#sys.setcheckinterval(2)
+if type1:
+    sqrt(type1(123456))
+if type2:
+    a = time.time()
+    for i in range(10):
+        sqrt(type2(123456), 10000000)
+    b = time.time()
+    #print sys.argv[1:], b - a
+    print '    <td>', '%.2f' % (b-a), 's</td>'
+    

sqrt/trace1.png

Added
New image

sqrt/trace2.png

Added
New image

sqrt/trace3.png

Added
New image

sqrt/trace4.png

Added
New image

sqrt/trace4mag.png

Added
New image
+import os
+from ctypes import *
+import inspect, re
+
+
+def mtime(fn):
+    try:
+        return os.stat(fn)[8]
+    except OSError:
+        return -1
+
+def cfile(fns,cflags='',ldflags=''):
+
+    if isinstance(fns,str):
+        fns=(fns,)
+
+    link=False
+    los=''
+    fbase=fns[0][:-2]
+    la='lib'+fbase+'.la'
+    so='./.libs/lib'+fbase+'.so.0'
+
+    # Don't compile if we are installed centraly
+    frm=inspect.currentframe().f_back
+    d='/'.join(re.split('/', frm.f_code.co_filename)[:-1])
+    installedso = d+'/lib'+fbase+'.so.0.0.0'
+    if mtime(installedso)>0:
+        return CDLL(installedso)
+
+    currdir = os.getcwd()
+    os.chdir(d)
+
+    # Recompiled changed files
+    for i,fn in enumerate(fns):
+        assert fn[-2:]=='.c'
+        fbase=fn[:-2]
+        lo=fbase+'.lo'
+        los+=lo+" "
+        c =fbase+'.c'
+
+        if mtime(lo) <= mtime(c):
+            ret=os.system('libtool --mode=compile gcc -o '+lo+' '+cflags+
+                          ' -c '+c)
+            if ret!=0: raise ImportError("Compilation of "+c+" failed.")
+            link=True
+
+    if link:
+        ret=os.system('libtool --mode=link gcc -o '+la+' '+los+' '+ldflags+
+                      ' -version-info 0:0:0 -rpath /usr/lib/')
+        if ret!=0: raise ImportError("Linking of "+c+" failed.")
+
+    dll = CDLL(so)
+    os.chdir(currdir)
+    return dll
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define min(a, b)  (((a) < (b)) ? (a) : (b))
+#define max(a, b)  (((a) > (b)) ? (a) : (b))
+
+static int pressed_q = 0;
+
+#include <GL/glut.h>
+
+unsigned int new_texture(int w, int h) {
+  unsigned int t;
+
+  glGenTextures(1, &t);
+  glBindTexture(GL_TEXTURE_RECTANGLE_NV, t);
+  glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+  glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+  glTexImage2D(
+    GL_TEXTURE_RECTANGLE_NV,	/* texture target */
+    0,				                /* mipmap level */
+    GL_RGB,			              /* texture internal format */
+    w,	                      /* width */
+    h,	                      /* height */
+    0,				                /* border */
+    GL_RGB,			              /* pixel format of the image */
+                              /* (matching internal format is better) */
+    GL_UNSIGNED_BYTE,		      /* channel type */
+    NULL		                  /* image data */
+    );
+  glBindTexture(GL_TEXTURE_RECTANGLE_NV, 0);
+
+  return t;
+}
+
+static unsigned int tex;
+void glutMainLoopEvent();
+
+volatile int paused=0;
+int step=0;
+
+
+void reshape(int w, int h);
+
+static unsigned char last_key;
+
+unsigned char get_last_key() {
+  unsigned char key;
+  key=last_key;
+  last_key=0;
+  return key;
+}
+
+
+
+void keyboard (unsigned char key, int x, int y) {
+  last_key=key;
+  switch (key) {
+      case ' ':
+        if (paused) paused=0; else paused=1;
+        break;
+      case 'q':
+      case 'Q':
+      case 27:
+        pressed_q = 1;
+        exit(0);
+        break;
+      case 10:
+      case 13:
+        step=1;
+        paused=0;
+        break;
+  }
+}
+
+static int dx,dy, img_width, img_height;
+static float scale;
+
+static int last_mouse_x, last_mouse_y;
+static int mouse_updated = 0;
+
+void mouse(int button, int state, int x, int y) {
+  if (button==GLUT_LEFT_BUTTON && state==GLUT_DOWN) {
+    last_mouse_x = (int)(((float)(x-dx))/scale);
+    last_mouse_y = (int)(((float)(y-dy))/scale);
+    printf("Click: %d,%d\n", last_mouse_x, last_mouse_y);
+    mouse_updated = 1;
+  }
+}
+
+int get_mouse_updated() 
+{
+  if (mouse_updated == 1) {
+    mouse_updated = 0;
+    return 1;  
+  } else return 0;
+}
+
+int get_last_mouse_x() {
+  return last_mouse_x;
+}
+
+int get_last_mouse_y() {
+  return last_mouse_y;
+}
+
+
+void reshape(int w, int h) {
+  scale=min( ((float)w)/((float)img_width), ((float)h)/((float)img_height) );
+  dx=(w-scale*img_width)/2;
+  dy=(h-scale*img_height)/2;
+  glViewport(dx, dy, scale*img_width, scale*img_height);
+}
+
+int glview(int w, int h, char *data, int color, char typecode) {
+  static int win=-1;
+  if (win==-1) {
+    static int ac=0;
+    static char *av[]={"pr"};
+    glutInit(&ac,av);
+
+    glutInitWindowSize(w, h);
+    glutInitWindowPosition((glutGet(GLUT_SCREEN_WIDTH)-w)/2, 
+			   (glutGet(GLUT_SCREEN_HEIGHT)-h)/2);
+    
+    glutInitDisplayMode(GLUT_RGB|GLUT_DOUBLE);
+
+    win=glutCreateWindow("View");
+
+    glEnable(GL_TEXTURE_RECTANGLE_NV);
+
+    glMatrixMode(GL_MODELVIEW);
+    glLoadIdentity();
+    glMatrixMode(GL_PROJECTION);
+    glLoadIdentity();
+    gluOrtho2D(0, 1, 0, 1);
+
+    tex=new_texture(w,h);
+
+    img_width=w; img_height=h;
+  
+    glutKeyboardFunc(keyboard);
+    glutMouseFunc(mouse);
+    glutReshapeFunc(reshape); 
+  }
+
+  int t,s;
+  if (pressed_q==1) return 1;
+
+  if (color) {
+    t=GL_RGB;
+  } else {
+    t=GL_LUMINANCE;
+  } 
+
+  if (typecode == 'B') {
+    s=GL_UNSIGNED_BYTE;
+  } else if (typecode == 'i') {
+    s=GL_INT;
+  } else if (typecode == 'f') {
+    s=GL_FLOAT;
+  } else if (typecode == 'd') {
+    s=GL_DOUBLE;
+  } else {
+    fprintf(stderr, "Unknown pixel typecode %c\n", typecode);
+    return 0;
+  }
+
+
+  glBindTexture(GL_TEXTURE_RECTANGLE_NV, tex);
+  glTexSubImage2D(GL_TEXTURE_RECTANGLE_NV,0,0,0,w,w,t,s,data);
+
+  glClearColor(0.0, 0.0, 0.0, 1.0);
+  glClear(GL_COLOR_BUFFER_BIT);
+
+  glBegin(GL_QUADS);
+  {
+    glTexCoord2i(0, h); glVertex2i(0, 0);
+    glTexCoord2i(w, h); glVertex2i(1, 0);
+    glTexCoord2i(w, 0); glVertex2i(1, 1);
+    glTexCoord2i(0, 0); glVertex2i(0, 1);
+  }
+  glEnd();
+
+  //glutPostRedisplay();
+  glutSwapBuffers();
+  
+  
+  if (step) {
+    paused=1;
+    step=0;
+  }
+  
+  glutMainLoopEvent();
+  while(paused) glutMainLoopEvent();
+
+  return 1;
+}

video/negative.py

+import sys
+from video import Image, view, mplayer
+
+fn = sys.argv[1] if len(sys.argv)>1 else 'tv://'
+
+for img in mplayer(fn):
+    for p in img.pixels(border=50):
+        img[p] = 255 - img[p]
+    view(img)

video/negative2.py

+import sys
+from video import *
+
+fn = sys.argv[1] if len(sys.argv)>1 else 'tv://'
+
+class Neg1(Pixelwise):
+    typecode = 'B'
+    def op(self, p):
+        return 255 - p
+
+def Neg2(img):
+    out = img.new()
+    for p in img.pixels():
+        out[p] = 255 - img[p]
+    return out
+
+def genneg(img):
+    for p in img:
+        yield 255 - p
+
+def Neg3(img):
+    out = img.new()
+    for i, p in enumerate(genneg(img)):
+        out.data[i] = p
+    return out
+
+for img in mplayer(fn):
+    view(Neg3(img))
+from video import Image, view, mplayer
+import sys
+fn = sys.argv[1] if len(sys.argv)>1 else 'tv://'
+
+for img in mplayer(fn):
+    view(img)
+import sys
+from video import *
+
+fn = sys.argv[1] if len(sys.argv)>1 else 'tv://'
+
+for img in mplayer(fn):
+    view(Sqrt(SobelMagnitude(img)), scale=True)
+    #view(SobelDy(img), scale=True)
+import sys
+from video import Image, view, mplayer, Sqrt
+
+fn = sys.argv[1] if len(sys.argv)>1 else 'tv://'
+
+for img in mplayer(fn):
+    view(Sqrt(img), scale=True)
+from array import array
+from cfile import cfile
+import os, re, sys
+import itertools
+from math import sqrt
+
+class Image(object):
+    typecode = 'B'
+    def __init__(self, w, h, data=None):
+        self.width = w
+        self.height = h
+        if data is None:
+            self.data = array(self.typecode, (0,)) * (w * h)
+        else:
+            self.data = array(self.typecode, data)
+
+    def new(self):
+        return Image(self.width, self.height)
+
+    def _idx(self, (x, y)):
+        if 0 <= x < self.width and 0 <= y < self.height:
+            return y * self.width + x
+        raise IndexError
+
+    def __setitem__(self, idx, val):
+        self.data[self._idx(idx)] = val
+
+    def __getitem__(self, idx):
+        return self.data[self._idx(idx)]
+
+    def pixels(self, border=0, xborder=None, yborder=None, dx=1, dy=1):
+        if xborder is None:
+            xborder = border
+        if yborder is None:
+            yborder = border
+        for y in range(yborder, self.height - yborder, dy):
+            for x in range(xborder, self.width - xborder, dx):
+                yield x, y
+
+    def blocks(self, w, h, dx=1, dy=1, pad=None):
+        if pad is None:
+            for y in range(0, self.height - h + 1, dy):
+                for x in range(0, self.width - w + 1, dy):
+                    yield SubImage(self, w, h, x, y, pad)
+        else:
+            xoff = w / 2
+            yoff = h / 2
+            for y in range(0, self.height, dy):
+                for x in range(0, self.width, dy):
+                    yield SubImage(self, w, h, x-xoff, y-yoff, pad)
+
+    def __iter__(self):
+        for p in self.data:
+            yield p
+
+    def force(self):
+        pass
+
+    def __add__(self, other):
+        return Add(self, other)
+    __radd__=__add__
+
+    def __mul__(self, other):
+        return Mul(self, other)
+    __rmul__ = __mul__
+
+    def __div__(self, other):
+        return Div(self, other)
+    def __rdiv__(self, other):
+        return Div(other, self)
+    
+    def __sub__(self, other):
+        return Sub(self, other)
+    def __rsub__(self, other):
+        return Sub(other, self)
+    
+    def __pow__(self, other):
+        return Pow(self, other)
+
+class RGBImage(Image):
+    pass
+
+class VirtualImage(Image):
+    def __init__(self, w, h):
+        self.width = w
+        self.height = h
+        self.data = None
+
+    def force(self):
+        raise NotImplementedError
+
+    def force_if_needed(self):
+        if self.data is None:
+            self.force()
+
+    def __setitem__(self, idx, val):
+        self.force_if_needed()
+        return Image.__setitem__(self, idx, val)
+
+    def __getitem__(self, idx):
+        self.force_if_needed()
+        return Image.__getitem__(self, idx)
+
+    def __iter__(self):
+        raise NotImplementedError
+
+        
+class Pixelwise(VirtualImage):
+    typecode = 'd'
+    def __init__(self, *args):
+        w = set([])
+        h = set([])
+        self.args = []
+        for a in args:
+            if isinstance(a, Image):
+                w.add(a.width)
+                h.add(a.height)
+            else:
+                a = itertools.cycle((a,))
+            self.args.append(a)
+        assert len(w) == len(h) == 1
+        self.width = w.pop()
+        self.height = h.pop()
+        self.data = None
+        
+    def __iter__(self):
+        if self.data is None:
+            for p in zip(*self.args):
+                yield self.op(*p)
+        else:
+            for p in self.data:
+                yield p
+
+    def force(self):
+        data = array(self.typecode, (0,)) * (self.width * self.height)
+        for i, p in enumerate(self):
+            data[i] = p
+        self.data = data
+
+    def op(self, p):
+        raise NotImplementedError
+
+class Blockwise(Pixelwise):
+    size = (3, 3)
+    pad = 0
+    dx = dy = 1
+    def __iter__(self):
+        if self.data is None:
+            inputs = [a.blocks(self.size[0], self.size[1], dx=self.dx, dy=self.dy, pad=self.pad) for a in self.args]
+            for blocks in zip(*inputs):
+                yield self.op(*blocks)
+        else:
+            for p in self.data:
+                yield p
+
+class Sqrt(Pixelwise):
+    def op(self, p):
+        return sqrt(p)
+
+class Add(Pixelwise):
+    def op(self, a, b):
+        return a + b
+
+class Sub(Pixelwise):
+    def op(self, a, b):
+        return a - b
+
+class Mul(Pixelwise):
+    def op(self, a, b):
+        return a * b
+
+class Div(Pixelwise):
+    def op(self, a, b):
+        return a / b
+
+class Pow(Pixelwise):
+    def op(self, a, b):
+        return a ** b
+
+class UInt8(Pixelwise):
+    typecode = 'B'
+    def op(self, p):
+        return int(round(p))
+
+class SobelDx(Blockwise):
+    def op(self, blk):
+        return (1*blk[0, 0] - 1*blk[2, 0] +
+                2*blk[0, 1] - 2*blk[2, 1] +
+                1*blk[0, 2] - 1*blk[2, 2])
+
+class SobelDy(Blockwise):
+    def op(self, blk):
+        return ( 1*blk[0, 0] + 2*blk[1, 0] + 1*blk[2, 0] +
+                -1*blk[0, 2] - 2*blk[1, 2] - 1*blk[2, 2])
+
+def SobelMagnitude(a):
+    return SobelDx(a)**2 + SobelDy(a)**2
+
+    
+class SubImage(Image):
+    def __init__(self, image, width, height, dx, dy, pad=None):
+        self.image = image
+        self.dx = dx
+        self.dy = dy
+        self.width = width
+        self.height = height
+        self.pad = pad
+
+    def __getitem__(self, (x, y)):
+        if 0 <= x < self.width and 0 <= y < self.height:
+            try:
+                return self.image[x + self.dx, y + self.dy]
+            except IndexError:
+                if self.pad is None:
+                    raise
+                else:
+                    return self.pad
+        raise IndexError
+
+
+
+#so = cfile('glview.c', ldflags='-lglut -lGLU -lGL', cflags='-g')
+so = cfile('xvview.c', ldflags='-lX11 -lXv', cflags='-g')
+viewer = so.xvview
+from ctypes import c_int, c_void_p, c_char
+viewer.argtypes = [c_int, c_int, c_void_p, c_int, c_char]
+
+def view(img, scale=False):
+    if scale:
+        img.force()
+        ma, mi = max(img.data), min(img.data)
+        if ma != mi:
+            img = (img - mi) / (ma - mi) * 255
+    if img.typecode != 'B':
+        img = UInt8(img)
+    img.force()
+    viewer(img.width, img.height, img.data.buffer_info()[0],
+           isinstance(img, RGBImage), img.data.typecode)
+    
+def mplayer(fn='tv://'):
+    f = os.popen('mplayer -really-quiet -noframedrop '+
+                 '-vo yuv4mpeg:file=/dev/stdout 2>/dev/null </dev/null ' + fn)
+    hdr = f.readline()
+    m = re.search('W(\d+) H(\d+)', hdr)
+    w, h = int(m.group(1)), int(m.group(2))
+    while True:
+        hdr = f.readline()
+        if hdr != 'FRAME\n':
+            break
+        img = Image(w, h)
+        img.data = array('B')
+        img.data.fromfile(f, w*h)
+        f.read(w*h/2) # Color data
+        yield img
+    
+from video import Image, view
+
+fcnt = 0
+img = Image(640, 480)
+while True:
+    fcnt += 1
+    for x, y in img.pixels:
+        img[x, y] = (x + y + fcnt) & 255
+    view(img)
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <X11/Xlib.h>
+#include <X11/Xutil.h>
+#include <X11/extensions/Xvlib.h>
+#include <X11/keysym.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <X11/extensions/XShm.h>
+
+
+/* ok the following constant should be by right included thru in Xvlib.h */
+#ifndef XV_YV12
+#define XV_YV12 0x32315659
+#endif
+
+#ifndef XV_YUY2
+#define XV_YUY2 0x32595559
+#endif
+
+#ifndef XV_UYVY
+#define XV_UYVY 0x59565955
+#endif
+
+
+#define RGB2YUV(r, g, b, y, u, v)\
+  y = (9798*r + 19235*g + 3736*b)  / 32768;\
+  u = (-4784*r - 9437*g + 14221*b)  / 32768 + 128;\
+  v = (20218*r - 16941*g - 3277*b) / 32768 + 128;\
+  y = y < 0 ? 0 : y;\
+  u = u < 0 ? 0 : u;\
+  v = v < 0 ? 0 : v;\
+  y = y > 255 ? 255 : y;\
+  u = u > 255 ? 255 : u;\
+  v = v > 255 ? 255 : v
+
+static inline
+void rgb2yuy2 (unsigned char *RGB, unsigned char *YUV, int NumPixels) {
+  int i, j;
+  register int y0, y1, u0, u1, v0, v1 ;
+  register int r, g, b;
+
+  for (i = 0, j = 0; i < 3 * NumPixels; i += 6, j += 4)
+    {
+      r = RGB[i + 0];
+      g = RGB[i + 1];
+      b = RGB[i + 2];
+      RGB2YUV (r, g, b, y0, u0 , v0);
+      r = RGB[i + 3];
+      g = RGB[i + 4];
+      b = RGB[i + 5];
+      RGB2YUV (r, g, b, y1, u1 , v1);
+      YUV[j + 0] = y0;
+      YUV[j + 1] = (u0+u1)/2;
+      YUV[j + 2] = y1;
+      YUV[j + 3] = (v0+v1)/2;
+    }
+}
+
+static inline
+void bgr2yuy2 (unsigned char *RGB, unsigned char *YUV, int NumPixels) {
+  int i, j;
+  register int y0, y1, u0, u1, v0, v1 ;
+  register int r, g, b;
+
+  for (i = 0, j = 0; i < 3 * NumPixels; i += 6, j += 4)
+    {
+      r = RGB[i + 2];
+      g = RGB[i + 1];
+      b = RGB[i + 0];
+      RGB2YUV (r, g, b, y0, u0 , v0);
+      r = RGB[i + 5];
+      g = RGB[i + 4];
+      b = RGB[i + 3];
+      RGB2YUV (r, g, b, y1, u1 , v1);
+      YUV[j + 0] = y0;
+      YUV[j + 1] = (u0+u1)/2;
+      YUV[j + 2] = y1;
+      YUV[j + 3] = (v0+v1)/2;
+    }
+}
+
+static inline
+void yuv2yuy2 (unsigned char *YUV, unsigned char *YUV2, int NumPixels) {
+  int i, j;
+  //register int y0, y1, u0, u1, v0, v1 ;
+  //register int r, g, b;
+
+  for (i = 0, j = 0; i < 3 * NumPixels; i += 6, j += 4)
+    {
+      YUV2[j + 0] = YUV[i+2];
+      YUV2[j + 1] = (YUV[i+1]+YUV[i+4])/2;
+      YUV2[j + 2] = YUV[i+5];
+      YUV2[j + 3] = (YUV[i+0]+YUV[i+3])/2;
+    }
+}
+
+static inline
+void y2yuy2 (unsigned char *Y, unsigned char *YUV2, int NumPixels) {
+  int i, j;
+  //register int y0, y1, u0, u1, v0, v1 ;
+  //register int r, g, b;
+
+  for (i = 0, j = 0; i < NumPixels; i += 2, j += 4)
+    {
+      YUV2[j + 0] = Y[i+0];
+      YUV2[j + 1] = 127;
+      YUV2[j + 2] = Y[i+1];
+      YUV2[j + 3] = 127;
+    }
+}
+
+int xvview(int w, int h, char *data, int color, char typecode) {
+  static Display *display=NULL;
+  static int adaptor=-1;
+  static XvAdaptorInfo *info;
+  static const long format=XV_YUY2;
+  static unsigned char  *frame_buffer;
+
+  static Window window;
+  static XvImage *xv_image;
+  static GC gc;
+  static XShmSegmentInfo Shminfo;
+  static XGCValues xgcv;
+
+  static int paused = 0;
+
+  if (!display) {
+    int num_adaptors;
+    int num_formats;
+    XvImageFormatValues *formats=NULL;
+    int i,j;
+    char xv_name[5];
+
+    display=XOpenDisplay(getenv("DISPLAY"));
+    if(display==NULL) {
+      fprintf(stderr, "Could not open display \"%s\"\n",getenv("DISPLAY"));
+      return 0;
+    }
+  
+    XvQueryAdaptors(display,DefaultRootWindow(display),&num_adaptors,&info);
+        
+    for(i=0;i<num_adaptors;i++) {
+      formats=XvListImageFormats(display,info[i].base_id,&num_formats);
+      for(j=0;j<num_formats;j++) {
+	xv_name[4]=0;
+	memcpy(xv_name,&formats[j].id,4);
+	if(formats[j].id==format) {
+	  fprintf(stderr, "using Xv format 0x%x %s %s\n",formats[j].id,xv_name,
+		 (formats[j].format==XvPacked)?"packed":"planar");
+	  if(adaptor<0)adaptor=i;
+	}
+      }
+    }
+    XFree(formats);
+    if(adaptor<0) {
+      fprintf(stderr, "No suitable Xv adaptor found"); 
+      return 0;
+    }
+
+    long background=0x010203;
+
+    frame_buffer=(unsigned char *)malloc(w*h*2);
+    window=XCreateSimpleWindow(display,DefaultRootWindow(display),0,0,
+			       w,h,0,
+			       WhitePixel(display,DefaultScreen(display)),
+			       background);
+    XSelectInput(display,window,
+		 StructureNotifyMask|KeyPressMask|ButtonPressMask);
+    XMapWindow(display,window);
+    gc=XCreateGC(display,window,0,&xgcv);
+    if (!XShmQueryExtension(display)) 
+      fprintf(stderr,"Shared memory not supported\n");
+
+    Atom att=XInternAtom(display, "XV_SET_DEFAULTS", True);
+    if (att!=None) XvSetPortAttribute(display,info[adaptor].base_id,att,1);
+    xv_image = (XvImage *) XvShmCreateImage(display, info[adaptor].base_id, 
+					    format, NULL, w, h, &Shminfo);
+    Shminfo.shmid    = shmget(IPC_PRIVATE, xv_image->data_size, 
+			      IPC_CREAT | 0777);
+    Shminfo.shmaddr  = (char *) shmat(Shminfo.shmid, 0, 0);
+    Shminfo.readOnly = False;
+    xv_image->data = Shminfo.shmaddr;
+    XShmAttach(display, &Shminfo);
+    XSync(display, False);
+    shmctl(Shminfo.shmid, IPC_RMID, 0);
+  }
+
+  if (color && typecode == 'B') {
+    rgb2yuy2(data, xv_image->data, w*h);
+  } else if (!color && typecode == 'B') {
+    y2yuy2(data, xv_image->data, w*h);
+  } else {
+    fprintf(stderr, "Unknown image format: %c color=%d\n", typecode, color);
+    return 0;
+  }
+
+  XvShmPutImage(display,info[adaptor].base_id,window,gc,xv_image,
+		0,0,w,h,
+		0,0,w,h,
+		False);
+  XFlush(display);
+  
+  XEvent xev;
+  do {
+    while(XPending(display)>0) {
+      XNextEvent(display,&xev);
+      if (xev.type == KeyPress) {
+	char key[1];
+	XLookupString(&xev.xkey, key, 1, NULL, NULL);
+	if (key[0]==' ') paused=~paused;
+	else if (key[0] == 27) exit(0);
+      } else if (xev.type == ButtonPress) {
+	if (xev.xbutton.button==1) 
+	  printf("Click: (%d,%d)\n",xev.xbutton.x,xev.xbutton.y);
+      }
+    }
+  } while (paused);
+
+  
+  return 1;
+}