<!-- Source: mdma-slides / slides.html (full commit) -->
<!DOCTYPE html>
<!-- The slide text is English; lang lets browsers and assistive technology treat it as such. -->
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>mdma-slides</title>
    <!-- Layout width stays at 1024; user-scalable=no was dropped so pinch-zoom remains available. -->
    <meta name="viewport" content="width=1024">
    <!-- deck.js's core css -->
    <link rel="stylesheet" href="deck.js/core/deck.core.css">
    <!-- deck.js extension CSS files -->
    <link rel="stylesheet" href="deck.js/extensions/codemirror/deck.codemirror.css">
    <link rel="stylesheet" href="deck.js/extensions/goto/deck.goto.css">
    <link rel="stylesheet" href="deck.js/extensions/hash/deck.hash.css">
    <link rel="stylesheet" href="deck.js/extensions/menu/deck.menu.css">
    <link rel="stylesheet" href="deck.js/extensions/navigation/deck.navigation.css">
    <link rel="stylesheet" href="deck.js/extensions/scale/deck.scale.css">
    <link rel="stylesheet" href="deck.js/extensions/status/deck.status.css">
    <!-- all css in the css dir: Keydown CSS, your custom CSS, and themes from deck.js -->
    <link rel="stylesheet" href="css/keydown.css">
    <link rel="stylesheet" href="css/default.css">
    <link rel="stylesheet" href="css/horizontal-slide.css">
    <link rel="stylesheet" href="css/mdma-slides.css">
    <link rel="stylesheet" href="css/swiss.css">
    <!-- Modernizr (provided for legacy browsers); loaded synchronously in the head, as before -->
    <script src="deck.js/support/modernizr.custom.js"></script>
  </head>
  <body class='deck-container keydown'>
    <!-- Title slide: the strong letters of "metadata management" spell out "mdma" -->
    <section class="full-background meta slide">
      <div class="spacer top"></div>
      <div class="content">
        <h1><strong>m</strong>eta<strong>d</strong>ata <strong>ma</strong>nagement <em>in four iterations</em></h1>
      </div>
      <div class="spacer bottom"></div>
    </section>
    <!-- Slide: goals of the FINC mdma project -->
    <section class="left slide">
      <div class="spacer top"></div>
      <div class="content">
        <h1>Goals for FINC mdma</h1>
        <ul>
          <li>
            <p>handle <strong>various</strong> data sources</p>
            <!-- examples of the data sources named in the bullet above -->
            <p><em>BSZ, Naxos, E-Book Library, Nutzergesteuerte Erwerbung (PDA), ...</em></p>
          </li>
          <li><p>export <strong>FINCMARC</strong> as deliverable for VuFind <strong>frontend</strong></p></li>
          <li>handle <strong>deduplication</strong>, <strong>Libero</strong> item data, MARC field rewrites, ...</li>
        </ul>
      </div>
      <div class="spacer bottom"></div>
    </section>
    <!-- Slide: 1st iteration, followed by an XML configuration sample and a PHP sample -->
    <section class="left slide">
      <div class="spacer top"></div>
      <div class="content">
        <h1>1st iteration ↺</h1>
        <ul>
          <li>PHP (261) and XML-driven (89) configuration</li>
          <li>leveraging common utils: <em>mv, yaz-marcdump, iconv</em></li>
          <li><strong>bootstrapped</strong> the metadata management</li>
        </ul>
        <!-- Code samples: line breaks inside each textarea are written as &#x000A; character references -->
        <p><textarea class="code" display="none" mode="xml"><source file="KA_010_1994-tit.mrc">&#x000A;    <prefix tag="001" prefix="PPN000000_" /> &#x000A;    <mergeaut />&#x000A;</source>&#x000A;&#x000A;<source file="NaxosNML-full.mrc">&#x000A;    <iconv from="iso8859-1" />&#x000A;    <copytag totag="001" fromtag="028a" />&#x000A;    <prefix tag="001" prefix="EXT000005_" />&#x000A;    <tagreplace tag="856u"&#x000A;        pattern="^.*(=[^=]+$)" &#x000A;        replace="http://example.com/sc.asp?ver=2.0&amp;item_code$1"/>&#x000A;</source></textarea>
        </p>
        <p><textarea class="code" display="none" mode="php">$filelist = glob("$tdir/$workfix*");&#x000A;    echo "processing " . count($filelist) . " chunks\n";&#x000A;    foreach($filelist as $file) {&#x000A;        foreach($source->children() as $action) {&#x000A;            ...</textarea>
        </p>
      </div>
      <div class="spacer bottom"></div>
    </section>
    <!-- Slide: 2nd iteration, followed by an XML datasource definition and a Python processing loop -->
    <section class="left slide">
      <div class="spacer top"></div>
      <div class="content">
        <h1>2nd iteration ↺</h1>
        <ul>
          <li>Python (1814) and XML-driven (148) configuration</li>
          <li>libraries: <em>pymarc, libxml, sqlalchemy</em></li>
          <li><strong>NEW!</strong> metadata snapshot stored in <strong>database</strong> (MySQL)</li>
        </ul>
        <!-- Code samples: line breaks inside each textarea are written as &#x000A; character references -->
        <p><textarea class="code" display="none" mode="xml"><datasource kind="title" source_id="0" record_field="001"&#x000A;    content_type="application/marc" location="/var/i/000/2*/*-tit.mrc">&#x000A;    <process>&#x000A;        <fm_cloneOriginal />&#x000A;        <fm_setField field="001" value="finc_id()" />&#x000A;        <bag_getKeyFromOriginal field="020" subfield="a" key="isbn" />&#x000A;        <fm_setLocalInfo />&#x000A;        <bag_getLiberoItemData />&#x000A;    </process>&#x000A;</datasource></textarea>
        </p>
        <p><textarea class="code" display="none" mode="python">for j, record in enumerate(record_iterator, start=1):&#x000A;    ...&#x000A;    for command in _commands:&#x000A;        if 'skip_this' not in bag:&#x000A;            bag = command.execute(bag)&#x000A;    ...</textarea>
        </p>
      </div>
      <div class="spacer bottom"></div>
    </section>
    <!-- Full-background interstitial slide: the 2nd iteration was slow on the stated data volume -->
    <section class="full-background left slide slow">
      <div class="spacer top"></div>
      <div class="content">
        <h1><strong>slow</strong> on 9.1GB of data and 19M records (and that's just one data source)</h1>
      </div>
      <div class="spacer bottom"></div>
    </section>
    <!-- Slide: the three listed causes of the 2nd iteration's slowness -->
    <section class="left slide">
      <div class="spacer top"></div>
      <div class="content">
        <h1>2nd iteration slowness</h1>
        <ul>
          <li>needed to reach out to Libero for <strong>every</strong> record</li>
          <li>no multiprocessing / threading</li>
          <li>a bit of ORM overhead</li>
        </ul>
      </div>
      <div class="spacer bottom"></div>
    </section>
    <!-- Slide: 3rd iteration (2nd iteration plus the listed optimisations), with a Python worker-queue sample -->
    <section class="left slide">
      <div class="spacer top"></div>
      <div class="content">
        <h1>3rd iteration ↺</h1>
        <ul>
          <li>2nd iteration +
            <ul>
              <!-- wording fixed: plural "workers" -->
              <li>multiprocessing with 10–200 workers</li>
              <li>plain SQL</li>
              <li>optimized (batched) Libero-requests</li>
              <li>a bit of memcached</li>
            </ul>
          </li>
          <li>Python (1760), XML (451)</li>
        </ul>
        <!-- Code sample: line breaks inside the textarea are written as &#x000A; character references -->
        <p><textarea class="code" display="none" mode="python">QUEUE_SIZE = 20000&#x000A;tasks = multiprocessing.JoinableQueue(QUEUE_SIZE)&#x000A;num_workers = args.workers or multiprocessing.cpu_count() * 40&#x000A;workers = [ Worker(tasks, appconfig, iconf) for i in range(num_workers) ]&#x000A;_ = [ w.start() for w in workers ]&#x000A;...&#x000A;with open(filename, 'r') as handle:&#x000A;    reader = pymarc.MARCReader(handle.read(), to_unicode=True)&#x000A;    for i, record in enumerate(reader):&#x000A;        tasks.put(record)</textarea>
        </p>
      </div>
      <div class="spacer bottom"></div>
    </section>
    <!-- Full-background interstitial slide: verdict on the 3rd iteration -->
    <section class="full-background left slide tape">
      <div class="spacer top"></div>
      <div class="content">
        <h1>3rd iteration: faster, but still sequential – still more control over data needed</h1>
      </div>
      <div class="spacer bottom"></div>
    </section>
    <!-- Slide: 4th iteration (new processing model), with a Python ISBN-extraction sample -->
    <section class="left slide">
      <div class="spacer top"></div>
      <div class="content">
        <h1>4th iteration ↺</h1>
        <ul>
          <li>shift to a slightly different processing model</li>
          <li><strong>NEW!</strong> <strong>cache</strong> all Libero item data (MySQL) – less network overhead</li>
          <li><strong>NEW!</strong> hold all raw source data in <strong>elasticsearch</strong> – faster, more fine grained access</li>
          <li>no more XML configuration, but one (or more) python scripts per data source</li>
          <li>Python (1708), Java (1292), bash (800)</li>
        </ul>
        <!-- Code sample: line breaks inside the textarea are written as &#x000A; character references -->
        <p><textarea class="code" display="none" mode="python">isbns = set()&#x000A;for subfield in doc['content']['020']:&#x000A;    if 'a' in subfield:&#x000A;        # 020.a may look like: '9780948838903 (pbk.) :'&#x000A;        isbns.add(subfield['a'].split()[0])&#x000A;&#x000A;for isbn in isbns:&#x000A;    # dedup here</textarea>
        </p>
      </div>
      <div class="spacer bottom"></div>
    </section>
    <!-- Slide: 4th iteration data access; three shell samples (simple query, facet query, piped processing) -->
    <section class="left slide">
      <div class="spacer top"></div>
      <div class="content">
        <h1>4th iteration data access</h1>
        <ul>
          <li>flexible ad-hoc queries on MARC</li>
          <li>all records with ISBN 9781430272304</li>
        </ul>
        <p><textarea class="code" display="none" mode="sh">$ curl -XGET 0.0.0.0:9200/_search?q=content.020.a:9781430272304</textarea>
        </p>
        <ul>
          <li>kinds of classifications on NEP</li>
        </ul>
        <!-- The first line of this sample ends with a backslash so that the JSON body on the
             following lines is still the argument of -d when the command is pasted into a shell. -->
        <p><textarea class="code" display="none" mode="sh">$ curl -XPOST "http://0.0.0.0:9200/nep/_search?pretty=true" -d \&#x000A;    '{ "query" : { "match_all" : {} }, &#x000A;       "facets" : { "tags" : { "terms" : { "field" : "content.072.2" }}}}'</textarea>
        </p>
        <ul>
          <li>pipeable processing (streams records from elasticsearch via processing script into metadata db)</li>
        </ul>
        <p><textarea class="code" display="none" mode="sh">$ ./scroll.sh --index bsz --meta.kind=title | ./scripts/bsz.py</textarea>
        </p>
      </div>
      <div class="spacer bottom"></div>
    </section>
    <!-- Slide: 4th iteration summary; benefits first, then the drawbacks after "but ..." -->
    <section class="left slide">
      <div class="spacer top"></div>
      <div class="content">
        <h1>4th iteration summary</h1>
        <ul>
          <li>fast (~10x speedup with fewer workers vs 2nd iteration)</li>
          <li>flexible data access, easier debugging</li>
        </ul>
        <p><strong>but ...</strong></p>
        <ul>
          <li>data redundancy (files and index)</li>
          <li>no more declarative XML configuration</li>
        </ul>
      </div>
      <div class="spacer bottom"></div>
    </section>
    <!-- Slide: lessons learned (cache, accessibility of data, expect change) -->
    <section class="left slide">
      <div class="spacer top"></div>
      <div class="content">
        <h1>lessons learned so far</h1>
        <ul>
          <li><strong>cache</strong> everything (libero data), never go over the network for data you'll access often, stale data (few hours) is ok here</li>
          <li>make data item <strong>accessible</strong> (elasticsearch for currently 7 sources, more to come soon) / within milliseconds (still needs some work)</li>
          <li>assume your data-flow can <strong>change</strong> tomorrow (new deduplication workflow, new source, new fields, etc.)</li>
        </ul>
      </div>
      <div class="spacer bottom"></div>
    </section>
    <!-- Closing full-background slide -->
    <section class="Nakonechna full-background left slide">
      <div class="spacer top"></div>
      <div class="content">
        <h1>FIN</h1>
      </div>
      <div class="spacer bottom"></div>
    </section>
    <!-- deck.js navigation extension: previous / next links.
         The visible content is only an arrow glyph, so aria-label supplies a readable name. -->
    <a class="deck-prev-link" href="#" title="Previous" aria-label="Previous slide">&#8592;</a>
    <a class="deck-next-link" href="#" title="Next" aria-label="Next slide">&#8594;</a>
    <!-- deck.js hash extension: permalink whose visible content is just "#" -->
    <a class="deck-permalink" href="." title="Permalink to this slide" aria-label="Permalink to this slide">#</a>
    <!-- deck.js status extension: "current / total" placeholders; both spans are empty in the markup -->
    <p class="deck-status">
      <span class="deck-status-current"></span>
      /
      <span class="deck-status-total"></span>
    </p>
    <!-- jQuery & deck.js core (plain synchronous scripts, so they execute in document order) -->
    <script src="deck.js/support/jquery.1.6.4.min.js"></script>
    <script src="deck.js/core/deck.core.js"></script>
    <!-- deck.js extension JS files: CodeMirror, its deck.js glue and the bundled language modes -->
    <script src="deck.js/extensions/codemirror/codemirror.js"></script>
    <script src="deck.js/extensions/codemirror/deck.codemirror.js"></script>
    <script src="deck.js/extensions/codemirror/mode/clike/clike.js"></script>
    <script src="deck.js/extensions/codemirror/mode/clojure/clojure.js"></script>
    <script src="deck.js/extensions/codemirror/mode/coffeescript/coffeescript.js"></script>
    <script src="deck.js/extensions/codemirror/mode/css/css.js"></script>
    <script src="deck.js/extensions/codemirror/mode/diff/diff.js"></script>
    <script src="deck.js/extensions/codemirror/mode/haskell/haskell.js"></script>
    <script src="deck.js/extensions/codemirror/mode/htmlmixed/htmlmixed.js"></script>
    <script src="deck.js/extensions/codemirror/mode/javascript/javascript.js"></script>
    <script src="deck.js/extensions/codemirror/mode/lua/lua.js"></script>
    <script src="deck.js/extensions/codemirror/mode/php/php.js"></script>
    <script src="deck.js/extensions/codemirror/mode/plsql/plsql.js"></script>
    <script src="deck.js/extensions/codemirror/mode/python/python.js"></script>
    <script src="deck.js/extensions/codemirror/mode/r/r.js"></script>
    <script src="deck.js/extensions/codemirror/mode/rst/rst.js"></script>
    <script src="deck.js/extensions/codemirror/mode/ruby/ruby.js"></script>
    <script src="deck.js/extensions/codemirror/mode/scheme/scheme.js"></script>
    <script src="deck.js/extensions/codemirror/mode/smalltalk/smalltalk.js"></script>
    <script src="deck.js/extensions/codemirror/mode/sparql/sparql.js"></script>
    <script src="deck.js/extensions/codemirror/mode/stex/stex.js"></script>
    <script src="deck.js/extensions/codemirror/mode/velocity/velocity.js"></script>
    <script src="deck.js/extensions/codemirror/mode/xml/xml.js"></script>
    <script src="deck.js/extensions/codemirror/mode/xmlpure/xmlpure.js"></script>
    <script src="deck.js/extensions/codemirror/mode/yaml/yaml.js"></script>
    <!-- remaining deck.js extensions: goto, hash, menu, navigation, scale, status -->
    <script src="deck.js/extensions/goto/deck.goto.js"></script>
    <script src="deck.js/extensions/hash/deck.hash.js"></script>
    <script src="deck.js/extensions/menu/deck.menu.js"></script>
    <script src="deck.js/extensions/navigation/deck.navigation.js"></script>
    <script src="deck.js/extensions/scale/deck.scale.js"></script>
    <script src="deck.js/extensions/status/deck.status.js"></script>
    <!-- your custom JS here, including call to initialize deck.js-codemirror -->
    <script src="js/mdma-slides.js"></script>
    <!-- Start the deck once the DOM is ready, using every element matching ".slide". -->
    <script>
      $(document).ready(function () {
        $.deck('.slide');
      });
    </script>
  </body>
</html>