Anonymous avatar Anonymous committed b783dec

Added script to match spectrum in IDPicker files to mzML files.

Comments (0)

Files changed (1)

mzmatcher/mzmatcher.py

+#!/usr/bin/env python
+"""Script for analyzing the output of IDPicker in relation to
+the original mzML file that IDPicker used.
+
+Required Packages:
+  * stopwatch (http://pypi.python.org/pypi/stopwatch/)
+  * multiprocessing
+Suggested Packages:
+  * psyco
+
+2009 - John Paulett (john -at- 7oars.com)
+
+"""
+from __future__ import with_statement
+import base64
+import multiprocessing
+import os
+import stopwatch
+import struct
+import xml.dom.minidom as minidom
+
+@stopwatch.clockit
+def remove_peaks(fileprefix, ided, nonided):
+    """Incrementally increases the threshold and writes the result to a file."""
+
+    ## Sort the lists according to the number of peaks
+    ided = sorted(ided)
+    nonided = sorted(nonided)
+
+    with open('%s.csv' % fileprefix, 'w') as f:
+        for i in xrange(0, 1001):
+            ## Count the number of spectra that have at least i peaks
+            nonjunk_ided = [x for x in ided if x >= i]
+            nonjunk_nonided = [x for x in nonided if x >= i]
+            f.write('%i,%i,%i\n' % (i, len(nonjunk_ided), len(nonjunk_nonided)))
+    print 'Finished Removing'
+
+def peaks_from_binary(data):
+    """Convert binary data to the list of values. 
+    From mmass 2.4 document_mzdata.py"""
+    data = base64.b64decode(data)
+    endian = '<'
+    pointsCount = len(data) / struct.calcsize(endian+'f')
+    start, end = 0, len(data)
+    points = struct.unpack(endian + 'f' * pointsCount, data[start:end])
+    return points
+
+def parse_xml_file(filename):
+    """Open and parse the XML file, *filename*. Returns an instance of 
+    a DOM Element.
+    """
+    return minidom.parse(filename)
+
+def find_spectra(filename):
+    """Returns all the XML spectrum tags."""
+    xml = parse_xml_file(filename)
+    return xml.getElementsByTagName('spectrum')
+    
+def get_peak_count(xml):
+    array = xml.getElementsByTagName('binaryDataArray')[0]
+    return int(array.getAttribute('encodedLength'))
+    
+def get_peaks(xml, search):
+    """Recover the list of peaks from the mzML files."""
+    for array in xml.getElementsByTagName('binaryDataArray'):
+        if search in array.toxml():
+            binary = array.getElementsByTagName('binary')[0]
+            return peaks_from_binary(binary.firstChild.nodeValue)
+
+@stopwatch.clockit
+def process_raw_file(fileprefix):
+    """Handles mzML files (all peak information)"""
+    print 'Processing mzML file'
+    spectra = find_spectra(fileprefix + '.mzML')
+    
+    ids, mz_sum, intensity_sum = {}, {}, {}
+    for spectrum in spectra:
+        id = int(spectrum.attributes['id'].value)
+        ids[id] = get_peak_count(spectrum)
+        mz = get_peaks(spectrum, 'm/z array')
+        mz_sum[id] = sum(mz)
+        
+        intensity = get_peaks(spectrum, 'intensity array')
+        intensity_sum[id] = sum(intensity)
+    return ids, mz_sum, intensity_sum
+
+@stopwatch.clockit
+def process_idpicker_file(fileprefix):
+    """Handles the IDPicker files (identified spectra).  Returns the list of 
+    IDs from the file"""
+    print 'Processing idpXML file'
+    spectra = find_spectra(fileprefix + '.idpXML')
+    return [int(spectrum.attributes['scan'].value) for spectrum in spectra]
+
+def run(fileprefix):
+    """Get the data from the XML files, then match the two of them up."""
+    print fileprefix    
+
+    print 'Loading files'
+    raw_ids, mz_sum, intensity_sum = process_raw_file(fileprefix)
+    idpicker_ids = process_idpicker_file(fileprefix)
+    
+    print 'Processed files, now matching'
+    ided, nonided = [], []
+    with open(fileprefix + '_roc.txt','w') as roc_file:
+        for raw_id, raw_value in raw_ids.iteritems():
+            if raw_id in idpicker_ids:
+                ided.append(raw_value)
+                indicator = 'I'
+            else:
+                nonided.append(raw_value)
+                indicator = 'U'
+            roc_line = '%i %s %i %f %f\n' % (raw_id, indicator, raw_value, mz_sum[raw_id], intensity_sum[raw_id])
+            roc_file.write(roc_line)
+    print 'Finished matching. IDed=%i, Non-IDed=%i' % (len(ided), len(nonided))
+    
+    print 'Removing peaks'
+    remove_peaks(fileprefix, ided, nonided)
+
+@stopwatch.clockit
+def main():
+    print 'Starting'
+    print '============================================================='
+    
+    file_list = ['~/data/1232']
+    
+    ## spawn separate processes to move load to other cores
+    pool = multiprocessing.Pool()
+    pool.map(run, file_list)
+    
+    print '============================================================='
+    print 'All Finished'
+
+if __name__ == '__main__':
+    try:
+        ## Try to enable a speedup
+        import psyco
+        psyco.full() 
+    except ImportError:
+        print 'Could not optimize with psyco.'
+    main()
+    
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.