Source

psilib / mappers.py

Full commit
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
#! /usr/bin/env python

## Copyright (c) 1999 - 2002 L. C. Rees.  All rights reserved.
## See COPYRIGHT file for license terms.

'''mappers automates creation of PSI maps. It can map network (HTTP, FTP),
archive (ZIP, TAR.GZ, TAR), or filesystem datasources.'''

# The description above must be the first statement in the module to be
# picked up as the module docstring (__doc__); metadata follows it.
__title__ = 'mappers'
__version__ = '0.03-test'
__author__ = 'L.C. Rees (xanimal@users.sf.net)'


class Automapper:

    '''Autogenerates a PSI map of a datasource.

    Everything imported or assigned in the class body below becomes a
    class attribute, so it is reached through "self._name" in the methods
    and is shared by every instance until an instance overrides it.'''

    # Helpers from psiutils: tag lookup (tagsbyName), node name lookup
    # (getname) and the PSI namespace (psins)
    from psiutils import tagsbyName as _tagsbyName
    from psiutils import getname as _getname
    from psiutils import psins as _psins
    # Class for PSI collections and resources
    from som import Collection as _col
    from som import Resource as _res
    # OS and DOM modules
    import os as _os
    from os import path as _path
    from Ft.Xml.cDomlette import implementation as _imp
    
    # Creates document and its root
    # NOTE(review): built once at class-definition time, so all instances
    # share this document and root element
    _doc = _imp.createDocument(_psins, u'psi:psi', None)
    _root = _doc.firstChild
    # Clear argument flags (__init__ sets them on the instance when the
    # matching flag string is passed)
    _dohtml, _dorec, _doarc, = None, None, None
                
    def __init__(self, ip=None, op=None, *vargs):
        '''Sets option flags, then optionally maps the input and stores output

        Arguments:
        ip -- input path (default: None); mapped immediately when given
        op -- output path (default: None)
        vargs -- option flag strings:
                 "dohtm" embeds tidied XHTML of HTML resources in the map
                 "dorec" walks directories recursively
                 "doarc" maps the contents of archives
                 Unrecognized flags are ignored.'''
        # Extract argument flags for specialized processing
        for arg in vargs:
            # Determines if XHTML is embedded in a PSI map
            if arg == 'dohtm':
                # _walkhtml looks nr and tidy up as module globals
                global nr, tidy
                # nr reads a string into cDomlette; tidy cleans up HTML
                from Ft.Xml.Domlette import NonvalidatingReader as nr
                from mx.Tidy import tidy
                from psiutils import attdel
                # Reader exceptions and mimetype detector
                import mimetypes as mt
                from Ft.Xml import ReaderException as rex
                self._rex, self._mt, self._dohtml = rex, mt, True
                self._attdel = attdel
            # Flag for recursive processing
            elif arg == 'dorec': self._dorec = True
            # Flag to map archive contents
            elif arg == 'doarc':
                # Archive and mimetype detector
                from psiutils import isarchive
                import mimetypes as mt
                self._isarchive, self._mt, self._doarc = isarchive, mt, True
        # Map the datasource only when an input path was really given.  The
        # former test "ip or op != None" parsed as "ip or (op != None)", so
        # passing only an output path tried to read a None input and failed.
        if ip: self.read(ip)
        # Record the output path whenever either path was supplied
        if ip or op is not None: self.setoutput(op)

    def __repr__(self):
        '''Uses the XML text of the PSI map as the representation'''
        xml = self.tostring()
        return xml

    def _collection(self, cn, pn=None):
        '''Builds a PSI collection node and optionally attaches it

        Arguments:
        cn -- collection name
        pn -- parent node to attach to (default: None)

        Returns the new collection node.'''
        node = self._makenode(self._col, cn)
        # Without a (truthy) parent the node is returned unattached
        if not pn:
            return node
        # New collections go in front of the parent's existing children
        pn.insertBefore(node, pn.firstChild)
        return node

    def _getparent(self, sp, root, strong=None, depth=-2):
        '''Finds the collection node that a split path points at
        
        Collections under root whose name equals sp[depth] are looked up.
        When several match, or when strong is set, the candidates are
        narrowed by comparing their ancestors' names with the path
        components in front of sp[depth].

        Arguments:
        sp -- split path made from child's path
        root -- root node to search
        strong -- strongly verifies entire path (default: None)
        depth -- index into sp of the name to look for (default: -2, the
                 last-but-one component, i.e. the child's parent)

        Returns the matching collection node, or None when nothing matches
        (including the implicit None when narrowing leaves no candidate).'''

        def findparent(depth):
            '''Climbs the DOM via parentNode to narrow the candidates
            
            Works on the "parents" dict of the enclosing call, which maps
            an index in "matches" to the ancestor reached so far.  Entries
            whose ancestor name differs from the path are deleted.

            Arguments:
            depth -- index in sp of the name checked last; the next check
                     uses the component before it'''
            if len(sp) != 2:
                # If no parents to check, end function
                if len(parents) == 0: return None
                # If parents, narrow down number of possible parents
                elif strong or len(parents) != 1:
                    depth = depth + -1
                    # keys() is a list copy here (Python 2), so deleting
                    # entries inside the loop is safe
                    for key in parents.keys():
                        # Step up to the next ancestor of this candidate
                        parent = parents.get(key).parentNode
                        # If the parent is found
                        if parent:
                            # Ancestor fits the path: keep climbing from it
                            if sp[depth] == getname(parent):
                                parents[key] = parent
                            # Remove any parent that doesn't fit path
                            else: del parents[key]
                        # Ran out of ancestors: stop narrowing; entries
                        # still in "parents" are left as they are
                        else: return None
                    # Recurse to check the next path component if necessary
                    findparent(depth)

        # Get collections matching parent name
        matches = self._tagsbyName(root, None, 'collection', sp[depth])
        getname = self._getname
        # Set to None if no children found
        if len(matches) == 0: return None
        # If more than one match, narrow down
        elif strong or len(matches) != 1:
            if len(sp) != 2:
                depth, parents = depth + -1, dict()
                # Weed out collections with different parents
                for match in matches:
                    parent = match.parentNode                    
                    if parent and sp[depth] == getname(parent):
                        parents[matches.index(match)] = parent
                # If more than one parent, narrow down parents
                findparent(depth)
                # Return the first surviving candidate; falls through and
                # returns None implicitly when none survived
                if len(parents) != 0: return matches[parents.keys()[0]]
            # Two-component path: nothing to verify, take the first match
            else: return matches[0]
        # If one parent is found, return
        else: return matches[0]
                
    def _makenode(self, cls, name):
        '''Creates a PSI node and imports it into this map's document

        Arguments:
        cls -- class used to build the PSI node
        name -- name given to the new node

        Returns the node as imported into the document.'''
        # Byte strings are decoded as utf-8 (bad bytes dropped); values
        # unicode() rejects with TypeError are passed through untouched
        try:
            name = unicode(name, 'utf-8', 'ignore')
        except TypeError:
            pass
        # Build the node with cls, then import it into the document
        return self._doc.importNode(cls(name))

    def _mapflat(self, ip):
        '''Returns PSI map built from the flat path list of a datasource

        The datasource is spidered (FTP or HTTP URL) or opened as a
        tarball, falling back to a ZIP archive.  Every path it lists is
        split on "/" and turned into nested collections and resources.

        Arguments:
        ip -- input path: an ftp:// or http:// URL, or an archive pathname

        Returns the root collection of the map.  If the archive cannot be
        opened (IOError) an error is printed and the process exits.'''

        def makepath(sp, kind='collection'):
            '''Fills a DOM hierarchy if a PSI node's parent isn't found
        
            Arguments:
            sp -- split path to verify
            kind -- kind of PSI node to create, "collection" or
                    "resource" (default: "collection")'''

            def findparent(tn):
                '''Finds or creates parents for a PSI node
        
                Arguments:
                tn -- temporary node hierarchy'''
                # Shorten split path by one
                if len(sp) != 2: del sp[-1]
                # Get parent of current PSI node
                parent = getparent(sp, root, 1)
                # Attach to parent if it is already in the root node
                if parent: parent.insertBefore(tn, parent.firstChild)
                # Otherwise create new parent and look for parent's parent
                else:
                    temp = collection(sp[-2])
                    temp.insertBefore(tn, temp.firstChild)
                    findparent(temp)

            # Look up parent
            parent = getparent(sp, root, 1)
            # If parent found...
            if parent:
                # Create resource and attach to parent
                if kind == 'resource': walkfile(sp[-1], fp, parent, source)
                # Or create a collection if it doesn't exist
                elif not getparent(sp, root, 1, -1): collection(sp[-1], parent)
            # Otherwise make parent and look for its parent
            else:
                # Create parent
                temp = collection(sp[-2])
                # Create resource and attach to new parent
                if kind == 'resource': walkfile(sp[-1], fp, temp, source)
                # Or create a new collection if it doesn't exist
                elif not getparent(sp, root, 1, -1): collection(sp[-1], temp)
                if len(sp) != 2: findparent(temp)

        # Avoid lookups and create default root
        getparent, collection = self._getparent, self._collection
        walkfile, root = self._walkfile, self._collection(u'root')
        # Per-source extras; None means "this kind of source has none".
        # Explicit sentinels replace the former reliance on NameError from
        # unbound locals, which also hid NameErrors raised while mapping.
        source, fullpaths, tl = None, None, None
        # If FTP, process FTP
        if ip.find('ftp://') != -1:
            # Import URL parser and FTP spider
            from spider import ftpspider
            import urlparse as up
            self._up = up
            # Get partial paths, full paths, and FTP session
            flatlist, fullpaths, self._session = ftpspider(ip, 10, 500)
        elif ip.find('http://') != -1:
            # Import web spider and URL library
            from spider import httpspider
            import urllib as ulib
            self._ulib = ulib
            # Get partial and full paths
            flatlist, fullpaths = httpspider(ip, 10, 500)
        else:
            # Archives are rooted at a collection named after the file
            root = collection(self._path.split(ip)[1])
            # Try opening tarball
            try:
                # Import tarfile handler
                import tarfile
                # Maintain source for other possible extractions
                source = tarfile.open(ip)
                # Make path and member lists (same order, same length)
                flatlist, tl = source.getnames(), source.getmembers()
            except (NameError, tarfile.ReadError):
                # Import zipfile handler
                from zipfile import ZipFile
                # Maintain source for other possible extractions
                source = ZipFile(ip)
                # Make path list
                flatlist = source.namelist()
            # If no archive, terminate process
            except IOError:
                print 'Error: Invalid pathname "%s"' % ip
                import sys
                sys.exit(0)
        try:
            # Process paths by position so that duplicate paths still pick
            # up their own full path / tar member (list.index() returned
            # the first occurrence and rescanned the list every time).
            # An index loop keeps this free of newer builtins.
            for pp in range(len(flatlist)):
                path = flatlist[pp]
                sp = path.split('/')
                # Remove empty strings
                if sp[0] == '': del sp[0]
                if sp[-1] == '': del sp[-1]
                # Insert root into each path
                sp.insert(0, self._getname(root))
                # Spiders supply a full path; archives use the member path
                if fullpaths is not None: fp = fullpaths[pp]
                else: fp = path
                # Tar members know if they are directories; everywhere
                # else a trailing '/' marks a directory
                if tl is not None: isdir = tl[pp].isdir()
                else: isdir = path[-1] == '/'
                # Directories become collections, anything else resources
                if isdir: makepath(sp)
                else: makepath(sp, 'resource')
        finally:
            # Release the archive handle once all members are mapped
            if source is not None: source.close()
        return root
    
    def _mapdirs(self, u, ln, nested):
        '''Maps one directory and its entries into the PSI map

        The signature matches the visitor expected by self._path.walk().

        Arguments:
        u -- unused; present only because self._path.walk() passes it
        ln -- pathname of the directory being mapped
        nested -- names of the entries inside that directory'''
        join, isfile, isdir = self._path.join, self._path.isfile, self._path.isdir
        recursive, docroot = self._dorec, self._root
        # Break the directory path into components, dropping a trailing
        # empty one
        parts = ln.split(self._os.sep)
        if parts[-1] == '': parts.pop()
        # The directory itself becomes a collection
        node = self._collection(parts[-1])
        for entry in nested:
            full = join(ln, entry)
            # Files become resources of this collection
            if isfile(full):
                self._walkfile(entry, full, node)
                continue
            # In a recursive walk, subdirectories get their own visit
            if recursive:
                continue
            # Non-recursive: record subdirectories as empty collections
            if isdir(full):
                self._collection(entry, node)
        # Attach this collection to its parent collection
        try:
            # Look the parent up again only when it differs from the one
            # cached by the previous call
            if self._current != parts[-2]:
                self._cp = self._getparent(parts, docroot)
            self._cp.insertBefore(node, self._cp.firstChild)
            # Keep the cached parent name current
            self._current = self._getname(self._cp)
        # No cached or found parent: attach directly to the document root
        except AttributeError:
            docroot.insertBefore(node, docroot.firstChild)
            # This collection becomes the cached parent
            name = self._getname(node)
            self._cp = node
            self._current = name

    def _resource(self, rn, pn):
        '''Builds a PSI resource node and appends it to its parent

        Arguments:
        rn -- resource name
        pn -- parent node the resource is appended to

        Returns the new resource node.'''
        node = self._makenode(self._res, rn)
        # Resources go after the parent's existing children
        pn.appendChild(node)
        return node

    def _walkdirs(self, ip):
        '''Maps a directory, recursively when the "dorec" flag is set

        Arguments:
        ip -- base input directory path'''
        visit = self._mapdirs
        # Non-recursive scan: map just the base directory's own listing
        if not self._dorec:
            visit(None, ip, self._os.listdir(ip))
            return
        # Recursive scan: let the path walker call the mapper per directory
        self._path.walk(ip, visit, None)

    def _walkfile(self, pp, fp, pn, source=None):
        '''Maps one file as an archive collection or as a resource

        With the "doarc" flag, archives are mapped as nested collections;
        with the "dohtm" flag, HTML files get their tidied XHTML embedded
        in the resource.  Everything else becomes a plain resource.

        Arguments:
        pp -- partial path name (also used as temporary local filename)
        fp -- full path name
        pn -- parent node
        source -- open archive the file lives in (default: None)'''
        # Temporary files written by this call.  Only these are deleted
        # afterwards, so an unrelated file of the same name that already
        # exists in the working directory is never removed.
        created = []

        def readfile(fp):
            '''Reads content from file

            fp -- file path to read'''
            # Tries tar then zip then generic file opening
            try:
                # Try tar extract
                try:
                    member = source.extractfile(fp)
                    content = member.read()
                # Try zip
                except AttributeError: content = source.read(fp)
            # Try generic file opener, closing the handle when done
            except AttributeError:
                handle = open(fp)
                try: content = handle.read()
                finally: handle.close()
            # Return contents
            return content

        def readpath():
            '''Tries to read file contents from all sources'''
            # Processes a URL when the URL library has been loaded
            try: content = self._ulib.urlopen(fp).read()
            # Otherwise try the FTP session
            except AttributeError:
                try:
                    # Avoid name lookup
                    session = self._session
                    # Open local file
                    local = open(pp, 'wb')
                    created.append(pp)
                    try:
                        # Path part of the URL
                        tp = self._up.urlsplit(fp)[2]
                        # Download file through the session
                        session.retrbinary('RETR %s' % tp, local.write)
                    # Close local file even when the download fails
                    finally: local.close()
                    # Extract content
                    content = readfile(pp)
                # Extract content as default fallback
                except AttributeError: content = readfile(fp)
            return content

        # Avoid name lookups
        os, path, mapflat = self._os, self._path, self._mapflat
        doarc, dohtml = self._doarc, self._dohtml
        # If archive flag set and file is detected as archive, map it
        if doarc and self._isarchive(fp):
            pn.insertBefore(mapflat(fp), pn.firstChild)
            return
        # Otherwise, fall back to mimetype-driven mapping.  mt is only
        # bound when a flag is set; the tests below short-circuit on the
        # same flags before touching it.
        if doarc or dohtml:
            mt = self._mt.guess_type(pp)
            archivetypes = ['application/x-tar', 'application/zip']
        # Scans archives if option set
        if doarc and mt[0] in archivetypes:
            # If the file needs to be written to filesystem, do so
            try:
                if not path.exists(pp) or path.getsize(pp) == 0:
                    # Fetch first, then write and close explicitly
                    content = readpath()
                    temp = open(pp, 'wb')
                    created.append(pp)
                    try: temp.write(content)
                    finally: temp.close()
            except OSError: pass
            pn.insertBefore(mapflat(pp), pn.firstChild)
            # Clear the temporary file if this call wrote it
            if pp in created:
                try: os.remove(pp)
                # Cache name of file if can't be deleted immediately
                except OSError: self._sc = pp
            # Try clearing any formerly locked files
            try: os.remove(self._sc)
            except (AttributeError, OSError): pass
        # Make normal resource processing default
        else:
            resource = self._resource(pp, pn)
            # Embed XHTML in PSI map if option set
            if dohtml and mt[0] == 'text/html':
                # Pull down file and try embedding HTML
                try:
                    html = self._walkhtml(readpath())
                    # _walkhtml gives None when nothing can be embedded
                    if html is not None: resource.appendChild(html)
                except SystemError: pass
                # Delete the temporary file if this call wrote it
                if pp in created:
                    try: os.remove(pp)
                    except OSError: pass

    def _walkhtml(self, ip):
        '''Turns HTML into an XHTML DOM node ready for embedding

        Relies on the module globals nr and tidy, which __init__ sets
        when the "dohtm" flag is passed.

        Arguments:
        ip -- HTML input handed to tidy (within this class the caller
              passes the file's contents)

        Returns the first child of the parsed document, or None when the
        reader rejects the tidied markup.'''
        try:
            # Tidy to XHTML, then parse the result into a DOM tree
            tidied = tidy(ip, output_xhtml=1)[2]
            document = nr.parseString(tidied, 'file://file')
        # Don't embed if parsing errors
        except self._rex:
            return None
        # Only the first child is kept for cleanliness
        element = document.firstChild
        # Drop the xmlns attribute; nodes the helper cannot handle (e.g.
        # documents made up of comments) raise AttributeError and are left
        # as they are
        try:
            self._attdel(element, 'xmlns')
        except AttributeError:
            pass
        return element

    def read(self, ip):
        '''Sets the input path and generates the PSI map of that datasource

        Any map left in the document by an earlier read is discarded first.

        Arguments:
        ip -- input pathname: a directory, an archive file, or an
              ftp:// / http:// URL'''
        # Set input attribute
        self.input = ip
        # Empty the document root completely; removing only the first
        # child left parts of an earlier map behind
        try:
            while self._root.hasChildNodes():
                self._root.removeChild(self._root.firstChild)
        except AttributeError: pass
        # Forget the parent collection cached by _mapdirs during an earlier
        # read, so new collections cannot attach to a discarded node
        for name in ('_cp', '_current'):
            try: delattr(self, name)
            except AttributeError: pass
        # _walkdirs if source is a filesystem. Otherwise, _mapflat
        if self._path.isdir(ip): self._walkdirs(ip)
        else: self._root.appendChild(self._mapflat(ip))
        
    def setoutput(self, op):
        '''Remembers where the PSI map is to be written

        Arguments:
        op -- output path, kept in the "output" attribute'''
        setattr(self, 'output', op)

    def write(self, op=None):
        '''Writes PSI map to file

        Arguments:
        op -- output pathname (default: None, meaning the path stored by
              setoutput); ".psi" is appended unless the name ends with it'''
        # Import XML file writer
        from psiutils import toxmlfile
        # Use output attribute if not overridden
        if op is None: op = self.output
        # Append .psi extension unless the name already ends with it; a
        # substring test would also accept ".psi" in the middle of a path
        if not op.endswith('.psi'): op = ''.join([op, '.psi'])
        # Write the file
        toxmlfile(self._doc, op)

    def tocompressed(self, op=None):
        '''Writes PSI map as gzip-compressed XML

        Arguments:
        op -- output pathname (default: None, meaning the path stored by
              setoutput); ".psa" is appended unless the name ends with it'''
        # Import gzip class
        from gzip import GzipFile
        # Use output attribute if not overridden
        if op is None: op = self.output
        # Append .psa extension unless the name already ends with it; a
        # substring test would also accept ".psa" in the middle of a path
        if not op.endswith('.psa'): op = ''.join([op, '.psa'])
        # Write string to gzip file and close it explicitly so the
        # compressed stream is flushed and finished
        archive = GzipFile(op, 'wb')
        try: archive.write(self.tostring())
        finally: archive.close()

    def todom(self):
        '''Hands back the DOM document holding the PSI map'''
        document = self._doc
        return document

    def tostring(self):
        '''Serializes the PSI map document to an XML string'''
        # The serializer is imported on demand from psiutils
        from psiutils import tostring as serialize
        xml = serialize(self._doc)
        return xml