Snippets

Dénes Türei Downloads data from the zsebinet.hu obstetric institutions database

Created by Dénes Türei
#!/usr/bin/env python

# Dénes Türei 2018
# turei.denes@gmail.com

import os
import sys
import imp
import re
import collections
import itertools

import urllib.request
import bs4

class Zsebinet(object):
    """
    Scraper for the obstetric institutions database of zsebinet.hu.

    Loads the listing page, follows the link found in each table row to
    the page of the institution, collects the labelled fields from that
    page and finally writes all records into a tab separated file.

    Attributes set at run time:
        url: URL of the page to be loaded next (or loaded most recently).
        soup: ``bs4.BeautifulSoup`` object of the most recently loaded page.
        rec: Fields of the most recently processed institution page.
        data: Dict of records (dicts) with institution names as keys.

    Example::

        Zsebinet().main()
    """

    # URL template, the path of the page is substituted into it
    baseurl = 'http://zsebinet.hu%s'
    # path of the tab separated output file
    outfile = 'zsebinet.tsv'
    # runs of 2 or more whitespace characters; replaced by one space at export
    resp    = re.compile(r'\s{2,}')
    # characters which would break the row/column structure of the output
    _tsv_breakers = re.compile(r'[\t\r\n]')

    def __init__(self):
        """Creates an empty scraper; nothing is downloaded here."""

        self.urls = set()
        self.data = collections.defaultdict(dict)

    def reload(self, children = False):
        """
        Reloads the module defining this class and rebinds this instance
        to the freshly loaded class, so code changes take effect without
        creating a new object.

        :param bool children: Not used; kept for interface compatibility.
        """

        # `imp` is deprecated and has been removed in Python 3.12, hence
        # `importlib` is used here, imported locally.
        import importlib

        modname = self.__class__.__module__
        mod = importlib.import_module(modname)
        importlib.reload(mod)
        self.__class__ = getattr(mod, self.__class__.__name__)

    def main(self):
        """Downloads all the pages and exports the collected data."""

        self.iter_pages()
        self.export()

    def fetch_page(self):
        """
        Downloads the page at ``self.url`` and stores its parsed content
        in ``self.soup``.

        Errors raised by ``urllib`` or by the parser are not handled here.
        """

        sys.stdout.write('\t[ INFO ] Loading `%s`\n' % self.url)
        req = urllib.request.Request(self.url)

        # the context manager makes sure the connection gets closed
        with urllib.request.urlopen(req) as resp:

            html = resp.read()

        self.soup = bs4.BeautifulSoup(html, 'lxml')

    def process_page(self):
        """
        Downloads the page at ``self.url`` and collects its fields into
        ``self.rec``: a dict with the field labels as keys and lists of
        the stripped texts of the field items as values.
        """

        self.fetch_page()
        self.rec = {}

        for div in self.soup.find_all('div', {'class': 'field-label'}):

            label = div.text.strip()
            items = div.find_next_sibling('div', {'class': 'field-items'})

            # a label without an items container yields an empty list
            # instead of an error which would drop the whole record
            self.rec[label] = (
                []
                if items is None else
                [
                    it.text.strip()
                    for it in items.find_all('div', {'class': 'field-item'})
                ]
            )

    def iter_pages(self):
        """
        Loads the listing page and processes the page of each institution
        listed in its table, storing the records in ``self.data``.

        From each table row the text of the first ``td`` (city), the
        ``title`` of the ``img`` (name), the text of the ``h6``
        (category) and the ``href`` of the ``a`` are used. Rows lacking
        any of these elements are skipped silently. Pages failing to
        download or parse are reported on stderr and skipped.
        """

        self.url = self.baseurl % '/szuleszetek/'
        self.fetch_page()

        for tr in self.soup.find_all('tr'):

            try:

                city = tr.td.text.strip()
                name = tr.img.get('title')
                cat  = tr.h6.text.strip()
                href = tr.a.get('href')

            except AttributeError:

                # one of the elements is missing (None): this row is
                # not an institution entry
                continue

            if not href:

                continue

            self.url = self.baseurl % href

            try:

                self.process_page()

            except Exception as exc:

                # best effort: one broken page should not stop the run,
                # but the failure should be visible
                sys.stderr.write(
                    '\t[ WARN ] Skipping `%s`: %s\n' % (self.url, exc)
                )
                continue

            record = self.rec
            record['city'] = city
            record['cat'] = cat
            record['url'] = self.url
            self.data[name] = record

    def _tsv_safe(self, text):
        """Replaces tabs and line breaks in ``text`` by spaces."""

        return self._tsv_breakers.sub(' ', text)

    def _format_value(self, value):
        """
        Converts one value of a record to the text of a table cell.

        Items of lists are stripped of spaces and percent signs and
        joined by semicolons; `Nincs adat` items become `NA`. In all
        texts runs of whitespace are collapsed to one space.
        """

        if isinstance(value, list):

            return ';'.join(
                'NA'
                if item == 'Nincs adat' else
                self.resp.sub(' ', item.strip(' %'))
                for item in value
            )

        return self.resp.sub(' ', value)

    def export(self):
        """
        Writes the records in ``self.data`` into ``self.outfile`` as a
        UTF-8 encoded tab separated table.

        The first column is the name of the institution, followed by one
        column for each field present in any of the records, in order of
        first occurrence. Missing fields are written as `NA`. Colons and
        question marks are stripped from the ends of the column names.
        """

        sys.stdout.write(
            '\n\t[ INFO ] Exporting data to `%s`\n' % self.outfile
        )

        # The field keys are kept apart from the `name` column: the name
        # is the key of the record, not one of its fields. Looking it up
        # among the fields would add an extra `NA` column to each row,
        # shifting the values against the header.
        fields = []
        seen = set()

        for rec in self.data.values():

            for key in rec:

                if key not in seen:

                    seen.add(key)
                    fields.append(key)

        header = ['name'] + [key.strip(':?') for key in fields]

        # explicit encoding: the data contains accented characters
        with open(self.outfile, 'w', encoding = 'utf-8') as fp:

            fp.write(
                '%s\n' % '\t'.join(self._tsv_safe(col) for col in header)
            )

            for name, rec in self.data.items():

                row = ['NA' if name is None else name]
                row.extend(
                    self._format_value(rec[key]) if key in rec else 'NA'
                    for key in fields
                )
                fp.write(
                    '%s\n' % '\t'.join(self._tsv_safe(cell) for cell in row)
                )

Comments (0)

HTTPS SSH

You can clone a snippet to your computer for local editing. Learn more.