#!/usr/bin/env python
# Snippet created by Dénes Türei
# Dénes Türei 2018
# turei.denes@gmail.com
import os
import sys
import imp
import re
import collections
import itertools
import urllib.request
import bs4
class Zsebinet(object):
    """Scrape the `/szuleszetek/` listing of zsebinet.hu into a TSV file.

    The listing page is fetched first; every table row that links to a
    detail page is then followed, the labelled fields of that page are
    collected, and finally all records are written to ``outfile`` as
    tab separated values.

    Attributes set while running:
        url:  address of the page currently being processed.
        soup: parsed document of the most recently fetched page.
        rec:  field label -> list of values for the most recent detail page.
        data: record name -> record dict (fields plus city/cat/url).
    """

    # Template for absolute addresses; `%s` receives the site-relative path.
    baseurl = 'http://zsebinet.hu%s'
    # Path of the exported table.
    outfile = 'zsebinet.tsv'
    # Runs of two or more whitespace characters, collapsed to one space
    # when values are exported.
    resp = re.compile(r'\s{2,}')

    def __init__(self):
        self.urls = set()
        self.data = collections.defaultdict(dict)

    def reload(self, children=False):
        """Reload the defining module and rebind this instance to the new class.

        Intended for interactive development: after editing the module the
        existing instance picks up the new method definitions while keeping
        its state. ``children`` is accepted for compatibility and is unused.
        """
        # Imported locally: the `imp` module this method used to rely on is
        # deprecated and was removed in Python 3.12.
        import importlib

        modname = self.__class__.__module__
        mod = __import__(modname, fromlist=[modname.split('.')[0]])
        importlib.reload(mod)
        self.__class__ = getattr(mod, self.__class__.__name__)

    def main(self):
        """Scrape every page and write the result to ``outfile``."""
        self.iter_pages()
        self.export()

    def fetch_page(self):
        """Download ``self.url`` and store the parsed document in ``self.soup``.

        Raises:
            urllib.error.URLError: if the page cannot be retrieved.
        """
        sys.stdout.write('\t[ INFO ] Loading `%s`\n' % self.url)
        req = urllib.request.Request(self.url)
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(req) as response:
            html = response.read()
        self.soup = bs4.BeautifulSoup(html, 'lxml')

    def process_page(self):
        """Fetch ``self.url`` and collect its labelled fields into ``self.rec``.

        Every ``div.field-label`` becomes a key; its value is the list of
        stripped texts of the ``div.field-item`` elements inside the
        following ``div.field-items`` sibling. A label without such a
        sibling is skipped (it is later exported as ``NA``).
        """
        self.fetch_page()
        self.rec = {}
        for label_div in self.soup.find_all('div', {'class': 'field-label'}):
            items_div = label_div.find_next_sibling(
                'div', {'class': 'field-items'}
            )
            if items_div is None:
                continue
            self.rec[label_div.text.strip()] = [
                item.text.strip()
                for item in items_div.find_all('div', {'class': 'field-item'})
            ]

    def iter_pages(self):
        """Walk the listing table and scrape the detail page of every row.

        Rows that lack any of the expected elements (first cell, image,
        ``h6``, link) are skipped silently -- presumably header or layout
        rows. A detail page that fails to load or parse is reported on
        stderr and skipped, so one bad page does not abort the whole run.
        """
        self.url = self.baseurl % '/szuleszetek/'
        self.fetch_page()
        # find_all() returns a list, so replacing self.soup inside
        # process_page() does not disturb this iteration.
        for tr in self.soup.find_all('tr'):
            try:
                city = tr.td.text.strip()
                name = tr.img.get('title')
                cat = tr.h6.text
                href = tr.a.get('href')
            except AttributeError:
                # One of the child elements is missing: not a data row.
                continue
            if name is None or href is None:
                # Without a title there is no key to store the record
                # under; without a link there is no page to follow.
                continue
            self.url = self.baseurl % href
            try:
                self.process_page()
            except Exception as err:
                # Best effort: keep going, but leave a trace of what was lost.
                sys.stderr.write(
                    '\t[ WARN ] Skipping `%s`: %s\n' % (self.url, err)
                )
                continue
            self.data[name] = self.rec
            self.data[name]['city'] = city
            self.data[name]['cat'] = cat
            self.data[name]['url'] = self.url

    def export(self):
        """Write all collected records to ``outfile`` as a TSV table.

        The first column is the record name; the remaining columns are the
        union of all field keys in first-seen order, with ``:`` and ``?``
        stripped from the header labels. Header and data rows always have
        the same number of columns.
        """
        sys.stdout.write(
            '\n\t[ INFO ] Exporting data to `%s`\n' % self.outfile
        )
        # Ordered, de-duplicated union of the keys of all records.
        fields = list(dict.fromkeys(
            key for rec in self.data.values() for key in rec
        ))
        # The scraped text is Hungarian; fix the encoding instead of relying
        # on the locale default.
        with open(self.outfile, 'w', encoding='utf-8') as fp:
            header = ['name'] + [key.strip(':?') for key in fields]
            fp.write('%s\n' % '\t'.join(header))
            for name, rec in self.data.items():
                row = [name]
                row.extend(self._format_cell(rec, key) for key in fields)
                fp.write('%s\n' % '\t'.join(row))

    def _format_cell(self, rec, key):
        """Return the text of one table cell for field ``key`` of ``rec``.

        Missing fields become ``NA``. List values are joined with ``;``
        after stripping spaces and percent signs from each item, and the
        site's ``Nincs adat`` placeholder (Hungarian for "no data") is
        mapped to ``NA``. Whitespace runs are collapsed in all values.
        """
        if key not in rec:
            return 'NA'
        value = rec[key]
        if isinstance(value, list):
            return ';'.join(
                'NA' if item == 'Nincs adat'
                else self.resp.sub(' ', item.strip(' %'))
                for item in value
            )
        return self.resp.sub(' ', value)
# Comments (0)
# You can clone a snippet to your computer for local editing. Learn more.