Snippets
Created by
Dénes Türei
#!/usr/bin/env python
# Downloads HTML tables from the ksh.hu webpage and exports them to CSV.
# Author: turei.denes@gmail.com 2021
import csv
import itertools
import os
import re
import urllib.request

import bs4
# URL template of the pages to download; ``%s`` is replaced by a page id
url = 'https://www.ksh.hu/docs/hun/xstadat/xstadat_eves/i_zhc014%s.html'
# Page ids substituted into ``url``; iterated one character at a time
pages = 'abcd'
# Path of the CSV file written by write_csv
outfile = 'jovedelmek.csv'
# Field separator used in the CSV output
sep = ';'
# Integer-like field: optional leading minus, then only digits and whitespace
reint = re.compile(r'^-?[\d\s]+$')
# Decimal-like field: same as ``reint`` but commas are accepted as well
refloat = re.compile(r'^-?[\d\s,]+$')
def main():
    """
    Run the whole pipeline: download the pages, convert the numeric
    fields and export the result to CSV.

    :return: the processed table, identical to what was written to file
    :rtype: list[list]
    """
    processed = process_numbers(table_multi_page())
    write_csv(processed)
    return processed
def table_multi_page():
    """
    Download every page listed in ``pages`` and join them side by side.

    Row *i* of the result is row *i* of the first page followed by row *i*
    of the second page and so on.  Because the rows are paired with
    ``zip``, the result has as many rows as the shortest page.

    :return: the combined table
    :rtype: list[list[str]]
    """
    per_page = map(table_one_page, pages)
    return [
        list(itertools.chain.from_iterable(parts))
        for parts in zip(*per_page)
    ]
def table_one_page(page):
    """
    Download one page and return the rows of all tables found on it.

    :param page: page id substituted into the ``url`` template
    :return: rows of every ``<table>`` element of the page, concatenated
        in the order ``find_all`` returns the tables
    :rtype: list[list[str]]
    """
    page_url = url % page
    # The context manager closes the HTTP response as soon as it is read,
    # instead of leaving the connection open until garbage collection.
    with urllib.request.urlopen(page_url) as response:
        html = response.read()
    soup = bs4.BeautifulSoup(html, 'html.parser')
    return list(itertools.chain.from_iterable(
        table_to_list(table_soup)
        for table_soup in soup.find_all('table')
    ))
def table_to_list(table_soup, encoding='utf-8'):
    """
    Transform an HTML table to a two-dimensional list of strings.

    Cells spanning several columns or rows (``colspan``, ``rowspan``) are
    split: their text is repeated at every position they cover.  Rows that
    are too short are padded with empty strings (best-effort repair, maybe
    not stable for badly broken tables).

    Adapted from https://github.com/4sushi/htmlParse/blob/master/htmlParse.py

    :param table_soup: parsed table element; it must provide ``find_all``
        and its cells must provide ``attrs`` and ``get_text``
    :param encoding: encoding used to decode a cell text that is delivered
        as bytes instead of str
    :type encoding: str
    :return: the rows of the table, each a list of cell texts
    :rtype: list[list[str]]
    """
    recomma = re.compile(r'\s+,\s+')
    recell = re.compile(r'(td|th)')

    def _add_cell(table, i_line, i_col, val, nb_line, nb_col):
        # Insert ``val`` at (i_line, i_col), growing the table if needed;
        # returns the updated line and column counts.
        while len(table) <= i_line:
            table.append([])
            nb_line += 1
        # Repair table: pad the row so that the insert lands at i_col
        while len(table[i_line]) < i_col:
            table[i_line].append('')
        table[i_line].insert(i_col, val)
        nb_col = max(nb_col, len(table[i_line]))
        return nb_line, nb_col

    def _get_span(cell, attr):
        # A missing or malformed span attribute counts as a span of 1
        try:
            return int(cell.attrs.get(attr, 1))
        except (TypeError, ValueError):
            return 1

    def _get_cell_val(cell):
        text = cell.get_text(separator=' ')  # bs4
        if text is None:
            text = ''
        elif isinstance(text, bytes):
            text = text.decode(encoding)
        # Normalize "a , b" to "a, b"
        return recomma.sub(', ', text.strip())

    table = []
    nb_col = 0
    # Transform the parsed table to a 2d list of dicts with metadata
    # (text, colspan, rowspan)
    tr_list = table_soup.find_all('tr')
    for tr in tr_list:
        td_th_list = tr.find_all(recell)
        table.append([
            {
                'colspan': _get_span(cell, 'colspan'),
                'rowspan': _get_span(cell, 'rowspan'),
                'text': _get_cell_val(cell),
            }
            for cell in td_th_list
        ])
        nb_col = max(nb_col, len(td_th_list))
    nb_line = len(tr_list)

    # Transform the 2d list of dicts to a 2d list of strings, column by
    # column, splitting rowspan and colspan. Positions already holding a
    # string have been processed (or filled in by a spanning cell).
    i_col = 0
    while i_col < nb_col:
        i_line = 0
        while i_line < nb_line:
            # Repair table
            while i_col >= len(table[i_line]):
                table[i_line].append('')
            cell = table[i_line][i_col]
            if not isinstance(cell, dict):
                i_line += 1
                continue
            for i_colspan in range(i_col, i_col + cell['colspan']):
                for i_rowspan in range(i_line, i_line + cell['rowspan']):
                    if i_colspan == i_col and i_rowspan == i_line:
                        # The cell's own position is set below
                        continue
                    nb_line, nb_col = _add_cell(
                        table,
                        i_rowspan,
                        i_colspan,
                        cell['text'],
                        nb_line, nb_col,
                    )
            # Update cell value to string
            table[i_line][i_col] = cell['text']
            i_line += 1
        i_col += 1
    return table
def process_numbers(table):
    """
    Apply ``process_number`` to every field of a table.

    :param table: rows of fields
    :type table: list[list[str]]
    :return: a new table of the same shape with the converted fields
    :rtype: list[list]
    """
    return [list(map(process_number, row)) for row in table]
def process_number(field):
    """
    Convert a numeric-looking string to a number.

    Whitespace inside the number is dropped (digit grouping) and a comma
    is read as the decimal mark.  Fields matching ``reint`` become ``int``,
    fields matching ``refloat`` become ``float``.

    :param field: text of one table cell
    :type field: str
    :return: the number, or ``field`` unchanged if it matches neither
        pattern or cannot be converted after all
    :rtype: int | float | str
    """
    if reint.match(field):
        convert = int
    elif refloat.match(field):
        convert = float
    else:
        return field
    # The patterns accept any whitespace (``\s``), e.g. non-breaking
    # spaces, so all of it is removed, not only the plain space character.
    compact = ''.join(field.split()).replace(',', '.')
    try:
        return convert(compact)
    except ValueError:
        # Matched the loose pattern without being a number,
        # e.g. ',' or '1,2,3': keep the text as it is.
        return field
def write_csv(table):
    """
    Write a table to the file ``outfile``, fields separated by ``sep``.

    Fields containing the separator, a quote character or a line break are
    quoted by the ``csv`` module, so they cannot corrupt the row layout.

    :param table: rows of fields; non-string fields are converted to text
    :type table: list[list]
    """
    # newline='' disables newline translation, so os.linesep is written
    # exactly once per row ('\r\n' would become '\r\r\n' otherwise on
    # Windows). The encoding is fixed so that accented characters do not
    # depend on the platform default.
    with open(outfile, 'w', encoding='utf-8', newline='') as fp:
        writer = csv.writer(fp, delimiter=sep, lineterminator=os.linesep)
        writer.writerows(table)
|
Comments (0)
You can clone a snippet to your computer for local editing. Learn more.