Created by
Hiroaki Nakamura
| #!/usr/bin/env python
from datetime import datetime
from html.parser import HTMLParser
import csv
import itertools
import sys
def format_unixtime(t):
    """Format a Unix timestamp as a ``YYYY-MM-DDTHH:MM:SS`` string.

    The conversion uses ``datetime.fromtimestamp`` without a timezone, so
    the result is expressed in the machine's local time.

    Args:
        t: Seconds since the epoch, as an int or anything ``int()`` accepts
            (e.g. the string value of an HTML attribute). ``None`` or an
            empty string means "no timestamp".

    Returns:
        The formatted timestamp, or ``''`` when ``t`` is ``None`` or empty.

    Raises:
        ValueError: If ``t`` is a non-empty string that is not an integer.
    """
    # Callers pass ``dict.get(...)`` results, which are None when the
    # attribute is absent; int(None) would raise TypeError and abort the run.
    if t is None or t == '':
        return ''
    return datetime.fromtimestamp(int(t)).strftime('%Y-%m-%dT%H:%M:%S')
class MyHTMLParser(HTMLParser):
    """HTML parser that writes one CSV row per ``<a>`` element with text.

    Each row has the columns: ``href``, link text, ``private``,
    ``add_date``, ``last_visit``, ``tags``. The two date attributes are
    passed through ``format_unixtime`` before being written.

    Args:
        csv_file: Any object with a ``write`` method; rows are written to
            it through ``csv.writer``.
    """

    def __init__(self, csv_file):
        super().__init__()
        self.in_a = False                  # True while inside an <a> element
        self.attrs = {}                    # attributes of the current <a>
        self.csv_writer = csv.writer(csv_file)
        self._text_parts = []              # text pieces seen in the current <a>

    def handle_starttag(self, tag, attrs):
        """Start collecting a new anchor when an ``<a>`` tag opens."""
        if tag != 'a':
            return
        if self.in_a:
            # A new <a> opened before the previous one was closed; write
            # what was collected so far rather than losing it.
            self._emit_row()
        self.in_a = True
        # Replace (not update) the attribute dict so values from an earlier
        # anchor cannot leak into one that omits them.
        self.attrs = dict(attrs)
        self._text_parts = []

    def handle_endtag(self, tag):
        """Write the collected anchor when its ``</a>`` tag is reached."""
        if tag == 'a':
            if self.in_a:
                self._emit_row()
            self.in_a = False

    def handle_data(self, data):
        """Buffer text that appears inside the current ``<a>`` element."""
        # The parser may deliver one anchor's text in several calls (for
        # example when input is fed in pieces), so accumulate here and write
        # a single row once the anchor ends.
        if self.in_a:
            self._text_parts.append(data)

    def _emit_row(self):
        """Write the current anchor as a CSV row, if any text was seen."""
        if not self._text_parts:
            # Anchors without any text produce no row.
            return
        link_text = ''.join(self._text_parts)
        self._text_parts = []
        lookup = self.attrs.get
        self.csv_writer.writerow([
            lookup('href'),
            link_text,
            lookup('private'),
            format_unixtime(lookup('add_date')),
            format_unixtime(lookup('last_visit')),
            lookup('tags'),
        ])
# Read bookmark HTML from stdin and write the extracted rows to stdout as CSV.
parser = MyHTMLParser(sys.stdout)
for chunk in sys.stdin:
    parser.feed(chunk)
# HTMLParser may hold back trailing text until it is told the input is
# complete; close() forces that buffered data to be processed.
parser.close()
|