Snippets

Dennis Hedegaard poegetter.py

Created by Dennis Hedegaard
#!/usr/local/bin/python
import os
import requests
import sqlite3
from contextlib import closing

from bs4 import BeautifulSoup as BS4


# Address of one forum listing page; filled in with %-formatting using the
# 'forumid' and 'page' keys.
URLTEMPLATE = (r'http://www.pathofexile.com/forum/view-forum'
               r'/%(forumid)s/page/%(page)s')
# Forum name -> numeric forum id substituted into URLTEMPLATE.
FORUMS = {
    'duelist': 40,
    'marauder': 23,
    'ranger': 24,
    'scion': 436,
    'shadow': 303,
    'templar': 41,
    'witch': 22,
}
# SQLite file stored next to this script, named after it with a '.db'
# extension.  The script path is made absolute *before* the extension is
# swapped: joining dirname(__file__) with a relative __file__ that contains a
# directory part would repeat that directory ('tools/tools/poegetter.db').
DBFILE = os.path.splitext(os.path.abspath(__file__))[0] + '.db'

# Every run starts from an empty database: remove the file left behind by a
# previous run before connecting.
if os.path.exists(DBFILE):
    os.remove(DBFILE)

dbi = sqlite3.connect(DBFILE)

with closing(dbi.cursor()) as cursor:
    # One row per forum thread; the thread URL is the primary key.
    cursor.execute('''
create table if not exists threads(
    forum text not null,
    title text not null,
    views integer not null,
    replies integer not null,
    page integer not null,
    url text primary key
);
    ''')
    # A single-column index for each non-key column, created in column order.
    for column in ('forum', 'title', 'views', 'replies', 'page'):
        cursor.execute(
            'create index if not exists threads_idx_%s on threads(%s);'
            % (column, column))
    # Open the transaction that the inserts later in the script run inside.
    cursor.execute('begin;')

# Crawl every forum in FORUMS page by page and store one row per listed
# thread.  A forum is finished at the first page that contains 'No Threads',
# has no listing table, or yields no thread rows.
for forum, forumid in FORUMS.items():
    pageno = 1
    while True:
        # Kept under its own name so the page address is never overwritten by
        # a thread address while rows are being parsed.
        page_url = URLTEMPLATE % {'page': pageno, 'forumid': forumid}
        # A timeout keeps one stalled connection from hanging the whole run.
        body = requests.get(page_url, timeout=30).text

        if 'No Threads' in body:
            break

        # Name the parser explicitly so the parse does not depend on which
        # optional parser libraries happen to be installed.
        soup = BS4(body, 'html.parser')
        table = soup.find('table', id='view_forum_table')
        if table is None:
            print('FORUM %s: PAGE %s: no thread table found, stopping'
                  % (forum, pageno))
            break

        records = []
        for row in table.find_all('tr'):
            thread = row.find('td', class_='thread')
            if thread is None:
                continue  # row without a thread cell

            title_cell = thread.find(class_='title')
            link = title_cell.find('a') if title_cell is not None else None
            views_cell = row.find('td', class_='views')
            replies_cell = row.find('td', class_='replies')
            if link is None or views_cell is None or replies_cell is None:
                continue  # incomplete row; nothing reliable to store

            thread_url = link.get('href')
            if not thread_url:
                continue
            if thread_url.startswith('/'):
                # Site-relative link: prefix the host to make it absolute.
                thread_url = 'http://www.pathofexile.com%s' % thread_url

            records.append((
                forum,
                link.text,
                int(views_cell.text),
                int(replies_cell.text),
                pageno,
                thread_url,
            ))

        if not records:
            # A page without any thread rows would otherwise be requested
            # forever if the 'No Threads' marker is absent.
            break

        with closing(dbi.cursor()) as csr:
            # 'or ignore': the same thread URL can show up more than once
            # (for example when a thread moves between pages during the
            # crawl); a duplicate must not violate the primary key and abort
            # the run before anything is committed.
            csr.executemany(
                'insert or ignore into threads values (?, ?, ?, ?, ?, ?)',
                records)
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print('FORUM %s: PAGE %s' % (forum, pageno))
        pageno += 1

# Persist everything inserted during the crawl, then release the database.
# Connection.commit() is used instead of executing a literal 'commit;'
# statement so the sqlite3 module manages the transaction state itself.
# Only sqlite3 errors are caught, and they are reported: a failed save is
# visible to the user, and KeyboardInterrupt/SystemExit still propagate.
try:
    dbi.commit()
except sqlite3.Error as exc:
    print('COMMIT FAILED: %s' % exc)
finally:
    # Always close the connection, even when the commit failed.
    try:
        dbi.close()
    except sqlite3.Error as exc:
        print('CLOSE FAILED: %s' % exc)

Comments (0)

HTTPS SSH

You can clone a snippet to your computer for local editing. Learn more.