#!/usr/bin/env python3
'''
.       .1111...          | Title:
    .10000000000011.   .. | Author: Oliver Morton
 .00              000...  | Email:
1                  01..   | Description:
                    ..    |  Download left and found hashes from
                   ..     |  public leaks
GrimHacker        ..      |
                 ..       | Requires Python 3, requests, beautifulsoup4
                ..        |
@grimhacker    ..         |
    Copyright (C) 2017  Oliver Morton

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''

__version__ = "$Revision: 1.0$"

# $Source$

import requests
import json
import sys
import re
import os

from bs4 import BeautifulSoup
from pathlib import Path

def print_version():
    """Print command line version banner.

    Writes the ASCII-art banner, author details and the short GPLv2
    warranty notice to standard output. Returns None.
    """
    # The banner is a single literal so the art keeps its column alignment.
    print("""
.       .1111...          | Title:
    .10000000000011.   .. | Author: Oliver Morton
 .00              000...  | Email:
1                  01..   | Description:
                    ..    |  Download left and found hashes from
                   ..     |  public leaks
GrimHacker        ..      |
                 ..       | Requires Python 3, requests, beautifulsoup4
                ..        |
@grimhacker    ..         |
    This program comes with ABSOLUTELY NO WARRANTY.
    This is free software, and you are welcome to redistribute it
    under certain conditions. See GPLv2 License.
""")

def parse_public():
    """Scrape the public leaks table and return its rows keyed by leak id.

    Requests ``public.php`` under ``base_url``, parses the response with
    BeautifulSoup (html5lib parser), takes the first ``<table>`` and reads
    every row after the first two (presumably header rows - confirm against
    the page markup).

    Returns:
        dict: one entry per table row, keyed by the text of the row's "id"
        cell. Each value is a dict with the keys "id", "name",
        "last-update", "num-hashes", "progress", "left-link" and
        "found-link". The two link values are ``base_url`` followed by the
        ``href`` of the anchor in that cell; every other value is the
        cell's ``.string``.

    Raises:
        AttributeError: if the page contains no ``<table>``.
        IndexError: if a row has fewer cells than there are keys.
        TypeError: if a link cell contains no ``<a>`` element.
    """
    # NOTE(review): base_url is empty in this copy of the script, which leaves
    # public_url without a scheme; presumably the site root belongs here.
    base_url = ""
    public_url = "{}public.php".format(base_url)
    # Column order of the table; hoisted because it is the same for every row.
    keys = ("id", "name", "last-update", "num-hashes", "progress", "left-link", "found-link")
    r = requests.get(public_url)
    soup = BeautifulSoup(r.text, "html5lib")
    table = soup.find("table")
    rows = table.find_all("tr")[2:]
    leaks = {}
    for row in rows:
        cells = row.find_all("td")
        leak = {}
        for num, key in enumerate(keys):
            cell = cells[num]
            if "link" in key:
                # Link columns hold an anchor; store the absolute URL.
                relative_link = cell.find("a")["href"]
                data = "{}{}".format(base_url, relative_link)
            else:
                # Plain columns: store the cell text.
                data = cell.string
            leak[key] = data
        leaks[leak["id"]] = leak
    print("Found {} leaks".format(len(leaks)))
    return leaks

def download_file(url):
    """Download ``url`` into the current directory and return the filename.

    The local filename is taken from the ``filename="..."`` part of the
    response's Content-Disposition header. If a file of that name already
    exists it is first renamed to ``<name>.old`` (replacing any earlier
    backup). The body is streamed to disk in 1024-byte chunks.

    Args:
        url: address of the file to fetch.

    Returns:
        str: name of the file written, relative to the current directory.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        KeyError: if the response has no Content-Disposition header.
        IndexError: if the header carries no ``filename="..."`` value.
        ValueError: if the advertised filename has no usable final component.
    """
    print("Downloading '{}'".format(url))
    # stream=True fetches the body chunk by chunk instead of loading it all
    # into memory; the with-block releases the connection when done.
    with requests.get(url, stream=True) as r:
        # Do not save an error page as if it were the requested file.
        r.raise_for_status()
        d = r.headers['content-disposition']
        advertised_name = re.findall(r'filename="(.+)"', d)[0]
        # The header is server-controlled: keep only the final path component
        # so a value such as "../../x" cannot write outside this directory.
        local_filename = Path(advertised_name).name
        if local_filename in ("", ".", ".."):
            raise ValueError("Unusable filename in Content-Disposition: '{}'".format(advertised_name))
        if Path(local_filename).exists():
            print("Backing up previous version of '{}'.".format(local_filename))
            # os.replace overwrites an existing backup on every platform.
            os.replace(local_filename, "{}.old".format(local_filename))
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    print("Downloaded '{}' to '{}'".format(url, local_filename))
    return local_filename

def download_leaks(oldleaks, newleaks, link_keys):
    """Download every new or updated leak and merge old state into newleaks.

    A leak is skipped when ``oldleaks`` has an entry with the same id and the
    same "last-update" value. For every other leak, each URL found under the
    keys in ``link_keys`` is downloaded and the resulting filename is stored
    on the leak under the matching "...filename" key (the link key with
    "link" replaced by "filename").

    ``newleaks`` is modified in place:
      * downloaded leaks gain their "...filename" keys;
      * skipped leaks inherit any keys only present on the old entry (such
        as filenames recorded by an earlier run), so that information is not
        lost when the caller saves ``newleaks``;
      * leaks present only in ``oldleaks`` are copied across.

    Args:
        oldleaks: dict of leak id -> leak dict from the previous run.
        newleaks: dict of leak id -> leak dict from the current scrape.
        link_keys: iterable of leak keys whose values are URLs to download.

    Returns:
        dict: the subset of ``newleaks`` that was downloaded in this call.
    """
    fresh_downloads = {}
    for leak_id, newleak in newleaks.items():
        oldleak = oldleaks.get(leak_id)
        if oldleak is not None and oldleak.get("last-update") == newleak.get("last-update"):
            print("Skipping '{}' as no update.".format(newleak.get("name")))
            # Keep metadata recorded by earlier runs; freshly scraped values
            # already on newleak are left untouched.
            for key, value in oldleak.items():
                newleak.setdefault(key, value)
            continue  # skip because no update
        # download the new or updated leak
        for key in link_keys:
            filename = download_file(newleak[key])
            newleak[key.replace("link", "filename")] = filename
        fresh_downloads[leak_id] = newleak
    # make sure we keep any old leaks that are no longer on the site.
    for leak_id, oldleak in oldleaks.items():
        if leak_id not in newleaks:
            newleaks[leak_id] = oldleak
    return fresh_downloads

def load_json(name):
    """Read the file called ``name`` and return its parsed JSON content."""
    with open(name, "r") as handle:
        return json.load(handle)

def output_json(leaks, name):
    """Write ``leaks`` to the file ``name`` as sorted, 4-space indented JSON."""
    serialised = json.dumps(
        leaks,
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
    )
    with open(name, "w") as handle:
        handle.write(serialised)

if __name__ == "__main__":
    # Usage: script <0|1>
    # 1 = download left hashes as well as found. 0 = just found.
    try:
        get_left = bool(int(sys.argv[1]))
    except (IndexError, ValueError):
        # Missing or non-integer argument: explain instead of a traceback.
        sys.exit("Usage: {} <0|1>  (1 = download left hashes as well as found, 0 = just found)".format(sys.argv[0]))
    link_keys = ["found-link"]
    if get_left:
        print("getting left")
        # Without this the flag was only announced and never acted upon.
        link_keys.append("left-link")
    leakfile = "hashes.org_leaks.json"
    freshfile = "hashes.org_fresh_leaks.json"
    newleaks = parse_public()
    try:
        oldleaks = load_json(leakfile)
    except FileNotFoundError:
        # No state file yet, so every leak counts as new.
        print("Assuming first run")
        oldleaks = {}
    fresh = download_leaks(oldleaks, newleaks, link_keys)
    # newleaks now holds the merged state; fresh holds only this run's downloads.
    output_json(newleaks, leakfile)
    output_json(fresh, freshfile)

