+"""
+. .1111... | Title: download_hashes.org.py
+ .10000000000011. .. | Author: Oliver Morton
+ .00 000... | Email: grimhacker@grimhacker.com
+ .. | Download left and found hashes from hashes.org
+ .. | Requires Python 3, requests, beautifulsoup4
+----------------------------------------------------------------------------
+Copyright (C) 2017 Oliver Morton
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+"""
+__version__ = "$Revision: 1.0$"
+import json
+import os
+import re
+import sys
+
+import requests
+from bs4 import BeautifulSoup
+from pathlib import Path
+ """Print command line version banner."""
+. .1111... | Title: download_hashes.org.py
+ .10000000000011. .. | Author: Oliver Morton
+ .00 000... | Email: grimhacker@grimhacker.com
+ .. | Download left and found hashes from hashes.org
+ .. | Requires Python 3, requests, beautifulsoup4
+----------------------------------------------------------------------------
+ This program comes with ABSOLUTELY NO WARRANTY.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions. See GPLv2 License.
+----------------------------------------------------------------------------
+""".format(__version__))
+
+
+def parse_public():
+    """Scrape the hashes.org public leaks page into a dict keyed by leak id."""
+    leaks = {}
+    base_url = "https://hashes.org/"
+    public_url = "{}public.php".format(base_url)
+    r = requests.get(public_url)
+    html = r.text
+    soup = BeautifulSoup(html, "html5lib")
+    table = soup.find("table")
+    rows = table.find_all("tr")[2:]  # skip the table header rows
+    for row in rows:
+        cells = row.find_all("td")
+        keys = ["id", "name", "last-update", "num-hashes", "progress", "left-link", "found-link"]
+        leak = {}
+        for num, key in enumerate(keys):
+            cell = cells[num]
+            if key.endswith("-link"):
+                relative_link = cell.find("a")["href"]
+                data = "{}{}".format(base_url, relative_link)
+            else:
+                data = cell.get_text(strip=True)
+            leak[key] = data
+        leaks[leak["id"]] = leak
+    print("Found {} leaks".format(len(leaks)))
+    return leaks
+ print("Downloading '{}'".format(url))
+ # NOTE the stream=True parameter
+ r = requests.get(url, stream=True)
+ d = r.headers['content-disposition']
+ local_filename = re.findall('filename="(.+)"', d)[0]
+ path_filename = Path(local_filename)
+ if path_filename.exists():
+ print("Backing up previous version of '{}'.".format(local_filename))
+ os.rename(local_filename, "{}.old".format(local_filename))
+ with open(local_filename, 'wb') as f:
+ for chunk in r.iter_content(chunk_size=1024):
+ if chunk: # filter out keep-alive new chunks
+ print("Downloaded '{}' to '{}'".format(url, local_filename))
+
+
+def download_leaks(oldleaks, newleaks, link_keys):
+    """Download new or updated leaks and return just the fresh downloads."""
+    fresh_downloads = {}
+    for leak in newleaks:
+        newleak = newleaks.get(leak)
+        oldleak = oldleaks.get(leak)
+        if oldleak is not None:
+            if oldleak.get("last-update") == newleak.get("last-update"):
+                print("Skipping '{}' as no update.".format(newleak.get("name")))
+                continue  # skip because no update
+        # download the new or updated leak
+        for key in link_keys:
+            url = newleak.get(key)
+            filename = download_file(url)
+            filename_key = key.replace("link", "filename")
+            newleak[filename_key] = filename
+        fresh_downloads[leak] = newleaks[leak]
+    # make sure we keep any old leaks that are no longer on the site.
+    for leak in oldleaks:
+        if leak not in newleaks:
+            newleaks[leak] = oldleaks[leak]
+    return fresh_downloads
+
+
+def load_json(name):
+    """Load previously saved leak metadata from a JSON file."""
+    with open(name, "r") as f:
+        return json.load(f)
+
+
+def output_json(leaks, name):
+    """Write leak metadata to a JSON file."""
+    with open(name, "w") as f:
+        json.dump(leaks, f, sort_keys=True, indent=4, separators=(',', ': '))
+
+
+if __name__ == "__main__":
+    print_version()
+    get_left = bool(int(sys.argv[1]))  # 1 = download left hashes as well as found. 0 = just found.
+    link_keys = ["found-link"]
+    if get_left:
+        link_keys.append("left-link")
+    leakfile = "hashes.org_leaks.json"
+    freshfile = "hashes.org_fresh_leaks.json"
+    newleaks = parse_public()
+    try:
+        oldleaks = load_json(leakfile)
+    except FileNotFoundError as e:
+        print("Assuming first run")
+        oldleaks = {}
+    fresh = download_leaks(oldleaks, newleaks, link_keys)
+    output_json(newleaks, leakfile)
+    output_json(fresh, freshfile)
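+
+# Example invocations (a usage sketch based on the sys.argv[1] flag above;
+# the JSON state files are written to the current working directory):
+#   python3 download_hashes.org.py 1    # download "found" and "left" lists
+#   python3 download_hashes.org.py 0    # download only the "found" lists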