Andrew working.py

Created by Andrew
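
# Builds shopgoodwill.com search URLs from the CSV data, scrapes the listing
# results, downloads each listing's image, and inserts one row per listing into
# the blog_post table of the thrift_db MySQL database.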
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

from scrape import UrlCreator, get_list_of_dicts_from_csv_file, CSV_FILE_PATH, CATEGORY_INDEX
#from proxy import do_req
import requests
import urllib.parse  # urllib.parse.quote is used below, so import the submodule explicitly
import logging as logger
import MySQLdb
import os
import time

timestr = time.strftime("%m-%d-%Y_%H:%M:%S")

header = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}


logger.basicConfig(filename='/Users/coffeeman/Documents/Projects/scrapev2/logs/list.log',
                   filemode='w', level=logger.INFO,
                   format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
                   datefmt="%Y-%m-%d %H:%M:%S")

number_of_pages = 1
data_from_csv = get_list_of_dicts_from_csv_file(CSV_FILE_PATH)
url_list = []  # each entry keeps the generated url together with the brand/category it was built from
for data in data_from_csv:
    url_creator = UrlCreator(data['brand'], data['code'], number_of_pages)
    url1 = url_creator.create_url()
    # Assumption: the CSV 'code' field doubles as the category label passed to
    # DataExtractor below; adjust if the CSV exposes a separate category column
    # (see CATEGORY_INDEX imported from scrape).
    url_list.append({'url': url1, 'brand': data['brand'], 'category': data['code']})
#urls = json.dumps(url_list, indent=4)


#print(urls)

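# PageExtractor fetches a search-results page and reads the total page count
# from its pagination block.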
class PageExtractor:
    def __init__(self, url):
        self.url = url

    def get_number_of_pages(self):
        # The do_req proxy helper was removed, so fetch the page directly.
        #response = do_req(self.url)
        response = requests.get(self.url, headers=header).text

        soup = BeautifulSoup(response, features='lxml')
        page = soup.find('ul', {'class': 'pagination'})
        try:
            # The "last" pagination link carries the total page count in its data-page attribute.
            number_of_pages = page.find('a', {'id': 'last'})['data-page']
        except (AttributeError, TypeError, KeyError):
            # No pagination block found: the results fit on a single page.
            number_of_pages = 1
        self.scrape()
        return number_of_pages

    def scrape(self):
        # Placeholder; the per-listing scraping lives in DataExtractor.
        pass

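# DataExtractor scrapes the individual listings (li.widget elements) from a
# results page, saves each listing's image locally, and writes one row per
# listing to the blog_post table.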
class DataExtractor:
    def __init__(self, url, brand, category):
        self.url = url
        self.brand = urllib.parse.quote(brand)
        self.category = category

    def extract(self):
        db = MySQLdb.connect(host='127.0.0.1', user='thriftuser', passwd='kYJbqzI2Bvp41tp', db='thrift_db')
        #db = MySQLdb.connect(host='10.0.0.171', user='thriftuser', passwd='kYJbqzI2Bvp41tp', db='thrift_db')
        #db = MySQLdb.connect(host='ub16-03', user='scripter', passwd='o3c3bJRom8', db='thrift_db')
        cur = db.cursor()
        alljobs = []
        logger.info("Print self.url: %s " % (self.url))
        #response = self.url
        response = requests.get(self.url, headers=header).text
        #print(f"Self.url: {self.url}")
        logger.info("Print response: %s " % (response))
        soup = BeautifulSoup(response, features='lxml')
        logger.info("Print soup: %s " % (soup))
        #print(soup)
        jobs = soup.find_all('li', class_='widget')
        print(f"Jobs: {jobs}")
        logger.info("Print jobs: %s " % (jobs))
        #print(jobs)
        for job in jobs:
            try:
                designer = self.brand
                searchURL = 'http://nullrefer.com/?' + self.url
                author_id = '1'
                title = job.find_all('div', attrs={'class': 'title'})[0].text.strip().splitlines()[0]
                link = job.find_all('a', attrs={'class': 'product'})[0]['href']
                productUrl = 'http://blankrefer.com/?https://www.shopgoodwill.com' + link
                imageUrl = job.find_all('img', attrs={'class': 'lazy-load'})[0]['src']
                # keep the original [3:10] slice of the price text (now on str rather than bytes)
                price = job.find_all('div', attrs={'class': 'price'})[0].text[3:10]
                countdown = job.find_all(attrs={'class': 'product-countdown'})[0].get('data-countdown')
                script = self.category
                print(F"script = {script}")
                published_date = time.strftime("%Y-%m-%d %H:%M:%S.000000")
                icon = job.find_all('img', attrs={'class': 'lazy-load'})[0]['data-src']
                savedir = time.strftime('%m%d%Y_%H')
                scriptName = self.category
                imagesFolder = '/mnt/images/'
                # shortpath is the key prefix reused for the S3 URL below;
                # savepath is the local directory the images are written to
                shortpath = scriptName + '_' + savedir
                savepath = os.path.join(imagesFolder, scriptName + '_' + savedir)
                os.makedirs(savepath, exist_ok=True)

                try:
                    if '/' in imageUrl:
                        imageName = imageUrl.rsplit('/', 1)[1]
                        imagePath = os.path.join(savepath, imageName)
                        with open(imagePath, 'wb') as f:
                            # separate name so the listing-page response above is not clobbered
                            img_response = requests.get(imageUrl, headers=header)
                            f.write(img_response.content)
                except Exception as e:
                    print(e)
                awsimageUrl = 'https://s3-us-west-2.amazonaws.com/imagekickbucket/' + shortpath + '/' + imageName
            except Exception as e:
                print(e)
                logger.info("Post Jobs Try error: %s " % (e))
                # Skip this listing so currentJob is not built from missing fields.
                continue
                
            currentJob = {
                'designer': designer,
                'searchURL': searchURL,
                'author_id': author_id,
                'title': title,
                'productUrl': productUrl,
                'awsimageUrl': awsimageUrl,
                'imagePath': imagePath,
                'price': price,
                'countdown': countdown,
                'script': script,
                'savepath': savepath,
                'published_date': published_date,
                'icon': icon,
            }
            alljobs.append(currentJob)
            logger.info("currentJob: %s " % (currentJob))
            try:
                cur.execute(
                    "INSERT INTO `blog_post` (designer, searchURL, author_id, title, productUrl, awsimageUrl, "
                    "imagePath, price, countdown, script, savepath, published_date, icon)"
                    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                    (designer, searchURL, author_id, title, productUrl, awsimageUrl, imagePath, price, countdown,
                     script, savepath, published_date, icon))
                db.commit()
            except MySQLdb.Error as e:
                logger.error(e)
                logger.info("MySQLdb Error: %s " % (e))
                print(e)
        logger.info("womens Ended:: %s " % (timestr))

        db.close()
        return alljobs

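# Driver: for every URL built from the CSV, report the page count and extract the listings.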
if __name__ == "__main__":
    for entry in url_list:
        page_extractor = PageExtractor(entry['url'])
        number_of_pages = page_extractor.get_number_of_pages()
        #print(f"for url: {entry['url']} no_pages: {number_of_pages}")
        results = DataExtractor(entry['url'], entry['brand'], entry['category']).extract()


# if __name__ == "__main__":
#   data_from_csv = get_list_of_dicts_from_csv_file(CSV_FILE_PATH)
#   urls = [] #here we will store the urls created
#   for data in data_from_csv:
#     url_creator = UrlCreator(data['brand'], data['code'])
#     url = url_creator.create_url()
#     urls.append(url)
#   x = json.dumps(urls, indent=4)
#   print(type(x))
#   print(json.dumps(urls, indent=4))
