"""Build listing-search URLs from a CSV of (brand, code, category) rows."""

import csv
import json
import time
import urllib.parse

# Today's date formatted like 6/3/2023 (no leading zeros) for the
# listing-end-date filter in the URL.  NOTE(review): '%-d' is a glibc
# extension — works on Linux/macOS, fails on Windows strftime.
inlinetime = time.strftime('%m/%-d/%Y').lstrip("0").replace(" 0", " ")

CSV_FILE_PATH = './list/list.csv'
# Column indexes — use these instead of bare literals like row[0] so the
# meaning of each position is explicit.
BRAND_INDEX = 0
CODE_INDEX = 1
CATEGORY_INDEX = 2


def get_list_of_dicts_from_csv_file(csv_file_path):
    """Read *csv_file_path* and return its data rows as dicts keyed by header.

    Bug fix: the old code called next(csv_reader) "to skip the header",
    but csv.DictReader already consumes the header row itself — the extra
    next() was silently discarding the FIRST DATA ROW.
    """
    # newline='' is the documented way to open files for the csv module.
    with open(csv_file_path, newline='') as csv_file:
        return list(csv.DictReader(csv_file, delimiter=','))


class UrlCreator:
    """Build one listings-search URL for a brand / category-code pair."""

    def __init__(self, brand, code):
        # Percent-encode the brand so spaces and punctuation are URL-safe.
        self.brand = urllib.parse.quote(brand)
        self.code = code

    def create_url(self):
        """Return the full search URL for this brand and code."""
        return (
            f"https://www.site.com.com/Listings?st={self.brand}&sg=Ending&c={self.code}"
            f"&s=&lp=0&hp=999999&sbn=false&spo=false&snpo=false&socs=false&sd=false"
            f"&sca=false&caed={inlinetime}%2012:00:00%20AM&cadb=7&scs=False&sis=False"
            f"&col=0&p=1&ps=40&desc=False&ss=0&Us"
        )


def url_encoded():
    """Return a JSON array (as a string) with one search URL per CSV row."""
    rows = get_list_of_dicts_from_csv_file(CSV_FILE_PATH)
    urls = [UrlCreator(row['brand'], row['code']).create_url() for row in rows]
    return json.dumps(urls, indent=4)


if __name__ == "__main__":
    # Reuse url_encoded() instead of duplicating its body here, as before.
    print(url_encoded())
"""Expand each search URL from main.url_encoded() into one URL per result page.

The original version of this module was self-described pseudo-code: it
iterated the builtin ``list``, and referenced the undefined names ``url``,
``proxies``, ``header``, ``link`` and ``brands``, so it raised NameError on
first use.  This rewrite keeps the intent (fetch each search URL, read the
pagination widget, build per-page URLs) in working form.
"""

from bs4 import BeautifulSoup
import requests
import logging as logger
# import MySQLdb
import sys
import os
import csv

from .main import url_encoded
import json


def pagenum(proxies=None, headers=None):
    """Fetch every search URL and build one URL per result page.

    proxies / headers: optional dicts passed straight through to
    ``requests.get`` (both default to None, i.e. no proxy, default headers).

    Returns (url_list, pages) where ``url_list`` holds one URL per result
    page across all searches and ``pages`` is the page count of the LAST
    search processed (kept for parity with the old debug prints).
    """
    # url_encoded() returns a JSON-encoded list of strings — decode it.
    urls = json.loads(url_encoded())
    url_list = []
    pages = 1
    for url in urls:
        html = requests.get(url, proxies=proxies, headers=headers).text
        soup = BeautifulSoup(html, features='lxml')
        pagination = soup.find('ul', {'class': 'pagination'})
        try:
            # The "last" link's data-page attribute carries the page count.
            pages = int(pagination.find('a', {'id': 'last'})['data-page'])
            print('(try)Pages = ' + str(pages))
        except (AttributeError, TypeError, KeyError, ValueError):
            # No pagination widget on the page -> a single page of results.
            pages = 1
            print('(except) statement Pages = ' + str(pages))
        for page_number in range(1, pages + 1):
            # Every URL from main.UrlCreator carries '&p=1&'; swap in the
            # actual page number to address each page directly.
            url_list.append(url.replace('&p=1&', f'&p={page_number}&'))
    print(url_list)
    return url_list, pages
Comments (0)
HTTPS | SSH
You can clone a snippet to your computer for local editing.
Learn more.