1. Mauro Baraldi
  2. Scraping


Scraping / batchpcb.py

#!/usr/bin/env python
# Crawls the item list from http://www.batchpcb.com
# The site has no search feature!! =/

import os
import requests
from BeautifulSoup import BeautifulSoup

base_dir = os.path.dirname(__file__)
results = os.path.join(base_dir, 'boards.csv')

base_url = 'https://www.batchpcb.com/?pcb_page=%i'
item_url = 'https://www.batchpcb.com/pcbs/%i'

html = lambda url, page: BeautifulSoup(requests.get(url % page, verify=False).content)

pages = int(html(base_url, 1).find('div', {'class':'pagination'}).findAll('li')[-2].text)

with open(results,'wb') as fp:
    for page in range(1,pages+1):
        print 'Page %s' % page
        items = html(base_url, page).findAll('div',{'class':'span3 design boxshadow'})
        print '  id, model, layers, price, width, height, area'
        for item in items:
            _id_ = int(item.a['href'].replace('/pcbs/',''))
            _item_ = html(item_url, _id_)
            info = _item_.find('div', {'id':'info'}).find('table').findAll('tr')
            model = _item_.find('h2', {'class':'bread'}).text
            layers = info[4].text.replace('Layers:','')
            price = info[5].text.replace('Price:','')
            width = info[6].text.replace('Width:','')
            height = info[7].text.replace('Height:','')
            area = info[8].text.replace('Area:','')
            print '  %i, %s, %s, %s, %s, %s, %s' % (_id_, model, layers, price, width, height, area)
            fp.write('%i, %s, %s, %s, %s, %s, %s/n' % (_id_, model, layers, price, width, height, area))