# Source
#
# Scripts / Utility / ScreenScaperPagineGialleV2.py

"""
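Version 2.2
Michele Gatti    YellowPage ScreenScraper    10-12-2010
Given a Pagine Gialle (Italian Yellow Pages) search URL, writes a text file
containing every contact found, with the fields of each contact delimited
by "|".

Usage: ScreenScaperPagineGialleV2.py <search-url> <output-file>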
"""

import sys
from urllib import urlopen
from BeautifulSoup import BeautifulSoup

class Company(object):
    """
    Contact record for one company.

    Every attribute starts as None and is meant to be filled in by the
    caller; all of them are expected to be strings once set.
    """

    def __init__(self):
        # Identification
        self.companyname = None
        # Postal address parts
        self.address = None
        self.postalcode = None
        self.locality = None
        self.region = None
        # Contact channels
        self.phonenumber = None
        self.faxnumber = None
        self.website = None

    def get_linedelimited(self, chrdelimiter):
        """
        Return the company as one delimited line of text.

        The fields are emitted in the fixed order name, address, postal
        code, locality, region, phone, fax, website.  Each emitted field is
        followed by ``chrdelimiter`` and the line ends with a newline.

        NOTE: fields that are None or empty are left out entirely (no empty
        placeholder is written), so the number of columns varies from one
        line to the next.
        """
        ordered_values = (
            self.companyname,
            self.address,
            self.postalcode,
            self.locality,
            self.region,
            self.phonenumber,
            self.faxnumber,
            self.website,
        )
        pieces = [value + chrdelimiter for value in ordered_values if value]
        return "".join(pieces) + "\n"


class YellowAd(object):
    """
    Yellow Page Advertisement

    Scrapes a paginated Yellow Pages search: starting from the search URL it
    builds the URL of every result page and turns each listing ``div`` found
    on those pages into a ``Company``.
    """

    def __init__(self, linksearch):
        """
        Store the search URL and the attribute filters used to locate
        listings.

        linksearch -- URL of the search; page URLs are built by appending
                      "p-<n>?" to it (presumably it must end with "/" --
                      TODO confirm).
        """
        self._linksearch = linksearch
        # Attribute filters for the two kinds of listing ``div``.  The names
        # suggest advertiser ("inserzionista") vs. plain listings.  The
        # double space in the second value is kept exactly as found; it
        # presumably mirrors the site's markup, so do not normalise it.
        self._clientad = {'class': "listing-client-line-pg clearfix inserzionista"}
        self._clientnoad = {'class': "listing-client-line-pg  clearfix"}

    @staticmethod
    def _fetch_page(url):
        """
        Download ``url`` and return it parsed, with HTML entities converted.

        The HTTP response is always closed, even when reading or parsing
        fails.
        """
        response = urlopen(url)
        try:
            return BeautifulSoup(response.read(), convertEntities="html")
        finally:
            response.close()

    @staticmethod
    def _first_text(node, tag, css_class):
        """
        Return the text of the first ``tag`` with class ``css_class`` inside
        ``node``, or None when there is no such element.
        """
        found = node.find(tag, attrs={'class': css_class})
        if found:
            return found.text
        return None

    def _get_yellowadpages(self):
        """
        Return the list of page URLs of the search.

        The number of pages is read from the ``p.pagination-total`` element
        of the first page.  If that element is missing the lookup fails with
        an AttributeError (nothing here handles a search without
        pagination).
        """
        search = self._fetch_page(self._linksearch)
        pagination = search.find('p', attrs={'class': 'pagination-total'})
        # The fourth child of the pagination paragraph is assumed to hold
        # the total page count -- TODO confirm against the site's markup.
        max_page = int(pagination.contents[3].text)

        return ["%sp-%i?" % (self._linksearch, number)
                for number in range(1, max_page + 1)]

    def _build_company(self, ad):
        """
        Build a ``Company`` from one listing ``div``.

        Fields that cannot be found in the listing stay None, except the
        website which is set to "" in that case.
        """
        company = Company()

        # The highlighted ("org orange") heading wins over the plain one.
        name = self._first_text(ad, 'h3', 'org orange')
        if name is None:
            name = self._first_text(ad, 'h3', 'org')
        if name is not None:
            company.companyname = name.upper()

        company.address = self._first_text(ad, 'p', 'street-address')
        company.postalcode = self._first_text(ad, 'span', 'postal-code')
        company.locality = self._first_text(ad, 'span', 'locality')
        company.region = self._first_text(ad, 'span', 'region')

        # The first "tel" paragraph is taken as the phone number and the
        # second one, when present, as the fax number.
        numbers = [tel.text for tel in ad.findAll('p', attrs={'class': 'tel'})]
        if numbers and numbers[0]:
            company.phonenumber = numbers[0]
        if len(numbers) > 1 and numbers[1]:
            company.faxnumber = numbers[1]

        # When several "www" links exist the last one is kept.
        website = ""
        for link in ad.findAll('a', attrs={'title': 'www'}):
            website = link['href']
        company.website = website

        return company

    def get_companys(self):
        """
        Return the list of companies of the search.

        Each result page is downloaded once.  Within a page the listings
        matching ``_clientad`` come first, followed by those matching
        ``_clientnoad``; pages are processed in order.
        """
        companies = []

        for page_url in self._get_yellowadpages():
            page = self._fetch_page(page_url)
            for client_type in (self._clientad, self._clientnoad):
                for ad in page.findAll('div', attrs=client_type):
                    companies.append(self._build_company(ad))

        return companies

def main(filename, httpage):
    """
    Scrape a search and write the companies found to a text file.

    filename -- path of the output file; it is opened in 'w' mode before
                scraping starts, so an existing file is truncated.
    httpage  -- URL of the Yellow Pages search to scrape.

    Each company is written as one "|"-delimited line, UTF-8 encoded.
    The file is closed even if scraping or writing raises.
    """

    with open(filename, 'w') as _filewrite:
        for company in YellowAd(httpage).get_companys():
            # NOTE(review): writing encoded bytes to a file opened in 'w'
            # mode relies on Python 2 file semantics -- confirm before
            # porting to Python 3.
            _filewrite.write(company.get_linedelimited('|').encode('utf-8'))

if __name__ == "__main__":
    # Two positional arguments are needed: the search URL (argv[1]) and the
    # output file name (argv[2]); that makes three entries with the script
    # name itself.
    if len(sys.argv) >= 3:
        main(sys.argv[2], sys.argv[1])
    else:
        sys.exit("usage: %s <search-url> <output-file>" % sys.argv[0])
# Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
# Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
# Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
# Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
# Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
# Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
# Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.