Mikele Gatti avatar Mikele Gatti committed dcb12a4

Add scraping yellow page

Comments (0)

Files changed (1)

Utility/ScreenScaperPagineGialleV2.py

+"""
+Versione 2.2
+Michele Gatti    YellowPage ScreenScaper    10-12-2010
+Dato un indirizzo di pagine gialle restituisce un file di testo con tutti
+contatti delimitati da "|".
+"""
+
+import sys
+from urllib import urlopen
+from BeautifulSoup import BeautifulSoup
+
+class Company(object):
+    """
+    Company Data
+    """
+
+    def __init__(self):
+        self.companyname = None
+        self.address = None
+        self.postalcode = None
+        self.locality = None
+        self.region = None
+        self.phonenumber = None
+        self.faxnumber = None
+        self.website = None
+
+    def get_linedelimited(self, chrdelimiter):
+        """
+        Return a string delimeted of a company
+        """
+        _stringexp = ""
+
+        if self.companyname:
+            _stringexp += self.companyname + chrdelimiter
+
+        if self.address:
+            _stringexp += self.address + chrdelimiter
+
+        if self.postalcode:
+            _stringexp += self.postalcode + chrdelimiter
+
+        if self.locality:
+            _stringexp += self.locality + chrdelimiter
+
+        if self.region:
+            _stringexp += self.region + chrdelimiter
+
+        if self.phonenumber:
+            _stringexp += self.phonenumber + chrdelimiter
+
+        if self.faxnumber:
+            _stringexp += self.faxnumber + chrdelimiter
+
+        if self.website:
+            _stringexp += self.website + chrdelimiter
+
+        return _stringexp + "\n"
+
+
+class YellowAd(object):
+    """
+    Yellow Page Advertisement
+    """
+
+    def __init__(self, linksearch):
+        self._linksearch = linksearch
+        self._clientad = {'class': "listing-client-line-pg clearfix inserzionista"}
+        self._clientnoad = {'class': "listing-client-line-pg  clearfix"}
+
+    def _get_yellowadpages(self):
+        """
+        Return the list of page of a search
+        """
+
+        _pages = []
+
+        _search = BeautifulSoup(urlopen(self._linksearch).read(),
+                                 convertEntities="html")
+        _maxpage = int(_search.find('p', attrs={'class': 'pagination-total'})
+                      .contents[3].text)
+
+        for i in range(0, _maxpage, 1):
+            _pages.append("%sp-%i?" % (self._linksearch, i + 1))
+
+        return _pages
+
+    def get_companys(self):
+        """
+        Return the list of company of a search
+        """
+
+        def get_companyname(page):
+            """
+            Return the companyname
+            """
+            if page.find('h3', attrs={'class': 'org orange'}):
+                return page.find('h3',
+                                 attrs={'class': 'org orange'}).text.upper()
+            if page.find('h3', attrs={'class': 'org'}):
+                return page.find('h3', attrs={'class': 'org'}).text.upper()
+
+        def get_streetaddress(page):
+            """
+            Return street address
+            """
+            if page.find('p', attrs={'class': 'street-address'}):
+                return page.find('p', attrs={'class': 'street-address'}).text
+
+        def get_postalcode(page):
+            """
+            Return postalcode
+            """
+            if page.find('span', attrs={'class': 'postal-code'}):
+                return page.find('span', attrs={'class': 'postal-code'}).text
+
+        def get_locality(page):
+            """
+            Return italian zipcode
+            """
+            if page.find('span', attrs={'class': 'locality'}):
+                return page.find('span', attrs={'class': 'locality'}).text
+
+        def get_region(page):
+            """
+            Return region
+            """
+            if page.find('span', attrs={'class': 'region'}):
+                return page.find('span', attrs={'class': 'region'}).text
+
+        def get_phonenumber(page):
+            """
+            Return list of phone number and fax number
+            """
+            _phonefax = []
+
+            if page.findAll('p', attrs={'class': 'tel'}):
+                phonenumbers = page.findAll('p', attrs={'class': 'tel'})
+
+                for phone in phonenumbers:
+                    _phonefax.append(phone.text)
+
+            return _phonefax
+
+        def get_website(page):
+            """
+            Return the web site of company
+            """
+            _www = ""
+
+            if page.findAll('a', attrs={'title': 'www'}):
+                for link in page.findAll('a', attrs={'title': 'www'}):
+                    _www = link['href']
+
+            return _www
+
+        def get_listad(listad, page, clientype):
+            """
+            Return the list of ad in yellowpage
+            """
+            _htmlpage = BeautifulSoup(urlopen(page).read(),
+                                      convertEntities="html")
+
+            if _htmlpage.findAll('div', attrs=clientype):
+                for _ad in _htmlpage.findAll('div', attrs=clientype):
+                    _company = Company()
+                    _company.companyname = get_companyname(_ad)
+                    _company.address = get_streetaddress(_ad)
+                    _company.postalcode = get_postalcode(_ad)
+                    _company.locality = get_locality(_ad)
+                    _company.region = get_region(_ad)
+
+                    if len(get_phonenumber(_ad)) > 0:
+                        if get_phonenumber(_ad)[0]:
+                            _company.phonenumber = get_phonenumber(_ad)[0]
+
+                        if len(get_phonenumber(_ad)) > 1:
+                            if get_phonenumber(_ad)[1]:
+                                _company.faxnumber = get_phonenumber(_ad)[1]
+
+                    _company.website = get_website(_ad)
+                    listad.append(_company)
+
+        _listcompany = []
+
+        for _page in self._get_yellowadpages():
+            get_listad(_listcompany, _page, self._clientad)
+            get_listad(_listcompany, _page, self._clientnoad)
+
+        return _listcompany
+
+def main(filename, httpage):
+    """
+    Main function to execute programm 
+    """
+
+    _filewrite = open(filename, 'w')
+    _companys = YellowAd(httpage).get_companys()
+
+    for company in _companys:
+        _filewrite.write(company.get_linedelimited('|').encode('utf-8'))
+
+    _filewrite.close()
+
+if __name__ == "__main__":
+    if len(sys.argv) >= 2:
+        main(sys.argv[2], sys.argv[1])
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.