# ScholarScrap / recipe-523047-1.py
# Google Scholar screen-scraping search (ActiveState recipe 523047)
# Author: Flavio Codeco Coelho (commit 801cf9d)











































































































































import httplib
import urllib
from BeautifulSoup import BeautifulSoup
import re

class GoogleScholarSearch:
    """
    @brief This class searches Google Scholar (http://scholar.google.com)

    Search for articles and publications containing terms of interest.

    Usage example:\n
    <tt>
    > from google_search import *\n
    > searcher = GoogleScholarSearch()\n
    > searcher.search(['breast cancer', 'gene'])
    </tt>
    """
    def __init__(self):
        """
        @brief Constructor; sets the Google Scholar host and base URL.
        """
        self.SEARCH_HOST = "scholar.google.com"
        self.SEARCH_BASE_URL = "/scholar"

    def search(self, terms, limit=10):
        """
        @brief This function searches Google Scholar using the specified terms.

        Returns a list of dictionarys. Each
        dictionary contains the information related to the article:
            "URL"       : link to the article/n
            "Title"     : title of the publication/n
            "Authors"   : authors (example: DF Easton, DT Bishop, D Ford)/n
            "JournalYear"   : journal name & year (example: Nature, 2001)/n
            "JournalURL"    : link to the journal main website (example: www.nature.com)/n
            "Abstract"  : abstract of the publication/n
            "NumCited"  : number of times the publication is cited/n
            "Terms"     : list of search terms used in the query/n

        @param terms List of search terms
        @param limit Maximum number of results to be returned (default=10)
        @return List of results, this is the empty list if nothing is found
        """
        params = urllib.urlencode({'q': "+".join(terms), 'num': limit})
        headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

        url = self.SEARCH_BASE_URL + "?" + params
        conn = httplib.HTTPConnection(self.SEARCH_HOST)
        try:
            conn.request("GET", url, {}, headers)
            resp = conn.getresponse()
            if resp.status != 200:
                print "ERROR: ",
                print resp.status, resp.reason
                return []
            html = resp.read()
        finally:
            # BUG FIX: the original never closed the HTTP connection.
            conn.close()

        results = []
        html = html.decode('ascii', 'ignore')

        # Screen-scrape the result to obtain the publication information
        soup = BeautifulSoup(html)
        citations = 0
        for record in soup('p', {'class': 'g'}):

            # Guard against records that lack the expected title markup;
            # the original dereferenced topPart.a without checking.
            topPart = record.first('span', {'class': 'w'})
            if topPart is None or topPart.a is None:
                continue

            pubURL = topPart.a['href']
            # Clean up the URL, make sure it does not contain '\' but '/' instead
            pubURL = pubURL.replace('\\', '/')

            pubTitle = ""
            for part in topPart.a.contents:
                pubTitle += str(part)

            if pubTitle == "":
                # [CITATION] entries carry the title outside the anchor;
                # recover it with a regex and strip the markup tags.
                match1 = re.findall('<b>\[CITATION\]<\/b><\/font>(.*)- <a', str(record))
                match2 = re.split('- <a', match1[citations])
                pubTitle = re.sub('<\/?(\S)+>', "", match2[0])
                citations = citations + 1

            authorTag = record.first('font', {'color': 'green'})
            authorPart = None
            if authorTag is not None:
                authorPart = authorTag.string
            # BUG FIX: the original compared str(authorPart) to 'Null', but
            # str(None) is 'None', so the regex fallback below never ran.
            if authorPart is None:
                authorPart = ''
                # Sometimes even BeautifulSoup can fail, fall back to regex
                m = re.findall('<font color="green">(.*)</font>', str(record))
                if len(m) > 0:
                    authorPart = m[0]

            # Assume that the fields are delimited by ' - ', the first entry will be the
            # list of authors, the last entry is the journal URL, anything in between
            # should be the journal year
            idx_start = authorPart.find(' - ')
            idx_end = authorPart.rfind(' - ')
            pubAuthors = authorPart[:idx_start]
            pubJournalYear = authorPart[idx_start + 3:idx_end]
            pubJournalURL = authorPart[idx_end + 3:]
            # If (only one ' - ' is found) and (the end bit contains '\d\d\d\d')
            # then the last bit is journal year instead of journal URL
            if pubJournalYear == '' and re.search('\d\d\d\d', pubJournalURL) != None:
                pubJournalYear = pubJournalURL
                pubJournalURL = ''

            # BUG FIX: the original called soup.firstText("..."), which always
            # locates the FIRST '...' on the whole page, so every record got
            # the same abstract. Search within the current record instead.
            # This can still fail if the whole abstract fits in the space
            # provided such that no '...' is found.
            pubAbstract = ""
            abstractStart = record.firstText("...")
            if abstractStart is not None:
                delimiter = abstractStart.parent
                # BUG FIX: compare against None, not the string 'Null'
                # (str(None) is 'None', so the original test was dead).
                while delimiter is not None and (str(delimiter) != '<b>...</b>' or pubAbstract == ""):
                    pubAbstract += str(delimiter)
                    delimiter = delimiter.nextSibling
                pubAbstract += '<b>...</b>'

            match = re.search("Cited by ([^<]*)", str(record))
            pubCitation = ''
            if match != None:
                pubCitation = match.group(1)
            results.append({
                "URL": pubURL,
                "Title": pubTitle,
                "Authors": pubAuthors,
                "JournalYear": pubJournalYear,
                "JournalURL": pubJournalURL,
                "Abstract": pubAbstract,
                "NumCited": pubCitation,
                "Terms": terms
            })
        return results

if __name__ == '__main__':
    search = GoogleScholarSearch()
    pubs = search.search(["dengue", "fever"], 10)
    for pub in pubs:
        print pub['Title']
        print pub['Authors']
        print pub['JournalYear']
        print pub['Terms']
        print "======================================"