Commits

Thomas Kluyver committed f8274e7

Add functions to do simple taxonomic searches, and to retrieve species distribution.
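
A minimal usage sketch of the new functions (illustrative only: the module name 'grin' is an assumption, the IDs returned by simple_search are assumed to be directly usable with get_species_info, and a live connection to the GRIN server is required):

    import grin

    # Search for species matching 'Vicia'; each result is (id, name, authority).
    results = grin.simple_search("Vicia")
    sp_id, name, authority = results[0]

    # Fetch the distributional range of the first match.
    distribution = grin.get_species_info(sp_id)['distribution']
    for status, countries in distribution.items():
        print status + ":", ", ".join(sorted(countries))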

Files changed (1)

+# coding: utf-8
 """
 Module for scraping data from the USDA GRIN database.
 
     '999':"(Other)"}
 improve_status_code = dict((v, k) for k, v in improve_status.items())
 
+def simple_search(query):
+    """Do a simple species search on GRIN. Takes the search parameter, e.g.
+    'Vicia', and returns a list of 3-tuples, (id, name, authority).
+    
+    Will fail if the search matches only a single result.
+    """
+    query = urllib.urlencode({"search":query})
+    page = BeautifulSoup(urllib.urlopen(TAX_SEARCH_URL, query))
+    results = []
+    for li in page.ol.findAll("li"):
+        if li.a.contents[0].name != "b":
+            continue
+        ID = li.a['href'].split("?")[-1]
+        auth = li.a.b.contents[-1].strip()
+        name = "".join(a.text if hasattr(a, 'text') else a 
+                        for a in li.a.b.contents[:-1]).replace("×", u"×")
+        results.append((ID, name, auth))
+    return results
+
 def get_by_PI(PIcode):
-	"""Returns basic details of an accession, by searching for its PI number or 
-	equivalent code"""
-	PIcode = PIcode.lstrip("PI ")
-	acc_url = SEARCHb + urllib.urlencode({'accid':"PI "+PIcode})
-	return _read_acc_page(acc_url)
-	
+    """Returns basic details of an accession, by searching for its PI number or 
+    equivalent code"""
+    if PIcode.startswith("PI"):
+        PIcode = PIcode[2:].lstrip()   # drop a leading "PI " prefix if present
+    acc_url = SEARCHb + urllib.urlencode({'accid':"PI "+PIcode})
+    return _read_acc_page(acc_url)
+    
 def get_acc(ID):
-	"""Returns basic details of an accession, given its numeric ID"""
-	return _read_acc_page(ACCb + ID)
-	
+    """Returns basic details of an accession, given its numeric ID"""
+    return _read_acc_page(ACCb + ID)
+    
 def _read_acc_page(acc_url):
-	acc_page = BeautifulSoup(urllib.urlopen(acc_url))
-	detail_para = acc_page('p')[0]
-	detail_para_s = str(detail_para)
-	details = {}
-	locn_match = locnre.search(detail_para_s)
-	if locn_match:
-		details['locn'] = locn_match.group(1)
-	details['improvement'] = improvementre.search(detail_para_s).group(1)
-	if acc_url.startswith(ACCb):
-		details['id'] = acc_url.partition("?")[-1]
-	else:
-		a = detail_para.findAll("h2")[-1].a
-		if a:
-			details['id'] = a['href'].partition("?")[-1]			
-	return details
-	
+    acc_page = BeautifulSoup(urllib.urlopen(acc_url))
+    detail_para = acc_page('p')[0]
+    detail_para_s = str(detail_para)
+    details = {}
+    locn_match = locnre.search(detail_para_s)
+    if locn_match:
+        details['locn'] = locn_match.group(1)
+    details['improvement'] = improvementre.search(detail_para_s).group(1)
+    if acc_url.startswith(ACCb):
+        details['id'] = acc_url.partition("?")[-1]
+    else:
+        a = detail_para.findAll("h2")[-1].a
+        if a:
+            details['id'] = a['href'].partition("?")[-1]
+    return details
+    
 def get_accs_of_species(sp_id, withUnavailable=True):
-	"""Given the numeric ID of a species, returns the list of accessions held
-	of that species. For each accession, a tuple is returned:
-	(numeric id, PI number or equivalent, Accession name)"""
-	if withUnavailable:
-		listpage = urllib.urlopen(TAX_ACCb,"taxno=%s&rownum=0&sort=numb&unavail=off" % sp_id)
-	else:
-		listpage = urllib.urlopen(TAX_ACCb,"taxno=%s&rownum=0&sort=numb" % sp_id)
-	listsoup = BeautifulSoup(listpage)
-	accs = []
-	if not listsoup.ol:
-		return []
-	for li in listsoup.ol.findAll("li"):
-		PInum = li.a.text
-		accname = li.text.replace(PInum,"")
-		accid = li.a['href'].rpartition("?")[2]
-		accs.append((accid, PInum, accname))
-	return accs
-	
-	
+    """Given the numeric ID of a species, returns the list of accessions held
+    of that species. For each accession, a tuple is returned:
+    (numeric id, PI number or equivalent, Accession name)"""
+    if withUnavailable:
+        listpage = urllib.urlopen(TAX_ACCb,"taxno=%s&rownum=0&sort=numb&unavail=off" % sp_id)
+    else:
+        listpage = urllib.urlopen(TAX_ACCb,"taxno=%s&rownum=0&sort=numb" % sp_id)
+    listsoup = BeautifulSoup(listpage)
+    accs = []
+    if not listsoup.ol:
+        return []
+    for li in listsoup.ol.findAll("li"):
+        PInum = li.a.text
+        accname = li.text.replace(PInum,"")
+        accid = li.a['href'].rpartition("?")[2]
+        accs.append((accid, PInum, accname))
+    return accs
+
+def _get_distribution(soup):
+    """Parse the 'Distributional range' section of a species page into a
+    dict mapping status ('Native', 'Naturalized' or 'Cultivated') to a set
+    of country names. Returns an empty dict if the section is absent."""
+    try:
+        gofrom = soup.find(text="Distributional range").parent.parent
+    except AttributeError:    # heading wasn't found
+        return {}
+    distrib = {}
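+    # The page appears to lay this section out as alternating sibling
+    # elements: a <b> status heading ("Native:", "Naturalized:" or
+    # "Cultivated:") followed by a <ul> of regions, so walk forward
+    # through the siblings until that pattern stops matching.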
+    while True:
+        part = gofrom.findNextSibling("b")
+        if not part or part.text not in ("Native:", "Naturalized:", "Cultivated:"):
+            break
+        countries = set()
+        ul = part.findNextSibling("ul")
+        for li in ul.findAll("li"):
+            # This link is also in bold, so its text would be picked up
+            # as a country name if left in.
+            nse_link = li.find("a", title="Link to NatureServe Explorer")
+            if nse_link is not None:
+                nse_link.extract()
+            countries.update(b.text for b in li.findAll("b")[1:]) # 1st is the continent name
+        distrib[part.text.strip(':')] = countries
+        gofrom = ul
+    return distrib
+    
+def get_species_info(sp_id):
+    """Return a dictionary of information for the species with the given
+    numeric ID."""
+    sp_page = urllib.urlopen(TAXON_URL + sp_id)
+    soup = BeautifulSoup(sp_page)
+    
+    info = {'distribution': _get_distribution(soup)}
+    
+    return info
+    
+    
 def get_obs(ID):
-	"""Given the numeric ID of an accession, returns a dictionary of the observations made on it."""
-	page = BeautifulSoup(urllib.urlopen(OBSb + ID))
-	obs = defaultdict(list)
-	for obtr in page.findAll("tr"):
-		if obtr.contents[0].name == "th":
-			continue
-		obname = obtr.contents[0].a.text
-		obs[obname].append((obtr.contents[1].a.text,obtr.contents[1].a['href']))
-	return dict(obs)
-	
+    """Given the numeric ID of an accession, returns a dictionary of the observations made on it."""
+    page = BeautifulSoup(urllib.urlopen(OBSb + ID))
+    obs = defaultdict(list)
+    for obtr in page.findAll("tr"):
+        if obtr.contents[0].name == "th":
+            continue
+        obname = obtr.contents[0].a.text
+        obs[obname].append((obtr.contents[1].a.text,obtr.contents[1].a['href']))
+    return dict(obs)
+    
 def resolve(name):
     """Resolve a species name using GRIN taxonomy."""
     rawname = name