Commits

Anonymous committed 4b64e98

basic crawler

Comments (0)

Files changed (1)

+#encoding: utf-8
+import urllib
+from BeautifulSoup import BeautifulSoup
+import re
+__PARTIES_LIST_URL = 'http://www.cvk.lv/cgi-bin/wdbcgiw/base/komisijas2010.CVKAND10.saraksts1'  # URL that list_parties() downloads to enumerate parties
+
+def list_parties():
+    """ generator, who yields tuple (party_detail_url, party_name) """
+    data = urllib.urlopen(__PARTIES_LIST_URL).read()
+    handle = BeautifulSoup(unicode(data, "windows-1257"))
+    for item in handle.findAll("a", attrs = {"name" : "KAND"}):
+        yield item['href'], item.contents[0].strip('"')
+
+def get_candidates(party_url):
+    """ get candidates from party url. generator, yields tuple (candidate_url, candidate_name) """
+    data = urllib.urlopen(party_url).read()
+    handle = BeautifulSoup(unicode(data, "windows-1257"))
+    for item in handle.findAll("a", target = "KAND"):
+        if item['href'].find("apg_nr") == -1:
+            yield item['href'], item.contents[0]
+
+def get_candidate(candidate_url):
+    """ 
+        scraps candidate details, returns dictionary 
+        note to CVK website makers: your HTML sucks.Are you living in 90ies ?
+    """
+    data = unicode(urllib.urlopen(candidate_url).read(), "windows-1257")
+    info = {}
+    info['year'] = re.search(re.escape(u"Dzimšanas gads:</b> ") + u"(\d+)", data).group(1)
+    info['foreign_citizenship'] = re.search(re.escape(u"<BR><b>Ārvalstu pilsonība:</b> ") + u"(\w+)", data).group(1)
+    info['lives_into'] = re.search(re.escape(u"<BR><b>Dzīves vieta:</b> ") + u"(.*)", data).group(1)
+    info['education'] = re.search(re.escape(u"<br><b>Izglītība:</b> ") + u"(.*)", data).group(1) 
+    info['language'] = re.search(re.escape(u"<b>Valsts valodas  prasmes pašnovērtējums:</b> ") + u"(.*)", data).group(1)
+    info['nationality'] = re.search(re.escape(u"<br><b>Tautība:</b> ") + u"(.*)", data).group(1)
+    info['marital_status'] = re.search(re.escape(u"<br><b>Ģimenes stāvoklis:</b> ") + u"(.*)" + re.escape("<br>"), data).group(1)
+    return info
+
+print get_candidate("http://www.cvk.lv/cgi-bin/wdbcgiw/base/komisijas2010.CVKAND10.kandid2?NR1=5&cbutton=60262243965")  # NOTE(review): ad-hoc smoke run at module level, so it executes (and hits the network) on import; Python 2 print statement
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.