Commits

Thejesh GN committed 173dcc8

Added Rajasthan rain water data scraper

Comments (0)

Files changed (1)

scrapers/rajasthan_rain_water/scrape.py

+import requests
+import string
+import lxml.html           
+import sqlite3
+conn = sqlite3.connect('/media/truecrypt5/code/snippets/scrapes/rajasthan_rain_data/data.sqlite')
+c = conn.cursor()
+YEARS = [2009,2010,2011]
+#YEARS = [2008]
+for YEAR in YEARS:
+    MONTHS = ["","JAN", "FEB","MAR","APR","MAY","JUN","JUL","AUG","SEP","OCT","NOV","DEC"]
+    for page_num in range(24,35):
+        page='http://waterresources.rajasthan.gov.in/Daily_Rainfall_Data/'+str(YEAR)+'_files/sheet'+string.zfill(page_num, 3)+".htm"
+        print page
+        html = requests.get(page)
+        #html = scraperwiki.scrape(page)
+        if html.status_code != 200:
+            print "------------------------------ERROR------------------------------------------------"
+        
+        root = lxml.html.fromstring(html.content)
+        row_number = 1
+        STATION = ''
+        DISTRICT = ''
+        HEADER_SECTION = False
+        RAIN_SECTION = False
+        MONTH = 0
+        DATE = 0
+        print "Staring rows"
+        for tr in root.cssselect("tr"):
+            tds = tr.cssselect("td")
+            column_number = 0
+            for td in tds:
+                column_number = column_number + 1
+                if column_number == 1 and 'STATION' in td.text_content():
+                    print "##################### HEADER SECTION #######################################"
+                    HEADER_SECTION = True
+                    RAIN_SECTION = False
+                    STATION = tds[1].text_content()
+                    DISTRICT = tds[4].text_content()
+                    print DISTRICT+","+STATION+","+str(YEAR)        
+                    break
+                elif column_number == 1 and 'DATE' in td.text_content():
+                    print "##################### RAIN SECTION #######################################"
+                    HEADER_SECTION = False
+                    RAIN_SECTION = True
+                    DATE = 0
+                    break
+                elif column_number == 1 and  '-' in td.text_content():
+                    print "## Contains -"
+                    #break this row      
+                    break
+                elif column_number == 1:
+                    DATE = td.text_content()
+                    print "## Incrementing date"
+                    continue
+                elif RAIN_SECTION:
+                    MONTH = MONTH+1
+                    RAIN_FALL = td.text_content()
+                    #print "# column_number="+str(column_number)
+                    #insert_data={"DISTRICT":DISTRICT, "STATION":STATION,"YEAR":str(YEAR),"MONTH":MONTHS[MONTH],"DATE":str(DATE),"RAIN_FALL":str(RAIN_FALL), "PAGE_NUM":str(page_num)}
+                    try:
+                        insert_data=[]
+                        insert_data.append(DISTRICT)
+                        insert_data.append(STATION)
+                        insert_data.append(str(YEAR))
+                        insert_data.append(MONTHS[MONTH])
+                        insert_data.append(str(DATE))
+                        insert_data.append(str(RAIN_FALL))
+                        insert_data.append(str(page))
+                        print insert_data
+                        if str(RAIN_FALL) != '0.0' and str(RAIN_FALL) != '':
+                            c.execute("insert into rainfall values (?,?,?,?,?,?,?)", insert_data)          
+                            conn.commit()
+                    except:
+                        pass
+                    #print insert_data
+                    #scraperwiki.sqlite.save(unique_keys=["DISTRICT","STATION","YEAR","MONTH","DATE"],data=insert_data )
+                    if DATE == '31':                
+                        HEADER_SECTION = False
+                        RAIN_SECTION = False
+                        print "################## END rain Section #######################################"
+                    if MONTH == 12:
+                        MONTH = 0
+                        break
+        ############### END of One row ###############################
+            row_number = row_number + 1
+            MONTH = 0
+
+c.close()
+
+
+