1. akosch
  2. porthu_parser

Commits

akosch  committed 585f6c1

Added getLongDescription method
Fixed broken (time) parsing

  • Participants
  • Parent commits e8feef8
  • Branches default

Comments (0)

Files changed (1)

File porthu.py

View file
 import urllib
 from string import lower
 from datetime import datetime
+from datetime import timedelta
 from lxml import etree
 from cStringIO import StringIO
 import re
         self.description = description
         self.link = porthu_link
     
+    def getLongDescription(self):
+        if(self.link!=''):
+            conn = urllib.urlopen(self.link)
+            page = conn.read()
+            
+            parser = etree.HTMLParser()
+            tree = etree.parse(StringIO(page), parser)
+            try:
+                table = tree.xpath("//table")[3]
+          
+                tr = table.xpath("./tr")[3]
+                spans = tr.xpath(".//span[@class='txt']")
+                for span in spans:
+                    separators  = span.xpath("../div[@class='separator']")
+                    if(len(separators)==2 and len(span.text)!=0):
+                        return span.text
+            except IndexError:
+                return 'None'
+            return 'None'
+        else:
+            return 'None'
+
     def __repr__(self):
         """Return '[<time>] <description>' with the description UTF-8 encoded."""
         # Build the pieces in the same order the values are read: time first,
         # then the encoded description.
         time_part = '[' + str(self.time) + '] '
         desc_part = self.description.encode('utf-8')
         return time_part + desc_part
 
         raw_day = date_box.xpath('.//span')[0].text
         date = getDateForRawString(raw_day, dates)
         
+        last_ptime = datetime.min
+
         for time_container in date_box.xpath("..//td[@class='time_container']"):
             raw_time = ''
             raw_desc = ''
             link = ''
+            
+            raw_time = time_container.xpath(".//div")[0].text
+            
+            if raw_time==None:
+                raw_time = time_container.xpath("../..//p[@class='begin_time']")[0].text
 
-            raw_time = time_container.xpath(".//div|../..//p[@class='begin_time']")[0].text
             ptime=datetime.strptime(date+' '+raw_time, '%Y-%m-%d %H:%M')
 
+            if ptime<last_ptime:
+                ptime = ptime + timedelta(days=1)
+            
+            last_ptime=ptime
+            
             desc = time_container.xpath("..//a[@class='btxt']|..//a[@class='lbbtxt']|..//span[@class='btxt']")[0]
             
             raw_desc = desc.text