akent / sydneytraffic (http://semicircular.net/tag/sydney-traffic/)

The code that drives http://twitter.com/sydneytraffic: it screen-scrapes the NSW Roads and Traffic Authority's traffic alerts, shortens them, and reposts them to Twitter. Depends on BeautifulSoup and python-twitter.

Clone this repository (size: 32.5 KB): HTTPS / SSH
$ hg clone http://bitbucket.org/akent/sydneytraffic/
commit 5: aee1e9275c1f
parent 4: 36e39537b913
branch: default
- strip out UBD info from items, it's not very useful
- add a couple more parse regex swaps
- use python 2.5 because md5 etc are deprecated in future versions
- remove excessively verbose logging
- allow disabling of levenshtein distance dup detection (for RSS use)
ad...@garnet
8 months ago

Changed (Δ370 bytes):

raw changeset »

parse.py (19 lines added, 5 lines removed)

rssparse.py (2 lines added, 3 lines removed)

Up to file-list parse.py:

1
#!/usr/bin/env python
1
#!/usr/bin/python2.5
2
2
3
3
# Copyright (c) 2009 Adam Kent <adam@semicircular.net>
4
4
# All rights reserved.
@@ -118,12 +118,14 @@ def shorten(text):
118
118
            (r'\bwill be\b', r''),
119
119
            (r'\bbetween\b', r'btwn'),
120
120
            (r'\bjustpast\b', r'nr'),
121
            (r'\broadway\b', r'rd'),
121
122
            (r'\bnear\b', r'nr'),
122
123
            (r'\bcarriageway\b', r''),
123
124
            (r'\bapproach', r'apprch'),
124
125
            (r'\bcontrol\b', r'cntrl'),
125
126
            (r'\balternateroute\b', r'alt route'),
126
127
            (r'\b[Mm]ultiplevehicles\b', r'multi vehicle'),
128
            (r'\bon Rd\b', r'on rd'),
127
129
            (r'Effect:', r'. '),
128
130
            (r'Affected direction:', r''),
129
131
            (r'[Dd]irection', r'dir'),
@@ -198,7 +200,17 @@ def parse():
198
200
        date = i.find('div', { "class" : "startdate" })
199
201
        loc = i.find('div', { "class" : "location" })
200
202
        type = i.find('div', { "class" : "type" })
201
        res = "%s %s" % (strip(loc), strip(type))
203
        
204
        locarr = loc.findAll(text=True)
205
        locstr = ""
206
        for l in locarr:
207
            if (l.find("UBD") == 0):
208
                continue
209
            else:
210
                locstr += l
211
       
212
        loc = "%s; " % (locstr.strip())
213
        res = "%s %s" % (loc, strip(type))
202
214
203
215
        extra = i.find('table', { "class" : "incident_data collapsible" })
204
216
        for j in extra:
@@ -208,12 +220,15 @@ def parse():
208
220
209
221
210
222
# return 1 if we already have it, 0 if not
211
def seen(key, str, history):
223
def seen(key, str, history, use_lev=True):
212
224
    if (history.has_key(key)):
213
225
        # it's identical to a previously posted entry
214
        print "Identical item found, key %s" % (key)
226
        #print "Identical item found, key %s" % (key)
215
227
        return True
216
228
    
229
    if (not use_lev):
230
        return False
231
217
232
    loc = str.split(":", 1)[0]
218
233
    smallest_distance = 1000
219
234
    
@@ -252,7 +267,6 @@ if __name__ == "__main__":
252
267
        exit(1)
253
268
    
254
269
    for event in parse():
255
        print event[1]
256
270
        uid = "%s %s" % (event[0], event[1])
257
271
        uid = hashlib.md5(uid).hexdigest()
258
272
        

Up to file-list rssparse.py:

1
#!/usr/bin/env python
1
#!/usr/bin/python2.5
2
2
3
3
import feedparser
4
4
import parse
@@ -27,11 +27,10 @@ if __name__ == "__main__":
27
27
        exit(1)
28
28
    
29
29
    for event in parseit():
30
        print event[1]
31
30
        uid = "%s %s" % (event[0], event[1])
32
31
        uid = hashlib.md5(uid).hexdigest()
33
32
        
34
        if (not parse.seen(uid, event[1], history)):
33
        if (not parse.seen(uid, event[1], history, False)):
35
34
            parse.twitsubmit(event[1], sys.argv[2], pw)# submit!
36
35
            history[uid] = event[1];
37
36
            print "Posted: %s : %s" % (uid, event[1])