akent / sydneytraffic (http://semicircular.net/tag/sydney-traffic/)
The code that drives http://twitter.com/sydneytraffic: it screen-scrapes the NSW Roads and Traffic Authority's traffic alerts, shortens them, and reposts them to Twitter. Depends on BeautifulSoup and python-twitter.
Clone this repository (size: 32.5 KB): HTTPS / SSH
$ hg clone http://bitbucket.org/akent/sydneytraffic/
| commit 5: | aee1e9275c1f |
| parent 4: | 36e39537b913 |
| branch: | default |
- strip out UBD info from items, it's not very useful
- add a couple more parse regex swaps
- use python 2.5 because md5 etc are deprecated in future versions
- remove excessively verbose logging
- allow disabling of Levenshtein-distance duplicate detection (for RSS use)
8 months ago
Changed (Δ370 bytes):
raw changeset »
parse.py (19 lines added, 5 lines removed)
rssparse.py (2 lines added, 3 lines removed)
1 |
#!/usr/bin/ |
|
1 |
#!/usr/bin/python2.5 |
|
2 |
2 |
|
3 |
3 |
# Copyright (c) 2009 Adam Kent <adam@semicircular.net> |
4 |
4 |
# All rights reserved. |
| … | … | @@ -118,12 +118,14 @@ def shorten(text): |
118 |
118 |
(r'\bwill be\b', r''), |
119 |
119 |
(r'\bbetween\b', r'btwn'), |
120 |
120 |
(r'\bjustpast\b', r'nr'), |
121 |
(r'\broadway\b', r'rd'), |
|
121 |
122 |
(r'\bnear\b', r'nr'), |
122 |
123 |
(r'\bcarriageway\b', r''), |
123 |
124 |
(r'\bapproach', r'apprch'), |
124 |
125 |
(r'\bcontrol\b', r'cntrl'), |
125 |
126 |
(r'\balternateroute\b', r'alt route'), |
126 |
127 |
(r'\b[Mm]ultiplevehicles\b', r'multi vehicle'), |
128 |
(r'\bon Rd\b', r'on rd'), |
|
127 |
129 |
(r'Effect:', r'. '), |
128 |
130 |
(r'Affected direction:', r''), |
129 |
131 |
(r'[Dd]irection', r'dir'), |
| … | … | @@ -198,7 +200,17 @@ def parse(): |
198 |
200 |
date = i.find('div', { "class" : "startdate" }) |
199 |
201 |
loc = i.find('div', { "class" : "location" }) |
200 |
202 |
type = i.find('div', { "class" : "type" }) |
201 |
|
|
203 |
||
204 |
locarr = loc.findAll(text=True) |
|
205 |
locstr = "" |
|
206 |
for l in locarr: |
|
207 |
if (l.find("UBD") == 0): |
|
208 |
continue |
|
209 |
else: |
|
210 |
locstr += l |
|
211 |
||
212 |
loc = "%s; " % (locstr.strip()) |
|
213 |
res = "%s %s" % (loc, strip(type)) |
|
202 |
214 |
|
203 |
215 |
extra = i.find('table', { "class" : "incident_data collapsible" }) |
204 |
216 |
for j in extra: |
| … | … | @@ -208,12 +220,15 @@ def parse(): |
208 |
220 |
|
209 |
221 |
|
210 |
222 |
# return 1 if we already have it, 0 if not |
211 |
def seen(key, str, history |
|
223 |
def seen(key, str, history, use_lev=True): |
|
212 |
224 |
if (history.has_key(key)): |
213 |
225 |
# it's identical to a previously posted entry |
214 |
|
|
226 |
#print "Identical item found, key %s" % (key) |
|
215 |
227 |
return True |
216 |
228 |
|
229 |
if (not use_lev): |
|
230 |
return False |
|
231 |
||
217 |
232 |
loc = str.split(":", 1)[0] |
218 |
233 |
smallest_distance = 1000 |
219 |
234 |
|
| … | … | @@ -252,7 +267,6 @@ if __name__ == "__main__": |
252 |
267 |
exit(1) |
253 |
268 |
|
254 |
269 |
for event in parse(): |
255 |
print event[1] |
|
256 |
270 |
uid = "%s %s" % (event[0], event[1]) |
257 |
271 |
uid = hashlib.md5(uid).hexdigest() |
258 |
272 |
1 |
#!/usr/bin/ |
|
1 |
#!/usr/bin/python2.5 |
|
2 |
2 |
|
3 |
3 |
import feedparser |
4 |
4 |
import parse |
| … | … | @@ -27,11 +27,10 @@ if __name__ == "__main__": |
27 |
27 |
exit(1) |
28 |
28 |
|
29 |
29 |
for event in parseit(): |
30 |
print event[1] |
|
31 |
30 |
uid = "%s %s" % (event[0], event[1]) |
32 |
31 |
uid = hashlib.md5(uid).hexdigest() |
33 |
32 |
|
34 |
if (not parse.seen(uid, event[1], history |
|
33 |
if (not parse.seen(uid, event[1], history, False)): |
|
35 |
34 |
parse.twitsubmit(event[1], sys.argv[2], pw)# submit! |
36 |
35 |
history[uid] = event[1]; |
37 |
36 |
print "Posted: %s : %s" % (uid, event[1]) |
