Source

textgrounder / python / processwiki.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#######
####### processwiki.py
#######
####### Copyright (c) 2010 Ben Wing.
#######

### FIXME:
###
### Cases to fix involving coordinates:

# 1. Nested coordinates:

#{{Infobox Australian Place
#| name     = Lauderdale
#| image    = Lauderdale Canal.JPG
#| caption  = 
#| loc-x    = 
#| loc-y    = 
#| coordinates = {{coord|42|54|40|S|147|29|34|E|display=inline,title}}
#| state    = tas
#...
#}}

import sys, re
from optparse import OptionParser
from nlputil import *
import itertools
import time
from process_article_data import *

from xml.sax import make_parser
from xml.sax.handler import ContentHandler

# Debug flags.  Different flags indicate different info to output.
debug = booldict()

# Program options
progopts = None

disambig_pages_by_id = set()

article_namespaces = ['User', 'Wikipedia', 'File', 'MediaWiki', 'Template',
                      'Help', 'Category', 'Thread', 'Summary', 'Portal',
                      'Book']

article_namespace_aliases = {
  'P':'Portal', 'H':'Help', 'T':'Template',
  'CAT':'Category', 'Cat':'Category', 'C':'Category',
  'MOS':'Wikipedia', 'MoS':'Wikipedia', 'Mos':'Wikipedia'}

# Count number of incoming links for articles
incoming_link_count = intdict()

# Map anchor text to a hash that maps articles to counts
anchor_text_map = {}

# Set listing articles containing coordinates
coordinate_articles = set()

debug_cur_title = None

# Parse the result of a previous run of --coords-counts for articles with
# coordinates
def read_coordinates_file(filename):
  errprint("Reading coordinates file %s..." % filename)
  status = StatusMessage('article')
  for line in uchompopen(filename):
    m = re.match('Article title: (.*)', line)
    if m:
      title = capfirst(m.group(1))
    elif re.match('Article coordinates: ', line):
      coordinate_articles.add(title)
      if status.item_processed(maxtime=Opts.max_time_per_stage):
        break
    
# Read in redirects.  Record redirects as additional articles with coordinates
# if the article pointed to has coordinates. NOTE: Must be done *AFTER*
# reading coordinates.
def read_redirects_from_article_data(filename):
  assert coordinate_articles
  errprint("Reading redirects from article data file %s..." % filename)

  def process(art):
    if art.namespace != 'Main':
      return
    if art.redir and capfirst(art.redir) in coordinate_articles:
      coordinate_articles.add(art.title)

  read_article_data_file(filename, process, maxtime=Opts.max_time_per_stage)

# Read the list of disambiguation article ID's.
def read_disambig_id_file(filename):
  errprint("Reading disambig ID file %s..." % filename)
  status = StatusMessage("article")
  for line in uchompopen(filename):
    disambig_pages_by_id.add(line)
    if status.item_processed(maxtime=Opts.max_time_per_stage):
      break
    
############################################################################
#                              Documentation                               #
############################################################################

##### Quick start

# This program processes the article dump from Wikipedia.  Dump is on
# stdin.  Outputs to stdout.  Written flexibly so that it can be modified
# to do various things.  To run it, use something like this:
#
# bzcat enwiki-20100905-pages-articles.xml.bz2 | processwiki.py > wiki-words.out

#####  How this program works

# Currently it does the following:
#
# 1. Locate the article title and text.
#
# 2. Find any coordinates specified either in Infobox or Coord templates.
#    If found, the first such coordinate in an article is output with lines
#    like 
#
#    Article title: Politics of Angola
#    Article coordinates: 13.3166666667,-169.15
#
# 3. For articles with coordinates in them, locate all the "useful" words in
#    the article text.  This ignores HTML codes like <sup>, comments,
#    stuff like [[ or ]], anything inside of <math>...</math>, etc.  It
#    tries to do intelligent things with templates (stuff inside of {{...}})
#    and internal links (inside of [[...]]), and ignores external links
#    ([http:...]).  The words are split on whitespace, ignoring punctuation
#    such as periods and commas, and the resulting words are counted up, and
#    the count of each different word is output, one per line like
#
#    Birmingham = 48
#
# There is also a debug flag.  If set, lots of additional stuff is output.
# Among them are warnings like
#
#    Warning: Nesting level would drop below 0; string = }, prevstring =  (19
#
# Note that since words are broken on spaces, there will never be a space
# in the outputted words.  Hence, the lines containing directives (e.g.
# the article title) can always be distinguished from lines containing words.
#
# Note also that the following terminology is used here, which may not be
# standard:
#
# Internal link: A link to another Wikipedia article, of the form [[...]].
# External link: A link to an external URL, of the form [...].
# Template: An expression of the form {{...}}, with arguments separated by
#           the pipe symbol |, that processes the arguments and substitutes
#           the processed text; it may also trigger other sorts of actions.
#           Similar to the macros in C or M4.
# Macro: An expression that results in some other text getting substituted,
#        either a template {{...}} or an internal link [[...]].
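#
# For example, in the source text
#
#    ... the [[latent variable|hidden value]]s of {{Coord|44.112|N|87.913|W}} ...
#
# "[[latent variable|hidden value]]" is an internal link and
# "{{Coord|44.112|N|87.913|W}}" is a template; both are macros.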

##### Internal workings; how to extend the program

# Note that this program is written so that it can be flexibly extended to
# allow for different sorts of processing of the Wikipedia dump.  See the
# following description, which indicates where to change in order to
# implement different behavior.
#
# The basic functioning of this code is controlled by an article handler class.
# The default handler class is ArticleHandler.  Usually it is
# sufficient to subclass this handler class, as it provides hooks to do
# interesting things, which by default do nothing.  You can also subclass
# ArticleHandlerForUsefulText if you want the source text processed for
# "useful text" (what the Wikipedia user sees, plus similar-quality
# hidden text).
#
# SAX is used to process the XML of the raw Wikipedia dump file.
# Simple SAX handler functions are what invokes the article handler
# functions in the article handler class.
#
# For each article, the article handler function process_article_text() is
# called to process the text of the article, and is passed the article title
# and full text, with entity expressions such as &nbsp; replaced appropriately.
# This function operates in two passes.  The first pass, performed by
# the article handler process_text_for_data(), extracts useful data, e.g.
# coordinates or links.  It returns True or False, indicating whether the
# second pass should operate.  The purpose of the second pass is to do
# processing involving the article text itself, e.g. counting up words.
# It is implemented by the article handler process_text_for_text().
# The default handler does two things:
#
# 1. Process the text, filtering out some junk
#    (see format_text_second_pass()).
# 2. Use process_source_text() to extract chunks of "actual
#    text" (as opposed to directives of various sorts), i.e. text that
#    is useful for constructing a language model that can be used
#    for classifying a document to find the most similar article.
#    Join together and then split into words.  Pass the generator
#    of words to the article handler process_text_for_words().
#  
# process_source_text() is a generator that yields processed
# textual chunks containing only "actual text".  This function works by
# calling parse_simple_balanced_text() to parse the text into balanced chunks
# (text delimited by balanced braces or brackets, i.e. {...} or [...],
# or text without any braces or brackets), and then handling the chunks
# according to their type:
#
# -- if [[...]], use process_internal_link()
# -- if {{...}}, use process_template()
# -- if {|...|}, use process_table()
# -- if [...] but not [[...]], use process_external_link()
# -- else, return the text unchanged
#
# Each of the above functions is a generator that yields chunks of
# "actual text".  Different sorts of processing can be implemented here.
# Note also that a similar structure can and probably should be
# implemented in process_text_for_data().
#
# As mentioned above, the chunks are concatenated before being split again.
# Concatenation helps in the case of article text like
#
# ... the [[latent variable|hidden value]]s ...
#
# which will get processed into chunks
#
# '... the ', 'latent variable hidden value', 's ...'
#
# Concatenating the chunks back together yields "values" as a single word.
#
# The resulting text is split to find words, using split_text_into_words().
# This splits on whitespace, but is a bit smarter; it also ignores
# punctuation such as periods or commas that occurs at the end of words,
# as well as parens and quotes at word boundaries, and ignores entirely
# any word with a colon in the middle (a likely URL or other directive that
# has slipped through), and separates on # and _ (which may occur in
# internal links or such).
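#
# For example, under those rules the text "Portland, Oregon (1851)" yields
# the words "Portland", "Oregon" and "1851", while a stray
# "http://example.com" is ignored entirely because of the embedded colon.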
#
# Note also that prior to processing the text for data and again prior to
# processing the text for words, it is formatted to make it nicer to
# process and to get rid of certain sorts of non-useful text.  This is
# implemented in the functions format_text_first_pass() and
# format_text_second_pass(), respectively.  This includes things like:
#
# -- removing comments
# -- removing <math>...</math> sections, which contain differently-formatted
#    text (specifically, in TeX format), which will screw up processing
#    of templates and links and contains very little useful text
# -- handling certain sorts of embedded entity expressions, e.g. cases
#    where &amp;nbsp; appears in the raw dump file.  This corresponds to
#    cases where &nbsp; appears in the source text of the article.
#    Wikipedia's servers process the source text into HTML and then spit
#    out the HTML, which gets rendered by the browser and which will
#    handle embedded entity expressions (e.g. convert &nbsp; into a
#    non-breaking-space character).  Note that something like &nbsp;
#    appears directly in the raw dump file only when a literal
#    non-breaking-space character appears in the article source text.
# -- handling embedded HTML expressions like <sup>2</sup>, where < appears
#    in the raw dump as &lt;.  These get processed by the user's browser.
#    We handle them in a simple fashion, special-casing <br> and <ref>
#    into whitespace and just removing all the others.
# -- removing === characters in headers like ===Introduction===
# -- removing multiple single-quote characters, which indicate boldface
#    or italics

##### About generators

# The code in this program relies heavily on generators, a special type of
# Python function.  The following is a quick intro for programmers who
# might not be familiar with generators.
#
# A generator is any function containing a "yield foo" expression.
# Logically speaking, a generator function returns multiple values in
# succession rather than returning a single value.  In the actual
# implementation, the result of calling a generator function is a
# generator object, which can be iterated over in a for loop, list
# comprehension or generator expression, e.g.
#
# 1. The following uses a for loop to print out the objects returned by
#    a generator function.
#
# for x in generator():
#   print x
#
# 2. The following returns a list resulting from calling a function fun() on
#    each object returned by a generator function.
# 
# [fun(x) for x in generator()]
#
# 3. The following returns another generator expression resulting from
#    calling a function fun() on each object returned by a generator function.
#
# (fun(x) for x in generator())
#
# There are some subtleties involved in writing generators:
#
# -- A generator can contain a "return" statement, but cannot return a value.
#    Returning from a generator, whether explicitly through a "return"
#    statement or implicitly by falling off the end of the function, triggers
#    a "raise StopIteration" statement.  This terminates the iteration loop
#    over the values returned by the generator.
# -- Chaining generators, i.e. calling one generator inside of another, is
#    a bit tricky.  If you have a generator function generator(), and you
#    want to pass back the values from another generator function generator2(),
#    you cannot simply call "return generator2()", since generators can't
#    return values.  If you just write "generator2()", nothing will happen;
#    the value from generator2() gets discarded.  So you usually have to
#    write a for loop:
#
#    for foo in generator2():
#      yield foo
#
#    Note that "return generator2()" *will* work inside of a function that is
#    not a generator, i.e. has no "yield" statement in it.
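#
#    Alternatively, itertools.chain() (imported above) splices generators
#    together without an explicit loop; since chain() itself returns an
#    iterator, the enclosing function is then an ordinary function rather
#    than a generator, so "return" is allowed:
#
#    def generator():
#      return itertools.chain(generator2(), generator3())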

#######################################################################
#                         Splitting the output                        #
#######################################################################

# Files to output to, when splitting output
split_output_files = None

# List of split suffixes
split_suffixes = None

# Current file to output to
cur_output_file = sys.stdout
debug_to_stderr = False

# Name of current split (training, dev, test)
cur_split_name = ''

# Generator of files to output to
split_file_gen = None

# Initialize the split output files, using PREFIX as the prefix
def init_output_files(prefix, split_fractions, the_split_suffixes):
  assert len(split_fractions) == len(the_split_suffixes)
  global split_output_files
  split_output_files = [None]*len(the_split_suffixes)
  global split_suffixes
  split_suffixes = the_split_suffixes
  for i in range(len(the_split_suffixes)):
    split_output_files[i] = open("%s.%s" % (prefix, the_split_suffixes[i]), "w")
  global split_file_gen
  split_file_gen = next_split_set(split_fractions)

# Find the next split file to output to and set CUR_OUTPUT_FILE appropriately;
# don't do anything if the user hasn't called for splitting.
def set_next_split_file():
  global cur_output_file
  global cur_split_name
  if split_file_gen:
    nextid = split_file_gen.next()
    cur_output_file = split_output_files[nextid]
    cur_split_name = split_suffixes[nextid]
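
# Illustrative usage of the above (assumes next_split_set() in nlputil
# interprets the fractions as relative proportions; "title" is a stand-in
# for the current article title):
#
#   init_output_files("wiki", [0.8, 0.1, 0.1], ["training", "dev", "test"])
#   set_next_split_file()            # pick the split for the next article
#   splitprint("Article title: %s" % title)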
  
#######################################################################
#                  Chunk text into balanced sections                  #
#######################################################################

### Return chunks of balanced text, for use in handling template chunks
### and such.  The chunks consist either of text without any braces or
### brackets, chunks consisting of a brace or bracket and all the text
### up to and including the matching brace or bracket, or lone unmatched
### right braces/brackets.  Currently, if a chunk is closed with the
### wrong type of character (brace when bracket is expected or vice-versa),
### we still treat it as the closing character, but output a warning.
###
### In addition, some of the versions below will split off additional
### characters if they occur at the top level (e.g. pipe symbols or
### newlines).  In these cases, if such a character occurs, three
### successive chunks will be seen: The text up to but not including the
### dividing character, a chunk with only the character, and a chunk
### with the following text.  Note that if the dividing character occurs
### inside of bracketed or braced text, it will not divide the text.
### This way, for example, arguments of a template or internal link
### (which are separated by a pipe symbol) can be sectioned off without
### also sectioning off the arguments inside of nested templates or
### internal links.  Then, the parser can be called recursively if
### necessary to handle such expressions.
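###
### For example, parsing "x|y {{b|c}} z" with balanced_pipe_re yields the
### successive chunks 'x', '|', 'y ', '{{b|c}}', ' z': the top-level pipe
### is sectioned off by itself, while the pipe nested inside {{...}} is not.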

left_ref_re = r'<ref.*?>'
# Return braces and brackets separately from other text.
simple_balanced_re = re.compile(left_ref_re + r'|</ref>|[^{}\[\]<]+|[{}\[\]]|<')
#simple_balanced_re = re.compile(r'[^{}\[\]]+|[{}\[\]]')

# Return braces, brackets and pipe symbols separately from other text.
balanced_pipe_re = re.compile(left_ref_re + r'|</ref>|[^{}\[\]|<]+|[{}\[\]|]|<')
#balanced_pipe_re = re.compile(r'[^{}\[\]|]+|[{}\[\]|]')

# Return braces, brackets, and newlines separately from other text.
# Useful for handling Wikipedia tables, denoted with {| ... |}.
balanced_table_re = re.compile(left_ref_re + r'|</ref>|[^{}\[\]\n<]+|[{}\[\]\n]|<')
#balanced_table_re = re.compile(r'[^{}\[\]\n]+|[{}\[\]\n]')

left_match_chars = {'{':'}', '[':']', '<ref>':'</ref>'}
right_match_chars = {'}':'{', ']':'[', '</ref>':'<ref>'}

def parse_balanced_text(textre, text, throw_away = 0):
  '''Parse text in TEXT containing balanced expressions surrounded by single
or double braces or brackets.  This is a generator; it successively yields
chunks of text consisting either of sections without any braces or brackets,
or balanced expressions delimited by single or double braces or brackets, or
unmatched single or double right braces or brackets.  TEXTRE is used to
separate the text into chunks; it can be used to separate out additional
top-level separators, such as vertical bar.'''
  strbuf = []
  prevstring = "(at beginning)"
  leftmatches = []
  parenlevel = 0
  for string in textre.findall(text):
    if debug['debugparens']:
      errprint("pbt: Saw %s, parenlevel=%s" % (string, parenlevel))
    if string.startswith('<ref'):
      #errprint("Saw reference: %s" % string)
      if not string.endswith('>'):
        wikiwarning("Strange parsing, saw odd ref tag: %s" % string)
      if string.endswith('/>'):
        continue
      string = '<ref>'
    if string in right_match_chars:
      if parenlevel == 0:
        wikiwarning("Nesting level would drop below 0; string = %s, prevstring = %s" % (string, prevstring.replace('\n','\\n')))
        yield string
      else:
        strbuf.append(string)
        assert len(leftmatches) == parenlevel
        should_left = right_match_chars[string]
        should_pop_off = 1
        the_left = leftmatches[-should_pop_off]
        if should_left != the_left:
          if should_left == '<ref>':
            wikiwarning("Saw unmatched </ref>")
            in_ref = '<ref>' in leftmatches
            if not in_ref:
              wikiwarning("Stray </ref>??; prevstring = %s" % prevstring.replace('\n','\\n'))
              should_pop_off = 0
            else:
              while (len(leftmatches) - should_pop_off >= 0 and
                  should_left != leftmatches[len(leftmatches)-should_pop_off]):
                should_pop_off += 1
              if should_pop_off >= 0:
                wikiwarning("%s non-matching brackets inside of <ref>...</ref>: %s ; prevstring = %s" % (should_pop_off - 1, ' '.join(left_match_chars[x] for x in leftmatches[len(leftmatches)-should_pop_off:]), prevstring.replace('\n','\\n')))
              else:
                wikiwarning("Inside of <ref> but still interpreted as stray </ref>??; prevstring = %s" % prevstring.replace('\n','\\n'))
                should_pop_off = 0
          elif the_left == '<ref>':
            wikiwarning("Stray %s inside of <ref>...</ref>; prevstring = %s" % (string, prevstring.replace('\n','\\n')))
            should_pop_off = 0
          else:
            wikiwarning("Non-matching brackets: Saw %s, expected %s; prevstring = %s" % (string, left_match_chars[the_left], prevstring.replace('\n','\\n')))
        if should_pop_off > 0:
          parenlevel -= should_pop_off
          if debug['debugparens']:
            errprint("pbt: Decreasing parenlevel by 1 to %s" % parenlevel)
          leftmatches = leftmatches[:-should_pop_off]
        if parenlevel == 0:
          yield ''.join(strbuf)
          strbuf = []
    else:
      if string in left_match_chars:
        if throw_away > 0:
          wikiwarning("Throwing away left bracket %s as a reparse strategy"
              % string)
          throw_away -= 1
        else:
          parenlevel += 1
          if debug['debugparens']:
            errprint("pbt: Increasing parenlevel by 1 to %s" % parenlevel)
          leftmatches.append(string)
      if parenlevel > 0:
        strbuf.append(string)
      else:
        yield string
    prevstring = string
  leftover = ''.join(strbuf)
  if leftover:
    wikiwarning("Unmatched left paren, brace or bracket: %s characters remaining" % len(leftover))
    wikiwarning("Remaining text: [%s]" % bound_string_length(leftover))
    wikiwarning("Reparsing:")
    for string in parse_balanced_text(textre, leftover, throw_away = parenlevel):
      yield string

def parse_simple_balanced_text(text):
  '''Parse text in TEXT containing balanced expressions surrounded by single
or double braces or brackets.  This is a generator; it successively yields
chunks of text consisting either of sections without any braces or brackets,
or balanced expressions delimited by single or double braces or brackets, or
unmatched single or double right braces or brackets.'''
  return parse_balanced_text(simple_balanced_re, text)

#######################################################################
###                        Utility functions                        ###
#######################################################################

def splitprint(text):
  '''Print text (possibly Unicode) to the appropriate output, either stdout
or one of the split output files.'''
  uniprint(text, outfile=cur_output_file)

def outprint(text):
  '''Print text (possibly Unicode) to stdout (but stderr in certain debugging
modes).'''
  if debug_to_stderr:
    errprint(text)
  else:
    uniprint(text)

def wikiwarning(foo):
  warning("Article %s: %s" % (debug_cur_title, foo))

# Output a string of maximum length, adding ... if too long
def bound_string_length(str, maxlen=60):
  if len(str) <= maxlen:
    return str
  else:
    return '%s...' % str[0:maxlen]

def find_template_params(args, strip_values):
  '''Find the parameters specified in template arguments, i.e. the arguments
to a template that are of the form KEY=VAL.  Given the arguments ARGS of a
template, return a tuple (HASH, NONPARAM) where HASH is the hash table of
KEY->VAL parameter mappings and NONPARAM is a list of all the remaining,
non-parameter arguments.  If STRIP_VALUES is true, strip whitespace off the
beginning and ending of values in the hash table (keys will always be
lowercased and have the whitespace stripped from them).'''
  hash = {}
  nonparam_args = []
  for arg in args:
    m = re.match(r'(?s)(.*?)=(.*)', arg)
    if m:
      key = m.group(1).strip().lower()
      value = m.group(2)
      if strip_values:
        value = value.strip()
      hash[key] = value
    else:
      #errprint("Unable to process template argument %s" % arg)
      nonparam_args.append(arg) 
  return (hash, nonparam_args)
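
# For example (illustrative, using the infobox from the FIXME at the top):
#   find_template_params(['Infobox Australian Place', 'name = Lauderdale',
#                         'state = tas'], True)
#     => ({'name': 'Lauderdale', 'state': 'tas'},
#         ['Infobox Australian Place'])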

def get_macro_args(macro):
  '''Split macro MACRO (either a {{...}} or [[...]] expression)
by arguments (separated by | occurrences), but intelligently so that
arguments in nested macros are not also sectioned off.  In the case
of a template, i.e. {{...}}, the first "argument" returned will be
the template type, e.g. "Cite web" or "Coord".  At least one argument
will always be returned (in the case of an empty macro, it will be
the string "empty macro"), so that code that parses templates need
not worry about crashing on these syntactic errors.'''

  macroargs = [foo for foo in
              parse_balanced_text(balanced_pipe_re, macro[2:-2])
              if foo != '|']
  if not macroargs:
    wikiwarning("Strange macro with no arguments: %s" % macroargs)
    return ['empty macro']
  return macroargs
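
# For example:
#   get_macro_args("{{Coord|44.112|N|87.913|W}}")
#     => ['Coord', '44.112', 'N', '87.913', 'W']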

#######################################################################
#                         Process source text                         #
#######################################################################

# Handle the text of a given article.  Yield chunks of processed text.

class SourceTextHandler(object):
  def process_internal_link(self, text):
    yield text
    
  def process_template(self, text):
    yield text
    
  def process_table(self, text):
    yield text
    
  def process_external_link(self, text):
    yield text
    
  def process_reference(self, text):
    yield text
    
  def process_text_chunk(self, text):
    yield text

  def process_source_text(self, text):
    # Look for all template and link expressions in the text and do something
    # sensible with them.  Yield the resulting text chunks.  The idea is that
    # when the chunks are joined back together, we will get raw text that can
    # be directly separated into words, without any remaining macros (templates,
    # internal or external links, tables, etc.) and with as much extraneous
    # junk (directives of various sorts, instead of relevant text) as possible
    # filtered out.  Note that when we process macros and extract the relevant
    # text from them, we need to recursively process that text.
  
    if debug['lots']: errprint("Entering process_source_text: [%s]" % text)
  
    for foo in parse_simple_balanced_text(text):
      if debug['lots']: errprint("parse_simple_balanced_text yields: [%s]" % foo)
  
      if foo.startswith('[['):
        gen = self.process_internal_link(foo)
  
      elif foo.startswith('{{'):
        gen = self.process_template(foo)
  
      elif foo.startswith('{|'):
        gen = self.process_table(foo)
  
      elif foo.startswith('['):
        gen = self.process_external_link(foo)
  
      elif foo.startswith('<ref'):
        gen = self.process_reference(foo)
  
      else:
        gen = self.process_text_chunk(foo)
  
      for chunk in gen:
        if debug['lots']: errprint("process_source_text yields: [%s]" % chunk)
        yield chunk
  
# An article source-text handler that recursively processes text inside of
# macros.  Doesn't split templates, links or tables according to arguments
# or fields.

class RecursiveSourceTextHandler(SourceTextHandler):
  def process_internal_link(self, text):
    return self.process_source_text(text[2:-2])
    
  def process_template(self, text):
    return self.process_source_text(text[2:-2])
    
  def process_table(self, text):
    return self.process_source_text(text[2:-2])
    
  def process_external_link(self, text):
    return self.process_source_text(text[1:-1])
    
  def process_reference(self, text):
    return self.process_source_text(" " + text[5:-6] + " ")
    
#######################################################################
#                     Process text for coordinates                    #
#######################################################################

# Accumulate a table of all the templates with coordinates in them, along
# with counts.
templates_with_coords = intdict()

# Accumulate a table of all templates, with counts.
all_templates = intdict()

def safe_float_1(x):
  '''Subfunction of safe_float.  Return None if no number can be produced.'''
  if x is None:
    return None
  try:
    return float(x)
  except:
    x = x.strip()
    if x:
      wikiwarning("Expected number, saw %s" % x)
    return None

def safe_float(x, zero_on_error=False):
  '''Convert a string to floating point, but don't crash on errors;
instead, output a warning.  If 'zero_on_error', return 0 if no number could
be produced; otherwise, return None.'''
  ret = safe_float_1(x)
  if ret is None and zero_on_error: return 0.
  return ret

def get_german_style_coord(arg):
  '''Handle plain floating-point numbers as well as "German-style"
deg/min/sec/DIR indicators like 45/32/30/E.'''
  if arg is None:
    return None
  if ' ' in arg:
    arg = re.sub(' .*$', '', arg)
  if '/' in arg:
    m = re.match('([0-9.]+)/([0-9.]+)?/([0-9.]+)?/([NSEWnsew])', arg)
    if m:
      (deg, min, sec, offind) = m.groups()
      offind = offind.upper()
      if offind in convert_ns:
        off = convert_ns[offind]
      else:
        off = convert_ew[offind]
      return convert_dms(off, deg, min, sec)
    wikiwarning("Unrecognized DEG/MIN/SEC/HEMIS-style indicator: %s" % arg)
    return None
  else:
    return safe_float(arg)
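
# Examples (illustrative):
#   get_german_style_coord('45/32/30/E')    => 45 + 32/60. + 30/3600. ~= 45.542
#   get_german_style_coord('13.3166666667') => 13.3166666667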

def convert_dms(nsew, d, m, s):
  '''Convert a multiplier (1 for N or E, -1 for S or W) and degree/min/sec
values into a decimal +/- latitude or longitude.'''
  lat = get_german_style_coord(d)
  if lat is None:
    return None
  return nsew*(lat + safe_float(m, zero_on_error = True)/60. +
      safe_float(s, zero_on_error = True)/3600.)
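
# For example, the Lauderdale coordinate {{coord|42|54|40|S|...}} from the
# FIXME above gives convert_dms(-1, '42', '54', '40')
#   = -(42 + 54/60. + 40/3600.) ~= -42.911, i.e. 42°54'40" S as a decimal.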

convert_ns = {'N':1, 'S':-1}
convert_ew = {'E':1, 'W':-1, 'L':1, 'O':-1}

# Get the default value for the hemisphere, as a multiplier +1 or -1.
# We need to handle Australian places specially, as S latitude, E longitude.
# We need to handle Pittsburgh neighborhoods specially, as N latitude, W longitude.
# Otherwise assume +1, so that we leave the values alone.  This is important
# because some fields may specifically use signed values to indicate the
# hemisphere directly, or use other methods of indicating hemisphere (e.g.
# "German"-style "72/50/35/W").
def get_hemisphere(temptype, is_lat):
  if temptype.lower().startswith('infobox australia'):
    if is_lat: return -1
    else: return 1
  elif temptype.lower().startswith('infobox pittsburgh neighborhood'):
    if is_lat: return 1
    else: return -1
  else: return 1

# Get an argument (ARGSEARCH) by name from a hash table (ARGS).  Multiple
# synonymous names can be looked up by giving a list or tuple for ARGSEARCH.
# Other parameters control warning messages.
def getarg(argsearch, temptype, args, rawargs, warnifnot=True):
  if isinstance(argsearch, tuple) or isinstance(argsearch, list):
    for x in argsearch:
      val = args.get(x, None)
      if val is not None:
        return val
    if warnifnot:
      wikiwarning("None of params %s seen in template {{%s|%s}}" % (
        ','.join(argsearch), temptype, bound_string_length('|'.join(rawargs))))
  else:
    val = args.get(argsearch, None)
    if val is not None:
      return val
    if warnifnot:
      wikiwarning("Param %s not seen in template {{%s|%s}}" % (
        argsearch, temptype, bound_string_length('|'.join(rawargs))))
  return None

# Utility function for get_latd_coord().
# Extract out either latitude or longitude from a template of type
# TEMPTYPE with arguments ARGS.  LATD/LATM/LATS are lists or tuples of
# parameters to look up to retrieve the appropriate value. OFFPARAM is the
# list of possible parameters indicating the offset to the N, S, E or W.
# IS_LAT is True if a latitude is being extracted, False for longitude.
def get_lat_long_1(temptype, args, rawargs, latd, latm, lats, offparam, is_lat):
  d = getarg(latd, temptype, args, rawargs)
  m = getarg(latm, temptype, args, rawargs, warnifnot=False) 
  s = getarg(lats, temptype, args, rawargs, warnifnot=False)
  hemis = getarg(offparam, temptype, args, rawargs)
  if hemis is None:
    hemismult = get_hemisphere(temptype, is_lat)
  else:
    if is_lat:
      convert = convert_ns
    else:
      convert = convert_ew
    hemismult = convert.get(hemis, 0)
    if hemismult == 0:
      wikiwarning("%s for template type %s has bad value: [%s]" %
               (offparam, temptype, hemis))
  return convert_dms(hemismult, d, m, s)

latd_arguments = ('latd', 'latg', 'lat_d',
  'latdeg', 'lat_deg', 'lat_degrees', 'latitudedegrees',
  'latitudinegradi', 'latitudine_gradi', 'latitudine gradi',
  'latgradi',
  'latitudine_d',
  'latitudegraden',
  'breitengrad', 'breddegrad', 'bredde_grad')
def get_latd_coord(temptype, args, rawargs):
  '''Given a template of type TEMPTYPE with arguments ARGS (converted into
a hash table; also available in raw form as RAWARGS), assumed to have
a latitude/longitude specification in it using latd/lat_deg/etc. and
longd/lon_deg/etc., extract out and return a tuple of decimal
(latitude, longitude) values.'''
  lat = get_lat_long_1(temptype, args, rawargs,
      latd_arguments,
      ('latm', 'latmin', 'lat_min', 'lat_m', 'lat_minutes', 'latitudeminutes',
         'latitudineprimi', 'latitudine_primi', 'latitudine primi',
         'latprimi',
         'latitudineminuti', 'latitudine_minuti', 'latitudine minuti',
         'latminuti',
         'latitudine_m',
         'latitudeminuten',
         'breitenminute', 'bredde_min'),
      ('lats', 'latsec', 'lat_sec', 'lat_s', 'lat_seconds', 'latitudeseconds',
         'latitudinesecondi', 'latitudine_secondi', 'latitudine secondi',
         'latsecondi',
         'latitudine_s',
         'latitudeseconden',
         'breitensekunde'),
      ('latns', 'latp', 'lap', 'lat_dir', 'lat_direction',
         'latitudinens', 'latitudine_ns', 'latitudine ns'),
      is_lat=True)
  long = get_lat_long_1(temptype, args, rawargs,
      # Typos like Longtitude do occur in the Spanish Wikipedia at least
      ('longd', 'lond', 'longg', 'long',
         'londeg', 'lon_deg', 'long_d', 'long_degrees',
         'longitudinegradi', 'longitudine_gradi', 'longitudine gradi',
         'longgradi',
         'longitudine_d',
         'longitudedegrees', 'longtitudedegrees',
         'longitudegraden',
         u'längengrad', 'laengengrad', 'lengdegrad', u'længde_grad'),
      ('longm', 'lonm', 'lonmin', 'lon_min', 'long_m', 'long_minutes',
         'longitudineprimi', 'longitudine_primi', 'longitudine primi',
         'longprimi',
         'longitudineminuti', 'longitudine_minuti', 'longitudine minuti',
         'longminuti',
         'longitudine_m',
         'longitudeminutes', 'longtitudeminutes',
         'longitudeminuten',
         u'längenminute', u'længde_min'),
      ('longs', 'lons', 'lonsec', 'lon_sec', 'long_s', 'long_seconds',
         'longitudinesecondi', 'longitudine_secondi', 'longitudine secondi',
         'longsecondi',
         'longitudine_s',
         'longitudeseconds', 'longtitudeseconds',
         'longitudeseconden',
         u'längensekunde'),
      ('longew', 'longp', 'lonp', 'lon_dir', 'long_direction',
         'longitudineew', 'longitudine_ew', 'longitudine ew'),
      is_lat=False)
  return (lat, long)

def get_built_in_lat_long_1(temptype, args, rawargs, latd, latm, lats, mult):
  d = getarg(latd, temptype, args, rawargs)
  m = getarg(latm, temptype, args, rawargs, warnifnot=False)
  s = getarg(lats, temptype, args, rawargs, warnifnot=False)
  # MULT is the hemisphere multiplier (+1 for N/E, -1 for S/W).
  return convert_dms(mult, d, m, s)

# One-element tuples of synonyms (note the trailing commas).
built_in_latd_north_arguments = ('stopnin',)
built_in_latd_south_arguments = ('stopnis',)
built_in_longd_north_arguments = ('stopnie',)
built_in_longd_south_arguments = ('stopniw',)

def get_built_in_lat_coord(temptype, args, rawargs):
  '''Given a template of type TEMPTYPE with arguments ARGS (converted into
a hash table; also available in raw form as RAWARGS), assumed to have
a latitude/longitude specification in it using stopniN/etc. (where the
direction NSEW is built into the argument name), extract out and return a
tuple of decimal (latitude, longitude) values.'''
  if getarg(built_in_latd_north_arguments, temptype, args, rawargs,
            warnifnot=False) is not None:
    mult = 1
  elif getarg(built_in_latd_south_arguments, temptype, args, rawargs,
              warnifnot=False) is not None:
    mult = -1
  else:
    wikiwarning("Didn't see any appropriate stopniN/stopniS param")
    mult = 1 # Arbitrarily set to N, probably accurate in Poland
  lat = get_built_in_lat_long_1(temptype, args, rawargs,
      ('stopnin', 'stopnis'),
      ('minutn', 'minuts'),
      ('sekundn', 'sekunds'),
      mult)
  if getarg(built_in_longd_north_arguments, temptype, args, rawargs,
            warnifnot=False) is not None:
    mult = 1
  elif getarg(built_in_longd_south_arguments, temptype, args, rawargs,
              warnifnot=False) is not None:
    mult = -1
  else:
    wikiwarning("Didn't see any appropriate stopniE/stopniW param")
    mult = 1 # Arbitrarily set to E, probably accurate in Poland
  long = get_built_in_lat_long_1(temptype, args, rawargs,
      ('stopnie', 'stopniw'),
      ('minute', 'minutw'),
      ('sekunde', 'sekundw'),
      mult)
  return (lat, long)

latitude_arguments = ('latitude', 'latitud', 'latitudine',
    # NOTE: We want to prefer breitengrad over breite because islands may
    # have both, with breite simply specifying the width while breitengrad
    # specifies the latitude.  But sometimes breitengrad occurs with
    # breitenminute, so we list it in the latd arguments as well, which
    # we check first.
    'breitengrad', 'breite',
    #'lat' # Appears in non-article coordinates
    #'lat_dec' # Appears to be associated with non-Earth coordinates
    )
longitude_arguments = ('longitude', 'longitud', 'longitudine',
    u'längengrad', u'laengengrad', u'länge', u'laenge'
    #'long' # Appears in non-article coordinates
    #'long_dec' # Appears to be associated with non-Earth coordinates
    )

def get_latitude_coord(temptype, args, rawargs):
  '''Given a template of type TEMPTYPE with arguments ARGS, assumed to have
a latitude/longitude specification in it, extract out and return a tuple of
decimal (latitude, longitude) values.'''
  # German-style (e.g. 72/53/15/E) also occurs with 'latitude' and such,
  # so just check for it everywhere.
  lat = get_german_style_coord(getarg(latitude_arguments,
    temptype, args, rawargs))
  long = get_german_style_coord(getarg(longitude_arguments,
    temptype, args, rawargs))
  return (lat, long)

# Utility function for get_coord().  Extract out the latitude or longitude
# values out of a Coord structure.  Return a tuple (OFFSET, VAL) for decimal
# latitude or longitude VAL and OFFSET indicating the offset of the next
# argument after the arguments used to produce the value.
def get_coord_1(args, nsew, convert_nsew):
  if args[1] in nsew:
    d = args[0]; m = 0; s = 0; i = 1
  elif args[2] in nsew:
    d = args[0]; m = args[1]; s = 0; i = 2
  elif args[3] in nsew:
    d = args[0]; m = args[1]; s = args[2]; i = 3
  else: return (1, args[0])
  return (i+1, convert_dms(convert_nsew[args[i]], d, m, s))
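
# For instance, for {{Coord|44|6.72|N|87|54.78|W}} the filtered arguments are
# ['44', '6.72', 'N', '87', '54.78', 'W', '', ...]; args[2] is 'N', so the
# call returns (3, convert_dms(1, '44', '6.72', 0)), and the longitude parse
# then starts at args[3:], i.e. at '87'.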

# FIXME!  To be more accurate, we need to look at the template parameters,
# which, despite the claim below, ARE quite interesting.  In fact, if the
# parameter 'display=title' is seen (or variant like 'display=inline,title'),
# then we have *THE* correct coordinate for the article.  So we need to
# return this fact if known, as an additional argument.  See comments
# below at extract_coordinates_from_article().

def get_coord(temptype, args):
  '''Parse a Coord template and return a tuple (lat,long) for latitude and
longitude.  TEMPTYPE is the template name.  ARGS is the raw arguments for
the template.  Coord templates are one of four types:

{{Coord|44.112|-87.913}}
{{Coord|44.112|N|87.913|W}}
{{Coord|44|6.72|N|87|54.78|W}}
{{Coord|44|6|43.2|N|87|54|46.8|W}}

Note that all four of the above are equivalent.

In addition, extra "template" or "coordinate" parameters can be given.
The template parameters mostly control display and are basically
uninteresting.  (FIXME: Not true, see above.) However, the coordinate
parameters contain lots of potentially useful information that can be
used as features or whatever.  See
http://en.wikipedia.org/wiki/Template:Coord for more information.

The types of coordinate parameters are:

type: country, city, city(###) where ### is the population, isle, river, etc.
      Very useful feature; can also be used to filter uninteresting info as
      some articles will have multiple coordinates in them.
scale: indicates the map scale (note that type: also specifies a default scale)
dim: diameter of viewing circle centered on coordinate (gives some sense of
     how big the feature is)
region: the "political region for terrestrial coordinates", i.e. the country
        the coordinate is in, as a two-letter ISO 3166-1 alpha-2 code, or the
        country plus next-level subdivision (state, province, etc.)
globe: which planet or satellite the coordinate is on (esp. if not the Earth)
'''
  if debug['some']: errprint("Passed in args %s" % args)
  # Filter out optional "template arguments", add a bunch of blank arguments
  # at the end to make sure we don't get out-of-bounds errors in
  # get_coord_1()
  filtargs = [x for x in args if '=' not in x]
  if filtargs:
    filtargs += ['','','','','','']
    (i, lat) = get_coord_1(filtargs, ('N','S'), convert_ns)
    (_, long) = get_coord_1(filtargs[i:], ('E','W'), convert_ew)
    return (lat, long)
  else:
    (paramshash, _) = find_template_params(args, True)
    lat = paramshash.get('lat', None) or paramshash.get('latitude', None)
    long = paramshash.get('long', None) or paramshash.get('longitude', None)
    if lat is None or long is None:
      wikiwarning("Can't find latitude/longitude in {{%s|%s}}" %
              (temptype, '|'.join(args)))
    lat = safe_float(lat)
    long = safe_float(long)
    return (lat, long)

def get_coordinate_coord(temptype, rawargs):
  '''Parse a Coordinate template and return a tuple (lat,long) for latitude and
longitude.  TEMPTYPE is the template name.  ARGS is the raw arguments for
the template.  These templates tend to occur in the German Wikipedia. Examples:

{{Coordinate|text=DMS|article=DMS|NS=51.50939|EW=-0.11832|type=city|pop=7825200|region=GB-LND}}
{{Coordinate|article=/|NS=41/00/00/N|EW=16/43/00/E|type=adm1st|region=IT-23}}
{{Coordinate|NS=51/14/08.16/N|EW=6/48/37.43/E|text=DMS|name=Bronzetafel – Mittelpunkt Düsseldorfs|type=landmark|dim=50|region=DE-NW}}
{{Coordinate|NS=46.421401 &lt;!-- {{subst:CH1903-WGS84|777.367|143.725||koor=B }} --&gt;|EW=9.746124 &lt;!-- {{subst:CH1903-WGS84|777.367|143.725||koor=L }} --&gt;|region=CH-GR|text=DMS|type=isle|dim=500|name=Chaviolas}}
'''
  if debug['some']: errprint("Passed in args %s" % rawargs)
  (paramshash, _) = find_template_params(rawargs, True)
  lat = get_german_style_coord(getarg('ns', temptype, paramshash, rawargs))
  long = get_german_style_coord(getarg('ew', temptype, paramshash, rawargs))
  return (lat, long)

def get_coord_params(temptype, args):
  '''Parse a Coord template and return a list of tuples of coordinate
parameters (see comment under get_coord).'''
  if debug['some']: errprint("Passed in args %s" % args)
  # Filter out optional "template arguments"
  filtargs = [x for x in args if '=' not in x]
  if filtargs and ':' in filtargs[-1]:
    coord_params = [tuple(x.split(':')) for x in filtargs[-1].split('_')]
    return coord_params
  else:
    return []
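
# For example, for {{Coord|44.112|N|87.913|W|type:city_region:US-WI}} the
# trailing argument parses into [('type', 'city'), ('region', 'US-WI')].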

def get_geocoordenadas_coord(temptype, args):
  '''Parse a geocoordenadas template (common in the Portuguese Wikipedia) and
return a tuple (lat,long) for latitude and longitude.  TEMPTYPE is the
template name.  ARGS is the raw arguments for the template.  Typical example
is:

{{geocoordenadas|39_15_34_N_24_57_9_E_type:waterbody|39° 15′ 34&quot; N, 24° 57′ 9&quot; O}}
'''
  if debug['some']: errprint("Passed in args %s" % args)
  # The whole coordinate spec is packed into the first argument, with
  # fields separated by underscores.
  if len(args) == 0:
    wikiwarning("No arguments to template 'geocoordenadas'")
    return (None, None)
  else:
    # Yes, every one of the following problems occurs: Extra spaces; commas
    # used instead of periods; lowercase nsew; use of O (Oeste) for "West",
    # "L" (Leste) for "East"
    arg = args[0].upper().strip().replace(',','.')
    m = re.match(r'([0-9.]+)(?:_([0-9.]+))?(?:_([0-9.]+))?_([NS])_([0-9.]+)(?:_([0-9.]+))?(?:_([0-9.]+))?_([EWOL])(?:_.*)?$', arg)
    if not m:
      wikiwarning("Unrecognized argument %s to template 'geocoordenadas'" %
          args[0])
      return (None, None)
    else:
      (latd, latm, lats, latns, longd, longm, longs, longew) = \
          m.groups()
      return (convert_dms(convert_ns[latns], latd, latm, lats),
              convert_dms(convert_ew[longew], longd, longm, longs))

class ExtractCoordinatesFromSource(RecursiveSourceTextHandler):
  '''Given the article text TEXT of an article (in general, after first-
stage processing), extract coordinates out of templates that have coordinates
in them (Infobox, Coord, etc.).  Record each coordinate into COORD.

We recursively process text inside of templates and links (hence the
RecursiveSourceTextHandler base class), since coordinate templates may be
nested inside other templates (see the FIXME at the top of this file).

See process_article_text() for a description of the formatting that is
applied to the text before being sent here.'''

  def __init__(self):
    self.coords = []

  def process_template(self, text):
    # Look for a Coord, Infobox, etc. template that may have coordinates in it
    lat = long = None
    if debug['some']: errprint("Enter process_template: [%s]" % text)
    tempargs = get_macro_args(text)
    temptype = tempargs[0].strip()
    if debug['some']: errprint("Template type: %s" % temptype)
    lowertemp = temptype.lower()
    rawargs = tempargs[1:]
    # Look for a coordinate template
    if lowertemp in ('coord', 'coordp', 'coords',
                     'koord', #Norwegian
                     'coor', 'coor d', 'coor dm', 'coor dms',
                     'coor title d', 'coor title dm', 'coor title dms',
                     'coor dec', 'coorheader') \
        or lowertemp.startswith('geolinks') \
        or lowertemp.startswith('mapit') \
        or lowertemp.startswith('koordynaty'): # Coordinates in Polish:
      (lat, long) = get_coord(temptype, rawargs)
    elif lowertemp == 'coordinate':
      (lat, long) = get_coordinate_coord(temptype, rawargs)
    elif lowertemp in ('geocoordenadas', u'coördinaten'):
      # geocoordenadas is Portuguese, coördinaten is Dutch, and they work
      # the same way
      (lat, long) = get_geocoordenadas_coord(temptype, rawargs)
    else:
      # Look for any other template with a 'latd' or 'latitude' parameter.
      # Usually these will be Infobox-type templates.  Possibly we should only
      # look at templates whose lowercased name begins with "infobox".
      (paramshash, _) = find_template_params(rawargs, True)
      if getarg(latd_arguments, temptype, paramshash, rawargs, warnifnot=False) is not None:
        #errprint("seen: [%s] in {{%s|%s}}" % (getarg(latd_arguments, temptype, paramshash, rawargs), temptype, rawargs))
        templates_with_coords[lowertemp] += 1
        (lat, long) = get_latd_coord(temptype, paramshash, rawargs)
      # NOTE: DO NOT CHANGE ORDER.  We want to check latd first and check
      # latitude afterwards for various reasons (e.g. so that cases where
      # breitengrad and breitenminute occur get found).  FIXME: Maybe we
      # don't need get_latitude_coord at all, but get_latd_coord will
      # suffice.
      elif getarg(latitude_arguments, temptype, paramshash, rawargs, warnifnot=False) is not None:
        #errprint("seen: [%s] in {{%s|%s}}" % (getarg(latitude_arguments, temptype, paramshash, rawargs), temptype, rawargs))
        templates_with_coords[lowertemp] += 1
        (lat, long) = get_latitude_coord(temptype, paramshash, rawargs)
      elif (getarg(built_in_latd_north_arguments, temptype, paramshash,
                   rawargs, warnifnot=False) is not None or
            getarg(built_in_latd_south_arguments, temptype, paramshash,
                   rawargs, warnifnot=False) is not None):
        #errprint("seen: [%s] in {{%s|%s}}" % (getarg(built_in_latd_north_arguments, temptype, paramshash, rawargs), temptype, rawargs))
        #errprint("seen: [%s] in {{%s|%s}}" % (getarg(built_in_latd_south_arguments, temptype, paramshash, rawargs), temptype, rawargs))
        templates_with_coords[lowertemp] += 1
        (lat, long) = get_built_in_lat_coord(temptype, paramshash, rawargs)

    if debug['some']: errprint("Saw coordinate %s,%s in template type %s" %
              (lat, long, temptype))
    if lat is None and long is not None:
      errprint("Saw longitude %s but no latitude in template: %s" %
          (long, bound_string_length(text)))
    if long is None and lat is not None:
      errprint("Saw latitude %s but no longitude in template: %s" %
          (lat, bound_string_length(text)))
    if lat is not None and long is not None:
      self.coords.append((lowertemp,lat,long))
    # Recursively process the text inside the template in case there are
    # coordinates in it.
    return self.process_source_text(text[2:-2])
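
# Illustrative example (a sketch; 'latd' etc. assumed to be among
# latd_arguments): an Infobox such as
#   {{Infobox settlement|name=Springfield|latd=39|latm=48|latNS=N
#    |longd=89|longm=39|longEW=W}}
# would fall through to the latd check above and be handed to
# get_latd_coord().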

#category_types = [
#    ['neighbourhoods', 'neighborhood'],
#    ['neighborhoods', 'neighborhood'],
#    ['mountains', 'mountain'],
#    ['stations', ('landmark', 'railwaystation')],
#    ['rivers', 'river'],
#    ['islands', 'isle'],
#    ['counties', 'adm2nd'],
#    ['parishes', 'adm2nd'],
#    ['municipalities', 'city'],
#    ['communities', 'city'],
#    ['towns', 'city'],
#    ['villages', 'city'],
#    ['hamlets', 'city'],
#    ['communes', 'city'],
#    ['suburbs', 'city'],
#    ['universities', 'edu'],
#    ['colleges', 'edu'],
#    ['schools', 'edu'],
#    ['educational institutions', 'edu'],
#    ['reserves', '?'],
#    ['buildings', '?'],
#    ['structures', '?'],
#    ['landfills', '?'],
#    ['streets', '?'],
#    ['museums', '?'],
#    ['galleries', '?'],
#    ['organizations', '?'],
#    ['groups', '?'],
#    ['lighthouses', '?'],
#    ['attractions', '?'],
#    ['border crossings', '?'],
#    ['forts', '?'],
#    ['parks', '?'],
#    ['townships', '?'],
#    ['cathedrals', '?'],
#    ['skyscrapers', '?'],
#    ['waterfalls', '?'],
#    ['caves', '?'],
#    ['beaches', '?'],
#    ['cemeteries'],
#    ['prisons'],
#    ['territories'],
#    ['states'],
#    ['countries'],
#    ['dominions'],
#    ['airports', 'airport'],
#    ['bridges'],
#    ]


class ExtractLocationTypeFromSource(RecursiveSourceTextHandler):
  '''Given the article text TEXT of an article (in general, after first-
stage processing), extract info about the type of location (if any).
Record info found in 'loctype'.'''

  def __init__(self):
    self.loctype = []
    self.categories = []

  def process_internal_link(self, text):
    tempargs = get_macro_args(text)
    arg0 = tempargs[0].strip()
    if arg0.startswith('Category:'):
      self.categories += [arg0[9:].strip()]
    return self.process_source_text(text[2:-2])

  def process_template(self, text):
    # Look for a Coord, Infobox, etc. template that may have coordinates in it
    lat = long = None
    tempargs = get_macro_args(text)
    temptype = tempargs[0].strip()
    lowertemp = temptype.lower()
    # Look for a coordinate template
    if lowertemp in ('coord', 'coor d', 'coor dm', 'coor dms',
                     'coor dec', 'coorheader') \
        or lowertemp.startswith('geolinks') \
        or lowertemp.startswith('mapit'):
      params = get_coord_params(temptype, tempargs[1:])
      if params:
        self.loctype += [['coord-params', params]]
    else:
      (paramshash, _) = find_template_params(tempargs[1:], True)
      if lowertemp == 'infobox settlement':
        params = []
        for x in ['settlement_type',
                  'subdivision_type', 'subdivision_type1', 'subdivision_type2',
                  'subdivision_name', 'subdivision_name1', 'subdivision_name2',
                  'coordinates_type', 'coordinates_region']:
          val = paramshash.get(x, None)
          if val:
            params += [(x, val)]
        self.loctype += [['infobox-settlement', params]]
      elif ('latd' in paramshash or 'lat_deg' in paramshash or
          'latitude' in paramshash):
        self.loctype += \
            [['other-template-with-coord', [('template', temptype)]]]
    # Recursively process the text inside the template in case there are
    # coordinates in it.
    return self.process_source_text(text[2:-2])

#######################################################################
#                         Process text for words                      #
#######################################################################

# For a "macro" (e.g. internal link or template) with arguments, and
# a generator that returns the interesting arguments separately, process
# each of these arguments into chunks, join the chunks of an argument back
# together, and join the processed arguments, with spaces separating them.
# The idea is that for something like
#
#   The [[latent variable|hidden node]]s are ...
#
# We will ultimately get something like
#
#   The latent variable hidden nodes are ...
#
# after joining chunks. (Even better would be to correctly handle something
# like
#
#    The sub[[latent variable|node]]s are ...
#
# into
#
#    The latent variable subnodes are ...
#
# But that's a major hassle, and such occurrences should be rare.)

# Process an internal link into separate chunks for each interesting
# argument.  Yield the chunks.  They will be recursively processed, and
# joined by spaces.
def yield_internal_link_args(text):
  tempargs = get_macro_args(text)
  m = re.match(r'(?s)\s*([a-zA-Z0-9_]+)\s*:(.*)', tempargs[0])
  if m:
    # Something like [[Image:...]] or [[wikt:...]] or [[fr:...]]
    namespace = m.group(1).lower()
    if namespace in ('image', 'file'):
      # For image links, filter out non-interesting args
      for arg in tempargs[1:]:
        # Ignore uninteresting args
        if re.match(r'thumb|left|(up)?right|[0-9]+(\s*px)?$', arg.strip()): pass
        # For alt text, ignore the alt= but use the rest
        else:
          # Look for parameter spec
          m = re.match(r'(?s)\s*([a-zA-Z0-9_]+)\s*=(.*)', arg)
          if m:
            (param, value) = m.groups()
            if param.lower() == 'alt':
              yield value
            # Skip other parameters
          # Use non-parameter args
          else: yield arg
    elif len(namespace) == 2 or len(namespace) == 3 or namespace == 'simple':
      # A link to the equivalent page in another language; foreign words
      # probably won't help for word matching.  However, this might be
      # useful in some other way.
      pass
    else:
      # Probably either a category or wikt (wiktionary).
      # The category is probably useful; the wiktionary entry maybe.
      # In both cases, go ahead and use.
      link = m.group(2)
      # Skip "Appendix:" in "wikt:Appendix"
      m = re.match(r'(?s)\s*[Aa]ppendix\s*:(.*)', link)
      if m: yield m.group(1)
      else: yield link
      for arg in tempargs[1:]: yield arg
  else:
    # For a textual internal link, use all arguments, unless --raw-text is given
    if Opts.raw_text:
      yield tempargs[-1]
    else:
      for chunk in tempargs: yield chunk
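
# Illustrative example (a sketch, not executed): for the link
#   [[File:Eiffel tower.jpg|thumb|200px|alt=Tower at night|The Eiffel Tower]]
# yield_internal_link_args() yields 'Tower at night' (the alt text) and
# 'The Eiffel Tower' (the caption), skipping 'thumb' and '200px'.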

# Process a template into separate chunks for each interesting
# argument.  Yield the chunks.  They will be recursively processed, and
# joined by spaces.
def yield_template_args(text):
  # For a template, do something smart depending on the template.
  if debug['lots']: errprint("yield_template_args called with: %s" % text)

  # OK, this is a hack, but a useful one.  There are lots of templates that
  # look like {{Emancipation Proclamation draft}} or
  # {{Time measurement and standards}} or similar that are useful as words.
  # So we look for templates without arguments that look like this.
  # Note that we require the first word to have at least two letters, so
  # we filter out things like {{R from related word}} or similar redirection-
  # related indicators.  Note that similar-looking templates that begin with
  # a lowercase letter are sometimes useful like {{aviation lists}} or
  # {{global warming}} but often are non-useful things like {{de icon}} or
  # {{nowrap begin}} or {{other uses}}.  Potentially we could be smarter
  # about this.
  if re.match(r'{{[A-Z][a-z]+ [A-Za-z ]+}}$', text):
    yield text[2:-2]
    return

  tempargs = get_macro_args(text)
  if debug['lots']: errprint("template args: %s" % tempargs)
  temptype = tempargs[0].strip().lower()

  if debug['some']:
    all_templates[temptype] += 1

  # Extract the parameter and non-parameter arguments.
  (paramhash, nonparam) = find_template_params(tempargs[1:], False)
  #errprint("params: %s" % paramhash)
  #errprint("nonparam: %s" % nonparam)

  # For certain known template types, use the values from the interesting
  # parameter args and ignore the others.  For other template types,
  # assume the parameters are uninteresting.
  if re.match(r'v?cite', temptype):
    # A citation, a very common type of template.
    for (key,value) in paramhash.items():
      # A fairly arbitrary list of "interesting" parameters.
      if re.match(r'(last|first|authorlink)[1-9]?$', key) or \
         re.match(r'(author|editor)[1-9]?-(last|first|link)$', key) or \
         key in ('coauthors', 'others', 'title', 'trans_title',
                 'quote', 'work', 'contribution', 'chapter', 'trans_chapter',
                 'series', 'volume'):
        yield value
  elif re.match(r'infobox', temptype):
    # Handle Infoboxes.
    for (key,value) in paramhash.items():
      # A fairly arbitrary list of "interesting" parameters.
      if key in ('name', 'fullname', 'nickname', 'altname', 'former',
                 'alt', 'caption', 'description', 'title', 'title_orig',
                 'image_caption', 'imagecaption', 'map_caption', 'mapcaption',
                 # Associated with states, etc.
                 'motto', 'mottoenglish', 'slogan', 'demonym', 'capital',
                 # Add more here
                 ):
        yield value
  elif re.match(r'coord', temptype):
    return

  # For other template types, ignore all parameters and yield the
  # remaining arguments.
  # Yield any non-parameter arguments.
  for arg in nonparam:
    yield arg
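
# Illustrative example (a sketch, not executed): for
#   {{cite book|last=Smith|first=John|title=Some Title|year=2001|page=12}}
# yield_template_args() yields 'Smith', 'John' and 'Some Title' (in some
# order), since 'year' and 'page' are not "interesting" parameters.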

# Process a table into separate chunks.  Unlike code for processing
# internal links, the chunks should have whitespace added where necessary.
def yield_table_chunks(text):
  if debug['lots']: errprint("Entering yield_table_chunks: [%s]" % text)

  # Given a single line or part of a line, and an indication (ATSTART) of
  # whether we just saw a beginning-of-line separator, split on within-line
  # separators (|| or !!) and remove table directives that can occur at
  # the beginning of a field (terminated by a |).  Yield the resulting
  # arguments as chunks.
  def process_table_chunk_1(text, atstart):
    for arg in re.split(r'(?:\|\||!!)', text):
      if atstart:
        m = re.match('(?s)[^|]*\|(.*)', arg)
        if m:
          yield m.group(1) + ' '
          continue
      yield arg
      atstart = True

  # Just a wrapper function around process_table_chunk_1() for logging
  # purposes.
  def process_table_chunk(text, atstart):
    if debug['lots']: errprint("Entering process_table_chunk: [%s], %s" % (text, atstart))
    for chunk in process_table_chunk_1(text, atstart):
      if debug['lots']: errprint("process_table_chunk yields: [%s]" % chunk)
      yield chunk

  # Strip off {| and |}
  text = text[2:-2]
  ignore_text = True
  at_line_beg = False

  # Loop over balanced chunks, breaking top-level text at newlines.
  # Strip out notations like | and |- that separate fields, and strip out
  # table directives (e.g. which occur after |-).  Pass the remainder to
  # process_table_chunk(), which will split a line on within-line separators
  # (e.g. || or !!) and strip out directives.
  for arg in parse_balanced_text(balanced_table_re, text):
    if debug['lots']: errprint("parse_balanced_text(balanced_table_re) yields: [%s]" % arg)
    # If we see a newline, reset the flags and yield the newline.  This way,
    # a whitespace will always be inserted.
    if arg == '\n':
      ignore_text = False
      at_line_beg = True
      yield arg
      # Don't also process the newline as field text below.
      continue
    if at_line_beg:
      if arg.startswith('|-'):
        ignore_text = True
        continue
      elif arg.startswith('|') or arg.startswith('!'):
        arg = arg[1:]
        if arg and arg[0] == '+': arg = arg[1:]
        # The chunks returned here are separate fields.  Make sure whitespace
        # separates them.
        yield ' '.join(process_table_chunk(arg, True))
        continue
    elif ignore_text: continue
    # Add whitespace between fields, as above.
    yield ' '.join(process_table_chunk(arg, False))
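
# Illustrative example of the table markup handled above (a sketch):
#   {| class="wikitable"
#   |-
#   ! Header 1 !! Header 2
#   |-
#   | Cell 1 || Cell 2
#   |}
# The {| |} |- directives and leading | or ! markers are stripped, || and !!
# split fields within a line, and the field contents ('Header 1', 'Header 2',
# 'Cell 1', 'Cell 2') are yielded as whitespace-separated chunks.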

# Given raw text, split it into words, filtering out punctuation, and
# yield the words.  Also ignore words with a colon in the middle, indicating
# likely URL's and similar directives.
def split_text_into_words(text):
  (text, _) = re.subn(left_ref_re, r' ', text)
  if Opts.no_tokenize:
    # No tokenization requested.  Just split on whitespace.  But still try
    # to eliminate URL's.  Rather than just look for :, we look for :/, which
    # URL's are likely to contain.  Possibly we should look for a colon in
    # the middle of a word, which is effectively what the checks down below
    # do (or modify those checks to look for :/).
    for word in re.split('\s+', text):
      if ':/' not in word:
        yield word
  elif Opts.raw_text:
    # This regexp splits on whitespace, but also handles the following cases:
    # 1. Any of , ; . etc. at the end of a word
    # 2. Parens or quotes in words like (foo) or "bar"
    off = 0
    for word in re.split(r'([,;."):]*)\s+([("]*)', text):
      if (off % 3) != 0:
        for c in word:
          yield c
      else:
        # Sometimes URL's or other junk slips through.  Much of this junk has
        # a colon in it and little useful stuff does.
        if ':' not in word:
          # Handle things like "Two-port_network#ABCD-parameters".  Do this after
          # filtering for : so URL's don't get split up.
          for word2 in re.split('[#_]', word):
            if word2: yield word2
      off += 1
  else:
    # This regexp splits on whitespace, but also handles the following cases:
    # 1. Any of , ; . etc. at the end of a word
    # 2. Parens or quotes in words like (foo) or "bar"
    for word in re.split(r'[,;."):]*\s+[("]*', text):
      # Sometimes URL's or other junk slips through.  Much of this junk has
      # a colon in it and little useful stuff does.
      if ':' not in word:
        # Handle things like "Two-port_network#ABCD-parameters".  Do this after
        # filtering for : so URL's don't get split up.
        for word2 in re.split('[#_]', word):
          if word2: yield word2
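
# Illustrative example (a sketch, assuming neither --no-tokenize nor
# --raw-text is given):
#   list(split_text_into_words('See the "Two-port_network#ABCD-parameters" page'))
# produces ['See', 'the', 'Two-port', 'network', 'ABCD-parameters', 'page'],
# while a chunk like 'http://example.com' is dropped because of the colon.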

# Extract "useful" text (generally, text that will be seen by the user,
# or hidden text of similar quality) and yield up chunks.

class ExtractUsefulText(SourceTextHandler):
  def process_and_join_arguments(self, args_of_macro):
    return ' '.join(''.join(self.process_source_text(chunk))
                    for chunk in args_of_macro)

  def process_internal_link(self, text):
    '''Process an internal link into chunks of raw text and yield them.'''
    # Find the interesting arguments of an internal link and join
    # with spaces.
    yield self.process_and_join_arguments(yield_internal_link_args(text))
  
  def process_template(self, text):
    '''Process a template into chunks of raw text and yield them.'''
    # Find the interesting arguments of a template and join with spaces.
    yield self.process_and_join_arguments(yield_template_args(text))
  
  def process_table(self, text):
    '''Process a table into chunks of raw text and yield them.'''
    for bar in yield_table_chunks(text):
      if debug['lots']: errprint("process_table yields: [%s]" % bar)
      for baz in self.process_source_text(bar):
        yield baz
  
  def process_external_link(self, text):
    '''Process an external link into chunks of raw text and yield them.'''
    # For an external link, use the anchor text of the link, if any
    splitlink = re.split(r'\s+', text[1:-1], 1)
    if len(splitlink) == 2:
      (link, linktext) = splitlink
      for chunk in self.process_source_text(linktext):
        yield chunk
  
  def process_reference(self, text):
    return self.process_source_text(" " + text[5:-6] + " ")
    
#######################################################################
#               Formatting text to make processing easier             #
#######################################################################

# Process the text in various ways in preparation for extracting data
# from the text.
def format_text_first_pass(text):
  # Remove all comments from the text; may contain malformed stuff of
  # various sorts, and generally stuff we don't want to index
  (text, _) = re.subn(r'(?s)<!--.*?-->', '', text)

  # Get rid of all text inside of <math>...</math>, which is in a different
  # format (TeX), and mostly non-useful.
  (text, _) = re.subn(r'(?s)<math>.*?</math>', '', text)

  # Try getting rid of everything in a reference
  #(text, _) = re.subn(r'(?s)<ref.*?>.*?</ref>', '', text)
  #(text, _) = re.subn(r'(?s)<ref[^<>/]*?/>', '', text)

  # Convert occurrences of &nbsp; and &ndash; and similar, which occur often
  # (note that SAX itself should handle entities like this; occurrences that
  # remain must have had the ampersand converted to &amp;)
  (text, _) = re.subn(r'&nbsp;', ' ', text)
  (text, _) = re.subn(r'&thinsp;', ' ', text)
  (text, _) = re.subn(r'&[nm]dash;', '-', text)
  (text, _) = re.subn(r'&minus;', '-', text)
  (text, _) = re.subn(r'&amp;', '&', text)
  (text, _) = re.subn(r'&times;', '*', text)
  (text, _) = re.subn(r'&hellip;', '...', text)
  (text, _) = re.subn(r'&lt;', '<', text)
  (text, _) = re.subn(r'&gt;', '>', text)
  #(text, _) = re.subn(r'&#91;', '[', text)
  #(text, _) = re.subn(r'&#93;', ']', text)

  return text
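
# Illustrative example (a sketch, not executed):
#   format_text_first_pass('Tom &amp; Jerry<!-- a comment --> <math>x^2</math>')
# returns 'Tom & Jerry ', with the comment and math stripped and the
# &amp; entity decoded.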

# Process the text in various ways in preparation for extracting
# the words from the text.
def format_text_second_pass(text):
  # Convert breaks into newlines
  (text, _) = re.subn(r'<br\s*/?>', r'\n', text)

  # Remove references, but convert to whitespace to avoid concatenating
  # words outside and inside a reference together
  #(text, _) = re.subn(r'(?s)<ref.*?>', ' ', text)

  # An alternative approach.
  # Convert references to simple tags.
  (text, _) = re.subn(r'(?s)<ref[^<>]*?/>', ' ', text)
  (text, _) = re.subn(r'(?s)<ref.*?>', '< ref>', text)
  (text, _) = re.subn(r'(?s)</ref.*?>', '< /ref>', text)

  # Similar for nowiki, which may have <'s, brackets and such inside.
  (text, _) = re.subn(r'(?s)<nowiki>.*?</nowiki>', ' ', text)

  # Another hack: Inside of <gallery>...</gallery>, there are raw filenames.
  # Get rid of them, keeping any caption text.

  def process_gallery(text):
    # Split on gallery blocks (FIXME, recursion not handled).  Putting a
    # group around the split text ensures we get it returned along with the
    # other text.
    chunks = re.split(r'(?s)(<gallery.*?>.*?</gallery>)', text)
    for chunk in chunks:
      # If a gallery, extract the stuff inside ...
      m = re.match(r'^(?s)<gallery.*?>(.*?)</gallery>$', chunk)
      if m:
        chunk = m.group(1)
        # ... then remove files and images, but keep any text after |
        (chunk, _) = re.subn(r'(?m)^(?:File|Image):[^|\n]*$', '', chunk)
        (chunk, _) = re.subn(r'(?m)^(?:File|Image):[^|\n]*\|(.*)$',
                             r'\1', chunk)
      yield chunk
  
  text = ''.join(process_gallery(text))
  
  # Remove remaining HTML codes from the text
  (text, _) = re.subn(r'(?s)<[A-Za-z/].*?>', '', text)

  (text, _) = re.subn(r'< (/?ref)>', r'<\1>', text)

  # Remove multiple sequences of quotes (indicating boldface or italics)
  (text, _) = re.subn(r"''+", '', text)
 
  # Remove beginning-of-line markers indicating indentation, lists, headers,
  # etc.
  (text, _) = re.subn(r"(?m)^[*#:]+", '', text)

  # Remove end-of-line markers indicating headers (e.g. ===Introduction===)
  (text, _) = re.subn(r"(?m)^=+(.*?)=+$", r'\1', text)

  return text
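
# Illustrative example (a sketch, not executed):
#   format_text_second_pass("===History===\n* The '''big''' event")
# returns 'History\n The big event': the header markers and list bullet are
# stripped and the bold quotes removed.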

#######################################################################
#                           Article handlers                          #
#######################################################################



### Default handler class for processing article text.  Subclass this to
### implement your own handlers.
class ArticleHandler(object):
  def __init__(self):
    self.title = None
    self.id = None

  redirect_commands = "|".join([
      # English, etc.
      'redirect', 'redirect to',
      # Italian (IT)
      'rinvia', 'rinvio',
      # Polish (PL)
      'patrz', 'przekieruj', 'tam',
      # Dutch (NL)
      'doorverwijzing',
      # French (FR)
      'redirection',
      # Spanish (ES)
      u'redirección',
      # Portuguese (PT)
      'redirecionamento',
      # German (DE)
      'weiterleitung',
      # Russian (RU)
      u'перенаправление',
    ])
 
  global redirect_re
  redirect_re = re.compile(ur'(?i)#(?:%s)\s*:?\s*\[\[(.*?)\]\]' %
      redirect_commands)
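
  # Illustrative example (a sketch, not executed): redirect_re matches
  # "#REDIRECT [[Great Britain]]" as well as e.g. the German
  # "#WEITERLEITUNG [[Großbritannien]]", capturing the target title.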

  # Process the text of article TITLE, with text TEXT.  The default
  # implementation does the following:
  #
  # 1. Remove comments, math, and other unuseful stuff.
  # 2. If article is a redirect, call self.process_redirect() to handle it.
  # 3. Else, call self.process_text_for_data() to extract data out.
  # 4. If that handler returned True, call self.process_text_for_text()
  #    to do processing of the text itself (e.g. for words).

  def process_article_text(self, text, title, id, redirect):
    self.title = title
    self.id = id
    global debug_cur_title
    debug_cur_title = title
  
    if debug['some']:
      errprint("Article title: %s" % title)
      errprint("Article ID: %s" % id)
      errprint("Article is redirect: %s" % redirect)
      errprint("Original article text:\n%s" % text)
  
    ### Preliminary processing of text, removing stuff unuseful even for
    ### extracting data.
 
    text = format_text_first_pass(text)
  
    ### Look to see if the article is a redirect
  
    if redirect:
      m = redirect_re.match(text.strip())
      if m:
        self.process_redirect(m.group(1))
        # NOTE: There may be additional templates specified along with a
        # redirection page, typically something like {{R from misspelling}}
        # that gives the reason for the redirection.  Currently, we ignore
        # such templates.
        return
      else:
        wikiwarning(
          "Article %s (ID %s) is a redirect but can't parse redirect spec %s"
          % (title, id, text))
  
    ### Extract the data out of templates; if it returns True, also process
    ### text for words
  
    if self.process_text_for_data(text):
      self.process_text_for_text(text)

  # Process the text itself, e.g. for words.  Default implementation does
  # nothing.
  def process_text_for_text(self, text):
    pass

  # Process an article that is a redirect.  Default implementation does
  # nothing.

  def process_redirect(self, redirtitle):
    pass

  # Process the text and extract data.  Return True if further processing of
  # the article should happen. (Extracting the real text in
  # process_text_for_text() currently takes up the vast majority of running
  # time, so skipping it is a big win.)
  #
  # Default implementation just returns True.

  def process_text_for_data(self, text):
    return True

  def finish_processing(self):
    pass



### Default handler class for processing article text, including returning
### "useful" text (what the Wikipedia user sees, plus similar-quality
### hidden text).
class ArticleHandlerForUsefulText(ArticleHandler):
  # Process the text itself, e.g. for words.  Input is text that has been
  # preprocessed as described above (remove comments, etc.).  Default
  # handler does two things:
  #
  # 1. Further process the text (see format_text_second_pass())
  # 2. Use process_source_text() to extract chunks of useful
  #    text.  Join together and then split into words.  Pass the generator
  #    of words to self.process_text_for_words().

  def process_text_for_text(self, text):  
    # Now process the text in various ways in preparation for extracting
    # the words from the text
    text = format_text_second_pass(text)
    # Now process the resulting text into chunks.  Join them back together
    # again (to handle cases like "the [[latent variable]]s are ..."), and
    # split to find words.
    self.process_text_for_words(
      split_text_into_words(
        ''.join(ExtractUsefulText().process_source_text(text))))

  # Process the real words of the text of an article.  Default implementation
  # does nothing.

  def process_text_for_words(self, word_generator):
    pass



# Print out the info passed in for article words.  process_text_for_data()
# always returns True, so all articles are processed; its commented-out body
# shows how ExtractCoordinatesFromSource() could be used to extract and
# output coordinates.

class OutputAllWords(ArticleHandlerForUsefulText):
  def process_text_for_words(self, word_generator):
    splitprint("Article title: %s" % self.title)
    splitprint("Article ID: %s" % self.id)
    for word in word_generator:
      if debug['some']: errprint("Saw word: %s" % word)
      else: splitprint("%s" % word)

  def process_text_for_data(self, text):
    #handler = ExtractCoordinatesFromSource()
    #for foo in handler.process_source_text(text): pass
    #for (temptype,lat,long) in handler.coords:
    #  splitprint("Article coordinates: %s,%s" % (lat, long))
    return True

  def finish_processing(self):
    ### Output all of the templates that were seen with coordinates in them,
    ### along with counts of how many times each template was seen.
    if debug['some']:
      print("Templates with coordinates:")
      output_reverse_sorted_table(templates_with_coords,
                                  outfile=cur_output_file)
      
      print("All templates:")
      output_reverse_sorted_table(all_templates, outfile=cur_output_file)
  
      print "Notice: ending processing"


class OutputCoordWords(OutputAllWords):
  def process_text_for_data(self, text):
    if extract_coordinates_from_article(text):
      return True
    return False


# Just find redirects.

class FindRedirects(ArticleHandler):
  def process_redirect(self, redirtitle):
    splitprint("Article title: %s" % self.title)
    splitprint("Article ID: %s" % self.id)
    splitprint("Redirect to: %s" % redirtitle)

def output_title(title, id):
  splitprint("Article title: %s" % title)
  splitprint("Article ID: %s" % id)

def output_title_and_coordinates(title, id, lat, long):
  output_title(title, id)
  splitprint("Article coordinates: %s,%s" % (lat, long))

# FIXME:
#
# (1) Figure out whether coordinates had a display=title in them.
#     If so, use the last one.
# (2) Else, use the last other Coord, but possibly limit to Coords that
#     appear on a line by themselves or at least are at top level (not
#     inside some other template, table, etc.).
# (3) Else, do what we previously did.
#
# Also, we should test to see whether it's better in (2) to limit Coords
# to those that appear on a line by themselves.  To do that, we'd generate
# coordinates for Wikipedia, and in the process note
#
# (1) Whether it was step 1, 2 or 3 above that produced the coordinate;
# (2) If step 2, would the result have been different if we did step 2
#     differently?  Check the possibilities: No limit in step 2;
#     (maybe, if not too hard) limit to those things at top level;
#     limit to be on line by itself; don't ever use Coords in step 2.
#     If there is a difference among the results of any of these strategies
#     debug-output this fact along with the different values and the
#     strategies that produced them.
#
# Then
#
# (1) Output counts of how many resolved through steps 1, 2, 3, and how
#     many in step 2 triggered a debug-output.
# (2) Go through manually and check e.g. 50 of the ones with debug-output
#     and see which one is more correct.  

def extract_coordinates_from_article(text):
  handler = ExtractCoordinatesFromSource()
  for foo in handler.process_source_text(text): pass
  if len(handler.coords) > 0:
    # Prefer a coordinate specified using {{Coord|...}} or similar to
    # a coordinate in an Infobox, because the latter tend to be less
    # accurate.
    for (temptype, lat, long) in handler.coords:
      if temptype.startswith('coor'):
        return (lat, long)
    (temptype, lat, long) = handler.coords[0]
    return (lat, long)
  else: return None

def extract_and_output_coordinates_from_article(title, id, text):
  retval = extract_coordinates_from_article(text)
  if retval is None: return False
  (lat, long) = retval
  output_title_and_coordinates(title, id, lat, long)
  return True

def extract_location_type(text):
  handler = ExtractLocationTypeFromSource()
  for foo in handler.process_source_text(text): pass
  for (ty, vals) in handler.loctype:
    splitprint("  %s: %s" % (ty, vals))
  for cat in handler.categories:
    splitprint("  category: %s" % cat)

# Handler to output count information on words.  Only processes articles
# with coordinates in them.  Computes the count of each word in the article
# text, after filtering text for "actual text" (as opposed to directives
# etc.), and outputs the counts.

class OutputCoordCounts(ArticleHandlerForUsefulText):
  def process_text_for_words(self, word_generator):
    wordhash = intdict()
    for word in word_generator:
      if word: wordhash[word] += 1
    output_reverse_sorted_table(wordhash, outfile=cur_output_file)

  def process_text_for_data(self, text):
    if extract_coordinates_from_article(text):
      output_title(self.title, self.id)
      return True
    return False

# Same as above but output counts for all articles, not just those with
# coordinates in them.

class OutputAllCounts(OutputCoordCounts):
  def process_text_for_data(self, text):
    output_title(self.title, self.id)
    return True

# Handler to output just coordinate information.
class OutputCoords(ArticleHandler):
  def process_text_for_data(self, text):
    return extract_and_output_coordinates_from_article(self.title, self.id,
                                                       text)

# Handler to try to determine the type of an article with coordinates.
class OutputLocationType(ArticleHandler):
  def process_text_for_data(self, text):
    iscoord = extract_and_output_coordinates_from_article(self.title, self.id,
                                                          text)
    if iscoord:
      extract_location_type(text)
    return iscoord


class ToponymEvalDataHandler(ExtractUsefulText):
  def join_arguments_as_generator(self, args_of_macro):
    first = True
    for chunk in args_of_macro:
      if not first: yield ' '
      first = False
      for chu in self.process_source_text(chunk):
        yield chu

  # OK, this is a bit tricky.  The definitions of process_template() and
  # process_internal_link() in ExtractUsefulText() use yield_template_args()
  # and yield_internal_link_args(), respectively, to yield arguments, and
  # then call process_source_text() to recursively process the arguments and
  # then join everything together into a string, with spaces between the
  # chunks corresponding to separate arguments.  The joining together
  # happens inside of process_and_join_arguments().  This runs into problems
  # if we have an internal link inside of another internal link, which often
  # happens with images, which are internal links that have an extra caption
  # argument, which frequently contains (nested) internal links.  The
  # reason is that we've overridden process_internal_link() to sometimes
  # return a tuple (which signals the outer handler that we found a link
  # of the appropriate sort), and the joining together chokes on non-string
  # arguments.  So instead, we "join" arguments by just yielding everything
  # in sequence, with spaces inserted as needed between arguments; this
  # happens in join_arguments_as_generator().  We specifically need to
  # override process_template() (and already override process_internal_link()),
  # because it's exactly those two that currently call
  # process_and_join_arguments().
  #
  # The idea is that we never join arguments together at any level of
  # recursion, but just yield chunks.  At the topmost level, we will join
  # as necessary and resplit for word boundaries.
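
  # Illustrative example (a sketch, not executed): for source text like
  #   'born in [[Paris|the capital]]'
  # this handler might yield the chunk 'born in ' followed by the tuple
  # ('link', ['Paris', 'the capital']), assuming the article Paris is in
  # coordinate_articles.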

  def process_template(self, text):
    for chunk in self.join_arguments_as_generator(yield_template_args(text)):
      yield chunk
  
  def process_internal_link(self, text):
    tempargs = get_macro_args(text)
    m = re.match(r'(?s)\s*([a-zA-Z0-9_]+)\s*:(.*)', tempargs[0])
    if m:
      # Something like [[Image:...]] or [[wikt:...]] or [[fr:...]]
      # For now, just skip them all; eventually, might want to do something
      # useful with some, e.g. categories
      pass
    else:
      article = capfirst(tempargs[0])
      # Skip links to articles without coordinates
      if coordinate_articles and article not in coordinate_articles:
        pass
      else:
        yield ('link', tempargs)
        return

    for chunk in self.join_arguments_as_generator(yield_internal_link_args(text)):
      yield chunk


class GenerateToponymEvalData(ArticleHandler):
  # Process the text itself.  Input is text that has been preprocessed as
  # described above (remove comments, etc.).  This handler:
  #
  # 1. Further processes the text (see format_text_second_pass())
  # 2. Uses ToponymEvalDataHandler to extract chunks, printing links to
  #    articles with coordinates directly and splitting the remaining raw
  #    text into words.

  def process_text_for_text(self, text):
    # Now process the text in various ways in preparation for extracting
    # the words from the text
    text = format_text_second_pass(text)

    splitprint("Article title: %s" % self.title)
    chunkgen = ToponymEvalDataHandler().process_source_text(text)
    #for chunk in chunkgen:
    #  errprint("Saw chunk: %s" % (chunk,))
    # groupby() allows us to group all the non-link chunks (which are raw
    # strings) together efficiently
    for k, g in itertools.groupby(chunkgen,
                                  lambda chunk: type(chunk) is tuple):
      #args = [arg for arg in g]
      #errprint("Saw k=%s, g=%s" % (k,args))
      if k:
        for (linktext, linkargs) in g:
          splitprint("Link: %s" % '|'.join(linkargs))
      else:
        # Now process the resulting text into chunks.  Join them back together
        # again (to handle cases like "the [[latent variable]]s are ..."), and
        # split to find words.
        for word in split_text_into_words(''.join(g)):
          if word:
            splitprint("%s" % word)

# Generate article data of various sorts
class GenerateArticleData(ArticleHandler):
  def process_article(self, redirtitle):
    if rematch('(.*?):', self.title):
      namespace = m_[1]
      if namespace in article_namespace_aliases:
        namespace = article_namespace_aliases[namespace]
      elif namespace not in article_namespaces:
        namespace = 'Main'
    else:
      namespace = 'Main'
    yesno = {True:'yes', False:'no'}
    listof = self.title.startswith('List of ')
    disambig = self.id in disambig_pages_by_id
    list = listof or disambig or namespace in ('Category', 'Book')
    outprint("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %
             (self.id, self.title, cur_split_name, redirtitle, namespace,
              yesno[listof], yesno[disambig], yesno[list]))

  def process_redirect(self, redirtitle):
    self.process_article(capfirst(redirtitle))

  def process_text_for_data(self, text):
    self.process_article('')
    return False

# Handler to output link information as well as coordinate information.
# Note that a link consists of two parts: The anchor text and the article
# name.  For all links, we keep track of all the possible articles for a
# given anchor text and their counts.  We also count all of the incoming
# links to an article (can be used for computing prior probabilities of
# an article).

class ProcessSourceForCoordLinks(RecursiveSourceTextHandler):
  useful_text_handler = ExtractUsefulText()
  def process_internal_link(self, text):
    tempargs = get_macro_args(text)
    m = re.match(r'(?s)\s*([a-zA-Z0-9_]+)\s*:(.*)', tempargs[0])
    if m:
      # Something like [[Image:...]] or [[wikt:...]] or [[fr:...]]
      # For now, just skip them all; eventually, might want to do something
      # useful with some, e.g. categories
      pass
    else:
      article = capfirst(tempargs[0])
      # Skip links to articles without coordinates
      if coordinate_articles and article not in coordinate_articles:
        pass
      else:
        anchor = ''.join(self.useful_text_handler.
                         process_source_text(tempargs[-1]))
        incoming_link_count[article] += 1
        if anchor not in anchor_text_map:
          nested_anchor_text_map = intdict()
          anchor_text_map[anchor] = nested_anchor_text_map
        else:
          nested_anchor_text_map = anchor_text_map[anchor]
        nested_anchor_text_map[article] += 1
 
    # Also recursively process all the arguments for links, etc.
    return self.process_source_text(text[2:-2])

class FindCoordLinks(ArticleHandler):
  def process_text_for_data(self, text):
    handler = ProcessSourceForCoordLinks()
    for foo in handler.process_source_text(text): pass
    return False

  def finish_processing(self):
    print "------------------ Count of incoming links: ---------------"
    output_reverse_sorted_table(incoming_link_count, outfile=cur_output_file)
  
    print "==========================================================="
    print "==========================================================="
    print "==========================================================="
    print ""
    for (anchor,map) in anchor_text_map.items():
      splitprint("-------- Anchor text->article for %s: " % anchor)
      output_reverse_sorted_table(map, outfile=cur_output_file)

#######################################################################
#                SAX handler for processing raw dump files            #
#######################################################################

class FinishParsing:
  pass

# We do a very simple-minded way of handling the XML.  We maintain the
# path of nested elements that we're within, and we track the text since the
# last time we saw the beginning of an element.  We reset the text we're
# tracking every time we see an element begin tag, and we don't record
# text at all after an end tag, until we see a begin tag again.  Basically,
# this means we don't handle cases where tags are nested inside of text.
# This isn't a problem since cases like this don't occur in the Wikipedia
# dump.
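
# A simplified sketch of the dump structure this handler walks:
#
#   <page>
#     <title>...</title>
#     <id>...</id>
#     <redirect />          <!-- present only for redirect pages -->
#     <revision>
#       <id>...</id>
#       <text xml:space="preserve">...article text...</text>
#     </revision>
#   </page>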

class WikipediaDumpSaxHandler(ContentHandler):
  '''SAX handler for processing Wikipedia dumps.  Note that SAX is a
simple interface for handling XML in a serial fashion (as opposed to a
DOM-type interface, which reads the entire XML file into memory and allows
it to be dynamically manipulated).  Given the size of the XML dump file
(around 25 GB uncompressed), we can't read it all into memory.'''
  def __init__(self, output_handler):
    errprint("Beginning processing of Wikipedia dump...")
    self.curpath = []
    self.curtext = None
    self.output_handler = output_handler
    self.status = StatusMessage('article')
    
  def startElement(self, name, attrs):
    '''Handler for beginning of XML element.'''
    if debug['sax']: errprint("startElement() saw %s/%s" % (name, attrs))
    # We should never see an element inside of the Wikipedia text.
    if self.curpath:
      assert self.curpath[-1] != 'text'
    self.curpath.append(name)
    self.curtext = []
    # We care about the title, ID, and redirect status.  Reset them for
    # every page; this is especially important for redirect status.
    if name == 'page':
      self.title = None
      self.id = None
      self.redirect = False

  def characters(self, text):
    '''Handler for chunks of text.  Accumulate all adjacent chunks.  When
the end element </text> is seen, process_article_text() will be called on the
combined chunks.'''
    if debug['sax']: errprint("characters() saw %s" % text)
    # None means the last directive we saw was an end tag; we don't track
    # text any more until the next begin tag.
    if self.curtext != None:
      self.curtext.append(text)
 
  def endElement(self, name):
    '''Handler for end of XML element.'''
    eltext = ''.join(self.curtext) if self.curtext else ''
    self.curtext = None # Stop tracking text
    self.curpath.pop()
    if name == 'title':
      self.title = eltext
    # ID's occur in three places: the page ID, revision ID and contributor ID.
    # We only want the page ID, so check to make sure we've got the right one.
    elif name == 'id' and self.curpath[-1] == 'page':
      self.id = eltext
    elif name == 'redirect':
      self.redirect = True
    elif name == 'text':
      # If we saw the end of the article text, join all the text chunks
      # together and call process_article_text() on it.
      set_next_split_file()
      if debug['lots']:
        max_text_len = 150
        endslice = min(max_text_len, len(eltext))
        truncated = len(eltext) > max_text_len
        errprint(
        """Calling process_article_text with title=%s, id=%s, redirect=%s;
  text=[%s%s]""" % (self.title, self.id, self.redirect, eltext[0:endslice],
                    "..." if truncated else ""))
      self.output_handler.process_article_text(text=eltext, title=self.title,
        id=self.id, redirect=self.redirect)
      if self.status.item_processed(maxtime=Opts.max_time_per_stage):
        raise FinishParsing()
 
#######################################################################
#                                Main code                            #
#######################################################################


def main_process_input(wiki_handler):
  ### Create the SAX parser and run it on stdin.
  sax_parser = make_parser()
  sax_handler = WikipediaDumpSaxHandler(wiki_handler)
  sax_parser.setContentHandler(sax_handler)
  try:
    sax_parser.parse(sys.stdin)
  except FinishParsing:
    pass
  wiki_handler.finish_processing()
  
def main():

  op = OptionParser(usage="%prog [options] < file")
  op.add_option("--output-all-words",
                help="Output words of text, for all articles.",
                action="store_true")
  op.add_option("--output-coord-words",
                help="Output text, but only for articles with coordinates.",
                action="store_true")
  op.add_option("--raw-text", help="""When outputting words, make output
resemble some concept of "raw text".  Currently, this just includes
punctuation instead of omitting it, and shows only the anchor text of a
link rather than both the anchor text and actual article name linked to,
when different.""", action="store_true")
  op.add_option("--no-tokenize", help="""When outputting words, don't tokenize.
This causes words to only be split on whitespace, rather than also on
punctuation.""", action="store_true")
  op.add_option("--find-coord-links",
                help="""Find all links and print info about them, for
articles with coordinates or redirects to such articles.  Includes count of
incoming links, and, for each anchor-text form, counts of all articles it
maps to.""",
                action="store_true")
  op.add_option("--output-all-counts",
                help="Print info about counts of words, for all articles.",
                action="store_true")
  op.add_option("--output-coord-counts",
                help="Print info about counts of words, but only for articles with coodinates.",
                action="store_true")
  op.add_option("--output-coords",
                help="Print info about coordinates of articles with coordinates.",
                action="store_true")
  op.add_option("--output-location-type",
                help="Print info about type of articles with coordinates.",
                action="store_true")
  op.add_option("--find-redirects",
                help="Output all redirects.",
                action="store_true")
  op.add_option("--generate-toponym-eval",
                help="Generate data files for use in toponym evaluation.",
                action="store_true")
  op.add_option("--generate-article-data",
                help="""Generate file listing all articles and info about them.
If using this option, the --disambig-id-file and --split-training-dev-test
options should also be used.

The format is

ID TITLE SPLIT REDIR NAMESPACE LIST-OF DISAMBIG LIST

where each field is separated by a tab character.

The fields are

ID = Numeric ID of article, given by wikiprep
TITLE = Title of article
SPLIT = Split to assign the article to; one of 'training', 'dev', or 'test'.
REDIR = If the article is a redirect, lists the article it redirects to;
        else, blank.
NAMESPACE = Namespace of the article, one of 'Main', 'User', 'Wikipedia',
            'File', 'MediaWiki', 'Template', 'Help', 'Category', 'Thread',
            'Summary', 'Portal', 'Book'.  These are the basic namespaces
            defined in [[Wikipedia:Namespace]].  Articles of the appropriate
            namespace begin with the namespace prefix, e.g. 'File:*', except
            for articles in the main namespace, which includes everything
            else.  Note that some of these namespaces don't actually appear
            in the article dump; likewise, talk pages don't appear in the
            dump.  In addition, we automatically include the common namespace
            abbreviations in the appropriate space, i.e.

            P               Portal
            H               Help
            T               Template
            CAT, Cat, C     Category
            MOS, MoS, Mos   Wikipedia (used for "Manual of Style" pages)
LIST-OF = 'yes' if article title is of the form 'List of *', typically
          containing a list; else 'no'.
DISAMBIG = 'yes' if article is a disambiguation page (used to disambiguate
           multiple concepts with the same name); else 'no'.
LIST = 'yes' if article is a list of some sort, else 'no'.  This includes
       'List of' articles, disambiguation pages, and articles in the 'Category'
       and 'Book' namespaces.""",
                action="store_true")
  op.add_option("--split-training-dev-test",
                help="""Split output into training, dev and test files.
Use the specified value as the file prefix, suffixed with '.train', '.dev'
and '.test' respectively.""",
                metavar="FILE")
  op.add_option("--training-fraction", type='float', default=80,
                help="""Fraction of total articles to use for training.
The absolute amount doesn't matter, only the value relative to the test
and dev fractions, as the values are normalized.  Default %default.""",
                metavar="FRACTION")
  op.add_option("--dev-fraction", type='float', default=10,
                help="""Fraction of total articles to use for dev set.
The absolute amount doesn't matter, only the value relative to the training
and test fractions, as the values are normalized.  Default %default.""",
                metavar="FRACTION")
  op.add_option("--test-fraction", type='float', default=10,
                help="""Fraction of total articles to use for test set.
The absolute amount doesn't matter, only the value relative to the training
and dev fractions, as the values are normalized.  Default %default.""",
                metavar="FRACTION")
  op.add_option("--coords-file",
                help="""File containing output from a prior run of
--coords-counts, listing all the articles with associated coordinates.
This is used to limit the operation of --find-coord-links to only consider
links to articles with coordinates.  Currently, if this is not done, then
using --coords-file requires at least 10GB, perhaps more, of memory in order
to store the entire table of anchor->article mappings in memory. (If this
entire table is needed, it may be necessary to implement a MapReduce-style
process where smaller chunks are processed separately and then the results
combined.)""",
                metavar="FILE")
  op.add_option("--article-data-file",
                help="""File containing article data.  Used by
--find-coord-links to find the redirects pointing to articles with
coordinates.""",
                metavar="FILE")
  op.add_option("--disambig-id-file",
                help="""File containing list of article ID's that are
disambiguation pages.""",
                metavar="FILE")
  op.add_option("--max-time-per-stage", "--mts", type='int', default=0,
                help="""Maximum time per stage in seconds.  If 0, no limit.
Used for testing purposes.  Default %default.""")
  op.add_option("--debug", metavar="FLAGS",
                help="Output debug info of the given types (separated by spaces or commas)")

  errprint("Arguments: %s" % ' '.join(sys.argv))
  opts, args = op.parse_args()
  output_option_parameters(opts)

  global Opts
  Opts = opts

  global debug
  if opts.debug:
    flags = re.split(r'[,\s]+', opts.debug)
    for f in flags:
      debug[f] = True
  if debug['err'] or debug['some'] or debug['lots'] or debug['sax']:
    # Without the global declaration, these assignments would create locals
    # and the module-level settings would never take effect.
    global cur_output_file, debug_to_stderr
    cur_output_file = sys.stderr
    debug_to_stderr = True

  if opts.split_training_dev_test:
    init_output_files(opts.split_training_dev_test,
                      [opts.training_fraction, opts.dev_fraction,
                       opts.test_fraction],
                      ['training', 'dev', 'test'])

  if opts.coords_file:
    read_coordinates_file(opts.coords_file)    
  if opts.article_data_file:
    read_redirects_from_article_data(opts.article_data_file)
  if opts.disambig_id_file:
    read_disambig_id_file(opts.disambig_id_file)
  if opts.output_all_words:
    main_process_input(OutputAllWords())
  elif opts.output_coord_words:
    main_process_input(OutputCoordWords())
  elif opts.find_coord_links:
    main_process_input(FindCoordLinks())
  elif opts.find_redirects:
    main_process_input(FindRedirects())
  elif opts.output_coords:
    main_process_input(OutputCoords())
  elif opts.output_all_counts:
    main_process_input(OutputAllCounts())
  elif opts.output_coord_counts:
    main_process_input(OutputCoordCounts())
  elif opts.output_location_type:
    main_process_input(OutputLocationType())
  elif opts.generate_toponym_eval:
    main_process_input(GenerateToponymEvalData())
  elif opts.generate_article_data:
    outprint('id\ttitle\tsplit\tredir\tnamespace\tis_list_of\tis_disambig\tis_list')
    main_process_input(GenerateArticleData())

#import cProfile
#cProfile.run('main()', 'process-wiki.prof')
main()