Commits

Michele Bini  committed b518a3f

Initial commit.

  • Participants

Comments (0)

Files changed (3)

+Script to generate a database of earth coordinates of Italian cities
+and municipalities, to be used by OpenUDC project.
+
+
+Copyright (c) 2012 Michele Bini
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the version 3 of the GNU General Public License
+as published by the Free Software Foundation.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+The script requires the database IT.zip from geonames.org; which is
+avalable under the Creative Commons Attribution 3.0 License

File parse-ita.pl

+#!/usr/bin/perl -w
+
+# Copyright (c) 2012 Michele Bini
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the version 3 of the GNU General Public License
+# as published by the Free Software Foundation.
+
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+use strict;
+
+my $verbose = 1;
+
+my %names_and_codes;
+my %names_nocode;
+
+sub formatcoords {
+    sprintf("e%+06.2f%+07.2f", @_)
+}
+
+while (defined($_ = <STDIN>)) {
+    my ($geonameid, $name, $asciiname, $alteratenames, $latitude, $longitude, $feature_class, $feature_code, $country_code, $cc2, $admin1_code, $admin2_code, $admin3_code, $admin4_code, $population, $elevation, $dem, $timezone, $modification_date) = split(/\t/, $_);
+    next unless $country_code =~ /^ITA?$/;
+    next unless $feature_code =~ /^PPL/i;
+    unless (length($admin2_code)) {
+	# warn "No admin2 code for $asciiname ($geonameid)";
+	# next;
+    }
+
+    # Fix database entries
+    next if $asciiname eq 'Kuwan';
+    if ($asciiname eq 'Ceppo Morelli' && (!length($admin2_code))) {
+	next;
+    } elsif (($asciiname eq 'Cercivento') && (!length($admin2_code))) {
+	next;
+    }
+
+    my $ll = formatcoords($latitude, $longitude);
+    if (length $admin2_code) {
+	push @{ $names_and_codes{uc($asciiname) . " (" . $admin2_code . ")"} ||= [] }, $ll;
+    } else {
+	push @{ $names_nocode{uc($asciiname)} ||= [] }, $ll;
+    }
+}
+
+sub parsecoords {
+    if (shift() =~ /e([+-]?[0-9.]+)([+-]?[0-9.]+)/) {
+	return ($1, $2);
+    } else {
+	return;
+    }
+}
+
+sub coorddistance {
+    use Math::Trig qw(acos pi);
+    my ($a, $b) = @_;
+    my ($lat1, $lon1) = @$a;
+    my ($lat2, $lon2) = @$b;
+    ($lat1, $lon1, $lat2, $lon2) = map {
+	$_ * pi() / 180.0
+    } ($lat1, $lon1, $lat2, $lon2);
+    acos(sin($lat1)*sin($lat2) +
+	 cos($lat1)*cos($lat2) *
+	 cos($lon2 - $lon1)) * 6378.137; # Radius of the equator
+}
+
+sub calculate_average_point {
+    my $name = shift();
+    my @c = @_;
+    # Calculate average of collected points
+    no integer;
+    my $lat_m = 0.0;
+    my $lon_m = 0.0;
+    for (@c) {
+	$lat_m += $_->[0];
+	$lon_m += $_->[1];
+    }
+    $lat_m = $lat_m / int(@c);
+    $lon_m = $lon_m / int(@c);
+    
+    # Accumulate all distinct points
+    my @p = ([$lat_m, $lon_m]);
+    
+    for (@c) {
+	if (($lat_m - $_->[0]) ** 2 +
+	    ($lon_m - $_->[1]) ** 2
+	    > 0.8 ** 2) {
+	    push @p, [ $_->[0], $_->[1] ];
+	}
+    }
+    
+    if (@p > 1) {
+	die "\"$name\" has more than one location: " . join(", ", map { formatcoords(@$_) } @p);
+    }
+
+    @{ $p[0] }
+}
+
+sub warning {
+    my $x = shift;
+    print STDERR $x . "\n";
+}
+
+# Check for names with multiple locations
+for (keys %names_and_codes) {
+    my $x = $names_and_codes{$_};
+    if (@$x > 1) {
+	my @c = map { [ parsecoords($_) ] } @$x;
+	$names_and_codes{$_} = formatcoords(calculate_average_point($_, @c));
+    } else {
+	$names_and_codes{$_} = $x->[0];
+    }
+}
+
+my %names_with_code;
+for (keys %names_and_codes) {
+    my $x = $_;
+    die "internal error" unless $x =~ s/ [(]([^ ]+)[)]$//;
+    push @{ $names_with_code{$x} ||= [] }, [ $names_and_codes{$_}, $1 ];
+}
+
+# Delete entries that already occur with an administrative code
+my $n;
+for $n (keys %names_nocode) {
+    my $x = $names_with_code{$n};
+    next unless defined $x;
+    my $l = $names_nocode{$n};
+    my @r;
+    my $a_u;
+    for $a_u (@$l) {
+	my @a = parsecoords($a_u);
+	my $found;
+	my $b_u;
+	for $b_u (map { $_->[0] } @$x) {
+	    my @b = parsecoords($b_u);
+	    if (($a[0] - $b[0]) ** 2 +
+		($a[1] - $b[1]) ** 2
+		< 1 ** 2) {
+		warning "Found a location for $n" if $verbose;
+		$found = 1;
+	    }
+	}
+	unless ($found) {
+	    push @r, $a_u;
+	}
+    }
+    if (@r) {
+	@$l = @r;
+    } else {
+	warning "Found all locations for $n" if $verbose;
+	delete $names_nocode{$n};
+    }
+}
+
+# Warn about entries without an administrative code that have distict locations from entries with the same name that occur with an administrative code: usually these are localities that share the name of another locality with a greater administrative level
+for (keys %names_nocode) {
+    if (exists $names_with_code{$_}) {
+	warning "Entry without administrative code has distinct locations from entry with the same name and administrative code, deleted.";
+	warning "$_: " . join(", ", @{ $names_nocode{$_} });
+	my $n = $_;
+	for (@{ $names_with_code{$_} }) {
+	    warning "$n (" . $_->[1] . "): " . $_->[0];
+	};
+	delete $names_nocode{$n};
+    }
+}
+
+# Warn about entries with more than one distinct location
+for (keys %names_nocode) {
+    my $x = $names_nocode{$_};
+    if (@$x > 1) {
+	# Delete names that match more than one location
+	die "\"$_\" matches more than one location, deleted";
+	delete $names_nocode{$_};
+    } else {
+	$names_nocode{$_} = $x->[0];
+    }
+}
+
+my %names;
+# Coalesce names_and_codes and names_nocode into a single map:
+for (keys %names_nocode) {
+    die "Two entries have the same name: $_" if exists $names{$_};
+    $names{$_} = $names_nocode{$_};
+}
+
+for (keys %names_and_codes) {
+    die "Two entries have the same name: $_" if exists $names{$_};
+    $names{$_} = $names_and_codes{$_};    
+}
+
+# Delete some non-existing entries
+
+# Create a map of locations to find any entries sharing a location
+my %loc;
+for (keys %names) {
+    my $l = $names{$_};
+    if (exists $loc{$l}) {
+	my $a = $_;
+	my $b = $loc{$l};
+	warning "Two entries have the same location: $a and $b";
+    } else {
+	$loc{$l} = $_;
+    }
+}
+
+# Duplicate dual-named entries
+for (keys %names) {
+    my $o = $_;
+    my $x = $_;
+    my $admcode = ($x =~ s/ +[(][^ ]+[)]$//) ? $& : "";
+    if ($x =~ / +- +/) {
+	$names{"$'" . $admcode} = $names{$o};
+	$names{"$`" . $admcode} = $names{$o};
+	delete $names{$o};
+    }
+}
+
+# Print out entries
+for (keys %names) {
+    print $names{$_} . "\tITA\t$_\n";
+}
+#!/bin/sh
+set -x
+(perl -w parse-ita.pl | sort -u) <it.txt >geolist_ITA.txt 2>parseita.err