Clone wiki

ky_wbprojects / update_obo_oa_ontologies.pl

This script : /home/postgres/work/pgpopulation/obo_oa_ontologies/update_obo_oa_ontologies.pl

It runs as a cron job every day at 3am. It populates the obo_ tables by downloading the OBO files whose URLs are listed in %obos in the script below :

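And it also calls the generic.cgi action UpdateVariationObo (on the 1st of the month) or AddToVariationObo (every other day) :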

If any other scripts do anything with any of the files above, they are independent of this script, which populates the OA obo_ tables, and I don't know what they are. If there are any specific scripts you want to go over, let me know, and we can sit down together and go through them so you can write them up on a wiki.

#!/usr/bin/perl -w

# Populate obo_{name|syn|data}_<datatype>_<field> tables in postgres based off
# webpages where the obos are stored, for use by ontology_annotator.cgi.
# Needs a cronjob (probably every day).  2009 10 04
#
# Populate obo_data_app_term  with pre-populated parent/child relationships.  2009 10 13
#
# Added alt_id to app & anat_term to obo_data_app_anat_term and obo_name_app_anat_term 
# so that terms that existed before can be queried for replacing.  2010 04 21
#
# No longer update app_tempname (variations + transgene + rearrangement) from ws_current.obo
# instead use 
# http://tazendra.caltech.edu/~azurebrd/cgi-bin/forms/generic.cgi?action=UpdateVariationObo
# which will look at 4 files in /home/acedb/jolene/WS_AQL_queries/ and nameserver data.
# 2010 06 10
#
# update app term data from sanger, which takes some 30 mins by calling 
# /home/acedb/jolene/WS_AQL_queries/update_variation_obo.pl
# 2010 07 22
#
# do full update of variation obo data only on 1st of month (it takes 55 minutes)
# do incremental update other days (takes 30 secs to download file and a few seconds to
# add a small number of new entries)  2010 08 23
#
#
# Added to cron every day at 3am  2010 01 22
# 0 3 * * * /home/postgres/work/pgpopulation/obo_oa_ontologies/update_obo_oa_ontologies.pl


use strict;
use diagnostics;
use DBI;
use LWP::Simple;
use LWP;

# Database handle shared by the main code and by every subroutine below.
my $dbh = DBI->connect('dbi:Pg:dbname=testdb', '', '');
die "Cannot connect to database!\n" unless $dbh;

# Receives the return value of the most recent $dbh->do call.
my $result;

# Run from the directory that holds the cached obo_<type>_<field> files.
my $directory = '/home/postgres/work/pgpopulation/obo_oa_ontologies/';
unless (chdir $directory) { die "Cannot chdir to $directory : $!"; }


# Where each ontology is downloaded from : $obos{<datatype>}{<field>} = URL.
# The two keys are also used to build the obo_{name|syn|data}_<datatype>_<field>
# table names and the obo_<datatype>_<field> cache file name.
my %obos = (
  app => {
    term      => 'http://tazendra.caltech.edu/~azurebrd/cgi-bin/forms/phenotype_ontology_obo.cgi',	# gets from cvs from spica
#     child_of  => 'http://tazendra.caltech.edu/~azurebrd/cgi-bin/forms/phenotype_ontology_obo.cgi',
#     tempname  => 'http://tazendra.caltech.edu/~azurebrd/var/work/phenote/ws_current.obo',	# no longer use this file, jolene will manually update when changing data in the nameserver  2010 06 10
#     anat_term => 'http://www.berkeleybop.org/ontologies/obo-all/worm_anatomy/worm_anatomy.obo',
    anat_term => 'http://obo.cvs.sourceforge.net/*checkout*/obo/obo/ontology/anatomy/gross_anatomy/animal_gross_anatomy/worm/worm_anatomy/WBbt.obo',	# jolene says raymond changes this more frequently  2010 04 21
    lifestage => 'http://www.berkeleybop.org/ontologies/obo-all/worm_development/worm_development.obo',
    entity    => 'http://www.berkeleybop.org/ontologies/obo-all/rex/rex.obo',
    quality   => 'http://www.berkeleybop.org/ontologies/obo-all/quality/quality.obo',
  },
  mop => {
    chebi     => 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.obo',
  },
  gop => {
    goid      => 'http://www.geneontology.org/ontology/obo_format_1_2/gene_ontology_ext.obo',	# replaced 2010 10 28
#     goid      => 'http://www.geneontology.org/ontology/gene_ontology_edit.obo',	# replaced 2010 10 28
  },
);

#     my $file_name = $directory . 'obo_app_' . $field;
#     open (IN, "<$file_name") or die "Cannot open $file_name : $!";
#     my $file_data = <IN>;
#     close (IN) or die "Cannot close $file_name : $!";
#     &updateData($type, $field, $file_name, $file_data);



# For every ontology in %obos : download the current obo file, compare the
# "date:" header of the download with the one in the locally cached copy, and
# reload the postgres tables (through updateData) only when the download is
# newer.  A download that fails, or that has no "date:" header, is skipped and
# the existing tables and cache file are left alone.
{
  local $/ = undef;				# slurp whole files; restored when this block ends
  foreach my $type (sort keys %obos) {
    foreach my $field (sort keys %{ $obos{$type} }) {
#       &createTable($type, $field);				# only create table once
      my $url = $obos{$type}{$field};
      my $new_data = get $url;
      unless (defined $new_data) {		# get returns undef on any download failure
        warn "Cannot download $url, skipping $type $field\n";
        next;
      }
      my $file_name = $directory . 'obo_' . $type . '_' . $field;
      my $file_date = 0;			# no (readable) cache file counts as oldest possible
      if (-r $file_name) {
        open (my $in, '<', $file_name) or die "Cannot open $file_name : $!";
        my $file_data = <$in>;
        close ($in) or die "Cannot close $file_name : $!";
        $file_date = _oboDate($file_data);
      }
      my $new_date = _oboDate($new_data);	# 0 when the download has no date header
      if ($new_date > $file_date) {
        updateData($type, $field, $file_name, $new_data);
      } # if ($new_date > $file_date)
    } # foreach my $field (sort keys %{ $obos{$type} })
  } # foreach my $type (sort keys %obos)
}

# Return the first "date: dd:mm:yyyy hh:mm" header found in $obo_text as the
# digits yyyymmddhhmm joined together, so that two dates compare numerically.
# Returns 0 when $obo_text is undefined or contains no such header.
sub _oboDate {
  my ($obo_text) = @_;
  return 0 unless defined $obo_text;
  my ($day, $month, $year, $hour, $minute) = $obo_text =~ m/date: (\d+):(\d+):(\d+) (\d+):(\d+)/;
  return 0 unless defined $minute;
  return $year . $month . $day . $hour . $minute;
} # sub _oboDate

# update app term data from sanger, which takes some 30 mins  2010 07 22
# `/home/acedb/jolene/WS_AQL_queries/update_variation_obo.pl`;	# no longer update everyday, do full update on first of month by calling webpage script

# Ask generic.cgi to refresh the variation obo data : a full wipe and
# repopulate on the 1st of the month, an incremental addition on every other
# day.  2010 08 23
my $start_time = time;
my $mday = (localtime($start_time))[3];		# day of the month, 1-31
if ($mday == 1) {			# do full wipe and repopulate on 1st of month
#   `wget "http://tazendra.caltech.edu/~azurebrd/cgi-bin/forms/generic.cgi?action=UpdateVariationObo"`;
  my $u = "http://tazendra.caltech.edu/~azurebrd/cgi-bin/forms/generic.cgi?action=UpdateVariationObo";
  # Very long client timeout because the full update is slow.  This is still
  # getting an Uncaught exception from user code: Error while getting
  # http://tazendra.caltech.edu/~azurebrd/cgi-bin/forms/generic.cgi?action=UpdateVariationObo -- 504 Gateway Time-out
  my $ua = LWP::UserAgent->new(timeout => 999999);
  my $response = $ua->request(HTTP::Request->new(GET => $u));
  die "Error while getting ", $response->request->uri," -- ", $response->status_line, "\nAborting" unless $response-> is_success;
#   `wget "http://tazendra.caltech.edu/~azurebrd/cgi-bin/forms/generic.cgi?action=UpdateVariationObo"`;		# this times out
} else {				# do incremental additions other days  2010 08 23
  my $u = "http://tazendra.caltech.edu/~azurebrd/cgi-bin/forms/generic.cgi?action=AddToVariationObo";
  my $content = get $u;
  # get returns undef on failure; report it instead of discarding it silently.
  warn "Error while getting $u\n" unless defined $content;
}




# Replace the contents of the three obo_{name|syn|data}_<type>_<field> tables
# with the terms parsed from $new_data, then overwrite $file_name with
# $new_data so it becomes the cached copy.
#
# Parameters :
#   $type      - datatype part of the table names (a first-level key of %obos)
#   $field     - field part of the table names (a second-level key of %obos)
#   $file_name - path of the cache file to overwrite with $new_data
#   $new_data  - full text of the downloaded obo file
#
# Uses the file-level $dbh and stores each statement's return value in the
# file-level $result.  Dies if the cache file cannot be written.  If $new_data
# has no [Term] stanza at all, warns and returns without touching the tables
# or the cache file.
#
# Special cases :
#   app / term      - is_a and part_of lines are rewritten as "parent:" lines
#                     and a "child:" line is appended for every term that
#                     names this one as a parent.
#   app / anat_term - every alt_id gets its own name and data row pointing at
#                     the current id, so that old ids can still be looked up.
#   mop / chebi     - the "CHEBI:" prefix is removed from the id.
#
# All values are passed to postgres as bind parameters rather than being
# quoted by hand, so an apostrophe anywhere in an id, name, synonym or appended
# child label can no longer break an INSERT.
sub updateData {
  my ($type, $field, $file_name, $new_data) = @_;

  my (@terms) = split/\[Term\]/, $new_data;
  shift @terms;					# junk header
  unless (@terms) {				# never wipe the tables for a download with no terms
    warn "No [Term] stanzas in new data for $type $field, leaving tables and $file_name untouched\n";
    return;
  }

  my $name_table = 'obo_name_' . $type . '_' . $field;
  my $syn_table  = 'obo_syn_'  . $type . '_' . $field;
  my $data_table = 'obo_data_' . $type . '_' . $field;
  foreach my $table ($name_table, $syn_table, $data_table) {
#     print "DELETE FROM $table; \n";
    $result = $dbh->do("DELETE FROM $table; ");
  }

  # Insert one (joinkey, value) row; the timestamp column takes its default.
  my $insert = sub {
    my ($table, $joinkey, $value) = @_;
    $result = $dbh->do("INSERT INTO $table VALUES (?, ?)", undef, $joinkey, $value);
  };

  my $is_app_term  = ($type eq 'app') && ($field eq 'term');
  my $is_anat_term = ($type eq 'app') && ($field eq 'anat_term');

  # First pass (app term only) : map each parent id to the "id ! name" labels
  # of its children, so child lines can be appended in the second pass.
  my %children;
  if ($is_app_term) {
    foreach my $term (@terms) {
      my ($id) = $term =~ m/\nid: (.*?)\n/;
      next unless defined $id;
      my ($name) = $term =~ m/\nname: (.*?)\n/;
      $name = '' unless defined $name;
      my @parents = $term =~ m/is_a: (WBPhenotype:\d+)/g;
      push @parents, $term =~ m/relationship: part_of (WBPhenotype:\d+)/g;
      foreach my $parent (@parents) { $children{$parent}{"$id ! $name"}++; }
    }
  }

  # Second pass : write the name, data and synonym rows of every term.
  foreach my $term (@terms) {
    $term =~ s/\\//g;				# strip \ escaped data
    my ($id) = $term =~ m/\nid: (.*?)\n/;
    next unless defined $id;			# a stanza without an id cannot be keyed
    if ( ($type eq 'mop') && ($field eq 'chebi') ) { $id =~ s/CHEBI://; }
    my ($name) = $term =~ m/\nname: (.*?)\n/;
    $name = '' unless defined $name;
    $name =~ s/\"//g;
    my @syns = $term =~ m/\nsynonym: \"(.*?)\"/g;	# must be read before $term is trimmed
    $term =~ s/^\s+//sg; $term =~ s/\s+$//sg;

    $insert->($name_table, $id, $name);

    if ($is_app_term) {
      $term =~ s/is_a:/parent:/g;
      $term =~ s/relationship: part_of/parent:/g;
      foreach my $child_term (sort keys %{ $children{$id} }) { $term .= "\nchild: $child_term"; }
    } # if ($is_app_term)
    if ($is_anat_term) {
      foreach my $alt_id ($term =~ m/alt_id: (WBbt:\d+)/g) {
        $insert->($name_table, $alt_id, "alt_id for $id");
        $insert->($data_table, $alt_id, "alt_id for $id");
      }
    } # if ($is_anat_term)

    $insert->($data_table, $id, $term);
    foreach my $syn (@syns) { $insert->($syn_table, $id, $syn); }
  } # foreach my $term (@terms)

  # Cache the download so the next run can compare its date header.
  open (my $out, '>', $file_name) or die "Cannot write to $file_name : $!";
  print $out $new_data;
  close ($out) or die "Cannot close $file_name : $!";
} # sub updateData


# Drop and recreate the obo_name_, obo_syn_ and obo_data_ tables for one
# <type>_<field> pair, index each on joinkey, and reset its privileges.
# Uses the file-level $dbh and stores each statement's return value in the
# file-level $result.  Destroys any existing data in those tables.
sub createTable {
  my ($type, $field) = @_;
  my @roles = qw( postgres acedb apache azurebrd cecilia );	# accounts granted full access
  foreach my $kind (qw( name syn data )) {
    my $table = join '_', 'obo', $kind, $type, $field;
    my @statements = (
      "DROP TABLE $table; ",
      "CREATE TABLE $table ( joinkey text, $table text, obo_timestamp timestamp with time zone DEFAULT \"timestamp\"('now'::text) );",
      "CREATE INDEX ${table}_idx ON $table USING btree (joinkey);",
      "REVOKE ALL ON TABLE $table FROM PUBLIC; ",
    );
    foreach my $role (@roles) {
      push @statements, "GRANT ALL ON TABLE $table TO $role; ";
    }
    foreach my $sql (@statements) {
      $result = $dbh->do($sql);
    }
  }
} # sub createTable


__END__

# Scratch query template.  It sits after the __END__ marker, so perl never
# compiles or runs it.  It selects rows from two_comment, removes the literal
# line break embedded in each substitution pattern from the first three
# columns, and prints them tab-separated.
# NOTE(review): the query has a ? placeholder but execute() is given no bind
# value, so it would need one before it could be used.
my $result = $dbh->prepare( "SELECT * FROM two_comment WHERE two_comment ~ ?" );
$result->execute() or die "Cannot prepare statement: $DBI::errstr\n"; 
while (my @row = $result->fetchrow) {
  if ($row[0]) { 
    $row[0] =~ s/
//g;
    $row[1] =~ s/
//g;
    $row[2] =~ s/
//g;
    print "$row[0]\t$row[1]\t$row[2]\n";
  } # if ($row[0])
} # while (@row = $result->fetchrow)

__END__

Updated