1. Shlomi Fish
  2. perl-Statistics-Descriptive

Commits

shawn_laffan  committed 81f0a5d

Modify methods in Statistics::Descriptive::Full based on code profiling and benchmarking.
These make a considerable difference when thousands of stats objects are generated and populated.
The main changes are:
1. Avoid needless calculation of mindex and maxdex. A search of CPAN indicated nobody uses them, so there is no need to calculate them every time data are added.
2. Use List::Util and List::MoreUtil subs to process some of the data.
3. Only clear the cached keys when we have actually cached something.

  • Participants
  • Parent commits 9dcd4a7
  • Branches default

Comments (0)

Files changed (1)

File Statistics-Descriptive/lib/Statistics/Descriptive.pm

View file
     return 1;
 }
 
+
 sub standard_deviation {
   my $self = shift;  ##Myself
   return undef if (!$self->count());
 
 @ISA = qw(Statistics::Descriptive::Sparse);
 
+use List::MoreUtils;
+use List::Util;
+
 ##Create a list of fields not to remove when data is updated
 %fields = (
   _permitted => undef,  ##Place holder for the inherited key hash
 sub _delete_all_cached_keys
 {
     my $self = shift;
+    
+    my %keys = %{ $self };
+
+    # Remove reserved keys for this class from the deletion list
+    delete @keys{keys %{$self->_reserved}};
+    delete @keys{keys %{$self->_permitted}};
+    delete $keys{_trimmed_mean_cache};
 
     KEYS_LOOP:
-    foreach my $key (keys %{ $self }) { # Check each key in the object
-        # If it's a reserved key for this class, keep it
-        if ($self->_is_reserved($key) || $self->_is_permitted($key))
-        {
-            next KEYS_LOOP;
-        }
-        delete $self->{$key};          # Delete the out of date cached key
+    foreach my $key (keys %keys) { # Check each key in the object
+        delete $self->{$key};  # Delete any out of date cached key
     }
+    $self->{_trimmed_mean_cache} = {};  #  just reset this one
     return;
 }
 
 }
 
 sub add_data {
-  my $self = shift;
-  my $aref;
+    my $self = shift;  ##Myself
+  
+    my $aref;
 
-  if (ref $_[0] eq 'ARRAY') {
-    $aref = $_[0];
-  }
-  else {
-    $aref = \@_;
-  }
-  $self->SUPER::add_data($aref);  ##Perform base statistics on the data
-  push @{ $self->_data() }, @{ $aref };
-  ##Clear the presorted flag
-  $self->presorted(0);
+    if (ref $_[0] eq 'ARRAY') {
+      $aref = $_[0];
+    }
+    else {
+      $aref = \@_;
+    }
+  
+    ##If we were given no data, we do nothing.
+    return 1 if (!@{ $aref });
+  
+    my $oldmean;
+    my ($min, $max, $sum, $sumsq);
+    my $count = $self->count;
 
-  $self->_delete_all_cached_keys();
+    #  $count is modified lower down, but we need this flag after that
+    my $has_existing_data = $count;  
 
-  return 1;
+    # Take care of appending to an existing data set
+    if ($has_existing_data) {
+        $min   = $self->min();
+        $max   = $self->max();
+        $sum   = $self->sum();
+        $sumsq = $self->sumsq();
+    }
+    else {
+        $min   = $aref->[0];
+        $max   = $aref->[0];
+        $sum   = 0;
+        $sumsq = 0;
+    }
+
+    #  need to allow for already having data
+    $sum    += List::Util::sum (@$aref);
+    $sumsq  += List::Util::sum (map {$_ ** 2} @$aref);
+    $max    =  List::Util::max ($max, @$aref);
+    $min    =  List::Util::min ($min, @$aref);
+    $count  +=  scalar @$aref;
+    my $mean = $sum / $count;
+
+    $self->min($min);
+    $self->max($max);
+    $self->sample_range($max - $min);
+    $self->sum($sum);
+    $self->sumsq($sumsq);
+    $self->mean($mean);
+    $self->count($count);
+
+    ##Variance isn't commonly enough
+    ##used to recompute every single data add, so just clear its cache.
+    #$self->_variance(undef);
+    $self->{variance} = undef;  #  Dirty approach, as above
+    
+    push @{ $self->_data() }, @{ $aref };
+
+    #  no need to clear keys if we are a newly populated object,
+    #  and profiling shows it takes a long time when creating
+    #  and populating many stats objects
+    if ($has_existing_data) {
+        ##Clear the presorted flag
+        $self->presorted(0);
+        $self->_delete_all_cached_keys();
+    }
+  
+    return 1;
 }
 
+
 sub add_data_with_samples {
     my ($self,$aref_values) = @_;
 
     $self->{_smoother}->get_smoothed_data();
 }
 
+sub maxdex {
+    my $self = shift;
+
+    return undef if !$self->count;
+    my $maxdex;
+
+    if ($self->presorted) {
+        $maxdex = $self->count - 1;
+    }
+    else {
+        my $max = $self->max;
+        $maxdex =  List::MoreUtils::first_index {$_ == $max} $self->get_data;
+    }
+
+    $self->{maxdex} = $maxdex;
+
+    return $maxdex;
+}
+
+sub mindex {
+    my $self = shift;
+
+    return undef if !$self->count;
+    #my $maxdex = $self->{maxdex};
+    #return $maxdex if defined $maxdex;
+    my $mindex;
+
+    if ($self->presorted) {
+        $mindex = 0;
+    }
+    else {
+        my $min = $self->min;
+        $mindex = List::MoreUtils::first_index {$_ == $min} $self->get_data;
+    }
+
+    $self->{mindex} = $mindex;
+
+    return $mindex;
+}
+
 sub sort_data {
   my $self = shift;
 
       ##Sort the data in descending order
       $self->_data([ sort {$a <=> $b} @{$self->_data()} ]);
       $self->presorted(1);
-      ##Fix the maxima and minima indices
-      $self->mindex(0);
-      $self->maxdex($#{$self->_data()});
+      ##Fix the maxima and minima indices - no, this is unnecessary now we have methods
+      #$self->mindex(0);
+      #$self->maxdex($#{$self->_data()});
   }
 
   return 1;