#!/usr/bin/perl
# randCompareMeans.pl
#
# Using the per-topic average precision scores for 2 or more runs in the
# search.results.table, perform a partial randomization test on each
# pair of runs. As implemented this is a one-tailed test.
#
# For background and details see the literature, e.g.:
#
# - Manly, Bryan F.J. 1997. Randomization, Bootstrap and Monte Carlo
# Methods in Biology, Chapman & Hall
#
# - Kempthorne, O. and Doerfler, T.E. The Behaviour of Some Significance
# Tests Under Experimental Randomization. Biometrika, Vol 56, No.2
# (Aug.,1969) 231 - 238.
#
# - Romano, Joseph P. On the Behavior of Randomization Tests Without a
# Group Invariance Assumption. Journal of the American Statistical
# Association, Vol. 85, No. 411 (Sep. 1990), pp. 686-692.
#
# This software was produced by NIST, an agency of the U.S. government,
# and by statute is not subject to copyright in the United States. Recipients
# of this software assume all responsibilities associated with its operation,
# modification and maintenance.
#
# Version change log ----------------------------------------------------------------
# 1.0
#
# 1.1 Put run first in each output line and sort by first run id.
#
# 1.2 Add estimate of error of p-values
#----------------------------------------------------------------------------
# Arguments
$numIterations = $ARGV[0];  # randomizations attempted per run pair
$sigLevel = $ARGV[1];       # significance level for reporting a pair
$resultsFile = $ARGV[2];    # results table: run id followed by per-topic scores
$runSubstring1 = $ARGV[3];  # Used to filter runs based on run id or type prefix
$runSubstring2 = $ARGV[4];
$runSubstring3 = $ARGV[5];
$runSubstring4 = $ARGV[6];
# Require the three mandatory arguments (iterations, level, results file);
# the run-name substrings are optional filters.
if ($#ARGV < 2)
{
    print "Usage randCompareMeans.pl number-of-iterations significance-level results-file run-name-substring1 run-name-substring2 run-name-substring3 run-name-substring4\n";
    exit;
}
@resultsList = (); # Holds list of output lines - one for each comparison output, then sorted
%resultsHash = (); # Holds for each run a count of number of inferior runs
# Keep only the filter substrings that were actually supplied. Interpolating
# an undef substring gives the empty pattern //, which in Perl silently
# reuses the last successful pattern rather than matching everything.
@runFilters = grep { defined($_) && length($_) }
    ($runSubstring1, $runSubstring2, $runSubstring3, $runSubstring4);
# Three-argument open, checked with low-precedence "or": the former
# "open FH, $file || die" bound || to the filename, so it could never die.
open(RESULTS, '<', $resultsFile) or die "couldn't open \"$resultsFile\"";
$lineNum = -1;
# If the line from the results table meets the run name criteria
# passed on invocation, then save the line in the lines array.
# Each line starts with the run id followed by the scores.
while ($line = <RESULTS>)
{
    $matches = ($line =~ /_/);
    foreach $filter (@runFilters)
    {
        last unless $matches;
        $matches = ($line =~ /$filter/);
    }
    if ($matches)
    {
        chomp $line;
        $lineNum++;
        $lines[$lineNum] = $line;
    }
}
close(RESULTS) or die "Can't close \"$resultsFile\"";
if ($lineNum == -1)
{
    print "No matching runs found\n";
    exit;
}
elsif ($lineNum == 0)
{
    # Exactly one matching run: report its id and stop
    (undef,$runname,undef) = split /\s+/,$lines[$lineNum];
    print "No comparisons possible - only one matching run found: $runname\n";
    exit;
}
# Invoke the comparison routine for each pair of lines, i.e., runs
for ($m=0;$m<=$#lines;$m++)
{
    for ($n=$m+1;$n<=$#lines;$n++)
    {
        calcRandComparison($lines[$m],$lines[$n]);
    }
}
@resultsList = sort @resultsList;
foreach $item (@resultsList)
{
    print "$item";
}
# Estimate the standard error of the p-values after Efron and Tibshirani 1993;
# algebraically this is the binomial standard error
# sqrt( sigLevel*(1-sigLevel) / numIterations )
$pError = $sigLevel * ( ( ( 1-($sigLevel) ) / ($sigLevel) )/$numIterations )**0.5;
# Print arguments, etc
printf "\nTarget iterations: $numIterations\n";
printf "significance level: $sigLevel\n";
printf "estimated error of p values: + or - %6.5f\n",$pError;
printf "scores file: $resultsFile\n";
printf "run substring 1: $runSubstring1\n";
printf "run substring 2: $runSubstring2\n";
printf "run substring 3: $runSubstring3\n";
printf "run substring 4: $runSubstring4\n\n";
# For each run, print number of runs it is significantly
# better than according to the current test
printf "Number of runs each run is significantly better than
according to current test:\n";
for ($i=$#lines;$i>=0;$i--)
{
    foreach $key (keys %resultsHash)
    {
        if ($resultsHash{$key} == $i)
        {
            printf "%3d %s\n",$resultsHash{$key},$key;
        }
    }
}
#--------------------------------------------------------
# Subroutine to make randomization comparison of two runs
#--------------------------------------------------------
sub calcRandComparison
{
    # Compare two runs with a partial (Monte Carlo) randomization test.
    #
    # Arguments: $run1, $run2 - whitespace-separated lines, each holding a
    # run id followed by that run's per-topic average precision scores.
    #
    # Take the observed per-topic average precision scores, calculate the
    # observed mean (mean average precision: MAP) for each run, then the
    # observed difference in means (mean for run1 - mean for run2).
    #
    # For up to $numIterations times: in effect interchange the paired
    # scores for a random subset of topics and recalculate the difference
    # in means. Each previously-unseen sign assignment contributes one
    # value to the distribution under the null hypothesis (H0).
    #
    # The fraction of H0 values that is >= (for positive observed
    # differences) or <= (for negative ones) the observed difference
    # estimates p: the probability of getting the observed difference due
    # to chance - in other words the strength of the null hypothesis.
    #
    # Side effects: updates global %resultsHash and (via printResultLine)
    # global @resultsList when p <= $sigLevel.
    # Returns p; 0 for a self-comparison; undef if no randomizations ran.
    my ($run1,$run2) = @_;
    my $c = 0;      # count of H0 values at least as extreme as observed
    my @run1 = split /\s+/,$run1;
    my @run2 = split /\s+/,$run2;
    # Remove the leading field and the run name from each score list.
    # NOTE(review): the extra shift assumes split produced an empty leading
    # field (i.e. table lines start with whitespace) - confirm table format.
    shift @run1;
    my $run1name = shift @run1;
    shift @run2;
    my $run2name = shift @run2;
    my $numTopics = $#run1+1;
    # Avoid comparison of run to self
    if ($run1name eq $run2name)
    {
        return 0;
    }
    # Per-topic average precision differences (run1 - run2), observed data
    my @diffAvgp;
    for (my $i=0;$i<=$numTopics-1;$i++)
    {
        $diffAvgp[$i] = $run1[$i] - $run2[$i];
    }
    # Difference in means for the observed data
    my $t=0;
    foreach my $val (@diffAvgp)
    {
        $t+=$val;
    }
    my $observedDiffMeans = $t/$numTopics;
    # Calculate randomized difference in means numIterations times (NOTE you
    # could just as well look at the difference in summed average precision
    # to save computation but that number is less familiar)
    my %randHash = ();         # sign patterns already seen
    my $numRandomizations = 0; # number of distinct patterns used for H0
    for (my $j=1;$j<=$numIterations;$j++)
    {
        $t=0;
        my $randomizationString = "";
        foreach my $val (@diffAvgp)
        {
            # Fair coin per topic: keep (+) or interchange (-) the pair
            if (rand >=.5)
            {
                $t+=$val;
                $randomizationString .="1";
            }
            else
            {
                $t-=$val;
                $randomizationString .="0";
            }
        }
        # If this randomization is as yet unseen, then use it in the
        # distribution under the null hypothesis (H0)
        if (!exists $randHash{$randomizationString})
        {
            my $generatedDiffMeans = $t/$numTopics;
            $randHash{$randomizationString} = 1;
            if ($observedDiffMeans >= 0 && $generatedDiffMeans >= $observedDiffMeans)
            {
                $c++; # H0 difference in means equal to or more extreme
                      # than the observed difference
            }
            elsif ($observedDiffMeans < 0 && $generatedDiffMeans <= $observedDiffMeans)
            {
                $c++;
            }
            $numRandomizations++;
        }
    }
    # Guard against division by zero when $numIterations < 1
    return undef if $numRandomizations == 0;
    # Frequency (probability) that the observed difference in means or one
    # more extreme occurs in the constructed distribution under H0
    my $p = $c/$numRandomizations;
    # Initialize hash containing number of runs "inferior" to each run
    if (!exists $resultsHash{$run1name})
    {
        $resultsHash{$run1name} = 0;
    }
    if (!exists $resultsHash{$run2name})
    {
        $resultsHash{$run2name} = 0;
    }
    if ($p <= $sigLevel)
    {
        if ($observedDiffMeans > 0) # run 1 better than run 2
        {
            $resultsHash{$run1name}++;
            printResultLine($p,$c,$numRandomizations,$observedDiffMeans,
                            $run1name," > ",$run2name);
        }
        else # run 2 better than run 1
        {
            $resultsHash{$run2name}++;
            printResultLine($p,$c,$numRandomizations,-$observedDiffMeans,
                            $run2name," > ",$run1name);
        }
    }
    # Otherwise the difference is not significant: no result line is printed
    return $p;
}
#-------------------------------------
# Subroutine to make print result line
#-------------------------------------
# Print test result for current run pair:
# name of run 1
# comparison result (>)
# name of run 2
# probability observed difference due to to chance,
# count of generated differences equal or more extreme than observed,
# total number of differences generated under the null hypothesis,
# observed difference in the means
sub printResultLine
{
    # Format one significant comparison and append it to the global
    # @resultsList (sorted and printed later by the main program).
    # Arguments:
    #   $p                 - estimated probability difference is due to chance
    #   $c                 - count of generated differences equal to or more
    #                        extreme than observed
    #   $numRandomizations - total differences generated under H0
    #   $observedDiff      - observed difference in the means (passed positive)
    #   $run1Name          - the better run
    #   $result            - comparison symbol (" > ")
    #   $run2Name          - the worse run
    my ($p,$c,$numRandomizations,$observedDiff,$run1Name,$result,$run2Name) = @_;
    # Lexical $line (was an unscoped global); stray trailing comma removed
    my $line = sprintf " %-35s %s %-35s %6.3f %6d %6d %6.3f\n",
        $run1Name,$result,$run2Name,$p,$c,$numRandomizations,$observedDiff;
    push @resultsList, $line;
}