#!/usr/bin/perl

# randCompareMeans.pl
#
# Using the per-topic average precision scores for 2 or more runs in the
# search.results.table, perform a partial randomization test on each 
# pair of runs. As implemented this is a one-tailed test.
# 
# For background and details see the literature, e.g.:
#
# - Manly, Bryan F.J. 1997. Randomization, Bootstrap and Monte Carlo 
#   Methods in Biology, Chapman & Hall
#
# - Kempthorne, O. and Doerfler, T.E. The Behaviour of Some Significance 
#   Tests Under Experimental Randomization. Biometrika, Vol 56, No.2 
#   (Aug.,1969) 231 - 238.
#
#   Romano, Joseph. P. On the Behavior of Randomization Tests Without a
#   Group Invariance Assumption. Journal of the American Statistical
#   Association, Vol. 85, No. 411 (Sep. 1990), pp. 686-692.
#
# This software was produced by NIST, an agency of the U.S. government, 
# and by statute is not subject to copyright in the United States. Recipients 
# of this software assume all responsibilities associated with its operation, 
# modification and maintenance.
#
# Version change log ----------------------------------------------------------------
# 1.0
#
# 1.1 Put run first in each output line and sort by first run id.
#
# 1.2 Add estimate of error of p-values
#----------------------------------------------------------------------------

# Arguments
$numIterations    = $ARGV[0];
$sigLevel         = $ARGV[1];
$resultsFile      = $ARGV[2];
$runSubstring1    = $ARGV[3]; # Used to filter runs based on run id or type prefix 
$runSubstring2    = $ARGV[4]; 
$runSubstring3    = $ARGV[5]; 
$runSubstring4    = $ARGV[6]; 

if ($#ARGV < 1)
{
    print "Usage randCompareMeans.pl number-of-iterations significance-level results-file run-name-substring1 run-name-substring2 run-name-substring3 run-name-substring4\n";
    exit;
}

@resultsList = (); # Holds list of output lines - one for each comparison output, then sorted

%resultsHash = (); # Holds for each run a count of number of inferior runs

open RESULTS, $resultsFile || die "couldn't open \"$resultsFile\"";

$lineNum = -1;

# If the line from the results table meets the run name criteria
# passed on invocation, then save the line in the lines array
# Each line starts with the run id followed by the scores
while ($line = <RESULTS>) {
    if ($line =~ /_/ 
	&& $line =~ /$runSubstring1/ 
	&& $line =~ /$runSubstring2/ 
	&& $line =~ /$runSubstring3/ 
	&& $line =~ /$runSubstring4/ )
    {
	chomp $line;
	$lineNum++;
	$lines[$lineNum]=$line;
    }
}

close RESULTS || die "Can't close \"$resultsFile\"";

if ($lineNum == -1)
{
    print "No matching runs found\n";
    exit;
}
elsif ($lineNum == 0)
{
    (undef,$runname,undef) = split /\s+/,$lines[$lineNum];
    print "No comparisons possible - only one matching run found: $runname\n";
    exit;
}

# Invoke the comparison routine for each pair
# of lines, i.e., runs
for ($m=0;$m<=$#lines;$m++)
{
  for ($n=$m+1;$n<=$#lines;$n++)
  {
      calcRandComparison($lines[$m],$lines[$n]);
  }  
}

@resultsList = sort @resultsList;

foreach $item (@resultsList)
{
    print "$item";
}

# Estimate the standard error of the p-values after Efron and Tinshirani 1998
$pError = $sigLevel * (    (  ( 1-($sigLevel) ) / ($sigLevel)  )/$numIterations   )**0.5;

# Print arguments, etc
printf "\nTarget iterations: $numIterations\n";
printf "significance level: $sigLevel\n";
printf "estimated error of p values: + or - %6.5f\n",$pError; 
printf "scores file: $resultsFile\n";
printf "run substring 1: $runSubstring1\n";
printf "run substring 2: $runSubstring2\n";
printf "run substring 3: $runSubstring3\n";
printf "run substring 4: $runSubstring4\n\n";

# For each run, print number of runs it is significantly
# better than according to the current test
printf "Number of runs each run is significantly better than
according to current test:\n";
for ($i=$#lines;$i>=0;$i--)
{
    foreach $key (keys %resultsHash)
    {
	if ($resultsHash{$key} == $i)
	{
	        printf "%3d   %s\n",$resultsHash{$key},$key;
	}
    }
}



#--------------------------------------------------------
# Subroutine to make randomization comparison of two runs
#--------------------------------------------------------
sub calcRandComparison 
{
    my ($run1,$run2) = @_;

# Given two runs (arrays of run name followed by per-topic 
# scores), take the observed per-topic average precision 
# scores, calculate the observed mean (mean average precision:
# MAP) for each run, and then calculate the observed difference
# in means (mean for run1 - means for run2).
#
# For a large number times (>=10000):
#   In effect, interchange the paired scores for a random 
#   subset of topics
#   Recalculate the difference in means for the two runs. 
#   Store this difference as part of a distribution under
#   the null hypothesis (H0).
#
# Calculate the fraction of the values in the H0 
# distribution that is >= (for positive values) or <= (for
# negative values) the observed difference in means. 

# This fraction estimates p: the probability of getting the 
# observed difference due to chance - in other words the 
# strength of the null hypothesis.

    my @run1scores = "";
    my @run2scores = "";

    my $c = 0;

    my @run1 = split /\s+/,$run1;
    my @run2 = split /\s+/,$run2;

#   Remove of run names
    shift @run1;
    my $run1name = shift @run1;

    shift @run2;
    my $run2name = shift @run2;

    my $numTopics = $#run1+1;

#   Avoid comparison of run to self
    if ($run1name eq $run2name)
    {
	return 0; 
    }

#   Calculate average precision differences (run - run2) for observed data 
    for ($i=0;$i<=$numTopics-1;$i++)
    {
	$diffAvgp[$i] = $run1[$i] - $run2[$i];
    }

#   Calculate the difference in means for the observed data
    my $t=0;
    foreach $val (@diffAvgp)
    {
	$t+=$val;
    }

    my $observedDiffMeans = $t/$numTopics;

 
#   Calculate randomized difference in means numIterations times (NOTE you could 
#   just as well look at the difference in summed average precision to save 
#   computation but that number is less familiar)

    my %randHash = ();
    my $numRandomizations = 0;

    for ($j=1;$j<=$numIterations;$j++)
    {
	$t=0;
	$randomizationString = "";
	foreach $val (@diffAvgp)
	{
	    if (rand >=.5)
	    {
		$t+=$val;
		$randomizationString .="1";
	    }
	    else
	    {
		$t-=$val;
		$randomizationString .="0";
	    }
	}

        # If this randomization is as yet unseen, then use it in the
        # distribution under the null hypothesis (H0)
	if (!exists $randHash{$randomizationString})
	{
	    $generatedDiffMeans = $t/$numTopics; 
	    #print "$generatedDiffMeans\n";
	    $randHash{$randomizationString} = 1;

	    if ($observedDiffMeans >= 0 && $generatedDiffMeans >= $observedDiffMeans)
	    {
		$c++;  # count instance of H0 distribution containing difference
                       # in means equal to or more extreme then observed difference
	    }
	    elsif ($observedDiffMeans < 0 && $generatedDiffMeans <= $observedDiffMeans)
	    {
		$c++;
	    }
	    $numRandomizations++;
	}
    }

#   Calculate frequency (probability) that observed difference in means
#   or one more extreme occurs in constructed distribution under the 
#   null hypothesis

    my $p = $c/$numRandomizations;

    # Initialize hash containing number of runs "inferior" to run1name
    if (!exists $resultsHash{$run1name})
    {
	$resultsHash{$run1name} = 0;
    }
    if (!exists $resultsHash{$run2name})
    {
	$resultsHash{$run2name} = 0;
    }

    if ($p <= $sigLevel)
    {
	if ($observedDiffMeans > 0) # run 1 better than run 2
	{
	    $result = " > ";
	    $resultsHash{$run1name}++;
	    printResultLine($p,$c,$numRandomizations,$observedDiffMeans,$run1name,
			    $result,$run2name); 
	}
	else                      # run 2 better than run 1
	{
	    $result = " > ";
	    $resultsHash{$run2name}++;
	    printResultLine($p,$c,$numRandomizations,-$observedDiffMeans,$run2name,
			    $result,$run1name);
	}

    }
    else                          # runs have same scores
    {
	# return without printing a result line
    }

    return $p;
}


#-------------------------------------
# Subroutine to make print result line
#-------------------------------------
#   Print test result for current run pair:
#    name of run 1
#    comparison result (>)
#    name of run 2
#    probability observed difference due to to chance,
#    count of generated differences equal or more extreme than observed,
#    total number of differences generated under the null hypothesis,
#    observed difference in the means

sub printResultLine
{
    my ($p,$c,$numRandomizations,$observedDiff,$run1Name,$result,$run2Name) = @_;
    $line = sprintf " %-35s %s %-35s %6.3f %6d %6d %6.3f\n",
    $run1Name,$result,$run2Name,$p,$c,$numRandomizations,$observedDiff,;
    push @resultsList, ($line);
}
