#!/usr/bin/perl -w

# parser.pl
# derives probability vectors from raw voting records
# 1st-2nd October 2002

# 19th June 2005
# amended for updated records
# also previous results were skewed a little by
# duplication of year 1979 into 1980

# 11th July 2005
# takes basic voting records
# and simulates a contest based on the previous form.
# uses code from randcontest.pl series

# 20th July 2005
# tests for significance of total pattern

# 18th August 2005
# simplified and algorithm made more accurate

use strict;
use POSIX;
srand();

unless($ARGV[2]) { die "\nUsage: predictor.pl START_YEAR END_YEAR sig."; }
my $start = $ARGV[0];  # eg 1975
my $end = $ARGV[1];  # eg 1981
my $sig = $ARGV[2];  # eg 0.05

open INFILE, "total_summary.txt" or die "Hello Stockholm, can you hear me?";
open OUTFILE, ">significance.xls" or die "Greece votes 12 points to Cyprus";
open PAIRFILE, ">pairs.xls" or die "Greece votes 12 points to Cyprus";

$/ = "//";
my %data;  # the basic data structure
my %total; # stores totals derived from %data
my %num_years; # number of years of votes from donor to recipient
my %average; # the average score over the form period
my @years;

# now list the 44 countries that have ever participated
my @judges =    qw(LA	IS YU GB NL	AT DK	PR AD	MR SE	MO LI	IT CH	HR BU	ES SL	FR CY IE AL	BE EE	BY NO	RU TR	MK GR	DE FI	LU BH	RO PL	CS MC	IL UA	SK HU	ML);
my @countries = qw(LA	IS YU GB NL	AT DK	PR AD	MR SE	MO LI	IT CH	HR BU	ES SL	FR CY IE AL	BE EE	BY NO	RU TR	MK GR	DE FI	LU BH	RO PL	CS MC	IL UA	SK HU	ML);
# and the number of other voting participants per year (ie. total - 1)
my %contests = qw(
1975 18
1976 17
1977 17
1978 19
1979 18
1980 18
1981 19
1982 17
1983 19
1984 18
1985 18
1986 19
1987 21
1988 20
1989 21
1990 21
1991 21
1992 22
1993 24
1994 24
1995 22
1996 22
1997 24
1998 24
1999 22
2000 23
2001 22
2002 23
2003 25
2004 35
2005 38);

my %points = (
    0, 12,     # as rand() will very rarely give absolute zero, which ceil can't handle
		1, 12,
		2, 10,
  	3, 8,
		4, 7,
		5, 6,
		6, 5,
		7, 4,
		8, 3,
		9, 2,
		10, 1);

print PAIRFILE "pair\tyears\treal average\t".(100*$sig)."% random threshold";
# the next block stokes up the data matrix
while() # each record is a block of votes from a single country
{
	chomp;
	my @results = split "\n", $_;  # divides the donor block into recipient lines
	my @votes;   # hold list of votes for each song from each judge per year
	my @years; # holds list of countries for each year
#	my $year;
	foreach(@results)
	{
    my $av_sim = 0;   # the average for each "DON to REC" simulation
		if(/(\d{4}(semi)?)/)  # identify the year title line in the record
		{
      print OUTFILE "\n$_\ttotal years\ttotal real votes\taverage real votes\ttop rand\tav. rand\t".(100*$sig)."% cutoff";
			@years = split /\t/, $_;
		}
		elsif(/^([A-Z]{2})\sto\s([A-Z]{2})/) # eg LA to EE
		{
      print OUTFILE "\n$1 to $2";
      print "\n$1 to $2";
      my $donor = $1;
      my $recipient = $2;
      $num_years{$donor}{$recipient} = 0;
      $total{$donor}{$recipient} = 0;
      $average{$donor}{$recipient} = 0;
			chomp;
			@votes = split /\t/, $_;        # $votes[0] is the country name
			my $start_pos = $#votes-2005+$start;
			my $end_pos = $#votes-2005+$end;
			my $together = 0; # counts the years a country has voted for another
      for(my $x=$start_pos; $x<=$end_pos; $x++)  # up to now
			{
        if($votes[$x] =~ /\d+/)
        {
          $together++;   # find out how many times to run the random vote per simulation run
        }
      }
      print OUTFILE " voted on $together occasions";
      my @sim_list; # holds the array of sim results from which 1% threshold will be taken
      if($together > 0) # only simulate if worthwhile
      {
        for(my $sim=1;$sim<=100000;$sim++)  # 1000 simulated situations
        {
          my $average = 0; # the average of the year, one for each simulation
          for(my $x=$start_pos; $x<=$end_pos; $x++) # won't run at all if together is zero
          {
            if($sim==1) # only total real votes once
            {
              print OUTFILE "\t$votes[$x]";
              $data{$donor}{$recipient}{$years[$x]} = $votes[$x];
            }
            if($votes[$x]=~/[\.\d]+/) # avoid trying to add "N/A"
				    {
              my $place = rand()*$contests{$years[$x]}; # rand is multiplied by the number of other contestants that year
#            print "\t$place\t".ceil($place);
              if(ceil($place)<=10)   # only the top 10 get points
              {
                $average += $points{ceil($place)};  # add it to the average random score
              }
              if($sim==1 ) # only total real votes once
              {
                $total{$donor}{$recipient} += $votes[$x];  # don't add N/A
                $num_years{$donor}{$recipient}++;       # only count if not N/A
              }
            }  #    end of action in scoring year
          }    # end of run through years
          if($num_years{$donor}{$recipient} > 0)
          {
            $average/=$num_years{$donor}{$recipient};
          }
          push @sim_list, $average; # add average of year runs in to the sim list
          $av_sim += $average; # the average of the averages
        } # end of 1000 simulations
      }
      my @sort_sim = sort {$b <=> $a} @sim_list;
      if(@sort_sim > 0)
      {
        $av_sim /= @sort_sim;
      }
      my $one_per_cent = $sig*@sort_sim;
      print OUTFILE "\t$num_years{$donor}{$recipient}\t$total{$donor}{$recipient}";
      if($num_years{$donor}{$recipient} > 0)
      {
        $average{$donor}{$recipient} = $total{$donor}{$recipient}/$num_years{$donor}{$recipient};   # the average of the real scores
        print OUTFILE "\t$average{$donor}{$recipient}\t$sort_sim[0]\t$av_sim\t$sort_sim[$one_per_cent]";
        if($average{$donor}{$recipient} > $sort_sim[$one_per_cent]) # if significant
        {
          print PAIRFILE "\n$1 to $2\t$num_years{$donor}{$recipient}\t$average{$donor}{$recipient}\t$sort_sim[$one_per_cent]";
        }
      }
      else { print OUTFILE "\tdonor never had chance to vote for recipient"; }
		}  # end of donor-recipient pair
	}  # end of // separated donor block
}   # finished parsing infile