#!/usr/bin/perl

# The purpose of this program is to take TREC documents and
# add a tag for each sentence. The code for determining
# sentence breaks is based on breakSents.pl

use SDBM_File;

@ARGV > 0 or die"Usage: tagSents.pl file1 ...\n";

if ($ARGV[1] eq "d") {$debug = 1} else {$debug = 0;}

$dtdpath = "/indexes/TREC10_QA_Track_Index_256/";

#Abbreviations
my %abbrevs = ();
dbmopen(%abbrevs, "/nlpir/duc/duc2001/data/abbrevs", 0666) or die "Can't open abbreviations diskhash /nlpir/duc/duc2001/data/abbrevs: $!\n";

#Proper nouns
my %pnouns = ();
dbmopen(%pnouns, "/nlpir/duc/duc2002/data/pnouns", 0666) or die "Can't open proper nouns diskhash /nlpir/duc/duc2002/data/pnouns: $!\n";

for my $infile (@ARGV) {

    ($outfile = `/bin/mktemp /tmp/toSEE.XXXXXX`) or
        die "Couldn't create temporary output file: $!\n";
    chomp $outfile;
    open(OUT, ">$outfile") || die "Cannot open $outfile: $!\n";

    $lastBackSlashIndex = rindex $infile, "/";
    $infilePrefix = substr($infile,$lastBackSlashIndex+1,2);

    if (substr($infile,length($infile)-4) eq ".csh") {next};

    # Choose dtd name base on file name
    if    ($infilePrefix eq "AP") {$dtd = "ap.dtd"}
    elsif ($infilePrefix eq "FB") {$dtd = "fbis.dtd"}
    elsif ($infilePrefix eq "FT") {$dtd = "ft.dtd"}
    elsif ($infilePrefix eq "LA") {$dtd = "latimes.dtd"}
    elsif ($infilePrefix eq "SJ") {$dtd = "sjm.dtd"}
    elsif ($infilePrefix eq "WS") {$dtd = "wsj.dtd"}
    else {}

    # Invoke nsgmls using chosen dtd
    open NSGMLSOUT, "nsgmls $dtdpath$dtd $infile|" or die "Can't read from nsgmls ipipe: $!\n";
    while ($line = <NSGMLSOUT>)
    {
	chomp $line;
	$c1 = substr($line,0,1);
	if ($c1 eq "(")
	{
	    if ($line =~ m/\(DOCNO/)
	    {
		$lookingForDocno = 1;
		$n=0; # reset sentence counter
	    }
		    
	    print  OUT "<".substr($line,1).">\n";
	}
	elsif ($c1 eq ")")
	{
	    print  OUT "</".substr($line,1).">\n";
	}
	elsif ($c1 eq "-")
	{
	    if ($lookingForDocno)
	    {
		(undef,$docno,undef) = split /\s+/,$line;
		$lookingForDocno = 0;
	    }
	    # Undo nsgmls's changes
	    $data = substr($line,1); # remove leading "-"
	    $data =~ s/\\n/ /g;     # change slashn to blank

	    # Sentence separate the data
	    print  OUT &markBreaks($data);

	}
	else {}
    }
    close NSGMLSOUT;
    
    close OUT;

    if ($infile !~ /\.S$/) {
        unlink $infile;
        $infile .= ".S";
    }

    (system(("/bin/mv", $outfile, $infile)) == 0)
        or die "Couldn't move $outfile to $infile: $!\n";
}



sub markBreaks {

    my($text) = @_;
    my $t ="";

    # move period, exclamation/question mark after following quote mark
    # and separate with blank. 
    $text =~ s/([\.\!\?]+)([\"\'\)]+) /$2$1 /g;

    # w+{ becomes w+ {   - peculiarity of some newspaper data
    #$text =~ s/(\w+)({)/$1 $2 /g;
    $text =~ s/(\w+)({)/$1/g;

    # remove ; following . ? ! (SJMN peculiarity)
    $text =~ s/([\.\!\?])( *;)/$1/g;
	
    # a series of whitespace chars becomes a space	     
    $text =~ s/\s+/ /g;

    # insert a space before each comma
    $text =~ s/,/ ,/g;

    # this loop handles periods and ellipsis as well as question and 
    # exclamation marks - finding and marking each sentence-ending 
    # instance by inserting an end-of-sentence marker (\n)
    # - $1 has ? to minimize its matching so $2 can match maxmimally
    # and recognize ...
    # - $3 needs to be able to contain / end with punctuation e.g.,
    # an abbreviation starting the next sentence

    while ($text =~ / (\S+?)(\.\.\.|\.|\?|\!) +(\S+)( .+)$/) {
       my $pre = $1; 
       my $delim = $2;
       my $post = $3; 
       my $rest = $4;
       my $skipped = substr($text,0,length($text)-1-length($1.$2.$3.$4));

       $fullpost = $post;
       if (substr($post,-1) eq ".") {
	   chop $post;
       }

       if ($debug) 
       {
	   print "TEXT+[$text]\n";
	   print "\nSKI=[$skipped]\nPRE=[$pre]\nDELIM=[$delim]\nPOS=[$post]\nRES=[$rest]\n";
	   if ($pre =~ /^\w+\.\w+/) {print "$pre WITHPERIOD\n";}
	   if ($abbrevs{$pre}) {print "$pre ABBREVIATION\n";}
	   if ($pnouns{$post}) {print "$post PROPER\n";}
       }
       # if the word before the delimiter can legitimately precede
       # the delimiter and the word after the delimiter
       # is usually capitalized or is lowercase then mark the
       # period/ellipsis as NOT ending a sentence; otherwise mark
       # it as ending a sentence.
       if ( 
	    (
	     $pre =~ /^\w+\.\w+/ || $abbrevs{$pre} || 
             $delim eq "..."     || $delim eq "?"  || $delim eq "!"
	    ) 
            &&       
	    ($pnouns{$post} || $post =~ /^[a-zß-ÿ0-9,;:\-\.]/)     
	  )    
       {  
	   # C o n t i n u e   c u r r e n t   s e n t e n c e
	   $t .= $skipped.$pre."$delim ";
	   $restoredspaces = " ";
       }
       else 
       {
	   # M a k e   t h i s   a   s e n t e n c e - f i n a l   p e r i o d
	   $t .= $skipped.$pre.$delim."\n";
	   # lack of in initial space prevents a sentence initial abbrev.
	   # from being interpreted as sentence-terminating - we want this
	   # though it means one word sentences will be concatenated with
	   # the following.
	   $restoredspaces = ""; 
       }
       $text = $restoredspaces.$fullpost.$rest;
   }# endwhile

   $text = $t . $text;

   $text =~ s/\. *$/ ./; # final period will be followed by exactly 1 space
   $text =~ s/ +/ /g; # multiple spaces become one
   $text =~ s/^ //g;  # leading space is removed
   $text =~ s/ $//g;  # trailing space is removed

   $text =~ s/ ,/,/g; # remove the (added) space before commas

   $text .= "\n" unless $text =~ /\n$/; 

   # add the tag for each "sentence"
   $textout = "";
   @sentlist = split /\n/,$text;
   foreach $s (@sentlist)
   {
       chomp $s;
       @tokens = split /\s+/, $s;
       $c = scalar(@tokens);
       $n++;
       $textout .= "\n<s docid=$docno num=$n wdcount=$c> $s</s>\n";
   }

   return $textout;
}
