#!/usr/bin/perl 

# based on breaksent.pl

use SDBM_File;

# Driver: rewrites every file named on the command line in place.
#
# Each file is read line by line.  Consecutive text lines are joined (with a
# single space after each) into one block; a markup line (<...>) or a blank
# line ends the block, which is then handed to do_it_for() together with the
# terminating line.  Output is written to a temporary file created with
# /bin/mktemp, which replaces the input file once it has been closed cleanly.
@ARGV >= 1 or die "Usage: breaksent-multi.pl file1 ...\n";

# Disk hash of known abbreviations; preprocess() looks words up in it.
my %abbrevs = ();
dbmopen(%abbrevs, "/home/duc/duc2001/data/abbrevs", 0666) or die "Can't open disk hash abbrevs: $!\n";

# Mode switches.  These are package variables declared at file level (not
# lexicals inside the loop) because do_it_for() reads $CHECK: a loop-scoped
# "my" would be invisible to it and the switch could never take effect.
our $CHECK = 0;             # true: only report via check(), write nothing
our $MARK_PARAGRAPHS = 0;   # true: emit "<P>" after a block ended by a blank line

for my $infile (@ARGV) {
    open(my $in, '<', $infile) or die "Cannot open $infile: $!\n";

    my $outfile = `/bin/mktemp /tmp/brksent.XXXXXX`;
    ($? == 0 && defined $outfile && length $outfile)
        or die "Couldn't create temporary output file: $!\n";
    chomp $outfile;

    # OUT stays a bareword handle: do_it_for() prints to it by that name.
    open(OUT, '>', $outfile) or die "Cannot open $outfile: $!\n";

    my $text = "";
    while (my $line = <$in>) {
        # chomp, not chop: a final line without a newline must keep its
        # last character (typically the sentence-final period).
        chomp $line;
        if ($line =~ /^<.+>$/ || $line =~ /^\s*$/) {
            do_it_for($text, $line);
            # The paragraph marker belongs in the output file, not on STDOUT.
            print OUT "<P>\n" if ($line =~ /^\s*$/ && $text && $MARK_PARAGRAPHS);
            $text = "";
        }
        else {
            $text .= $line . " ";
        }
    }
    # Flush a trailing block that was not followed by markup or a blank line.
    do_it_for($text, "") if $text;

    close($in);
    # Buffered write errors surface at close; never replace the original
    # with a temp file that may be incomplete.
    close(OUT) or die "Cannot write $outfile: $!\n";

    if ($CHECK) {
        # Check mode writes nothing to OUT, so moving the (empty) temp file
        # into place would wipe the input.  Discard it instead.
        unlink $outfile;
        next;
    }

    # move tempfile to orig loc
    my $status = system("/bin/mv", $outfile, $infile);
    if ($status != 0) {
        # $! is only meaningful when mv could not be started at all (-1);
        # otherwise report mv's own exit status.
        my $why = ($status == -1) ? $! : "mv exited with status " . ($? >> 8);
        die "Couldn't move $outfile to $infile: $why\n";
    }
}

dbmclose (%abbrevs);



# Handles one accumulated block of text and the line that terminated it.
#
# Arguments:
#   1. the running text collected so far (may be empty)
#   2. the terminating line; it is echoed only when it is a <...> markup line
#
# Normal mode: the sentence-split text (via preprocess) and then the markup
# line are printed to the OUT file handle.
# Check mode ($CHECK true): the text is passed to check() and nothing is
# printed to OUT.
sub do_it_for {
  my ($chunk, $tag) = @_;

  if (!$CHECK) {
    print OUT preprocess($chunk) if $chunk;
    print OUT "$tag\n" if $tag =~ /^<.+>$/;
  }
  else {
    check($chunk) if $chunk;
  }
}


# Splits one block of running text into sentences, one per line.
#
# Argument: the text as a single string.  All whitespace runs are collapsed
#           to single spaces before splitting.
# Returns:  the text with a newline after every "!" and "?" and after every
#           period judged to end a sentence; the result always ends in "\n".
#
# A period followed by a space is treated as NOT ending a sentence when
#   - the word before it contains an internal period (U.S.A), or
#   - the word before it (minus a leading "(") is a true entry in %abbrevs, or
#   - the word after it starts with a lowercase letter.
# The scanning loop only examines periods whose word is preceded by a space
# and that are followed by a further word plus more text, so a period after
# the very first word, or directly before the last word, is left alone.
# The final period of the text is always treated as a sentence end.
#
# NOTE(review): the "ß-ÿ" range in the lowercase test presumes this script and
# its input are single-byte Latin-1 -- confirm before changing encodings.
sub preprocess {
  my($text) = @_;

  # move the period (or ! ?) out from in front of closing " , ', and )
  $text =~ s/([\.\!\?])([\"\'\)]+)/$2$1/g;

  # remove ; following . ? ! (SJMN peculiarity)
  $text =~ s/([\.\!\?])( *;)/$1/g;

  # separate out all "other" special characters
  #$text =~ s/([^0-9a-zA-ZÀ-ÖØ-öø-ÿ\s\.\'\`\,])/$1\n/g;

  # deal with "'" and "`"
  #$text =~ s/\`/\'/g;                        # no funny stuff

  # special treatment for "-" ?
  # special treatment for "/", "&" ?

  # . abbreviator / end of sentence / within numbers

  my $t = "";                 # text already scanned, with decisions applied
  $text =~ s/\s+/ /g;
  while ($text =~ / (\S+)\. +(\S+)( .+)$/) {
    my $pre = $1;             # word before the period
    my $post = $2;            # word after the period
    my $rest = $3;            # everything after $post, up to the end

    # Everything in front of $pre, including the space that precedes it.
    # The "2" accounts for the period and the single space following it
    # (whitespace was collapsed above, so there is exactly one).
    my $skipped = substr($text,0,length($text)-2-length($pre.$post.$rest));

    # To recognise abbreviations written right after '(', strip the
    # parenthesis for the lookup and put it back when rebuilding the text.
    my $ad_pre = "";
    if ($pre =~ /^\(/) {
      $ad_pre = substr($pre,0,1);
      $pre = substr($pre,1);  # removes '('
    }

    if ($pre =~ /^\w+\.\w+/ ||   # U.S.A
	$abbrevs{$pre} ||   #  known abbreviation and abbreviation after '('.
	$post =~ /^[a-zß-ÿ]/) {  # next word is lowercase
      # not a sentence end: keep the period attached to the word
      $t .= $skipped.$ad_pre.$pre.". ";
    }
    else {
      # sentence end: detach the period (" .") so it is split on below
      $t .= $skipped.$ad_pre.$pre." . ";
    }
    # Continue scanning from the following word; the leading space lets it
    # serve as the "word before the period" of the next match.
    $text = " ".$post.$rest;
  }
  $text = $t . $text;
  # the last period of the text always ends a sentence
  $text =~ s/\. *$/ ./;

  # clean up surplus spaces
  $text =~ s/ +/ /g;
  $text =~ s/^ //;
  $text =~ s/ $//;

  # every ! and ? ends a line
  $text =~ s/([\!\?])/$1\n/g;
  # a detached period (" .") marks a sentence end: re-attach it and break
  # the line; periods left attached (abbreviations etc.) are not touched
  $text =~ s/ (\.) */$1\n/g;
  $text .= "\n" unless $text =~ /\n$/;

  return "$text";
}

# Diagnostic helper: lists words that end in a period and are followed by a
# word starting with a lowercase letter (i.e. abbreviation candidates).
#
# Argument: a block of text; whitespace runs are collapsed to single spaces.
# Output:   each candidate word (without its period) on its own line, printed
#           to the currently selected output handle.
sub check {
  my ($remaining) = @_;
  $remaining =~ s/\s+/ /g;

  while (1) {
    # word before the period, the lowercase-initial word after it, the tail
    my ($word, $next, $tail) =
      $remaining =~ / (\S+)\. +([a-zß-ÿ]\S+)( .+)$/
        or last;

    print "$word\n";
    # keep scanning from the following word onwards
    $remaining = $next . $tail;
  }
}












