#!/usr/bin/perl
# based on breaksent.pl
#
# Sentence breaker.  Every file named on the command line is rewritten in
# place: running text is accumulated until a markup line (<...>) or a blank
# line is seen, the accumulated text is split into one sentence per line by
# preprocess(), and markup lines are copied through unchanged.  Blank lines
# are not copied.  A period is treated as a sentence boundary unless the word
# before it contains an internal period, is a key with a true value in the
# on-disk %abbrevs hash, or the following word starts with a lowercase letter.

use strict;
use warnings;

use File::Temp qw(tempfile);
use SDBM_File;

@ARGV >= 1 or die "Usage: breaksent-multi.pl file1 ...\n";

# Script-wide switches.  These live at file scope so that do_it_for() sees
# the same variables the main loop does.
my $CHECK           = 0;    # true: only report suspicious breaks (see check())
my $MARK_PARAGRAPHS = 0;    # true: emit a marker where a blank line ended text

# NOTE(review): the marker below is the three newlines found in the source;
# it may originally have been a markup tag that was lost -- confirm.
my $PARAGRAPH_MARK = "\n\n\n";

my %abbrevs = ();
dbmopen(%abbrevs, "/home/duc/duc2001/data/abbrevs", 0666)
    or die "Can't open disk hash abbrevs: $!\n";

for my $infile (@ARGV) {
    # Three-arg open: a file name from @ARGV can never be read as a mode.
    open(my $in, '<', $infile) or die "Cannot open $infile: $!\n";

    # tempfile() returns an already-open handle, so there is no window
    # between creating the name and opening it.
    my ($out, $outfile) =
        eval { tempfile("brksent.XXXXXX", DIR => "/tmp", UNLINK => 0) }
        or die "Couldn't create temporary output file: $@\n";

    my $text = "";
    while (my $line = <$in>) {
        chomp $line;    # chomp, not chop: keep the last char of an unterminated final line
        my $is_markup = $line =~ /^<.+>$/;
        my $is_blank  = $line =~ /^\s*$/;

        if ($is_markup || $is_blank) {
            # A markup or blank line ends the current run of text.
            do_it_for($text, $line, $out);
            # The marker goes to the output file with the sentences
            # (it used to be printed to STDOUT).
            print {$out} $PARAGRAPH_MARK
                if $is_blank && $text && $MARK_PARAGRAPHS;
            $text = "";
        } else {
            $text .= $line . " ";
        }
    }
    do_it_for($text, "", $out) if $text;

    close($in);
    # Buffered write errors surface at close; never replace the original
    # with a truncated file.
    close($out) or die "Cannot close $outfile: $!\n";

    # move tempfile to orig loc
    if (system("/bin/mv", $outfile, $infile) != 0) {
        # $! is only meaningful when mv could not be started at all.
        my $why = $? == -1 ? $! : "mv wait status $?";
        die "Couldn't move $outfile to $infile: $why\n";
    }
}

dbmclose(%abbrevs);

# do_it_for($text, $markup, $out)
#   $text   - accumulated running text (may be empty)
#   $markup - the line that ended the run; echoed only if it looks like <...>
#   $out    - handle the processed text is written to
# In $CHECK mode the text is only passed to check() and nothing is written
# to $out.
sub do_it_for {
    my ($text, $markup, $out) = @_;

    if ($CHECK) {
        check($text) if $text;
    } else {
        print {$out} preprocess($text) if $text;
        print {$out} "$markup\n" if $markup =~ /^<.+>$/;
    }
    return;
}

# preprocess($text)
#   Returns $text with whitespace normalised and a newline inserted after
#   each detected sentence end; the result always ends in a newline.
#   Reads the file-scoped %abbrevs hash.
sub preprocess {
    my ($text) = @_;

    # move the period (or ! ?) out past a following " , ' or )
    $text =~ s/([\.\!\?])([\"\'\)]+)/$2$1/g;

    # remove ; following . ? ! (SJMN peculiarity)
    $text =~ s/([\.\!\?])( *;)/$1/g;

    # separate out all "other" special characters
    #$text =~ s/([^0-9a-zA-ZÀ-ÖØ-öø-ÿ\s\.\'\`\,])/$1\n/g;

    # deal with "'" and "`"
    #$text =~ s/\`/\'/g; # no funny stuff

    # special treatment for "-" ?
    # special treatment for "/", "&" ?

    # . abbreviator / end of sentence / within numbers
    my $t = "";
    $text =~ s/\s+/ /g;
    while ($text =~ / (\S+)\. +(\S+)( .+)$/) {
        my ($pre, $post, $rest) = ($1, $2, $3);
        # Everything up to and including the space in front of $pre;
        # $-[1] is the offset at which the first capture starts.
        my $skipped = substr($text, 0, $-[1]);

        # To handle abbreviations after '(', strip a leading '(' before the
        # lookup and put it back when the word is re-emitted.
        my $ad_pre = ($pre =~ s/^\(//) ? "(" : "";

        # NOTE(review): the non-ASCII range below is matched byte-wise
        # unless the source is decoded -- confirm the input encoding.
        my $not_a_boundary =
               $pre  =~ /^\w+\.\w+/       # U.S.A
            || $abbrevs{$pre}             # known abbreviation (also after '(')
            || $post =~ /^[a-zß-ÿ]/;      # next word is lowercase

        # " . " (period set off by spaces) marks a sentence end for the
        # newline insertion below; ". " leaves the period attached.
        $t .= $skipped . $ad_pre . $pre . ($not_a_boundary ? ". " : " . ");
        $text = " " . $post . $rest;
    }
    $text = $t . $text;
    $text =~ s/\. *$/ ./;

    # clean up spacing
    $text =~ s/ +/ /g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    $text =~ s/([\!\?])/$1\n/g;

    # special cases for abbreviations etc.
    # NOTE(review): the source had a raw line break where \n appears in this
    # pattern; text may have been lost there -- confirm.
    $text =~ s/ (\.) \n*/$1\n/g;

    $text .= "\n" unless $text =~ /\n$/;
    return "$text";
}

# check($text)
#   Diagnostic mode: prints to STDOUT, one per line, every word that is
#   followed by a period and then a word starting with a lowercase letter.
sub check {
    my ($text) = @_;

    $text =~ s/\s+/ /g;
    while ($text =~ / (\S+)\. +([a-zß-ÿ]\S+)( .+)$/) {
        print "$1\n";
        $text = $2 . $3;
    }
    return;
}