#!/usr/bin/perl
#
# tagSents.pl
#
# The purpose of this program is to take TREC documents and add a tag for
# each sentence.  The code for determining sentence breaks is based on
# breakSents.pl.
#
# Usage: tagSents.pl file1 ...
#
# Each input file is run through nsgmls with a DTD chosen from the first
# two characters of its base name.  The element structure reported by
# nsgmls is written back out as tags, and every data record is split into
# sentences by markBreaks().  The result replaces the input file under the
# same name with ".S" appended (unless the name already ends in ".S").
#
# If the second command-line argument is the single letter "d", debugging
# output is printed to STDOUT and that argument is not treated as a file.

use strict;
use warnings;

use SDBM_File;
use File::Temp qw(tempfile);

# Directory prefix of the DTD files handed to nsgmls.
my $dtdpath = "/indexes/TREC10_QA_Track_Index_256/";

# Disk hashes consulted by markBreaks().
my $abbrevs_db = "/nlpir/duc/duc2001/data/abbrevs";
my $pnouns_db  = "/nlpir/duc/duc2002/data/pnouns";

# DTD file name for each recognised two-character file-name prefix.
my %dtd_for = (
    AP => "ap.dtd",
    FB => "fbis.dtd",
    FT => "ft.dtd",
    LA => "latimes.dtd",
    SJ => "sjm.dtd",
    WS => "wsj.dtd",
);

# Abbreviations and proper nouns; tied to the disk hashes by main().
my %abbrevs = ();
my %pnouns  = ();

# State shared between the nsgmls reader and markBreaks().
our $debug = 0;     # true => markBreaks() prints its decisions to STDOUT
our $docno = '';    # document number taken from the most recent DOCNO element
our $n     = 0;     # sentence counter, reset at every DOCNO element

# True between seeing a "(DOCNO" record and the data record that follows it.
my $lookingForDocno = 0;

# Run the driver only when executed as a script, so that the subroutines
# below can be loaded (e.g. by a test) without touching any files.
main(@ARGV) unless caller;

# main(@args)
#   Command-line driver: validates arguments, opens the two disk hashes and
#   processes each named file in turn.  Dies on usage or I/O errors.
sub main {
    my @files = @_;

    @files > 0 or die "Usage: tagSents.pl file1 ...\n";

    # The debug switch is positional (second argument).  Remove it from the
    # file list so that it is not processed as an input file.
    if (@files > 1 && $files[1] eq "d") {
        $debug = 1;
        splice @files, 1, 1;
    }
    else {
        $debug = 0;
    }

    # Abbreviations
    dbmopen(%abbrevs, $abbrevs_db, 0666)
        or die "Can't open abbreviations diskhash /nlpir/duc/duc2001/data/abbrevs: $!\n";

    # Proper nouns
    dbmopen(%pnouns, $pnouns_db, 0666)
        or die "Can't open proper nouns diskhash /nlpir/duc/duc2002/data/pnouns: $!\n";

    for my $infile (@files) {
        tagFile($infile);
    }

    dbmclose(%abbrevs);
    dbmclose(%pnouns);
    return 0;
}

# tagFile($infile)
#   Parses one file with nsgmls, writes the sentence-tagged version to a
#   temporary file and then moves it to "$infile.S" (or over $infile when
#   the name already ends in ".S"), removing the untagged input.
#   Files ending in ".csh" and files whose prefix has no known DTD are
#   skipped without being modified.
sub tagFile {
    my ($infile) = @_;

    # Decide whether to skip BEFORE creating the temporary file, so that
    # skipped inputs do not leave stray files behind in /tmp.
    return if $infile =~ /\.csh\z/;

    # Choose dtd name based on file name
    my $lastSlashIndex = rindex $infile, "/";
    my $infilePrefix   = substr($infile, $lastSlashIndex + 1, 2);
    my $dtd            = $dtd_for{$infilePrefix};
    if (!defined $dtd) {
        # Without a DTD the parse is meaningless, and carrying on would
        # still replace the input file; leave the file alone instead.
        warn "No DTD known for $infile (prefix \"$infilePrefix\"); skipping\n";
        return;
    }

    my ($out, $outfile) = tempfile("toSEE.XXXXXX", DIR => "/tmp", UNLINK => 0);

    # Invoke nsgmls using chosen dtd.  The list form bypasses the shell, so
    # file names containing blanks or shell metacharacters are passed intact.
    open(my $nsgmls, '-|', 'nsgmls', $dtdpath . $dtd, $infile)
        or die "Can't read from nsgmls ipipe: $!\n";

    while (my $line = <$nsgmls>) {
        chomp $line;
        my $c1 = substr($line, 0, 1);

        if ($c1 eq "(") {
            # Start of an element.
            if ($line =~ m/\(DOCNO/) {
                $lookingForDocno = 1;
                $n = 0;    # reset sentence counter
            }
            print {$out} "<" . substr($line, 1) . ">\n";
        }
        elsif ($c1 eq ")") {
            # End of an element.
            # NOTE(review): the closing-tag text was missing from the source
            # under review; it is restored to mirror the opening tag above.
            print {$out} "</" . substr($line, 1) . ">\n";
        }
        elsif ($c1 eq "-") {
            # Data record.
            if ($lookingForDocno) {
                # The document number is the second blank-separated field.
                (undef, $docno) = split /\s+/, $line;
                $docno = '' unless defined $docno;
                $lookingForDocno = 0;
            }

            # Undo nsgmls's changes
            my $data = substr($line, 1);    # remove leading "-"
            $data =~ s/\\n/ /g;             # change slashn to blank

            # Sentence separate the data
            print {$out} markBreaks($data);
        }
    }

    # The exit status of nsgmls is not checked, as in the original code.
    close $nsgmls;
    close $out or die "Cannot close $outfile: $!\n";

    # Install the result first and remove the untagged input only once the
    # move has succeeded, so a failed move cannot lose the input.
    my $target = ($infile =~ /\.S$/) ? $infile : $infile . ".S";
    system("/bin/mv", $outfile, $target) == 0
        or die "Couldn't move $outfile to $target: mv exited with status $?\n";
    unlink $infile if $target ne $infile;

    return;
}

# markBreaks($text)
#   Splits $text into sentences and returns them as one string in which
#   every sentence is wrapped in its own tag.
#
#   Reads   : $debug, $docno, %abbrevs, %pnouns
#   Updates : $n (incremented once per sentence emitted)
#   Returns : the tagged text; the empty string when $text yields no
#             sentences.
sub markBreaks {
    my ($text) = @_;
    $text = "" unless defined $text;
    my $t = "";    # text already scanned, with "\n" after each sentence

    # move period, exclamation/question mark after following quote mark
    # and separate with blank.
    $text =~ s/([\.\!\?]+)([\"\'\)]+) /$2$1 /g;

    # w+{ loses its { - peculiarity of some newspaper data
    # (the brace is escaped because newer perls reject a bare "{" here)
    $text =~ s/(\w+)(\{)/$1/g;

    # remove ; following . ? ! (SJMN peculiarity)
    $text =~ s/([\.\!\?])( *;)/$1/g;

    # a series of whitespace chars becomes a space
    $text =~ s/\s+/ /g;

    # insert a space before each comma
    $text =~ s/,/ ,/g;

    # this loop handles periods and ellipsis as well as question and
    # exclamation marks - finding and marking each sentence-ending
    # instance by inserting an end-of-sentence marker (\n)
    #  - $1 has ? to minimize its matching so $2 can match maximally
    #    and recognize ...
    #  - $3 needs to be able to contain / end with punctuation e.g.,
    #    an abbreviation starting the next sentence
    while ($text =~ / (\S+?)(\.\.\.|\.|\?|\!) +(\S+)( .+)$/) {
        my ($pre, $delim, $fullpost, $rest) = ($1, $2, $3, $4);

        # Everything up to and including the blank that starts the match.
        # Taken from the match offset rather than computed from lengths,
        # because the comma spacing above can leave more than one blank
        # after the delimiter, which made the length arithmetic copy one
        # character of $pre twice.
        my $skipped = substr($text, 0, $-[0] + 1);

        # The following word without its own trailing period, for lookups.
        my $post = $fullpost;
        chop $post if substr($post, -1) eq ".";

        if ($debug) {
            print "TEXT+[$text]\n";
            print "\nSKI=[$skipped]\nPRE=[$pre]\nDELIM=[$delim]\nPOS=[$post]\nRES=[$rest]\n";
            if ($pre =~ /^\w+\.\w+/) { print "$pre WITHPERIOD\n"; }
            if ($abbrevs{$pre})      { print "$pre ABBREVIATION\n"; }
            if ($pnouns{$post})      { print "$post PROPER\n"; }
        }

        # if the word before the delimiter can legitimately precede
        # the delimiter and the word after the delimiter
        # is usually capitalized or is lowercase then mark the
        # period/ellipsis as NOT ending a sentence; otherwise mark
        # it as ending a sentence.
        my $restoredspaces;
        if (
            (
                   $pre =~ /^\w+\.\w+/
                || $abbrevs{$pre}
                || $delim eq "..."
                || $delim eq "?"
                || $delim eq "!"
            )
            && (
                $pnouns{$post}
                # \xDF-\xFF are the Latin-1 lowercase letters; hex escapes
                # keep the range independent of this file's own encoding.
                || $post =~ /^[a-z\xDF-\xFF0-9,;:\-\.]/
            )
           )
        {
            # Continue current sentence
            $t .= $skipped . $pre . "$delim ";
            $restoredspaces = " ";
        }
        else {
            # Make this a sentence-final period
            $t .= $skipped . $pre . $delim . "\n";

            # lack of an initial space prevents a sentence initial abbrev.
            # from being interpreted as sentence-terminating - we want this
            # though it means one word sentences will be concatenated with
            # the following.
            $restoredspaces = "";
        }
        $text = $restoredspaces . $fullpost . $rest;
    }    # endwhile

    $text = $t . $text;

    # A final period (and any blanks after it) is rewritten as " .".
    # NOTE(review): the original comment here read "final period will be
    # followed by exactly 1 space", which does not match this replacement;
    # the code is kept as found - confirm which behaviour is intended.
    $text =~ s/\. *$/ ./;
    $text =~ s/ +/ /g;    # multiple spaces become one
    $text =~ s/^ //g;     # leading space is removed
    $text =~ s/ $//g;     # trailing space is removed
    $text =~ s/ ,/,/g;    # remove the (added) space before commas
    $text .= "\n" unless $text =~ /\n$/;

    # add the tag for each "sentence"
    # NOTE(review): the tag markup itself was missing from the source under
    # review ($docno, $n and the word count were computed but never used).
    # It is restored here as <s docid= num= wdcount=> ... </s>; confirm the
    # attribute names against whatever consumes these files.
    my $textout = "";
    for my $s (split /\n/, $text) {
        my @tokens = split /\s+/, $s;
        my $c      = scalar @tokens;
        $n++;
        $textout .= "\n<s docid=\"$docno\" num=\"$n\" wdcount=\"$c\"> $s</s>\n";
    }
    return $textout;
}