#!/usr/bin/perl

# The purpose of this program is to take TREC documents and
# add a tag for each sentence. The code for determining
# sentence breaks is based on breakSents.pl This program
# uses nsgmls to parse the TREC documents' SGML tagging.

use SDBM_File;

@ARGV > 0 or die "Usage: tagSents.pl [t] file1 file2 ...\n";

if ($ARGV[0] eq "t") {$keepTRECtags = 1; shift;} else {$keepTRECtags = 0;}

$homeDir = "/CHANGE/TO/CURRENT/DIRECTORY";
$nsgmlspath = "CHANGE/TO/NSGMLS/PATH";
$dtdpath = $homeDir . "/dtd/";

#Abbreviations
my %abbrevs = ();
dbmopen(%abbrevs, "$homeDir/abbrevs", 0666) or die "Can't open abbreviations diskhash abbrevs: $!\n";

#Proper nouns
my %pnouns = ();
dbmopen(%pnouns, "$homeDir/pnouns", 0666) or die "Can't open proper nouns diskhash pnouns: $!\n";

for my $infile (@ARGV) {

    $outfile = "toSEE.XXXXXX";

    open(OUT, ">$outfile") || die "Cannot open $outfile: $!\n";

    $lastBackSlashIndex = rindex $infile, "/";
    $infilePrefix = substr($infile,0,2);

    if (substr($infile,length($infile)-4) eq ".csh") {next};

    # Choose dtd name base on file name
    if    ($infilePrefix eq "FR") {$dtd = "fr.dtd"}
    elsif ($infilePrefix eq "FB") {$dtd = "fbis.dtd"}
    elsif ($infilePrefix eq "FT") {$dtd = "ft.dtd"}
    elsif ($infilePrefix eq "LA") {$dtd = "latimes.dtd"}
    else 
    {
	print "\nNO DTD FOUND FOR $infilePrefix\n";
	exit;
    }

    # Invoke nsgmls using chosen dtd
    open NSGMLSOUT, "$nsgmlspath/nsgmls $dtdpath$dtd $infile|" or die "Can't read from nsgmls ipipe: $!\n";

    $process = 0;
    while ($line = <NSGMLSOUT>)
    {
	chomp $line;
	$c1 = substr($line,0,1);
	if ($c1 eq "(")
	{
	    # skip tags before <DOC>
	    if ($process == 0)
	    {
		if ($line =~ m/\(DOC/)
		{
		    $process = 1;
		}
		else
		{
		    next;
		}
	    }

	    if ($line =~ m/\(DOCNO/)
	    {
		$lookingForDocno = 1;
		$n=0; # reset sentence counter
	    }
		
	    if ($keepTRECtags)
	    {
		print  OUT "<".substr($line,1).">\n";
	    }
	}
	elsif ($c1 eq ")")
	{
	    if ($keepTRECtags)
	    {
		print  OUT "</".substr($line,1).">\n";
	    }

	    # Stop after </DOC>
	    if (substr($line,1) eq "DOC")
	    {
	      last;
	    }
	}
	elsif ($c1 eq "-")
	{

	    # Undo nsgmls's changes
	    $data = substr($line,1); # remove leading "-"

	    $line = " $data"; #add blank so FT parses like others w/ undef 1st
	    if ($lookingForDocno) # get docno
	    {
		(undef,$docno,undef) = split /\s+/,$line;
		$lookingForDocno = 0;
	    }

	    $data =~ s/\\n/ /g;     # change slashn to blank
	    # Sentence separate the data
	    print  OUT &markBreaks($data);

	}
	else {}


    }
    close NSGMLSOUT;
    
    close OUT;

    # REMOVES the input file and moves temp file to a new file
    # named after old input file with .S extension

    (system(("/bin/mv", $outfile, $infile . ".S")) == 0)
        or die "Couldn't move $outfile to $infile: $!\n";
}

dbmclose(%abbrevs);
dbmclose(%pnouns);


sub markBreaks {

    my($text) = @_;
    my $t ="";

    # move period, exclamation/question mark after following quote mark
    # and separate with blank. 
    $text =~ s/([\.\!\?]+)([\"\'\)]+) /$2$1 /g;

    # w+{ becomes w+ {   - peculiarity of some newspaper data
    #$text =~ s/(\w+)({)/$1 $2 /g;
    $text =~ s/(\w+)({)/$1/g;

    # remove ; following . ? ! (SJMN peculiarity)
    $text =~ s/([\.\!\?])( *;)/$1/g;
	
    # a series of whitespace chars becomes a space	     
    $text =~ s/\s+/ /g;

    # insert a space before each comma
    $text =~ s/,/ ,/g;

    # this loop handles periods and ellipsis as well as question and 
    # exclamation marks - finding and marking each sentence-ending 
    # instance by inserting an end-of-sentence marker (\n)
    # - $1 has ? to minimize its matching so $2 can match maxmimally
    # and recognize ...
    # - $3 needs to be able to contain / end with punctuation e.g.,
    # an abbreviation starting the next sentence

    while ($text =~ / (\S+?)(\.\.\.|\.|\?|\!) +(\S+)( .+)$/) {
       my $pre = $1; 
       my $delim = $2;
       my $post = $3; 
       my $rest = $4;
       my $skipped = substr($text,0,length($text)-1-length($1.$2.$3.$4));

       my $debug = 0;

       $fullpost = $post;
       # if the last character in $post is a period, it is removed.
       if (substr($post,-1) eq ".") {
	   chop $post;
       }

       if ($debug) 
       {
	   print "TEXT+[$text]\n";
	   print "\nSKI=[$skipped]\nPRE=[$pre]\nDELIM=[$delim]\nPOS=[$post]\nRES=[$rest]\n";
	   if ($pre =~ /^\w+\.\w+/) {print "$pre WITHPERIOD\n";}
	   if ($abbrevs{$pre}) {print "$pre ABBREVIATION\n";}
	   if ($pnouns{$post}) {print "$post PROPER\n";}
       }
       # if the word before the delimiter can legitimately precede
       # the delimiter and the word after the delimiter
       # is usually capitalized or is lowercase then mark the
       # period/ellipsis as NOT ending a sentence; otherwise mark
       # it as ending a sentence.
       if ( 
	    (
	     $pre =~ /^\w+\.\w+/ || $abbrevs{$pre} || 
             $delim eq "..."     || $delim eq "?"  || $delim eq "!"
	    ) 
            &&       
	    ($pnouns{$post} || $post =~ /^[a-z-0-9,;:\-\.]/)     
	  )    
       {  
	   # C o n t i n u e   c u r r e n t   s e n t e n c e
	   $t .= $skipped.$pre."$delim ";
	   $restoredspaces = " ";
       }
       else 
       {
	   # M a k e   t h i s   a   s e n t e n c e - f i n a l   p e r i o d
	   $t .= $skipped.$pre.$delim."\n";
	   # lack of in initial space prevents a sentence initial abbrev.
	   # from being interpreted as sentence-terminating - we want this
	   # though it means one word sentences will be concatenated with
	   # the following.
	   $restoredspaces = ""; 
       }
       $text = $restoredspaces.$fullpost.$rest;
   }# endwhile

   $text = $t . $text;

   $text =~ s/\. *$/ ./; # final period will be followed by exactly 1 space
   $text =~ s/ +/ /g; # multiple spaces become one
   $text =~ s/^ //g;  # leading space is removed
   $text =~ s/ $//g;  # trailing space is removed

   $text =~ s/ ,/,/g; # remove the (added) space before commas

   $text .= "\n" unless $text =~ /\n$/; 

   # add the tag for each "sentence"
   $textout = "";
   @sentlist = split /\n/,$text;
   foreach $s (@sentlist)
   {
       chomp $s;
       #@tokens = split /\s+/, $s;
       #$c = scalar(@tokens);
       $n++;
       $textout .= "\n<s docid=\"$docno\" num=\"$n\"> $s</s>\n";
   }
   $textout =~ s/ \././;
   return $textout;
}
