#!/usr/local/bin/perl -w

# Given a passage defined as the triple {docid, offset, length},
# print the ASCII characters that constitute the passage.
#
# Script assumes HARD 2003 document collection: 1999 docs
# from the AQUAINT collection plus Congressional Record
# and Federal Register documents released by the LDC.
#
# The script assumes the data files were read from the CDs,
# uncompressed, and then arranged in a directory structure
# such that there is a root, each data source has a directory that is
# the child of the root, and the text files themselves reside
# in the source directory.
#
# Passages cannot cross document boundaries.  The opening "<"
# of the <DOC> tag is position 0.  The first character in
# the passage is the character at position offset; length
# is the total number of characters returned.
#
# Usage: extract_passage.pl docid offset length
# Exits 0 after printing the passage; dies with a message on any error.

use strict;
use warnings;

# Directory holding the daily files of each source.
my %root_dir = (
    "APW" => "/collections/Hard2003/text/apw99",
    "CRE" => "/collections/Hard2003/text/cr99",
    "FR"  => "/collections/Hard2003/text/fr99",
    "NYT" => "/collections/Hard2003/text/nyt99",
    "XIE" => "/collections/Hard2003/text/xie99",
);

# A daily file is named <prefix><yyyymmdd><suffix> inside the source directory.
my %fprefix = (
    "APW" => "",
    "CRE" => "CRE",
    "FR"  => "FR",
    "NYT" => "",
    "XIE" => "",
);
my %fsuffix = (
    "APW" => "_APW_ENG",
    "CRE" => ".sgm",
    "FR"  => ".sgm",
    "NYT" => "_NYT",
    "XIE" => "_XIN_ENG",
);

@ARGV == 3 or die "Usage: extract_passage.pl docid offset length\n";
my ($docid, $offset, $length) = @ARGV;

# A document id is <source>1999<mm><dd>.<nnnn>.  The match is anchored at the
# start and the separator is a literal dot, so text preceding the source tag
# or a wrong separator is rejected here rather than failing later as
# "Could not find".
$docid =~ /\A(FR|CRE|XIE|NYT|APW)1999(\d\d)(\d\d)\.\d\d\d\d/
    or die "extract_passage: invalid document id `$docid'\n";
my ($source, $month, $day) = ($1, $2, $3);
my $year = "1999";

if ($month > 12 || $month < 1) {
    die "extract_passage: invalid document id `$docid'\n";
}
if ($day > 31 || $day < 1) {
    die "extract_passage: invalid document id `$docid'\n";
}

# Check the textual form before any numeric use, so that non-numeric input
# produces the intended message instead of an "isn't numeric" warning.
if ($offset !~ /\A\d+\z/) {
    die "extract_passage: offset must be a non-negative integer\n";
}
if ($length !~ /\A\d+\z/ || $length == 0) {
    die "extract_passage: length must be a positive integer\n";
}

my $filename =
    "$root_dir{$source}/$fprefix{$source}$year$month$day$fsuffix{$source}";

my $txt;
if (!-e $filename || !open($txt, '<', $filename)) {
    die "extract_passage: Can't find/open file `$filename' for document $docid: $!\n";
}

my $doctext = "";    # text of the candidate document, starting at its <DOC tag
my $found   = 0;     # set once a document whose id equals $docid is seen
my $indoc   = 0;     # true while lines are being collected into $doctext

while (my $line = <$txt>) {

    # Start of a document.
    if ($line =~ /^\s*<DOC[\s>]/) {
        # Strip leading white space, if any, so that "<" is position 0.
        $line =~ s/^\s+//;

        if ($source eq "FR" || $source eq "CRE") {
            # FR and CRE carry the document id as an attribute of the DOC tag.
            # NOTE(review): the original pattern was lost from this file; this
            # one accepts id=VALUE with optional quotes -- confirm against the
            # actual FR/CRE data files.
            $line =~ /<DOC\s[^>]*?\bid\s*=\s*["']?([^\s"'>]+)/
                or die "formatting error in $source id: $line\n";
            my $id = $1;
            if ($id ne $docid) {
                $doctext = "";
                $indoc   = 0;
                next;
            }
            $found = 1;
        }

        $doctext .= $line;
        $indoc = 1;
        next;
    }

    # A DOCNO line decides whether the document being collected is the
    # requested one; if not, discard what was collected so far.
    if ($line =~ /^\s*<DOCNO>\s*([A-Z0-9.]+)/) {
        my $docno = $1;
        if ($docno ne $docid) {
            $doctext = "";
            $indoc   = 0;
        }
        else {
            $found = 1;
            $doctext .= $line;
        }
        next;
    }

    # End of a document: if it is the one being collected, we are done.
    if ($line =~ /^\s*<\/DOC>/) {
        if ($indoc) {
            $doctext .= $line;
            last;
        }
        next;
    }

    if ($indoc) {
        $doctext .= $line;
    }
}

# "or", not "||": with "||" the die would attach to the handle operand and a
# failed close would go unreported.
close($txt)
    or die "extract_passage: close of file `$filename' failed: $!\n";

if (!$found) {
    die "extract_passage: Could not find $docid in file $filename\n";
}

# $doctext is one string containing exactly the characters
# contained within doc $docid
my $doc_length = length $doctext;
if ($offset + $length > $doc_length) {
    die "extract_passage: requested passage crosses document boundaries\n";
}

my $passage = substr $doctext, $offset, $length;
print $passage;

exit 0;