#!/usr/pubsw/bin/perl 

# Created by Tom Veatch 
# this version last updated by him December 20, 1992
# Mods by Chris Manning Apr 1997 to have -v option
# More mods 2000 to be faster: it doesn't split into words any more! [cdm]

# If the texts are newswire with SGML text blocks, you should pipe into this
# the output of extractbody.pl TEXT to get suitable text

# Called with no arguments at all: show a short usage summary on standard
# output and stop with a zero exit status.
unless (@ARGV) {
    print "Usage: sgrep.pl [-s separator (default is .)] [-v] pattern file*\n",
          "Looks for sentences of the given pattern in the files or stdin\n",
          "Pattern specification is a la Perl (cf. Perl Programming, pp24ff)\n";
    exit(0);
}
# Settings shared with the record-matching loop further down.
$nonmatch = 0;          # 1 once -v is seen: print the records that do NOT match.
$/ = ".";               # record separator == end of sentence.

# The historical "$* = 1;" (enable multi-line patterns) has been removed:
# assigning to $* is a fatal error on current perls, and it never had any
# effect here because every record has its newlines replaced by spaces
# before any pattern is matched against it.

$pat = shift(@ARGV);    # first argument: either an option or the pattern.

# Consume leading options (-s SEPARATOR, -v).  The first argument that does
# not look like "-<lowercase letter>..." is taken to be the search pattern.
while (defined($pat) && $pat =~ /^-[a-z]/) {
    if ($pat eq "-s") {
        # -s takes a value: the string to use as the record separator.
        # Without one, $/ would silently become undef (whole-file mode).
        if (!@ARGV) {
            print STDERR "sgrep.pl: option -s requires a separator argument\n";
            exit(1);
        }
        $/ = shift(@ARGV);
    }
    elsif ($pat eq "-v") {
        # Invert the sense of the match.
        $nonmatch = 1;
    }
    else {
        # Unknown options are reported and otherwise ignored.  The message
        # goes to STDERR so it cannot be mistaken for a matching record.
        print STDERR "Unrecognized option: $pat\n";
    }
    $pat = shift(@ARGV);
}

# Options alone are not enough: without a pattern there is nothing to search
# for, so stop here rather than running with an undefined pattern.
if (!defined $pat) {
    print STDERR "sgrep.pl: missing pattern\n";
    exit(1);
}
# Main loop: read one record (by default, one "."-terminated sentence) at a
# time from the files named on the command line, or from standard input,
# and print the records selected by $pat / $nonmatch as filled text.

# Set when a look-ahead read inside the loop hits end of input.  After <>
# has returned undef once, calling it again would start over on STDIN, so
# the outer loop must stop instead of reading again.
my $input_exhausted = 0;

while (<>) {
    # print "\nText is |$_|\n";
    # s/-\n//g;	# dehyphenate (the record) -- omitted for modern corpora
    s/\n/ /g;           # join the record onto a single line.

    # A record that ends in a title, acronym, initial, etc. did not really
    # end a sentence: append following records (at most 20) until the text
    # no longer ends in such an abbreviation.  The test is hard-wired to a
    # trailing "." regardless of any separator chosen with -s.
    my $extensions = 0;
    while ($extensions < 20 && /(?:[ 0-9][ap]\.m|Dr|Mr|Mrs|Ms|Sen|Rep|Gov|Lt|Gen|Col|Adm|Maj|Sgt|Prof|Jr| [A-Z][aeiou]|[ .][A-Z0-9]| [a-z]|Wash|Mass|Nev|Calif|Tenn|Inc)\.$/) {
        # print "#### Text is |$_|\n";
        # print "#### Extending for bad period\n";
        my $more = <>;
        if (!defined $more) {
            # Nothing left to append; finish with what we have.
            $input_exhausted = 1;
            last;
        }
        $more =~ s/\n/ /g;
        $_ .= $more;
        $extensions++;
    }
    # if ($extensions == 20) {
    #    print STDERR "Aborted out of extending line\n";
    # }

    # Wrapping the pattern in (?:...) keeps an empty pattern from being
    # treated as "reuse the last successful regex" (which would be the
    # abbreviation test above).  /o: the pattern never changes, compile once.
    my $matched = /(?:$pat)/o ? 1 : 0;

    # Print matching records normally, non-matching ones under -v.
    if ($matched != ($nonmatch ? 1 : 0)) {
        # Each write emits one filled line and removes it from $_.  Test the
        # length, not the truth value, so a leftover "0" is still printed.
        while (length $_) {
            write;
        }
        print "\n";
    }

    last if $input_exhausted;
}

# Output picture used by "write" in the main loop.  The single "^<<<..."
# field is a left-justified continuation field bound to $_: every write
# prints as much of $_ as fits on one line and removes that text from $_,
# so calling write repeatedly emits the record as filled text.
format STDOUT = 
^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
$_
.