#!/usr/pubsw/bin/perl
#
# sgrep.pl -- "sentence grep": print the sentences (records) of the input
# that match a Perl regular expression, re-filled to a fixed line width.
#
# Created by Tom Veatch
# this version last updated by him December 20, 1992
# Mods by Chris Manning Apr 1997 to have -v option
# More mods 2000 to be faster: it doesn't split into words any more! [cdm]
#
# If the texts are newswire with SGML text blocks, you should pipe into this
# the output of extractbody.pl TEXT to get suitable text
#
# Usage: sgrep.pl [-s separator] [-v] pattern file*
#   -s separator   use this string as the record (sentence) separator
#                  instead of the default "."
#   -v             print the records that do NOT match the pattern
#   pattern        Perl regular expression
#   file*          input files; stdin is read when none are given

use strict;
use warnings;

# Upper bound on how many following records may be glued onto one record
# when it appears to end in an abbreviation rather than a real sentence end.
my $MAX_EXTENSIONS = 20;

# Print the usage message and exit.  The message goes to stdout and the exit
# status is 0, as in the historical version of this script.
sub usage {
    print "Usage: sgrep.pl [-s separator (default is .)] [-v] pattern file*\n";
    print "Looks for sentences of the given pattern in the files or stdin\n";
    print "Pattern specification is a la Perl (cf. Perl Programming, pp24ff)\n";
    exit(0);
}

usage() if $#ARGV < 0;

my $nonmatch = 0;       # set to 1 by -v: print non-matching records instead
$/ = ".";               # record separator == end of sentence.

# Note: the historical "$* = 1" (enable multi-line patterns) is gone; that
# variable is a fatal error in Perl 5.30+.  The user pattern is compiled with
# /m below instead.

my $pat = shift(@ARGV); # store desired pattern (or the first option).
while (defined $pat && $pat =~ /^-[a-z]/) {
    if ($pat eq "-s") {
        # set record separator to this optional argument.
        my $separator = shift(@ARGV);
        usage() unless defined $separator;
        $/ = $separator;
    }
    elsif ($pat eq "-v") {
        # print the non-matching records
        $nonmatch = 1;
    }
    else {
        print "Unrecognized option: $pat\n";
    }
    $pat = shift(@ARGV);
}

# All arguments were options and no pattern is left.
usage() unless defined $pat;

# Compile the pattern once (replaces the old /o modifier).
my $pattern_re = qr/$pat/m;

# A record ending in one of these followed by "." most likely stopped at an
# abbreviation, title, initial, etc. rather than at a true sentence end.
my $abbrev_re = qr/(?:[ 0-9][ap]\.m|Dr|Mr|Mrs|Ms|Sen|Rep|Gov|Lt|Gen|Col|Adm|Maj|Sgt|Prof|Jr| [A-Z][aeiou]|[ .][A-Z0-9]| [a-z]|Wash|Mass|Nev|Calif|Tenn|Inc)\.$/;

# Set once <> has reported end of input inside the extension loop.  Calling
# <> again after that would start reading STDIN afresh, so the main loop must
# stop instead of asking for another record.
my $input_exhausted = 0;

while (<>) {            # reads one record at a time from file1 file2 ...
    # s/-\n//g;         # dehyphenate (the record) -- omitted for modern corpora
    s/\n/ /g;           # remove \n's (from the record).

    # Now slurp more records if this ends in a title, acronym, etc.
    my $extensions = 0;
    while ($extensions < $MAX_EXTENSIONS && /$abbrev_re/) {
        my $next_record = <>;
        if (!defined $next_record) {
            $input_exhausted = 1;
            last;
        }
        $_ .= $next_record;
        s/\n/ /g;       # remove \n's (from the extended record).
        $extensions++;
    }
    # if ($extensions == $MAX_EXTENSIONS) {
    #     print STDERR "Aborted out of extending line\n";
    # }

    # Print on a match normally, or on a non-match when -v was given.
    my $is_match = ($_ =~ $pattern_re) ? 1 : 0;
    if ($is_match != $nonmatch) {
        # Each write emits one filled line and chops it off the front of $_.
        # Test the length, not the truth value, so a remaining "0" is still
        # printed.
        while (length $_) {
            write;      # print filled text.
        }
        print "\n";
    }

    last if $input_exhausted;
}

# Output picture: one left-justified, word-filled line taken from $_ per write.
format STDOUT =
^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
$_
.