#!/usr/pubsw/bin/perl

# extractbody [-n] ELEMENT [file ...]
#
# Prints the lines that lie between the <ELEMENT> start tag and the
# </ELEMENT> end tag of an SGML/XML(-like) file, assuming that the ELEMENT
# start and end tags are on a line by themselves (this works for a lot of
# the LDC newswire corpora).  Input comes from the files named after
# ELEMENT, or from standard input when none are given.
#
# By default, we print the element tag lines too, so that articles are
# delimited.  The -n flag means not to print the element tags.
#
# Christopher Manning 1999-2001

use strict;
use warnings;

my $usage        = "usage: extractbody [-n] bodytag\n";
my $extracting   = 0;    # true while we are between the start and end tags
my $printbodytag = 1;    # true if the tag lines themselves are echoed

die $usage if @ARGV < 1;

# Consume leading option flags; the first non-option argument is the tag.
my $word = shift @ARGV;
while (defined $word && $word =~ /^-[a-z]/) {
    if ($word eq "-n") {
        # don't print the element tags
        $printbodytag = 0;
    }
    else {
        # Report on STDERR so the message cannot mix into the extracted text.
        warn "Unrecognized option: $word\n";
    }
    $word = shift @ARGV;
}

# The options may have used up every argument (e.g. "extractbody -n");
# without this check the tag would be undefined and nothing could match.
die $usage unless defined $word;

# Compile the tag patterns once.  \Q...\E makes the element name match
# literally, so names containing regex metacharacters (such as '.') behave.
my $start_tag = qr/<\Q$word\E>/;
my $end_tag   = qr/<\/\Q$word\E>/;

while (my $line = <>) {
    if ($line =~ $start_tag) {
        $extracting = 1;
        print $line if $printbodytag;
    }
    elsif ($line =~ $end_tag) {
        $extracting = 0;
        print $line if $printbodytag;
    }
    elsif ($extracting) {
        print $line;
    }
}