#

# Tokenize and segment a file into sentences
# and create a command file to be read by the hmmtagger
# to tag the result.

# NOTES
#	12/23/2004	An option for tagging raw text should be added.

# HISTORY
#	02/12/2004	First public release
#	05/18/2004	Do not remove Copyright, etc, if there is no segmentation.
#	05/18/2004	Periods at end of text is always end-stopping.
#	10/20/2004	Corrected: An unbalanced paren or bracket may cause the remainder of
#			the abstract to be unsegmented.
#	10/20/2004	Corrected: When tagging titles in medline format, if there is no field
#			between TI and AB then whichever comes first is ignored.
#	02/14/2005	Numbered the tokenizer rules

# Constants

$tagsep = "_";							# The tag separator
$replacesep = "-";						# Replace tag separator, if any, with this

# Command line options

$option_compute = "viterbi";					# Which tagger algorithm to use
$option_segment = 1;						# Segment lines into sentences
$option_letter = "P";						# The id letter to use in sentence ids
$option_titles = 0;						# Tag titles
$option_id = 0;							# Option to include identifier
$option_print = "printsent";					# Command used to print sentences

# The input options.
#
# itame		- (default) itame format
# medline	- medline format
# xml		- xml format
#
# These are assumed to be generated or derived
# from PUBMED (eg save to file). The itame format
# is the most general, and any other desired tagger
# input should be processed into the itame format.

$option_input = "itame";

# Required and optional files

$input_file = "";						# The input file or -
$install_dir = "";						# The installation directory

# The output options are
#
# stdout	- (default) make a single output to stdout with commands to tag all sentences
# multiple	- write a separate file for each sentence
# faithtest	- test each sentence for faithful tokenization
# original	- output the original untokenized sentences

$option_output = "stdout";

# Internal options

$option_silent = 1;						# Instruct tagger to be terse
$option_list = 0;						# Write a list of sentence ids
$option_notag = 0;						# Exclude the tag when printing
$option_text_fields = "A";					# Which itame fields to tokenize
@option_rmlex = ();						# Lexicon files for removal
$option_start = 0;						# First sentence to tag
$option_step = 0;						# Decimate the sentences to be tagged
$option_maxsent = 0;						# Maximum number of sentences to tag
@option_addlex = ();						# Lexicon files to add
$option_command = "";						# Extra command(s) before sentences

# Internal optional files

$ngram_file = "";						# The ngram file
$lex_file = "";							# The (main) lexicon file
$word_file = "";						# Keep only sentences with these words
$list_file = "";						# A file to write sentence ids
$id_file = "";							# Keep only sentence ids in this file
$save_dir = "";							# subdirectory to save sentences

# Get a default installation directory

$x = $0;
if ($install_dir eq "" && $x =~ /util\/tokenizer.perl/)
{
	$install_dir = $x;
	$install_dir =~ s/util\/tokenizer.perl//;
	$install_dir = "." if (! $install_dir);
}

# Process all command line arguments

while ($ARGV[0])
{
	# Process any input file arguments, there can be at most one

	if ($ARGV[0] eq "-" || substr($ARGV[0],0,1) ne "-")
	{
		if ($input_file)
		{
			print STDERR "Only one input file may be specified.\n";
			exit(1);
		}
		$input_file = $ARGV[0];
	}

	# External options

	if ($ARGV[0] eq "-nosegment") { $option_segment = 0; }
	if ($ARGV[0] eq "-titles") { $option_titles = 1; }
	if ($ARGV[0] eq "-id") { $option_id = 1; }
	if ($ARGV[0] eq "-xml") { $option_input = "xml"; }
	if ($ARGV[0] eq "-medline") { $option_input = "medline"; }

	# Internal options

	if ($ARGV[0] eq "-input") { shift @ARGV; $input_file = $ARGV[0]; }
	if ($ARGV[0] eq "-home") { shift @ARGV; $install_dir = $ARGV[0]; }
	if ($ARGV[0] eq "-viterbi") { $option_compute = "viterbi"; }
	if ($ARGV[0] eq "-mle") { $option_compute = "compute"; }
	if ($ARGV[0] eq "-noop") { $option_compute = ""; }
	if ($ARGV[0] eq "-letter") { shift @ARGV; $option_letter = $ARGV[0]; }
	if ($ARGV[0] eq "-sentence") { $option_output = "multiple"; }
	if ($ARGV[0] eq "-resort") { $option_output = "resort"; }
	if ($ARGV[0] eq "-save") { shift @ARGV; $save_dir = $ARGV[0]; }
	if ($ARGV[0] eq "-original") { $option_output = "original"; }
	if ($ARGV[0] eq "-test") { $option_output = "faithtest"; }
	if ($ARGV[0] eq "-lexicon") { shift @ARGV; $lex_file = $ARGV[0]; }
	if ($ARGV[0] eq "-ngrams") { shift @ARGV; $ngram_file = $ARGV[0]; }
	if ($ARGV[0] eq "-addlex") { shift @ARGV; push @option_addlex, $ARGV[0]; }
	if ($ARGV[0] eq "-keep") { shift @ARGV; $id_file = $ARGV[0]; }
	if ($ARGV[0] eq "-keepid") { shift @ARGV; $keep_id = $ARGV[0]; }
	if ($ARGV[0] eq "-list") { $option_list = 1; }
	if ($ARGV[0] eq "-rmlex") { shift @ARGV; push @option_rmlex, $ARGV[0]; }
	if ($ARGV[0] eq "-words") { shift @ARGV; $word_file = $ARGV[0]; }
	if ($ARGV[0] eq "-step") { shift @ARGV; $option_step = $ARGV[0]; }
	if ($ARGV[0] eq "-start") { shift @ARGV; $option_start = $ARGV[0]; }
	if ($ARGV[0] eq "-maxsent") { shift @ARGV; $option_maxsent = $ARGV[0]; }
	if ($ARGV[0] eq "-verbose") { $option_silent = 0; }
	if ($ARGV[0] eq "-printfull") { $option_print = "printfull"; }
	if ($ARGV[0] eq "-command") { shift @ARGV; $option_command .= "$ARGV[0]\n"; }

	shift @ARGV;
}

# Get the default directory for the lexicon and ngram files

$install_dir = "." if (! $install_dir);
$lex_file = "${install_dir}/models/lex.cur" if (! $lex_file);
$ngram_file = "${install_dir}/models/ngrams.cur" if (! $ngram_file);
$input_file = "-" if (! $input_file);

if ($option_output eq "multiple" && $save_dir eq "")
{
	print "The option -sentence requires -save dir.\n";
	exit(1);
}

# Make sure the input file is good

unless ($input_file eq "-" || -e $input_file)
{
	print STDERR "The specified input file ($input_file) does not exist.\n";
	exit(1);
}

# If the list option was specified, get its file name

if ($option_list)
{
	if ($input_file eq "-")
	{
		print STDERR "A list file cannot be written when the input is stdin.\n";
		exit(1);
	}

	$list_file = $input_file;
	$list_file =~ s/\.[^\/]*$//;
	$list_file .= ".list";
}

# If specified, get the list of words to retain

if ($word_file)
{
	unless (-e $word_file)
	{
		print STDERR "The specified word file ($word_file) does not exist.\n";
		exit(1);
	}

	for (`cat $word_file`)
	{
		chomp;
		$keepwords{lc($_)}++;
	}
}

# If specified, get the list of sentences ids to retain

if ($id_file)
{
	unless (-e $id_file)
	{
		print STDERR "The specified id file ($id_file) does not exist.\n";
		exit(1);
	}

	for (`cat $id_file`)
	{
		chomp;
		s/\s/ /g;
		if (/^([^ ]+)/)
		{
			$keep_names{$1}++;
		}
	}
}

# If the silent option was specified, command the tagger to keep quiet

if ($option_silent && ($option_output eq "stdout" || $option_output eq "resort"))
{
	print "verbose 0\n";
}

# For standard output, print the default commands to initialize the tagger

if ($option_output eq "stdout" || $option_output eq "resort")
{
	print "#$install_dir/util/tagger\n";
	print "ngrams $ngram_file\n";
	print "lex 30 $lex_file\n";
	for $l (@option_addlex) { print "addlex $l\n"; }
	for $l (@option_rmlex) { print "rmlex $l\n"; }
	print "backoff\n";
	print "init 2\n";
	print "smooth\n";
	print $option_command;
}

open H,">$list_file" if ($option_list);

# Read and interpret the input file

$pmid = "";
$tot_sent = 0;
$collect_text = 0;
$step = $option_start;
open IN, "<$input_file";
$unread_text = "";						# "unread" line
while ($_ = $unread_text || <IN>)
{
	chomp;
	$unread_text = "";					# Reset unread line

	if ($option_input eq "itame")				# Process itame format
	{
		if (/^\.I(.*)/)					# Read the identifier
		{
			$pmid = $1;
			next;
		} elsif ($option_titles && /^\.T(.*)/)		# Tag titles?
		{
			$loc = "T";
			$text = $1;
		} elsif (/^\.A(.*)/)				# Tag abstracts always
		{
			$loc = "A";
			$text = $1;
		} else						# Skip everything else
		{
			next;
		}
	} elsif ($option_input eq "xml")			# Process xml format
	{
		# Convert escaped characters


		if (/\<PMID\>(.*)\<\/PMID\>/)			# Read the identifier
		{
			$pmid = $1;
			next;
		} elsif ($option_titles && /\<ArticleTitle\>(.*)\<\/ArticleTitle\>/)	# Tag titles?
		{
			$loc = "T";
			$text = $1;
		} elsif (/\<AbstractText\>(.*)\<\/AbstractText\>/)	# Whole abstracts
		{
			$loc = "A";
			$text = $1;
		} else
		{
			next;
		}

		# Convert the escapes

		while ($text =~ /\&\#([0-9]{1,3})\;/)
		{
			$text = "$`" . chr($1) . "$'";
		}

	} elsif ($option_input eq "medline")			# Process medline format
	{
		if (/^PMID\-\s*(.*)/)				# Read the identifier
		{
			$pmid = $1;
			next;
		} elsif ($collect_text && /^\s/)		# Still colecting text
		{
			$text .= " $_";
			next;
		} elsif ($collect_text && /^\S/)		# End of a section
		{
			$unread_text = $_;
			$collect_text = 0;
		} elsif ($option_titles && /^TI\s*\-\s*(.*)/)	# Tag titles?
		{
			$collect_text = 1;			# Start collecting text
			$loc = "T";
			$text = $1;
			next;
		} elsif (/^AB\s*\-\s*(.*)/)			# Start scanning abstracts
		{
			$collect_text = 1;			# Start collecting text
			$loc = "A";
			$text = $1;
			next;
		} else
		{
			next;
		}
	} else
	{
		print STDERR "Input format unknown.\n";
		exit(2);
	}

	$text =~ s/\s+/ /g;
	@out = tokenize($text);

	$num = 0;
	$sent_num = 0;

	for $sent (@out)
	{
		if ($word_file)
		{
			$keepit = 0;
			while ($sent =~ /([^ ]+)${tagsep}UNTAGGED/g)
			{
				if ($keepwords{lc($1)})
				{
					$keepit = 1;
					last;
				}
			}
			next if ($keepit == 0);
		}

		$sent_num++;

		# Skip every so many sentences

		$step++;
		next if ($option_step > 0 && $step < $option_step);
		$step = 0;
		last if ($option_maxsent > 0 && $tot_sent >= $option_maxsent);

		# Format the identifier for file name.
		# If segmenting, append the sentence number.

		$sentence_id = sprintf("%s%08s%s", $option_letter, $pmid, $loc);
		if ($option_segment)
		{
			$sentence_id .= sprintf("%02s", $sent_num);
		}

		# Skip if not for keeping

		next if (($id_file && ! $keep_names{$sentence_id}) || ($keep_id && $sentence_id ne $keep_id));

		# Count the sentences

		$num++;
		$tot_sent++;

		# Output file names with list values for reference

		print H "$sentence_id\n" if ($option_list);

		# Process the output options: default is to output to stdout

		if ($option_output eq "stdout")
		{
			print "echo $sentence_id\n" if ($option_id);
			print "sentence\n";
			print "$sent\n";
			print "$option_compute\n";
			print "$option_print\n";
		}

		if ($option_output eq "resort")
		{
			$x = "";
			$x .= "echo $sentence_id\n" if ($option_id);
			$x .= "sentence\n";
			$x .= "$sent\n";
			$x .= "$option_compute\n";
			$x .= "$option_print\n";
			$resorted_output{$sentence_id} = $x;
		}

		# Save each sentence in its own input file for the tagger

		if ($option_output eq "multiple")
		{
			open F,">$save_dir/$sentence_id.i";

			print F "verbose 0\n" if ($option_silent);
			print F "# sentence $num\n";
			print F "ngrams $ngram_file\n";
			print F "lex 30 $lex_file\n";
			for $l (@option_addlex) { print F "addlex $l\n"; }
			for $l (@option_rmlex) { print F "rmlex $l\n"; }

			print F "backoff\n";
			print F "init 2\n";
			print F "smooth\n";
			print F $option_command;
			print F "sentence\n$sent\n";
			print F "$option_compute\n";
			print F "option_print\n";

			close F;
		}

		# Test to see if the sentence was tokenized faithfully

		if ($option_output eq "faithtest")
		{
			$test_sent = $sent;
			$test_sent =~ s/$tagsep[^ $tagsep]+//g;
			$test_sent =~ s/ //g;
			$test_text = $save_text;
			$test_text =~ s/ //g;
			$test_text =~ s/${tagsep}/$replacesep/g;
			if (index($test_text, $test_sent, 0) < 0)
			{
				print "$Sentence $num ($sentence_id) did not tokenize faithfully.\n";
				print "$test_text\n";
				print "$test_sent\n";
			} else
			{
				print "$sentence_id ok!\n";
			}
		}

		# Locate the original sentence

		if ($option_output eq "original")
		{
			$test_sent = $sent;
			$test_sent =~ s/${tagsep}[^ $tagsep]+//g;
			$test_sent =~ s/ //g;
			$i = 0;
		search_sent:
			while (($i = index($save_text, substr($test_sent, 0, 1), $i)) >= 0)
			{
				for ($j = 0; $j + $i <= length($save_text); $j++)
				{
					$test_text = substr($save_text, $i, $j);
					$test_text =~ s/ //g;
					$test_text =~ s/${tagsep}/$replacesep/g;
					if ($test_sent eq $test_text)
					{
						# Output the original sentence

						$test_text = substr($save_text, $i, $j);
						$test_text =~ s/^ +//;
						$test_text =~ s/ +$//;
						$test_text =~ s/\'/\'\'/g;

						print "insert into chunk (chunk_id, chunk_type_cd, text) values ";
						print "('$sentence_id', 'medpost', '$test_text');\n";

						# Output the tokenization

						$n = 0;
						$t = 0;
						for (split(/ /, $sent))
						{
							($w, $a) = split(/_/, $_);
							$w =~ s/\'/\'\'/g;
							$m = $n + length($w) - 1;
							print "insert into annotation ";
							print "(chunk_id, corpus_cd, token_offs, text_type, first_offs, last_offs, text) values ";
							print "('$sentence_id', 'unknown', $t, 'token', $n, $m, '$w');\n";

							$n = $m + 1;
							$t++;
						}

						last search_sent;
					}
				}
				$i++;
			}
		}
	}

	last if ($option_maxsent > 0 && $tot_sent >= $option_maxsent);
}

if ($option_output eq "resort" && %resorted_output)
{
	for $sentence_id (sort keys %resorted_output)
	{
		print $resorted_output{$sentence_id};
	}
}

close IN;
close H if ($option_list);

# This is the tokenizer algorithm. It is a sequence of regular
# expressions that move spaces around so that tokens are space delimited.
# The algorithm is faithful, ie it does not add or delete any
# non-whitespace characters. Copyright notices, however, are removed.
# Contractions are not processed as would be required in unrestricted english text.
# A perl array is returned with one sentence per element. Each token
# is tagged with UNTAGGED or "." for the end-of-sentence punctuation,
# these tags are removed by the tagger.

sub tokenize($)
{
	my @out;

	$_ = $_[0];

	s/\s+/ /g;						# Start with simple white space

	# Remove a Copyright tag at the end of the abstract.

	if ($option_segment == 1 && /((?:Copyright|Published) .*)$/)
	{
		s/(?:Copyright|Published) .*$// if (length($1) < 80);
	}

	$save_text = $_;					# Save a copy (for internal options)

	$_ = " $_ ";						# insert spaces at beginning and end

	s/ +/ /g;						# Use only single spaces

	s/([ \(\[\{\<])\"/$1 \" /g;				# 1. quote
	s/\.\.\./ \.\.\. /g;					# 2. ellipsis

	s/([\,\;\:\@\#\$\%\&])/ $1 /g;				# 3. non-sentence ending punctuation

	s/${tagsep}/$replacesep/g;				# 4. Replace embedded tag separators

	s/ *\/ */ \/ /g;					# 5. Separate the slashes
	s/ *([+-]) *\/ *([+-]) */ $1\/$2 /g;			# 6. Put back +/- signs
	s/ +and *\/ *or +/ and\/or /g;				# 7. Put back and/or sign

	s/([\]\[\(\)\{\}\<\>])/ $1 /g;				# 8. all brackets
	s/([?!])/ $1 /g;					# 9. exclamation and question mark

	s/([^.])([.])([\]\)\}\>\"\']?) *$/$1 $2 $3/g;		# 10. Period at end of line, may be followed
								# by a closed-bracket or quote

	s/([^. ])([.]) +([A-Z])/$1 $2 $3/g;			# 11. period followed by capitalized word

	while (/([A-Za-z-]{2,100})([.]) /g)			# 12. A normal word followed by a period
	{
		s/([A-Za-z-]{2,100})([.]) /$1 $2 /g;
	}

	while (/ ([^a-z ]{2,100})([.]) /g)			# 13. A non-normal word followed by a period?
	{
		s/ ([^a-z ]{2,100})([.]) / $1 $2 /g;
	}

								# 14. Put some periods
	while (/ ([A-Za-z]) +\. +([^A-Z0-9\(])/g)		# with single-letter abbreviations
	{
		s/ ([A-Za-z]) +\. +([^A-Z0-9\(])/ $1\. $2/g;
	}

	s/ i *\. *e *\. / i\.e\. /g;				# 15. Some abbreviations, cannot end a sentence
	s/ e *\. *g *\. / e\.g\. /g;
	s/ vs +\. / vs\. /g;
	s/ approx +\. / approx\. /g;
	s/ al +\. / al\. /g;

	# 16. Look for normal words, containing at least one vowel on both sides
	# of a period. This is to get the cases where a space after a sentence
	# has been omitted.

	while (/ ([A-Za-z]*[aeiouAEIOU][A-Za-z]*)(\.)([A-Za-z]*[aeiouAEIOU][A-Za-z]*) /g)
	{
		s/ ([A-Za-z]*[aeiouAEIOU][A-Za-z]*)(\.)([A-Za-z]*[aeiouAEIOU][A-Za-z]*) / $1 $2 $3 /g;
	}

	# 17. These are numeric endings of sentences!

	while (/ ([0-9]+)([.]) +([A-Za-z])/g)
	{
		s/ ([0-9]+)([.]) +([A-Za-z])/ $1 $2 $3/g
	}

	s/ +/ /g;						# Use only single spaces (again)

	# 18. Handle periods within parentheses, allow for nesting

	$lp = "\\(";
	$rp = "\\)";
	$sp = "[ ]+";
	$dot = "${sp}[.]";
	$npt = "(?:${sp}[^\\(\\)]+)";

	if (1 == 0)
	{
		while (/ $lp$npt+$sp$rp /g)
		{
			print "Before $pmid: $&\n";
		}
	}

	while (/ $lp$npt*$sp$rp /g)
	{
		pos() = 0;
		while (/ $lp$npt*$dot$npt*$sp$rp /g)
		{
			s/( $lp$npt*)$dot($npt*$sp$rp )/$1\.$2/g;
		}
		s/ $lp($npt*$sp)$rp / \*L\*$1\*R\* /g;
	}

	s/\*L\*/\(/g;
	s/\*R\*/\)/g;

	if (1 == 0)
	{
		while (/ $lp$npt+$sp$rp /g)
		{
			print "After $pmid: $&\n";
		}
	}

	# 19. Same with brackets

	$lp = "\\[";
	$rp = "\\]";
	$npt = "(?:${sp}[^\\[\\]]+)";

	if (1 == 0)
	{
		while (/ $lp$npt+$sp$rp /g)
		{
			print "Before $pmid: $&\n";
		}
	}

	while (/ $lp$npt*$sp$rp /g)
	{
		pos() = 0;
		while (/ $lp$npt*$dot$npt*$sp$rp /g)
		{
			s/( $lp$npt*)$dot($npt*$sp$rp )/$1\.$2/g;
		}
		s/ $lp($npt*$sp)$rp / \*L\*$1\*R\* /g;
	}

	s/\*L\*/\[/g;
	s/\*R\*/\]/g;

	if (1 == 0)
	{
		while (/ $lp$npt+$sp$rp /g)
		{
			print "After $pmid: $&\n";
		}
	}

	# 20. Finally, if there is a period that is the last character of the line
	# make sure it is tokenized as an end-stop

	s/([^ ])([?!.]) *$/$1 $2 /;

	s/--/ -- /g;						# 21. handle long dash
	s/\"/ \" /g;						# 22. quote
	s/([^\'])\' /$1 \' /g;					# 23. possessive
	s/([\' ])\'([^\' ])/$1\' $2/g;				# 24. quote
	s/ \' s / \'s /g;					# 25. put back possessive

	s/(\'\'|\`\`)/ \'\' /g;					# 26. quote

	# Don't reassemble compounds, now there is a way to tag them!
	#
	# s/ ([Ii])n (vivo|vitro|situ) / ${1}n$2 /g;		# Commonly used compound words
	# s/ ([Dd])e (novo) / ${1}e$2 /g;

	s/\'([sS]) / \'$1 /g;					# 27. possessive
	if (1 == 0)
	{
		s/\'([mMdD]) / \'$1 /g;				# contraction
		s/\'ll / \'ll /g;				# contraction
		s/\'re / \'re /g;				# contraction
		s/\'ve / \'ve /g;				# contraction
		s/n\'t / n\'t /g;				# contraction
		s/\'LL / \'LL /g;				# contraction
		s/\'RE / \'RE /g;				# contraction
		s/\'VE / \'VE /g;				# contraction
		s/N\'T / N\'T /g;				# contraction
	}

	s/ ([Cc])annot / $1an not /g;				# 28. slang with apostrophe

	if (1 == 0)
	{
		s/ ([Dd])\'ye / $1\' ye /g;			# slang
		s/ ([Gg])imme / $1im me /g;			# slang
		s/ ([Gg])onna / $1on na /g;			# slang
		s/ ([Gg])otta / $1ot ta /g;			# slang
		s/ ([Ll])emme / $1em me /g;			# slang
		s/ ([Mm])ore\'n / $1ore \'n /g;			# slang
		s/ \'([Tt])is / \'$1 is /g;			# slang
		s/ \'([Tt])was / \'$1 was /g;			# slang
		s/ ([Ww])anna / $1an na /g;			# slang
	}

	s/  +/ /g;						# cleanup whitespace

	s/ ([?!.]) ([A-Za-z1-9]) (\.) / $1 $2$3 /g;		# 29. Put list item elements back at sentence end
	s/^ *([A-Za-z1-9]) ([.]) / $1$2 /g;			# 30. Or at beginning of text

	# 31. process number (and should also get money formats),
	# American number formats.

	while (/ ([0-9,]+) (,) ([0-9]{3}) /g)
	{
		s/ ([0-9,]+) (,) ([0-9]{3}) / $1$2$3 /g;
	}

	# 32. British number formats.

	while (/ ([0-9]+) ([0-9]{3}) /g)
	{
		s/ ([0-9]+) ([0-9]{3}) / $1$2 /g;
	}

	# Now, this is a trick to get some special sentence breaks where
	# there is a lower case word or a missing space after the period.

	s/([^ ]+)/$1${tagsep}UNTAGGED/g;			# Make everything UNTAGGED

	s/ {2,100}/ /g;						# Cleanup whitespace,
	s/^ +//;						# space at beginning,
	s/ +$/ /;						# Leave one space at end

	if ($option_segment)					# 33. Tag end-of-sentence marks
	{
		s/( [?!.])${tagsep}UNTAGGED */$1${tagsep}.SENTENCEBREAK/g;
	}
	s/ $/SENTENCEBREAK/g;					# Force end-of-sentence if no period

	return split(/SENTENCEBREAK/, $_);			# Return as an array
}


