
# Convert iHMM output to formatted file with Penn treebank tags
# (approximately) and slash convention

# require "tagsep.perl";
# Separator between word and tag in the input being read
$tagsep = "_";

# Output settings; these defaults can be changed by leading switches
$option_print = "sentence";
$option_preserve = 0;
$option_tagsep = "/";

# Consume every leading argument that begins with a dash.  Switches
# that are not recognised are dropped without any effect.
while (@ARGV && $ARGV[0] =~ /^-/)
{
	my $switch = shift @ARGV;

	if ($switch eq "-abgene")
	{
		$option_print = "abgene";
	}
	elsif ($switch eq "-preserve")
	{
		$option_preserve = 1;
	}
	elsif ($switch eq "-uscore")
	{
		$option_tagsep = "_";
	}
}

# Build today's local date as YYYYMMDD, zero-padding month and day
($sec, $min, $hour, $mday, $mon, $year) = localtime();
$year = $year + 1900;
$mon = sprintf("%02d", $mon + 1);
$mday = sprintf("%02d", $mday);

$date = join("", $year, $mon, $mday);

# Multi-word expressions whose tags are assigned as a unit.
# Each key is a phrase written as space-separated words; each value is
# a space-separated list of output tags, one tag per word of the phrase
# and in the same order.  multimaptag() matches these phrases against
# the tagged sentence without regard to case and replaces whatever tag
# each word carried with the tag listed here.
%multiwords = (
	"a priori"		=> "FW FW",
	"according to"		=> "VBG TO",
	"ad libitum"		=> "FW FW",
	"ahead of"		=> "RB IN",
	"apart from"		=> "RB IN",
	"as if"			=> "IN IN",
	"as far as"		=> "RB RB IN",
	"as for"		=> "IN IN",
	"as per"		=> "IN IN",
	"as long as"		=> "RB RB IN",
	"as of"			=> "IN IN",
	"as soon as"		=> "RB RB IN",
	"as though"		=> "IN IN",
	"as to"			=> "IN TO",
	"as well"		=> "RB RB",
	"as well as"		=> "RB RB IN",
	"as yet"		=> "RB RB",
	"aside from"		=> "RB IN",
	"assuming that"		=> "VBG IN",
	"at least"		=> "IN JJS",
	"at most"		=> "IN RBS",
	"away from"		=> "RB IN",
	"because of"		=> "IN IN",
	"considering that"	=> "VBG IN",
	"contrary to"		=> "JJ TO",
	"de novo"		=> "FW FW",
	"devoid of"		=> "JJ IN",
	"due to"		=> "JJ TO",
	"except for"		=> "IN IN",
	"except that"		=> "IN IN",
	"excepting that"	=> "VBG IN",
	"exclusive of"		=> "JJ IN",
	"given that"		=> "VBN IN",
	"granted that"		=> "VBN IN",
	"granting that"		=> "VBG IN",
	"in as much as"		=> "IN RB RB IN",
	"in gel"		=> "IN NN",
	"in order that"		=> "IN NN IN",
	"in the event that"	=> "IN DT NN IN",
	"in situ"		=> "FW FW",
	"in utero"		=> "FW FW",
	"in vitro"		=> "FW FW",
	"in vivo"		=> "FW FW",
	"inasmuch as"		=> "RB IN",
	"inside of"		=> "NN IN",
	"insofar as"		=> "RB IN",
	"instead of"		=> "RB IN",
	"irrespective of"	=> "RB IN",
	"just as"		=> "RB IN",
	"next to"		=> "JJ TO",
	"other than"		=> "JJ IN",
	"out of"		=> "IN IN",
	"outside of"		=> "IN IN",
	"owing to"		=> "JJ TO",
	"per cell"		=> "IN NN",
	"per se"		=> "FW FW",
	"preparatory to"	=> "JJ TO",
	"previous to"		=> "JJ TO",
	"prior to"		=> "RB TO",
	"provided that"		=> "VBN IN",
	"providing that"	=> "VBG IN",
	"pursuant to"		=> "JJ TO",
	"rather than"		=> "RB IN",
	"regardless of"		=> "RB IN",
	"seeing that"		=> "VBG IN",
	"so that"		=> "IN IN",
	"subsequent to"		=> "JJ TO",
	"such as"		=> "JJ IN",
	"such that"		=> "JJ IN",
	"supposing that"	=> "VBG IN",
	"up to"			=> "IN TO",
	"vice versa"		=> "RB RB",
);

$sent = "";
while (defined($_ = <>))
{
	# Drop the line ending and squeeze each whitespace run to one space
	chomp;
	s/\s+/ /g;

	# In abgene mode a SENTENCE comment starts a new record: emit the
	# identifier line (digits only) followed by the date line

	if ($option_print eq "abgene")
	{
		if (/echo SENTENCE (.*)/)
		{
			($pmid = $1) =~ tr/0-9//cd;
			print "@@" . $pmid . $option_tagsep . "CD\n";
			print "\&\&" . $date . $option_tagsep . "CD\n";
		}
	}

	# Translate the tag of every word_tag token found on the line

	my @tagged;
	while (/([^ ]+)${tagsep}([^ $tagsep]+)/g)
	{
		my ($word, $tag) = ($1, $2);
		push @tagged, $word . $option_tagsep . maptag($word, $tag);
	}

	# Join the tokens and let the multi-word table override their tags

	$sent = multimaptag(join(" ", @tagged));

	# Print the converted sentence; a line that yielded no tokens is
	# echoed (whitespace-normalised) only when -preserve was given

	if ($sent)
	{
		print $sent . "\n";
	}
	elsif ($option_preserve)
	{
		print "$_\n";
	}

	# Start the next line with an empty sentence

	$sent = "";
}

# maptag(WORD, TAG)
#
# Translate one input TAG to its output tag.  The tag-based rules are
# tried first; afterwards a few words receive a fixed tag no matter how
# they were tagged.  A tag that no rule covers is passed through.  Any
# "+" characters are removed from the result before it is returned.
sub maptag($$)
{
	my ($word, $tag) = @_;

	# Rules keyed on the incoming tag: [ pattern, output tag ]
	my @tag_rules = (
		[ qr/^(?:VVGN)$/,			"NN" ],
		[ qr/^(?:CS|CSN|CST|II|CS\+|II\+)$/,	"IN" ],
		[ qr/^(?:DD)$/,				"DT" ],
		[ qr/^(?:DB)$/,				"PDT" ],
		[ qr/^(?:CC|CC\+)$/,			"CC" ],
		[ qr/^(?:MC)$/,				"CD" ],
		[ qr/^(?:GE)$/,				"POS" ],
		[ qr/^(?:VVNJ|VVGJ|JJ\+)$/,		"JJ" ],
		[ qr/^(?:JJT)$/,			"JJS" ],
		[ qr/^(?:RR|RR\+)$/,			"RB" ],
		[ qr/^(?:RRR)$/,			"RBR" ],
		[ qr/^(?:RRT)$/,			"RBS" ],
		[ qr/^(?:PN|PND)$/,			"PRP" ],
		[ qr/^(?:PNG)$/,			"PRP\$" ],
		[ qr/^(?:PNR)$/,			"WDT" ],
		[ qr/^(?:VM)$/,				"MD" ],
		[ qr/^(?:VBB|VHB|VDB|VVB)$/,		"VBP" ],
		[ qr/^(?:VBI|VHI|VDI|VVI)$/,		"VB" ],
		[ qr/^(?:VBD|VHD|VDD|VVD)$/,		"VBD" ],
		[ qr/^(?:VBN|VHN|VDN|VVN)$/,		"VBN" ],
		[ qr/^(?:VBG|VHG|VDG|VVG)$/,		"VBG" ],
		[ qr/^(?:VBZ|VHZ|VDZ|VVZ)$/,		"VBZ" ],
	);

	# Rules keyed on the word itself; these override the tag rules
	my @word_rules = (
		[ qr/^(?i:to)$/,						"TO" ],
		[ qr/^(?i:who|what|whom)$/,					"WP" ],
		[ qr/^(?i:how|however|when|whenever|where|whereby|why)$/,	"WRB" ],
	);

	my $mapped = $tag;

	# Every rule is tested in order, so a later match replaces an
	# earlier one
	for my $rule (@tag_rules)
	{
		my ($pattern, $result) = @$rule;
		$mapped = $result if ($tag =~ $pattern);
	}

	for my $rule (@word_rules)
	{
		my ($pattern, $result) = @$rule;
		$mapped = $result if ($word =~ $pattern);
	}

	# Delete every "+" left in the tag
	$mapped =~ tr/+//d;

	return $mapped;
}

# multimaptag(SENT)
#
# Re-tag the multi-word expressions listed in %multiwords.
#
# SENT is a string of word<sep>tag tokens separated by single spaces,
# where <sep> is $option_tagsep.  For each phrase in %multiwords whose
# words occur consecutively in SENT (compared without regard to case),
# the tag of every word in the phrase is replaced by the corresponding
# tag from the table; the words keep their original spelling.  The
# modified sentence is returned with leading and trailing spaces
# removed.
#
# Phrases are applied in a fixed order: fewest words first, ties broken
# alphabetically.  Hash key order is not stable between runs, and some
# phrases overlap (e.g. "such as" / "as well"), so without a fixed order
# the output could differ from one run to the next.  Applying longer
# phrases last lets the longest phrase decide the final tags.
sub multimaptag($)
{
	# Pad with spaces so that every token, including the first and
	# last, is delimited by a space on both sides
	my $sent = " $_[0] ";
	my $sep = quotemeta($option_tagsep);

	# Word count per phrase; used only to fix the order of application
	my %size;
	for my $phrase (keys %multiwords)
	{
		my @parts = split(' ', $phrase);
		$size{$phrase} = scalar(@parts);
	}

	my @order = sort { $size{$a} <=> $size{$b} || $a cmp $b } keys %multiwords;

	for my $phrase (@order)
	{
		my @words = split(' ', $phrase);
		my @tags = split(' ', $multiwords{$phrase});

		# A table entry needs exactly one tag per word; skip it
		# otherwise rather than drop or invent tokens
		next unless (@words && @words == @tags);

		# " (w1)<sep>tag (w2)<sep>tag ..." followed by a space.  The
		# final space is only looked at, not consumed, so that a
		# second occurrence starting at that space is found too.
		my @pieces;
		for my $word (@words)
		{
			push @pieces, "(" . quotemeta($word) . ")" . $sep . "[^ ]+";
		}
		my $pattern = " " . join(" ", @pieces) . "(?= )";
		my $re = qr/$pattern/i;

		# Copy the sentence across, swapping in the new tags for
		# each occurrence of the phrase
		my $rebuilt = "";
		my $copied = 0;
		while ($sent =~ /$re/g)
		{
			my ($from, $to) = ($-[0], $+[0]);

			# The words exactly as they were spelled in the sentence
			my @seen = map { substr($sent, $-[$_], $+[$_] - $-[$_]) } 1 .. scalar(@words);

			$rebuilt .= substr($sent, $copied, $from - $copied);
			for my $i (0 .. $#words)
			{
				$rebuilt .= " " . $seen[$i] . $option_tagsep . $tags[$i];
			}
			$copied = $to;
		}
		$sent = $rebuilt . substr($sent, $copied);
	}

	$sent =~ s/^ +//;
	$sent =~ s/ +$//;
	return $sent;
}
