
# Convert iHMM output to formatted file with SPECIALIST lexicon tags
# (approximately).

# require "tagsep.perl";
# Separator between a word and its tag in input and output tokens.
$tagsep = "_";

# Output mode: "sentence" by default, "abgene" when -abgene is given.
$option_print = "sentence";
# Set by -preserve: lines that yield no tagged tokens are echoed instead
# of being dropped.
$option_preserve = 0;

# Consume leading option arguments.  The @ARGV check avoids calling
# substr() on an undefined value once the arguments run out, and a bare
# "-" is left in place because the <> operator reads it as standard input.
while (@ARGV && $ARGV[0] ne "-" && substr($ARGV[0], 0, 1) eq "-")
{
	my $option = shift @ARGV;
	if ($option eq "-abgene") { $option_print = "abgene"; }
	elsif ($option eq "-preserve") { $option_preserve = 1; }
	else { warn "ignoring unknown option $option\n"; }
}

# Build today's local date as a YYYYMMDD string in $date.  $mon and $mday
# are left behind as two-character, zero-padded strings and $year as the
# full four-digit year.
($sec, $min, $hour, $mday, $mon, $year) = localtime();
$year += 1900;
$mon = sprintf("%02d", $mon + 1);	# localtime() months start at 0
$mday = sprintf("%02d", $mday);

$date = $year . $mon . $mday;

# Multi-word expressions whose tags are overridden as a unit.
#   key:   the phrase, lower case, words separated by single spaces
#   value: the replacement tags, one per word of the phrase, in order
%multiwords = (
	'a priori'          => 'prep noun',
	'according to'      => 'verb prep',
	'ad libitum'        => 'prep noun',
	'ahead of'          => 'adv prep',
	'apart from'        => 'adv prep',
	'as if'             => 'prep prep',
	'as far as'         => 'prep adv prep',
	'as for'            => 'prep prep',
	'as per'            => 'prep prep',
	'as long as'        => 'prep adv prep',
	'as of'             => 'prep prep',
	'as soon as'        => 'prep adv prep',
	'as though'         => 'prep prep',
	'as to'             => 'prep prep',
	'as well'           => 'prep adj',
	'as well as'        => 'prep adv prep',
	'as yet'            => 'prep adv',
	'aside from'        => 'adv prep',
	'assuming that'     => 'verb compl',
	'at least'          => 'prep adj',
	'at most'           => 'prep adv',
	'away from'         => 'adv prep',
	'because of'        => 'prep prep',
	'considering that'  => 'verb compl',
	'contrary to'       => 'adj prep',
	'de novo'           => 'prep noun',
	'devoid of'         => 'adj prep',
	'due to'            => 'adj prep',
	'except for'        => 'verb prep',
	'except that'       => 'verb compl',
	'excepting that'    => 'verb compl',
	'exclusive of'      => 'adj prep',
	'given that'        => 'verb compl',
	'granted that'      => 'verb compl',
	'granting that'     => 'verb compl',
	'in as much as'     => 'prep prep adv prep',
	'in gel'            => 'prep noun',
	'in order that'     => 'prep noun compl',
	'in the event that' => 'prep det noun compl',
	'in situ'           => 'prep noun',
	'in utero'          => 'prep noun',
	'in vitro'          => 'prep noun',
	'in vivo'           => 'prep noun',
	'inasmuch as'       => 'conj prep',
	'inside of'         => 'noun prep',
	'insofar as'        => 'conj prep',
	'instead of'        => 'adv prep',
	'irrespective of'   => 'adv prep',
	'just as'           => 'adv prep',
	'next to'           => 'adv prep',
	'other than'        => 'adj prep',
	'out of'            => 'adv prep',
	'outside of'        => 'adv prep',
	'owing to'          => 'verb prep',
	'per cell'          => 'prep noun',
	'per se'            => 'prep noun',
	'preparatory to'    => 'adj prep',
	'previous to'       => 'adj prep',
	'prior to'          => 'adv prep',
	'provided that'     => 'verb compl',
	'providing that'    => 'verb compl',
	'pursuant to'       => 'adj prep',
	'rather than'       => 'adv prep',
	'regardless of'     => 'adv prep',
	'seeing that'       => 'verb compl',
	'so that'           => 'prep compl',
	'subsequent to'     => 'adj prep',
	'such as'           => 'adj prep',
	'such that'         => 'adj compl',
	'supposing that'    => 'verb compl',
	'up to'             => 'prep prep',
	'vice versa'        => 'adv adv',
);

# Main loop: read the tagged text line by line, remap the tag of every
# word/tag token with maptag(), let multimaptag() override the tags of
# known multi-word expressions, and print the result.  Each input line is
# handled independently, so nothing is left to flush when input ends (a
# trailing flush would only emit a stray blank line under -preserve).
while (<>)
{
	chomp;
	s/\s+/ /g;	# collapse every run of whitespace to one space

	# In abgene mode a line containing "echo SENTENCE ..." starts a record:
	# print a blank line, "@@<digits>/CD" built from the digits that follow
	# the marker, and "&&<date>/CD".
	if ($option_print eq "abgene" && /echo SENTENCE (.*)/)
	{
		my $pmid = $1;
		$pmid =~ s/[^0-9]+//g;
		print "\n@@" . $pmid . "/CD\n";
		print "\&\&$date/CD\n";
	}

	# Collect every word/tag token on the line with its tag remapped.
	my @tokens;
	while (/([^ ]+)${tagsep}([^ $tagsep]+)/g)
	{
		# $w and $t deliberately stay package variables: code outside
		# this loop may read the current word through $w.
		$w = $1;
		$t = $2;
		push @tokens, $w . $tagsep . maptag($w, $t);
	}

	my $sent = multimaptag(join(" ", @tokens));

	# A line without tokens is dropped unless -preserve was given, in
	# which case the whitespace-normalised line is echoed.
	if ($sent) { print "$sent\n"; }
	elsif ($option_preserve) { print "$_\n"; }
}

# Map the tag of one token to the tag written to the output.
#
# Arguments:
#   $_[0]  the word the tag belongs to
#   $_[1]  the tag attached to that word in the input
#
# Returns the mapped tag.  A tag that matches no rule is returned as given
# except that every "+" is removed.  The rules run in the order written;
# the word-based rules at the end override the tag-based ones.
sub maptag($$)
{
	# Read the word from the argument list rather than from whatever
	# package variable $w the caller happens to have set.
	my $w = $_[0];
	my $t = $_[1];
	my $s = $t;

	# These are the general rules

	$s =~ s/\+//g;

	$s = "noun" if ($s =~ /^(?:NN|NNS|NNP|VVGN)$/);
	# No end anchor: any tag that starts with II or CSN becomes "prep".
	$s = "prep" if ($s =~ /^(?:II|CSN)/);
	$s = "det" if ($s =~ /^(?:DD|DB)$/);
	$s = "conj" if ($s =~ /^(?:CC|CS)$/);
	$s = "compl" if ($s =~ /^(?:CST)$/);
	$s = "num" if ($s =~ /^(?:MC)$/);
	$s = "adj" if ($s =~ /^(?:VVNJ|VVGJ|JJ|JJT|JJR)$/);
	$s = "adv" if ($s =~ /^(?:RR|RRR|RRT)$/);
	$s = "pron" if ($s =~ /^(?:PN|PND|PNR|PNG)$/);
	$s = "modal" if ($s =~ /^(?:VM)$/);
	$s = "adv" if ($s =~ /^(?:TO)$/);
	# Prefix matches: VB/VH/VD followed by at least one more character
	# become "aux"; VV followed by at least one more character becomes
	# "verb".  The VV tags listed above (VVGN, VVNJ, VVGJ) were already
	# rewritten and no longer match here.
	$s = "aux" if ($s =~ /^(?:V[BHD].)/);
	$s = "verb" if ($s =~ /^(?:VV.)/);
	$s = "adv" if ($s =~ /^(?:EX)$/);
	$s = "ap" if ($s eq "''");
	$s = "bq" if ($s eq "``");

	# Some things that are not handled

	$s = "noun" if ($s eq "SYM");
	$s = "noun" if ($s eq "GE");
	$s = "pd" if ($s eq ".");

	# Some words are handled the same way always

	$s = "hy" if ($w eq "-" || $w eq "--");
	$s = "cm" if ($w eq ",");
	$s = "pd" if ($w eq ".");
	$s = "rp" if ($w eq ")");
	$s = "lp" if ($w eq "(");
	$s = "pc" if ($w eq "%");
	$s = "pl" if ($w eq "+");
	$s = "cl" if ($w eq ":");
	$s = "sc" if ($w eq ";");
	$s = "rk" if ($w eq "]");
	$s = "lk" if ($w eq "[");
	$s = "ls" if ($w eq "<");
	$s = "gr" if ($w eq ">");
	$s = "eq" if ($w eq "=");
	$s = "am" if ($w eq "&");
	$s = "qu" if ($w eq "?");
	$s = "ax" if ($w eq "*");
	$s = "dq" if ($w eq "\"");
	$s = "sl" if ($w eq "/");

	return $s;
}

# Override the tags of known multi-word expressions in a tagged sentence.
#
# Argument:
#   $_[0]  a sentence made of word/tag tokens separated by single spaces
#
# Returns the sentence with, for every phrase in %multiwords that occurs in
# it (words compared case-insensitively, whatever their current tags), the
# tag of each word replaced by the corresponding tag from the table.  The
# words keep their original spelling.  Leading and trailing spaces are
# stripped from the result.
#
# Phrases are applied in a fixed order, shorter phrases first, so that where
# one phrase contains another (e.g. "as well" inside "as well as") the
# longer phrase is applied last and its tags are the ones that remain.
# Iterating in plain keys() order would make that outcome vary between runs.
sub multimaptag($)
{
	my $sent = " $_[0] ";
	my $sep = quotemeta($tagsep);

	my @phrases = sort { length($a) <=> length($b) or $a cmp $b }
		keys %multiwords;

	for my $phrase (@phrases)
	{
		my @words = split(/ +/, $phrase);
		my @tags = split(/ +/, $multiwords{$phrase});

		# One "word<sep>anytag" sub-pattern per word of the phrase.
		my $pattern = join(" ",
			map { quotemeta($_) . $sep . "[^ ]+" } @words);

		# The surrounding spaces are asserted, not consumed, so that an
		# occurrence directly following another one is still found.
		$sent =~ s{(?<= )($pattern)(?= )}{_retag_phrase("$1", \@words, \@tags)}gie;
	}

	$sent =~ s/^ +//;
	$sent =~ s/ +$//;
	return $sent;
}

# Rebuild one matched phrase: each token keeps the word as it was spelled in
# the text and receives the tag at the same position in @$tags.  A token
# with no corresponding tag is returned unchanged.
sub _retag_phrase
{
	my ($matched, $words, $tags) = @_;
	my @tokens = split(/ /, $matched);
	my @out;

	for my $i (0 .. $#tokens)
	{
		if (defined $tags->[$i])
		{
			my $word = substr($tokens[$i], 0, length($words->[$i]));
			push @out, $word . $tagsep . $tags->[$i];
		}
		else
		{
			push @out, $tokens[$i];
		}
	}
	return join(" ", @out);
}
