/* tokenizer.C
**
** This file performs tokenization of raw input to be read by a tagger.
*/

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>

// Character written in place of tagsep when tagsep occurs inside a token
static char replacesep = '-';
// Separator written between a token and its tag on output
static char tagsep = '_';
// Tag written after every token that does not end a sentence (-untag)
static char option_untag[1000];

// Installation directory (-home) and the model file paths derived from it
static char install_dir[1000];
static char ngram_file[1000];
static char lex_file[1000];

// Non-zero to mark sentence ends (cleared by -nosegment, -token, -idtoken)
static int option_segment = 1;
// Tagger commands written after each sentence
static char option_compute[1000];
static char option_print[1000];
// Input format: "itame", "xml", "medline", "text" or "token"
static char option_input[1000];
// Non-zero to write an "echo <sentence id>" line before each sentence (-id)
static int option_id = 0;
// Non-zero when odd-numbered input lines are ids to echo (-idtext, -idtoken)
static int option_textid = 0;
// Non-zero to tokenize titles as well as abstracts (-titles)
static int option_titles = 0;
// Input file name; empty or "-" means standard input
static char input_file[1000];
// Prefix of every sentence id (-letter)
static char option_letter[1000];
// Non-zero to append the location (and sentence number) to sentence ids (-noloc clears)
static int option_loc = 1;
// Non-zero to make single hyphens separate tokens (-hyphen)
static int option_hyphen = 0;
// Id of the sentence currently being written, built by make_sent_id
static char sentence_id[1000];

// Non-zero to write "verbose 0" at the start of the output (-verbose clears)
static int option_silent = 1;
// Input and output streams used by run_tokenizer and the print functions
static FILE *input_fp = NULL;
static FILE *option_ofp = NULL;

// These global variables are updated as records are scanned

static char	input_pmid[1000];
static char	input_loc[1000];

// Remove trailing white space from s, in place.

static void chomp(char *s)
{
	size_t	len = strlen(s);

	// The cast matters: isspace() is undefined for negative char values
	while (len > 0 && isspace((unsigned char) s[len - 1]))
		s[--len] = '\0';
}

// Return the first non-whitespace character of t at or after position i.
// The terminating '\0' is returned when only white space remains.

char nextchar(const char *t, int i)
{
	// The cast matters: isspace() is undefined for negative char values
	while (isspace((unsigned char) t[i]))
		i++;
	return t[i];
}

// Look for a token at or prior to the text position
//
// White space at position i (and before it) is skipped, then s is compared
// backwards, ignoring case, with the text ending there.  The match fails if
// a token boundary falls inside it or if it does not begin on a token start.
// Returns the index where the matched token starts, or -1.  A negative i is
// allowed and never matches.

int lookbehind(const char *t, int i, const char *s, int *tokflag)
{
	int	k = (int) strlen(s) - 1;

	while (i > 0 && isspace((unsigned char) t[i]))
		i--;

	while (k >= 0 && i >= 0)
	{
		// A token start anywhere but on the first character of s
		// means the text is split differently

		if (k > 0 && tokflag[i])
			break;

		if (tolower((unsigned char) s[k]) != tolower((unsigned char) t[i]))
			return -1;

		k--;
		i--;
	}

	// k < 0 only when all of s matched; i + 1 is then its first character

	if (k < 0 && tokflag[i + 1])
		return i + 1;
	return -1;
}

// Look for a token at or following the text position
//
// White space at position i (and after it) is skipped, then s is compared
// forwards, ignoring case, with the text starting there.  The match fails if
// a token boundary falls inside it or if it is not followed by a token start.
// Returns the index where the matched token starts, or -1.
// i must not be greater than strlen(t).

int lookahead(const char *t, int i, const char *s, int *tokflag)
{
	int	slen = (int) strlen(s);
	int	k = 0;

	while (isspace((unsigned char) t[i]))
		i++;

	// Stopping at the terminator replaces the repeated strlen(t) calls

	while (k < slen && t[i] != '\0')
	{
		if (k > 0 && tokflag[i])
			break;

		if (tolower((unsigned char) s[k]) != tolower((unsigned char) t[i]))
			return -1;

		k++;
		i++;
	}

	if (k == slen && tokflag[i])
		return i - slen;
	return -1;
}

// Set the initial tokens at spaces
//
// tokflag[i] == 1 means a token starts at text[i].  The first character,
// every white space character, every character that follows white space,
// and the position of the terminator are token starts; all others are not.
// tokflag must have room for strlen(text) + 1 entries.

void tok_0(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	tokflag[0] = 1;
	for (int i = 1; i < len; i++)
	{
		tokflag[i] = (isspace((unsigned char) text[i])
			|| isspace((unsigned char) text[i - 1])) ? 1 : 0;
	}

	// Index by len, not by the loop counter, so that empty text
	// does not write past the end of the array

	tokflag[len] = 1;
}

// Get quotes preceded by open parens
//
// A double quote immediately preceded by an open bracket ( [ { <
// is a separate token
//

void tok_1(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 1; i < len; i++)
	{
		if (text[i] != '"' || strchr("([{<", text[i - 1]) == NULL)
			continue;

		tokflag[i] = 1;
		if (i + 1 < len)
			tokflag[i + 1] = 1;
	}
}

// Look for ellipses
//
// Three dots in a row, starting after the first character of the text,
// is a separate token

void tok_2(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	// The bound is written as i + 2 < len because strlen(text) - 2 wraps
	// around to a huge unsigned value when the text is shorter than 2

	for (int i = 1; i + 2 < len; i++)
	{
		if (strncmp(&text[i], "...", 3) == 0)
		{
			tokflag[i] = 1;
			if (i + 3 < len)
				tokflag[i + 3] = 1;
		}
	}
}

// Non-sentence-ending punctuation
//
// Each of the characters , ; : @ # $ % & is a separate token

void tok_3(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 0; i < len; i++)
	{
		if (strchr(",;:@#$%&", text[i]) == NULL)
			continue;

		// i + 1 is at most len, which tokflag has room for

		tokflag[i] = 1;
		tokflag[i + 1] = 1;
	}
}

// Separate the slashes
//
// Slashes are a separate token
// except for +/-, +/+, -/-, -/+, and and/or.

void tok_5_6_7(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 0; i < len; i++)
	{
		if (text[i] != '/')
			continue;

		tokflag[i] = 1;
		if (i + 1 < len)
			tokflag[i + 1] = 1;

		// Put back +/-, etc

		if (i - 1 >= 0
		&& i + 1 < len
		&& strchr("+-", text[i - 1])
		&& strchr("+-", text[i + 1]))
		{
			tokflag[i - 1] = 1;
			tokflag[i] = tokflag[i + 1] = 0;
			tokflag[i + 2] = 1;
		}

		// Put back and/or, etc

		int i0 = lookbehind(text, i - 1, "and", tokflag);
		int i1 = lookahead(text, i + 1, "or", tokflag);
		if (i0 >= 0 && i1 >= 0)
		{
			// Clear every token start after the 'a' of "and" up to
			// and including the 'o' of "or".  A separate index is
			// used so the scan position is left alone; the text
			// between here and the end of "or" holds no slash, so
			// rescanning it changes nothing.

			for (int j = i0 + 1; j <= i1; j++)
				tokflag[j] = 0;
		}
	}
}

// All brackets
//
// Any open or closed bracket is a separate token
//
// Exclamation and question mark
//
// Any question or exclamation mark is a separate token

void tok_8_9(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 0; i < len; i++)
	{
		if (strchr("[](){}<>?!", text[i]) == NULL)
			continue;

		tokflag[i] = 1;
		if (i + 1 < len)
			tokflag[i + 1] = 1;
	}
}

// Period at the end of a string may be followed by closed-bracket or quote
//
// Scanning back from the end of the string over white space, closing
// brackets, quotes and periods, every period that is not preceded by
// another period is a separate token.  The scan stops at the first
// character of any other kind.

void tok_10(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = len - 1; i >= 0; i--)
	{
		if (isspace((unsigned char) text[i]))
			continue;
		if (strchr("])}>\"'", text[i]))
			continue;
		if (text[i] != '.')
			break;

		if (i == 0 || text[i - 1] != '.')
		{
			tokflag[i] = 1;
			if (i + 1 < len)
				tokflag[i + 1] = 1;
		}
	}
}

// Period followed by a capitalized word
//
// A period preceded by a character that is not another period and not a space
// and followed by a space then an upper case letter is a separate token

void tok_11(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 0; i < len; i++)
	{
		if (text[i] != '.')
			continue;

		if (i + 1 >= len || !isspace((unsigned char) text[i + 1]))
			continue;

		// The two tests on the preceding character must both hold.
		// Joined with "or" they can never fail together, which let
		// the last period of an ellipsis be split off.

		if (i > 0
		&& (text[i - 1] == '.' || isspace((unsigned char) text[i - 1])))
			continue;

		if (isupper((unsigned char) nextchar(text, i + 1)))
			tokflag[i] = 1;
	}
}

// A normal word followed by a period
//
// A period followed by a token start
// and preceded by 2 or more alphabetic characters or hyphens
// is a separate token

void tok_12(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);
	int	run = 0;	// letters and hyphens seen in a row just before i

	for (int i = 0; i < len; i++)
	{
		if (text[i] == '.' && tokflag[i + 1] && run >= 2)
			tokflag[i] = 1;

		if (isalpha((unsigned char) text[i]) || text[i] == '-')
			++run;
		else
			run = 0;
	}
}

// A non-normal token (that has no lower case letters) followed by a period
//
// A period at the end of a token made of characters excluding lower case
// is a separate token

void tok_13(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);
	int	counting = 0;	// set at a token start, cleared by lower case or a period
	int	count = 0;	// characters counted since counting was last cleared

	for (int i = 0; i < len; i++)
	{
		if (text[i] == '.' && tokflag[i + 1] && count >= 2)
			tokflag[i] = 1;

		if (tokflag[i] == 1)
			counting = 1;

		if (islower((unsigned char) text[i]) || text[i] == '.')
		{
			counting = 0;
			count = 0;
		}

		if (counting)
			count++;
	}
}

// put some periods with single-letter abbreviations
//
// A single alphabetic token followed by a period followed
// by a token that does not begin with an upper case letter,
// a number or an open paren is taken to be an abbreviation
// and the period does not start a new token.
//
// NOTE: This does not recognize initials in people's names,
//	 that problem is not simply solved.

void tok_14(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 1; i < len; i++)
	{
		// A one-letter token, then a period ending that token

		if (text[i] != '.'
		|| !isalpha((unsigned char) text[i - 1])
		|| !tokflag[i - 1]
		|| !tokflag[i + 1])
			continue;

		unsigned char next = (unsigned char) nextchar(text, i + 1);

		if (!isupper(next) && !isdigit(next) && next != '(')
			tokflag[i] = 0;
	}
}

// some abbreviations, cannot end a sentence
//
// Look for some specific common abbreviations
// that don't usually end a sentence.
// The comparison is case sensitive.

static const char *const common_abb[] = { "i.e.", "e.g.", "approx.", "vs.", "al.", "viz.", "v.",
	"Mr.", "Ms.", "Dr.", "Mrs.", "Drs.", "Prof.", "Sen.", "St.",
	NULL };

void tok_15(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 0; i < len; i++)
	{
		if (tokflag[i] != 1)
			continue;

		for (int a = 0; common_abb[a]; a++)
		{
			int	k = (int) strlen(common_abb[a]);

			// The abbreviation must begin and end on token boundaries

			if (strncmp(&text[i], common_abb[a], k) == 0 && tokflag[i + k])
			{
				// Rejoin everything inside it into one token

				for (int j = 1; j < k; j++)
					tokflag[i + j] = 0;
				break;
			}
		}
	}
}

// Get cases where a space after a sentence has been omitted
//
// A period that occurs in a token consisting of alphabetic
// letters with a vowel to the left and the right is a
// separate token.

void tok_16(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 0; i < len; i++)
	{
		int	j;
		int	has_vowel;

		if (text[i] != '.' || tokflag[i] != 0)
			continue;

		// Scan the letters to the left, back to a token start

		has_vowel = 0;
		for (j = i - 1; j >= 0; --j)
		{
			if (!isalpha((unsigned char) text[j]))
				break;
			if (strchr("aeiouAEIOU", text[j]))
				has_vowel = 1;
			if (tokflag[j])
				break;
		}
		if ((j >= 0 && tokflag[j] == 0) || has_vowel == 0)
			continue;

		// Scan the letters to the right, up to the next token start

		has_vowel = 0;
		for (j = i + 1; j < len && tokflag[j] == 0; ++j)
		{
			if (!isalpha((unsigned char) text[j]))
				break;
			if (strchr("aeiouAEIOU", text[j]))
				has_vowel = 1;
		}
		if ((j < len && tokflag[j] == 0) || has_vowel == 0)
			continue;

		tokflag[i] = 1;
		tokflag[i + 1] = 1;
	}
}

// Numeric endings of sentences
//
// A period after a numeric token followed by a token that starts
// with an alphabetic character, is a separate token.
//
// The scan walks back over digits until it reaches a token start, and
// requires at least one digit between that token start and the period.
// The character at the token start itself is not checked.
//
// This should be covered already by tok_13

void tok_17(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 0; i < len; i++)
	{
		int	j;

		if (text[i] != '.' || tokflag[i] != 0 || !tokflag[i + 1])
			continue;

		for (j = i - 1; j >= 0 && isdigit((unsigned char) text[j]) && tokflag[j] == 0; --j)
			;

		if (j >= 0 && j < i - 1 && tokflag[j]
		&& isalpha((unsigned char) nextchar(text, i + 1)))
			tokflag[i] = 1;
	}
}

// period at end of string is a token
//
// If the last character that is not white space is a period, question
// mark or exclamation mark, it starts a token.

void tok_20(const char *text, int *tokflag)
{
	int	i = (int) strlen(text) - 1;

	while (i >= 0 && isspace((unsigned char) text[i]))
		--i;

	if (i >= 0 && strchr(".!?", text[i]))
		tokflag[i] = 1;
}


// long dash
//
// A pair of hyphens is a complete token

void tok_21(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	// The bound is written as i + 1 < len because strlen(text) - 1 wraps
	// around to a huge unsigned value when the text is empty

	for (int i = 0; i + 1 < len; i++)
	{
		if (text[i] != '-' || text[i + 1] != '-')
			continue;

		tokflag[i] = 1;
		if (i + 2 < len)
		{
			// Start the next token after the pair, and go on from there

			i += 2;
			tokflag[i] = 1;
		}
	}
}

// hyphens
//
// If specified as an option, a single hyphen (one that is not next to
// another hyphen and is not the last character) is a complete token

void tok_21a(const char *text, int *tokflag)
{
	if (option_hyphen == 0) return;

	int	len = (int) strlen(text);

	// The bound is written as i + 1 < len because strlen(text) - 1 wraps
	// around to a huge unsigned value when the text is empty

	for (int i = 0; i + 1 < len; i++)
	{
		if (text[i] == '-'
		&& (i == 0 || text[i - 1] != '-')
		&& text[i + 1] != '-')
		{
			tokflag[i] = 1;
			tokflag[i + 1] = 1;
		}
	}
}


// quote
//
// Any double quote is a separate token

void tok_22(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 0; i < len; i++)
	{
		if (text[i] != '"')
			continue;

		// Every quote is examined, including one that directly follows
		// another, so that the text after it starts its own token

		tokflag[i] = 1;
		if (i + 1 < len)
			tokflag[i + 1] = 1;
	}
}

// possessive
//
// Any single quote at the end of a token that is preceded by
// a character other than a single quote is a separate token

void tok_23(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	// A quote at position 0 has no preceding character, so start at 1

	for (int i = 1; i < len; i++)
	{
		if (text[i] == '\'' && text[i - 1] != '\'' && tokflag[i + 1])
			tokflag[i] = 1;
	}
}


// quote
//
// If a single quote starts a token, or is preceded by a
// single quote, and followed by a character
// that is not a single quote, then
// the character to its right is the start of a new token

void tok_24(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 0; i + 1 < len; i++)
	{
		if (text[i] != '\'' || text[i + 1] == '\'')
			continue;

		if (tokflag[i] == 1 || (i > 0 && text[i - 1] == '\''))
			tokflag[i + 1] = 1;
	}
}

// put back possessive
//
// A single quote that is a whole token followed by a lower case s
// that is also a whole token (without space between them)
// should be merged into a single token

void tok_25(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 0; i + 1 < len; i++)
	{
		if (text[i] != '\'' || tokflag[i] != 1)
			continue;
		if (text[i + 1] != 's' || tokflag[i + 1] != 1)
			continue;

		// The s must end its token: end of text, white space,
		// or a token start right after it

		if (i + 2 >= len
		|| isspace((unsigned char) text[i + 2])
		|| tokflag[i + 2] == 1)
			tokflag[i + 1] = 0;
	}
}

// quote
//
// A pair of single quotes or a pair of back quotes is a separate token

void tok_26(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 0; i + 1 < len; i++)
	{
		int	pair = (text[i] == '\'' || text[i] == '`') && text[i + 1] == text[i];

		if (!pair)
			continue;

		tokflag[i] = 1;
		if (i + 2 < len)
			tokflag[i + 2] = 1;
	}
}

// possessive
//
// A single quote followed by a letter s (either case) that ends
// its token is a possessive, and the quote starts a token

void tok_27(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 0; i + 1 < len; i++)
	{
		if (text[i] != '\'')
			continue;
		if (tolower((unsigned char) text[i + 1]) != 's')
			continue;

		if (i + 2 >= len || tokflag[i + 2])
			tokflag[i] = 1;
	}
}

// split "cannot" to "can not"
//
// A single token that is the word cannot (in any case)
// is split into two words

void tok_28(const char *text, int *tokflag)
{
	static const char word[] = "cannot";
	const int wlen = 6;
	int	len = (int) strlen(text);

	for (int i = 0; i + wlen <= len; i++)
	{
		int	k;

		// The word must be a whole token, so it has to begin on a token start

		if (!tokflag[i])
			continue;

		for (k = 0; k < wlen && tolower((unsigned char) text[i + k]) == word[k]; k++)
			;

		// ... and a new token has to start right after it

		if (k == wlen && tokflag[i + wlen])
			tokflag[i + 3] = 1;
	}
}

// put list item elements back at sentence end
//
// A period that is preceded by an alphanumeric (no space)
// and any amount of preceding space and an end-mark
// stays with the alphanumeric.

void tok_29(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 1; i < len; i++)
	{
		// A period that is a token of its own ...

		if (text[i] != '.' || !tokflag[i] || !tokflag[i + 1])
			continue;

		// ... directly after a one-character alphanumeric token ...

		if (!isalnum((unsigned char) text[i - 1]) || !tokflag[i - 1])
			continue;

		// ... which itself follows an end-mark token

		int	j = lookbehind(text, i - 2, ".", tokflag);
		if (j < 0) j = lookbehind(text, i - 2, "?", tokflag);
		if (j < 0) j = lookbehind(text, i - 2, "!", tokflag);

		if (j >= 0 && tokflag[j])
			tokflag[i] = 0;
	}
}

// list elements at the beginning of a string
//
// A single alphanumeric character followed by a period
// at the beginning of the line (after any white space)
// stays with the alphanumeric

void tok_30(const char *text, int *tokflag)
{
	int	i = 0;

	while (isspace((unsigned char) text[i]))
		i++;

	// text[i] is '\0' when the text is all white space; that is not
	// alphanumeric, so text[i + 1] is never read past the terminator

	if (!isalnum((unsigned char) text[i]) || !tokflag[i])
		return;

	if (text[i + 1] == '.' && tokflag[i + 1])
		tokflag[i + 1] = 0;
}

// process American style numbers
//
// A comma that stands between a digit and three more digits (1,234)
// is put back into the number.

void tok_31(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 1; i + 3 < len; i++)
	{
		if (text[i] != ',' || !tokflag[i] || !tokflag[i + 1])
			continue;

		if (isdigit((unsigned char) text[i - 1])
		&& isdigit((unsigned char) text[i + 1])
		&& isdigit((unsigned char) text[i + 2])
		&& isdigit((unsigned char) text[i + 3]))
		{
			tokflag[i] = 0;
			tokflag[i + 1] = 0;
		}
	}
}

// process British style numbers
//
// A space that stands between a digit and three more digits (1 234)
// is put back into the number.

void tok_32(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 1; i + 3 < len; i++)
	{
		if (text[i] != ' ' || !tokflag[i] || !tokflag[i + 1])
			continue;

		if (isdigit((unsigned char) text[i - 1])
		&& isdigit((unsigned char) text[i + 1])
		&& isdigit((unsigned char) text[i + 2])
		&& isdigit((unsigned char) text[i + 3]))
		{
			tokflag[i] = 0;
			tokflag[i + 1] = 0;
		}
	}
}

// tokenize unicode escapes
//
// An ampersand followed by letters, or by # and digits, and then a
// semicolon (&amp; &#945;) is a single token.

void tok_33(const char *text, int *tokflag)
{
	int	len = (int) strlen(text);

	for (int i = 0; i < len; i++)
	{
		int	j;

		if (text[i] != '&')
			continue;

		// Find the end of the name or number; both scans stop at the terminator

		if (text[i + 1] == '#')
		{
			for (j = i + 2; isdigit((unsigned char) text[j]); j++)
				;
		} else
		{
			for (j = i + 1; isalpha((unsigned char) text[j]); j++)
				;
		}

		if (text[j] != ';')
			continue;

		// Tokenize the escape, untokenize everything inside

		tokflag[i] = 1;
		for (int m = i + 1; m <= j; m++)
			tokflag[m] = 0;
		tokflag[j + 1] = 1;

		// Go on from the semicolon, so that the loop increment lands on
		// the character right after it; that may begin another escape

		i = j;
	}
}


// Run every tokenization step over text, in order.
//
// tokflag must have room for strlen(text) + 1 entries.  On return
// tokflag[i] is non-zero where a token starts.  The steps depend on
// the flags left by the steps before them, so the order matters.

void set_tokflag(const char *text, int *tokflag)
{
	tok_0(text, tokflag);
	tok_1(text, tokflag);
	tok_2(text, tokflag);
	tok_3(text, tokflag);

	// step 4 replaces tag char, this is done at output

	tok_5_6_7(text, tokflag);
	tok_8_9(text, tokflag);

	tok_10(text, tokflag);
	tok_11(text, tokflag);
	tok_12(text, tokflag);
	tok_13(text, tokflag);
	tok_14(text, tokflag);
	tok_15(text, tokflag);
	tok_16(text, tokflag);
	tok_17(text, tokflag);

	// steps 18 and 19 recognize periods within parens,
	// and this is moved to the segmentation section

	tok_20(text, tokflag);
	tok_21(text, tokflag);
	tok_21a(text, tokflag);		// added to optionally tokenize hyphens
	tok_22(text, tokflag);
	tok_23(text, tokflag);
	tok_24(text, tokflag);
	tok_25(text, tokflag);
	tok_26(text, tokflag);
	tok_27(text, tokflag);
	tok_28(text, tokflag);
	tok_29(text, tokflag);
	tok_30(text, tokflag);
	tok_31(text, tokflag);
	tok_32(text, tokflag);

	tok_33(text, tokflag);
}

/* set_endflag
**
** After tokflag has been set, find the possible sentence endings.
**
** endflag must have room for strlen(text) + 1 entries; all of them are
** written.  endflag[i] is set to 1 where a period, question mark or
** exclamation mark that is a token of its own ends a sentence.
** tokflag is changed too: such a mark inside parentheses or brackets
** is joined back onto the token before it.
*/

void set_endflag(const char *text, int *tokflag, int *endflag)
{
	int	len = (int) strlen(text);
	int	i;

	// The following tests look for end-stops and label them.
	// They include steps 18 and 19

	memset(endflag, 0, ((size_t) len + 1) * sizeof endflag[0]);

	// Count the number of unmatched parens

	int up = 0;	// unmatched round parens
	int ub = 0;	// unmatched brackets

	for (i = 0; i < len; i++)
	{
		switch (text[i])
		{
		case '(': ++up; break;
		case ')': if (up > 0) --up; break;
		case '[': ++ub; break;
		case ']': if (ub > 0) --ub; break;
		default: break;
		}
	}

	// Now find the end-of-sentence marks

	// tok_18: periods within parentheses, allow for nesting
	// tok_19: periods within brackets, allow for nesting
	//	the perl version solves this by putting the period
	//	back with the previous token, but a better solution
	//	is to allow it to be tokenized but just don't
	// 	allow it to be an end-of-sentence.
	//	Therefore, these are moved to the segmentation
	//	section

	int p = 0;	// round parens open at this position
	int b = 0;	// brackets open at this position

	for (i = 0; i < len; i++)
	{
		switch (text[i])
		{
		case '(': ++p; break;
		case ')': if (p > 0) --p; break;
		case '[': ++b; break;
		case ']': if (b > 0) --b; break;
		default: break;
		}

		if (strchr(".!?", text[i]) == NULL || !tokflag[i] || !tokflag[i + 1])
			continue;

		// A depth above the depth at the end of the text means
		// this position is inside a group that gets closed later

		int	inside = (p > up || b > ub);

		if (option_segment && !inside)
			endflag[i] = 1;

		// This is optional to join periods with
		// probable abbreviations

		if (inside)
			tokflag[i] = 0;
	}
}

/* print_tok
**
** Print a single token.
**
** sp		non-zero if a token of this sentence was already printed
** tok_pos	number of characters collected in tok
** tok		the collected characters; a terminator is written at
**		tok[tok_pos], so the buffer needs tok_pos + 1 bytes
** endflag	non-zero if this token ends a sentence
**
** Returns 1 if a token has now been printed for this sentence,
** otherwise sp unchanged.
*/

#define NO_SPACES 1		// Don't print spaces in tokens, this may need to be an option

// Clean up a token to print, and return 1 if something was printed, 0 otherwise
// Convert tag separator characters to replacesep and back quotes to regular quotes

int print_tok(int sp, int tok_pos, char *tok, int endflag)
{
	// Null terminate

	tok[tok_pos--] = '\0';

	// Remove trailing white space

	while (tok_pos >= 0 && isspace((unsigned char) tok[tok_pos]))
		tok[tok_pos--] = '\0';

	// Convert any tag separator chars

	for (int i = 0; i <= tok_pos; i++)
	{
		if (tok[i] == tagsep) tok[i] = replacesep;
		if (tok[i] == '`') tok[i] = '\'';
	}

	// Nothing to print.  The first character is tested rather than
	// tok_pos because the collected text may begin with the terminator
	// copied from the end of the input.

	if (tok[0] == '\0')
		return sp;

	// Print the token

	if (sp)
		fprintf(option_ofp, " ");
	else
	{
		if (option_id) fprintf(option_ofp, "echo %s\n", sentence_id);
		fprintf(option_ofp, "sentence\n");
	}

	for (const char *c = tok; *c; c++)
	{
		if (!isspace((unsigned char) *c))
			fputc(*c, option_ofp);
	}

	if (endflag)
		fprintf(option_ofp, "%c.", tagsep);
	else
		fprintf(option_ofp, "%c%s", tagsep, option_untag);

	return 1;
}

// Build sentence_id for sentence number sent_num of the current record.
//
// The id is option_letter, then input_pmid right-justified in 8 columns,
// then (if option_loc) input_loc and (if option_segment as well) the
// two-digit sentence number.  Every white space character in the result,
// the padding included, becomes '0'.  An id that does not fit in
// sentence_id is cut short.

void make_sent_id(int sent_num)
{
	size_t	used;

	// Pad with spaces and let the loop below turn them into zeros;
	// the 0 flag is not defined for strings

	snprintf(sentence_id, sizeof sentence_id, "%s%8s", option_letter, input_pmid);

	if (option_loc)
	{
		used = strlen(sentence_id);
		snprintf(sentence_id + used, sizeof sentence_id - used, "%s", input_loc);

		if (option_segment)
		{
			used = strlen(sentence_id);
			snprintf(sentence_id + used, sizeof sentence_id - used, "%02d", sent_num);
		}
	}

	for (char *s = sentence_id; *s; s++)
	{
		if (isspace((unsigned char) *s))
			*s = '0';
	}
}

/* print_sent
**
** After the tokflag and endflag have been set, print the tokens.
**
** tokflag is changed: token starts that fall on white space are moved
** to the next character that is not white space.  After every token
** that ends a sentence, and after the last token, the option_compute
** and option_print commands are written.
*/

void print_sent(const char *text, int *tokflag, int *endflag)
{
	int	len = (int) strlen(text);
	int	i;

	// Move token starts to non-whitespace chars

	int	pending = 0;
	for (i = 0; i < len; i++)
	{
		int	is_sp = isspace((unsigned char) text[i]) != 0;

		if (tokflag[i] == 1 && is_sp)
		{
			tokflag[i] = 0;
			pending = 1;
		} else if (!is_sp && pending)
		{
			tokflag[i] = 1;
			pending = 0;
		}
	}

	// A token can hold every character of the text and the terminator
	// copied at i == len, and print_tok writes one more terminator
	// behind that, so a fixed-size buffer is not safe here

	char	*tok = (char *) malloc((size_t) len + 2);
	if (tok == NULL)
	{
		fprintf(stderr, "Out of memory\n");
		return;
	}

	// Extract the tokens and print them out now

	int	tok_pos = 0;
	int	n = 0;
	int	ef = 0;
	int	sent_num = 1;
	make_sent_id(sent_num);

	for (i = 0; i <= len; i++)
	{
		// The start of a new token

		if (tokflag[i])
		{
			// Print the last token

			n = print_tok(n, tok_pos, tok, ef);

			// If this was an end-stop, and segmentation is desired,
			// Then command the tagger to tag

			if (n && ef)
			{
				fprintf(option_ofp, "\n%s\n%s\n", option_compute, option_print);
				n = 0;
				++sent_num;
				make_sent_id(sent_num);
			}

			// Start a new token

			tok_pos = 0;
			ef = 0;
		}

		// Append to the current token, leaving out leading white space

		if (tok_pos > 0 || isspace((unsigned char) text[i]) == 0)
		{
			tok[tok_pos++] = text[i];
			if (endflag[i]) ef = 1;
		}
	}

	// Print the last token

	n = print_tok(n, tok_pos, tok, ef);
	if (n) fprintf(option_ofp, "\n%s\n%s\n", option_compute, option_print);

	free(tok);
}

// Replace numeric escapes by the character they stand for, in place.
//
// An escape is &# followed by one to three digits and a semicolon, with
// a value from 1 to 255.  Everything else, including escapes with other
// values, is left as it is.  The text never gets longer.

void map_escapes(char *text)
{
	int	len = (int) strlen(text);
	int	k = 0;		// next position to write

	for (int i = 0; i < len; i++)
	{
		if (text[i] == '&' && text[i + 1] == '#')
		{
			char	digits[4];
			int	nd = 0;

			// The scan cannot run off the end: the terminator is not a digit

			while (nd < 3 && isdigit((unsigned char) text[i + 2 + nd]))
			{
				digits[nd] = text[i + 2 + nd];
				nd++;
			}
			digits[nd] = '\0';

			int	ch = atoi(digits);

			// Only values that fit in one byte and are not the
			// terminator can be stored in the text

			if (nd > 0 && text[i + 2 + nd] == ';' && ch > 0 && ch <= 255)
			{
				text[k++] = (char) ch;
				i += 2 + nd;	// now at the semicolon
				continue;
			}
		}
		text[k++] = text[i];
	}
	text[k] = '\0';
}

void tokenize(char *text)
{
	if (strlen(text) == 0) return;

	map_escapes(text);

	int *tokflag = new int[strlen(text) + 1];
	int *endflag = new int[strlen(text) + 1];

	set_tokflag(text, tokflag);
	set_endflag(text, tokflag, endflag);
	print_sent(text, tokflag, endflag);

	delete[] tokflag;
	delete[] endflag;
}

// Blank out the tags of text that is already tokenized.
//
// From each underscore up to the next white space, every character
// (the underscore included) is replaced by a space.

void tok_un(char *text)
{
	int	in_tag = 0;

	for (char *p = text; *p; ++p)
	{
		if (isspace((unsigned char) *p))
			in_tag = 0;
		if (*p == '_')
			in_tag = 1;
		if (in_tag)
			*p = ' ';
	}
}

void pretokenize(char *text)
{
	if (strlen(text) == 0) return;

	map_escapes(text);

	int *tokflag = new int[strlen(text) + 1];
	int *endflag = new int[strlen(text) + 1];

	tok_un(text);
	tok_0(text, tokflag);
	set_endflag(text, tokflag, endflag);
	print_sent(text, tokflag, endflag);

	delete[] tokflag;
	delete[] endflag;
}


// Return a pointer to the first character of text that does not occur
// in what (the terminator of text if every character occurs in it).

char *skip(char *text, const char *what)
{
	return text + strspn(text, what);
}

#define MAX_BUFF 10000

// Copy the option value src into dst, which has room for cap bytes.
// Returns 1 on success.  Returns 0, after a message on stderr, if the
// value is missing (src is NULL) or does not fit.

static int set_string_option(char *dst, size_t cap, const char *src, const char *what)
{
	if (src == NULL)
	{
		fprintf(stderr, "Missing value for %s\n", what);
		return 0;
	}
	if (strlen(src) >= cap)
	{
		fprintf(stderr, "Value for %s is too long\n", what);
		return 0;
	}
	strcpy(dst, src);
	return 1;
}

// Set the options from the command line.
//
// An argument that is "-" or does not begin with '-' names the input
// file; there can be at most one.  Options that are not known here are
// ignored.  Returns 1 on success and 0, after a message on stderr, if
// the arguments cannot be used.

int process_tokenizer_args(int argc, char **argv)
{
	strcpy(install_dir, "");
	strcpy(input_file, "");
	strcpy(option_compute, "viterbi");
	strcpy(option_print, "printsent");
	strcpy(option_input, "itame");
	strcpy(option_letter, "P");
	option_loc = 1;
	strcpy(option_untag, "UNTAGGED");

	for (int i = 1; i < argc; i++)
	{
		const char *arg = argv[i];

		// Process any input file arguments, there can be at most one

		if (strcmp(arg, "-") == 0 || arg[0] != '-')
		{
			if (strlen(input_file) > 0)
			{
				fprintf(stderr, "Only one input file may be specified.\n");
				return 0;
			}
			if (!set_string_option(input_file, sizeof input_file, arg, "input file"))
				return 0;
			continue;
		}

		// Options that take their value from the next argument

		char	*dst = NULL;
		size_t	cap = 0;

		if (strcmp(arg, "-input") == 0) { dst = input_file; cap = sizeof input_file; }
		else if (strcmp(arg, "-home") == 0) { dst = install_dir; cap = sizeof install_dir; }
		else if (strcmp(arg, "-letter") == 0) { dst = option_letter; cap = sizeof option_letter; }
		else if (strcmp(arg, "-untag") == 0) { dst = option_untag; cap = sizeof option_untag; }

		if (dst != NULL)
		{
			// Do not step past the end of argv when the value is missing

			const char *val = (i + 1 < argc) ? argv[++i] : NULL;

			if (!set_string_option(dst, cap, val, arg))
				return 0;
			continue;
		}

		// Options without a value

		if (strcmp(arg, "-nosegment") == 0) option_segment = 0;
		else if (strcmp(arg, "-titles") == 0) option_titles = 1;
		else if (strcmp(arg, "-id") == 0) option_id = 1;
		else if (strcmp(arg, "-xml") == 0) strcpy(option_input, "xml");
		else if (strcmp(arg, "-medline") == 0) strcpy(option_input, "medline");
		else if (strcmp(arg, "-text") == 0) strcpy(option_input, "text");
		else if (strcmp(arg, "-idtext") == 0) { strcpy(option_input, "text"); option_textid = 1; }
		else if (strcmp(arg, "-token") == 0) { strcpy(option_input, "token"); option_segment = 0; }
		else if (strcmp(arg, "-idtoken") == 0) { strcpy(option_input, "token"); option_textid = 1; option_segment = 0; }
		else if (strcmp(arg, "-viterbi") == 0) strcpy(option_compute, "viterbi");
		else if (strcmp(arg, "-mle") == 0) strcpy(option_compute, "compute");
		else if (strcmp(arg, "-noop") == 0) strcpy(option_compute, "");
		else if (strcmp(arg, "-noloc") == 0) option_loc = 0;
		else if (strcmp(arg, "-hyphen") == 0) option_hyphen = 1;
		else if (strcmp(arg, "-printfull") == 0) strcpy(option_print, "printfull");
		else if (strcmp(arg, "-verbose") == 0) option_silent = 0;
	}

	if (strlen(install_dir) == 0)
		strcpy(install_dir, "/home/lsmith/medpost");

	// The model paths are longer than install_dir, so check that they fit

	int	n1 = snprintf(lex_file, sizeof lex_file, "%s/models/lex.cur", install_dir);
	int	n2 = snprintf(ngram_file, sizeof ngram_file, "%s/models/ngrams.cur", install_dir);

	if (n1 < 0 || (size_t) n1 >= sizeof lex_file
	|| n2 < 0 || (size_t) n2 >= sizeof ngram_file)
	{
		fprintf(stderr, "Value for -home is too long\n");
		return 0;
	}

	return 1;
}

// Read the input, tokenize it, and write the tagger commands to ofp.
//
// The input is input_file, or standard input if that is empty or "-".
// How the lines are read depends on option_input.  Returns 1 on
// success and 0, after a message on stderr, if the input file could
// not be opened.

int run_tokenizer(FILE *ofp)
{
	if (strlen(input_file) == 0 || strcmp(input_file, "-") == 0)
	{
		input_fp = stdin;
	} else
	{
		input_fp = fopen(input_file, "r");
		if (input_fp == NULL)
		{
			fprintf(stderr, "Could not open file %s\n", input_file);
			return 0;
		}
	}

	option_ofp = ofp;

	if (option_silent)
		fprintf(option_ofp, "verbose 0\n");

	if (strcmp(option_untag, "UNTAGGED") != 0)
		fprintf(option_ofp, "untag %s\n", option_untag);

	fprintf(option_ofp, "#%s/util/tagger\n", install_dir);
	fprintf(option_ofp, "ngrams %s\n", ngram_file);
	fprintf(option_ofp, "lex 30 %s\n", lex_file);
	fprintf(option_ofp, "backoff\n");
	fprintf(option_ofp, "init 2\n");
	fprintf(option_ofp, "smooth\n");

	strcpy(input_pmid, "");
	strcpy(input_loc, "");

	char	*save_text = new char[MAX_BUFF + 1];	// a medline field and its continuation lines
	char	*text = new char[MAX_BUFF + 1];		// the current input line
	save_text[0] = '\0';

	int	collect_text = 0;	// non-zero while a medline field is being collected
	int	line = 0;

	while (fgets(text, MAX_BUFF, input_fp))
	{
		line++;
		chomp(text);

		if (strcmp(option_input, "itame") == 0)
		{
			if (strncmp(text, ".I", 2) == 0)
			{
				snprintf(input_pmid, sizeof input_pmid, "%s", text + 2);
			} else if (option_titles && strncmp(text, ".T", 2) == 0)
			{
				strcpy(input_loc, "T");
				tokenize(text + 2);
			} else if (strncmp(text, ".A", 2) == 0)
			{
				strcpy(input_loc, "A");
				tokenize(text + 2);
			}
		} else if (strcmp(option_input, "xml") == 0)
		{
			char *s1;
			char *s2;

			// Only elements that open and close on the same line are found

			if ((s1 = strstr(text, "<PMID>")) && (s2 = strstr(text, "</PMID>")) && s2 > s1)
			{
				*s2 = '\0';
				snprintf(input_pmid, sizeof input_pmid, "%s", s1 + 6);
			} else if (option_titles && (s1 = strstr(text, "<ArticleTitle>")) && (s2 = strstr(text, "</ArticleTitle>")) && s2 > s1)
			{
				strcpy(input_loc, "T");
				*s2 = '\0';
				tokenize(s1 + 14);
			} else if ((s1 = strstr(text, "<AbstractText>")) && (s2 = strstr(text, "</AbstractText>")) && s2 > s1)
			{
				strcpy(input_loc, "A");
				*s2 = '\0';
				tokenize(s1 + 14);
			}
		} else if (strcmp(option_input, "medline") == 0)
		{
			if (collect_text)
			{
				if (isspace((unsigned char) *text))
				{
					// A continuation line: append what still fits

					size_t	used = strlen(save_text);

					if (used + strlen(text) > MAX_BUFF)
						fprintf(stderr, "Text too long, truncated at line %d\n", line);
					snprintf(save_text + used, (size_t) MAX_BUFF + 1 - used, "%s", text);
				} else
				{
					tokenize(save_text);
					collect_text = 0;
				}
			}

			if (strncmp(text, "PMID", 4) == 0)
			{
				snprintf(input_pmid, sizeof input_pmid, "%s", skip(text + 4, "- "));
			} else if (option_titles && strncmp(text, "TI", 2) == 0)
			{
				strcpy(input_loc, "T");
				strcpy(save_text, skip(text + 2, "- "));
				collect_text = 1;
			} else if (strncmp(text, "AB", 2) == 0)
			{
				strcpy(input_loc, "A");
				strcpy(save_text, skip(text + 2, "- "));
				collect_text = 1;
			}

		} else if (strcmp(option_input, "text") == 0)
		{
			// If id is specified for text input
			// each line is preceded by an id

			if (option_textid && (line % 2))
			{
				fprintf(option_ofp, "echo %s\n", text);
			} else
			{
				tokenize(text);
			}
		} else if (strcmp(option_input, "token") == 0)
		{
			// If id is specified for token input
			// each line is preceded by an id

			if (option_textid && (line % 2))
			{
				fprintf(option_ofp, "echo %s\n", text);
			} else
			{
				pretokenize(text);
			}
		}
	}

	// A medline field that runs up to the end of the input
	// has not been printed yet

	if (collect_text)
		tokenize(save_text);

	delete[] text;
	delete[] save_text;

	if (input_fp != stdin)
		fclose(input_fp);
	input_fp = NULL;

	return 1;
}

// Tokenize the input named on the command line and write the
// tagger commands to standard output.

int main(int argc, char **argv)
{
	// Do not run with options that could not be processed;
	// the reason has already been written to stderr

	if (!process_tokenizer_args(argc, argv))
		return EXIT_FAILURE;

	run_tokenizer(stdout);
	return EXIT_SUCCESS;
}
