#!/usr/bin/env perl

# Proudly written using vi (OpenBSD nvi)

# NOTE: If you're wondering why the error codes used by the functions are so
# inconsistent, go ask my former self

# NOTE 2: This codebase has grown as new features were needed, so it's quite
# ugly now, but I haven't had time to clean it up.

use strict;
use warnings;
use utf8;
use feature 'unicode_strings';
use open qw< :encoding(UTF-8) >;
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
use Unicode::Normalize;
use Glib qw/TRUE FALSE/;
use Gtk2 '-init';
use Getopt::Long;
use Pod::Usage;
use Scalar::Util qw(weaken);
use File::Basename qw(dirname);
use File::Spec::Functions qw(rel2abs file_name_is_absolute);

# takes a string of words separated by '$config->{choicesep}' and returns a new string in the
# same format with duplicates removed
sub get_unique_words {
	# $word   - string of choices separated by $config->{choicesep}
	# $config - program config (only "choicesep" is read)
	# Returns the same choices, in order of first appearance, with every
	# repeated choice dropped, joined by the separator again.
	my ($word, $config) = @_;
	my $sep = $config->{choicesep};
	my %seen;
	my @unique;
	foreach my $choice (split /\Q$sep\E/, $word) {
		# only the first occurrence of a choice is kept
		next if $seen{$choice}++;
		push @unique, $choice;
	}
	return join $sep, @unique;
}

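# Adds all words in $words (a hash of word => replacement) to $trie
# Automatically combines the replacements of duplicate words with
# "$config->{choicesep}" in between, unless $override is set, in which
# case the new replacement overwrites the existing one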
sub add_to_trie {
	# $table_name - id of the table the words come from (only used for the
	#               duplicate warnings)
	# $trie       - root node (hash reference) of the trie to add to
	# $words      - hash reference mapping each word to its replacement
	# $args       - command line arguments; "checkduplicates" turns on the
	#               duplicate warnings
	# $config     - program config; "choicesep" separates replacement choices
	# $override   - if true, the replacement of an already existing word is
	#               overwritten instead of being combined with the new one
	#
	# Every node is a hash with one key per following character, plus the
	# keys "parent" (weak reference to the parent node), "final" (the
	# replacement, if a word ends here) and "table_name" (only stored when
	# "checkduplicates" is set).
	my ($table_name, $trie, $words, $args, $config, $override) = @_;
	foreach my $word (keys %$words) {
		my $cur_node = $trie;
		foreach my $char (split //, $word) {
			if (!exists($cur_node->{$char})) {
				$cur_node->{$char}->{"parent"} = $cur_node;
				# This is required to avoid circular references
				# (otherwise, the garbage collector doesn't ever
				# destroy these nodes, leading to the memory
				# consumption growing without restraint if
				# "Reload config" is used)
				weaken($cur_node->{$char}->{"parent"});
			}
			$cur_node = $cur_node->{$char};
		}
		if (exists($cur_node->{"final"})) {
			if ($override) {
				$cur_node->{"final"} = $words->{$word};
				# keep the table name reported by later duplicate
				# warnings in sync with the replacement it refers to
				if ($args->{"checkduplicates"}) {
					$cur_node->{"table_name"} = $table_name;
				}
				next;
			}
			if ($args->{"checkduplicates"}) {
				warn "WARNING: Duplicate word \"$word\". Last occurrence as " .
					"\"$cur_node->{final}\" in table \"$cur_node->{table_name}\", " .
					"current occurrence as \"$words->{$word}\" in " .
					"table \"$table_name\".\n";
			}
			$cur_node->{"final"} = get_unique_words($cur_node->{"final"} . $config->{choicesep} . $words->{$word}, $config);
		} else {
			$cur_node->{"final"} = $words->{$word};
			if ($args->{"checkduplicates"}) {
				$cur_node->{"table_name"} = $table_name;
			}
		}
	}
}

# Prompt user when no replacement has been found for a word
# $contextl and $contextr are the transliterated context to the left and
# right of the word, with $contextl_orig and $contextr_orig being the
# original, non-transliterated context.
# $word is the word that was not found
# $word_repl is the replacement word - this is only used when the window is
#            called from the word choice window, since the original and
#            replacement aren't the same then
# $config is the current program config. The keys of
# $config->{"display_tables"} (i.e. the actual table paths) are shown
# to allow the user to choose a table to save a new replacement to.
# $cur_lineno is a display string to show the current line number
# $args are the command line arguments
# $config_error is an optional flag to specify whether an error
# message should be displayed, informing the user that the config
# could not be loaded (used when "Reload config" is clicked)
# Returns: an array reference containing an action to be taken,
# in the form ["<action name>", <optional args>].
# See `handle_unknown_word_action` for currently accepted values
sub prompt_unknown_word {
	# Builds a toplevel window, runs a nested Gtk2 main loop until that
	# window is destroyed and then returns the action chosen by the user.
	# Dies with "Processing stopped at line ..." if "Stop processing" was
	# clicked.

	# yes, this function really should take fewer arguments...
	# it would be better to just pass the substrings and an index
	my ($contextl, $contextl_orig, $word_repl, $word, $contextr, $contextr_orig,
	    $config, $cur_lineno, $args, $config_error) = @_;
	# $action is set by the button callbacks below; $stop is set by the
	# "Stop processing" button
	my $action;
	my $stop = 0;

	my $window = Gtk2::Window->new('toplevel');
	# returning FALSE from delete_event lets the window be destroyed,
	# which in turn ends the nested main loop started below
	$window->signal_connect(delete_event => sub {return FALSE});
	$window->signal_connect(destroy => sub { Gtk2->main_quit; });
	$window->set_border_width(10);

	my $vbox = Gtk2::VBox->new(FALSE, 10);

	my $linelabel = Gtk2::Label->new("Current line: $cur_lineno");
	$vbox->pack_start($linelabel, FALSE, FALSE, 0);
	$linelabel->show;

	my $wordlabel = Gtk2::Label->new("Word not found: $word");
	$wordlabel->set_alignment(0.0, 0.0);
	$vbox->pack_start($wordlabel, FALSE, FALSE, 0);
	$wordlabel->show;

	# Make a text box with the given left and right context and label
	# Also creates a button allowing the user to set the currently
	# selected text as the word to be replaced - useful when only part
	# of the entire word that was not found has to be replaced
	my $make_context_box = sub {
		# look, don't blame me for these miserably named variables...
		# left context, highlighted word, right context, label text
		my ($ctxtl, $wrd, $ctxtr, $lbl) = @_;
		my $hbox = Gtk2::HBox->new(FALSE, 5);
		my $label = Gtk2::Label->new($lbl);
		my $text = Gtk2::TextView->new;
		$text->set_wrap_mode("word");
		my $buffer = $text->get_buffer();
		# The buffer is filled back to front: first the right context,
		# then the highlighted word and the left context are each
		# inserted at the start of the buffer
		$buffer->set_text($ctxtr);
		my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow");
		my $start = $buffer->get_start_iter();
		$buffer->insert_with_tags($start, $wrd, $highlight);
		$start = $buffer->get_start_iter();
		$buffer->insert($start, $ctxtl);
		my $button = Gtk2::Button->new("Use selection as word");
		$button->signal_connect(
			clicked => sub {
				# This overwrites the outer $word, which is the
				# variable all action callbacks below read from
				if (my ($start, $end) = $buffer->get_selection_bounds()) {
					$word = $buffer->get_text($start, $end, FALSE);
					$wordlabel->set_text("Selected: $word");
				}
			}, $window);
		$hbox->pack_start($label, FALSE, FALSE, 0);
		$hbox->pack_start($text, TRUE, TRUE, 0);
		$vbox->pack_start($hbox, FALSE, FALSE, 0);
		# second row: the buttons belonging to this text box
		$hbox = Gtk2::HBox->new(FALSE, 5);
		$hbox->pack_start($button, FALSE, FALSE, 0);
		my $complete_text = $ctxtl . $wrd . $ctxtr;
		$button = Gtk2::Button->new("Reset text");
		$button->signal_connect(
			clicked => sub {
				# restores the text only; the highlight tag is
				# not applied again
				$buffer->set_text($complete_text);
			}, $window);
		$hbox->pack_start($button, FALSE, FALSE, 0);
		$vbox->pack_start($hbox, FALSE, FALSE, 0);
	};
	# one box for the transliterated text, one for the original text
	$make_context_box->($contextl, $word_repl, $contextr, "Context: ");
	$make_context_box->($contextl_orig, $word, $contextr_orig, "Original: ");

	# Row of "Ignore" buttons - each stores its action and closes the window
	my $hbox = Gtk2::HBox->new(FALSE, 5);
	my $label = Gtk2::Label->new("Ignore: ");
	$hbox->pack_start($label, FALSE, FALSE, 0);
	my $button = Gtk2::Button->new("This run");
	$button->signal_connect(
		clicked => sub {
			$action = ["ignore", "run", $word];
			$window->destroy;
		}, $window);
	$hbox->pack_start($button, FALSE, FALSE, 0);
	$button = Gtk2::Button->new("Permanently");
	$button->signal_connect(
		clicked => sub {
			$action = ["ignore", "permanent", $word];
			$window->destroy;
		}, $window);
	$hbox->pack_start($button, FALSE, FALSE, 0);
	$button = Gtk2::Button->new("Whole line");
	$button->signal_connect(
		clicked => sub {
			$action = ["ignore", "wholeline"];
			$window->destroy;
		}, $window);
	$hbox->pack_start($button, FALSE, FALSE, 0);
	$vbox->pack_start($hbox, FALSE, FALSE, 0);

	# AHHHH! IT BURNS!!! THE CODE IS SO HORRIBLE!
	# Take note, kids - this is what happens when you keep adding
	# features without rethinking your basic design.

	# Drop-down list of the table paths a new replacement can be saved to
	$hbox = Gtk2::HBox->new(FALSE, 5);
	$label = Gtk2::Label->new("Add to list: ");
	$hbox->pack_start($label, FALSE, FALSE, 0);
	my $path_list = Gtk2::ComboBox->new_text;
	foreach my $path (sort keys %{$config->{"display_tables"}}) {
		$path_list->append_text($path);
	}
	$hbox->pack_start($path_list, FALSE, FALSE, 0);
	$vbox->pack_start($hbox, FALSE, FALSE, 0);

	# Entry for the replacement text of the new word
	$hbox = Gtk2::HBox->new(FALSE, 5);
	$label = Gtk2::Label->new("Replacement: ");
	$hbox->pack_start($label, FALSE, FALSE, 0);
	my $replace_entry = Gtk2::Entry->new;
	$hbox->pack_start($replace_entry, TRUE, TRUE, 0);
	$vbox->pack_start($hbox, FALSE, FALSE, 0);

	# One button per "retrywithout" entry of the config. Each entry is a
	# list whose first element is the button label, with the remaining
	# elements being passed on to replace_strip_chars
	if (exists $config->{"retrywithout"}) {
		$hbox = Gtk2::HBox->new(FALSE, 5);
		$label = Gtk2::Label->new("Retry without: ");
		$hbox->pack_start($label, FALSE, FALSE, 0);
		foreach my $without (@{$config->{"retrywithout"}}) {
			$button = Gtk2::Button->new("$without->[0]");
			$button->signal_connect(
				clicked => sub {
					my @chars = @{$without}[1..$#$without];
					my $stripped = replace_strip_chars($config, $args, \@chars, $word);
					# recombine substrings
					my $repl_text = "";
					$repl_text .= $_->[1] foreach @$stripped;
					# the result is only put into the entry, the
					# user still has to click "Add replacement"
					$replace_entry->set_text($repl_text);
				}, $window);
			$hbox->pack_start($button, FALSE, FALSE, 0);
		}
		$vbox->pack_start($hbox, FALSE, FALSE, 0);
	}

	$hbox = Gtk2::HBox->new(FALSE, 0);
	$button = Gtk2::Button->new("Add replacement");
	$button->signal_connect(
		clicked => sub {
			# nothing happens as long as no table is selected
			if ($path_list->get_active != -1) {
				$action = ["add", $word, $replace_entry->get_text, $path_list->get_active_text];
				$window->destroy;
			}
		}, $window);
	$hbox->pack_start($button, FALSE, FALSE, 0);
	$vbox->pack_start($hbox, FALSE, FALSE, 0);

	$hbox = Gtk2::HBox->new(FALSE, 5);
	$button = Gtk2::Button->new("Stop processing");
	$button->signal_connect(
		clicked => sub {
			# the actual die happens after the main loop has returned
			$stop = 1;
			$window->destroy;
		}, $window);
	$hbox->pack_start($button, FALSE, FALSE, 0);

	$button = Gtk2::Button->new("Reload config");
	$button->signal_connect(
		clicked => sub {
			$action = ["reload"];
			$window->destroy;
		}, $window);
	$hbox->pack_start($button, FALSE, FALSE, 0);

	if ($config_error) {
		$label = Gtk2::Label->new("Error loading config; see terminal output for details");
		$hbox->pack_start($label, FALSE, FALSE, 0);
	}
	$vbox->pack_start($hbox, FALSE, FALSE, 0);

	$window->add($vbox);
	$window->show_all;
	# blocks until the destroy handler above calls Gtk2->main_quit
	Gtk2->main;

	die "Processing stopped at line $cur_lineno\n" if $stop;

	if (!$action) {
		# This action isn't explicitly handled, but at least nothing
		# breaks when the window is closed without selecting an action
		$action = ["dummy"];
	}
	return $action;
}

# Prompt the user when a word has multiple replacement options (separated by $config->{choicesep})
# $cur_lineno - display string to show the current line number
# Returns:
# 3, if this window needs to be called again but nothing needs
#    to be re-transliterated
# 1, if the line needs to be re-transliterated
# 0, if nothing needs to be done
sub prompt_choose_word {
	# $substrings - array reference of [<flag>, <text>] pairs making up the
	#               current line. Only substrings with a true flag whose text
	#               contains $config->{choicesep} are offered to the user.
	#               The chosen words are written back into this structure.
	# $config     - the current program config
	# $args       - the command line arguments
	# $cur_lineno - display string to show the current line number
	# Dies with "Processing stopped at line ..." if "Stop processing" is
	# clicked.
	my ($substrings, $config, $args, $cur_lineno) = @_;

	# make a list of all substrings that contain multiple word options
	my @replacements;
	foreach (0..$#$substrings) {
		if ($substrings->[$_]->[0] && $substrings->[$_]->[1] =~ /\Q$config->{choicesep}\E/) {
			# This ugly bit of code is here as a special case for transliterating
			# Hindi to Urdu text - if there are *exactly* two choices and one
			# contains diacritics but the other one doesn't, the one with diacritics
			# is automatically used instead of prompting the user.
			if (exists $config->{"targetdiacritics"}) {
				my @choices = split /\Q$config->{choicesep}\E/, $substrings->[$_]->[1];
				my @diacritics = @{$config->{"targetdiacritics"}};
				if (@choices == 2) {
					@choices = map {NFD($_)} @choices;
					my $first_matches = grep {$choices[0] =~ /$_/} @diacritics;
					my $second_matches = grep {$choices[1] =~ /$_/} @diacritics;
					if ($first_matches && !$second_matches) {
						$substrings->[$_]->[1] = $choices[0];
						next;
					} elsif (!$first_matches && $second_matches) {
						$substrings->[$_]->[1] = $choices[1];
						next;
					}
				}
			}
			# Format of the elements in @replacements:
			# [<id of substrings in $substrings>, <replacement word>, <original string>]
			push @replacements, [$_, $substrings->[$_]->[1], $substrings->[$_]->[1]];
		}
	}
	# no substrings have multiple options - return an explicit 0 ("nothing
	# needs to be done") so that callers comparing the result numerically
	# don't have to deal with an undefined value
	return 0 if (!@replacements);

	my $stop = 0;
	my $open_unknown = 0;
	# index into @replacements of the word currently shown to the user
	my $cur_replacement = 0;

	my $window = Gtk2::Window->new('toplevel');
	# returning FALSE from delete_event lets the window be destroyed,
	# which in turn ends the nested main loop started below
	$window->signal_connect(delete_event => sub {return FALSE});
	$window->signal_connect(destroy => sub { Gtk2->main_quit; });
	$window->set_border_width(10);

	my $vbox = Gtk2::VBox->new(FALSE, 0);

	my $linelabel = Gtk2::Label->new("Current line: $cur_lineno");
	$vbox->pack_start($linelabel, FALSE, FALSE, 0);

	my $wordlabel = Gtk2::Label->new("");
	$wordlabel->set_alignment(0.0, 0.0);
	$vbox->pack_start($wordlabel, FALSE, FALSE, 0);

	my $undo = Gtk2::Button->new("Undo");
	$vbox->pack_start($undo, FALSE, FALSE, 0);
	$undo->set_sensitive(FALSE);

	# holds one button per choice for the current word
	my $button_vbox = Gtk2::VBox->new(FALSE, 0);
	$vbox->pack_start($button_vbox, FALSE, FALSE, 0);

	# only shown once every word has been handled
	my $accept = Gtk2::Button->new("Accept changes?");
	$vbox->pack_start($accept, FALSE, FALSE, 0);

	my $hbox = Gtk2::HBox->new(FALSE, 0);
	my $label = Gtk2::Label->new("Context: ");
	my $text = Gtk2::TextView->new;
	$text->set_wrap_mode("word");
	my $buffer = $text->get_buffer();
	my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow");
	$text->set_editable(FALSE);
	$hbox->pack_start($label, FALSE, FALSE, 0);
	$hbox->pack_start($text, TRUE, TRUE, 10);
	$vbox->pack_start($hbox, FALSE, FALSE, 10);

	$hbox = Gtk2::HBox->new(FALSE, 5);
	my $skip_button = Gtk2::Button->new("Skip word");
	$hbox->pack_start($skip_button, FALSE, FALSE, 0);
	my $unknown_button = Gtk2::Button->new("Open in unknown word window");
	$hbox->pack_start($unknown_button, FALSE, FALSE, 0);
	my $stop_button = Gtk2::Button->new("Stop processing");
	$hbox->pack_start($stop_button, FALSE, FALSE, 0);
	$vbox->pack_start($hbox, FALSE, FALSE, 0);

	# generate the context to the left and to the right of the current word being replaced
	my $get_context = sub {
		my ($contextl, $contextr) = ("", "");
		# walks through @replacements in parallel to $substrings so that
		# words that were already chosen are shown with their new text
		my $tmp_replacement = 0;
		foreach (0..$#$substrings) {
			my $word = $substrings->[$_]->[1];
			if ($tmp_replacement <= $#replacements && $replacements[$tmp_replacement]->[0] == $_) {
				$word = $replacements[$tmp_replacement]->[1];
				$tmp_replacement++;
			}
			# When nothing is left to replace, the entire string is in $contextl
			if ($cur_replacement > $#replacements || $_ < $replacements[$cur_replacement]->[0]) {
				$contextl .= $word;
			} elsif ($_ > $replacements[$cur_replacement]->[0]) {
				$contextr .= $word;
			}
		}
		return ($contextl, $contextr);
	};

	# fill the text buffer with the context and current word, highlighting the word
	# if $cur_replacement is after the end of @replacements, don't highlight anything
	# (this happens when all words have been replaced and the user only needs to accept the changes)
	my $fill_text_buffer = sub {
		my $start = $buffer->get_start_iter();
		my $end = $buffer->get_end_iter();
		$buffer->delete($start, $end);
		my ($contextl, $contextr) = $get_context->();
		# the buffer is filled back to front, always inserting at the start
		$buffer->set_text($contextr);
		if ($cur_replacement <= $#replacements) {
			$start = $buffer->get_start_iter();
			$buffer->insert_with_tags($start, $replacements[$cur_replacement]->[1], $highlight);
		}
		$start = $buffer->get_start_iter();
		$buffer->insert($start, $contextl);
	};

	# switch the window to the "all words handled" state
	my $show_accept = sub {
		$button_vbox->foreach(sub {my $child = shift; $child->destroy();});
		$accept->show;
		$accept->grab_focus;
		$wordlabel->set_text("");
		$skip_button->set_sensitive(FALSE);
		$unknown_button->set_sensitive(FALSE);
	};

	my $fill_button_vbox; # forward-declaration so it can be used here already
	# advance to the next word (also used directly as the "Skip word" handler)
	my $next_word = sub {
		$undo->set_sensitive(TRUE);
		$cur_replacement++;
		$fill_text_buffer->();
		if ($cur_replacement > $#replacements) {
			$show_accept->();
			return;
		}
		$fill_button_vbox->();
	};

	# fill $button_vbox with the word options for the current word
	$fill_button_vbox = sub {
		$button_vbox->foreach(sub {my $child = shift; $child->destroy();});
		my $word = $replacements[$cur_replacement]->[1];
		$wordlabel->set_text("Word \"$word\" has multiple replacement options:");
		my @choices = split /\Q$config->{choicesep}\E/, $replacements[$cur_replacement]->[1];
		if (exists $config->{"targetdiacritics"}) {
			# This nasty bit of code finds the number of diacritics in every
			# choice and sorts the choice in descending order based on that
			my %choice_nums;
			foreach my $choice (@choices) {
				$choice_nums{$choice} = 0;
				my $normalized = NFD($choice);
				foreach my $diacritic (@{$config->{"targetdiacritics"}}) {
					# /g in list context returns every match, so this
					# counts all occurrences of the diacritic (without
					# /g, at most one match per diacritic is reported)
					my $count = () = $normalized =~ /$diacritic/g;
					$choice_nums{$choice} += $count;
				}
			}
			@choices = sort {$choice_nums{$b} <=> $choice_nums{$a}} @choices;
		}
		foreach my $word_choice (@choices) {
			my $button = Gtk2::Button->new($word_choice);
			$button->signal_connect(
				clicked => sub {
					$replacements[$cur_replacement]->[1] = $word_choice;
					$next_word->();
				}, $window);
			$button_vbox->pack_start($button, FALSE, FALSE, 0);
			$button->show;
		}
	};

	$undo->signal_connect(
		clicked => sub {
			if ($cur_replacement > 0) {
				$cur_replacement--;
				if ($cur_replacement == 0) {
					$undo->set_sensitive(FALSE);
				}
				# restore the original string with all of its choices
				$replacements[$cur_replacement]->[1] = $replacements[$cur_replacement]->[2];
				$fill_button_vbox->();
				$fill_text_buffer->();
				$accept->hide;
				$skip_button->set_sensitive(TRUE);
				$unknown_button->set_sensitive(TRUE);
				my $word = $replacements[$cur_replacement]->[1];
				$wordlabel->set_text("Word \"$word\" has multiple replacement options:");
			}
		}, $window);

	$accept->signal_connect(
		clicked => sub {
			# write the changes to the original $substrings
			foreach (@replacements) {
				$substrings->[$_->[0]]->[1] = $_->[1];
			}
			$window->destroy;
		}, $window);

	$skip_button->signal_connect(clicked => $next_word, $window);

	$unknown_button->signal_connect(
		clicked => sub {
			$open_unknown = 1;
			$window->destroy;
		}, $window);

	$stop_button->signal_connect(
		clicked => sub {
			$stop = 1;
			$window->destroy;
		}, $window);

	$fill_button_vbox->();
	$fill_text_buffer->();

	$window->add($vbox);
	$window->show_all;
	$accept->hide;
	# blocks until the destroy handler above calls Gtk2->main_quit
	Gtk2->main;

	# $next_word and $fill_button_vbox refer to each other, which forms a
	# reference cycle that would keep both closures (and everything they
	# captured) alive forever. Neither is needed anymore once the window
	# is gone, so break the cycle here.
	$fill_button_vbox = undef;

	die "Processing stopped at line $cur_lineno\n" if $stop;
	if ($open_unknown) {
		my $ret = call_unknown_word_window(
			$substrings, $replacements[$cur_replacement]->[0],
			$config, $args, $cur_lineno);
		# the word choice window still needs to be called again
		# when 0 is returned
		return 3 if $ret == 0;
		return $ret;
	}
	return 0;
}

# Token types produced by parse_config (also used by check_args and
# interpret_config to describe the expected argument types)
my $ID = 0;
my $STRING = 1;

# Parse the configuration file into data type (currently only ID and STRING)
# $f - path of the config file
# Returns: a reference to a list of commands, one per non-empty line. Every
# command is a list of tokens of the form {type => $ID|$STRING, value => "..."}.
# Tokens are separated by spaces, strings are enclosed in double quotes (and
# may contain spaces and "#"), and "#" outside of a string starts a comment.
# Nothing is returned if the file can't be opened or a string isn't terminated.
sub parse_config {
	my $f = shift;
	my $fh;
	if (!open($fh, "<", $f)) {
		warn "Can't open config file \"$f\"!\n";
		return;
	}
	my @commands;
	# $state is a bit field made up of $IN_ID and $IN_STR
	my $state = 0;
	my $IN_ID = 1;
	my $IN_STR = 2;
	my $cur_val = "";
	while (my $line = <$fh>) {
		chomp($line);
		$state = 0;
		# never let a partial token leak over from the previous line
		$cur_val = "";
		push(@commands, []);
		foreach my $char (split(//, $line)) {
			if ($char eq "#" && !($state & $IN_STR)) {
				# comment - ignore the rest of the line
				last;
			} elsif ($char eq '"') {
				if ($state & $IN_STR) {
					push(@{$commands[-1]}, {type => $STRING, value => $cur_val});
					$cur_val = "";
					$state &= ~$IN_STR;
				} else {
					$cur_val = "";
					$state |= $IN_STR;
				}
			} elsif ($char eq " ") {
				if ($state & $IN_ID) {
					# a space ends the current identifier
					push(@{$commands[-1]}, {type => $ID, value => $cur_val});
					$state &= ~$IN_ID;
					$cur_val = "";
				} elsif ($state) {
					# inside a string, spaces are part of the value
					$cur_val .= $char;
				}
			} else {
				if (!$state) {
					$state |= $IN_ID;
				}
				$cur_val .= $char;
			}
		}
		if ($state & $IN_STR) {
			warn "ERROR: Unterminated string in config:\n$line\n";
			close($fh);
			return;
		} elsif (length $cur_val) {
			# The line ended in the middle of an identifier. This has to
			# check the length and not the truth of $cur_val since a
			# trailing "0" would otherwise be dropped and then be glued
			# to the front of the first token on the next line.
			push(@{$commands[-1]}, {type => $ID, value => $cur_val});
			$cur_val = "";
		}
		# empty and comment-only lines don't produce a command
		if ($#{$commands[-1]} == -1) {
			pop(@commands);
		}
	}
	close($fh);

	return \@commands;
}

# if the path is relative, find its absolute location based
# on the location of the config file
sub open_file_rel_abs {
	# $filename    - path of the file to open
	# $config_file - path of the config file; relative file names are
	#                resolved against the directory it is located in
	# $mode        - mode passed on to open (defaults to "<")
	# Returns: the opened file handle, or nothing (after printing a
	# warning) if the file could not be opened
	my ($filename, $config_file, $mode) = @_;
	$mode = "<" if !defined $mode;
	unless (file_name_is_absolute($filename)) {
		$filename = rel2abs($filename, dirname($config_file));
	}
	if (open(my $fh, $mode, $filename)) {
		return $fh;
	}
	warn "Can't open file \"$filename\"!\n";
	return;
}

# Load a file of replacement words into a hash table
sub load_table {
	# $filename - path of the table (relative paths are resolved against
	#             the location of the config file)
	# $args     - command line arguments ("config" and "checkduplicates"
	#             are read)
	# $config   - program config ("tablesep" and "choicesep" are read)
	# $revert   - if true, the two columns of the table are swapped
	# Returns: a hash reference mapping every (NFD-normalized) word to its
	# replacement, or nothing if the file can't be opened or is malformed
	my ($filename, $args, $config, $revert) = @_;
	my $fh = open_file_rel_abs($filename, $args->{"config"});
	return if !$fh;
	my ($field_sep, $choice_sep) = ($config->{tablesep}, $config->{choicesep});
	my %table;
	while (my $line = <$fh>) {
		chomp $line;
		next if (!$line);
		# every line must consist of exactly two columns
		my @columns = split(/\Q$field_sep\E/, $line);
		if (@columns != 2) {
			warn "ERROR: Malformed line in file \"$filename\":\n$line\n";
			close $fh;
			return;
		}
		my ($source, $replacement) = $revert ? @columns[1, 0] : @columns;
		# the word column may list several words sharing one replacement
		foreach my $word_choice (split /\Q$choice_sep\E/, NFD($source)) {
			if (!exists $table{$word_choice}) {
				$table{$word_choice} = $replacement;
				next;
			}
			if ($args->{"checkduplicates"}) {
				warn "WARNING: Duplicate word in file \"$filename\": " .
					"\"$word_choice\", with replacement \"$replacement\", " .
					"already exists with replacement \"$table{$word_choice}\".\n";
			}
			# combine the replacements of words that occur multiple times
			my $combined = join($choice_sep, $table{$word_choice}, $replacement);
			$table{$word_choice} = get_unique_words($combined, $config);
		}
	}
	close $fh;
	return \%table;
}

# Load table for words to ignore - only the keys matter, since there is no replacement
sub load_ignore_table {
	# $filename - path of the ignore file (relative paths are resolved
	#             against the location of the config file)
	# $args     - command line arguments (only "config" is read)
	# Returns: a hash reference with one key per (NFD-normalized) line of
	# the file, or nothing if the file can't be opened
	my ($filename, $args) = @_;
	my $fh = open_file_rel_abs($filename, $args->{"config"});
	return if !$fh;
	my %ignored;
	while (defined(my $entry = <$fh>)) {
		chomp $entry;
		next if !$entry;
		# the value is irrelevant, only the existence of the key counts
		$ignored{NFD($entry)} = "";
	}
	close $fh;
	return \%ignored;
}

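# Generate all forms of a word by combining it with endings
# $table - hash of words (stems) and their replacements
# $forms - hash of endings and their replacements
# $noroot - if true, the words without any ending are left out of the result
# Returns: a reference to a new hash containing all generated forms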
sub expand_table {
	# $table  - hash reference of stems and their replacements
	# $forms  - hash reference of endings and their replacements
	# $noroot - if true, the bare stems are not copied into the result
	# $config - program config (only "choicesep" is read)
	# Returns: a reference to a new hash with one entry per stem/ending
	# combination (plus the stems themselves unless $noroot is set)
	my ($table, $forms, $noroot, $config) = @_;
	my $sep = $config->{choicesep};
	my %expanded;
	for my $stem (keys %$table) {
		# Stems and endings may both offer several replacement choices,
		# so every stem choice is paired with every ending choice
		my @stem_choices = split(/\Q$sep\E/, $table->{$stem});
		for my $suffix (keys %$forms) {
			my @suffix_choices = split(/\Q$sep\E/, $forms->{$suffix});
			my @combined = map {
				my $stem_choice = $_;
				map {$stem_choice . $_} @suffix_choices;
			} @stem_choices;
			$expanded{$stem . $suffix} = join($sep, @combined);
		}
		$expanded{$stem} = $table->{$stem} unless $noroot;
	}
	return \%expanded;
}

# Check if the number and types of arguments given to a config command are right
# Returns:
# undef - the arguments don't match
# 1 - the arguments match
sub check_args {
	# $args - list of the token types ($ID or $STRING) the command requires
	# $cmd  - the parsed command; the first token is the command name and
	#         the rest are its arguments. Arguments beyond the required
	#         ones are not checked.
	my ($args, $cmd) = @_;
	my $cmd_name = $cmd->[0]->{"value"};
	# everything after the command name is an argument
	my @given = @{$cmd}[1..$#$cmd];

	if (@given < @$args) {
		my $err = join(" ",
			"ERROR: not enough arguments for command \"$cmd_name\":",
			map {$_->{"value"}} @given);
		warn "$err\n";
		return;
	}

	# turns a list of token types into the text used in the error message
	my $type_names = sub {
		return join("", map {
			($_ == $ID ? " ID" : "") . ($_ == $STRING ? " STRING" : "")
		} @_);
	};
	foreach my $arg_num (0..$#$args) {
		next if $given[$arg_num]->{"type"} == $args->[$arg_num];
		warn "ERROR: argument type mismatch for command \"$cmd_name\".\n" .
			"Expected:" . $type_names->(@$args) .
			"\nReceived:" . $type_names->(map {$_->{"type"}} @given) . "\n";
		return;
	}
	return 1;
}

# Interpret the config file - load and expand tables, etc.
# $config_list - the list returned by parse_config
sub interpret_config {
	# $config_list - the list of commands returned by parse_config
	# $args        - the command line arguments
	# Returns: a reference to the config hash, or nothing (after printing
	# a warning) if the config is invalid or a table can't be loaded
	my ($config_list, $args) = @_;
	# maps table ids to the loaded (and possibly expanded) tables
	my %tables;
	my %config;
	# table_paths stores a list of all table and replacement ids that are
	# impacted by the path so the replacement can be added on the fly when
	# a new replacement is added from the GUI
	# the "replacement id" is just the number of the replacement group,
	# starting at 0 with the first group in the config
	$config{"table_paths"} = {};
	# reverted_tables stores a hash of the paths of all tables that are
	# reverted so that replacements added from the GUI are added in the
	# right order
	$config{"reverted_tables"} = {};
	# these are the paths of the tables that are displayed in the GUI
	$config{"display_tables"} = {};
	# a mapping between the table ids and tables for all tables used as
	# ending tables in expand statements - so expansions can be done
	# on the fly when adding a replacement word from the GUI
	$config{"ending_tables"} = {};
	# ignore is the path to the ignore file, ignore_words the actual table
	$config{"ignore"} = "";
	$config{"ignore_words"} = {};
	$config{"split"} = "\\s+";
	$config{"beforeword"} = "\\s";
	$config{"afterword"} = "\\s";
	$config{"tablesep"} = "\t";
	$config{"choicesep"} = "\$";
	# a list of "replacement configs", which specify the type and any
	# other arguments (this is given to replace_match, etc.)
	$config{"replacements"} = [];
	# these are temporary mappings used while loading the config
	my %path_to_table;
	my %table_id_to_path;
	# the types of the arguments every command requires at the very least
	# (any further arguments are optional and not type-checked)
	my %mandatory_args = (
		"ignore" => [$STRING],
		"table" => [$ID],
		"expand" => [$ID, $ID],
		"match" => [$STRING, $STRING],
		"matchignore" => [$STRING],
		"replace" => [$ID],
		"split" => [$STRING],
		"beforeword" => [$STRING],
		"afterword" => [$STRING],
		"tablesep" => [$STRING],
		"choicesep" => [$STRING],
		"comment" => [$STRING],
		"group" => [],
		"endgroup" => [],
		"retrywithout" => [$STRING],
		"targetdiacritics" => [$STRING]
	);
	my $in_group = 0;
	foreach my $cmd (@$config_list) {
		# All load statements must be before expand statements
		# All expand, beforeword, and afterword statements must be before replace statements
		my $cmd_name = $cmd->[0]->{"value"};
		if ($cmd->[0]->{"type"} == $ID) {
			if (!exists($mandatory_args{$cmd->[0]->{"value"}})) {
				warn "ERROR: Unknown command \"" . $cmd->[0]->{"value"} . "\" in config\n";
				return;
			}
			return if !check_args($mandatory_args{$cmd_name}, $cmd);
			if ($cmd_name eq "table") {
				# Only the table id is covered by the mandatory arguments
				# since the type of the path token isn't checked, so the
				# presence of the path has to be verified here. Otherwise,
				# an undefined path would be handed to load_table.
				if ($#$cmd < 2) {
					warn "ERROR: not enough arguments for command \"table\": " .
						"expected a table id and a path\n";
					return;
				}
				my $table_path = $cmd->[2]->{"value"};
				# all remaining arguments are flags ("revert", "nodisplay")
				my %table_args;
				foreach (3..$#$cmd) {
					$table_args{$cmd->[$_]->{"value"}} = 1;
				}
				my $table;
				# add to temporary path-to-table mapping so tables aren't
				# loaded unnecessarily
				if (exists $path_to_table{$table_path}) {
					$table = $path_to_table{$table_path};
				} else {
					$table = load_table($table_path, $args, \%config, $table_args{"revert"});
					return if !defined $table;
					$path_to_table{$table_path} = $table;
				}
				if ($table_args{"revert"}) {
					$config{"reverted_tables"}->{$table_path} = 1;
				}
				my $table_id = $cmd->[1]->{"value"};
				$tables{$table_id} = $table;
				$table_id_to_path{$table_id} = $table_path;
				# this is a hash to avoid duplicates if the same file
				# is loaded multiple times
				$config{"display_tables"}->{$table_path} = 1 if !exists $table_args{"nodisplay"};
			} elsif ($cmd_name eq "expand") {
				my $orig_table_id = $cmd->[1]->{"value"};
				my $ending_table_id = $cmd->[2]->{"value"};
				my $noroot = 0;
				if ($#$cmd >= 3 && $cmd->[3]->{"value"} eq "noroot") {
					$noroot = 1;
				}
				if (!exists $tables{$orig_table_id}) {
					warn "expand: table \"$orig_table_id\" doesn't exist\n";
					return;
				} elsif (!exists $tables{$ending_table_id}) {
					warn "expand: table \"$ending_table_id\" doesn't exist\n";
					return;
				}

				# remember the expansion so it can be repeated for words
				# that are added later on
				$config{"ending_tables"}->{$ending_table_id} = $tables{$ending_table_id};
				$config{"expands"}->{$orig_table_id} = [] if !exists $config{"expands"}->{$orig_table_id};
				push @{$config{"expands"}->{$orig_table_id}}, [$ending_table_id, $noroot];

				my $new_table = expand_table($tables{$orig_table_id}, $tables{$ending_table_id}, $noroot, \%config);
				return if !$new_table;
				# the expanded table replaces the original one under its id
				$tables{$orig_table_id} = $new_table;
			} elsif ($cmd_name eq "group") {
				if ($in_group) {
					warn "ERROR: New group started without ending last one in config\n";
					return;
				}
				# "words" is the root of the trie filled by "replace"
				push @{$config{"replacements"}}, {
					"type" => "group", "tables" => [],
					"words" => {}, "options" => {}};
				# add all options such as "endword" to the options hash
				for (1..$#$cmd) {
					$config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
				}
				$in_group = 1;
			} elsif ($cmd_name eq "endgroup") {
				if (!$in_group) {
					warn "ERROR: endgroup command called while not in group\n";
					return;
				}
				$in_group = 0;
			} elsif ($cmd_name eq "match") {
				if ($in_group) {
					warn "ERROR: match command is invalid inside group\n";
					return;
				}
				push @{$config{"replacements"}}, {
					"type" => "match",
					"options" => {},
					"search" => NFD($cmd->[1]->{"value"}),
					"replace" => $cmd->[2]->{"value"}};
				for (3..$#$cmd) {
					# add optional arguments as keys in options hash
					$config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
				}
			} elsif ($cmd_name eq "matchignore") {
				if ($in_group) {
					warn "ERROR: matchignore command is invalid inside group\n";
					return;
				}
				# stored like "match", but without a "replace" key
				push @{$config{"replacements"}}, {
					"type" => "match",
					"options" => {},
					"search" => NFD($cmd->[1]->{"value"})};
				for (2..$#$cmd) {
					$config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
				}
			} elsif ($cmd_name eq "replace") {
				if (!$in_group) {
					warn "ERROR: replace command called while not in group\n";
					return;
				}
				my $table = $cmd->[1]->{"value"};
				if (!exists($tables{$table})) {
					warn "ERROR: nonexistent table \"$table\" in replace statement.\n";
					return;
				}

				# make a list of all replacements that are affected by this
				# file so that they can be updated when a word is added
				# through the gui
				my $table_path = $table_id_to_path{$table};
				my $replacement_id = $#{$config{"replacements"}};
				$config{"table_paths"}->{$table_path} = [] if !exists $config{"table_paths"}->{$table_path};
				push @{$config{"table_paths"}->{$table_path}}, [$replacement_id, $table];

				# store list of tables for --debug
				push @{$config{"replacements"}->[-1]->{"tables"}}, $table;

				# Note: $config{"choicesep"} doesn't need to be checked
				# here since it always has at least its default value
				my $trie_root = $config{"replacements"}->[-1]->{"words"};
				my $override = $#$cmd >= 2 && $cmd->[2]->{"value"} eq "override";
				add_to_trie($table, $trie_root, $tables{$table}, $args, \%config, $override);
			} elsif ($cmd_name eq "retrywithout") {
				if (!exists $config{"retrywithout"}) {
					$config{"retrywithout"} = [];
				}
				# first value is the display name
				my @values = map {$_->{"value"}} @{$cmd}[1..$#$cmd];
				push @{$config{"retrywithout"}}, \@values;
			} elsif ($cmd_name eq "targetdiacritics") {
				if (!exists $config{$cmd_name}) {
					$config{$cmd_name} = [];
				}
				# the command may be given multiple times - all values
				# are collected in one list
				foreach (1..$#$cmd) {
					push @{$config{$cmd_name}}, $cmd->[$_]->{"value"};
				}
			} elsif ($cmd_name eq "split" || $cmd_name eq "beforeword" ||
				$cmd_name eq "afterword" || $cmd_name eq "tablesep" ||
				$cmd_name eq "choicesep" || $cmd_name eq "comment") {
				# simple settings that are just stored under their name
				$config{$cmd_name} = $cmd->[1]->{"value"};
			} elsif ($cmd_name eq "ignore") {
				$config{"ignore"} = $cmd->[1]->{"value"};
				my $table = load_ignore_table($cmd->[1]->{"value"}, $args);
				return if !defined $table;
				$config{"ignore_words"} = $table;
			} else {
				warn "ERROR: unknown command \"" . $cmd_name . "\" in config.\n";
				return;
			}
		} else {
			my $err =  "ERROR: line does not start with command:\n";
			foreach my $cmd_part (@$cmd) {
				$err .= $cmd_part->{"value"};
			}
			warn "$err\n";
			return;
		}
	}
	if ($in_group) {
		warn "ERROR: unclosed group in config\n";
		return;
	}
	if (!$config{"ignore"}) {
		warn "ERROR: no file of words to ignore specified.\n";
		return;
	}
	return \%config;
}

# load the config file
# Returns:
# the config hash or undef if an error occurred
sub load_config {
	# $args - the command line arguments ("config" is the path of the
	#         config file)
	my ($args) = @_;
	# a failed parse has already printed a warning, so just give up
	my $config_list = parse_config($args->{"config"}) or return;
	return interpret_config($config_list, $args);
}

# Write $text to the already opened file handle $fh and close the handle.
# A failure only produces a warning which mentions $name
# Returns:
# 1 on success, 0 if writing or closing failed
sub _write_and_close {
	my ($fh, $text, $name) = @_;
	my $written = print {$fh} $text;
	# buffered write errors may only show up when the handle is closed
	my $closed = close($fh);
	if (!$written || !$closed) {
		warn "WARNING: Could not write to \"$name\".\n";
		return 0;
	}
	return 1;
}

# Handle the action returned by `prompt_unknown_word`
# $action - array reference describing the action (see below)
# $config - the current program config
# $args - the command line arguments
# Currently accepted values for $action:
# ["ignore", "run", $word] - only ignore $word for the rest of this run
# ["ignore", "permanent", $word] - write $word to the permanent ignore file
# ["ignore", "wholeline"] - stop asking for unknown words on this line
# ["add", $word, $replace_word, $table_path] - add $word to the table
# 	corresponding to $table_path with $replace_word as its replacement. Note that
# 	only tables directly corresponding to paths work here - tables that only
# 	were created through "expand" in the config aren't ever shown separately
# 	in `prompt_unknown_word`
# ["reload"] - reload the configuration file
# Returns:
# 0 - nothing needs to be done (also used for unrecognized actions)
# 1 - the current line needs to be re-transliterated with the new config
# 2 - an error occurred while reloading the config
# 3 - stop asking for unknown words on this line
sub handle_unknown_word_action {
	my ($action, $config, $args) = @_;
	if ($action->[0] eq "ignore") {
		# yeah, this is a bit messy and inconsistent
		return 3 if $action->[1] eq "wholeline";
		my $ignored = $action->[2];
		$config->{"ignore_words"}->{$ignored} = "";
		if ($action->[1] eq "permanent") {
			my $fh = open_file_rel_abs($config->{"ignore"}, $args->{"config"}, ">>");
			return 1 if !$fh;
			_write_and_close($fh, $ignored . "\n", $config->{"ignore"});
		} elsif ($action->[1] eq "run") {
			# Print to error file if ignore isn't permanent
			return 0 if ($args->{"errors"} eq "");
			my $fh;
			if (!open($fh, ">>", $args->{"errors"})) {
				warn "ERROR: Can't open error file \"$args->{errors}\".\n";
				return 0;
			}
			_write_and_close($fh, $ignored . "\n", $args->{"errors"});
		}
		return 0;
	}
	if ($action->[0] eq "add") {
		my ($word, $replace_word, $table_path) = @{$action}[1..3];
		# make sure to write the words in the correct order if the
		# tables were reverted while loading
		my $reverted = exists $config->{"reverted_tables"}->{$table_path};
		my ($word_abs, $replace_word_abs) =
			$reverted ? ($replace_word, $word) : ($word, $replace_word);
		my $fh = open_file_rel_abs($table_path, $args->{"config"}, ">>");
		return 1 if !$fh;
		_write_and_close($fh,
			$word_abs . $config->{tablesep} . $replace_word_abs . "\n",
			$table_path);
		# loop over all table ids that are impacted by this file
		foreach my $replacement (@{$config->{"table_paths"}->{$table_path}}) {
			my ($replacement_id, $table_id) = @{$replacement}[0, 1];
			my $trie = $config->{"replacements"}->[$replacement_id]->{"words"};
			# the in-memory tables always use the (possibly reverted) order
			# in which the words are looked up, not the order in the file
			my $final_table = {$word => $replace_word};
			# handle expansions
			foreach my $expand (@{$config->{"expands"}->{$table_id}}) {
				my ($ending_table_id, $noroot) = @{$expand}[0, 1];
				my $endings_table = $config->{"ending_tables"}->{$ending_table_id};
				$final_table = expand_table($final_table, $endings_table, $noroot, $config);
			}
			add_to_trie($table_id, $trie, $final_table, $args, $config);
		}
		return 1;
	}
	if ($action->[0] eq "reload") {
		my $new_config = load_config($args);
		return 2 if !$new_config;
		# overwrite the contents in place so that every holder of the
		# $config reference sees the new configuration
		%$config = %$new_config;
		return 1;
	}
	# unrecognized action - explicitly report that nothing needs to be
	# done instead of returning whatever was evaluated last
	return 0;
}

# Cut the untransliterated entries of $substrings into words using the
# "split" regex from $config. Entries that are already transliterated
# are passed through unchanged; the delimiters found by the regex are
# stored as separate entries which count as transliterated.
# $substrings is modified in place.
sub split_words {
	my ($config, $substrings) = @_;
	# the capture group makes `split` return the delimiters as well
	my $delim_re = qr/($config->{"split"})/;
	my @pieces;
	for my $entry (@$substrings) {
		if ($entry->[0] == 1) {
			push @pieces, $entry;
			next;
		}

		# fields alternate between text and delimiter, starting with
		# text (which is empty if the string starts with a delimiter)
		my $is_delim = 0;
		for my $field (split(/$delim_re/, $entry->[1])) {
			if ($is_delim) {
				# Delimiters can count as already replaced
				push @pieces, [1, $field, $field];
			} elsif ($field ne '') {
				push @pieces, [0, $field, $field];
			}
			$is_delim = !$is_delim;
		}
	}
	@$substrings = @pieces;
}

# Helper which keeps all untransliterated text in one piece: the string
# is appended to the last entry of @$substrings if that entry is still
# untransliterated, otherwise a new untransliterated entry is pushed.
# $orig - the original text
# $replaced - the text to show instead of $orig; defaults to $orig. This
#	is needed for the "nofinal" attribute on "match", where the text was
#	already replaced but still has to be marked as unknown
sub push_unknown {
	my ($substrings, $orig, $replaced) = @_;
	$replaced = $orig unless defined $replaced;
	if (!@$substrings || $substrings->[-1]->[0]) {
		push(@$substrings, [0, $replaced, $orig]);
	} else {
		my $tail = $substrings->[-1];
		$tail->[1] .= $replaced;
		$tail->[2] .= $orig;
	}
}

# Replace a word in $substrings based on $replace_config using regex
# $replace_config->{"search"} is the word to replace
# $replace_config->{"replace"} is the replacement word
#	if $replace_config->{"replace"} is undefined, just splits
#	$substrings at the match and marks that the match has
#	been transliterated - currently used for "matchignore"
# $replace_config->{"options"}->{"beginword"},
# $replace_config->{"options"}->{"endword"} -
#	specifies if the match is only valid when $config->{"beforeword"}
#	or $config->{"afterword"} occur before or after it, respectively
#	(the beginning and the end of a substring are accepted as well)
# $replace_config->{"options"}->{"nofinal"} - if true, the replaced text
#	is still treated as untransliterated afterwards
# $debug_msg - if set, it is written to STDERR before the first replacement
#	and every single replacement is reported on STDERR as well
# $substrings is only modified if the word was found at least once
sub replace_match {
	my ($config, $replace_config, $substrings, $debug_msg) = @_;
	my $beginword = exists $replace_config->{"options"}->{"beginword"};
	my $endword = exists $replace_config->{"options"}->{"endword"};
	my $fullword = $beginword && $endword;
	my $beforeword = $config->{"beforeword"};
	my $afterword = $config->{"afterword"};
	my $word = $replace_config->{"search"};
	my $replace_word = $replace_config->{"replace"};
	if ($fullword) {
		$word = qr/(\A|$beforeword)$word(\z|$afterword)/;
	} elsif ($beginword) {
		$word = qr/(\A|$beforeword)$word/;
	} elsif ($endword) {
		$word = qr/$word(\z|$afterword)/;
	} else {
		$word = qr/$word/;
	}

	my @substrings_new;
	# only modify $substrings at all if the word was found
	my $found_word = 0;
	my $last_idx;
	# @substrings_new is only used if needed to improve efficiency
	foreach my $i (0..$#$substrings) {
		if ($substrings->[$i]->[0]) {
			# FIXME: is there a way to make it more efficient by keeping the old array?
			# This is a major bottleneck
			# Note: the above statement *may* be a bit exaggerated
			if ($found_word) {
				push(@substrings_new, $substrings->[$i]);
			}
			next;
		}
		# end of the previous match inside the current substring
		$last_idx = 0;
		my $i0 = 0;
		my $i1 = 0;
		while ($substrings->[$i]->[1] =~ m/$word/g) {
			if (!$found_word) {
				# Debug output belongs on STDERR (like all other debug
				# messages) so it cannot end up in the transliterated
				# text when that is written to STDOUT
				warn $debug_msg if $debug_msg;
				$found_word = 1;
				# everything before this substring was skipped so far
				# and has to be copied over now
				if ($i != 0) {
					push(@substrings_new, @{$substrings}[0..$i-1]);
				}
			}
			# This mess is needed to reliably match $beforeword and $afterword and put the captured
			# "splitting" characters back into the text. This would be much easier just using
			# a lookbehind and lookahead, but I couldn't find a way to also match beginning and
			# end of string that way.
			$i0 = $-[0];
			$i1 = $+[0];
			if ($fullword) {
				$i0 += length($1);
				$i1 -= length($2);
				# pos need to be decreased so that matches still work right next to each other
				pos($substrings->[$i]->[1]) -= length($2);
			} elsif ($beginword) {
				$i0 += length($1);
			} elsif ($endword) {
				$i1 -= length($1);
				pos($substrings->[$i]->[1]) -= length($1);
			}
			# text between the previous match and this one stays unknown
			if ($last_idx != $i0) {
				my $unknown = substr($substrings->[$i]->[1], $last_idx, $i0-$last_idx);
				push_unknown(\@substrings_new, $unknown);
			}
			my $orig_str = substr($substrings->[$i]->[1], $i0, $i1-$i0);
			my $replace_str = $replace_word // $orig_str;
			if ($replace_config->{"options"}->{"nofinal"}) {
				warn "Replaced (nofinal) \"$orig_str\" with \"$replace_str\"\n" if $debug_msg;
				push_unknown(\@substrings_new, $orig_str, $replace_str);
			} else {
				warn "Replaced \"$orig_str\" with \"$replace_str\"\n" if $debug_msg;
				push(@substrings_new, [1, $replace_str, $orig_str]);
			}
			$last_idx = $i1;
		}
		# keep the rest of the substring (or all of it if it contained no
		# match but an earlier substring did)
		if ($last_idx < length($substrings->[$i]->[1]) && $found_word) {
			my $unknown = substr($substrings->[$i]->[1], $last_idx);
			push_unknown(\@substrings_new, $unknown);
		}
	}
	if ($found_word) {
		@$substrings = @substrings_new;
	}
}

# Replace a group, i.e. replace all the words in a trie
# $config - the program config; "beforeword" and "afterword" are matched
#	against the single character before and after a candidate word
# $replace_config->{"words"} - the root node of the trie
# $replace_config->{"options"}->{"beginword"},
# $replace_config->{"options"}->{"endword"} -
#	same as in `replace_match`
# $substrings - the substrings of the current line (see `replace_line`);
#	the list is replaced in place with the result
# $debug_msg - if set, it is printed with `warn` before the first
#	replacement and every replacement is reported with `warn` as well
sub replace_group {
	my ($config, $replace_config, $substrings, $debug_msg) = @_;
	my @substrings_new;
	my $word_found = 0;
	# Recurse backwards towards the root node of the trie to find the first
	# node with a key "final" which satisfies the ending condition (if "endword" is set)
	# Returns the id *after* the last match and the node that was found
	# with the key "final" (or undef, if nothing matched)
	my $find_final = sub {
		my ($i, $tmp_cur_node, $s) = @_;
		do {
			# character directly after the candidate word; this is ""
			# if the candidate ends at the end of the string
			my $after_ch = substr($s->[1], $i, 1);
			if (exists $tmp_cur_node->{"final"} && (!exists($replace_config->{"options"}->{"endword"}) ||
				$after_ch eq "" || $after_ch =~ $config->{"afterword"})) {
				return ($i, $tmp_cur_node);
			}
			# going up one node makes the candidate one character shorter
			$i--;
		} while ($tmp_cur_node = $tmp_cur_node->{"parent"});
		# none of the points were appropriate for breaking the word, so
		# $tmp_cur_node now points to the nonexistent parent node of the
		# root node
		return ($i, undef);
	};
	foreach my $s (@$substrings) {
		# already transliterated text is copied over unchanged
		if ($s->[0]) {
			push(@substrings_new, $s);
			next;
		}
		my $cur_node = $replace_config->{"words"};
		# index at which the word that is currently being matched started
		my $start_i = 0;
		my $i = 0;
		# This deliberately goes off the end of the string! $cur_node is always "one behind" $i
		# since the node is only advanced in the iteration *after* $i has increased, meaning that
		# $i has to already be after the end of the string for the first if statement to definitely
		# fail, causing the elsif statement to handle that case
		while ($i <= length($s->[1])) {
			# This works even when $i is one index after the end of the string - it just returns "" then
			my $ch = substr($s->[1], $i, 1);
			if (exists $cur_node->{$ch}) {
				# a new word can only start at the root node
				if ($cur_node == $replace_config->{"words"}) {
					my $before_ch = $i > 0 ? substr($s->[1], $i - 1, 1) : "";
					# with "beginword", a word may only start at the beginning
					# of the string or after a "beforeword" character
					if (exists($replace_config->{"options"}->{"beginword"}) &&
						$before_ch ne "" && $before_ch !~ $config->{"beforeword"}) {
						push_unknown \@substrings_new, $ch;
						$i++;
						next;
					}
					$start_i = $i;
				}
				$cur_node = $cur_node->{$ch};
			} elsif (exists $cur_node->{"final"} || $cur_node != $replace_config->{"words"} || $i == length($s->[1])-1) {
				# the trie cannot be followed any further, so look for the
				# longest acceptable word on the path that was taken
				my $tmp_cur_node = $cur_node;
				($i, $tmp_cur_node) = $find_final->($i, $tmp_cur_node, $s);
				if (!defined($tmp_cur_node)) {
					# every node up to and including the root was rejected, so
					# $i was decremented once per node and $i + 1 is the position
					# at which this attempt began; only that character is marked
					# as unknown and matching resumes directly behind it
					push_unknown \@substrings_new, substr($s->[1], $i + 1, 1);
					$i += 2;
				} else {
					my $orig = substr($s->[1], $start_i, $i-$start_i);
					my $final = $tmp_cur_node->{"final"};
					if ($debug_msg) {
						# the header is only printed once per call
						if (!$word_found) {
							warn $debug_msg;
							$word_found = 1;
						}
						warn "Replaced \"$orig\" with \"$final\"\n";
					}
					push(@substrings_new, [1, $final, $orig]);
				}
				# start over at the root; $i was already set to the position
				# where matching continues, so it must not be incremented
				$cur_node = $replace_config->{"words"};
				next;
			} else {
				# no word starts with this character
				push_unknown \@substrings_new, $ch;
			}
			$i++;
		}
	}
	@$substrings = @substrings_new;
}

# Remove every occurrence of the strings listed in $chars from $word
# and run all replacements on the result (see `replace_line`)
sub replace_strip_chars {
	my ($config, $args, $chars, $word) = @_;
	# the strings are taken literally, not as regular expressions
	$word =~ s/\Q$_\E//g for @$chars;
	return replace_line($config, $args, $word);
}

# Perform all replacements on $line based on $config
# $config - the program config; $config->{"replacements"} is the list of
#	"match" and "group" replacements, which are applied in order
# $args - the command line arguments ("debug" enables debug messages)
# Returns $substrings: array of arrays - each one has three elements:
# first 0 or 1, indicating if the substring following it has already
# been replaced or not (1 means it has been replaced), then the
# transliterated string, and lastly the original string.
# If the first element is 0, the second two elements are obviously same
sub replace_line {
	my ($config, $args, $line) = @_;
	my $substrings = [[0, $line, $line]];
	foreach my $replacement (@{$config->{"replacements"}}) {
		my $debug_msg;
		if ($replacement->{"type"} eq "match") {
			if ($args->{"debug"}) {
				my $options = join " ", keys(%{$replacement->{"options"}});
				$debug_msg = "Match ($options): \"$replacement->{search}\"";
				# `replace_match` only ignores the match if no replacement
				# is defined, so an empty replacement or "0" must not be
				# reported as "(ignore)"
				if (defined $replacement->{"replace"}) {
					$debug_msg .= " \"$replacement->{replace}\"\n";
				} else {
					$debug_msg .= " (ignore)\n";
				}
			}
			replace_match($config, $replacement, $substrings, $debug_msg);
		} elsif ($replacement->{"type"} eq "group") {
			if ($args->{"debug"}) {
				my $options = join " ", keys(%{$replacement->{"options"}});
				my $tables = '"' . join('" "', @{$replacement->{"tables"}}) . '"';
				$debug_msg = "Group ($options): $tables\n";
			}
			replace_group($config, $replacement, $substrings, $debug_msg);
		}
	}
	# splits all words at the end so that the splitting characters
	# aren't taken as unknown words and the unknown words are (hopefully)
	# in better chunks for prompting the user about them
	split_words($config, $substrings);

	return $substrings;
}

# Open the unknown word window for the substring at index $i and
# handle the action chosen by the user
# See `get_unknown_words` for explanation of other parameters
# Returns (passed on from `handle_unknown_word_action`):
# 3, if the rest of the line should be skipped
# 1, if the line needs to be re-transliterated
# 0, if nothing needs to be done
sub call_unknown_word_window {
	my ($substrings, $i, $config, $args, $cur_lineno) = @_;
	my $word = $substrings->[$i];
	# everything before and after the word is only shown as context,
	# both in its transliterated and in its original form
	my @before = (0 .. $i - 1);
	my @after = ($i + 1 .. $#$substrings);
	my $contextl = join "", map { $substrings->[$_]->[1] } @before;
	my $contextl_orig = join "", map { $substrings->[$_]->[2] } @before;
	my $contextr = join "", map { $substrings->[$_]->[1] } @after;
	my $contextr_orig = join "", map { $substrings->[$_]->[2] } @after;
	# any extra arguments are appended to the call - the retries below
	# pass an additional 1
	my $ask_user = sub {
		return prompt_unknown_word($contextl, $contextl_orig,
			$word->[1], $word->[2], $contextr, $contextr_orig,
			$config, "$cur_lineno", $args, @_);
	};
	# 3 - rest of line should be skipped
	# 2 - config could not be loaded
	# 1 - line must be redone with new config
	my $ret = handle_unknown_word_action($ask_user->(), $config, $args);
	# keep retrying until the user chooses an action which
	# didn't throw an error
	until ($ret != 2) {
		$ret = handle_unknown_word_action($ask_user->(1), $config, $args);
	}
	return $ret;
}

# NOTE: MUST ALWAYS ADD REPLACEMENT WORDS FIRST!
# If an ignore word is added which is attached to a word that should have a replacement
# added and just that word is selected to ignore, you never get a chance to add a
# replacement for the other word that it is attached to

# NOTE: This is very ugly code. The GUI code is the worst, but this whole part
# of the program is nasty. This is partially due to the fact that features kept
# being added when their use was discovered. This problem might be fixed in the
# future when I have time to rewrite all of this.

# Handle unknown words
# Prompts the user for every substring which is neither transliterated
# nor listed in $config->{"ignore_words"}
# $substrings - the current substrings with unknown words
# $config - the program config
# $args - the command line args
# $cur_lineno - display string to show the user the current line number
# Returns:
# 1 - the line needs to be re-transliterated
# 0 - all done
sub get_unknown_words {
	my ($substrings, $config, $args, $cur_lineno) = @_;
	foreach my $i (0 .. $#$substrings) {
		my $word = $substrings->[$i];
		# nothing to ask for words that were replaced or are ignored
		next if $word->[0] || exists($config->{"ignore_words"}->{$word->[1]});
		my $ret = call_unknown_word_window($substrings, $i, $config, $args, $cur_lineno);
		# 3 means we ignore the line
		if ($ret == 3) {
			# revert all changes done on the line
			$_->[1] = $_->[2] foreach @$substrings;
			return 0;
		}
		# 1 means the line needs to be re-transliterated
		return 1 if $ret == 1;
	}
	return 0;
}

# Main replacement function
# Reads the input line by line from the file handle $inputfh and writes the
# transliterated text to the file handle $outputfh, prompting the user for
# unknown words or word choices (if those aren't disabled on the command line)
# $config - the program config
# $args - the command line arguments; lines before $args->{"start"}
#	are skipped without being written to the output
# $total_lines - only used to show the user how far the processing is
sub replace {
	my ($config, $args, $total_lines, $inputfh, $outputfh) = @_;
	while (my $line = <$inputfh>) {
		# $. always refers to the file handle that was read last, so it must
		# be saved before the code below gets a chance to read other files
		# (e.g. when the config is reloaded from the unknown word window)
		my $lineno = $.;
		next if $lineno < $args->{"start"};
		my $progress = "$lineno/$total_lines";
		my $comment;
		# $1 is only meaningful if the substitution actually matched
		if (exists $config->{"comment"} &&
			$line =~ s/\Q$config->{comment}\E(.*)\z//s) {
			$comment = $1;
		}
		my $nfd_line = NFD($line);
		my $substrings = replace_line($config, $args, $nfd_line);

		if (!$args->{"nounknowns"}) {
			# re-transliterate the string if the config was reloaded
			while (get_unknown_words($substrings, $config, $args, $progress)) {
				$substrings = replace_line($config, $args, $nfd_line);
			}
		} elsif ($args->{"debugspecial"}) {
			foreach my $s (@$substrings) {
				if (!$s->[0] && !exists($config->{"ignore_words"}->{$s->[1]})) {
					warn "Unknown word: \"$s->[1]\"\n";
				}
			}
		}
		if (!$args->{"nochoices"}) {
			# this only loops more than once if the user presses the button
			# "Open in unknown word window"
			while (my $ret = prompt_choose_word($substrings, $config, $args, $progress)) {
				if ($ret == 1) {
					$substrings = replace_line($config, $args, $nfd_line);
				}
			}
		} elsif ($args->{"debugspecial"}) {
			foreach my $s (@$substrings) {
				if ($s->[0] && $s->[1] =~ /\Q$config->{choicesep}\E/) {
					# `split` returns the number of fields in scalar context
					my $num_choices = split /\Q$config->{choicesep}\E/, $s->[1];
					warn "Word \"$s->[1]\" with $num_choices word choices.\n";
				}
			}
		}

		foreach my $s (@$substrings) {
			print $outputfh $s->[1];
		}
		# the text after the comment marker is written back untransliterated;
		# checking for definedness also keeps a comment that is just "0"
		print $outputfh $comment if defined $comment;
	}
}

# Print $question to STDERR and read answers from STDIN until one of them
# matches $valid_re. Dies if STDIN ends before a valid answer was given
# (otherwise the prompt would be repeated forever).
# Returns:
# the answer without the trailing newline
sub _ask_choice {
	my ($question, $valid_re) = @_;
	while (1) {
		print STDERR $question;
		my $choice = <STDIN>;
		if (!defined $choice) {
			die "\nERROR: end of input reached while waiting for an answer.\n";
		}
		chomp $choice;
		return $choice if $choice =~ $valid_re;
	}
}

# Command line arguments with their defaults
my %args = ("config" => "config", "start" => 1, "errors" => "", "output" => "");
GetOptions(
	\%args, "debug", "debugspecial",
	"nochoices", "nounknowns",
	"force", "start=i",
	"output=s", "config=s",
	"errors=s", "help",
	"checkduplicates") or pod2usage(1);

pod2usage(-exitval => 0, -verbose => 2) if $args{"help"};
# at most one input file may be given
pod2usage(-exitval => 1, -verbose => 1) if @ARGV > 1;

if (!-f $args{"config"}) {
	die "ERROR: config file \"$args{config}\" does not exist or is not a file.\n";
}
my $config = load_config(\%args);
if (!$config) {
	die "ERROR: Invalid config\n";
}
# the duplicates were already reported while the config was loaded
exit 0 if ($args{"checkduplicates"});

my $inputfh;
my $total_lines = "UNKNOWN";
if (@ARGV < 1) {
	warn "WARNING: no input file supplied; taking input from STDIN\n";
	$inputfh = \*STDIN;
} else {
	open $inputfh, "<", $ARGV[0] or die "ERROR: Cannot open input file \"$ARGV[0]\" for reading.\n";
	# Is there *really* no more efficient way to get the total number of lines?
	$total_lines = 0;
	while (<$inputfh>) {$total_lines++};
	close $inputfh;
	# reopen the file to start reading at the first line again
	open $inputfh, "<", $ARGV[0] or die "ERROR: Cannot open input file \"$ARGV[0]\" for reading.\n";
}

# NOTE(review): the error file is only ever opened in append mode (see
# `handle_unknown_word_action`), so answering "y" here does not actually
# discard the old contents of the file - confirm which behavior is wanted
if (-f $args{"errors"} && !$args{"force"}) {
	my $choice = _ask_choice(
		"\"$args{errors}\" already exists. Do you want to overwrite it? ",
		qr/^[yn]$/);
	die "ERROR: \"$args{errors}\" already exists.\n" if $choice ne "y";
}

my $outputfh;
if ($args{"output"} eq "") {
	warn "WARNING: no output file supplied; printing to STDOUT\n";
	$outputfh = \*STDOUT;
} elsif (-f $args{"output"} && !$args{"force"}) {
	my $choice = _ask_choice(
		"\"$args{output}\" already exists. (a)ppend, (o)verwrite, or (e)xit? ",
		qr/^[aoe]$/);
	if ($choice eq "a") {
		open $outputfh, ">>", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
	} elsif ($choice eq "e") {
		die "ERROR: \"$args{output}\" already exists.\n";
	} else {
		open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
	}
} else {
	open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
}

replace($config, \%args, $total_lines, $inputfh, $outputfh);
close $inputfh;
# buffered write errors only show up when the handle is closed
close $outputfh or warn "WARNING: could not finish writing the output: $!\n";

__END__

=head1 NAME

transliterate.pl - Transliterate text files

=head1 SYNOPSIS

transliterate.pl [options] [input file]

Start the transliteration engine with the given file as input.
The input file defaults to STDIN if no filename is given.

=head1 OPTIONS

=over 8

=item B<--output> <filename>

Sets the output file to print to.

If the file exists already and B<--force> is not set, the user is asked
if the file should be overwritten or appended to.

B<Default:> C<STDOUT> (print to terminal)

=item B<--config> <filename>

Sets the configuration file to use.

B<Default:> C<config>

=item B<--checkduplicates>

Prints all duplicate words within single table files and across tables
that are replaced within the same group, then exits the program.

Note that this simply prints B<all> duplicates, even ones that are
legitimate. When duplicates are found during normal operation of
the program, they are simply combined in exactly the same way as the
regular word choices.

Also note that the words are still added as possible choices, which
may be slightly confusing. If, for instance, a word "word" is stored
in the tables "tablea", "tableb", and "tablec" with the replacements
"a", "b", and "c", the first duplicate message will say that the
first occurrence was in table "tablea" with the replacement "a", and
the second duplicate message will say that the first occurrence was
in table "tablea" with the replacement "a$b" (assuming $ is the
value set as B<choicesep> in the config). This is just something to
be aware of.

On that note, before duplicates are checked between tables in the
same replacement group, duplicates inside the same file are already
replaced, so that might be a bit confusing as well.

=item B<--nochoices>

Disables prompting for the right word when multiple replacement words exist.

This can be used to "weed out" all the unknown words before
commencing the laborious task of choosing the right word every time
multiple options exist.

=item B<--nounknowns>

Disables prompting for the right word when a word is not found in the database.

This can be used together with B<--nochoices> to perform a quick test of how
well the actual engine is working without having to click through all the
prompts.

=item B<--debug>

Prints information helpful for debugging problems with the B<match> and B<group>
statements.

For each B<match> or B<group> statement which replaces anything, the original
statement is printed (the format is a bit different than in the config) and
each actual word that's replaced is printed.

=item B<--debugspecial>

This option is only useful for automatic testing of the transliteration engine.

If B<--nochoices> is enabled, each word in the input with multiple choices will
be output, along with the number of choices (can be used to test the proper
functioning of B<choicesep> in the config file).

If B<--nounknowns> is enabled, each unknown word in the input is printed
(can be used to test that the B<ignore> options are working correctly).

=item B<--force>

Always overwrites the output and error file without asking.

=item B<--start> <line number>

Starts at the given line number instead of the beginning of the file.

Note: when "Stop processing" is pressed, the current line number is
printed out. This is the current line that was being processed, so it
has not been printed to the output file yet and thus the program must
be resumed at that line, not the one afterwards.

=item B<--errors> <filename>

Specifies a file to write errors in. Note that this does not refer to
actual errors, but to any words that were temporarily ignored
(i.e. words for which "Ignore: This run" was clicked).

If no file is specified, nothing is written. If a file is specified
that already exists and B<--force> is not set, the user is prompted
for action.

=item B<--help>

Displays the full documentation.

=back

=head1 DESCRIPTION

B<transliterate.pl> will read the given input file and transliterate it
based on the given configuration file, prompting the user for action if
a word has multiple replacement options or is not found in the database.

See L</"CONFIGURATION"> for details on what is possible.

Note that this is B<not> some sort of advanced transliteration engine
which understands the grammar of the language and tries to guess words
based on that. This is only a glorified find-and-replace program
with some extra features to make it useful for transliterating text
using large wordlists.

WARNING: All input data is assumed to be UTF-8!

=head1 WORD CHOICE WINDOW

The word choice window is opened any time one word has multiple replacement
options and prompts the user to choose one.

For each word with multiple options, the user must choose the right option
and then press "Accept changes" to finalize the transliteration of the
current line. The button to accept changes is selected by default, so it
is possible to just press enter instead of manually clicking it. Before the
line is finalized, the user may press "Undo" to undo any changes on the
current line.

"Skip word" just leaves it as is. This shouldn't be needed in most cases
since B<choicesep> should always be set to a character that doesn't occur
normally in the text anyways.

"Open in unknown word window" will open the
L<unknown word window|/"UNKNOWN WORD WINDOW"> with the current word
selected. This is meant as a helper if you notice that another word choice
needs to be added.

Warning: This is very inconsistent and buggy! Since the unknown word window
is just opened directly, it isn't modified to make more sense for this
situation. Whenever "Add replacement" is pressed, the whole line is
re-transliterated as usual, but the word choice window is opened again
right afterwards. If you just want to go back to the word choice window,
press the ignore button for "whole line" since that shouldn't break
anything. There are weird inconsistencies, though - for instance, if you
delete all words in the tables, then press "Reload config", the line will
be re-transliterated and none of the words will actually be found, but it
will still go on because control passes back to the word choice window no
matter what. Also, none of the word choices that were already done on this
line are saved since the line is restarted from the beginning. As I said,
it's only there as a helper and is very buggy/inconsistent. Maybe I'll make
everything work better in a future release.

"Stop processing" will exit the program and print the line number that was
currently being processed.

=head1 UNKNOWN WORD WINDOW

The unknown word window is opened any time a word could not be replaced.

Both the context from the original script and the context from the
transliterated version (so far) are shown. If a part of the text is
selected in one of the text boxes and "Use selection as word" is
pressed for the appropriate box, the selected text is used for the
action that is taken subsequently. "Reset text" resets the text in
the text box to its original state (except for the highlight because
I'm too lazy to do that).

The possible actions are:

=over 8

=item Ignore

"This run" only ignores the word until the program exits, while
"Permanently" saves the word in the ignore file specified in the
configuration. "Whole line" stops asking for unknown words on
this line and prints the line out as it originally was in the
file. Note that any words in the original line that contain
B<choicesep> will still cause the L<word choice window|/"WORD CHOICE WINDOW">
to appear due to the way it is implemented. Just press "Skip word"
if that happens.

=item Retry without <display name>

Removes all characters specified in the corresponding B<retrywithout>
statement in the L<config|/"CONFIGURATION">
from the currently selected word and re-transliterates just that
word. The result is then pasted into the text box beside
"Add replacement" so it can be added to a table. This is only a
sort of helper for languages like Urdu in which words often can
be written with or without diacritics. If the "base form" without
diacritics is already in the tables, this button can be used to
quickly find the transliteration instead of having to type it
out again. Any part of the word that couldn't be transliterated
is just pasted verbatim into the text box (but after the
characters have been removed).

Note that the selection can still be modified after this, before
pressing "Add to list". This could potentially be useful if a word
is in a table that is expanded using "noroot" because for instance
"Retry without diacritics" would only work with the full word (with
the ending), but only the stem should be added to the list. If that
is the case, "Retry without diacritics" could be pressed with the
whole word selected, but the ending could be removed before actually
pressing "Add to list".

A separate button is shown for every B<retrywithout> statement
in the config.

=item Add to list

Adds the word typed in the text box beside "Add replacement" to the
selected table file as the replacement for the word currently selected
and re-runs the replacement on the current line. All table files that
do not have B<nodisplay> set are shown as options, see L</"CONFIGURATION">.

Warning: This simply appends the word and its replacement to the end
of the file, so it will cause an error if there was no newline
("\n") at the end of the file before.

Note that this always re-transliterates the entire line afterwards.
This is to allow more flexibility. Consider, for instance, a compound
word of which the first part is also a valid single word. If the
entire line was not re-transliterated, it would be impossible to
add a replacement for that entire compound word and have it take
effect during the same run since the first part of the word would
not even be available for transliteration anymore.

One problem is that the word is just written directly to the file
and there is no undo. This is the way it currently is and will
probably not change very soon. If a mistake is made, the word can
always be removed again manually from the list and "Reload config"
pressed.

=item Reload config

Reloads the configuration file along with all tables and re-runs the
replacement on the current line. Note that this can take a short
while since the entire word database has to be reloaded.

=item Stop processing

Prints the current line number to the terminal and exits the program.

The program can always be started again at this line number using
the B<--start> option if needed.

=back

=head1 INTERNALS/EXAMPLES

This section was added to explain to the user how the transliteration
process works internally since that may be necessary to understand
why certain words are replaced the way they are.

First off, the process works line-by-line, i.e. no B<match> statement
will ever match anything that crosses the end of a line.

Each line is initially stored as one chunk which is marked as
untransliterated. Then, all B<match>, B<matchignore>, and B<replace>
(or, rather, B<group>) statements are executed in the order they
appear in the config file. Whenever a word/match is replaced, it
is split off into a separate chunk which is marked as transliterated.
A chunk marked as transliterated I<is entirely ignored by any
replacement statements that come afterwards>. Note that B<beginword>
and B<endword> can always match at the boundary between an
untransliterated and transliterated chunk. This is to facilitate
automated replacement of certain grammatical constructions. For instance:

If the string "a-" could be attached as a prefix to any word and needed
to be replaced as "b-" everywhere, it would be quite trivial to add
a match statement C<'match "a-" "b-" beginword'>. If run on the text
"a-word", where "word" is some word that should be transliterated
as "word_replaced", and the group replace statement for the word comes
after the match statement given above, the following would happen:
First, the match statement would replace "a-" and split the text into
the two chunks "b-" and "word", where "b-" is already marked as
transliterated. Since "word" is now separate, it will be matched
by the group replace statement later, even if it has B<beginword> set
and would normally not match if "a-" came before it. Thus, the final
output will be "b-word_replaced", allowing for the uniform replacement
of the prefix instead of having to add each word twice, once with and
once without the prefix.

In certain cases, this behavior may not be desired. Consider, for
instance, a prefix "c-" which cannot be replaced uniformly as in the
example above due to differences in the source and destination script.
Since it cannot be replaced uniformly, two words "word1" and "word2"
would both need to be specified separately with replacements for
"c-word1" and "c-word2". If, however, the prefix "c-" has an
alternate spelling "c " (without the hyphen), it would be very useful
to be able to automatically recognize that as well. This is where the
B<nofinal> attribute for the B<match> statements comes in. If there is
a match statement C<'match "c " "c-" beginword nofinal'>, the replaced
chunk is B<not> marked as transliterated, so after executing this
statement on the text "c word1", there will still only be one chunk,
"c-word1", allowing for the regular word replacements to function
properly.

Once all the replacement statements have been processed, each chunk
of text that is not marked as transliterated yet is split based on
the B<split> pattern specified in the config and all actual characters
matched by the B<split> pattern are marked as transliterated (this
usually means all the spaces, newlines, quotation marks, etc.). Any
remaining words/text chunks that are still marked as untransliterated are
now processed by the unknown word window. If one of these remaining
unknown chunks is present in the file specified by the B<ignore>
statement in the config, it is simply ignored and later printed out
as is. After all untransliterated words have either had a replacement
added or been ignored, any words with multiple replacement choices are
processed by the word choice window. Once this is all done, the final
output is written to the output file and the process is repeated with
the next line. Note that the entire process is started again each time
a word is added to a table or the config is reloaded from the
L<unknown word window|/"UNKNOWN WORD WINDOW">.

=head1 CONFIGURATION

These are the commands accepted in the configuration file.
Any parameters in square brackets are optional.
Comments are started with C<#>. Strings (filenames, regex strings, etc.)
are enclosed in double quotes ("").

The B<match>, B<matchignore>, and B<replace> commands are executed in
the order they are specified, except that all B<replace> commands within
the same group are executed together.

The B<match> and B<matchignore> statements accept any RegEx strings and
are thus very powerful. The B<group> statements only work with the
non-RegEx words from the tables, but are very efficient for large numbers
of words and should thus be used for the main bulk of the words.

Any duplicate words found will cause the user to be prompted to choose
one option every time the word is replaced in the input text.

Note that any regex strings specified in the config should B<not>
contain capture groups, as that would break the B<endword> functionality
since this is also implemented internally using capture groups. Capture
groups are also entirely pointless in the config since they currently
cannot be used as part of the replacement string in B<match> statements.
Lookaheads and lookbehinds are fine, though, and could be useful in
certain cases.

All tables must be loaded before they are used, or there will be an
error that the table does not exist.

Warning: If a B<replace> statement is located before an B<expand>
statement that would have impacted the table used, there will be no
error but the expand statement won't have any impact.

Basic rule of thumb: Always put the B<table> statements before the
B<expand> statements and the B<expand> statements before the B<replace>
statements.

=over 8

=item B<split> <regex string>

Sets the RegEx string to be used for splitting words. This is only used
for splitting the words which couldn't be replaced after all replacement
has been done, before prompting the user for unknown words.

Note that B<split> should probably always contain at least C<\n>, since
otherwise all of the newlines will be marked as unknown words. Usually,
this will be included anyways through C<\s>.

Note also that B<split> should probably include the C<+> RegEx-quantifier
since that allows the splitting function in the end to ignore several
splitting characters right after each other (e.g. several spaces) in one
go instead of splitting the string again for every single one of them.
This shouldn't actually make any difference functionality-wise, though.

B<Default:> C<\s+> (all whitespace)

=item B<beforeword> <regex string>

Sets the RegEx string to be matched before a word if B<beginword> is set.

B<Default:> C<\s>

=item B<afterword> <regex string>

Sets the RegEx string to be matched after a word if B<endword> is set.

Note that B<afterword> should probably always contain at least C<\n>,
since otherwise words with B<endword> set will not be matched at the
end of a line.

B<beforeword> and B<afterword> will often be exactly the same, but
they are left as separate options in case more fine-tuning is needed.

B<Default:> C<\s>

=item B<tablesep> <string>

Sets the separator used to split the lines in the table files into the
original and replacement word.

B<Default:> C<Tab>

=item B<choicesep> <string>

Sets the separator used to split replacement words into multiple choices for
prompting the user.

B<Default:> C<$>

=item B<comment> <string>

If enabled, anything after C<< <string> >> will be ignored on all lines in
the input file. This will not be displayed in the
L<unknown word window|/"UNKNOWN WORD WINDOW"> or L<word choice window|/"WORD CHOICE WINDOW">
but will still be printed in the end, with the comment character removed
(that seems to be the most sensible thing to do).

Note that this is really just a "dumb replacement", so there's no way to
prevent a line with the comment character from being ignored. Just try
to always set this to a character that does not occur anywhere in the text
(or don't use the option at all).

=item B<ignore> <filename>

Sets the file of words to ignore.

This has to be set even if the file is just empty because the user can
add words to it from the unknown word window.

=item B<table> <table identifier> <filename> [nodisplay] [revert]

Load the table from C<< <filename> >>, making it available for later use in the
B<expand> and B<replace> commands using the identifier C<< <table identifier> >>.

If B<nodisplay> is set, the filename for this table is not shown in the
L<unknown word window|/"UNKNOWN WORD WINDOW">. If, however, the same filename
is loaded again for another table that does not have B<nodisplay> set, it is
still displayed.

If B<revert> is set, the original and replacement words are switched. This can
be useful for creating a config for transliterating in the opposite direction
with the same database. I don't know why I called it "revert" since it should
actually be called "reverse". I guess I was a bit confused.

Note that if C<< <filename> >> is not an absolute path, it is taken to be relative
to the location of the configuration file.

The table files simply consist of B<tablesep>-separated values, with the word in the
original script first and the replacement word second. Both the original and
replacement word can optionally have several parts separated by B<choicesep>. If the
original word has multiple parts, it is separated and each of the parts is added
to the table with the replacement. If the replacement has multiple parts, the user
will be prompted to choose one of the options during the transliteration process.
If the same word occurs multiple times in the same table with different replacements,
the replacements are automatically added as choices that will be handled by the
L<word choice window|/"WORD CHOICE WINDOW">.

If, for whatever reason, the same table is needed twice, but with different endings,
the table can simply be loaded twice with different IDs. If the same path is loaded,
the table that has already been loaded will be reused. Note that this feature was
added before adding B<revert>, so the old table is used even if it had B<revert>
set and the new one doesn't. This is technically a problem, but I don't know of
any real-world case where it would be a problem, so I'm too lazy to change it.
Tell me if it actually becomes a problem for you.

WARNING: Don't load the same table file both with and without B<revert> in the same
config! When a replacement word is added through the GUI, the program has to know
which way to write the words. Currently, whenever a table file is loaded with
B<revert> anywhere in the config (even if it is loaded without B<revert> in a
different place), words will automatically be written as if B<revert> was on. I
cannot currently think of any reason why someone would want to load a file both
with and without B<revert> in the same config, but I still wanted to add this
warning just in case.

=item B<expand> <table identifier> <word ending table> [noroot]

Expand the table C<< <table identifier> >>, i.e. generate all the word forms using
the word endings in C<< <word ending table> >>. The result is stored under the same
identifier C<< <table identifier> >>; no new table is created.

Note: There used to be a C<< <new table identifier> >> argument to create a new
table in case one table had to be expanded with different endings. This has been
removed because it was a bit ugly, especially since there wasn't a proper mapping
from table IDs to filenames anymore. If this functionality is needed, the same table
file can simply be loaded multiple times. See the B<table> section above.

If B<noroot> is set, the root forms of the words are not kept.

If the replacement for a word ending contains B<choicesep>, it is split and each part
is combined with the root form separately and the user is prompted to choose one of
the options later. It is thus possible to allow multiple choices for the ending if
there is a distinction in the replacement script but not in the source script.
Note that each of the root words is also split into its choices (if necessary)
during the expanding, so it is possible to use B<choicesep> in both the endings
and root words.

=item B<match> <regex string> <replacement string> [beginword] [endword] [nofinal]

Perform a RegEx match using the given C<< <regex string> >>, replacing it with
C<< <replacement string> >>. Note that the replacement cannot contain any RegEx
(e.g. groups) in it. B<beginword> and B<endword> specify whether the match must
be at the beginning or ending of a word, respectively, using the RegEx specified
in B<beforeword> and B<afterword>. If B<nofinal> is set, the string is not marked
as transliterated after the replacement, allowing it to be modified by subsequent
B<match> or B<replace> commands.

=item B<matchignore> <regex string> [beginword] [endword]

Performs a RegEx match in the same manner as B<match>, except that the original
match is used as the replacement instead of specifying a replacement string, i.e.
whatever is matched is just marked as transliterated without changing it.

=item B<group> [beginword] [endword]

Begins a replacement group. All B<replace> commands must occur between B<group>
and B<endgroup>, since they are then grouped together and replaced in one go.
B<beginword> and B<endword> act in the same way as specified for B<match> and
apply to all B<replace> statements in this group.

=item B<replace> <table identifier> [override]

Replace all words in the table with the identifier C<< <table identifier> >>,
using the B<beginword> and B<endword> settings specified by the current group.

Unless B<override> is set on the latter table, if the same word occurs in two
tables with different replacements, both are automatically added as choices.
See L</"WORD CHOICE WINDOW">.

B<override> can be useful if the same database is used for both directions and
one direction maps multiple words to one word, but in the other direction this
word should always default to one of the choices. In that case, a small table
with these special cases can be created and put at the end of the main B<group>
statement with B<override> set. This is technically redundant since you could
just add a special group with only the override table in it earlier in the
config, but it somehow seems cleaner this way.

Note that a table must have been loaded before being used in a B<replace> statement.

=item B<endgroup>

End a replacement group.

=item B<retrywithout> <display name> [character] [...]

Adds a button to the L<unknown word window|/"UNKNOWN WORD WINDOW"> to retry the
replacements on the selected word, first removing the given characters.
The button is named "<display name>" and located after the "Retry without" label.
Whatever is found with the replacements is pasted into the regular text box for
the "Add replacement" functionality.

This can be used as an aid when, for instance, words can be written with or without
certain diacritics. If the actual word without diacritics is already in the
database and there is a B<retrywithout> statement for all the diacritics, the
button can be used to quickly find the replacement for the word instead of having
to type it out manually. The same goes for compound words that can be written
with or without a space.

It is also possible to specify B<retrywithout> without any characters, which just
adds a button that takes whatever word is selected and retries the replacements
on it. This can be useful if you want to manually edit words and quickly see if
they are found with the edits in place.

Note that all input text is first normalized to the Unicode canonical decomposition
form so that diacritics can be removed individually.

Also note that all buttons are currently just dumped in the GUI without any
sort of wrapping, so they'll run off the screen if there are too many.
Tell me if this becomes a problem. I'm just too lazy to change it right now.

Small warning: This only removes the given characters from the word selected in
the GUI, not from the tables. Thus, this only works if the version of the word
without any of the characters is already present in the tables. It would be useful
when handling diacritics if the program could simply make a comparison while
completely ignoring diacritics, but I haven't figured out a nice way to implement
that yet.

Historical note: This was called B<diacritics> in a previous version and only
allowed removal of diacritics. This is exactly the same functionality, just
generalized to allow removal of any characters with different buttons.

=item B<targetdiacritics> <diacritic> [...]

This was only added to simplify transliteration from Hindi to Urdu with the
same database. When this is set, the choices in the
L<word choice window|/"WORD CHOICE WINDOW"> are sorted in descending order
based on the number of diacritics from this list that are matched in each
choice. This is so that when transliterating from Hindi to Urdu, the choice
with the most diacritics is always at the top.

Additionally, if there are I<exactly> two choices for a word and one of
them contains diacritics but the other one doesn't, the one containing
diacritics is automatically taken without ever prompting the user. This
is, admittedly, a very language-specific feature, but I couldn't think of
a simple way of adding it without building it directly into the actual program.

Note that due to the way this is implemented, it will not take any effect
if B<--nochoices> is enabled.

The attentive reader will notice at this point that most of the features
in this program were added specifically for dealing with Urdu and Hindi,
which does appear to make sense, considering that this program was written
specifically for transliterating Urdu to Hindi and vice versa (although
not quite as much vice versa).

=back

=head1 BUGS

Although it may not seem like it, one of the ugliest parts of the program is the
GUI functionality that allows the user to add a replacement word. The problem is
that all information about the B<expand> and B<replace> statements has to be kept
in order to properly handle adding a word to one of the files and simultaneously
adding it to the currently loaded tables I<without reloading the entire config>.
The way it currently works, the replacement word is directly written to the file,
then all B<expand> statements that would have impacted the words from this file
are redone (just for the newly added word) and the resulting words are added to
the appropriate tables (or, technically, the appropriate 'trie'). Since a file
can be mapped to multiple table IDs and a table ID can occur in multiple replace
statements, this is more complicated than it sounds, and thus it is very likely
that there are bugs lurking here somewhere. Do note that "Reload config" will
B<always> reload the entire configuration, so that's safe to do even if the
on-the-fly replacing doesn't work.

In general, I have tested the GUI code much less than the rest since you can't
really test it automatically very well.

The code is generally quite nasty, especially the parts belonging to the GUI.
Don't look at it.

Tell me if you find any bugs.

=head1 SEE ALSO

L<perlre>, L<perlretut>

=head1 LICENSE

Copyright (c) 2019, 2020 lumidify <nobody[at]lumidify.org>

Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

=cut
