#!/opt/bin/perl
# From a file containing examples from a grammar, the
# examples from a certain text are selected, and these 
# are translated to XML.

# An input file <file>.ex contains lines starting with %ex and %nr that divide
# the file into sections and subdivide into subsections, resp. 
# An input file <file>.txt contains lines starting with %pa and %li that divide
# the file into pages and lines, resp.

# For each example, we can have lines starting with 
# %hi hieroglyphs
# %al (alphabetic) for transliteration
# %tr for translation
# %no for a note
# %qu for a question to the reader
# %cf for similar sentences in other publications
# %bi for bibliographic references concerning the full text
# For each exercise there should be at most one entry for 
# each one of the above.
# For multi-line entries for one of the above, the second and following
# lines should not be prefixed by a "%"-tag. The first empty line
# or line starting with % marks the end of the entry.
# Use <p> for dividing the entry into paragraphs.
# A line starting with two % is ignored; this can be used for comments.

###########################################################################
# Begin changeables.

# Value is 0 if all available information is to be copied to XML file, and
# is 1 is only the page number is to be copied.

my $ref_only = 1;

my $name = # identifies the name of the grammar.
	# "Graefe";
	"Gardiner";
	# "Allen";
	# "Loprieno";

$descr_graefe =
<<DESCR;
Inverted index of examples from Bauer, in:
<ul>
<li>
E. Graefe.
<i>Mittel&auml;gyptische Grammatik f&uuml;r Anf&auml;nger</i>.
Harrassowitz Verlag, Wiesbaden, 1994.
</li>
</ul>
DESCR

$descr_gardiner =
<<DESCR;
Inverted index of examples from Peas., in:
<ul>
<li>
A. Gardiner.
<i>Egyptian Grammar</i>.
Griffith Institute, Ashmolean Museum, Oxford, 1957.
</li>
</ul>
Only direct quotes were included, 
and examples consisting of single words were mostly excluded.
DESCR

$descr_allen =
<<DESCR;
Inverted index of examples from Peas., in:
<ul>
<li>
J.P. Allen.
<i>Middle Egyptian: An Introduction to the Language and Culture
of Hieroglyphs</i>.
Cambridge University Press, 2000.
</li>
</ul>
Examples consisting of single words were excluded.
DESCR

$descr_loprieno =
<<DESCR; 
Inverted index of examples from Peas., in:
<ul>
<li>
A. Loprieno.
<i>Ancient Egyptian: a linguistic introduction</i>.
Cambridge University Press, 1995.
</li>
</ul>
Unnumbered examples in running text were excluded.
DESCR

my $descr = # gives description of the source
	($name eq "Graefe") ? $descr_graefe :
	($name eq "Gardiner") ? $descr_gardiner :
	($name eq "Allen") ? $descr_allen :
	($name eq "Loprieno") ? $descr_loprieno : "";

my @input_names = # identifies input files.
        ($name eq "Graefe") ? ("Graefe.txt") :
        ($name eq "Gardiner") ? ("Gardiner.txt") :
        ($name eq "Allen") ? ("Allen.txt", "Allen-cite.ex") :
        ($name eq "Loprieno") ? ("Loprieno.txt") : ();

my $output_name = # identifies output file.
        ($name eq "Graefe") ? "../AEL/Bauer-Graefe.xml" :
        ($name eq "Gardiner") ? "../AEL/Peas-Gardiner.xml" :
        ($name eq "Allen") ? "../AEL/Peas-Allen.xml" :
        ($name eq "Loprieno") ? "../AEL/Peas-Loprieno.xml" : "";

my $url = # identifies file as URL.
        ($name eq "Graefe") ? "http://www.dfki.uni-sb.de/~nederhof/AEL/peasant/Bauer-Graefe.xml" :
        ($name eq "Gardiner") ? "http://www.dfki.uni-sb.de/~nederhof/AEL/peasant/Peas-Gardiner.xml" :
        ($name eq "Allen") ? "http://www.dfki.uni-sb.de/~nederhof/AEL/peasant/Peas-Allen.xml" : 
        ($name eq "Loprieno") ? "http://www.dfki.uni-sb.de/~nederhof/AEL/peasant/Peas-Loprieno.xml" : "";

# Adapt this function to select on %bi entries of interest:

sub to_be_selected {
	my ($str) = @_;
	if ($name eq "Graefe") {
		return ($str =~ /Bauer/);
	} else {
		return ($str =~ /Peas/);
	}
}

# Adapt this function to extract the version from the %bi entry:

sub version {
        my ($str) = @_;
	if ($name eq "Graefe") {
		if ($str =~ /Bauer (B1|R)/) {return $1}
	} elsif ($name eq "Gardiner") {
		if ($str =~ /B 1/) {return "B1 Old"}
		elsif ($str =~ /B 2/) {return "B2"}
		elsif ($str =~ /R/) {return "R Old"}
		elsif ($str =~ /Bt/) {return "Bt"}
	} elsif ($name eq "Allen") {
		if ($str =~ /Peas\. ([BRt12]+)/) {return $1}
	} elsif ($name eq "Loprieno") {
		if ($str =~ /Peas\. (R|B[t12])/) {return $1}
	}
}

# Adapt this function to extract the position from the %bi entry:

sub position {
        my ($str) = @_;
	if ($name eq "Graefe") {
        	if ($str =~ /B1,([0-9]+)/) {return $1} 
		elsif ($str =~ /R([0-9]+),([0-9]+)/) {return "$1.$2"}
		else {return ""}
	} elsif ($name eq "Gardiner") {
        	if ($str =~ /B [12], ([0-9]+)/) {return $1}
        	elsif ($str =~ /(?:R|Bt\.) ([0-9]+)/) {return $1}
		else {return ""}
	} elsif ($name eq "Allen") {
		if ($str =~ /B[12], ([0-9]+)/) {return $1}
		elsif ($str =~ /Bt ([0-9]+)/) {return $1}
		elsif ($str =~ /R ([0-9]+), ([0-9]+)/) {return "$1.$2"}
	} elsif ($name eq "Loprieno") {
		if ($str =~ /B[12],([0-9]+)/) {return $1}
		elsif ($str =~ /Bt ([0-9]+)/) {return $1}
		elsif ($str =~ /R([0-9]+)\.([0-9]+)/) {return "$1.$2"}
	}
}

# Adapt this function to extract the keys on which examples may be sorted:

sub sorting_key {
        my ($str) = @_;
	my $key = position($str);
	if ($key =~ /^([0-9]+)[^0-9]([0-9]+)/) {
		$key = 1000*$1 + $2;
	}
        return $key
}

# End changeables.
###########################################################################

open(OUT, ">$output_name") || die "cannot open $output_name\n";

my $thisday = (localtime)[3];
my $thismonth = (Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec)[(localtime)[4]];
my $thisyear = (localtime)[5] + 1900;

print OUT <<XMLSTART;
<?xml version="1.0"?>
<!DOCTYPE resource SYSTEM "AELalign.0.1.dtd">
<resource>
<created>
Automatically generated by ex2xml, $thismonth $thisday, $thisyear.
</created>
<header name="$name"
	url="$url">
$descr
</header>
<body>

XMLSTART

##########################################
# Files containing sentences are read. Those
# whose %bi entry satisfies the pattern are stored.

my($ex,$nr,$pa,$li,$hi,$al,$tr,$bi,$entry);

# To contain references to all relevant examples:

my @ids = ();

# To map ids of examples to the examples themselves:

my %example_hi = ();
my %example_al = ();
my %example_tr = ();

foreach $input_name (@input_names) {
	if ($input_name =~ /\.ex$/) {
		read_exercises()
	} elsif ($input_name =~ /\.txt$/) {
       		read_txt()
	}
}

sub read_exercises {
open(IN, "$input_name") || die "cannot open $input_name";
$ex = "";
$nr = "";
$hi = "";
$al = "";
$tr = "";
$bi = "";
$entry = "";

while (<IN>) {
	if (/^%ex(\s+.*)$/) {
                my $new_ex = $1;
		store_exer();
		$ex = clear_spaces($new_ex);
		$ex =~ s/Exercise /Ex./;
		$nr = "";
        }
        elsif (/^%nr(\s+.*)$/) {
                my $new_nr = $1;
		store_exer();
                $nr = clear_spaces($new_nr)
        }
        elsif (/^%hi(\s+.*)$/) {
                if ($hi) {die "second %hi entry:\n$_"};
                $hi = "$1\n";
                $entry = "hi";
        }
        elsif (/^%al(\s+.*)$/) {
                if ($al) {die "second %al entry:\n$_"};
                $al = "$1\n";
                $entry = "al";
        }
        elsif (/^%tr(\s+.*)$/) {
                if ($tr) {die "second %tr entry:\n$_"};
                $tr = "$1\n";
                $entry = "tr";
        }
        elsif (/^%bi(\s+.*)$/) {
                if ($bi) {die "second %bi entry:\n$_"};
                $bi = "$1\n";
                $entry = "bi";
        }
        elsif (/^%/) {
		$entry = ""
        }
	elsif (/^\s*$/) {
		$entry = ""
        }
        else {  if ($entry eq "hi")
                        {$hi .= $_}
		elsif ($entry eq "al")
                        {$al .= $_}
                elsif ($entry eq "tr")
                        {$tr .= $_}
                elsif ($entry eq "bi")
                        {$bi .= $_}
        }
};

store_exer();
close(IN)
}

sub read_txt {
open(IN, "$input_name") || die "cannot open $input_name";
$pa = "";
$li = "";
$hi = "";
$al = "";
$tr = "";
$bi = "";
$entry = "";

while (<IN>) {
        if (/^%pa(\s+.*)$/) {
                my $new_pa = $1;
                store_example();
                $pa = clear_spaces($new_pa);
                $li = "";
        }
        elsif (/^%li(\s+.*)$/) {
                my $new_li = $1;
                store_example();
                $li = clear_spaces($new_li)
        }
        elsif (/^%hi(\s+.*)$/) {
                if ($hi) {die "second %hi entry:\n$_"};
                $hi = "$1\n";
                $entry = "hi";
        }
        elsif (/^%al(\s+.*)$/) {
                if ($al) {die "second %al entry:\n$_"};
                $al = "$1\n";
                $entry = "al";
        }
        elsif (/^%tr(\s+.*)$/) {
                if ($tr) {die "second %tr entry:\n$_"};
                $tr = "$1\n";
                $entry = "tr";
        }
        elsif (/^%bi(\s+.*)$/) {
                if ($bi) {die "second %bi entry:\n$_"};
                $bi = "$1\n";
                $entry = "bi";
        }
        elsif (/^%/) {
                $entry = ""
        }
        elsif (/^\s*$/) {
                $entry = ""
        }
        else {  if ($entry eq "hi")
                        {$hi .= $_}
		elsif ($entry eq "al")
                        {$al .= $_}
                elsif ($entry eq "tr")
                        {$tr .= $_}
                elsif ($entry eq "bi")
                        {$bi .= $_}
        }
};

store_example();
close(IN)
}

sub store_exer {
if ($hi || $al || $tr || $bi) {
unless ($ex && $nr) {die "no %ex or no %nr"}
$hi = clear_spaces($hi);
$al = clear_spaces($al);
$tr = clear_spaces($tr);
$bi = clear_spaces($bi);
if (to_be_selected($bi)) {
	my $id = sorting_key($bi) . "\n" .
			version($bi) . "\n" .
			position($bi) . "\n" .
			# "$name, $ex, $nr";
			"$ex,$nr";
	$example_hi{$id} = $hi;
	$example_al{$id} = $al;
	$example_tr{$id} = $tr;
	push(@ids, $id);
}

$hi = "";
$al = "";
$tr = "";
$bi = "";
$nr = "";
};
$entry = ""
};

sub store_example {
if ($hi || $al || $tr || $bi) {
unless ($pa && $li) {die "no %pa or no %li"}
$hi = clear_spaces($hi);
$al = clear_spaces($al);
$tr = clear_spaces($tr);
$bi = clear_spaces($bi);
if (to_be_selected($bi)) {
        my $id = sorting_key($bi) . "\n" .
                        version($bi) . "\n" .
                        position($bi) . "\n" .
                        # "$name, p. $pa";
                        "p.$pa";
        $example_hi{$id} = $hi;
        $example_al{$id} = $al;
        $example_tr{$id} = $tr;
	push(@ids, $id);
}

$hi = "";
$al = "";
$tr = "";
$bi = "";
$li = "";
};
$entry = ""
};

##########################################
# Auxiliary routines.

sub clear_spaces {
my($txt) = @_;
$txt =~ s/ +/ /g;
$txt =~ s/^\s*//;
$txt =~ s/\s*$//;
return $txt
}

########################################
# sorting:

sub cmp_by_pos {
	$a =~ /^(.*)\n/;
	my $p1 = $1;
	$b =~ /^(.*)\n/;
	my $p2 = $1;
	if ($p1 =~ /^[0-9]+$/ && $p2 =~ /^[0-9]+$/) {
		return ($p1 <=> $p2);
	} else {
		return ($p1 cmp $p2);
	}
}

my @sorted_ids = sort cmp_by_pos @ids;

########################################
# Merging entries.
# Coordinates should not be written twice. 
# Therefore, if two such entries should occur,
# the entries are merged. Each coordinate has
# at most one representative id.

%coord_label = ();

@ids = ();

foreach $id (@sorted_ids) {
	$id =~ /^.*\n(.*)\n(.*)\n(.*)$/;
        my $version = $1;
        my $pos = $2; 
        my $label = $3;
        my $hi = $ref_only ? "" : $example_hi{$id};
        my $al = $ref_only ? "" : $example_al{$id};
        my $tr = $ref_only ? "" : $example_tr{$id};
	my $hi_out = ""; 
	my $al_out = ""; 
	my $tr_out = ""; 
        unless ($hi eq "") {
		$hi_out = "<text>($label)</text>\n$hi\n";
	}
	unless ($al eq "") {
		$al_out = "<text>($label)</text>\n$al\n";
	}
	unless ($tr eq "") {
		$tr_out = "($label)\n$tr\n";
	}
	if ($al eq "" && $hi eq "" && $tr eq "") {
		$al_out = "<text>($label)</text>\n";
	}
	if (defined ($coord_label{"$version\n$pos"})) {
		my $id2 = $coord_label{"$version\n$pos"};
		$example_hi{$id2} .= $hi_out;
		$example_al{$id2} .= $al_out;
		$example_tr{$id2} .= $tr_out;
	} else {
		$coord_label{"$version\n$pos"} = $id;
		$example_hi{$id} = $hi_out;
                $example_al{$id} = $al_out;
                $example_tr{$id} = $tr_out;
		push(@ids, $id);
	}
}

########################################
# Writing the entries.

foreach $id (@ids) {
	$id =~ /^.*\n(.*)\n(.*)\n(.*)$/;
	my $version = $1;
	my $pos = $2;
	my $label = $3;
	print OUT "<coord version=\"$version\" pos=\"$pos\"/>\n";
	my $hi = $example_hi{$id};
	my $al = $example_al{$id};
	my $tr = $example_tr{$id};
	unless ($hi eq "") {
		print OUT "<texthi>\n$hi</texthi>\n"; 
	}
	unless ($al eq "") {
		print OUT "<textal>\n$al</textal>\n"; 
	}
	unless ($tr eq "") {
		print OUT "<texttr>\n$tr</texttr>\n"; 
	}
	print OUT "\n";
}

##########################################

print OUT <<XMLEND;
</body>
</resource>
XMLEND

close(OUT);
