#! /usr/bin/perl

# This script validates collections of PhotoML files

# Copyright © 2005-2007 Brendt Wohlberg <photoml@wohlberg.net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License at
# http://www.gnu.org/licenses/gpl-2.0.txt.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.

# Most recent modification: 14 January 2008

use strict;
use File::Basename;
use Getopt::Std;

# Set up path variables
my $pmlpath = dirname($0) . "/..";
my $xsl = "$pmlpath/xsl/misc/valid.xsl";
my $dtd = "$pmlpath/dtd/photo.dtd";
if (-r '/etc/xml/catalog' and $ENV{'XML_CATALOG_FILES'} eq '') {
  $ENV{'XML_CATALOG_FILES'} = "/etc/xml/catalog";
}
$ENV{'XML_CATALOG_FILES'} = "$pmlpath/dtd/catalog.xml " .
                            $ENV{'XML_CATALOG_FILES'};
undef $ENV{'SGML_CATALOG_FILES'};

# Ensure xmllint and xmllint are available
die "pmlvalid: error executing xmllint\n"
    if (`which xmllint 2>/dev/null` eq '');
die "pmlvalid: error executing xstlproc\n"
    if (`which xsltproc 2>/dev/null` eq '');

# Set up DTD version verification
my $dtdver = getdtdver($dtd);
my $pubid = "-//BW//DTD PhotoML $dtdver//EN";

# Handle command line options
my $usagetext = <<EOF;
usage: pmlvalid [-h] infile [infile] ...
       -h Display usage information
EOF
my $opts = {};
getopts('h', $opts);

# Display usage information if requested, or if no command line arguments
if ($opts->{'h'}) {print STDERR $usagetext; exit(0)};
if (@ARGV == 0) {print STDERR $usagetext; exit(1)};

my $ibpath = (defined $opts->{'b'})?"$opts->{'b'}/":'';

# Stage 1: validate each file with respect to the PhotoML DTD
my $nerr1 = 0;
my ($f, $lintcmd);
foreach $f ( @ARGV ) {
  die "pmlvalid: could not read file $f\n" if (!-f $f or !-r $f);
  $lintcmd = ['xmllint', '--valid', '--noout', $f];
  system(@$lintcmd) == 0 or $nerr1++;
}

# Exit if there are any stage 1 errors
exit(1) if ($nerr1 > 0);

# Stage 2: verify correct PhotoML DTD version
my $nerr2 = 0;
foreach $f ( @ARGV ) {
  if ($pubid ne getdoctype($f)) {
    print "$f: DTD version mismatch with installed PhotoML version\n";
    $nerr2++;
  }
}

# Exit if there are any stage 2 errors
exit(2) if ($nerr2 > 0);

# Stage 3: impose additional constraints
my $nerr3 = 0;
my ($l, $t, $i, $p);
my $id = {};
my $ih = {};
my $fh = {};
my $hr = {};
my $co = {};
foreach $f ( @ARGV ) {
  # Open pipe from xsltproc
  open(IPH, "xsltproc $xsl $f |") or die "pmlvalid: pipe open failed\n";
  # Parse xsltproc output
  while ($l = <IPH>) {
    # Get a line from the pipe
    chomp($l);
    if ($l =~ /^([\w-]+):\s+/) {
      # New roll, sheet, digital, or digimage element
      $t = $1;
      $id->{$t} = {} if (!defined $id->{$t});
      $i = join(' ',split(/\s+/, $'));
      if (defined $id->{$t}->{$i}) {
	# Duplicate id error
	report_id_occur_error($f, $t, $i, $id->{$t}->{$i});
	$nerr3++;
      } else {
	# Record file $f in which element $t with id $i encountered
	$id->{$t}->{$i} = $f;
      }
    } elsif ($l =~ /^\s+([\w-]+):\s+/) {
      # Data associated with the current roll, sheet, digital,
      # or digimage element
      $p = parse_quoted_list($');
      if ($1 eq 'error') {
	# An error not associated with non-uniqueness of id, image-hash etc.
	report_error($f, $t, $i, $p);
        $nerr3++;
      } elsif ($1 eq 'href') {
	# An href to an image file
	if (defined $hr->{$p->[0]}) {
	  # Duplicate image href error
	  report_href_occur_error($f, $t, $i, $hr->{$p->[0]});
	  $nerr3++;
	} else {
	  # Record element id $id in which href $p->[0] was encountered
	  $hr->{$p->[0]} = $i;
	}
      } elsif ($1 eq 'image-hash') {
	# An image-hash of an image file
	if (defined $ih->{$p->[0]}) {
	  # Duplicate image hash error
	  report_ihash_occur_error($f, $t, $i, $p->[0], $ih->{$p->[0]},
				   $id->{$t}->{$ih->{$p->[0]}});
	  $nerr3++;
	} else {
	  # Record element id $i in which image hash $p->[0] was encountered
	  $ih->{$p->[0]} = $i;
	}
      } elsif ($1 eq 'file-hash') {
	# A file-hash of an image file
	if (defined $fh->{$p->[0]}) {
	  # Duplicate file hash error
	  report_fhash_occur_error($f, $t, $i, $ih->{$p->[0]});
	  $nerr3++;
	} else {
	  # Record element id $i in which file hash $p->[0] was encountered
	  $fh->{$p->[0]} = $i;
	}
      } elsif ($1 eq 'collection') {
	# A collection associated with a roll, sheet, or digital element
	$co->{$p->[0]} = {} if (!defined $co->{$p->[0]});
	if (defined $co->{$p->[0]}->{$p->[1]}) {
	  # Duplicate collection cgid error
	  report_collect_occur_error($f, $t, $i, $p->[0], $p->[1],
				     $co->{$p->[0]}->{$p->[1]},
				     $id->{$t}->{$co->{$p->[0]}->{$p->[1]}});
	  $nerr3++;
	} else {
	  # Record element $t and id $i in which collection with
	  # id $p->[0] and cgid $p->[1] encountered
	  $co->{$p->[0]}->{$p->[1]} = [$t, $i];
	}
      } else {
	# Unrecognised field from xsltproc pipe
	print "pmlvalid warning: discarded valid.xsl output:\n   $l\n";
      }
    } else {
      # Invalid line read from xsltproc pipe
      $! = 3;
      die "pmlvalid: internal error while parsing valid.xsl output\n   $l\n";
    }
  }
  close(IPH);
}

exit(($nerr3>0)?3:0);


# Print general error reports
sub report_error {
  my $f = shift;
  my $t = shift;
  my $i = shift;
  my $e = shift;

  my $l = {
       'invalid-pos-int'                  =>
         "invalid positive integer \"$e->[2]\" in element $e->[1]",
       'invalid-real'                     =>
         "invalid real number \"$e->[2]\" in element $e->[1]",
       'invalid-fraction'                 =>
         "invalid fraction \"$e->[2]\" in element $e->[1]",
       'invalid-hex-string'               =>
         "invalid hex string \"$e->[2]\" in element $e->[1]",
       'invalid-date'                     =>
         "invalid date \"$e->[2]\" in element $e->[1]",
       'invalid-time'                     =>
         "invalid time \"$e->[2]\" in element $e->[1]",
       'invalid-time-zone'                =>
         "invalid time zone \"$e->[2]\" in element $e->[1]",
       'invalid-latitude'                 =>
	 "invalid latitude value \"$e->[2]\" in element $e->[1]",
       'invalid-longitude'                =>
	 "invalid longitude value \"$e->[2]\" in element $e->[1]",
       'frame-without-id'                 =>
         "frame without id is not a defaults child",
       'defaults-frame-with-id'           =>
	 "defaults child frame has id (id=\"$e->[1]\")",
       'defaults-frame-with-fn'           =>
	 "defaults child frame has fn (fn=\"$e->[1]\")",
       'frame-nonunique-id'               =>
         "more than one frame with id=\"$e->[1]\"",
       'frame-nonunique-fn'               =>
         "more than one frame with fn=\"$e->[1]\"",
       'parent-frame-not-digital-descendant'   =>
         'is a child of frame which is not a child of digital',
       'parent-frame-and-explicit-id'     =>
         'is a child of frame and has explicit group-id or frame-id attribute',
       'collection-fstfid-out-of-range'   =>,
         "in collection with id=\"$e->[1]\" and cgid=\"$e->[2]\"; ".
       "parent element\n   has no frame with id matching fstfid=\"$e->[3]\"",
         'collection-lstfid-out-of-range' =>,
       "in collection with id=\"$e->[1]\" and cgid=\"$e->[2]\"; ".
       "parent element\n   has no frame with id matching lstfid=\"$e->[3]\""
	  };
  my $r = (defined $l->{$e->[0]})?$l->{$e->[0]}:
            "unrecognized error: $e->[0]";
  my $j = id_description($t, $i);
  print "$f: $t $j\n   $r\n";
}


# Print non-unique id (for roll, sheet, digital, and digimage) errors
sub report_id_occur_error {
  my $f = shift;
  my $t = shift;
  my $i = shift;
  my $d = shift;

  my $j = id_description($t, $i);
  print "$f: $t $j\n   the id is not unique; ".
        "there is another occurrence in file $d\n";
}


# Print non-unique image hash errors
sub report_ihash_occur_error {
  my $f = shift;
  my $t = shift;
  my $i = shift;
  my $h = shift;
  my $d = shift;
  my $e = shift;

  my $j = id_description($t, $i);
  print "$f: $t $j\n   image hash $h is not unique;\n   ".
        "there is another occurrence in file $e in\n   $t ".
	id_description($t, $d) . "\n";
}


# Print non-unique file hash errors
sub report_fhash_occur_error {
  my $f = shift;
  my $t = shift;
  my $i = shift;
  my $h = shift;
  my $d = shift;
  my $e = shift;

  my $j = id_description($t, $i);
  print "$f: $t $j\n   file hash $h is not unique;\n   ".
        "there is another occurrence in $t id $d in file $e\n";
}


# Print non-unique href errors
sub report_href_occur_error {
  my $f = shift;
  my $t = shift;
  my $i = shift;
  my $h = shift;
  my $d = shift;

  my $j = id_description($t, $i);
  print "$f: $t $j\n   href $h is not unique;\n   ".
        "there is another occurrence in file $d\n";
}


# Print non-unique collection cgid errors
sub report_collect_occur_error {
  my $f = shift;
  my $t = shift;
  my $i = shift;
  my $c = shift;
  my $d = shift;
  my $e = shift;
  my $g = shift;

  my $j = id_description($t, $i);
  print "$f: $t $j\n   collection id=\"$c\" cgid=\"$d\" is not unique;\n   ".
        "there is another occurrence in $e->[0] id=$e->[1] file $g\n";
}


# Construct an id description string
sub id_description {
  my $t = shift;
  my $i = shift;

  my $a = parse_quoted_list($i);
  my $l = {'roll'     => "id=\"$a->[0]\"",
	   'digital'  => "id=\"$a->[0]\"",
	   'sheet'    => "id=\"$a->[0]\"",
	   'digimage' => "group-id=\"$a->[0]\" frame-id=\"$a->[1]\" ".
	                 "image-id=\"$a->[2]\""};
  return (defined $l->{$t})?$l->{$t}:"id $i";
}


# Parse a space-separated list of quoted strings and return as an array
sub parse_quoted_list {
  my $s = shift;

  my $a = [];
  while ($s =~ /^\s*\"([^\"]*)\"/) {
    push @$a, $1;
    $s = $';
  }

  return $a;
}


# Extract the public doctype from an PhotoML XML file
sub getdoctype {
  my $xfn = shift;

  my $public = undef;
  open(FH, "<$xfn") or return $public;
  my $line;
  while ($line = <FH>) {
    last if ($line =~ /<!DOCTYPE/);
  }
  while ($line !~ /<!DOCTYPE[^>]+>/ and $l = <FH>) {
    $line .= $l;
  }
  if ($line =~ /<!DOCTYPE\s+photo\s+PUBLIC\s+\"([^\"]+)\"[^>]+>/) {
    $public = $1;
  }
  close(FH);
  return $public;
}


# Get DTD version number
sub getdtdver {
  my $dfn = shift;

  my $ver = undef;
  open(FH, "<$dfn") or return $ver;
  my $line;
  while ($line = <FH>) {
    if ($line =~ /Version:\s+(\d+\.\d+)\s+/) {
      $ver = $1;
      last;
    }
  }
  close(FH);
  return $ver;
}
