#!/usr/bin/perl
#-------------------------------------------------------------------------------
# Cross reference Dita XML.
# Philip R Brenan at gmail dot com, Appa Apps Ltd Inc, 2016-2018
#-------------------------------------------------------------------------------
# podDocumentation

package Data::Edit::Xml::Xref;
our $VERSION = 20181204;
use v5.20;
use warnings FATAL => qw(all);
use strict;
use Carp qw(confess cluck);
use Data::Dump qw(dump);
use Data::Edit::Xml;
use Data::Table::Text qw(:all);
use utf8;

#D1 Cross reference                                                             # Check the cross references in a set of Dita files and report the results.

sub xref(%)                                                                     # Check the cross references in a set of Dita files held in  L<inputFolder|/inputFolder> and report the results in the L<reports|/reports> folder. The possible attributes are defined in L<Data::Edit::Xml::Xref|/Data::Edit::Xml::Xref>
 {my (%attributes) = @_;                                                        # Attributes
  my $xref = genHash(__PACKAGE__,                                               # Attributes used by the Xref cross referencer.
    attributeCount=>{},                                                         # {file}{attribute} == count of the different xml attributes found in the xml files.
    badBookMaps=>{},                                                            # Bad book maps
    badConRefs=>{},                                                             # [file, href]   Invalid conref attributes.
    badImageRefs=>{},                                                           # Consolidated images missing.
    badTopicRefs=>{},                                                           # [file, href]   Invalid href attributes found on topicref tags.
    badXml1=>{},                                                                # [Files] with a bad xml encoding header on the first line.
    badXml2=>{},                                                                # [Files] with a bad xml doc type on the second line.
    badXRefs=>{},                                                               # [file, href]   Invalid href attributes found on xref tags.
    conRefs=>{},                                                                # {file}{href}   Count of conref definitions in each file.
    docType=>{},                                                                # {file} == docType:  the docType for each xml file.
    duplicateIds=>{},                                                           # [file, id]     Duplicate id definitions within each file.
    duplicateTopicIds=>{},                                                      # [topicId, [files]] Files with duplicate topic ids - the id on the outermost tag.
    fixBadRefs=>undef,                                                          # Try to fix bad references in L<these files|/fixedFiles> where possible.
    fixedFiles=>{},                                                             # Files that have a missing conref or href which can be ameliorated by renaming the failing attribute to  "xtrf".
    goodBookMaps=>{},                                                           # Good book maps
    goodConRefs=>{},                                                            # Good con refs
    goodImageRefs=>{},                                                          # Consolidated images found.
    goodTopicRefs=>{},                                                          # Good topic refs
    goodXRefs=>{},                                                              # Good xrefs
    ids=>{},                                                                    # {file}{id}     Id definitions across all files.
    images=>{},                                                                 # {file}{href}   Count of image references in each file.
    inputFiles=>[],                                                             # Input files from L<inputFolder|/inputFolder>.
    inputFolder=>undef,                                                         # A folder containing the dita and ditamap files to be cross referenced.
    maximumNumberOfProcesses=>4,                                                # Maximum number of processes to run in parallel at any one time.
    missingImageFiles=>{},                                                      # [file, href] == Missing images in each file.
    missingTopicIds=>{},                                                        # Missing topic ids
    notReferenced=>{},                                                          # files in input area that are not referenced by a conref, image, topicref or xref tag and are not a bookmap.
    parseFailed=>{},                                                            # [file] files that failed to parse
    reports=>q(reports),                                                        # Reports folder: the cross referencer will write reports to files in this folder.
    results=>[],                                                                # Summary of results table
    sourceFile=>undef,                                                          # The source file from which this structure was generated
    statusLine=>undef,                                                          # Status line summarizing the cross reference. The counts listed in the status line are the counts of the files that have the described problems not a count of all the instances of all the problems in all the files.
    summary=>1,                                                                 # Print the summary line.
    tagCount=>{},                                                               # {file}{tags} == count of the different tag names found in the xml files.
    topicIds=>{},                                                               # {file} = topic id - the id on the outermost tag.
    topicRefs=>{},                                                              # {file}{href}++ References from bookmaps to topics via appendix, chapter, topicref.
    validationErrors=>{},                                                       # True means that Lint detected errors in the xml contained in the file
    xRefs=>{},                                                                  # {file}{href}++ Xrefs references.
#    from=>{},                                                                   # Xrefs contributing to this xref by file
   );
  loadHash($xref, @_);                                                          # Load attributes complaining about any invalid ones

  $xref->inputFolder or confess "Please supply a value for: inputFolder";

  $xref->inputFolder = absFromAbsPlusRel(currentDirectory, $xref->inputFolder)  # Make input folder absolute
    if $xref->inputFolder !~ m(\A/);

  my @phases = qw(loadInputFiles analyze reportBadXml1 reportBadXml2
                  reportDuplicateIds reportDuplicateTopicIds
                  reportBadXrefs reportBadTopicRefs
                  reportBadConrefs reportImages reportParseFailed
                  reportAttributeCount reportTagCount reportDocTypeCount
                  reportFileExtensionCount reportFileTypes
                  reportValidationErrors reportBookMaps
                  reportNotReferenced
                 );
  for my $phase(@phases)                                                        # Perform analysis phases in order by calling the method named by each phase
   {$xref->$phase;
   }

  if ($xref->fixBadRefs and my @files = sort keys %{$xref->fixedFiles})         # Fix files if requested
   {for my $file(@files)
     {fixOneFile($xref, $file);
     }
   }

  if (1)                                                                        # Summarize
   {my @o;                                                                      # [count, description] for each field that has at least one entry
    my $save = sub                                                              # Record the number of keys in a field using the plural or singular description as appropriate
     {my ($field, $plural, $single) = @_;
      my $n = keys %{$xref->{$field}};
      push @o, [$n,            $plural]                   if $n >  1;
      push @o, [$n, $single // ($plural =~ s(s\Z) ()gsr)] if $n == 1;           # Default singular form is the plural without its trailing "s"
     };

    $save->("missingImageFiles", q(missing image files));
    $save->("badTopicRefs",      q(bad topicrefs));
    $save->("badXRefs",          q(bad xrefs));
    $save->("badConRefs",        q(bad conrefs));
    $save->("duplicateIds",      q(duplicate ids));
    $save->("duplicateTopicIds", q(duplicate topic ids));
    $save->("missingTopicIds",   q(missing topic ids));
    $save->("badXml1",           q(bad first lines));
    $save->("badXml2",           q(bad second lines));
    $save->("badBookMaps",       q(bad book maps));
    $save->("badImageRefs",      q(missing image references));
    $save->("validationErrors",  q(validation errors));
    $save->("parseFailed",       q(files failed to parse), q(file failed to parse));
    $save->("notReferenced",     q(files not referenced), q(file not referenced));

    $xref->statusLine = @o ? join " ",                                          # Largest counts first, ties broken alphabetically by description
      "Xref:", join ", ",
               map {join " ", @$_}
               sort
                {$$b[0] <=> $$a[0] or $$a[1] cmp $$b[1]
                }
               @o : q();
    $xref->results    = \@o;
    if (@o and $xref->summary)                                                  # Print the status line only if there is something to report and a summary was requested
     {say STDERR $xref->statusLine;
     }
   }

  $xref                                                                         # Return Xref results
 }

sub loadInputFiles($)                                                           #P Load the names of the files to be processed
 {my ($xref) = @_;                                                              # Cross referencer
  my @extensions = qw(.dita .ditamap .xml);                                     # File extensions of interest
  my @found = searchDirectoryTreesForMatchingFiles                              # Search below the input folder
    ($xref->inputFolder, @extensions);
  $xref->inputFiles = [@found];                                                 # Save the matching file names
 }

sub analyzeOneFile($)                                                           #P Analyze one input file
 {my ($iFile) = @_;                                                             # File to analyze
  my $xref = bless {};                                                          # Cross referencer for this file
  $xref->sourceFile = $iFile;                                                   # File analyzed

  my $x = eval {Data::Edit::Xml::new($iFile)};                                  # Parse xml

  if ($@)                                                                       # Record the failure and give up on this file
   {$xref->parseFailed->{$iFile}++;
    return $xref;
   }

  $x->by(sub                                                                    # Each node
   {my ($node) = @_;

    my $id = $node->id;                                                         # Id definitions
    $xref->ids->{$iFile}{$id}++ if $id;

    if ($node->at_xref)                                                         # Xrefs
     {my $href = $node->href;
      $xref->xRefs->{$iFile}{$href}{$node->stringText}++                        # Only record xrefs whose format starts with dita
        if $href and $node->attrX_format =~ m(\Adita)i;
     }
    elsif ($node->at(qr(\A(appendix|chapter|topicref)\Z)))                      # References from bookmaps
     {my $href = $node->href;
      $xref->topicRefs->{$iFile}{$href}{$node->attr_navtitle//$node->stringText}++
        if $href;
     }
    elsif ($node->at(qw(image)))                                                # Images
     {my $href = $node->href;
      $xref->images->{$iFile}{$href}++ if $href;
     }

    my $conref = $node->attr_conref;                                            # Conref - checked on every node regardless of tag
    $xref->conRefs->{$iFile}{$conref}++ if $conref;
   });

  my $tag = $x->tag;                                                            # Tag of the outermost node
  $xref->topicIds      ->{$iFile} = $x->id;                                     # Topic Id
  $xref->docType       ->{$iFile} = $tag;                                       # Document type
  $xref->attributeCount->{$iFile} = $x->countAttrNames;                         # Attribute names
  $xref->tagCount      ->{$iFile} = $x->countTagNames;                          # Tag names

  my $source = readFile($iFile);                                                # Check xml headers and lint errors
  my @lines  = split /\n/, $source;

  unless ($lines[0] and $lines[0] =~ m(\A<\?xml version=\"1.0\" encoding=\"UTF-8\"\?>\Z))
   {$xref->badXml1->{$iFile}++;                                                 # First line is not the expected xml header
   }

  unless ($lines[1] and $lines[1] =~ m(\A<!DOCTYPE $tag PUBLIC "-//))
   {$xref->badXml2->{$iFile}++;                                                 # Second line is not the expected doc type for the outermost tag
   }

  $xref->validationErrors->{$iFile}++ if $source =~ m(<!--compressedErrors:)s;  # File has validation errors

  $xref
 }

sub fixOneFile($$)                                                              #P Fix one input file by renaming each failing href or conref attribute to xtrf
 {my ($xref, $file) = @_;                                                       # Xref results, file to fix

  my $x = Data::Edit::Xml::new($file);                                          # Parse xml - should parse OK else otherwise how did we find out that this file needed to be fixed

  $x->by(sub                                                                    # Each node
   {my ($o) = @_;
    if ($o->at_xref or $o->at_image or                                          # Hrefs that need to be fixed: the same tags whose hrefs are collected during analysis
        $o->at(qr(\A(appendix|chapter|topicref)\Z)))
     {if (my $h = $o->href)
       {if ($xref->fixedFiles->{$file}{$h})                                     # This href was reported as bad for this file
         {$o->renameAttr_href_xtrf;
         }
       }
     }
    if (my $conref = $o->attr_conref)                                           # Conref
     {if ($xref->fixedFiles->{$file}{$conref})                                  # This conref was reported as bad for this file
       {$o->renameAttr_conref_xtrf;                                             # Rename the conref itself, not the href
       }
     }
   });

  if (1)                                                                        # Replace xml in source file
   {my $S = my $s = readFile($file);                                            # Keep the original text so an unchanged file is not rewritten
    my $T = -p $x;                                                              # Replacement text produced from the parse tree
    my $t = -t $x;                                                              # Tag of the outermost node
    $s =~ s(<\s*$t.*<\s*\/\s*$t\s*>) ($T)is;                                    # Replace everything from the first opening outermost tag to the last matching closing tag leaving any headers in place
    owf($file, $s) unless $s eq $S;
   }

  $xref
 }

sub analyze($)                                                                  #P Analyze the input files
 {my ($xref) = @_;                                                              # Cross referencer
  my @files = @{$xref->inputFiles};                                             # Input files
  my @rows  = squareArray(@files);                                              # Divide the task into rows of files

  my $folder  = temporaryFolder;                                                # Folder handed to the process starter
  my $starter = newProcessStarter($xref->maximumNumberOfProcesses, $folder);    # Process starter
  $starter->processingTitle   = q(Xref);
  $starter->totalToBeStarted  = scalar @rows;
  $starter->processingLogFile = fpe($xref->reports, qw(log xref txt));

  for my $row(@rows)                                                            # One started process per row of input files
   {$starter->start(sub
     {my @analyzed;                                                             # Results for this row
      for my $file(@$row)                                                       # Each file in the row
       {push @analyzed, analyzeOneFile($file);
       }
      [@analyzed]                                                               # Return results as a reference
     });
   }

  my @fields = qw(parseFailed badXml1 badXml2
                  ids xRefs topicRefs images conRefs topicIds
                  validationErrors docType attributeCount tagCount);            # Fields to merge - each is a hash keyed by file name

  for my $result(deSquareArray($starter->finish))                               # Merge results from each file analyzed
   {for my $field(@fields)
     {my $perFile = $result->{$field} or next;                                  # Nothing recorded for this field
      $xref->{$field}{$_} = $perFile->{$_} for sort keys %$perFile;             # File names are unique so a plain copy suffices
     }
   }

  clearFolder($folder, scalar @files);
 }

sub reportDuplicateIds($)                                                       #P Report duplicate ids
 {my ($xref) = @_;                                                              # Cross referencer

  my @dups;                                                                     # Duplicate ids definitions
  for my $file(sort keys %{$xref->ids})                                         # Each input file
   {for my $id(sort keys %{$xref->ids->{$file}})                                # Each id in the file
     {my $count = $xref->ids->{$file}{$id};                                     # Number of definitions of this id in the file
      if ($count > 1)                                                           # Duplicate definition
       {push @dups, [$id, $count, $file];                                       # Save details of duplicate definition
       }
     }
   }

  $xref->duplicateIds = {map {$$_[2]=>$_} @dups};                               # All duplicates keyed by file so that the summary counts files not instances

  formatTable(\@dups, [qw(Id Count File)],
    head=><<END,
Data::Edit::Xml::Xref found NNNN intra file duplicate id definitions on DDDD

These ids are duplicated within a file, possibly because they were copied from
another part of the same file.  This report does not show ids that are the same
in different files as this is not a problem using Dita's three part addressing
scheme which requires only that the topic id be unique across all files.

Duplicate topic ids are reported in ../bad/topicIdDefinitionsDuplicated.txt.

END
    file=>(fpe($xref->reports, qw(bad idDefinitionsDuplicated txt))));
 }

sub reportDuplicateTopicIds($)                                                  #P Report duplicate topic ids
 {my ($xref) = @_;                                                              # Cross referencer

  my $topicIds = $xref->topicIds;                                               # {file} = topic id
  my %firstFile;                                                                # First file seen defining each topic id
  my @duplicated;                                                               # Duplicate topic ids definitions report
  my @missing;                                                                  # Missing topic id definitions report

  for my $file(sort keys %$topicIds)                                            # Each input file
   {my $topicId = $topicIds->{$file};
    if (!$topicId)                                                              # Missing topic id
     {push @missing, [$file];
     }
    elsif (my $first = $firstFile{$topicId})                                    # Topic id already defined by an earlier file
     {push @duplicated, [$topicId, $file, $first];
     }
    else                                                                        # First definition of this topic id
     {$firstFile{$topicId} = $file;
     }
   }

  my %duplicatedById; $duplicatedById{$$_[0]} = $_ for @duplicated;             # All duplicates keyed by topic id
  my %missingByFile;  $missingByFile {$$_[0]} = $_ for @missing;                # All missing keyed by file
  $xref->duplicateTopicIds = \%duplicatedById;
  $xref->missingTopicIds   = \%missingByFile;

  my $duplicatedReport = fpe($xref->reports, qw(bad topicIdDefinitionsDuplicated txt));
  formatTable(\@duplicated, [qw(TopicId File1 File2)],
    head=><<END,
Data::Edit::Xml::Xref found NNNN duplicate topic id definitions on DDDD

File1, File2 are two files that both define TopicId

END
    file=>$duplicatedReport);

  my $missingReport = fpe($xref->reports, qw(bad topicIdDefinitionsMissing txt));
  formatTable(\@missing, [qw(File)],
    head=><<END,
Data::Edit::Xml::Xref found NNNN topics that have no topic id on DDDD

END
    file=>$missingReport);
 }

sub reportBadRefs($$)                                                           #P Report bad references found in xrefs or conrefs as they have the same structure
 {my ($xref, $type) = @_;                                                       # Cross referencer, type of reference to be processed

  my $refs = $xref->{${type}.q(Refs)};                                          # {file}{href} references of the requested type

  my @bad; my @good;                                                            # Bad references: rows follow the column order of the bad report below. Good references: [href, target file, source file]
  for   my $file(sort keys %$refs)                                              # Each input file which will be absolute
   {for my $href(sort keys %{$refs->{$file}})                                   # Each href in the file which will be relative
     {my @text;
      if (ref($refs->{$file}{$href}))                                           # Text associated with reference - absent when only a count was recorded
       {@text = sort keys %{$refs->{$file}{$href}};
        s(\s+) ( )gs for @text;                                                 # Normalize white space
       }
      if ($href =~ m(#))                                                        # Href with #
       {my ($hFile, $hId) = split m(#), $href;                                  # File, topicId components
        $hFile //= ''; $hId //= '';                                             # An href such as "file#" or "#" has no second and possibly no first component
        my ($topic, $id)  = split m(/), $hId;                                   # Topic, id
        $topic //= ''; $id //= '';                                              # Either can be absent - avoid undefined values under fatal warnings
        my $fFile = $hFile ? absFromAbsPlusRel($file, $hFile) : $file;          # Target file absolute - an empty file component means the source file itself
        if ($hFile and !(-e $fFile or -e wwwDecode($fFile)))                    # Check target file
         {push @bad, [qq(No such file), $href,
           $hFile, $topic, q(), $id, $file, $fFile, @text];
         }

        elsif (my $t = $xref->topicIds->{$fFile})                               # Check topic id
         {if ($t eq $topic)
           {if (my $i = $xref->ids->{$fFile}{$id})                              # Number of definitions of the id in the target file
             {if ($i == 1)
               {push @good,[$href, $fFile, $file];
               }
              else
               {push @bad, [qq(Duplicate id in topic), $href,
                 $hFile, $topic, $t, $id, $file, $fFile, @text];
               }
             }
            elsif ($id)
             {push @bad, [qq(No such id in topic), $href,
                $hFile, $topic, $t, $id, $file, $fFile, @text];

             }
            else                                                                # Reference to the topic as a whole
             {push @good, [$href, $fFile, $file];
             }
           }
          else
           {push @bad, [qq(Topic id does not match target topic), $href,
             $hFile, $topic, $t, $id, $file, $fFile, @text];
           }
         }
        elsif ($topic =~ m(\S)s)                                                # The href contains a topic id but there is not topic with that id
         {push @bad, [qq(No topic id on topic in target file), $href,
           $hFile, $topic, q(), $id, $file, $fFile, @text];
         }
        else
         {push @good,[$href, $fFile, $file];
         }
       }
      else                                                                      # No # in href
       {my $fFile = absFromAbsPlusRel($file, $href);
        if (!-e $fFile and !-e wwwDecode($fFile))                               # Actual file name or www encoded file name
         {push @bad, [qq(No such file), $href,
           $fFile, q(), q(), q(), $file, $fFile, @text];
         }
        else
         {push @good, [$href, $fFile, $file];
         }
       }
     }
   }

  my $Type = ucfirst $type;
  $xref->{q(bad).$Type.q(Refs)}  = {map {$$_[6]=>$_} @bad};                     # Bad references keyed by source file
  $xref->{q(good).$Type.q(Refs)} = {map {$$_[2]=>$_} @good};                    # Good references keyed by source file

  formatTable(\@bad, <<END,
Reason          The reason why the conref failed to resolve
Href            The href in the source file
Href_File       The target file referenced by the href in the source files
Href_Topic_Id   The id of the topic referenced by the href in the source file
Target_Topic_Id The actual id of the topic in the target file
HRef_Id         The id of the statement in the body of the topic referenced by the href in the source file
Source_File     The source file containing the reference
Target_File     The target file
Example_Text    Any text associated with the link such as the navtitle of a bad topicRef or the CDATA text of an xref.
END
    head     =>qq(Data::Edit::Xml::Xref found NNNN Bad ${type}Refs on DDDD),
    summarize=>1,
    wide     =>1,
    file     =>(fpe($xref->reports, q(bad), qq(${Type}Refs), q(txt))));

  formatTable(\@good, <<END,
Href            The href in the source file
Source_File     The source file containing the reference
Target_File     The target file
END
    head     =>qq(Data::Edit::Xml::Xref found NNNN Good $type refs on DDDD),
    file     =>(fpe($xref->reports, q(good), qq(${Type}Refs), q(txt))));

  for my $bad(@bad)                                                             # List of files to fix
   {my $href = $$bad[1];
    my $file = $$bad[6];
    $xref->fixedFiles->{$file}{$href}++;
   }
 }

sub reportBadXrefs($)                                                           #P Report bad xrefs
 {my ($xref) = @_;                                                              # Cross referencer
  my $type = q(x);                                                              # Selects the xRefs field in the shared reference reporter
  return reportBadRefs($xref, $type);
 }

sub reportBadTopicRefs($)                                                       #P Report bad topic refs
 {my ($xref) = @_;                                                              # Cross referencer

  my $topicRefs = $xref->topicRefs;                                             # {file}{href} references from bookmaps
  my (@bad, @good);                                                             # Bad and good topic references
  for   my $source(sort keys %$topicRefs)                                       # Each input file
   {for my $href(sort keys %{$topicRefs->{$source}})                            # Each topic ref in the file
     {my @text;
      my $texts = $topicRefs->{$source}{$href};
      if (ref($texts))                                                          # Text associated with reference
       {@text = sort keys %$texts;
        s(\s+) ( )gs for @text;                                                 # Normalize white space
       }

      my $target = absFromAbsPlusRel(fullFileName($source), $href);             # Target file absolute
      next unless $target;

      if (-e $target or -e wwwDecode($target))                                  # Target file exists under its actual or www decoded name
       {push @good, [$target, $href, $source];
       }
      else                                                                      # Missing target file
       {push @bad, [qq(No such file), $target, $href, $source, @text];
        $xref->fixedFiles->{$source}{$href}++;
       }
     }
   }

  my %badByTarget;  $badByTarget {$$_[1]} = $_ for @bad;                        # Bad topic references
  my %goodByTarget; $goodByTarget{$$_[0]} = $_ for @good;                       # Good topic references
  $xref->badTopicRefs  = \%badByTarget;
  $xref->goodTopicRefs = \%goodByTarget;

  my $badReport = fpe($xref->reports, qw(bad topicRefs txt));
  formatTable(\@bad, <<END,
Reason         Reason the topic reference failed
FullFileName   Name of the targeted file
Href           Href text
Source         Source file
Example_Text   Any text bracketed by the topic ref
END
    head     =>qq(Data::Edit::Xml::Xref found NNNN Bad topicrefs on DDDD),
    summarize=>1,
    wide     =>1,
    file     =>$badReport);

  my $goodReport = fpe($xref->reports, qw(good topicRefs txt));
  formatTable(\@good, <<END,
FullFileName  The target file name
Href          The href text in the source file
Source        The source file
END
    head=>qq(Data::Edit::Xml::Xref found NNNN Good topicrefs on DDDD),
    file=>$goodReport);
 }

sub reportBadConrefs($)                                                         #P Report bad conrefs refs
 {my ($xref) = @_;                                                              # Cross referencer
  my $type = q(con);                                                            # Selects the conRefs field in the shared reference reporter
  return reportBadRefs($xref, $type);
 }

sub reportImages($)                                                             #P Reports on images and references to images
 {my ($xref) = @_;                                                              # Cross referencer

  my @missingRefs;                                                              # Bad images: [href, image, source file]
  for my $source(sort keys %{$xref->images})                                    # Each input file
   {for my $href(sort keys %{$xref->images->{$source}})                         # Each image in the file
     {my $target = absFromAbsPlusRel($source, $href);                           # Image relative to current file
      unless (-e $target or -e wwwDecode($target))                              # Neither the actual image name nor the www encoded image name exists
       {push @missingRefs, [$href, $target, $source];                           # Missing image reference
        $xref->badImageRefs->{$target}++;                                       # Number of missing references
        $xref->fixedFiles->{$source}{$href}++;
        next;
       }
      $xref->goodImageRefs->{$target}++;                                        # Found image
     }
   }

  my %missingBySource; $missingBySource{$$_[2]} = $_ for @missingRefs;          # Missing image file names keyed by source file
  $xref->missingImageFiles = \%missingBySource;

  my $badRefsReport = fpe($xref->reports, qw(bad imageRefs txt));
  formatTable(\@missingRefs, <<END,
Href   Image reference in source file
Image  Targetted image name
File   Source file containing image reference
END
    head=>qq(Data::Edit::Xml::Xref found NNNN Bad image references on DDDD),
    summarize=>1,
    file=>$badRefsReport);

  my $good  = $xref->goodImageRefs;                                             # {image} = number of references
  my @found = map {[$good->{$_}, $_]} keys %$good;

  formatTable(\@found, <<END,
Count          Number of references to each image file found.
ImageFileName  Full image file name
END
    head=>qq(Data::Edit::Xml::Xref found NNNN image files found on DDDD),
    file=>(fpe($xref->reports, qw(good imagesFound txt))));

  my $bad     = $xref->badImageRefs;                                            # {image} = number of references
  my @missing = map {[$bad->{$_}, $_]} keys %$bad;

  formatTable(\@missing, <<END,
Count          Number of references to each image file found.
ImageFileName  Full image file name
END
    head=>qq(Data::Edit::Xml::Xref found NNNN images missing on DDDD),
    file=>(fpe($xref->reports, qw(bad imagesMissing txt))));
 }

sub reportParseFailed($)                                                        #P Report failed parses
 {my ($xref) = @_;                                                              # Cross referencer
  my $report = fpe($xref->reports, qw(bad parseFailed txt));                    # Report file
  my $title  = qq(Data::Edit::Xml::Xref found NNNN files failed to parse on DDDD);
  formatTable($xref->parseFailed, [qw(File)], head=>$title, file=>$report);
 }

sub reportBadXml1($)                                                            #P Report bad xml on line 1
 {my ($xref) = @_;                                                              # Cross referencer
  my $report = fpe($xref->reports, qw(bad xmlLine1 txt));                       # Report file
  my $title  = qq(Data::Edit::Xml::Xref found NNNN Files with the incorrect xml on line 1 on DDDD);
  formatTable($xref->badXml1, [qw(File)], head=>$title, file=>$report);
 }

sub reportBadXml2($)                                                            #P Report bad xml on line 2
 {my ($xref) = @_;                                                              # Cross referencer
  my $report = fpe($xref->reports, qw(bad xmlLine2 txt));                       # Report file
  my $title  = qq(Data::Edit::Xml::Xref found NNNN Files with the incorrect xml on line 2 on DDDD);
  formatTable($xref->badXml2, [qw(File)], head=>$title, file=>$report);
 }

sub reportDocTypeCount($)                                                       #P Report doc type count
 {my ($xref) = @_;                                                              # Cross referencer

  my $docTypes = $xref->docType;                                                # {file} = doc type
  my %filesPerDocType;                                                          # Number of files with each doc type
  $filesPerDocType{$docTypes->{$_}}++ for sort keys %$docTypes;

  my $report = fpe($xref->reports, qw(count docTypes txt));                     # Report file
  formatTable(\%filesPerDocType, [qw(DocType)],
    head=>qq(Data::Edit::Xml::Xref found NNNN different doc types on DDDD),
    file=>$report);
 }

sub reportTagCount($)                                                           #P Report tag counts
 {my ($xref) = @_;                                                              # Cross referencer

  my $tagCount = $xref->tagCount;                                               # {file}{tag} = count
  my %total;                                                                    # Count of each tag summed over all files
  for my $file(sort keys %$tagCount)
   {my $tags = $tagCount->{$file};
    $total{$_} += $tags->{$_} for sort keys %$tags;
   }

  my $report = fpe($xref->reports, qw(count tags txt));                         # Report file
  formatTable(\%total, [qw(Tag Count)],
    head=>qq(Data::Edit::Xml::Xref found NNNN different tags on DDDD),
    file=>$report);
 }

sub reportAttributeCount($)                                                     #P Report attribute counts
 {my ($xref) = @_;                                                              # Cross referencer

  my $attributeCount = $xref->attributeCount;                                   # {file}{attribute} = count
  my %total;                                                                    # Count of each attribute summed over all files
  for my $file(sort keys %$attributeCount)
   {my $attributes = $attributeCount->{$file};
    $total{$_} += $attributes->{$_} for sort keys %$attributes;
   }

  my $report = fpe($xref->reports, qw(count attributes txt));                   # Report file
  formatTable(\%total, [qw(Attribute Count)],
    head=>qq(Data::Edit::Xml::Xref found NNNN different attributes on DDDD),
    file=>$report);
 }

sub reportValidationErrors($)                                                   #P Report the files known to have validation errors
 {my ($xref) = @_;                                                              # Cross referencer

  formatTable([map {[$_]} sort keys %{$xref->validationErrors}], [qw(File)],    # One row per file with validation errors
    head=><<END,
Data::Edit::Xml::Xref found NNNN different topics with validation errors on DDDD
END
    file=>(fpe($xref->reports, qw(bad validationErrors txt))));                 # A report of errors belongs in the bad folder under its own name, not in the good bookmap report
 }

sub checkBookMap($$)                                                            #P Check whether a bookmap is valid or not
 {my ($xref, $bookMap) = @_;                                                    # Cross referencer, bookmap

  my @errorFields =                                                             # Fields of the cross referencer that report errors, keyed by file
    qw(parseFailed badXml1 badXml2 badTopicRefs badXRefs
       imagesMissing badConRefs missingTopicIds
       validationErrors);                                                       # NOTE(review): imagesMissing is not among the attributes listed in the Hash Definitions pod - confirm it is ever populated

  my @hrefs = ($bookMap, sort keys %{$xref->topicRefs->{$bookMap}});            # The bookmap itself followed by each topic ref in the bookmap

  for my $href(@hrefs)
   {my $target = absFromAbsPlusRel($bookMap, $href);                            # File name derived from the bookmap name and the href
    for my $field(@errorFields)
     {next unless $xref->{$field}->{$target};
      return [$field, $bookMap, $href, $target];                                # First error found: [reason, bookmap, href, target file]
     }
   }
  undef                                                                         # No errors
 }

sub reportBookMaps($)                                                           #P Report on whether each bookmap is good or bad
 {my ($xref) = @_;                                                              # Cross referencer

  my (@failed, @passed);                                                        # Rows for the bad and good bookmap reports
  my $docTypes = $xref->docType;                                                # {file} == docType
  for my $file(sort keys %$docTypes)
   {next unless $docTypes->{$file} =~ m(map\Z)s;                                # Only files whose doc type ends in "map" are checked
    my $failure = $xref->checkBookMap($file);                                   # [reason, bookmap, href, target] or undef if no error was found
    if ($failure)
     {push @failed, $failure;
     }
    else
     {push @passed, [$file];
     }
   }

  my %badByBookMap  = map {$_->[1] => $_} @failed;                              # Keyed by bookmap file name
  my %goodByBookMap = map {$_->[0] => $_} @passed;                              # Keyed by bookmap file name
  $xref-> badBookMaps = \%badByBookMap;                                         # Bad bookmaps
  $xref->goodBookMaps = \%goodByBookMap;                                        # Good book maps

  formatTable(\@failed, <<END,
Reason      Reason bookmap failed
Bookmap     Bookmap source file name
Topic-Ref   Failing appendix, chapter or topic ref.
Topic-File  Targeted topic file if known
END
    head=><<END,
Data::Edit::Xml::Xref found NNNN bookmaps with errors on DDDD
END
    summarize=>1,
    file=>(fpe($xref->reports, qw(bad bookMap txt))));

  formatTable(\@passed, [qw(File)],
    head=><<END,
Data::Edit::Xml::Xref found NNNN good bookmaps on DDDD
END
    file=>(fpe($xref->reports, qw(good bookMap txt))));
 }

sub reportFileExtensionCount($)                                                 #P Report file extension counts
 {my ($xref) = @_;                                                              # Cross referencer

  my ($extensions) = countFileExtensions($xref->inputFolder);                   # Extension counts for the input folder as returned by countFileExtensions

  formatTable($extensions, [qw(Ext Count)],
    head=><<END,
Data::Edit::Xml::Xref found NNNN different file extensions on DDDD
END
    file=>(fpe($xref->reports, qw(count fileExtensions txt))));
 }

sub reportFileTypes($)                                                          #P Report file type counts - takes too long in series
 {my ($xref) = @_;                                                              # Cross referencer

  my ($types) = countFileTypes                                                  # File type counts for the input folder, passing the process limit through
   ($xref->inputFolder, $xref->maximumNumberOfProcesses);
  my $report  = fpe($xref->reports, qw(count fileTypes txt));                   # Report file

  formatTable($types, [qw(Type Count)],
    head=><<END,
Data::Edit::Xml::Xref found NNNN different file types on DDDD
END
    file=>$report);
 }

sub reportNotReferenced($)                                                      #P Report files not referenced by any of conref, image, topicref, xref and are not bookmaps.
 {my ($xref) = @_;                                                              # Cross referencer

  my %unreferenced;                                                             # Candidates: start with every file found under the input folder
  $unreferenced{$_} = 1 for
    searchDirectoryTreesForMatchingFiles($xref->inputFolder);

  my $docTypes = $xref->docType;                                                # {file} == docType
  delete @unreferenced{grep {$docTypes->{$_} =~ m(map\Z)is} keys %$docTypes};   # Files whose doc type ends in "map" are never reported

  delete @unreferenced{keys %{$xref->conRefs},                                  # Remove every file that is a key of one of these reference tables
                       keys %{$xref->goodImageRefs},
                       keys %{$xref->goodTopicRefs},
                       keys %{$xref->xRefs}};

  $xref->notReferenced = \%unreferenced;                                        # Save the survivors on the cross referencer
  my $report = fpe($xref->reports, qw(bad notReferenced txt));                  # Report file

  formatTable([sort keys %unreferenced],
   [qw(FileNo Unreferenced)],
    head=><<END,
Data::Edit::Xml::Xref found NNNN unreferenced files on DDDD.

These are the files not mentioned in any conref, image, topicref, or xref and
which are not bookmaps.

END
    file=>$report);
 }

sub createSampleInputFiles($)                                                   #P Create sample input files for testing in folder B<in>.
 {my ($N) = @_;                                                                 # Number of sample files
  my $folder = q(in);                                                           # Folder that receives the sample files
  clearFolder($folder, 20);

  my $write = sub                                                               # Write the text to the file in the sample folder named by the remaining parameters
   {my ($text, @name) = @_;
    owf(fpe($folder, @name), $text);
   };

  for my $n(1..$N)                                                              # Numbered concepts: concept $n refers to concept $o
   {my $o = $n + 1 > $N ? $n + 1 - $N : $n + 1;                                 # The following concept, reduced by $N once it goes past $N
    $write->(<<END, $n, q(dita));
<concept id="c$n">
  <title>Concept $n refers to $o</title>
  <conbody id="b$n">
     <xref id="x$n"  format="dita" href="$o.dita#c$o/x$o">Good</xref>
     <xref id="x$n"  format="dita" href="$o.dita#c$n/x$o">Duplicate id</xref>
     <xref id="b1$n" format="dita" href="bad$o.dita#c$o/x$o">Bad file</xref>
     <xref id="b2$n" format="dita" href="$o.dita#c$n/x$o">Bad topic id</xref>
     <xref id="b3$n" format="dita" href="$o.dita#c$o/x$n">Bad id in topic</xref>
     <xref id="g1$n" format="dita" href="$o.dita#c$o">Good 1</xref>
     <xref id="g2$n" format="dita" href="#c$o/x$o">Good 2</xref>
     <xref id="g3$n" format="dita" href="#c$o">Good 3</xref>
     <p conref="#c$n">Good conref</p>
     <p conref="#b$n">Bad conref</p>
     <image href="a$n.png"/>
  </conbody>
</concept>
END
   }

  $write->(<<END, qw(act1 dita));                                               # Concept without an xml header or doctype
<concept id="c1">
  <title id="title">All Timing Codes Begin Here</title>
  <conbody/>
</concept>
END

  $write->(<<END, qw(act2 dita));                                               # Concept with xml header and doctype whose references name 9999 files that are not created here
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
<concept id="c2">
  <title id="title">All Timing Codes Begin Here</title>
  <conbody>
    <section>
      <title/>
      <xref  format="dita" href="act1.dita#c1/title"/>
      <xref  format="dita" href="9999#c1/title"/>
      <image href="9999.png"/>
      <p conref="9999.dita"/>
    </section>
  </conbody>
</concept>
<!--linted: 2018-Nov-23 -->
END

  $write->(<<END, qw(act3 dita));                                               # Not well formed: conbody is closed by /body
<concept id="c3">
  <title>Error</title>
  <conbody>
    <p/>
  </body>
</concept>
END

  $write->(<<END, qw(map bookmap ditamap));                                     # Bookmap with a mixture of topic refs
<map id="m1">
  <title>Test</title>
  <topicref format="dita" href="../act1.dita">Interesting topic</topicref>
  <topicref format="dita" href="../act2.dita"/>
  <topicref format="dita" href="../map/r.txt"/>
  <topicref format="dita" href="9999.dita"/>
  <topicref format="dita" href="bbb.txt"/>
</map>
END

  createEmptyFile(fpe($folder, qw(a1 png)));                                    # The image referenced by the first numbered concept
 }

#D
# podDocumentation
=pod

=encoding utf-8

=head1 Name

Data::Edit::Xml::Xref - Cross reference Dita XML.

=head1 Synopsis

Check the references in a set of Dita XML documents held in folder
L<inputFolder|/inputFolder>:

  use Data::Edit::Xml::Xref;

  my $x = xref(inputFolder=>q(in));
  ok nws($x->statusLine) eq nws(<<END);
Xref:
 10 bad first lines,
 10 bad second lines,
  9 bad conrefs,
  9 bad xrefs,
  8 duplicate ids,
  8 missing image files,
  8 missing image references,
  3 bad topicrefs,
  2 duplicate topic ids,
  1 bad book map,
  1 file failed to parse,
  1 file not referenced
END

The counts listed in the L<statusLine|/statusLine> are the counts of the files
that have the described problems, not a count of all the instances of each
problem in all the files, which would be larger.

More detailed reports are produced in the  L<reports|/reports> folder:

  $x->reports

=head1 Description

Cross reference Dita XML.


Version 20181204.


The following sections describe the methods in each functional area of this
module.  For an alphabetic listing of all methods by name see L<Index|/Index>.



=head1 Cross reference

Check the cross references in a set of Dita files and report the results.

=head2 xref(%)

Check the cross references in a set of Dita files held in  L<inputFolder|/inputFolder> and report the results in the L<reports|/reports> folder. The possible attributes are defined in L<Data::Edit::Xml::Xref|/Data::Edit::Xml::Xref>

     Parameter    Description
  1  %attributes  Attributes

B<Example:>


    my $x = 𝘅𝗿𝗲𝗳(inputFolder=>q(in), fixBadRefs=>1, maximumNumberOfProcesses=>2);



=head1 Hash Definitions




=head2 Data::Edit::Xml::Xref Definition


Attributes used by the Xref cross referencer.


B<attributeCount> - {file}{attribute} == count of the different xml attributes found in the xml files.

B<badBookMaps> - Bad book maps

B<badConRefs> - [file, href]   Invalid conref attributes.

B<badImageRefs> - Consolidated images missing.

B<badTopicRefs> - [file, href]   Invalid href attributes found on topicref tags.

B<badXRefs> - [file, href]   Invalid href attributes found on xref tags.

B<badXml1> - [Files] with a bad xml encoding header on the first line.

B<badXml2> - [Files] with a bad xml doc type on the second line.

B<conRefs> - {file}{href}   Count of conref definitions in each file.

B<docType> - {file} == docType:  the docType for each xml file.

B<duplicateIds> - [file, id]     Duplicate id definitions within each file.

B<duplicateTopicIds> - [topicId, [files]] Files with duplicate topic ids - the id on the outermost tag.

B<fixBadRefs> - Try to fix bad references in L<these files|/fixedFiles> where possible.

B<fixedFiles> - Files that have a missing conref or href which can be ameliorated by renaming the failing attribute to  "xtrf".

B<goodBookMaps> - Good book maps

B<goodConRefs> - Good con refs

B<goodImageRefs> - Consolidated images found.

B<goodTopicRefs> - Good topic refs

B<goodXRefs> - Good xrefs

B<ids> - {file}{id}     Id definitions across all files.

B<images> - {file}{href}   Count of image references in each file.

B<inputFiles> - Input files from L<inputFolder|/inputFolder>.

B<inputFolder> - A folder containing the dita and ditamap files to be cross referenced.

B<maximumNumberOfProcesses> - Maximum number of processes to run in parallel at any one time.

B<missingImageFiles> - [file, href] == Missing images in each file.

B<missingTopicIds> - Missing topic ids

B<notReferenced> - files in input area that are not referenced by a conref, image, topicref or xref tag and are not a bookmap.

B<parseFailed> - [file] files that failed to parse

B<reports> - Reports folder: the cross referencer will write reports to files in this folder.

B<results> - Summary of results table

B<sourceFile> - The source file from which this structure was generated

B<statusLine> - Status line summarizing the cross reference. The counts listed in the status line are the counts of the files that have the described problems not a count of all the instances of all the problems in all the files.

B<summary> - Print the summary line.

B<tagCount> - {file}{tags} == count of the different tag names found in the xml files.

B<topicIds> - {file} = topic id - the id on the outermost tag.

B<topicRefs> - {file}{href}++ References from bookmaps to topics via appendix, chapter, topicref.

B<validationErrors> - True means that Lint detected errors in the xml contained in the file

B<xRefs> - {file}{href}++ Xrefs references.



=head1 Private Methods

=head2 loadInputFiles($)

Load the names of the files to be processed

     Parameter  Description
  1  $xref      Cross referencer

=head2 analyzeOneFile($)

Analyze one input file

     Parameter  Description
  1  $iFile     File to analyze

=head2 fixOneFile($$)

Fix one input file

     Parameter  Description
  1  $xref      Xref results
  2  $file      File to fix

=head2 analyze($)

Analyze the input files

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportDuplicateIds($)

Report duplicate ids

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportDuplicateTopicIds($)

Report duplicate topic ids

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportBadRefs($$)

Report bad references found in xrefs or conrefs as they have the same structure

     Parameter  Description
  1  $xref      Cross referencer
  2  $type      Type of reference to be processed

=head2 reportBadXrefs($)

Report bad xrefs

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportBadTopicRefs($)

Report bad topic refs

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportBadConrefs($)

Report bad conrefs

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportImages($)

Reports on images and references to images

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportParseFailed($)

Report failed parses

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportBadXml1($)

Report bad xml on line 1

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportBadXml2($)

Report bad xml on line 2

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportDocTypeCount($)

Report doc type count

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportTagCount($)

Report tag counts

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportAttributeCount($)

Report attribute counts

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportValidationErrors($)

Report the files known to have validation errors

     Parameter  Description
  1  $xref      Cross referencer

=head2 checkBookMap($$)

Check whether a bookmap is valid or not

     Parameter  Description
  1  $xref      Cross referencer
  2  $bookMap   Bookmap

=head2 reportBookMaps($)

Report on whether each bookmap is good or bad

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportFileExtensionCount($)

Report file extension counts

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportFileTypes($)

Report file type counts - takes too long in series

     Parameter  Description
  1  $xref      Cross referencer

=head2 reportNotReferenced($)

Report files not referenced by any of conref, image, topicref, xref and are not bookmaps.

     Parameter  Description
  1  $xref      Cross referencer

=head2 createSampleInputFiles($)

Create sample input files for testing. The files are written to the folder B<in>; the parameter gives the number of numbered sample topics to create.

     Parameter  Description
  1  $N         Number of sample files


=head1 Index


1 L<analyze|/analyze> - Analyze the input files

2 L<analyzeOneFile|/analyzeOneFile> - Analyze one input file

3 L<checkBookMap|/checkBookMap> - Check whether a bookmap is valid or not

4 L<createSampleInputFiles|/createSampleInputFiles> - Create sample input files for testing.

5 L<fixOneFile|/fixOneFile> - Fix one input file

6 L<loadInputFiles|/loadInputFiles> - Load the names of the files to be processed

7 L<reportAttributeCount|/reportAttributeCount> - Report attribute counts

8 L<reportBadConrefs|/reportBadConrefs> - Report bad conrefs

9 L<reportBadRefs|/reportBadRefs> - Report bad references found in xrefs or conrefs as they have the same structure

10 L<reportBadTopicRefs|/reportBadTopicRefs> - Report bad topic refs

11 L<reportBadXml1|/reportBadXml1> - Report bad xml on line 1

12 L<reportBadXml2|/reportBadXml2> - Report bad xml on line 2

13 L<reportBadXrefs|/reportBadXrefs> - Report bad xrefs

14 L<reportBookMaps|/reportBookMaps> - Report on whether each bookmap is good or bad

15 L<reportDocTypeCount|/reportDocTypeCount> - Report doc type count

16 L<reportDuplicateIds|/reportDuplicateIds> - Report duplicate ids

17 L<reportDuplicateTopicIds|/reportDuplicateTopicIds> - Report duplicate topic ids

18 L<reportFileExtensionCount|/reportFileExtensionCount> - Report file extension counts

19 L<reportFileTypes|/reportFileTypes> - Report file type counts - takes too long in series

20 L<reportImages|/reportImages> - Reports on images and references to images

21 L<reportNotReferenced|/reportNotReferenced> - Report files not referenced by any of conref, image, topicref, xref and are not bookmaps.

22 L<reportParseFailed|/reportParseFailed> - Report failed parses

23 L<reportTagCount|/reportTagCount> - Report tag counts

24 L<reportValidationErrors|/reportValidationErrors> - Report the files known to have validation errors

25 L<xref|/xref> - Check the cross references in a set of Dita files held in  L<inputFolder|/inputFolder> and report the results in the L<reports|/reports> folder.

=head1 Installation

This module is written in 100% Pure Perl and, thus, it is easy to read,
comprehend, use, modify and install via B<cpan>:

  sudo cpan install Data::Edit::Xml::Xref

=head1 Author

L<philiprbrenan@gmail.com|mailto:philiprbrenan@gmail.com>

L<http://www.appaapps.com|http://www.appaapps.com>

=head1 Copyright

Copyright (c) 2016-2018 Philip R Brenan.

This module is free software. It may be used, redistributed and/or modified
under the same terms as Perl itself.

=cut



# Tests and documentation

# Run the Perl code held in this package's DATA section, if there is any.
# Returns 1 after the code has run, or returns early when the DATA handle is
# already at end of file. Dies with the error if reading or running the code fails.
sub test
 {my $p = __PACKAGE__;
  binmode($_, ":utf8") for *STDOUT, *STDERR;                                    # Apply the :utf8 layer to STDOUT and STDERR
  return if eval "eof(${p}::DATA)";                                             # Nothing to run if the DATA handle is at end of file
  my $s = eval "join('', <${p}::DATA>)";                                        # Read everything remaining in the DATA section into one string
  $@ and die $@;
  eval $s;                                                                      # Execute the DATA section as Perl code within this sub
  $@ and die $@;                                                                # Rethrow any error raised by the executed code
  1
 }

test unless caller;                                                             # Run the tests only when there is no caller, as when this file is run directly

1;
# podDocumentation
__DATA__
# Test script: builds the sample input files, cross references them and checks
# the resulting status line. On Windows a single placeholder test is run instead.
use warnings FATAL=>qw(all);
use strict;
use Test::More tests=>1;

my $windows = $^O =~ m(MSWin32)is;                                              # True if the operating system name matches MSWin32
my $mac     = $^O =~ m(darwin)is;                                               # True if the operating system name matches darwin - not used below

Test::More->builder->output("/dev/null")                                        # Show only errors during testing
  if ((caller(1))[0]//'Data::Edit::Xml::Xref') eq "Data::Edit::Xml::Xref";      # Applies when the package at caller(1) is undefined or is this module

if (!$windows)
 {my $N = 8;                                                                    # Number of numbered sample files to create
  clearFolder(q(reports), 32);
  createSampleInputFiles($N);
  my $x = xref(inputFolder=>q(in), fixBadRefs=>1, maximumNumberOfProcesses=>2); #Txref
  ok nws($x->statusLine) eq nws(<<END);
Xref:
 10 bad first lines,
 10 bad second lines,
  9 bad conrefs,
  9 bad xrefs,
  8 duplicate ids,
  8 missing image files,
  8 missing image references,
  3 bad topicrefs,
  2 duplicate topic ids,
  1 bad book map,
  1 file failed to parse,
  1 file not referenced
END
 }
else
 {ok 1 for 1..1;                                                                # Pass the single planned test without running the cross referencer
 }

1


