Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Benutze Poppler fuer PDF-Verarbeitung beim ZUGFeRD-Import #300

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions SL/Controller/ZUGFeRD.pm
Original file line number Diff line number Diff line change
Expand Up @@ -132,25 +132,26 @@ sub action_import_zugferd {
die t8("missing file for action import") unless $file;
die t8("can only parse a pdf or xml file") unless $file =~ m/^%PDF|<\?xml/;

# save the zugferd file to session file for reuse in ap.pl
my $session_file = SL::SessionFile->new($file_name, mode => 'w');
$session_file->fh->print($file);
$session_file->fh->close;

if ( $::form->{file} =~ m/^%PDF/ ) {
%res = %{SL::ZUGFeRD->extract_from_pdf($file)};
%res = %{SL::ZUGFeRD->extract_from_pdf($session_file->file_name)};
} else {
%res = %{SL::ZUGFeRD->extract_from_xml($file)};
}

if ($res{'result'} != SL::ZUGFeRD::RES_OK()) {
# An error occurred; log message from parser:
unlink($session_file->file_name);
die(t8("Could not extract Factur-X/ZUGFeRD data, data and error message:") . " $res{'message'}");
}

my $form_defaults = $self->build_ap_transaction_form_defaults(\%res);

# save the zugferd file to session file for reuse in ap.pl
my $session_file = SL::SessionFile->new($file_name, mode => 'w');
$session_file->fh->print($file);
$session_file->fh->close;
$form_defaults->{zugferd_session_file} = $file_name;

$form_defaults->{callback} = $self->url_for(action => 'upload_zugferd');

$self->redirect_to(
Expand Down
181 changes: 44 additions & 137 deletions SL/ZUGFeRD.pm
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ use strict;
use warnings;
use utf8;

use PDF::API2;
use Data::Dumper;
use File::Temp;
use File::Slurp qw(slurp);
use List::Util qw(first);
use Poppler;
use XML::LibXML;

use SL::Locale::String qw(t8);
Expand Down Expand Up @@ -44,82 +45,54 @@ sub convert_customer_setting {
}

sub _extract_zugferd_invoice_xml {
my $doc = shift;
my %res_fail;

# unfortunately PDF::API2 has no public facing api to access the actual pdf name dictionaries
# so we need to use the internal data, just like with PDF::CAM before
#
# PDF::API2 will internally read $doc->{pdf}{Root}{Names} for us, but after that every entry
# in the tree may be an indirect object (Objind) before realising it.
#
# The actual embedded files will be located at $doc->{pdf}{Root}{Names}{EmbeddedFiles}
#

my $node = $doc->{pdf};
for (qw(Root Names EmbeddedFiles)) {
$node = $node->{$_};
if (!ref $node) {
return {
result => RES_ERR_NO_ATTACHMENT(),
message => "unexpected unbless node while trying to access $_ node",
}
}
if ('PDF::API2::Basic::PDF::Objind' eq ref $node) {
$node->realise;
}
# after realising it should be a Dict
if ('PDF::API2::Basic::PDF::Dict' ne ref $node) {
return {
result => RES_ERR_NO_ATTACHMENT(),
message => "unexpected node type [@{[ref($node)]}] after realising $_ node",
}
}
}
my $file = shift;
my (%res_fail, @res, @attachments);

my $userspath = SL::System::Process::exe_dir() . "/" . $::lx_office_conf{paths}->{userspath};

# now we have an array of possible attachments
my @agenda = $node;
my $pdf = eval { Poppler::Document->new_from_file($file) };

my $parser; # SL::XMLInvoice object used as return value
my @res; # Temporary storage for error messages encountered during
# attempts to process attachments.
unless ( $pdf ) { return {
'result' => RES_ERR_NO_ATTACHMENT(),
'message' => "Cannot open PDF file: $!",
};
}

my $tmpfile = File::Temp->new(
'zugferd-import-XXXXXXXX',
DIR => $userspath,
UNLINK => 1,
);

# Hardly ever more than single leaf, but...
@attachments = eval { $pdf->get_attachments };

while (@agenda) {
my $item = shift @agenda;
if ( scalar @attachments == 0 ) {
return {
'result' => RES_ERR_NO_ATTACHMENT(),
'message' => "PDF file does not have any attachments.",
};
}

if ($item->{Kids}) {
my @kids = $item->{Kids}->realise->elements;
push @agenda, @kids;
my $i = 0;

foreach my $attachment ( @attachments ) {
my $retval = eval { Poppler::Attachment::save($attachment, $tmpfile->filename) };
unless ( $retval ) {
$i++;
next;
}
my $xml = slurp($tmpfile->filename);
my $parser = SL::XMLInvoice->new($xml);
if ( $parser->{result} == SL::XMLInvoice::RES_OK ){
return $parser;
} else {
my @names = $item->{Names}->realise->elements;

TRY_NEXT:
while (@names) {
my ($k, $v) = splice @names, 0, 2;
my $fnode = $v->realise->{EF}->realise->{F}->realise;

$fnode->read_stream(1);

my $content = $fnode->{' stream'};

$parser = SL::XMLInvoice->new($content);

# Caveat: this will only ever catch the first attachment looking like
# an XML invoice.
if ( $parser->{status} == SL::XMLInvoice::RES_OK ){
return $parser;
} else {
push @res, t8(
"Could not parse PDF embedded attachment #1: #2",
$k,
$parser->{result}
);
}
}
push @res, t8(
"Could not parse PDF embedded attachment #1: #2",
$i,
$parser->{result}
);
}
$i++;
}

# There's going to be at least one attachment that failed to parse as XML by
Expand All @@ -134,78 +107,17 @@ sub _extract_zugferd_invoice_xml {
return \%res_fail;
}

sub _get_xmp_metadata {
my ($doc) = @_;

$doc->xmpMetadata;
}

sub extract_from_pdf {
my ($self, $file_name) = @_;
my @warnings;

my $pdf_doc = PDF::API2->openScalar($file_name);

if (!$pdf_doc) {
return {
result => RES_ERR_FILE_OPEN,
message => $::locale->text('The file \'#1\' could not be opened for reading.', $file_name),
};
}

my $xmp = _get_xmp_metadata($pdf_doc);

if (!defined $xmp) {
push @warnings, $::locale->text('The file \'#1\' does not contain the required XMP meta data.', $file_name);
} else {
my $dom = eval { XML::LibXML->load_xml(string => $xmp) };

push @warnings, $::locale->text('Parsing the XMP metadata failed.'), if !$dom;

my $xpc = XML::LibXML::XPathContext->new($dom);
$xpc->registerNs('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#');

my $zugferd_version;

my $test = $xpc->findnodes('/x:xmpmeta/rdf:RDF/rdf:Description');

foreach my $node ($xpc->findnodes('/x:xmpmeta/rdf:RDF/rdf:Description')) {
my $ns = first { ref($_) eq 'XML::LibXML::Namespace' } $node->attributes;
next unless $ns;

if ($ns->getData =~ m{urn:zugferd:pdfa:CrossIndustryDocument:invoice:2p0}) {
$zugferd_version = 'zugferd:2p0';
last;
}

if ($ns->getData =~ m{urn:factur-x:pdfa:CrossIndustryDocument:invoice:1p0}) {
$zugferd_version = 'factur-x:1p0';
last;
}

if ($ns->getData =~ m{zugferd|factur-x}i) {
$zugferd_version = 'unsupported';
last;
}
}

if (!$zugferd_version) {
push @warnings, $::locale->text('The XMP metadata does not declare the Factur-X/ZUGFeRD data.'),
}

if ($zugferd_version eq 'unsupported') {
push @warnings, $::locale->text('The Factur-X/ZUGFeRD version used is not supported.'),
}
}

my $invoice_xml = _extract_zugferd_invoice_xml($pdf_doc);
my $invoice_xml = _extract_zugferd_invoice_xml($file_name);

my %res;

%res = (
result => $invoice_xml->{result},
message => $invoice_xml->{message},
metadata_xmp => $xmp,
invoice_xml => $invoice_xml,
warnings => \@warnings,
);
Expand All @@ -223,7 +135,6 @@ sub extract_from_xml {
%res = (
result => $invoice_xml->{result},
message => $invoice_xml->{message},
metadata_xmp => undef,
invoice_xml => $invoice_xml,
warnings => (),
);
Expand Down Expand Up @@ -297,8 +208,6 @@ Other than that, the hash ref contains the following keys:

=item C<message> - An error message detailing the problem upon nonzero C<result>, undef otherwise.

=item C<metadata_xmp> - The XMP metadata extracted from the Factur-X/ZUGFeRD invoice (if present)

=item C<invoice_xml> - An SL::XMLInvoice object holding the data extracted from the parsed XML invoice.

=item C<warnings> - Warnings encountered upon extracting/parsing XML files (if any)
Expand Down Expand Up @@ -331,8 +240,6 @@ Other than that, the hash ref contains the following keys:

=item C<message> - An error message detailing the problem upon nonzero C<result>, undef otherwise.

=item C<metadata_xmp> - Always undef and only present to let downstream code expecting its presence fail gracefully.

=item C<invoice_xml> - An SL::XMLInvoice object holding the data extracted from the parsed XML invoice.

=item C<warnings> - Warnings encountered upon extracting/parsing XML data (if any)
Expand Down
2 changes: 1 addition & 1 deletion bin/mozilla/ap.pl
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ sub load_zugferd {
if $::form->{record_template_id};
$template_ap ||= SL::DB::Manager::RecordTemplate->get_first(where => [vendor_id => $::form->{form_defaults}->{vendor_id}])
if $::form->{form_defaults}->{vendor_id};
if ($template_ap) {
if ($template_ap and $template_ap->items->[0]) {
$::form->{id} = $template_ap->id;
# set default values for items
my $template_item = $template_ap->items->[0];
Expand Down