Hi Rob -
It seems that PDF::API2, for some reason, returns Unicode characters in attribute values in byte-swapped order.
Here's how I changed your code to compensate for that.
#!perl
use strict;
use PDF::API2 1.19; # Revision 1.19 2004/03/20 08:38:38 fredo added isEncrypted determinator
use Unicode::String qw(utf8);
use Encode;
use LWP::UserAgent;
use HTTP::Request::Common;
use HTTP::Response;
# Write report text to STDOUT through the :utf8 layer.
binmode(STDOUT, ':utf8');

# Switch read by do_get: true fetches over HTTP, false reads a local file
# named after the last path segment of the URL.
use constant { GET_FROM_HTTP => 1 };

# Single user agent shared by every request in this script.
my $ua = LWP::UserAgent->new(agent => 'PDFInspector/0.2 (email@host.com)');

# Documents to inspect:
#   ann-rep-whsi.pdf    contains wide characters
#   coop21-art-dis.pdf
#   2006-01-13.pdf      is encrypted
my @url_list = (
    'http://www.gs.gov.nl.ca/ohs/pdf/ann-rep-whsi.pdf',
    'http://www.gs.gov.nl.ca/cca/cr/pdf/coop/coop21-art-dis.pdf',
    'http://www.gs.gov.nl.ca/misc/data/gazette/wk/2006-01-13.pdf',
);
# For every URL in @url_list: fetch the document, parse it as a PDF and print
# its Title/Author/Subject/Creator/Producer fields, one labelled line each,
# followed by a blank line. A document that cannot be fetched or parsed gets
# an "Error:" line instead, and the loop carries on with the next URL.
foreach my $pdf_url (@url_list) {
    my ($pdf_doc, $pdf_status) = do_get($pdf_url, "Accept-Language" => "en");

    printf("%-10s %s\n", "URL:", $pdf_url);

    # Never hand an error page (or nothing at all) to the PDF parser.
    my $fetched_ok = defined $pdf_status
        && $pdf_status =~ /\A2\d\d\z/
        && defined $pdf_doc
        && length $pdf_doc;
    unless ($fetched_ok) {
        my $status_text = defined $pdf_status ? $pdf_status : 'unknown';
        printf("%-10s %s\n\n", "Error:",
            "could not retrieve document (status $status_text)");
        next;
    }

    # Parsing may die (or yield nothing) on input that is not a usable PDF;
    # trap that so one bad document does not abort the whole run.
    my $pdf = eval { PDF::API2->openScalar($pdf_doc) };
    unless ($pdf) {
        my $reason = $@ || 'unknown error';
        chomp $reason;
        printf("%-10s %s\n\n", "Error:", "could not parse PDF: $reason");
        next;
    }

    my %pdf_info = $pdf->info();

    # The encryption state belongs to the document, not to a single
    # attribute, so ask once per document.
    my $is_encrypted = $pdf->isEncrypted;

    foreach my $attribute (qw(Title Author Subject Creator Producer)) {
        my $value = $pdf_info{$attribute};
        printf("%-10s ", $attribute . ":");
        if ($is_encrypted) {
            print "[encrypted]";
        }
        elsif (!defined $value) {
            # Attribute absent from the info dictionary: leave the value blank.
        }
        elsif (Encode::is_utf8($value)) {
            # Unicode bug in PDF-API2? Byte order is swapped.
            print utf8($value)->byteswap();
        }
        else {
            print $value;
        }
        print "\n";
    }
    print "\n";
}
sub str2hex {
    # Render a string as space-separated hex codes, two digits minimum per
    # character, each followed by one space. Newline characters are passed
    # through unchanged; an undefined argument is returned as-is.
    my ($text) = @_;
    return $text unless defined $text;
    return join '',
        map { $_ eq "\n" ? $_ : sprintf('%02x ', ord $_) }
        split //, $text;
}
sub do_get {
    # Retrieve a document and return the list (content, status, undef, undef).
    #
    #   $url      - address of the document
    #   @headers  - optional Name => Value pairs sent as HTTP request headers
    #
    # When GET_FROM_HTTP is true the document is fetched with the shared $ua
    # and the status is the HTTP response code. Otherwise the last path
    # segment of $url is read as a local file and the status is always 200
    # (read_file dies if the file cannot be read).
    my ($url, @headers) = @_;
    if (GET_FROM_HTTP) {
        # Forward the caller's headers; they used to be silently dropped.
        my $response = $ua->request(GET($url, @headers));
        return ($response->content, $response->code, undef, undef);
    }
    else {
        # Keep only what follows the final "/" and treat it as a file name.
        (my $file_name = $url) =~ s/^.*\///;
        my $content = read_file($file_name);
        return ($content, 200, undef, undef);
    }
}
sub read_file {
    # Read an entire file in binary mode and return its contents as a string.
    # Modified code from Perl Slurp-Eaze by Uri Guttman
    # http://www.perl.com/pub/a/2003/11/21/slurp.html
    #
    #   $file_name - path of the file to read
    #
    # Returns the file contents; an empty file yields the empty string.
    # Dies if the file cannot be opened, a read fails, or the file ends
    # before the size reported when it was opened has been read.
    my ($file_name) = @_;
    my $buf = '';

    # Lexical handle: nothing global is left open once this sub returns.
    open(my $fh, '<', $file_name) or die "Can't open $file_name: $!";
    binmode($fh);

    my $size_left = -s $fh;
    while (defined $size_left && $size_left > 0) {
        # Append at the current end of the buffer.
        my $read_cnt = sysread($fh, $buf, $size_left, length $buf);

        # undef is a genuine I/O error; 0 is end-of-file, where $! is not set.
        die "read error in file $file_name: $!" unless defined $read_cnt;
        die "read error in file $file_name: unexpected end of file"
            if $read_cnt == 0;

        $size_left -= $read_cnt;
    }
    close($fh);
    return $buf;
}
__END__
# Results look like...
URL: http://www.gs.gov.nl.ca/ohs/pdf/ann-rep-whsi.pdf
Title: (OHS).PDF
Author: JDutton
Subject:
Creator: A:\annual report 2000 (OHS).wpd
Producer: Acrobat PDFWriter 4.0 for Windows
URL: http://www.gs.gov.nl.ca/cca/cr/pdf/coop/coop21-art-dis.pdf
Title: The Co-Operatives Act (Form 21)
Author: Commercial Registrations
Subject: Articles of Dissolution (Section 116, 117, 118)
Creator:
Producer: Acrobat Distiller 4.05 for Windows
URL: http://www.gs.gov.nl.ca/misc/data/gazette/wk/2006-01-13.pdf
Title: [encrypted]
Author: [encrypted]
Subject: [encrypted]
Creator: [encrypted]
Producer: [encrypted]
|