#!/usr/local/bin/perl
#
# Copyright (C) 2006 Instituto Tecnologico y de Estudios Superiores de Monterrey.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

$VERSION = '1.0';

use 5.8.0;
use strict;

my $__usage__ = qq(A simple script to automatically produce sitemaps for a webserver,
in the Google Sitemap Protocol (GSP).

Usage: perl sitemap_gen.pl --config=config.xml [--help] [--testing]
       --config=config.xml, specifies config file location
       --help, displays usage message
       --testing, specified when user is experimenting
);

# Avoid warnings from XML::SAX
BEGIN { $ENV{HARNESS_ACTIVE} = 1 }
my $DOWARN = 1;
my $WARNING = 0;
BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN; $WARNING = 1; } }

# Required Perl modules
my @REQUIRED_MODULES = (
  "Digest::MD5",   # Perl interface to the MD5 Message-Digest Algorithm
  "Encode",        # Character encodings
  "File::Find",    # Traverse a directory tree
  "File::Glob",    # Perl extension for BSD glob routine
  "File::Spec",    # Portably perform operations on file names
  "Getopt::Long",  # Extended processing of command line options
  "LWP::Simple",   # Simple procedural interface to LWP
  "URI::URL",      # Uniform Resource Locators
  "URI::Escape",   # Escape and unescape unsafe characters
  "XML::SAX"       # Simple API for XML
);

# Validate that required modules are installed
foreach my $module (@REQUIRED_MODULES) {
  eval "use $module";
  die "[ERROR] Perl Module '$module' is required but not installed.\n" .
      "See README for installation notes.\n" if $@;
}

# Boolean and other variables
my $True = 1;
my $False = 0;
my $None = 2;

# Command flags
my %flags;

# Text encodings
my $ENC_ASCII = 'ASCII';
my $ENC_UTF8 = 'UTF-8';
my @ENC_ASCII_LIST = ('ASCII', 'US-ASCII', 'US', 'IBM367', 'CP367',
                      'ISO646-US', 'ISO_646.IRV:1991', 'ISO-IR-6',
                      'ANSI_X3.4-1968', 'ANSI_X3.4-1986', 'CPASCII');
my @ENC_DEFAULT_LIST = ('ISO-8859-1', 'ISO-8859-2', 'ISO-8859-5');

# Maximum number of URLs in each sitemap, before the next Sitemap is created
my $MAXURLS_PER_SITEMAP = 50000;

# Suffix on a Sitemap index file
my $SITEINDEX_SUFFIX = '_index.xml';

# Regular expression tried for extracting URLs from access logs.
my $ACCESSLOG_CLF_PATTERN = qr{.+\s+"([^\s]+)\s+([^\s]+)\s+HTTP/\d+\.\d+"\s+200\s+.*};

# Match patterns for lastmod attributes
my @LASTMOD_PATTERNS = (
  qr{^\d\d\d\d$},
  qr{^\d\d\d\d-\d\d$},
  qr{^\d\d\d\d-\d\d-\d\d$},
  qr{^\d\d\d\d-\d\d-\d\dT\d\d:\d\dZ$},
  qr{^\d\d\d\d-\d\d-\d\dT\d\d:\d\d[+-]\d\d:\d\d$},
  qr{^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?Z$},
  qr{^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?[+-]\d\d:\d\d$}
);

# Match patterns for changefreq attributes
my @CHANGEFREQ_PATTERNS = ('always', 'hourly', 'daily', 'weekly', 'monthly',
                           'yearly', 'never');

# XML formats
my $SITEINDEX_HEADER = qq(<?xml version="1.0" encoding="UTF-8"?>\n) .
                       qq(<sitemapindex xmlns="http://www.google.com/schemas/sitemap/0.84">\n);
my $SITEINDEX_FOOTER = "</sitemapindex>\n";
my $SITEINDEX_ENTRY  = qq( <sitemap>\n) .
                       qq(  <loc>LOC</loc>\n) .
                       qq(  <lastmod>LASTMOD</lastmod>\n) .
                       qq( </sitemap>\n);
my $SITEMAP_HEADER   = qq(<?xml version="1.0" encoding="UTF-8"?>\n) .
                       qq(<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">\n);
my $SITEMAP_FOOTER   = "</urlset>\n";
my $SITEURL_XML_PREFIX = " <url>\n";
my $SITEURL_XML_SUFFIX = " </url>\n";

# Search engines to notify with the updated sitemaps
# This list is very non-obvious in what's going on.
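# (Added illustration, hypothetical sitemap location: for a Sitemap at
# http://www.example.com/sitemap.xml, the entry below makes NotifySearch()
# fetch the ping URL
#   http://www.google.com/webmasters/sitemaps/ping?sitemap=http%3A%2F%2Fwww.example.com%2Fsitemap.xml
# i.e. scheme://netloc/path with the 'sitemap' query attribute set to the
# escaped Sitemap URL.)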
# Here's the gist: each entry holds the five components of a parsed URL
# (scheme, netloc, path, query, fragment).  Two exceptions: 'query' is a
# map rather than a string, and 'sitemap' names the query attribute that
# should be set to the new Sitemap URL.
my @NOTIFICATION_SITES = ({
  scheme   => 'http',
  netloc   => 'www.google.com',
  path     => 'webmasters/sitemaps/ping',
  query    => {},        # <-- EXCEPTION: specify a query map rather than a string
  fragment => '',
  sitemap  => 'sitemap'  # <-- query attribute that should be set to the new Sitemap URL
});

my $encoder = Encoder->new();
my $output = Output->new();

{package Encoder;
# Manages wide-character/narrow-character conversions.
# General high-level methodologies used in sitemap_gen:
# [PATHS]
#  File system paths may be wide or narrow, depending on platform.
#  This class has MaybeNarrowPath() which should be called on every
#  file system path you deal with.
# [URLS]
#  URL locations are stored in Narrow form, already escaped.  This has the
#  benefit of keeping escaping and encoding as close as possible to the
#  format we read them in.
# [OTHER]
#  Other text, such as attributes of the URL class, configuration options,
#  etc, are generally stored in Unicode for simplicity.
sub new {
  my($class) = @_;
  my $self = {};
  $self->{_user} = undef;       # User-specified default encoding
  $self->{_learned} = [];       # Learned default encodings
  $self->{_widefiles} = $False; # File system can be wide
  bless($self, $class);

  # Can the file system be Unicode?
  if($^O eq "MSWin32"){ # Windows
    my $version_Windows_NT = 2;
    $self->{_widefiles} = Win32::GetOSVersion() == $version_Windows_NT;
  }
  elsif($^O eq "darwin"){ # Mac OS
    $self->{_widefiles} = $True;
  }

  # Try to guess a working default
  my $encoding = undef;
  eval{
    # Windows
    if($^O eq "MSWin32"){
      $encoding = "mbcs";
    }
    # Mac OS
    elsif($^O eq "darwin"){
      $encoding = "utf-8";
    }
    # Unix and Unix-like platforms
    elsif($^O eq "aix" || $^O eq "bsdos" || $^O eq "dgux" ||
          $^O eq "dynixptx" || $^O eq "freebsd" ){
      $encoding = 'nl_langinfo(CODESET)';
    }
    elsif($^O eq "linux" || $^O eq "hpux" || $^O eq "irix" ||
          $^O eq "machten" || $^O eq "next" || $^O eq "openbsd" ){
      $encoding = 'nl_langinfo(CODESET)';
    }
    elsif($^O eq "dec_osf" || $^O eq "svr4" || $^O eq "sco_sv" ||
          $^O eq "svr4" || $^O eq "unicos" || $^O eq "unicosmk" ){
      $encoding = 'nl_langinfo(CODESET)';
    }
    elsif($^O eq "solaris" || $^O eq "sunos"){
      $encoding = 'nl_langinfo(CODESET)';
    }
    my $encoding_in_ascii_list = $False;
    if($encoding){
      foreach my $myencoding (@ENC_ASCII_LIST) {
        if($myencoding eq uc($encoding)) {
          $encoding_in_ascii_list = $True;
          last;
        }
      }
      if(!$encoding_in_ascii_list) {
        @{$self->{_learned}} = ($encoding);
      }
    }
  };
  if($@){}

  # If we had no guesses, start with some European defaults
  if(!@{$self->{_learned}}){
    @{$self->{_learned}} = @ENC_DEFAULT_LIST;
  }
  return $self;
} # end new

sub SetUserEncoding {
  my($self, $encoding) = @_;
  $self->{_user} = $encoding;
} # end SetUserEncoding

sub NarrowText {
  my($self, $text, $encoding) = @_;
  # Narrow a piece of arbitrary text

  # Return if not Unicode
  $DOWARN = 0;
  my @chars = unpack("U0U*", $text);
  $DOWARN = 1;
  if($WARNING) {
    $WARNING = 0;
    return $text;
  }
  if($^O ne "MSWin32") {
    return $text;
  }

  # Look through learned defaults, knock any failing ones out of the list
  while(@{$self->{_learned}}){
    my $result = undef;
    eval{
      if($self->{_learned}->[0] eq "mbcs"){
        eval "use Win32::MBCS";
        if($@){
          $output->Error("Perl Module Win32::MBCS is not installed on your computer and it is required to run this script.");
          # Propagate the failure so "mbcs" is knocked out of the list
          # below instead of being retried forever.
          die $@;
        }
        Win32::MBCS::Utf8ToLocal($text);
        $result = $text;
      }
      else{
        $result = Encode::encode($self->{_learned}->[0], $text);
      }
    };
    if($@){
      shift @{$self->{_learned}};
      $result = undef;
    }
    if($result){ return $result; }
  }

  # Try the passed in preference
  if($encoding) {
    my $result = undef;
    eval {
      $result = Encode::encode($encoding, $text);
      my $enc_in_learned = $False;
      foreach my $enc (@{$self->{_learned}}) {
        if($enc eq $encoding) {
          $enc_in_learned = $True;
          last;
        }
      }
      if(!$enc_in_learned){
        push(@{$self->{_learned}}, $encoding);
      }
    };
    if($@) {
      $output->Warn('Unknown encoding: '. $encoding);
    }
    if($result){ return $result; }
  }

  # Try the user preference
  if($self->{_user}){
    my $result = undef;
    eval{ $result = Encode::encode($self->{_user}, $text); };
    if($@){
      my $temp = $self->{_user};
      $self->{_user} = undef;
      $output->Warn('Unknown default_encoding: '. $temp);
    }
    if($result){ return $result; }
  }

  # When all other defaults are exhausted, use UTF-8
  my $result = undef;
  eval{ $result = Encode::encode_utf8($text); };
  if($@){ }
  if($result){ return $result; }

  # Something is seriously wrong if we get to here
  return Encode::encode($ENC_ASCII, $text, undef);
} # end NarrowText

sub MaybeNarrowPath {
  my($self, $text) = @_;
  # Paths may be allowed to stay wide
  return $self->NarrowText($text, undef);
} # end MaybeNarrowPath
} # end package Encoder

{package Output;
# Exposes logging functionality, and tracks how many errors
# we have thus output.
# Logging levels should be used as follows:
#   Fatal   -- extremely sparingly
#   Error   -- config errors, entire blocks of user 'intention' lost
#   Warn    -- individual URLs lost
#   Log(,0) -- Un-suppressable text that's not an error
#   Log(,1) -- touched files, major actions
#   Log(,2) -- parsing notes, filtered or duplicated URLs
#   Log(,3) -- each accepted URL
sub new {
  my $class = shift;
  my $self = {};
  $self->{num_errors} = 0;     # Count of errors
  $self->{num_warns} = 0;      # Count of warnings
  $self->{_errors_shown} = {}; # Shown errors
  $self->{_warns_shown} = {};  # Shown warnings
  $self->{_verbose} = 0;       # Level of verbosity
  bless($self, $class);
  return $self;
} # end new

sub Log {
  my($self, $text, $level) = @_;
  # Output a blurb of diagnostic text, if the verbose level allows it.
  if($text) {
    if($self->{_verbose} >= $level) {
      print $text."\n";
    }
  }
} # end Log

sub Warn {
  my($self, $text) = @_;
  # Output and count a warning.  Suppress duplicate warnings.
  if($text) {
    my $md5 = Digest::MD5->new();
    $md5->add($text);
    my $hash = $md5->digest();
    if(!exists($self->{_warns_shown}->{$hash})) {
      $self->{_warns_shown}->{$hash} = 1;
      print '[WARNING] '.$text."\n";
    }
    else {
      $self->Log('(suppressed) [WARNING] '.$text, 3);
    }
    $self->{num_warns}++;
  }
} # end Warn

sub Error {
  my($self, $text) = @_;
  # Output and count an error.  Suppress duplicate errors.
  if($text) {
    my $md5 = Digest::MD5->new();
    $md5->add($text);
    my $hash = $md5->digest();
    if(!exists($self->{_errors_shown}->{$hash})) {
      $self->{_errors_shown}->{$hash} = 1;
      print '[ERROR] ' . $text . "\n";
    }
    else {
      $self->Log('(suppressed) [ERROR] ' . $text, 3);
    }
    $self->{num_errors}++;
  }
} # end Error

sub Fatal {
  my($self, $text) = @_;
  # Output an error and terminate the program.
  if($text) {
    die '[FATAL] ' . $text . "\n";
  }
  else {
    die "Fatal error.\n";
  }
} # end Fatal

sub SetVerbose {
  my ($self, $level) = @_;
  # Sets the verbose level.
  if (($level >= 0) && ($level <= 3)) {
    $self->{_verbose} = $level;
    return;
  }
  else {
    $self->Error("Verbose level $level must be between 0 and 3 inclusive.");
  }
} # end SetVerbose
} # end package Output

{package URL;
# URL is a smart structure grouping together the properties we
# care about for a single web reference.
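# Added illustration (hypothetical values): a fully populated URL is written
# into the Sitemap by WriteXML() below as
#   <url>
#    <loc>http://www.example.com/docs/</loc>
#    <lastmod>2006-04-20T06:17:04Z</lastmod>
#    <changefreq>weekly</changefreq>
#    <priority>0.5</priority>
#   </url>
# Only 'loc' is required; the other three attributes are emitted when set.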
my @__slots__ = ('loc', 'lastmod', 'changefreq', 'priority');

sub new {
  my $class = shift;
  my $self = {};
  $self->{loc} = undef;        # URL -- in Narrow characters
  $self->{lastmod} = undef;    # ISO8601 timestamp of last modify
  $self->{changefreq} = undef; # Text term for update frequency
  $self->{priority} = undef;   # Float between 0 and 1 (inc)
  bless($self, $class);
  return $self;
} # end new

sub __cmp__ {
  my($self, $other) = @_;
  # Compare by location -- locs are text, so compare as strings
  if($self->{loc} lt $other->{loc}) { return -1; }
  if($self->{loc} gt $other->{loc}) { return 1; }
  return 0;
} # end __cmp__

sub TrySetAttribute {
  my ($self, $attribute, $value) = @_;
  # Attempt to set the attribute to the value, with a pretty try
  # block around it.
  if($attribute eq 'loc') {
    $self->{loc} = $self->Canonicalize($value);
  }
  else {
    if(exists($self->{$attribute})) {
      $self->{$attribute} = $value;
    }
    else {
      $output->Warn("Unknown URL attribute: $attribute");
    }
  }
} # end TrySetAttribute

sub IsAbsolute {
  my($self, $loc) = @_;
  # Decide if the URL is absolute or not
  if(!$loc){ return $False; }
  my $narrow = $loc;
  my ($volume, $directories, $file) = File::Spec->splitpath($narrow);
  my $scheme_pos = index($directories, "://");
  if($scheme_pos == -1) { return $False; }
  return $True;
} # end IsAbsolute

sub Canonicalize {
  my($self, $loc) = @_;
  # Do encoding and canonicalization on a URL string
  if(!$loc){ return $loc; }

  # Let the encoder try to narrow it
  my $narrow = $encoder->NarrowText($loc, undef);

  # Do URL escaping to URL components
  use URI::URL;
  my $url = new URI::URL $narrow;

  # Make IDNA encoding on the netloc
  my $netloc = URI::Escape::uri_unescape($url->netloc);
  my @netloc = split(//, $netloc);
  my $IDN_hostname = $False;
  foreach my $char (@netloc) {
    if($char ge chr(128)) {
      $IDN_hostname = $True;
      last;
    }
  }
  if($IDN_hostname) {
    eval "use IDNA::Punycode";
    if($@) {
      $output->Error("An International Domain Name (IDN) is being used. Perl Module IDNA::Punycode is required to encode this kind of hostname, but it is not installed on your computer. See installation notes in README file.");
      return $loc;
    }
    my @hostname_labels = split(/\./, $netloc);
    $netloc = "";
    foreach my $label (@hostname_labels) {
      $label = IDNA::Punycode::encode_punycode($label);
      $netloc .= $label.".";
    }
    # Remove last '.' added
    $netloc = substr($netloc, 0, length($netloc) - 1);
  }
  $url->netloc($netloc);

  my $bad_netloc = $False;
  if(index($url->netloc, '%') != -1) {
    $bad_netloc = $True;
  }

  # Put it all back together
  if($netloc) {
    $narrow = $url->as_string;
  }

  # I let '%' through.  Fix any that aren't pre-existing escapes.
  my $HEXDIG = '0123456789abcdefABCDEF';
  my @list = split('%', $narrow);
  $narrow = shift(@list);
  foreach my $item (@list){
    if((length($item) >= 2) &&
       (index($HEXDIG, substr($item, 0, 1)) != -1) &&
       (index($HEXDIG, substr($item, 1, 1)) != -1)) {
      $narrow = $narrow ."%". $item;
    }
    else {
      $narrow = $narrow ."%25". $item;
    }
  }

  # Issue a warning if this is a bad URL
  if($bad_netloc){
    $output->Warn("Invalid characters in the host or domain portion of a URL: ". $narrow);
  }
  return $narrow;
} # end Canonicalize

sub Validate {
  my($self, $base_url, $allow_fragment) = @_;
  # Verify the data in this URL is well-formed, and override if not.

  # Test (and normalize) the ref
  if(!$self->{loc}) {
    $output->Warn("Empty URL");
    return $False;
  }
  if($allow_fragment) {
    my $endswith_slash = (substr($base_url, length($base_url) - 1) eq "/");
    my $startswith_slash = (substr($self->{loc}, 0, 1) eq "/");
    if($endswith_slash && $startswith_slash) {
      $self->{loc} = substr($self->{loc}, 1);
    }
    if(substr($self->{loc}, 0, length($base_url)) ne $base_url) {
      $self->{loc} = join "", $base_url, $self->{loc};
    }
  }
  if(substr($self->{loc}, 0, length($base_url)) ne $base_url) {
    $output->Warn("Discarded URL for not starting with the base_url: " . $self->{loc});
    $self->{loc} = undef;
    return $False;
  }

  # Test the lastmod
  if ($self->{lastmod}) {
    my $match = $False;
    $self->{lastmod} = uc($self->{lastmod});
    foreach my $pattern (@LASTMOD_PATTERNS){
      if($self->{lastmod} =~ /$pattern/) {
        $match = $True;
        last;
      }
    }
    if(!$match) {
      $output->Warn("Lastmod \"". $self->{lastmod} .
                    "\" does not appear to be in ISO8601 format on URL: " . $self->{loc});
      $self->{lastmod} = undef;
    }
  }

  # Test the changefreq
  if($self->{changefreq}) {
    my $match = $False;
    $self->{changefreq} = lc($self->{changefreq});
    foreach my $pattern (@CHANGEFREQ_PATTERNS) {
      if($self->{changefreq} eq $pattern){
        $match = $True;
        last;
      }
    }
    if(!$match) {
      $output->Warn("Changefreq \"" . $self->{changefreq} .
                    "\" is not a valid change frequency on URL: " . $self->{loc});
      $self->{changefreq} = undef;
    }
  }

  # Test the priority
  if($self->{priority}){
    my $test_priority = $self->{priority};
    if(!($test_priority =~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/)) {
      $output->Warn("Priority \"" . $self->{priority} .
                    "\" is not a valid number on URL: " . $self->{loc});
      $self->{priority} = undef;
    }
    elsif(($self->{priority} < 0.0) || ($self->{priority} > 1.0)){
      $output->Warn("Priority \"" . $self->{priority} .
                    "\" is not a number between 0 and 1 inclusive on URL: " . $self->{loc});
      $self->{priority} = undef;
    }
  }
  return $True;
} # end Validate

sub MakeHash {
  my($self) = @_;
  # Provides a uniform way of hashing URLs
  if(!$self->{loc}) { return undef; }
  my $md5 = Digest::MD5->new();
  if(substr($self->{loc}, length($self->{loc}) - 1) eq '/') {
    $md5->add(substr($self->{loc}, 0, length($self->{loc}) - 1));
    return $md5->digest();
  }
  $md5->add($self->{loc});
  return $md5->digest();
} # end MakeHash

sub Log {
  my($self, $prefix, $level) = @_;
  # Dump the contents, empty or not, to the log
  if(!$prefix) { $prefix = "URL"; }
  if(!$level) { $level = 3; }
  my $out = $prefix . ':';
  foreach my $attribute (@__slots__){
    my $value = $self->{$attribute};
    if(!$value) { $value = ''; }
    $out .= " $attribute=[$value]";
  }
  $output->Log($out, $level);
} # end Log

sub WriteXML {
  my($self, $file, $is_gzip) = @_;
  if(!$self->{loc}){ return; }
  my $out = $SITEURL_XML_PREFIX;
  foreach my $attribute (@__slots__){
    my $value = $self->{$attribute};
    if($value) {
      # Entity escaping if necessary.  Other characters were escaped using URL escaping
      $value =~ s/&/&amp;/g;
      $out .= "  <$attribute>$value</$attribute>\n";
    }
  }
  $out = $out . $SITEURL_XML_SUFFIX;
  if($is_gzip) {
    eval "use Compress::Zlib";
    if($@) {
      $output->Error("Perl Module 'Compress::Zlib' required for compression/decompression.\nSee README installation notes.\n");
      return $False;
    }
    my $gz = $file;
    $gz->gzwrite($out);
  }
  else {
    print $file $out;
  }
} # end WriteXML
} # end package URL

{package Filter;
# A filter on the stream of URLs we find.  A filter is, in essence,
# a wildcard applied to the stream.
You can think of this as an # operator that returns a tri-state when given a URL: # # (0) False -- this URL is to be dropped from the sitemap # (1) True -- this URL is to be included in the sitemap # (2) None -- this URL is undecided sub new { my($class, $attributes) = @_; my $self = {}; bless($self, $class); $self->{_wildcard} = undef; # Pattern for wildcard match $self->{_regexp} = undef; # Pattern for regexp match $self->{_pass} = $False; # "Drop" filter vs. "Pass" filter my @attributes = keys %{$attributes}; foreach my $attr (@attributes) { # Attributes have the format '{}pattern', '{}type', etc. Remove '{}' from them. $attr = substr($attr, 2); } my @goodattributes = ('pattern', 'type', 'action'); if(!::ValidateAttributes('FILTER', \@attributes, \@goodattributes)) { return; } # Check error count on the way in my $num_errors = $output->{num_errors}; # Fetch the attributes my $pattern = $attributes->{'{}pattern'}->{Value}; my $type = $attributes->{'{}type'}->{Value}; if(!$type) { $type = 'wildcard'; } my $action = $attributes->{'{}action'}->{Value}; if(!$action) { $action = 'drop'; } $type = lc($type); $action = lc($action); # Verify the attributes if(!$pattern) { $output->Error("On a filter you must specify a \"pattern\" to match"); } elsif((!$type) || (($type ne 'wildcard') && ($type ne 'regexp'))) { $output->Error('On a filter you must specify either \'type = "wildcard"\' or \'type = "regexp"\''); } elsif(($action ne 'pass') && ($action ne 'drop')) { $output->Error('If you specify a filter action, it must be either \'action = "pass"\' or \'action = "drop"\''); } # Set the rule if($action eq 'drop') { $self->{_pass} = $False; } elsif($action eq 'pass') { $self->{_pass} = $True; } if($type eq 'wildcard') { # Convert wildcard to regular expression my %patmap = ( '*' => '.*', '?' => '.', '[' => '[', ']' => ']', ); my $glob = $pattern; $glob =~ s{(.)} { $patmap{$1} || "\Q$1" }ge; $self->{_wildcard} = '^'.$glob.'$'; } elsif($type eq 'regexp') { eval { $self->{_regexp} = qr{$pattern}; }; if($@) { $output->Error("Bad regular expression: $pattern"); } } # Log the final results if we didn't add any errors if($num_errors == $output->{num_errors}){ $output->Log("Filter: $action any URL that matches $type \"$pattern\"", 2); } return $self; } # end new sub Apply { my($self, $url) = @_; # Process the URL, as above. if(!$url || !$url->{loc}) { return undef; } if($self->{_wildcard}) { if($url->{loc} =~ m/$self->{_wildcard}/) { return $self->{_pass}; } return $None; } if($self->{_regexp}) { my $pattern = $self->{_regexp}; if($url->{loc} =~ /$pattern/){ return $self->{_pass}; } return $None; } die if not $False; } # end Apply } # end package Filter {package InputURL; # Each Input class knows how to yield a set of URLs from a data source. # This one handles a single URL, manually specified in the config file. sub new { my($class, $attributes) = @_; my $self = {}; bless($self, $class); $self->{_url} = undef; # The lonely URL my @attributes = keys %{$attributes}; foreach my $attr (@attributes) { # Attributes have the format '{}href', '{}lastmod', etc. Remove '{}' from them. 
    $attr = substr($attr, 2);
  }
  my @goodattributes = ('href', 'lastmod', 'changefreq', 'priority');
  if(!::ValidateAttributes('URL', \@attributes, \@goodattributes)) {
    return;
  }
  my $url = URL->new();
  foreach my $attr (@attributes){
    if($attr eq 'href') {
      $url->TrySetAttribute('loc', $attributes->{"{}".$attr}->{Value});
    }
    else {
      $url->TrySetAttribute($attr, $attributes->{"{}".$attr}->{Value});
    }
  }
  if(!$url->{loc}){
    $output->Error('Url entries must have an href attribute.');
    return;
  }
  $self->{_url} = $url;
  $output->Log("Input: From URL \"" . $self->{_url}->{loc} . "\"", 2);
  return $self;
} # end new

sub ProduceURLs {
  my($self, $sitemap_obj, $consumer) = @_;
  # Produces URLs from our data source, hands them in to the consumer.
  if($self->{_url}) {
    $sitemap_obj->$consumer($self->{_url}, $True);
  }
} # end ProduceURLs
} # end package InputURL

{package InputURLList;
# Each Input class knows how to yield a set of URLs from a data source.
# This one handles a text file with a list of URLs
sub new {
  my($class, $attributes) = @_;
  my $self = {};
  bless($self, $class);
  $self->{_path} = undef;
  $self->{_encoding} = undef;
  my @attributes = keys %{$attributes};
  foreach my $attr (@attributes) {
    # Attributes have the format '{}path', '{}encoding', etc.  Remove '{}' from them.
    $attr = substr($attr, 2);
  }
  my @goodattributes = ('path','encoding');
  if(!::ValidateAttributes('URLLIST', \@attributes, \@goodattributes)) {
    return;
  }
  $self->{_path} = $attributes->{'{}path'};
  $self->{_encoding} = $attributes->{'{}encoding'};
  if(!$self->{_encoding}) { $self->{_encoding} = $ENC_UTF8; }
  if($self->{_path}){
    $self->{_path} = $encoder->MaybeNarrowPath($self->{_path});
    if(-e $self->{_path}) {
      $output->Log("Input: From URLLIST \"". $self->{_path} . "\"", 2);
    }
    else{
      $output->Error("Can not locate file: " . $self->{_path});
      $self->{_path} = undef;
    }
  }
  else{
    $output->Error("Urllist entries must have a \"path\" attribute");
  }
  return $self;
} # end new

sub ProduceURLs {
  my($self, $sitemap_obj, $consumer) = @_;
  # Produces URLs from our data source, hands them in to the consumer.

  # Open the file
  my $opened_file = ::OpenFileForRead($self->{_path}, 'URLLIST');
  if(!$opened_file) { return; }
  my $file = $opened_file->{file};

  # Iterate lines
  my @file = ();
  if(!$opened_file->{is_gzip}) {
    @file = <$file>;
  }
  else {
    eval "use Compress::Zlib";
    if($@) {
      $output->Error("Perl Module 'Compress::Zlib' required for compression/decompression.\nSee README installation notes.");
      return $False;
    }
    local *FILE = $file;
    my $gz = gzopen(*FILE, "rb");
    my $line;
    while($gz->gzreadline($line)) {
      push(@file, $line);
    }
    $gz->gzclose;
  }
  my $linenum = 0;
  foreach my $line (@file){
    $linenum++;
    # Strip comments and empty lines
    $line = ::StripString($line);
    if((!$line) || substr($line, 0, 1) eq '#') { next; }

    # Split the line on space
    my $url = URL->new();
    my @cols = split(/ /, $line);
    my $size = @cols;
    for(my $i = 0; $i < $size; $i++){
      $cols[$i] = ::StripString($cols[$i]);
    }
    $url->TrySetAttribute('loc', $cols[0]);

    # Extract attributes from the other columns
    for(my $i = 1; $i < $size; $i++){
      if($cols[$i]) {
        my($attr_name, @attr_values);
        eval {
          ($attr_name, @attr_values) = split('=', $cols[$i]);
          my $attr_val = shift @attr_values;
          my $attr_values_len = @attr_values;
          if($attr_values_len > 0) {
            # The value itself contained '='; glue it back together
            foreach my $val (@attr_values) {
              $attr_val .= "=" . $val;
            }
          }
          $url->TrySetAttribute($attr_name, $attr_val);
        };
        if($@) {
          $output->Warn("Line $linenum: Unable to parse attribute: " . $cols[$i]);
        }
      }
    }

    # Pass it on
    $sitemap_obj->$consumer($url, $False);
  }
  close($file);
} # end ProduceURLs
} # end package InputURLList

{package InputDirectory;
# Each Input class knows how to yield a set of URLs from a data source.
# This one handles a directory that acts as base for walking the filesystem.
sub new {
  my($class, $attributes, $base_url) = @_;
  my $self = {};
  bless($self, $class);
  $self->{_path} = undef;         # The directory
  $self->{_url} = undef;          # The URL equivalent
  $self->{_default_file} = undef;
  my @attributes = keys %{$attributes};
  foreach my $attr (@attributes) {
    # Attributes have the format '{}path', '{}url', etc.  Remove '{}' from them.
    $attr = substr($attr, 2);
  }
  my @goodattributes = ('path', 'url', 'default_file');
  if(!::ValidateAttributes('DIRECTORY', \@attributes, \@goodattributes)) {
    return;
  }

  # Prep the path -- it MUST end in a sep
  my $path = $attributes->{"{}path"}->{Value};
  if(!$path){
    $output->Error('Directory entries must have both "path" and "url" attributes');
    return;
  }
  $path = $encoder->MaybeNarrowPath($path);
  my $os_sep;
  if($^O eq "MSWin32") { $os_sep = "\\"; }
  elsif($^O eq "darwin") { $os_sep = ":"; }
  else { $os_sep = "/"; }
  if(substr($path, length($path) - 1) ne $os_sep){
    $path .= $os_sep;
  }
  if(!(-d $path)){
    $output->Error("Can not locate directory: " . $path);
    return;
  }

  # Prep the URL -- it MUST end in a sep
  my $url = $attributes->{"{}url"}->{Value};
  if(!$url){
    $output->Error('Directory entries must have both "path" and "url" attributes');
    return;
  }
  $url = URL->Canonicalize($url);
  if(substr($url, length($url) - 1) ne '/'){
    $url .= '/';
  }
  if(substr($url, 0, length($base_url)) ne $base_url) {
    $output->Error("The directory URL \"" . $url .
                   "\" is not relative to the base_url: " . $base_url);
    return;
  }

  # Prep the default file -- it MUST be just a filename
  my $file = $attributes->{"{}default_file"}->{Value};
  if($file) {
    $file = $encoder->MaybeNarrowPath($file);
    if(index($file, $os_sep) != -1) {
      $output->Error("The default_file \"" . $file .
                     "\" can not include path information.");
      $file = undef;
    }
  }

  $self->{_path} = $path;
  $self->{_url} = $url;
  $self->{_default_file} = $file;
  if($file) {
    $output->Log("Input: From DIRECTORY \"$path\" ($url) with default file \"$file\"", 2);
  }
  else {
    $output->Log("Input: From DIRECTORY \"$path\" ($url) with no default file", 2);
  }
  return $self;
} # end new

sub ProduceURLs {
  our($self, $sitemap_obj, $consumer) = @_;
  # Produces URLs from our data source, hands them in to the consumer.
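  # Added illustration (hypothetical paths): with path="/var/www/docs/" and
  # url="http://www.example.com/docs/", the walk below maps
  #   /var/www/docs/a/b.html  -->  http://www.example.com/docs/a/b.html
  # and a subdirectory /var/www/docs/a/ becomes .../docs/a/ with a trailing
  # slash.  If default_file="index.html" is set, index.html entries are
  # suppressed and their directory URL takes the file's timestamp instead.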
if(!$self->{_path}) { return; } our $root_path = $self->{_path}; our $root_URL = $self->{_url}; our $root_file = $self->{_default_file}; sub PerFile { # Called once per file # Pull a timestamp my $url = URL->new(); my $isdir = $False; my $time = undef; my $dirpath = $File::Find::dir; my $name = $_; my $path = undef; eval { if($name ne '.') { $path = join '/', $dirpath, $name; } else { $path = $dirpath; } $isdir = -d $path; if($isdir) { if(!PerDirectory($dirpath)) { return; } } if($isdir && $root_file) { my $file = join '/', $path, $root_file; eval { $time = (stat($file))[9]; }; if($@) { } } if(!$time) { $time = (stat($path))[9]; } $url->{lastmod} = ::TimestampISO8601($time); }; if($@) { } # Build a URL my $middle = ''; if(length($path) > length($root_path)) { my $is_win = $^O eq "MSWin32"; $middle = substr($path, length($root_path) + $is_win); } my $os_sep; if($^O eq "MSWin32") { $os_sep = "\\"; } elsif($^O eq "darwin") { $os_sep = ":"; } else { $os_sep = "/"; } if($os_sep ne '/') { $middle =~ tr|:|/|; $middle =~ tr|\\|/|; } if($isdir && $name ne '.'){ $middle .= '/'; } $url->TrySetAttribute('loc', $root_URL . $middle); # Suppress default files. (All the way down here so we can log it.) if(($name ne '.') && ($root_file eq $name)){ $url->Log('IGNORED(default file)', 2); return; } $sitemap_obj->$consumer($url, $False); } # end PerFile sub PerDirectory { my($dirpath) = @_; # Called once per directory my $os_sep; if($^O eq "MSWin32") { $os_sep = "\\"; } elsif($^O eq "darwin") { $os_sep = ":"; } else { $os_sep = "/"; } if(substr($dirpath, length($dirpath) - 1) ne $os_sep){ $dirpath .= $os_sep; } if(substr($dirpath, 0, length($root_path)) ne $root_path) { $output->Warn('Unable to decide what the root path is for directory ' . $dirpath); return $False; } return $True; } # end PerDirectory $output->Log("Walking DIRECTORY \"" . $self->{_path} . "\"", 1); ::find(\&PerFile, $self->{_path}); } # end ProduceURLs } # end package InputDirectory {package InputAccessLog; # Each Input class knows how to yield a set of URLs from a data source. # This one handles access logs. It's non-trivial in that we want to # auto-detect log files in the Common Logfile Format (as used by Apache, # for instance) and the Extended Log File Format (as used by IIS, for # instance). sub new { my($class, $attributes) = @_; my $self = {}; bless($self, $class); $self->{_path} = undef; # The file path $self->{_encoding} = undef; # Encoding of that file $self->{_is_elf} = $False; # Extended Log File Format? $self->{_is_clf} = $False; # Common Logfile Format? $self->{_elf_status} = -1; # ELF field: '200' $self->{_elf_method} = -1; # ELF field: 'HEAD' $self->{_elf_uri} = -1; # ELF field: '/foo?bar=1' $self->{_elf_urifrag1} = -1; # ELF field: '/foo' $self->{_elf_urifrag2} = -1; # ELF field: 'bar=1' my @attributes = keys %{$attributes}; foreach my $attr (@attributes) { # Attributes have the format '{}path', '{}encoding', etc. Remove '{}' from them. $attr = substr($attr, 2); } my @goodattributes = ('path', 'encoding'); if(!::ValidateAttributes('ACCESSLOG', \@attributes, \@goodattributes)) { return; } $self->{_path} = $attributes->{'{}path'}; $self->{_encoding} = $attributes->{'{}encoding'}; if(!$self->{_encoding}) { $self->{_encoding} = $ENC_UTF8; } if($self->{_path}) { $self->{_path} = $encoder->MaybeNarrowPath($self->{_path}); if(-e $self->{_path}) { $output->Log("Input: From ACCESSLOG \"" . $self->{_path} . "\"", 2); } else { $output->Error("Can not locate file: " . 
$self->{_path});
      $self->{_path} = undef;
    }
  }
  else {
    $output->Error("Accesslog entries must have a \"path\" attribute.");
  }
  return $self;
} # end new

sub RecognizeELFLine {
  my($self, $line) = @_;
  # Recognize the Fields directive that heads an ELF file
  if(substr($line, 0, 8) ne '#Fields:'){
    return $False;
  }
  my @fields = split(' ', $line);
  shift(@fields);
  my $fields_len = @fields;
  for(my $i = 0; $i < $fields_len; $i++){
    my $field = $fields[$i];
    # Strip field
    $field = ::StripString($field);
    if ($field eq 'sc-status'){ $self->{_elf_status} = $i; }
    elsif ($field eq 'cs-method'){ $self->{_elf_method} = $i; }
    elsif ($field eq 'cs-uri'){ $self->{_elf_uri} = $i; }
    elsif ($field eq 'cs-uri-stem'){ $self->{_elf_urifrag1} = $i; }
    elsif ($field eq 'cs-uri-query'){ $self->{_elf_urifrag2} = $i; }
  }
  $output->Log('Recognized an Extended Log File Format file.', 2);
  return $True;
} # end RecognizeELFLine

sub GetELFLine{
  my($self, $line) = @_;
  # Fetch the requested URL from an ELF line
  my @fields = split(' ', $line);
  my $count = @fields;
  # Strip fields
  for(my $i = 0; $i < $count; $i++) {
    $fields[$i] = ::StripString($fields[$i]);
  }

  # Verify status was Ok
  if($self->{_elf_status} >= 0){
    if($self->{_elf_status} >= $count){ return undef; }
    my $field_strip_status = $fields[$self->{_elf_status}];
    if($field_strip_status ne '200'){ return undef; }
  }

  # Verify method was HEAD or GET
  if($self->{_elf_method} >= 0){
    if($self->{_elf_method} >= $count){ return undef; }
    my $field_strip_method = $fields[$self->{_elf_method}];
    if(($field_strip_method ne 'HEAD') && ($field_strip_method ne 'GET')){
      return undef;
    }
  }

  # Pull the full URL if we can
  if($self->{_elf_uri} >= 0){
    if($self->{_elf_uri} >= $count){ return undef; }
    my $url = $fields[$self->{_elf_uri}];
    if($url ne '-'){ return $url; }
  }

  # Put together a fragmentary URL
  if($self->{_elf_urifrag1} >= 0){
    if(($self->{_elf_urifrag1} >= $count) || ($self->{_elf_urifrag2} >= $count)){
      return undef;
    }
    my $urlfrag1 = $fields[$self->{_elf_urifrag1}];
    my $urlfrag2 = undef;
    if($self->{_elf_urifrag2} >= 0){
      $urlfrag2 = $fields[$self->{_elf_urifrag2}];
    }
    if($urlfrag1 && ($urlfrag1 ne '-')){
      if($urlfrag2 && ($urlfrag2 ne '-')){
        $urlfrag1 .= '?' . $urlfrag2;
      }
      return $urlfrag1;
    }
  }
  return undef;
} # end GetELFLine

sub RecognizeCLFLine{
  my($self, $line) = @_;
  # Try to tokenize a log file line according to CLF pattern and see if it works
  my $recognize = $False;
  $_ = $line;
  if(/$ACCESSLOG_CLF_PATTERN/) {
    $recognize = (($1 eq 'HEAD') || ($1 eq 'GET'));
    if($recognize){
      $output->Log('Recognized a Common Logfile Format file', 2);
    }
  }
  return $recognize;
} # end RecognizeCLFLine

sub GetCLFLine{
  my($self, $line) = @_;
  # Fetch the requested URL from a CLF line
  $_ = $line;
  if(/$ACCESSLOG_CLF_PATTERN/) {
    my $request = $1;
    if(($request eq 'HEAD') || ($request eq 'GET')) {
      return $2;
    }
  }
  return undef;
} # end GetCLFLine

sub ProduceURLs {
  my($self, $sitemap_obj, $consumer) = @_;
  # Produces URLs from our data source, hands them in to the consumer.
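  # Added illustration (hypothetical log lines): in the Common Logfile Format
  # a line such as
  #   127.0.0.1 - - [20/Apr/2006:06:17:04 -0700] "GET /docs/a.html HTTP/1.1" 200 5031
  # yields the URL path "/docs/a.html" (only 200-status GET/HEAD requests
  # count).  In the Extended Log File Format the "#Fields:" directive, e.g.
  #   #Fields: date time cs-method cs-uri-stem cs-uri-query sc-status
  # tells us which whitespace-separated columns to read.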
# Open the file my $opened_file = ::OpenFileForRead($self->{_path}, 'ACCESSLOG'); if(!$opened_file) { return; } my $file = $opened_file->{file}; # Iterate lines my @file = (); if(!$opened_file->{is_gzip}) { @file = <$file>; } else { eval "use Compress::Zlib"; if($@) { $output->Error("Perl Module 'Compress::Zlib' required for compression/decompression.\nSee README installation notes."); return $False; } local *FILE = $file; my $gz = gzopen(*FILE, "rb"); my $line; while($gz->gzreadline($line)) { push(@file, $line); } $gz->gzclose; } my $linenum = 0; foreach my $line (@file){ $linenum++; $line = ::StripString($line); # If we don't know the format yet, try them both if((!$self->{_is_clf}) && (!$self->{_is_elf})){ $self->{_is_elf} = $self->RecognizeELFLine($line); $self->{_is_clf} = $self->RecognizeCLFLine($line); } # Digest the line my $match = undef; if($self->{_is_elf}){ $match = $self->GetELFLine($line); } elsif($self->{_is_clf}){ $match = $self->GetCLFLine($line); } if(!$match){ next; } # Pass it on my $url = URL->new(); $url->TrySetAttribute('loc', $match); $sitemap_obj->$consumer($url, $True); } close($file); } # end ProduceURLs } # end package InputAccessLog {package InputSitemap; # Each Input class knows how to yield a set of URLs from a data source. # This one handles Sitemap files and Sitemap index files. For the sake # of simplicity in design (and simplicity in interfacing with the SAX # package), we do not handle these at the same time, recursively. Instead # we read an index file completely and make a list of Sitemap files, then # go back and process each Sitemap. {package _ContextBase; # Base class for context handlers in our SAX processing. A context # handler is a class that is responsible for understanding one level of # depth in the XML schema. The class knows what sub-tags are allowed, # and doing any processing specific for the tag we're in. # This base class is the API filled in by specific context handlers, # all defined below. sub new { my($class, $subtags) = @_; $class = ref($class) || $class; my $self = {}; bless($self, $class); # Initialize with a sequence of the sub-tags that would be valid in # this context. $self->{_allowed_tags} = $subtags; # Sequence of sub-tags we can have $self->{_last_tag} = undef; # Most recent seen sub-tag return $self; } # end new sub AcceptTag { my($self, $tag) = @_; # Returns True if opening a sub-tag is valid in this context. my $valid = $False; foreach my $allowed_tag (@{$self->{_allowed_tags}}) { if($tag eq $allowed_tag) { $valid = $True; last; } } if($valid) { $self->{_last_tag} = $tag; } else { $self->{_last_tag} = undef; } return $valid; } # end AcceptTag sub AcceptText { my($self, $text) = @_; # Returns True if a blurb of text is valid in this context. return $False; } # end AcceptText sub Open { my($self) = @_; # The context is opening. Do initialization. } # end Open sub Close { my($self) = @_; # The context is closing. Return our result, if any. } # end Close sub Return { my($self, $result) = @_; # We're returning to this context after handling a sub-tag. This # method is called with the result data from the sub-tag that just # closed. Here in _ContextBase, if we ever see a result it means # the derived child class forgot to override this method. if($result) { #raise NotImplementedError } } # end Return } # end package _ContextBase {package _ContextUrlSet; use base '_ContextBase'; # Context handler for the document node in a Sitemap. 
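# Added illustration: a Sitemap file is parsed against the three-level stack
#   _ContextUrlSet --> _ContextUrl --> _ContextValue
# so <urlset><url><loc>...</loc></url></urlset> opens one context per level
# of tag depth; index files use _ContextSitemapIndex / _ContextSitemap /
# _ContextValue in the same way (see ProduceURLs below).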
sub new { my($class) = @_; my @subtags = ('url'); my $self = $class->SUPER::new(\@subtags); bless($self, $class); return $self; } # end new } #end class _ContextUrlSet {package _ContextUrl; use base '_ContextBase'; # Context handler for a URL node in a Sitemap. my @URL__slots__ = ('loc', 'lastmod', 'changefreq', 'priority'); sub new { my($class, $sitemap_obj, $consumer) = @_; # Initialize this context handler with the callable consumer that # wants our URLs. my $self = $class->SUPER::new(\@URL__slots__); $self->{_url} = undef; # The URL object we're building $self->{_sitemap_obj} = $sitemap_obj; # The sitemap object to call the consumer $self->{_consumer} = $consumer; # Who wants to consume it bless($self, $class); return $self; } # end new sub Open { my($self) = @_; # Initialize the URL. die if $self->{_url}; $self->{_url} = URL->new(); } # end Open sub Close { my($self) = @_; # Pass the URL to the consumer and reset it to undef. die if not $self->{_url}; my $sitemap_obj = $self->{_sitemap_obj}; my $consumer = $self->{_consumer}; $sitemap_obj->$consumer($self->{_url}, $False); $self->{_url} = undef; } # end Close sub Return { my($self, $result) = @_; # A value context has closed, absorb the data it gave us. die if not $self->{_url}; if($result) { $self->{_url}->TrySetAttribute($self->{_last_tag}, $result); } } # end Return } # end package _ContextUrl {package _ContextSitemapIndex; use base '_ContextBase'; # Context handler for the document node in an index file. sub new { my($class) = @_; my @subtags = ('sitemap',); my $self = $class->SUPER::new(\@subtags); $self->{_loclist} = (); # List of accumulated Sitemap URLs bless($self, $class); return $self; } # end new sub Open { my($self) = @_; # Just a quick verify of state. die if $self->{_loclist}; } # end Open sub Close { my($self) = @_; # Return our list of accumulated URLs. if($self->{_loclist}) { my @temp = @{$self->{_loclist}}; $self->{_loclist} = (); return @temp; } } # end Close sub Return { my($self, $result) = @_; # Getting a new loc URL, add it to the collection. if($result) { push(@{$self->{_loclist}}, $result); } } # end Return } # end package _ContextSitemapIndex {package _ContextSitemap; use base '_ContextBase'; # Context handler for a Sitemap entry in an index file. sub new { my($class) = @_; my @subtags = ('loc', 'lastmod'); my $self = $class->SUPER::new(\@subtags); $self->{_loc} = undef; # The URL to the Sitemap bless($self, $class); return $self; } # end new sub Open { my($self) = @_; # Just a quick verify of state. die if $self->{_loc}; } # end Open sub Close { my($self) = @_; # Return our URL to our parent. if($self->{_loc}) { my $temp = $self->{_loc}; $self->{_loc} = undef; return $temp; } $output->Warn("In the Sitemap index file, a \"sitemap\" entry had no \"loc\"."); } # end Close sub Return { my($self, $result) = @_; # A value has closed. If it was a 'loc', absorb it. if($result && ($self->{_last_tag} eq 'loc')) { $self->{_loc} = $result; } } # end Return } # end package _ContextSitemap {package _ContextValue; use base '_ContextBase'; # Context handler for a single value. We return just the value. The # higher level context has to remember what tag led into us. sub new { my($class) = @_; my @subtags = (); my $self = $class->SUPER::new(\@subtags); $self->{_text} = undef; bless($self, $class); return $self; } # end new sub AcceptText { my($self, $text) = @_; # Allow all text, adding it to our buffer. 
if($self->{_text}) { $self->{_text} .= $text; } else { $self->{_text} = $text; } return $True; } # end AcceptText sub Open { my($self) = @_; # Initialize our buffer. $self->{_text} = undef; } # end Open sub Close { my($self) = @_; # Return what's in our buffer. my $text = $self->{_text}; $self->{_text} = undef; if($text) { # Remove spaces while(substr($text, 0, 1) eq ' ') { $text = substr($text, 1); } while(substr($text, length($text) - 1, 1) eq ' ') { $text = substr($text, 0, length($text) - 1); } } return $text; } # end Close } # end package _ContextValue sub new { my($class, $attributes) = @_; my $self = {}; bless($self, $class); # Initialize with a hash of attributes from our entry in the config file. $self->{_pathlist} = undef; # A list of files $self->{_current} = -1; # Current context in _contexts $self->{_contexts} = undef; # The stack of contexts we allow $self->{_contexts_idx} = undef; # ...contexts for index files $self->{_contexts_stm} = undef; # ...contexts for Sitemap files my @attributes = keys %{$attributes}; foreach my $attr (@attributes) { # Attributes have the format '{}path'. Remove '{}' from them. $attr = substr($attr, 2); } my @goodattributes = ('path'); if(!::ValidateAttributes('SITEMAP', \@attributes, \@goodattributes)) { return; } # Init the first file path my $path = $attributes->{'{}path'}; if($path) { $path = $encoder->MaybeNarrowPath($path); if(-e $path) { $output->Log("Input: From SITEMAP \"$path\"", 2); @{$self->{_pathlist}} = ($path); } else { $output->Error("Can not locate file $path"); } } else { $output->Error("Sitemap entries must have a \"path\" attribute."); } return $self; } #end new sub ProduceURLs { my($self, $sitemap_obj, $consumer) = @_; # In general: Produces URLs from our data source, hand them to the callable consumer. # In specific: Iterate over our list of paths and delegate the actual # processing to helper methods. This is a complexity no other data source # needs to suffer. We are unique in that we can have files that tell us # to bring in other files. # Note the decision to allow an index file or not is made in this method. # If we call our parser with (self._contexts == None) the parser will # grab whichever context stack can handle the file. IE: index is allowed. # If instead we set (self._contexts = ...) before parsing, the parser # will only use the stack we specify. IE: index not allowed. # Set up two stacks of contexts @{$self->{_contexts_idx}} = (_ContextSitemapIndex->new(), _ContextSitemap->new(), _ContextValue->new()); @{$self->{_contexts_stm}} = (_ContextUrlSet->new(), _ContextUrl->new($sitemap_obj, $consumer), _ContextValue->new()); # Process the first file die if not @{$self->{_pathlist}}; my $path = $self->{_pathlist}->[0]; $self->{_contexts} = undef; # We allow an index file here $self->_ProcessFile($path); # Iterate over remaining files @{$self->{_contexts}} = @{$self->{_contexts_stm}}; # No index files allowed my @pathlist = @{$self->{_pathlist}}; shift @pathlist; foreach my $path (@pathlist) { $self->_ProcessFile($path); } } #end ProduceURLs sub _ProcessFile { my($self, $path) = @_; # Do per-file reading/parsing/consuming for the file path passed in. 
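  # Added illustration (hypothetical locations): the first file may be a
  # Sitemap index such as
  #   <sitemapindex xmlns="http://www.google.com/schemas/sitemap/0.84">
  #    <sitemap><loc>http://www.example.com/sitemap1.xml</loc></sitemap>
  #    <sitemap><loc>http://www.example.com/sitemap2.xml</loc></sitemap>
  #   </sitemapindex>
  # in which case the accumulated <loc> values are munged into local file
  # paths (_MungeLocationListIntoFiles below) and each Sitemap is then
  # processed with index files disallowed.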
die if not $path; # Open our file my $opened_file = ::OpenFileForRead($path, 'SITEMAP'); if(!$opened_file) { return; } my $file = $opened_file->{file}; if($opened_file->{is_gzip}) { eval "use Compress::Zlib"; if($@) { $output->Error("Perl Module 'Compress::Zlib' required for compression/decompression.\nSee README installation notes."); return $False; } $path = substr($path, 0, length($path) - 3); open OUTPUT, "> $path"; local *FILE = $file; my $gz = gzopen(*FILE, "rb"); my($line, $out) = ("", ""); while($gz->gzreadline($line)) { $line =~ s/\n+$//; $line =~ s/\r+$//; $out .= $line; } print OUTPUT $out; close $file; close OUTPUT; } # Rev up the SAX engine eval { $self->{_current} = -1; my $xml_parser = XML::SAX::ParserFactory->parser( ContentHandler => $self ); $xml_parser->parse_uri($path); }; if($@) { if(index($@, 'LWP Request Failed') != -1 || index($@, 'Permission denied') != -1) { $output->Error("Cannot read from file $path"); } else { $@ =~ s/\n+$//; $@ =~ s/\r+$//; $output->Error("XML error in the file $path: $@"); } } # Clean up close($file); if($opened_file->{is_gzip}) { unlink($path); } } #end _ProcessFile sub _MungeLocationListIntoFiles { my($self, @urllist) = @_; # Given a list of URLs, munge them into our self._pathlist property. # We do this by assuming all the files live in the same directory as # the first file in the existing pathlist. That is, we assume a # Sitemap index points to Sitemaps only in the same directory. This # is not true in general, but will be true for any output produced # by this script. die if not @{$self->{_pathlist}}; my $path = $self->{_pathlist}->[0]; $path = File::Spec->canonpath($path); my($volume, $dir, $ignore); ($volume, $dir, $ignore) = File::Spec->splitpath($path); foreach my $url (@urllist) { $url = URL->Canonicalize($url); $output->Log("Index points to Sitemap file at: $url", 2); my $file; ($ignore, $ignore, $file) = File::Spec->splitpath($url); if($dir) { $file = File::Spec->catpath($volume, $dir, $file); } if($file) { push(@{$self->{_pathlist}}, $file); $output->Log("Will attempt to read Sitemap file: $file", 1); } } } #end _MungeLocationListIntoFiles sub start_element { my($self, $element) = @_; # SAX processing, called per node in the config stream. # As long as the new tag is legal in our current context, this # becomes an Open call on one context deeper. 
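  # Added illustration of the call sequence for " <url><loc>x</loc></url> ":
  #   start_element(url) -> _ContextUrl::Open
  #   start_element(loc) -> _ContextValue::Open
  #   characters("x")    -> _ContextValue::AcceptText
  #   end_element(loc)   -> _ContextValue::Close, then _ContextUrl::Return("x")
  #   end_element(url)   -> _ContextUrl::Close (hands the URL to the consumer)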
# If this is the document node, we may have to look for a context stack if(($self->{_current} < 0) && !$self->{_contexts}) { die if not($self->{_contexts_idx} && $self->{_contexts_stm}); if($element->{Name} eq 'urlset') { @{$self->{_contexts}} = @{$self->{_contexts_stm}}; } elsif($element->{Name} eq 'sitemapindex') { @{$self->{_contexts}} = @{$self->{_contexts_idx}}; $output->Log("File is a Sitemap index.", 2); } else { $output->Error("The document appears to be neither a Sitemap nor a Sitemap index."); } } # Compare stacks my $is_idx = $False; my $is_stm = $False; if($self->{_contexts} && (exists($self->{_contexts}->[1]->{_loc}))) { $is_idx = $True; } if($self->{_contexts} && (exists($self->{_contexts}->[1]->{_url}))) { $is_stm = $True; } # Display a kinder error on a common mistake if(($self->{_current} < 0) && $is_stm && ($element->{Name} eq 'sitemapindex')) { $output->Error("A Sitemap index can not refer to another Sitemap index."); } # Normalize hash of attributes my %attributes; foreach my $attr (keys %{$element->{Attributes}}) { $attributes{$attr} = $element->{Attributes}->{$attr}->{Value}; } # Verify no unexpected attributes if($element->{Attributes}) { my $text = ""; foreach my $attr (keys %attributes) { # The document node will probably have namespaces if($self->{_current} < 0) { if(index($attr, 'xmlns') >= 0) { next; } if(index($attr, 'xsi') >= 0) { next; } if($element->{Attributes}->{$attr}->{NamespaceURI}) { next; } } if($text) { $text .= ', '; } $text .= $attr; } if($text) { $output->Warn("Did not expect any attributes on any tag, instead tag \"" . $element->{Name} . "\" had attributes: $text"); } } # contexts if(($self->{_current} < 0) || ($self->{_contexts}->[$self->{_current}]->AcceptTag($element->{Name}))) { $self->{_current}++; my $contexts_len = @{$self->{_contexts}}; die if not($self->{_current} < $contexts_len); $self->{_contexts}->[$self->{_current}]->Open(); } else { $output->Error("Can not accept tag \"" . $element->{Name} . "\" where it appears."); } } #end start_element sub end_element { my($self, $element) = @_; # SAX processing, called per node in the config stream. # This becomes a call to Close on one context followed by a call # to Return on the previous. die if not $self->{_current} >= 0; # Compare stacks my $is_idx = $False; my $is_stm = $False; if($self->{_contexts} && (exists($self->{_contexts}->[1]->{_loc}))) { $is_idx = $True; } elsif($self->{_contexts} && (exists($self->{_contexts}->[1]->{_url}))) { $is_stm = $True; } my $retval = undef; my @retval = undef; if($is_idx && $self->{_current} == 0) { @retval = $self->{_contexts}->[$self->{_current}]->Close(); } else { $retval = $self->{_contexts}->[$self->{_current}]->Close(); } $self->{_current}--; if($self->{_current} >= 0) { $self->{_contexts}->[$self->{_current}]->Return($retval); } elsif($retval && ($is_idx)) { $self->_MungeLocationListIntoFiles($retval); } elsif(@retval && ($is_idx)) { $self->_MungeLocationListIntoFiles(@retval); } } #end end_element sub characters { my($self, $characters) = @_; # SAX processing, called when text values are read. Important to # note that one single text value may be split across multiple calls # of this method. if(($self->{_current} < 0) || (!$self->{_contexts}->[$self->{_current}]->AcceptText($characters->{Data}))) { # Strip text my $text_strip = $characters->{Data}; $text_strip = ::StripString($text_strip); if($text_strip) { $output->Error("Can not accept text \"" . $characters->{Data} . 
"\" where it appears."); } } } #end characters } # end package InputSitemap {package FilePathGenerator; # This class generates filenames in a series, upon request. # You can request any iteration number at any time, you don't # have to go in order. # Example of iterations for '/path/foo.xml.gz': # 0 --> /path/foo.xml.gz # 1 --> /path/foo1.xml.gz # 2 --> /path/foo2.xml.gz # _index.xml --> /path/foo_index.xml sub new { my $class = shift; my $self = {}; $self->{is_gzip} = 0; # Is this a GZIP file? $self->{_path} = undef; # '/path/' $self->{_abs_path} = undef; # absolute form of 'path' $self->{_prefix} = undef; # 'foo' $self->{_suffix} = undef; # '.xml.gz' bless($self, $class); return $self; } #end new sub Preload { my($self, $path) = @_; # Splits up a path into forms ready for recombination. $path = $encoder->MaybeNarrowPath($path); # Get down to a base name $path = File::Spec->canonpath($path); my($volume,$directories,$base) = File::Spec->splitpath($path); if(!$base) { $output->Error("Couldn\'t parse the file path: $path"); return $False; } my $lenbase = length($base); # Get absolute form for the given path -- which is relative to the config file my $absolute = $path; if(!File::Spec->file_name_is_absolute($path)){ my($volume, $abspath, $ignore) = File::Spec->splitpath(File::Spec->rel2abs($flags{'config'})); $directories = $abspath.$directories; $absolute = File::Spec->catpath($volume, $directories, $base); } $self->{_abs_path} = $absolute; # Recognize extension my $lensuffix = 0; my @compare_suffix = ('.xml', '.xml.gz', '.gz'); foreach my $suffix (@compare_suffix) { # if base ends with suffix: if(substr($base, length($base) - length($suffix), length($suffix)) eq $suffix) { $lensuffix = length($suffix); last; } } if(!$lensuffix) { $output->Error("The path $path doesn\'t end in a supported file extension."); return $False; } # Find out if base ends with '.gz': $self->{is_gzip} = substr($base, length($base) - 3) eq '.gz'; # Split the original path my $lenpath = length($path); $self->{_path} = substr($path, 0, $lenpath - $lenbase); $self->{_prefix} = substr($path, $lenpath - $lenbase, $lenbase - $lensuffix); $self->{_suffix} = substr($path, $lenpath - $lensuffix); return $True; } #end Preload sub GeneratePath { my($self, $instance) = @_; # Generates the iterations, as described above. my $prefix = $self->{_path} . $self->{_prefix}; if($instance =~ /^\d+$/) { if($instance) { return $prefix . $instance . $self->{_suffix}; } return $prefix . $self->{_suffix}; } return $prefix . $instance; } #end GeneratePath sub AbsolutePath { my($self, $path) = @_; # Gets the absolute form of a path that is relative to the config file. my($volume,$directories,$base) = File::Spec->splitpath($path); if(!File::Spec->file_name_is_absolute($path)){ my($volume, $abspath, $ignore) = File::Spec->splitpath($self->{_abs_path}); $directories = $abspath.$directories; $path = File::Spec->catpath($volume, $directories, $base); } return $path; } #end AbsolutePath sub GenerateURL { my($self, $instance, $root_url) = @_; # Generates iterations, but as a URL instead of a path. my $prefix = $root_url . $self->{_prefix}; my $retval = undef; if($instance =~ /^\d+$/) { if($instance) { $retval = $prefix . $instance . $self->{_suffix}; } else { $retval = $prefix . $self->{_suffix}; } } else { $retval = $prefix . $instance; } return URL->Canonicalize($retval); } #end GenerateURL sub GenerateWildURL { my($self, $root_url) = @_; # Generates a wildcard that should match all our iterations my $prefix = URL->Canonicalize($root_url . 
$self->{_prefix}); my $temp = URL->Canonicalize($prefix . $self->{_suffix}); my $suffix = substr($temp, length($prefix)); my $wild = $prefix . '*' . $suffix; # Convert wildcard to regular expression my %patmap = ( '*' => '.*', '?' => '.', '[' => '[', ']' => ']', ); $wild =~ s{(.)} { $patmap{$1} || "\Q$1" }ge; return '^'.$wild.'$'; } #end def GenerateWildURL } # end package FilePathGenerator {package PerURLStatistics; # Keep track of some simple per-URL statistics, like file extension. sub new { my $class = shift; my $self = {}; $self->{_extensions} = {}; # Count of extension instances bless($self, $class); return $self; } #end new sub Consume { my($self, $url) = @_; # Log some stats for the URL. At the moment, that means extension. my $path; if($url && $url->{loc}){ use URI::URL; my $uri_url = URI::URL->new($url->{loc}); $path = $uri_url->path; if(!$path) { return; } } # Recognize directories if(substr($path, length($path) - 1) eq '/') { if(exists($self->{_extensions}->{'/'})) { $self->{_extensions}->{'/'}++; } else { $self->{_extensions}->{'/'} = 1; } return; } # Strip to a filename my $i = rindex($path, '/'); if($i >= 0) { $path = substr($path, $i); } # Find extension $i = rindex($path, '.'); if($i > 0) { my $ext_without_case_shift = substr($path, $i); my $ext = lc($ext_without_case_shift); if(exists($self->{_extensions}->{$ext})) { $self->{_extensions}->{$ext}++; } else { $self->{_extensions}->{$ext} = 1; } } else { if(exists($self->{_extensions}->{'(no extension)'})) { $self->{_extensions}->{'(no extension)'}++; } else { $self->{_extensions}->{'(no extension)'} = 1; } } } #end Consume sub Log { my($self) = @_; # Dump out stats to the output. if($self->{_extensions}) { $output->Log("Count of file extensions on URLs:", 1); foreach my $ext (sort keys %{$self->{_extensions}}) { my $formatted = sprintf(" %7d", $self->{_extensions}->{$ext}); $output->Log($formatted . " $ext", 1); } } } #end Log } # end package PerURLStatistics {package Sitemap; # This is the big workhorse class that processes your inputs and spits # out sitemap files. It is built as a SAX handler for set up purposes. # That is, it processes an XML stream to bring itself up. sub new { my($class, $suppress_notify) = @_; my $self = {}; bless($self, $class); $self->{_filters} = (); # Filter objects $self->{_inputs} = (); # Input objects $self->{_urls} = {}; # Maps URLs to count of dups $self->{_set} = (); # Current set of URLs $self->{_filegen} = undef; # Path generator for output files $self->{_wildurl1} = undef; # Sitemap URLs to filter out $self->{_wildurl2} = undef; # Sitemap URLs to filter out $self->{_sitemaps} = 0; # Number of output files # We init _dup_max to 2 so the default priority is 0.5 instead of 1.0 $self->{_dup_max} = 2; # Max number of duplicate URLs $self->{_stat} = PerURLStatistics->new(); # Some simple stats $self->{_in_site} = $False; # SAX: are we in a Site node? $self->{_in_site_ever} = $False; # SAX: were we ever in a Site? $self->{_default_enc} = undef; # Best encoding to try on URLs $self->{_base_url} = undef; # Prefix to all valid URLs $self->{_store_into} = undef; # Output filepath $self->{_suppress} = $suppress_notify; # Suppress notify of servers return $self; } # end new sub ValidateBasicConfig { my($self) = @_; # Verifies (and cleans up) the basic user-configurable options. 
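  # Added illustration (hypothetical value): base_url="http://www.example.com"
  # is rejected if not absolute; otherwise it is canonicalized and a trailing
  # slash is appended, yielding "http://www.example.com/", so relative URLs
  # can be concatenated onto it safely.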
  my $all_good = $True;
  if($self->{_default_enc}) {
    $encoder->SetUserEncoding($self->{_default_enc});
  }

  # Canonicalize the base_url
  if($all_good && !$self->{_base_url}) {
    $output->Error("A site needs a \"base_url\" attribute.");
    $all_good = $False;
  }
  if($all_good && !URL->IsAbsolute($self->{_base_url})) {
    $output->Error("The \"base_url\" must be absolute, not relative: " . $self->{_base_url});
    $all_good = $False;
  }
  if($all_good) {
    $self->{_base_url} = URL->Canonicalize($self->{_base_url});
    if(substr($self->{_base_url}, length($self->{_base_url}) - 1) ne '/') {
      $self->{_base_url} .= '/';
    }
    $output->Log("BaseURL is set to: " . $self->{_base_url}, 2);
  }

  # Load store_into into a generator
  if($all_good) {
    if($self->{_store_into}) {
      $self->{_filegen} = FilePathGenerator->new();
      if(!$self->{_filegen}->Preload($self->{_store_into})) {
        $all_good = $False;
      }
    }
    else {
      $output->Error("A site needs a \"store_into\" attribute.");
      $all_good = $False;
    }
  }

  # Ask the generator for patterns on what its output will look like
  if($all_good) {
    $self->{_wildurl1} = $self->{_filegen}->GenerateWildURL($self->{_base_url});
    $self->{_wildurl2} = $self->{_filegen}->GenerateURL($SITEINDEX_SUFFIX, $self->{_base_url});
  }

  # Done
  if(!$all_good) {
    $output->Log("See \"example_config.xml\" for more information.", 0);
  }
  return $all_good;
} # end ValidateBasicConfig

sub Generate {
  my($self) = @_;
  # Run over all the Inputs and ask them to Produce.

  # Run the inputs
  foreach my $input (@{$self->{_inputs}}) {
    $input->ProduceURLs($self, \&ConsumeURL);
  }

  # Do last flushes
  if($self->{_set}) {
    if(@{$self->{_set}}) {
      $self->FlushSet();
    }
  }
  if(!$self->{_sitemaps}) {
    $output->Warn("No URLs were recorded, writing an empty sitemap.");
    $self->FlushSet();
  }

  # Write an index as needed
  if($self->{_sitemaps} > 1) {
    $self->WriteIndex();
  }

  # Notify
  $self->NotifySearch();

  # Dump stats
  $self->{_stat}->Log();
} # end Generate

sub ConsumeURL {
  my($self, $url, $allow_fragment) = @_;
  # All per-URL processing comes together here, regardless of Input.
  # Here we run filters, remove duplicates, spill to disk as needed, etc.
  if(!$url) { return; }

  # Validate
  if(!$url->Validate($self->{_base_url}, $allow_fragment)) { return; }

  # Run filters
  my $accept = $None;
  foreach my $filter (@{$self->{_filters}}) {
    $accept = $filter->Apply($url);
    if($accept != $None) { last; }
  }
  if(!($accept || ($accept == $None))) {
    $url->Log('FILTERED', 2);
    return;
  }

  # Ignore our own output URLs
  if($url->{loc} =~ $self->{_wildurl1} || $url->{loc} =~ $self->{_wildurl2}) {
    $url->Log('IGNORED (output file)', 2);
    return;
  }

  # Note the sighting
  my $hash = $url->MakeHash();
  if(exists($self->{_urls}->{$hash})) {
    my $dup = $self->{_urls}->{$hash};
    if($dup > 0) {
      $dup++;
      $self->{_urls}->{$hash} = $dup;
      if($self->{_dup_max} < $dup) {
        $self->{_dup_max} = $dup;
      }
    }
    $url->Log('DUPLICATE');
    return;
  }

  # Acceptance -- add to set
  $self->{_urls}->{$hash} = 1;
  push(@{$self->{_set}}, $url);
  $self->{_stat}->Consume($url);
  $url->Log();

  # Flush the set if needed
  my $set_length = @{$self->{_set}};
  if($set_length >= $MAXURLS_PER_SITEMAP) {
    $self->FlushSet();
  }
} # end ConsumeURL

sub FlushSet {
  my($self) = @_;
  # Flush the current set of URLs to the output.  This is a little
  # slow because we like to sort them all and normalize the priorities
  # before dumping.
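  # Added illustration of the normalization arithmetic: with _dup_max
  # initialized to 2, a URL seen once gets priority 1/2 = 0.5; if the
  # most-duplicated URL was seen 4 times, a URL seen 3 times gets 3/4 = 0.75.
  # Priorities are only filled in for URLs that did not set one explicitly.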
  sub FlushSet {
    my($self) = @_;
    # Flush the current set of URLs to the output.  This is a little
    # slow because we like to sort them all and normalize the priorities
    # before dumping.

    # Sort and normalize
    $output->Log("Sorting and normalizing collected URLs.", 1);
    @{$self->{_set}} = sort { $a->{loc} cmp $b->{loc} } @{$self->{_set}};
    foreach my $url (@{$self->{_set}}) {
      my $hash = $url->MakeHash();
      my $dup  = $self->{_urls}->{$hash};
      if($dup > 0) {
        $self->{_urls}->{$hash} = -1;  # Mark as already flushed
        if(!$url->{priority}) {
          $url->{priority} = $dup / $self->{_dup_max};
          $url->{priority} = substr($url->{priority}, 0, 6);
        }
      }
    }

    # Get the filename we're going to write to
    my $relative_filename = $self->{_filegen}->GeneratePath($self->{_sitemaps});
    my $filename = $self->{_filegen}->AbsolutePath($relative_filename);
    if(!$filename) {
      $output->Fatal("Unexpected: Couldn't generate output filename.");
    }
    $self->{_sitemaps}++;
    my $num_urls = scalar(@{$self->{_set}});
    $output->Log("Writing Sitemap file \"$relative_filename\" with $num_urls URLs", 1);

    # Write to it
    local *SITEMAP;
    my $success = open SITEMAP, "> $filename";
    unless($success) {
      $output->Fatal("Couldn't write out to file: $filename");
      return $False;
    }
    if(substr($filename, length($filename) - 3) eq '.gz') {
      eval "use Compress::Zlib";
      if($@) {
        $output->Fatal("Perl Module 'Compress::Zlib' is required for compression/decompression.\nSee README for installation notes.");
        return $False;
      }
      my $gz = gzopen(*SITEMAP, "wb");
      $gz->gzwrite($SITEMAP_HEADER);
      foreach my $url (@{$self->{_set}}) { $url->WriteXML($gz, $True); }
      $gz->gzwrite($SITEMAP_FOOTER);
      $gz->gzclose();
    } else {
      print SITEMAP $SITEMAP_HEADER;
      foreach my $url (@{$self->{_set}}) { $url->WriteXML(*SITEMAP{IO}, $False); }
      print SITEMAP $SITEMAP_FOOTER;
      close SITEMAP;
    }
    chmod(0644, $filename);

    # Flush
    @{$self->{_set}} = ();
  } #end FlushSet

  sub WriteIndex {
    my($self) = @_;
    # Write the master index of all Sitemap files.

    # Make a filename
    my $filename = $self->{_filegen}->GeneratePath($SITEINDEX_SUFFIX);
    if(!$filename) {
      $output->Fatal("Unexpected: Couldn't generate output index filename.");
    }
    $output->Log("Writing index file \"$filename\" with " . $self->{_sitemaps}
                 . " Sitemaps", 1);

    # Make a lastmod time
    my $lastmod = ::TimestampISO8601(time());

    # Write to it
    local *SITEINDEX;
    my $success = open SITEINDEX, "> $filename";
    unless($success) {
      $output->Error("Can not open file: $filename");
      return $False;
    }
    eval {
      print SITEINDEX $SITEINDEX_HEADER;
      foreach my $mapnumber (0 .. $self->{_sitemaps} - 1) {
        # Write one entry per generated Sitemap file
        my $mapurl = $self->{_filegen}->GenerateURL($mapnumber, $self->{_base_url});
        my $siteindex_entry = $SITEINDEX_ENTRY;
        $siteindex_entry =~ s/LOC/$mapurl/;
        $siteindex_entry =~ s/LASTMOD/$lastmod/;
        print SITEINDEX $siteindex_entry;
      }
      print SITEINDEX $SITEINDEX_FOOTER;
      close SITEINDEX;
    };
    if($@) { $output->Fatal("Couldn't write out to file: $filename"); }
    chmod(0644, $filename);
  } #end WriteIndex
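  # For reference, once WriteIndex substitutes LOC and LASTMOD, each entry in
  # the Google Sitemap Protocol index has this shape (values hypothetical):
  #
  #   <sitemap>
  #     <loc>http://www.example.com/sitemap0.xml.gz</loc>
  #     <lastmod>2006-05-01T12:00:00Z</lastmod>
  #   </sitemap>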
  sub NotifySearch {
    my($self) = @_;
    # Send notification of the new Sitemap(s) to the search engines.
    if($self->{_suppress}) {
      $output->Log("Search engine notification is suppressed.", 1);
      return;
    }
    $output->Log("Notifying search engines.", 1);

    {package ExceptionURLopener;
      # Fetch a URL and translate HTTP error response codes into log messages.
      sub http_error_default {
        my($self, $url) = @_;
        my %http_errors = (
          400 => 'Bad Request',
          401 => 'Unauthorized',
          402 => 'Payment Required',
          403 => 'Forbidden',
          404 => 'Not Found',
          405 => 'Method Not Allowed',
          406 => 'Not Acceptable',
          407 => 'Proxy Authentication Required',
          408 => 'Request Timeout',
          409 => 'Conflict',
          410 => 'Gone',
          411 => 'Length Required',
          412 => 'Precondition Failed',
          413 => 'Request Entity Too Large',
          414 => 'Request URI Too Long',
          415 => 'Unsupported Media Type',
          416 => 'Requested Range Not Satisfiable',
          417 => 'Expectation Failed',
        );
        # We only care about the status code, so store the body to the
        # null device rather than to an (undefined) filename.
        my $http_code = LWP::Simple::getstore($url, File::Spec->devnull());
        if($http_code != 200) {
          my $errmsg = $http_errors{$http_code} || 'unrecognized status';
          $output->Log("HTTP error " . $http_code . " : " . $errmsg, 2);
        }
        return $http_code;
      }
    }

    # Build the URL we want to send in
    my $url;
    if($self->{_sitemaps} > 1) {
      $url = $self->{_filegen}->GenerateURL($SITEINDEX_SUFFIX, $self->{_base_url});
    } else {
      $url = $self->{_filegen}->GenerateURL(0, $self->{_base_url});
    }

    # Test if we can hit it ourselves
    my $http_code = ExceptionURLopener->http_error_default($url);
    if($http_code != 200) {
      $output->Error("When attempting to access our generated Sitemap at the following URL:\n"
        . $url . "\nwe failed to read it.  Please verify that the store_into path you specified in\n"
        . "your configuration file is web-accessible.  Consult the FAQ for more\ninformation.");
      $output->Warn("Proceeding to notify with an unverifiable URL.");
    }

    # Cycle through notifications.  To understand this, see the comment
    # above the @NOTIFICATION_SITES definition.
    foreach my $ping (@NOTIFICATION_SITES) {
      my $query_map  = $ping->{query};
      my $query_attr = $ping->{sitemap};
      $query_map->{$query_attr} = $url;

      # Build the notification URL
      use URI::URL;
      my $notify = URI::URL->new;
      $notify->scheme($ping->{scheme});
      $notify->netloc($ping->{netloc});
      $notify->path($ping->{path});
      $notify->query_form($query_map);
      if($ping->{fragment}) { $notify->frag($ping->{fragment}); }

      # Send the notification
      $output->Log("Notifying: " . $ping->{netloc}, 1);
      $output->Log("Notification URL: " . $notify, 2);
      if(ExceptionURLopener->http_error_default($notify) != 200) {
        $output->Warn("Cannot contact: " . $ping->{netloc});
      }
    }
  } #end NotifySearch
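  # An assembled notification URL comes out like this (the Sitemap location
  # is hypothetical; query_form handles the percent-escaping):
  #
  #   http://www.google.com/webmasters/sitemaps/ping?sitemap=http%3A%2F%2Fwww.example.com%2Fsitemap.xml.gz
  #
  # i.e. the generated Sitemap URL is passed in the query attribute named by
  # the "sitemap" key of each @NOTIFICATION_SITES entry.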
  sub start_element {
    my($self, $element) = @_;
    # SAX processing, called per node in the config stream.

    # Replace numeric character references in attribute values
    my %attributes;
    foreach my $attr (keys %{$element->{Attributes}}) {
      if($element->{Attributes}->{$attr}->{Value} =~ m/&#(\d+);/) {
        my $char = chr($1);
        $element->{Attributes}->{$attr}->{Value} =
          Encode::decode("UTF-8", $element->{Attributes}->{$attr}->{Value});
        $element->{Attributes}->{$attr}->{Value} =~ s/&#\d+;/$char/;
      } elsif($element->{Attributes}->{$attr}->{Value} =~ m/&#x([0-9a-fA-F]+);/) {
        my $char = chr(hex($1));
        $element->{Attributes}->{$attr}->{Value} =
          Encode::decode("UTF-8", $element->{Attributes}->{$attr}->{Value});
        $element->{Attributes}->{$attr}->{Value} =~ s/&#x[0-9a-fA-F]+;/$char/;
      }
      $attributes{$attr} = $element->{Attributes}->{$attr}->{Value};
    }

    if($element->{Name} eq 'site') {
      if($self->{_in_site}) {
        $output->Error("Can not nest Site entries in the configuration.");
      } else {
        $self->{_in_site} = $True;
      }

      # Attribute names have the format '{}base_url', '{}store_into', etc.
      # Remove the leading '{}' before validating.
      my @attributes = keys %{$element->{Attributes}};
      foreach my $attr (@attributes) { $attr = substr($attr, 2); }
      my @goodattributes = ('verbose', 'default_encoding', 'base_url',
                            'store_into', 'suppress_search_engine_notify');
      if(!::ValidateAttributes('SITE', \@attributes, \@goodattributes)) { return; }

      my $verbose = $element->{Attributes}->{"{}verbose"}->{Value};
      if($verbose) { $output->SetVerbose($verbose); }
      $self->{_default_enc} = $element->{Attributes}->{"{}default_encoding"}->{Value};
      $self->{_base_url}    = $element->{Attributes}->{"{}base_url"}->{Value};
      $self->{_store_into}  = $element->{Attributes}->{"{}store_into"}->{Value};
      if(!$self->{_suppress}) {
        $self->{_suppress} =
          $element->{Attributes}->{"{}suppress_search_engine_notify"}->{Value};
      }
      $self->ValidateBasicConfig();
    } elsif($element->{Name} eq 'filter') {
      push(@{$self->{_filters}}, Filter->new($element->{Attributes}));
    } elsif($element->{Name} eq 'url') {
      push(@{$self->{_inputs}}, InputURL->new($element->{Attributes}));
    } elsif($element->{Name} eq 'urllist') {
      foreach my $attributeset (::ExpandPathAttribute('{}path', %attributes)) {
        push(@{$self->{_inputs}}, InputURLList->new($attributeset));
      }
    } elsif($element->{Name} eq 'directory') {
      push(@{$self->{_inputs}}, InputDirectory->new($element->{Attributes},
                                                    $self->{_base_url}));
    } elsif($element->{Name} eq 'accesslog') {
      foreach my $attributeset (::ExpandPathAttribute('{}path', %attributes)) {
        push(@{$self->{_inputs}}, InputAccessLog->new($attributeset));
      }
    } elsif($element->{Name} eq 'sitemap') {
      foreach my $attributeset (::ExpandPathAttribute('{}path', %attributes)) {
        push(@{$self->{_inputs}}, InputSitemap->new($attributeset));
      }
    } else {
      $output->Error("Unrecognized tag in the configuration: " . $element->{Name});
    }
  } #end start_element

  sub end_element {
    my($self, $element) = @_;
    # SAX processing, called per closing node in the config stream.
    if($element->{Name} eq 'site') {
      die if not $self->{_in_site};
      $self->{_in_site}      = $False;
      $self->{_in_site_ever} = $True;
    }
  } #end end_element
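  # The dispatch above means a configuration can mix input sources freely;
  # a sketch of the path-based inputs (filenames hypothetical):
  #
  #   <urllist   path="urls.txt" />
  #   <accesslog path="/var/log/httpd/access_log*" />
  #   <sitemap   path="old_sitemap.xml" />
  #
  # Wildcards in a "path" attribute are expanded by ExpandPathAttribute below,
  # producing one Input object per matching file.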
  sub end_document {
    my($self) = @_;
    # End of SAX; verify that we can proceed.
    if(!$self->{_in_site_ever}) {
      $output->Error("The configuration must specify a \"site\" element.");
    } elsif(!@{$self->{_inputs}}) {
      $output->Warn("There were no inputs to generate a sitemap from.");
    }
  } #end end_document
} # end package Sitemap

sub StripString {
  my($string) = @_;
  # Remove whitespace at the beginning and end of a string
  $string =~ s/^\s+//;
  $string =~ s/\s+$//;
  return $string;
} # end StripString

sub ValidateAttributes {
  my($tag, $attributes, $goodattributes) = @_;
  # Makes sure 'attributes' does not contain any attribute not
  # listed in 'goodattributes'.
  my $all_good = $True;
  foreach my $attr (@$attributes) {
    my $is_element = $False;
    foreach my $element (@$goodattributes) {
      if($attr eq $element) {
        $is_element = $True;
        last;
      }
    }
    if(!$is_element) {
      $output->Error("Unknown $tag attribute: $attr");
      $all_good = $False;
    }
  }
  return $all_good;
} # end ValidateAttributes

sub ExpandPathAttribute {
  my($attrib, %src) = @_;
  # Given a hash of attributes, return an array of hash references with all
  # the same attributes except for the one named by 'attrib'.  That one we
  # treat as a file path and expand into all its possible variations.

  # Do the path expansion.  On any error, just return the source hash.
  if(!exists($src{$attrib})) { return \%src; }
  my $path = $src{$attrib};
  $path = $encoder->MaybeNarrowPath($path);
  my @pathlist = File::Glob::bsd_glob($path);
  if(!@pathlist) { return \%src; }

  # Create N new hashes, each holding one expanded path
  my @retval = ();
  foreach my $path (@pathlist) {
    my %dst = %src;
    $dst{$attrib} = $path;
    push(@retval, \%dst);
  }
  return @retval;
} # end ExpandPathAttribute

sub OpenFileForRead {
  my($path, $logtext) = @_;
  # Opens a text file, be it GZip or plain.
  if(!$path) { return undef; }

  # Open the file
  local *FILE;
  my $success = open FILE, "< $path";
  unless($success) {
    $output->Error("Can not open file: $path");
    return undef;
  }

  # Check whether we have a GZip file
  my $is_gzip_file = $False;
  if(substr($path, length($path) - 3) eq '.gz') { $is_gzip_file = $True; }

  if($logtext) {
    $output->Log("Opened $logtext file: $path", 1);
  } else {
    $output->Log("Opened file: $path", 1);
  }
  return {file => *FILE{IO}, is_gzip => $is_gzip_file};
} # end OpenFileForRead

sub TimestampISO8601 {
  my($t) = @_;
  # Seconds since the epoch (1970-01-01) --> ISO 8601 time string.
  my($sec, $min, $hour, $day, $mon, $year) = gmtime($t);
  return sprintf("%04d-%02d-%02dT%02d:%02d:%02dZ",
                 $year + 1900, $mon + 1, $day, $hour, $min, $sec);
} # end TimestampISO8601
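# For example, TimestampISO8601(0) returns "1970-01-01T00:00:00Z", one of the
# lastmod formats accepted by @LASTMOD_PATTERNS and used in the Sitemap index.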
sub CreateSitemapFromFile {
  my($configpath, $suppress_notify) = @_;
  # Sets up a new Sitemap object from the specified configuration file.

  # Remember the error count on the way in
  my $num_errors = $output->{num_errors};

  # Rev up SAX to parse the config
  my $sitemap = Sitemap->new($suppress_notify);
  eval {
    $output->Log("Reading configuration file: " . $configpath, 0);
    my $xml_parser = XML::SAX::ParserFactory->parser(ContentHandler => $sitemap);
    $xml_parser->parse_uri($configpath);
  };
  if($@) {
    if(index($@, 'LWP Request Failed') != -1 || index($@, 'Permission denied') != -1) {
      $output->Error("Cannot read configuration file: $configpath");
    } else {
      $@ =~ s/\n+$//;
      $@ =~ s/\r+$//;
      $output->Error("XML error in the config file: $@");
    }
  }

  # If parsing added any errors, return no sitemap
  if($output->{num_errors} > $num_errors) { return undef; }
  return $sitemap;
} # end CreateSitemapFromFile

sub ProcessCommandFlags {
  # Parse command line flags per the specified usage, picking off key/value pairs
  my %flags = ();
  if(GetOptions(\%flags, 'config=s', 'help', 'testing')) { return %flags; }
  return ();
} # end ProcessCommandFlags

# Main driver
%flags = ProcessCommandFlags();
if(!%flags || !exists($flags{'config'}) || exists($flags{'help'})) {
  $output->Log($__usage__, 0);
} else {
  my $suppress_notify = exists($flags{'testing'});
  my $sitemap = CreateSitemapFromFile($flags{'config'}, $suppress_notify);
  if(!$sitemap) {
    $output->Log('Configuration file errors -- exiting.', 0);
  } else {
    $sitemap->Generate();
    $output->Log("Number of errors: " . $output->{num_errors}, 1);
    $output->Log("Number of warnings: " . $output->{num_warns}, 1);
  }
}

1;
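# Typical invocations (config filename hypothetical):
#
#   perl sitemap_gen.pl --config=config.xml            # generate and notify
#   perl sitemap_gen.pl --config=config.xml --testing  # generate only; the
#                                                      # --testing flag suppresses
#                                                      # search engine pings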