diff options
Diffstat (limited to 'perl/contrib/crawlink.pl')
-rwxr-xr-x | perl/contrib/crawlink.pl | 166 |
1 files changed, 83 insertions, 83 deletions
diff --git a/perl/contrib/crawlink.pl b/perl/contrib/crawlink.pl index ba10b8b7c..8cb239a62 100755 --- a/perl/contrib/crawlink.pl +++ b/perl/contrib/crawlink.pl @@ -125,32 +125,32 @@ my $badlinks=0; sub SplitURL { my $inurl = $_[0]; if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) { - $getprotocol = $1; - $getserver = $2; - $getpath = $3; - $getdocument = $4; + $getprotocol = $1; + $getserver = $2; + $getpath = $3; + $getdocument = $4; } elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) { - $getprotocol = $1; - $getserver = $2; - $getpath = $3; - $getdocument = ""; - - if($getpath !~ /\//) { - $getpath =""; - $getdocument = $3; - } + $getprotocol = $1; + $getserver = $2; + $getpath = $3; + $getdocument = ""; + + if($getpath !~ /\//) { + $getpath =""; + $getdocument = $3; + } } elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) { - $getprotocol = $1; - $getserver = $2; - $getpath = ""; - $getdocument = ""; + $getprotocol = $1; + $getserver = $2; + $getpath = ""; + $getdocument = ""; } else { - print "Couldn't parse the specified URL, retry please!\n"; - exit; + print "Couldn't parse the specified URL, retry please!\n"; + exit; } } @@ -164,35 +164,35 @@ sub GetRootPage { my $pagemoved=0; open(HEADGET, "$linkcheck $geturl|") || - die "Couldn't get web page for some reason"; + die "Couldn't get web page for some reason"; while(<HEADGET>) { - #print STDERR $_; - if($_ =~ /HTTP\/1\.[01] (\d\d\d) /) { + #print STDERR $_; + if($_ =~ /HTTP\/1\.[01] (\d\d\d) /) { $code=$1; if($code =~ /^3/) { $pagemoved=1; } - } + } elsif($_ =~ /^Content-Type: ([\/a-zA-Z]+)/) { $type=$1; } - elsif($pagemoved && - ($_ =~ /^Location: (.*)/)) { - $geturl = $1; + elsif($pagemoved && + ($_ =~ /^Location: (.*)/)) { + $geturl = $1; - &SplitURL($geturl); + &SplitURL($geturl); - $pagemoved++; - last; - } + $pagemoved++; + last; + } } close(HEADGET); if($pagemoved == 1) { - print "Page is moved but we don't know where. Did you forget the ", - "traling slash?\n"; - exit; + print "Page is moved but we don't know where. Did you forget the ", + "traling slash?\n"; + exit; } if($type ne "text/html") { @@ -229,21 +229,21 @@ sub LinkWorks { boo: if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) { - my $error = $1; - - if($error < 400 ) { - return "GOOD"; - } - else { - - if($head && ($error >= 500)) { - # This server doesn't like HEAD! - @doc = `$linkcheckfull \"$check\"`; - $head = 0; - goto boo; - } - return "BAD"; - } + my $error = $1; + + if($error < 400 ) { + return "GOOD"; + } + else { + + if($head && ($error >= 500)) { + # This server doesn't like HEAD! + @doc = `$linkcheckfull \"$check\"`; + $head = 0; + goto boo; + } + return "BAD"; + } } return "BAD"; } @@ -254,45 +254,45 @@ sub GetLinks { my @result; while($in =~ /[^<]*(<[^>]+>)/g ) { - # we have a tag in $1 - my $tag = $1; - - if($tag =~ /^<!--/) { - # this is a comment tag, ignore it - } - else { - if($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ \)>]*)/i) { - my $url=$2; - if($url =~ /^\"(.*)\"$/) { - # this was a "string" now $1 has removed the quotes: - $url=$1; - } - - - $url =~ s/([^\#]*)\#.*/$1/g; - - if($url eq "") { - # if the link was nothing than a #-link it may now have - # been emptied completely so then we skip the rest - next; - } - - if($done{$url}) { - # if this url already is done, do next - $done{$url}++; + # we have a tag in $1 + my $tag = $1; + + if($tag =~ /^<!--/) { + # this is a comment tag, ignore it + } + else { + if($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ \)>]*)/i) { + my $url=$2; + if($url =~ /^\"(.*)\"$/) { + # this was a "string" now $1 has removed the quotes: + $url=$1; + } + + + $url =~ s/([^\#]*)\#.*/$1/g; + + if($url eq "") { + # if the link was nothing than a #-link it may now have + # been emptied completely so then we skip the rest + next; + } + + if($done{$url}) { + # if this url already is done, do next + $done{$url}++; if($verbose) { print " FOUND $url but that is already checked\n"; } - next; - } + next; + } - $done{$url} = 1; # this is "done" + $done{$url} = 1; # this is "done" - push @result, $url; - if($tag =~ /< *([^ ]+)/) { - $tagtype{$url}=$1; - } - } + push @result, $url; + if($tag =~ /< *([^ ]+)/) { + $tagtype{$url}=$1; + } + } } } return @result; @@ -437,7 +437,7 @@ while(1) { if($verbose) { print "$allcount links were checked"; if($badlinks > 0) { - print ", $badlinks were found bad"; + print ", $badlinks were found bad"; } print "\n"; } |