aboutsummaryrefslogtreecommitdiff
path: root/perl/contrib/crawlink.pl
diff options
context:
space:
mode:
authorYang Tse <yangsita@gmail.com>2010-02-16 13:32:45 +0000
committerYang Tse <yangsita@gmail.com>2010-02-16 13:32:45 +0000
commit46b112bcd439f4413925a7300d66a3e6f148765e (patch)
treec925ce0c32a0af8bbcf427187c3838d005e3218b /perl/contrib/crawlink.pl
parentf442dd6496c8710c4000a078e8085238fdaa7545 (diff)
replaced tabs with spaces
Diffstat (limited to 'perl/contrib/crawlink.pl')
-rwxr-xr-xperl/contrib/crawlink.pl166
1 files changed, 83 insertions, 83 deletions
diff --git a/perl/contrib/crawlink.pl b/perl/contrib/crawlink.pl
index ba10b8b7c..8cb239a62 100755
--- a/perl/contrib/crawlink.pl
+++ b/perl/contrib/crawlink.pl
@@ -125,32 +125,32 @@ my $badlinks=0;
sub SplitURL {
my $inurl = $_[0];
if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
- $getprotocol = $1;
- $getserver = $2;
- $getpath = $3;
- $getdocument = $4;
+ $getprotocol = $1;
+ $getserver = $2;
+ $getpath = $3;
+ $getdocument = $4;
}
elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
- $getprotocol = $1;
- $getserver = $2;
- $getpath = $3;
- $getdocument = "";
-
- if($getpath !~ /\//) {
- $getpath ="";
- $getdocument = $3;
- }
+ $getprotocol = $1;
+ $getserver = $2;
+ $getpath = $3;
+ $getdocument = "";
+
+ if($getpath !~ /\//) {
+ $getpath ="";
+ $getdocument = $3;
+ }
}
elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
- $getprotocol = $1;
- $getserver = $2;
- $getpath = "";
- $getdocument = "";
+ $getprotocol = $1;
+ $getserver = $2;
+ $getpath = "";
+ $getdocument = "";
}
else {
- print "Couldn't parse the specified URL, retry please!\n";
- exit;
+ print "Couldn't parse the specified URL, retry please!\n";
+ exit;
}
}
@@ -164,35 +164,35 @@ sub GetRootPage {
my $pagemoved=0;
open(HEADGET, "$linkcheck $geturl|") ||
- die "Couldn't get web page for some reason";
+ die "Couldn't get web page for some reason";
while(<HEADGET>) {
- #print STDERR $_;
- if($_ =~ /HTTP\/1\.[01] (\d\d\d) /) {
+ #print STDERR $_;
+ if($_ =~ /HTTP\/1\.[01] (\d\d\d) /) {
$code=$1;
if($code =~ /^3/) {
$pagemoved=1;
}
- }
+ }
elsif($_ =~ /^Content-Type: ([\/a-zA-Z]+)/) {
$type=$1;
}
- elsif($pagemoved &&
- ($_ =~ /^Location: (.*)/)) {
- $geturl = $1;
+ elsif($pagemoved &&
+ ($_ =~ /^Location: (.*)/)) {
+ $geturl = $1;
- &SplitURL($geturl);
+ &SplitURL($geturl);
- $pagemoved++;
- last;
- }
+ $pagemoved++;
+ last;
+ }
}
close(HEADGET);
if($pagemoved == 1) {
- print "Page is moved but we don't know where. Did you forget the ",
- "traling slash?\n";
- exit;
+ print "Page is moved but we don't know where. Did you forget the ",
+ "traling slash?\n";
+ exit;
}
if($type ne "text/html") {
@@ -229,21 +229,21 @@ sub LinkWorks {
boo:
if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
- my $error = $1;
-
- if($error < 400 ) {
- return "GOOD";
- }
- else {
-
- if($head && ($error >= 500)) {
- # This server doesn't like HEAD!
- @doc = `$linkcheckfull \"$check\"`;
- $head = 0;
- goto boo;
- }
- return "BAD";
- }
+ my $error = $1;
+
+ if($error < 400 ) {
+ return "GOOD";
+ }
+ else {
+
+ if($head && ($error >= 500)) {
+ # This server doesn't like HEAD!
+ @doc = `$linkcheckfull \"$check\"`;
+ $head = 0;
+ goto boo;
+ }
+ return "BAD";
+ }
}
return "BAD";
}
@@ -254,45 +254,45 @@ sub GetLinks {
my @result;
while($in =~ /[^<]*(<[^>]+>)/g ) {
- # we have a tag in $1
- my $tag = $1;
-
- if($tag =~ /^<!--/) {
- # this is a comment tag, ignore it
- }
- else {
- if($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ \)>]*)/i) {
- my $url=$2;
- if($url =~ /^\"(.*)\"$/) {
- # this was a "string" now $1 has removed the quotes:
- $url=$1;
- }
-
-
- $url =~ s/([^\#]*)\#.*/$1/g;
-
- if($url eq "") {
- # if the link was nothing than a #-link it may now have
- # been emptied completely so then we skip the rest
- next;
- }
-
- if($done{$url}) {
- # if this url already is done, do next
- $done{$url}++;
+ # we have a tag in $1
+ my $tag = $1;
+
+ if($tag =~ /^<!--/) {
+ # this is a comment tag, ignore it
+ }
+ else {
+ if($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ \)>]*)/i) {
+ my $url=$2;
+ if($url =~ /^\"(.*)\"$/) {
+ # this was a "string" now $1 has removed the quotes:
+ $url=$1;
+ }
+
+
+ $url =~ s/([^\#]*)\#.*/$1/g;
+
+ if($url eq "") {
+ # if the link was nothing than a #-link it may now have
+ # been emptied completely so then we skip the rest
+ next;
+ }
+
+ if($done{$url}) {
+ # if this url already is done, do next
+ $done{$url}++;
if($verbose) {
print " FOUND $url but that is already checked\n";
}
- next;
- }
+ next;
+ }
- $done{$url} = 1; # this is "done"
+ $done{$url} = 1; # this is "done"
- push @result, $url;
- if($tag =~ /< *([^ ]+)/) {
- $tagtype{$url}=$1;
- }
- }
+ push @result, $url;
+ if($tag =~ /< *([^ ]+)/) {
+ $tagtype{$url}=$1;
+ }
+ }
}
}
return @result;
@@ -437,7 +437,7 @@ while(1) {
if($verbose) {
print "$allcount links were checked";
if($badlinks > 0) {
- print ", $badlinks were found bad";
+ print ", $badlinks were found bad";
}
print "\n";
}