diff options
Diffstat (limited to 'perl/checklinks.pl')
-rw-r--r-- | perl/checklinks.pl | 336 |
1 files changed, 0 insertions, 336 deletions
diff --git a/perl/checklinks.pl b/perl/checklinks.pl deleted file mode 100644 index 347416ac9..000000000 --- a/perl/checklinks.pl +++ /dev/null @@ -1,336 +0,0 @@ -#!/usr/local/bin/perl -# -# checklinks.pl -# -# This script extracts all links from a HTML page and checks their validity. -# Written to use 'curl' for URL checking. -# -# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se> -# Version: 0.7 Sept 30, 1998 -# -# HISTORY -# -# 0.5 - Cuts off the #-part from links before checking. -# -# 0.6 - Now deals with error codes 3XX better and follows the Location: -# properly. -# - Added the -x flag that only checks http:// -links -# -# 0.7 - Ok, http://www.viunga.se/main.html didn't realize this had no path -# but a document. Now it does. -# -# - -$in=""; - - argv: -if($ARGV[0] eq "-v" ) { - $verbose = 1; - shift @ARGV; - goto argv; -} -elsif($ARGV[0] eq "-i" ) { - $usestdin = 1; - shift @ARGV; - goto argv; -} -elsif($ARGV[0] eq "-l" ) { - $linenumber = 1; - shift @ARGV; - goto argv; -} -elsif($ARGV[0] eq "-h" ) { - $help = 1; - shift @ARGV; - goto argv; -} -elsif($ARGV[0] eq "-x" ) { - $external = 1; - shift @ARGV; - goto argv; -} - -$geturl = $ARGV[0]; - -if(($geturl eq "") || $help) { - print "Usage: $0 [-hilvx] <full URL>\n", - " Use a traling slash for directory URLs!\n", - " -h This help text\n", - " -i Read the initial page from stdin\n", - " -l Line number report for BAD links\n", - " -v Verbose mode\n", - " -x Check non-local (external?) links only\n"; - exit; -} - -if($ARGV[1] eq "-") { - print "We use stdin!\n"; - $usestdin = 1; -} - -# This is necessary from where I tried this: -#$proxy =" -x 194.237.142.41:80"; - -# linkchecker, URL will be appended to the right of this command line -# this is the one using HEAD: -$linkcheck = "curl -s -m 20 -I$proxy"; - -# as a second attempt, this will be used. This is not using HEAD but will -# get the whole frigging document! -$linkcheckfull = "curl -s -m 20 -i$proxy"; - -# htmlget, URL will be appended to the right of this command line -$htmlget = "curl -s$proxy"; - -# Parse the input URL and split it into the relevant parts: - -sub SplitURL { - my $inurl = $_[0]; - - if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) { - $getprotocol = $1; - $getserver = $2; - $getpath = $3; - $getdocument = $4; - } - elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) { - $getprotocol = $1; - $getserver = $2; - $getpath = $3; - $getdocument = ""; - - if($getpath !~ /\//) { - $getpath =""; - $getdocument = $3; - } - - } - elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) { - $getprotocol = $1; - $getserver = $2; - $getpath = ""; - $getdocument = ""; - } - else { - print "Couldn't parse the specified URL, retry please!\n"; - exit; - } -} - -&SplitURL($geturl); - -#print "protocol = $getprotocol\n"; -#print "server = $getserver\n"; -#print "path = $getpath\n"; -#print "document = $getdocument\n"; -#exit; - -if(!$usestdin) { - open(HEADGET, "$linkcheck $geturl|") || - die "Couldn't get web page for some reason"; - headget: - while(<HEADGET>) { -# print $_; - if($_ =~ /HTTP\/.*3\d\d /) { - $pagemoved=1; - } - elsif($pagemoved && - ($_ =~ /^Location: (.*)/)) { - $geturl = $1; - - &SplitURL($geturl); - - $pagemoved++; - last headget; - } - } - close(HEADGET); - - if($pagemoved == 1) { - print "Page is moved but we don't know where. Did you forget the ", - "traling slash?\n"; - exit; - } - - open(WEBGET, "$htmlget $geturl|") || - die "Couldn't get web page for some reason"; - - while(<WEBGET>) { - $line = $_; - push @indoc, $line; - $line=~ s/\n//g; - $line=~ s/\r//g; -# print $line."\n"; - $in=$in.$line; - } - - close(WEBGET); -} -else { - while(<STDIN>) { - $line = $_; - push @indoc, $line; - $line=~ s/\n//g; - $line=~ s/\r//g; - $in=$in.$line; - } -} - -#print length($in)."\n"; - -sub LinkWorks { - my $check = $_[0]; - -# URL encode: -# $check =~s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg; - - @doc = `$linkcheck \"$check\"`; - - $head = 1; - -# print "COMMAND: $linkcheck \"$check\"\n"; -# print $doc[0]."\n"; - - boo: - if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) { - $error = $1; - - if($error < 400 ) { - return "GOOD"; - } - else { - - if($head && ($error >= 500)) { - # This server doesn't like HEAD! - @doc = `$linkcheckfull \"$check\"`; - $head = 0; - goto boo; - } - return "BAD"; - } - } - return "BAD"; -} - - -sub GetLinks { - my $in = $_[0]; - my @result; - - getlinkloop: - while($in =~ /[^<]*(<[^>]+>)/g ) { - # we have a tag in $1 - $tag = $1; - - if($tag =~ /^<!--/) { - # this is a comment tag, ignore it - } - else { - if($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ )>]*)/i) { - $url=$2; - if($url =~ /^\"(.*)\"$/) { - # this was a "string" now $1 has removed the quotes: - $url=$1; - } - - - $url =~ s/([^\#]*)\#.*/$1/g; - - if($url eq "") { - # if the link was nothing than a #-link it may now have - # been emptied completely so then we skip the rest - next getlinkloop; - } - - if($done{$url}) { - # if this url already is done, do next - $done{$url}++; - next getlinkloop; - } - - $done{$url} = 1; # this is "done" - - push @result, $url; - if($tag =~ /< *([^ ]+)/) { -# print "TAG: $1\n"; - $tagtype{$url}=$1; - } - } - } - } - return @result; -} - -@links = &GetLinks($in); - - linkloop: -for(@links) { - $url = $_; - - if($url =~ /^([^:]+):/) { - $prot = $1; -# if($prot !~ /(http|ftp|gopher)/i) { - if($prot !~ /http/i) { - # this is an unsupported protocol, we ignore this - next linkloop; - } - $link = $url; - } - else { - if($external) { - next linkloop; - } - - # this is a link on the save server: - if($url =~ /^\//) { - # from root - $link = "$getprotocol://$getserver$url"; - } - else { - # from the scanned page's dir - $nyurl=$url; - - if(length($getpath) && - ($getpath !~ /\/$/) && - ($nyurl !~ /^\//)) { - # lacks ending slash, add one to the document part: - $nyurl = "/".$nyurl; - } - $link = "$getprotocol://$getserver/$getpath$nyurl"; - } - } - -#print "test $link\n"; -#$success = "GOOD"; - - $success = &LinkWorks($link); - - $count = $done{$url}; - - $allcount += $count; - - print "$success $count <".$tagtype{$url}."> $link $url\n"; - -# If bad and -l, present the line numbers of the usage - if("BAD" eq $success) { - $badlinks++; - if($linenumber) { - $line =1; - for(@indoc) { - if($_ =~ /$url/) { - print " line $line\n"; - } - $line++; - } - } - } - -} - -if($verbose) { - print "$allcount links were checked"; - if($badlinks > 0) { - print ", $badlinks were found bad"; - } - print "\n"; -} |