#!/usr/local/bin/perl
#
# checklinks.pl
#
# This script extracts all links from an HTML page and checks their validity.
# Written to use 'curl' for URL checking.
#
# Author: Daniel Stenberg
# Version: 0.7 Sept 30, 1998
#
# HISTORY
#
# 0.5 - Cuts off the #-part from links before checking.
#
# 0.6 - Now deals with error codes 3XX better and follows the Location:
#       properly.
#     - Added the -x flag that only checks http:// -links
#
# 0.7 - Ok, http://www.viunga.se/main.html didn't realize this had no path
#       but a document. Now it does.
#
#

$in="";

 argv:
if($ARGV[0] eq "-v" ) {
    $verbose = 1;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-i" ) {
    $usestdin = 1;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-l" ) {
    $linenumber = 1;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-h" ) {
    $help = 1;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-x" ) {
    $external = 1;
    shift @ARGV;
    goto argv;
}

$geturl = $ARGV[0];

if(($geturl eq "") || $help) {
    print "Usage: $0 [-hilvx] <full URL>\n",
        " Use a trailing slash for directory URLs!\n",
        " -h  This help text\n",
        " -i  Read the initial page from stdin\n",
        " -l  Line number report for BAD links\n",
        " -v  Verbose mode\n",
        " -x  Check non-local (external?) links only\n";
    exit;
}

if($ARGV[1] eq "-") {
    print "We use stdin!\n";
    $usestdin = 1;
}

# This was necessary from where I tried this:
#$proxy =" -x 194.237.142.41:80";

# link checker, URL will be appended to the right of this command line.
# This is the one using HEAD:
$linkcheck = "curl -s -m 20 -I$proxy";

# As a second attempt, this will be used. This is not using HEAD but will
# get the whole frigging document!
$linkcheckfull = "curl -s -m 20 -i$proxy";

# htmlget, URL will be appended to the right of this command line
$htmlget = "curl -s$proxy";

# Parse the input URL and split it into the relevant parts:
sub SplitURL {
    my $inurl = $_[0];

    if($inurl =~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = $3;
        $getdocument = $4;
    }
    elsif($inurl =~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = $3;
        $getdocument = "";

        # no slash in the path means it is a lone document, not a directory
        if($getpath !~ /\//) {
            $getpath = "";
            $getdocument = $3;
        }
    }
    elsif($inurl =~ /^([^:]+):\/\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = "";
        $getdocument = "";
    }
    else {
        print "Couldn't parse the specified URL, retry please!\n";
        exit;
    }
}

&SplitURL($geturl);

#print "protocol = $getprotocol\n";
#print "server = $getserver\n";
#print "path = $getpath\n";
#print "document = $getdocument\n";
#exit;
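# A quick, hedged sanity check for SplitURL (example.com is a placeholder,
# not part of the original script). Uncomment to try it:
#
#&SplitURL("http://example.com/sub/dir/index.html");
#print "$getprotocol $getserver $getpath $getdocument\n";
# prints: http example.com sub/dir index.html
#
# A document-only URL such as http://example.com/main.html has no slash in
# its path part, so (per the 0.7 fix above) $getpath stays empty and
# $getdocument becomes "main.html".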
if(!$usestdin) {
    # Do a HEAD request first, to detect a 3XX redirect and follow its
    # Location: header before fetching the actual page.
    open(HEADGET, "$linkcheck $geturl|") ||
        die "Couldn't get web page for some reason";

  headget:
    while(<HEADGET>) {
        #print $_;
        if($_ =~ /HTTP\/.*3\d\d /) {
            $pagemoved = 1;
        }
        elsif($pagemoved &&
              ($_ =~ /^Location: (.*)/)) {
            $geturl = $1;

            &SplitURL($geturl);

            $pagemoved++;
            last headget;
        }
    }
    close(HEADGET);

    if($pagemoved == 1) {
        print "Page is moved but we don't know where. Did you forget the ",
            "trailing slash?\n";
        exit;
    }

    open(WEBGET, "$htmlget $geturl|") ||
        die "Couldn't get web page for some reason";

    while(<WEBGET>) {
        $line = $_;
        push @indoc, $line;
        $line =~ s/\n//g;
        $line =~ s/\r//g;
        #print $line."\n";
        $in = $in.$line;
    }

    close(WEBGET);
}
else {
    while(<STDIN>) {
        $line = $_;
        push @indoc, $line;
        $line =~ s/\n//g;
        $line =~ s/\r//g;
        $in = $in.$line;
    }
}

#print length($in)."\n";

sub LinkWorks {
    my $check = $_[0];

    # URL encode:
    #$check =~ s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg;

    @doc = `$linkcheck \"$check\"`;

    $head = 1;

    #print "COMMAND: $linkcheck \"$check\"\n";
    #print $doc[0]."\n";

  boo:
    if($doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
        $error = $1;

        if($error < 400 ) {
            return "GOOD";
        }
        else {
            if($head && ($error >= 500)) {
                # This server doesn't like HEAD! Retry with a full GET.
                @doc = `$linkcheckfull \"$check\"`;
                $head = 0;
                goto boo;
            }
            return "BAD";
        }
    }
    return "BAD";
}

sub GetLinks {
    my $in = $_[0];
    my @result;

  getlinkloop:
    while($in =~ /[^<]*(<[^>]+>)/g ) {
        # we have a tag in $1
        $tag = $1;

        if($tag =~ /^<!--/) {