#!/usr/bin/perl
#
# crawlink.pl
#
# This script crawls across all found links below the given "root" URL.
# It reports all good and bad links to stdout. This code was based on the
# checklink.pl script I wrote ages ago.
#
# Written to use 'curl' for URL checking.
#
# Author: Daniel Stenberg <daniel@haxx.se>
# Version: 0.3 Jan 3, 2001
#
# HISTORY
#
# 0.3 - The -i option now adds regexes; if a full URL link matches one of
#       them, it is not followed. This can be used to prevent this script
#       from following '.*\.cgi', specific pages or whatever.
#
# 0.2 - Made it only HEAD non-HTML files (i.e. skip the GET). Makes it a lot
#       faster to skip large non-HTML files such as pdfs or big RFCs! ;-)
#       Added a -c option that allows me to pass options to curl.
#
# 0.1 - The given URL works as the root. This script will only continue
#       and check other URLs if the leftmost part of the new URL is identical
#       to the root URL.
#

use strict;

my $in="";
my $verbose=0;
my $usestdin;
my $linenumber;
my $help;
my $external;
my $curlopts;

my @ignorelist;

argv:
if($ARGV[0] eq "-v" ) {
    $verbose++;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-c" ) {
    $curlopts=$ARGV[1];
    shift @ARGV;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-i" ) {
    push @ignorelist, $ARGV[1];
    shift @ARGV;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-l" ) {
    $linenumber = 1;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-h" ) {
    $help = 1;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-x" ) {
    $external = 1;
    shift @ARGV;
    goto argv;
}

my $geturl = $ARGV[0];
my $firsturl= $geturl;

#
# Define a hash array to hold all root URLs to visit/we have visited
#
my %rooturls;
$rooturls{$ARGV[0]}=1;

if(($geturl eq "") || $help) {
    print "Usage: $0 [-chilvx] <full URL>\n",
    " Use a trailing slash for directory URLs!\n",
    " -c [data]  Pass [data] as argument to every curl invoke\n",
    " -h         This help text\n",
    " -i [regex] Ignore root links that match this pattern\n",
    " -l         Line number report for BAD links\n",
    " -v         Verbose mode\n",
    " -x         Check non-local (external?) links only\n";
    exit;
}
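
# A hypothetical example invocation (my addition, not from the original
# script; the host and regex are made up and curl is assumed to be in the
# PATH):
#
#   ./crawlink.pl -v -l -i '\.cgi' -c "--proxy proxy.example.com:80" \
#       http://www.example.com/docs/
#
# This crawls everything at or below http://www.example.com/docs/, refuses
# to follow links that match '\.cgi', reports line numbers for BAD links
# and passes the --proxy option to every curl invoke.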

my $proxy;
if($curlopts ne "") {
    $proxy=" $curlopts";
    #$proxy =" -x 194.237.142.41:80";
}

# linkchecker, URL will be appended to the right of this command line
# this is the one using HEAD:
my $linkcheck = "curl -s -m 20 -I$proxy";

# as a second attempt, this will be used. This is not using HEAD but will
# get the whole frigging document!
my $linkcheckfull = "curl -s -m 20 -i$proxy";

# htmlget, URL will be appended to the right of this command line
my $htmlget = "curl -s$proxy";

# Parse the input URL and split it into the relevant parts:
my $getprotocol;
my $getserver;
my $getpath;
my $getdocument;

my %done;
my %tagtype;
my $allcount=0;
my $badlinks=0;

sub SplitURL {
    my $inurl = $_[0];

    if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = $3;
        $getdocument = $4;
    }
    elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = $3;
        $getdocument = "";

        if($getpath !~ /\//) {
            $getpath ="";
            $getdocument = $3;
        }
    }
    elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = "";
        $getdocument = "";
    }
    else {
        print "Couldn't parse the specified URL, retry please!\n";
        exit;
    }
}
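
# To illustrate what SplitURL stores (my addition; the URL is made up):
#
#   &SplitURL("http://example.com/docs/index.html");
#   # $getprotocol eq "http", $getserver eq "example.com",
#   # $getpath eq "docs", $getdocument eq "index.html"
#
# The first regex is greedy, so for a deeper URL such as
# "http://example.com/a/b/c.html" the path becomes "a/b" and the
# document "c.html".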

my @indoc;

sub GetRootPage {
    my $geturl = $_[0];
    my $in="";
    my $code=200;
    my $type="text/plain";

    my $pagemoved=0;
    open(HEADGET, "$linkcheck $geturl|") ||
        die "Couldn't get web page for some reason";
    while(<HEADGET>) {
        #print STDERR $_;
        if($_ =~ /HTTP\/1\.[01] (\d\d\d) /) {
            $code=$1;
            if($code =~ /^3/) {
                $pagemoved=1;
            }
        }
        elsif($_ =~ /^Content-Type: ([\/a-zA-Z]+)/) {
            $type=$1;
        }
        elsif($pagemoved &&
              ($_ =~ /^Location: (.*)/)) {
            $geturl = $1;

            &SplitURL($geturl);

            $pagemoved++;
            last;
        }
    }
    close(HEADGET);

    if($pagemoved == 1) {
        print "Page is moved but we don't know where. Did you forget the ",
            "trailing slash?\n";
        exit;
    }

    if($type ne "text/html") {
        # there's no point in getting anything but HTML
        $in="";
    }
    else {
        open(WEBGET, "$htmlget $geturl|") ||
            die "Couldn't get web page for some reason";
        while(<WEBGET>) {
            my $line = $_;
            push @indoc, $line;
            $line=~ s/\n/ /g;
            $line=~ s/\r//g;
            $in=$in.$line;
        }
        close(WEBGET);
    }

    return ($in, $code, $type);
}

sub LinkWorks {
    my $check = $_[0];

    # URL encode:
    # $check =~s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg;

    my @doc = `$linkcheck \"$check\"`;

    my $head = 1;

    # print "COMMAND: $linkcheck \"$check\"\n";
    # print $doc[0]."\n";

  boo:
    if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
        my $error = $1;

        if($error < 400 ) {
            return "GOOD";
        }
        else {
            if($head && ($error >= 500)) {
                # This server doesn't like HEAD, retry with a full GET:
                @doc = `$linkcheckfull \"$check\"`;
                $head = 0;
                goto boo;
            }
            return "BAD";
        }
    }
    return "BAD";
}

sub GetLinks {
    my $in = $_[0];
    my @result;

    while($in =~ /[^<]*(<[^>]+>)/g ) {
        # we have a tag in $1
        my $tag = $1;

        if($tag =~ /^<!--/) {
            # this is a comment tag, ignore it
        }
        else {
            if($tag =~ /(src|href|background|archive) *= *(\"[^\"]*\"|[^ \)>]*)/i) {
                my $url=$2;
                if($url =~ /^\"(.*)\"$/) {
                    # this was a "string", now $1 has removed the quotes:
                    $url=$1;
                }

                $url =~ s/([^\#]*)\#.*/$1/g;

                if($url eq "") {
                    # if the link was nothing but a #-link it may now have
                    # been emptied completely, so then we skip the rest
                    next;
                }

                if($done{$url}) {
                    # if this url already is done, do next
                    $done{$url}++;
                    if($verbose) {
                        print " FOUND $url but that is already checked\n";
                    }
                    next;
                }

                $done{$url} = 1; # this is "done"

                push @result, $url;
                if($tag =~ /< *([^ ]+)/) {
                    $tagtype{$url}=$1;
                }
            }
        }
    }
    return @result;
}

while(1) {
    $geturl=-1;
    for(keys %rooturls) {
        if($rooturls{$_} == 1) {
            if($_ !~ /^$firsturl/) {
                $rooturls{$_} += 1000; # don't do this, outside our scope
                if($verbose) {
                    print "SKIP: $_\n";
                }
                next;
            }
            $geturl=$_;
            last;
        }
    }
    if($geturl == -1) {
        last;
    }

    #
    # Splits the URL in its different parts
    #
    &SplitURL($geturl);

    #
    # Returns the full HTML of the root page
    #
    my ($in, $error, $ctype) = &GetRootPage($geturl);

    $rooturls{$geturl}++; # increase to prove we have already got it

    if($ctype ne "text/html") {
        # this is not HTML, we skip it
        if($verbose == 2) {
            print "Non-HTML link, skipping\n";
        }
        next;
    }

    if($error >= 400) {
        print "ROOT page $geturl returned $error\n";
        next;
    }

    print " ==== $geturl ====\n";

    if($verbose == 2) {
        printf("Error code $error, Content-Type: $ctype, got %d bytes\n",
               length($in));
    }

    #print "protocol = $getprotocol\n";
    #print "server = $getserver\n";
    #print "path = $getpath\n";
    #print "document = $getdocument\n";
    #exit;

    #
    # Extracts all links from the given HTML buffer
    #
    my @links = &GetLinks($in);

    for(@links) {
        my $url = $_;
        my $link;

        if($url =~ /^([^:]+):/) {
            my $prot = $1;
            if($prot !~ /http/i) {
                # this is an unsupported protocol, we ignore this
                next;
            }
            $link = $url;
        }
        else {
            if($external) {
                next;
            }

            # this is a link on the same server:
            if($url =~ /^\//) {
                # from root
                $link = "$getprotocol://$getserver$url";
            }
            else {
                # from the scanned page's dir
                my $nyurl=$url;
                if(length($getpath) &&
                   ($getpath !~ /\/$/) &&
                   ($nyurl !~ /^\//)) {
                    # lacks ending slash, add one to the document part:
                    $nyurl = "/".$nyurl;
                }
                $link = "$getprotocol://$getserver/$getpath$nyurl";
            }
        }

        my $success = &LinkWorks($link);

        my $count = $done{$url};

        $allcount += $count;

        print "$success $count <".$tagtype{$url}."> $link $url\n";

        if("BAD" eq $success) {
            $badlinks++;
            if($linenumber) {
                my $line =1;
                for(@indoc) {
                    if($_ =~ /$url/) {
                        print " line $line\n";
                    }
                    $line++;
                }
            }
        }
        else {
            # the link works, add it if it isn't in the ignore list
            my $ignore=0;
            for(@ignorelist) {
                if($link =~ /$_/) {
                    $ignore=1;
                }
            }
            if(!$ignore) {
                # not ignored, add it
                $rooturls{$link}++; # check this if not checked already
            }
        }
    }
}

if($verbose) {
    print "$allcount links were checked";
    if($badlinks > 0) {
        print ", $badlinks were found bad";
    }
    print "\n";
}
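
# For reference, a made-up sample of what a -v -l run prints (my addition;
# host, tags and counts are illustrative only):
#
#    ==== http://example.com/docs/ ====
#   GOOD 1 <a> http://example.com/docs/intro.html intro.html
#   BAD 2 <img> http://example.com/docs/logo.gif logo.gif
#    line 14
#   3 links were checked, 1 were found bad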