From 6ad9bd80226dd1468a949b26af265d60f29ac910 Mon Sep 17 00:00:00 2001 From: Daniel Stenberg Date: Thu, 14 Dec 2000 12:19:57 +0000 Subject: crawls through a whole site and verifies links --- perl/crawlink.pl | 402 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 402 insertions(+) create mode 100755 perl/crawlink.pl diff --git a/perl/crawlink.pl b/perl/crawlink.pl new file mode 100755 index 000000000..eada58c25 --- /dev/null +++ b/perl/crawlink.pl @@ -0,0 +1,402 @@ +#!/usr/bin/perl +# +# crawlink.pl +# +# This script crawls across all found links below the given "root" URL. +# It reports all good and bad links to stdout. This code was based on the +# checklink.pl script I wrote ages ago. +# +# Written to use 'curl' for URL checking. +# +# Author: Daniel Stenberg +# Version: 0.1 Dec 14, 2000 +# +# HISTORY +# +# 0.1 - The given url works as the root. This script will only continue +# and check other URLs if the leftmost part of the new URL is identical +# to the root URL. +# + +use strict; + +my $in=""; +my $verbose=0; +my $usestdin; +my $linenumber; +my $help; +my $external; + + argv: +if($ARGV[0] eq "-v" ) { + $verbose++; + shift @ARGV; + goto argv; +} +elsif($ARGV[0] eq "-l" ) { + $linenumber = 1; + shift @ARGV; + goto argv; +} +elsif($ARGV[0] eq "-h" ) { + $help = 1; + shift @ARGV; + goto argv; +} +elsif($ARGV[0] eq "-x" ) { + $external = 1; + shift @ARGV; + goto argv; +} + +my $geturl = $ARGV[0]; +my $firsturl= $geturl; + +# +# Define a hash array to hold all root URLs to visit/we have visited +# +my %rooturls; +$rooturls{$ARGV[0]}=1; + +if(($geturl eq "") || $help) { + print "Usage: $0 [-hilvx] \n", + " Use a traling slash for directory URLs!\n", + " -h This help text\n", + " -l Line number report for BAD links\n", + " -v Verbose mode\n", + " -x Check non-local (external?) links only\n"; + exit; +} + +# This is necessary from where I tried this: +my $proxy=""; +#$proxy =" -x 194.237.142.41:80"; + + +# linkchecker, URL will be appended to the right of this command line +# this is the one using HEAD: +my $linkcheck = "curl -s -m 20 -I$proxy"; + +# as a second attempt, this will be used. This is not using HEAD but will +# get the whole frigging document! +my $linkcheckfull = "curl -s -m 20 -i$proxy"; + +# htmlget, URL will be appended to the right of this command line +my $htmlget = "curl -s$proxy"; + +# Parse the input URL and split it into the relevant parts: + +my $getprotocol; +my $getserver; +my $getpath; +my $getdocument; + +my %done; +my %tagtype; +my $allcount=0; +my $badlinks=0; + +sub SplitURL { + my $inurl = $_[0]; + if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) { + $getprotocol = $1; + $getserver = $2; + $getpath = $3; + $getdocument = $4; + } + elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) { + $getprotocol = $1; + $getserver = $2; + $getpath = $3; + $getdocument = ""; + + if($getpath !~ /\//) { + $getpath =""; + $getdocument = $3; + } + + } + elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) { + $getprotocol = $1; + $getserver = $2; + $getpath = ""; + $getdocument = ""; + } + else { + print "Couldn't parse the specified URL, retry please!\n"; + exit; + } +} + +my @indoc; + +sub GetRootPage { + my $geturl = $_[0]; + my $in=""; + my $code=200; + my $type="text/plain"; + + my $pagemoved=0; + open(HEADGET, "$linkcheck $geturl|") || + die "Couldn't get web page for some reason"; + + while() { + #print STDERR $_; + if($_ =~ /HTTP\/1\.[01] (\d\d\d) /) { + $code=$1; + if($code =~ /^3/) { + $pagemoved=1; + } + } + elsif($_ =~ /^Content-Type: ([\/a-zA-Z]+)/) { + $type=$1; + } + elsif($pagemoved && + ($_ =~ /^Location: (.*)/)) { + $geturl = $1; + + &SplitURL($geturl); + + $pagemoved++; + last; + } + } + close(HEADGET); + + if($pagemoved == 1) { + print "Page is moved but we don't know where. Did you forget the ", + "traling slash?\n"; + exit; + } + + open(WEBGET, "$htmlget $geturl|") || + die "Couldn't get web page for some reason"; + + while() { + my $line = $_; + push @indoc, $line; + $line=~ s/\n/ /g; + $line=~ s/\r//g; +# print $line."\n"; + $in=$in.$line; + } + + close(WEBGET); + + return ($in, $code, $type); +} + +sub LinkWorks { + my $check = $_[0]; + +# URL encode: +# $check =~s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg; + + my @doc = `$linkcheck \"$check\"`; + + my $head = 1; + +# print "COMMAND: $linkcheck \"$check\"\n"; +# print $doc[0]."\n"; + + boo: + if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) { + my $error = $1; + + if($error < 400 ) { + return "GOOD"; + } + else { + + if($head && ($error >= 500)) { + # This server doesn't like HEAD! + @doc = `$linkcheckfull \"$check\"`; + $head = 0; + goto boo; + } + return "BAD"; + } + } + return "BAD"; +} + + +sub GetLinks { + my $in = $_[0]; + my @result; + + while($in =~ /[^<]*(<[^>]+>)/g ) { + # we have a tag in $1 + my $tag = $1; + + if($tag =~ /^