From 28cdc0d82c6159addc6d2a66449bc6bd2ef0bb71 Mon Sep 17 00:00:00 2001
From: Daniel Stenberg <daniel@haxx.se>
Date: Sun, 24 May 2015 00:54:55 +0200
Subject: perl: remove subdir, not touched in 9 years

---
 perl/contrib/crawlink.pl | 443 -----------------------------------------------
 1 file changed, 443 deletions(-)
 delete mode 100755 perl/contrib/crawlink.pl

diff --git a/perl/contrib/crawlink.pl b/perl/contrib/crawlink.pl
deleted file mode 100755
index c224be005..000000000
--- a/perl/contrib/crawlink.pl
+++ /dev/null
@@ -1,443 +0,0 @@
-#!/usr/bin/perl
-#
-# crawlink.pl
-#
-# This script crawls across all found links below the given "root" URL.
-# It reports all good and bad links to stdout. This code was based on the
-# checklink.pl script I wrote ages ago.
-#
-# Written to use 'curl' for URL checking.
-#
-# Author: Daniel Stenberg <daniel@haxx.se>
-# Version: 0.3 Jan 3, 2001
-#
-# HISTORY
-#
-# 0.3 - The -i now adds regexes that if a full URL link matches one of those,
-#       it is not followed. This can then be used to prevent this script from
-#       following '.*\.cgi', specific pages or whatever.
-#
-# 0.2 - Made it only HEAD non html files (i.e skip the GET). Makes it a lot
-#       faster to skip large non HTML files such as pdfs or big RFCs! ;-)
-#       Added a -c option that allows me to pass options to curl.
-#
-# 0.1 - The given url works as the root. This script will only continue
-#       and check other URLs if the leftmost part of the new URL is identical
-#       to the root URL.
-#
-
-use strict;
-
-my $in="";
-my $verbose=0;
-my $usestdin;
-my $linenumber;
-my $help;
-my $external;
-my $curlopts;
-
-my @ignorelist;
-
- argv:
-if($ARGV[0] eq "-v" ) {
-    $verbose++;
-    shift @ARGV;
-    goto argv;
-}
-elsif($ARGV[0] eq "-c" ) {
-    $curlopts=$ARGV[1];
-    shift @ARGV;
-    shift @ARGV;
-    goto argv;
-}
-elsif($ARGV[0] eq "-i" ) {
-    push @ignorelist, $ARGV[1];
-    shift @ARGV;
-    shift @ARGV;
-    goto argv;
-}
-elsif($ARGV[0] eq "-l" ) {
-    $linenumber = 1;
-    shift @ARGV;
-    goto argv;
-}
-elsif($ARGV[0] eq "-h" ) {
-    $help = 1;
-    shift @ARGV;
-    goto argv;
-}
-elsif($ARGV[0] eq "-x" ) {
-    $external = 1;
-    shift @ARGV;
-    goto argv;
-}
-
-my $geturl = $ARGV[0];
-my $firsturl= $geturl;
-
-#
-# Define a hash array to hold all root URLs to visit/we have visited
-#
-my %rooturls;
-$rooturls{$ARGV[0]}=1;
-
-if(($geturl eq "") || $help) {
-    print "Usage: $0 [-hilvx] <full URL>\n",
-    " Use a traling slash for directory URLs!\n",
-    " -c [data]  Pass [data] as argument to every curl invoke\n",
-    " -h         This help text\n",
-    " -i [regex] Ignore root links that match this pattern\n",
-    " -l         Line number report for BAD links\n",
-    " -v         Verbose mode\n",
-    " -x         Check non-local (external?) links only\n";
-    exit;
-}
-
-my $proxy;
-if($curlopts ne "") {
-    $proxy=" $curlopts";
-    #$proxy =" -x 194.237.142.41:80";
-}
-
-# linkchecker, URL will be appended to the right of this command line
-# this is the one using HEAD:
-my $linkcheck = "curl -s -m 20 -I$proxy";
-
-# as a second attempt, this will be used. This is not using HEAD but will
-# get the whole frigging document!
-my $linkcheckfull = "curl -s -m 20 -i$proxy";
-
-# htmlget, URL will be appended to the right of this command line
-my $htmlget = "curl -s$proxy";
-
-# Parse the input URL and split it into the relevant parts:
-
-my $getprotocol;
-my $getserver;
-my $getpath;
-my $getdocument;
-
-my %done;
-my %tagtype;
-my $allcount=0;
-my $badlinks=0;
-
-sub SplitURL {
-    my $inurl = $_[0];
-    if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
-        $getprotocol = $1;
-        $getserver = $2;
-        $getpath = $3;
-        $getdocument = $4;
-    }
-    elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
-        $getprotocol = $1;
-        $getserver = $2;
-        $getpath = $3;
-        $getdocument = "";
-
-        if($getpath !~ /\//) {
-            $getpath ="";
-            $getdocument = $3;
-        }
-
-    }
-    elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
-        $getprotocol = $1;
-        $getserver = $2;
-        $getpath = "";
-        $getdocument = "";
-    }
-    else {
-        print "Couldn't parse the specified URL, retry please!\n";
-        exit;
-    }
-}
-
-my @indoc;
-
-sub GetRootPage {
-    my $geturl = $_[0];
-    my $in="";
-    my $code=200;
-    my $type="text/plain";
-
-    my $pagemoved=0;
-    open(HEADGET, "$linkcheck $geturl|") ||
-        die "Couldn't get web page for some reason";
-
-    while(<HEADGET>) {
-        #print STDERR $_;
-        if($_ =~ /HTTP\/1\.[01] (\d\d\d) /) {
-            $code=$1;
-            if($code =~ /^3/) {
-                $pagemoved=1;
-            }
-        }
-        elsif($_ =~ /^Content-Type: ([\/a-zA-Z]+)/) {
-            $type=$1;
-        }
-        elsif($pagemoved &&
-              ($_ =~ /^Location: (.*)/)) {
-            $geturl = $1;
-
-            &SplitURL($geturl);
-
-            $pagemoved++;
-            last;
-        }
-    }
-    close(HEADGET);
-
-    if($pagemoved == 1) {
-        print "Page is moved but we don't know where. Did you forget the ",
-            "traling slash?\n";
-        exit;
-    }
-
-    if($type ne "text/html") {
-        # there no point in getting anything but HTML
-        $in="";
-    }
-    else {
-        open(WEBGET, "$htmlget $geturl|") ||
-            die "Couldn't get web page for some reason";
-        while(<WEBGET>) {
-            my $line = $_;
-            push @indoc, $line;
-            $line=~ s/\n/ /g;
-            $line=~ s/\r//g;
-            $in=$in.$line;
-        }
-        close(WEBGET);
-    }
-    return ($in, $code, $type);
-}
-
-sub LinkWorks {
-    my $check = $_[0];
-
-#    URL encode:
-#    $check =~s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg;
-
-    my @doc = `$linkcheck \"$check\"`;
-
-    my $head = 1;
-
-#    print "COMMAND: $linkcheck \"$check\"\n";
-#    print $doc[0]."\n";
-
-  boo:
-    if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
-        my $error = $1;
-
-        if($error < 400 ) {
-            return "GOOD";
-        }
-        else {
-
-            if($head && ($error >= 500)) {
-                # This server doesn't like HEAD!
-                @doc = `$linkcheckfull \"$check\"`;
-                $head = 0;
-                goto boo;
-            }
-            return "BAD";
-        }
-    }
-    return "BAD";
-}
-
-
-sub GetLinks {
-    my $in = $_[0];
-    my @result;
-
-    while($in =~ /[^<]*(<[^>]+>)/g ) {
-        # we have a tag in $1
-        my $tag = $1;
-
-        if($tag =~ /^
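
The core check the removed script performs in LinkWorks, as its comments describe, is to probe each link with a curl HEAD request and fall back to a full GET only when the server rejects HEAD with a 5xx code. The sketch below illustrates that approach on its own; it is not part of the deleted file, it assumes a curl binary on the PATH, and the example URL and the 20-second timeout are placeholders.

    #!/usr/bin/perl
    # Standalone sketch of the HEAD-first link check used by the removed
    # script: try "curl -I" first and retry with a full GET only when the
    # server answers HEAD with a 5xx response. curl on PATH and the example
    # URL are assumptions for illustration only.
    use strict;
    use warnings;

    sub link_works {
        my ($url) = @_;
        my $head = "curl -s -m 20 -I";   # HEAD request, 20 second timeout
        my $full = "curl -s -m 20 -i";   # full GET with response headers

        for my $cmd ($head, $full) {
            my @doc = `$cmd "$url"`;
            if(@doc && $doc[0] =~ /^HTTP[^ ]+ (\d+)/) {
                my $code = $1;
                return "GOOD" if $code < 400;
                # some servers refuse HEAD outright; retry once with a GET
                next if $cmd eq $head && $code >= 500;
                return "BAD";
            }
        }
        return "BAD";
    }

    print link_works("https://example.com/"), "\n";

Answering most links with a cheap HEAD and only downloading bodies when forced to is what keeps the crawl fast for large non-HTML targets such as PDFs, which is exactly the optimization the script's 0.2 history entry mentions.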