Diffstat (limited to 'perl')
-rw-r--r--  perl/Makefile.am                    |    3
-rw-r--r--  perl/README                         |   24
-rw-r--r--  perl/contrib/checklinks.pl.in       |  336
-rwxr-xr-x  perl/contrib/crawlink.pl            |  443
-rw-r--r--  perl/contrib/formfind               |  193
-rw-r--r--  perl/contrib/getlinks.pl.in         |  261
-rw-r--r--  perl/contrib/mirror.pl              |  104
-rw-r--r--  perl/contrib/recursiveftpget.pl.in  |   67
8 files changed, 0 insertions, 1431 deletions
diff --git a/perl/Makefile.am b/perl/Makefile.am
deleted file mode 100644
index cc51ee126..000000000
--- a/perl/Makefile.am
+++ /dev/null
@@ -1,3 +0,0 @@
-SUBDIRS = Curl_easy
-
-EXTRA_DIST = README
diff --git a/perl/README b/perl/README
deleted file mode 100644
index d19deed81..000000000
--- a/perl/README
+++ /dev/null
@@ -1,24 +0,0 @@
-                                  _   _ ____  _
-                              ___| | | |  _ \| |
-                             / __| | | | |_) | |
-                            | (__| |_| |  _ <| |___
-                             \___|\___/|_| \_\_____|
-
-Perl
-
- Perl is a great script language, not least for quick prototyping, and curl
-is elegantly used from within it. You can either invoke the external curl
-command line or use the Curl::easy Perl interface.
-
-The latest release of Curl_easy, a Perl interface to curl, is available from
-
-        http://curl.haxx.se/libcurl/perl/
-
-(Georg Horn's original version of Curl_easy, supporting curl versions
-before 7.7, is still available from: http://www.koblenz-net.de/~horn/export/ )
-
-Using the Curl::easy module is straightforward and works much like using
-libcurl in a C program, so please refer to the documentation of libcurl.
-Have a look at test.pl to get an idea of how to start.
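Both approaches the README names are easy to sketch. Below is a minimal,
hypothetical example of the "invoke the external curl command line" route;
the URL is a placeholder and only curl's documented -s (silent) flag is used.
The Curl::easy route mirrors the libcurl C API, so the libcurl documentation
and the test.pl file the README points to are the references for that.

    #!/usr/bin/perl
    # Minimal sketch: drive the external curl command from Perl.
    # Assumes 'curl' is in the PATH; the URL is a placeholder.
    use strict;
    use warnings;

    my $url  = "http://example.com/";
    my $body = `curl -s "$url"`;   # -s: silent mode, page body on stdout
    die "curl failed with exit code " . ($? >> 8) . "\n" if $?;
    print length($body), " bytes fetched from $url\n";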
diff --git a/perl/contrib/checklinks.pl.in b/perl/contrib/checklinks.pl.in
deleted file mode 100644
index 324a91791..000000000
--- a/perl/contrib/checklinks.pl.in
+++ /dev/null
@@ -1,336 +0,0 @@
-#!@PERL@
-#
-# checklinks.pl
-#
-# This script extracts all links from an HTML page and checks their validity.
-# Written to use 'curl' for the URL checking.
-#
-# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
-# Version: 0.7 Sept 30, 1998
-#
-# HISTORY
-#
-# 0.5 - Cuts off the #-part from links before checking.
-#
-# 0.6 - Now deals with 3XX error codes better and follows the Location:
-#       header properly.
-#     - Added the -x flag that only checks http:// links
-#
-# 0.7 - http://www.viunga.se/main.html made me realize the URL parser didn't
-#       handle a URL with a document but no path. Now it does.
-#
-
-$in="";
-
- argv:
-if($ARGV[0] eq "-v" ) {
-    $verbose = 1;
-    shift @ARGV;
-    goto argv;
-}
-elsif($ARGV[0] eq "-i" ) {
-    $usestdin = 1;
-    shift @ARGV;
-    goto argv;
-}
-elsif($ARGV[0] eq "-l" ) {
-    $linenumber = 1;
-    shift @ARGV;
-    goto argv;
-}
-elsif($ARGV[0] eq "-h" ) {
-    $help = 1;
-    shift @ARGV;
-    goto argv;
-}
-elsif($ARGV[0] eq "-x" ) {
-    $external = 1;
-    shift @ARGV;
-    goto argv;
-}
-
-$geturl = $ARGV[0];
-
-if(($geturl eq "") || $help) {
-    print "Usage: $0 [-hilvx] <full URL>\n",
-        " Use a trailing slash for directory URLs!\n",
-        " -h  This help text\n",
-        " -i  Read the initial page from stdin\n",
-        " -l  Line number report for BAD links\n",
-        " -v  Verbose mode\n",
-        " -x  Check non-local (external?) links only\n";
-    exit;
-}
-
-if($ARGV[1] eq "-") {
-    print "We use stdin!\n";
-    $usestdin = 1;
-}
-
-# This was necessary from where I tried this:
-#$proxy =" -x 194.237.142.41:80";
-
-# link checker; the URL will be appended to the right of this command line.
-# This is the one using HEAD:
-$linkcheck = "curl -s -m 20 -I$proxy";
-
-# As a second attempt, this will be used. This is not using HEAD but will
-# get the whole frigging document!
-$linkcheckfull = "curl -s -m 20 -i$proxy";
-
-# html getter; the URL will be appended to the right of this command line
-$htmlget = "curl -s$proxy";
-
-# Parse the input URL and split it into the relevant parts:
-
-sub SplitURL {
-    my $inurl = $_[0];
-
-    if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
-        $getprotocol = $1;
-        $getserver = $2;
-        $getpath = $3;
-        $getdocument = $4;
-    }
-    elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
-        $getprotocol = $1;
-        $getserver = $2;
-        $getpath = $3;
-        $getdocument = "";
-
-        if($getpath !~ /\//) {
-            $getpath ="";
-            $getdocument = $3;
-        }
-    }
-    elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
-        $getprotocol = $1;
-        $getserver = $2;
-        $getpath = "";
-        $getdocument = "";
-    }
-    else {
-        print "Couldn't parse the specified URL, please retry!\n";
-        exit;
-    }
-}
-
-&SplitURL($geturl);
-
-#print "protocol = $getprotocol\n";
-#print "server = $getserver\n";
-#print "path = $getpath\n";
-#print "document = $getdocument\n";
-#exit;
-
-if(!$usestdin) {
-    open(HEADGET, "$linkcheck $geturl|") ||
-        die "Couldn't get web page for some reason";
-  headget:
-    while(<HEADGET>) {
-#        print $_;
-        if($_ =~ /HTTP\/.*3\d\d /) {
-            $pagemoved=1;
-        }
-        elsif($pagemoved &&
-              ($_ =~ /^Location: (.*)/)) {
-            $geturl = $1;
-
-            &SplitURL($geturl);
-
-            $pagemoved++;
-            last headget;
-        }
-    }
-    close(HEADGET);
-
-    if($pagemoved == 1) {
-        print "Page is moved but we don't know where. Did you forget the ",
-            "trailing slash?\n";
-        exit;
-    }
-
-    open(WEBGET, "$htmlget $geturl|") ||
-        die "Couldn't get web page for some reason";
-
-    while(<WEBGET>) {
-        $line = $_;
-        push @indoc, $line;
-        $line=~ s/\n//g;
-        $line=~ s/\r//g;
-#        print $line."\n";
-        $in=$in.$line;
-    }
-
-    close(WEBGET);
-}
-else {
-    while(<STDIN>) {
-        $line = $_;
-        push @indoc, $line;
-        $line=~ s/\n//g;
-        $line=~ s/\r//g;
-        $in=$in.$line;
-    }
-}
-
-#print length($in)."\n";
-
-sub LinkWorks {
-    my $check = $_[0];
-
-#    URL encode:
-#    $check =~s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg;
-
-    @doc = `$linkcheck \"$check\"`;
-
-    $head = 1;
-
-#    print "COMMAND: $linkcheck \"$check\"\n";
-#    print $doc[0]."\n";
-
-  boo:
-    if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
-        $error = $1;
-
-        if($error < 400 ) {
-            return "GOOD";
-        }
-        else {
-            if($head && ($error >= 500)) {
-                # This server doesn't like HEAD!
-                @doc = `$linkcheckfull \"$check\"`;
-                $head = 0;
-                goto boo;
-            }
-            return "BAD";
-        }
-    }
-    return "BAD";
-}
-
-
-sub GetLinks {
-    my $in = $_[0];
-    my @result;
-
-  getlinkloop:
-    while($in =~ /[^<]*(<[^>]+>)/g ) {
-        # we have a tag in $1
-        $tag = $1;
-
-        if($tag =~ /^<!--/) {
-            # this is a comment tag, ignore it
-        }
-        else {
-            if($tag =~ /(src|href|background|archive) *= *(\"[^\"]*\"|[^ )>]*)/i) {
-                $url=$2;
-                if($url =~ /^\"(.*)\"$/) {
-                    # this was a "string"; now $1 holds it without the quotes:
-                    $url=$1;
-                }
-
-                $url =~ s/([^\#]*)\#.*/$1/g;
-
-                if($url eq "") {
-                    # if the link was nothing but a #-link it may now have
-                    # been emptied completely, so then we skip the rest
-                    next getlinkloop;
-                }
-
-                if($done{$url}) {
-                    # if this URL is already done, do the next
-                    $done{$url}++;
-                    next getlinkloop;
-                }
-
-                $done{$url} = 1; # this one is "done"
-
-                push @result, $url;
-                if($tag =~ /< *([^ ]+)/) {
-#                    print "TAG: $1\n";
-                    $tagtype{$url}=$1;
-                }
-            }
-        }
-    }
-    return @result;
-}
-
-@links = &GetLinks($in);
-
- linkloop:
-for(@links) {
-    $url = $_;
-
-    if($url =~ /^([^:]+):/) {
-        $prot = $1;
-#        if($prot !~ /(http|ftp)/i) {
-        if($prot !~ /http/i) {
-            # this is an unsupported protocol, we ignore this
-            next linkloop;
-        }
-        $link = $url;
-    }
-    else {
-        if($external) {
-            next linkloop;
-        }
-
-        # this is a link on the same server:
-        if($url =~ /^\//) {
-            # from the root
-            $link = "$getprotocol://$getserver$url";
-        }
-        else {
-            # from the scanned page's dir
-            $nyurl=$url;
-
-            if(length($getpath) &&
-               ($getpath !~ /\/$/) &&
-               ($nyurl !~ /^\//)) {
-                # lacks an ending slash, add one to the document part:
-                $nyurl = "/".$nyurl;
-            }
-            $link = "$getprotocol://$getserver/$getpath$nyurl";
-        }
-    }
-
-#print "test $link\n";
-#$success = "GOOD";
-
-    $success = &LinkWorks($link);
-
-    $count = $done{$url};
-
-    $allcount += $count;
-
-    print "$success $count <".$tagtype{$url}."> $link $url\n";
-
-    # If bad and -l, present the line numbers of the usage
-    if("BAD" eq $success) {
-        $badlinks++;
-        if($linenumber) {
-            $line =1;
-            for(@indoc) {
-                if($_ =~ /$url/) {
-                    print " line $line\n";
-                }
-                $line++;
-            }
-        }
-    }
-}
-
-if($verbose) {
-    print "$allcount links were checked";
-    if($badlinks > 0) {
-        print ", $badlinks were found bad";
-    }
-    print "\n";
-}
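A hypothetical invocation of checklinks.pl, using only the flags its own
usage text documents (the host is a placeholder):

    ./checklinks.pl -v -l http://example.com/docs/

The script HEADs every link found on the page (falling back to a full GET
when a server rejects HEAD with a 5XX code), prints GOOD or BAD together
with the tag type and use count for each link, and with -l also lists the
source line numbers where each bad link occurs.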
diff --git a/perl/contrib/crawlink.pl b/perl/contrib/crawlink.pl
deleted file mode 100755
index c224be005..000000000
--- a/perl/contrib/crawlink.pl
+++ /dev/null
@@ -1,443 +0,0 @@
-#!/usr/bin/perl
-#
-# crawlink.pl
-#
-# This script crawls across all found links below the given "root" URL.
-# It reports all good and bad links to stdout. This code was based on the
-# checklink.pl script I wrote ages ago.
-#
-# Written to use 'curl' for the URL checking.
-#
-# Author: Daniel Stenberg <daniel@haxx.se>
-# Version: 0.3 Jan 3, 2001
-#
-# HISTORY
-#
-# 0.3 - The -i option now adds regexes; if a full link URL matches one of
-#       them, it is not followed. This can be used to prevent this script
-#       from following '.*\.cgi', specific pages or whatever.
-#
-# 0.2 - Made it only HEAD non-HTML files (i.e. skip the GET). Makes it a lot
-#       faster to skip large non-HTML files such as pdfs or big RFCs! ;-)
-#       Added a -c option that allows me to pass options to curl.
-#
-# 0.1 - The given URL works as the root. This script will only continue
-#       and check other URLs if the leftmost part of the new URL is identical
-#       to the root URL.
-#
-
-use strict;
-
-my $in="";
-my $verbose=0;
-my $usestdin;
-my $linenumber;
-my $help;
-my $external;
-my $curlopts;
-
-my @ignorelist;
-
- argv:
-if($ARGV[0] eq "-v" ) {
-    $verbose++;
-    shift @ARGV;
-    goto argv;
-}
-elsif($ARGV[0] eq "-c" ) {
-    $curlopts=$ARGV[1];
-    shift @ARGV;
-    shift @ARGV;
-    goto argv;
-}
-elsif($ARGV[0] eq "-i" ) {
-    push @ignorelist, $ARGV[1];
-    shift @ARGV;
-    shift @ARGV;
-    goto argv;
-}
-elsif($ARGV[0] eq "-l" ) {
-    $linenumber = 1;
-    shift @ARGV;
-    goto argv;
-}
-elsif($ARGV[0] eq "-h" ) {
-    $help = 1;
-    shift @ARGV;
-    goto argv;
-}
-elsif($ARGV[0] eq "-x" ) {
-    $external = 1;
-    shift @ARGV;
-    goto argv;
-}
-
-my $geturl = $ARGV[0];
-my $firsturl= $geturl;
-
-#
-# Define a hash array to hold all root URLs to visit/we have visited
-#
-my %rooturls;
-$rooturls{$ARGV[0]}=1;
-
-if(($geturl eq "") || $help) {
-    print "Usage: $0 [-chilvx] <full URL>\n",
-        " Use a trailing slash for directory URLs!\n",
-        " -c [data]  Pass [data] as arguments to every curl invocation\n",
-        " -h         This help text\n",
-        " -i [regex] Ignore root links that match this pattern\n",
-        " -l         Line number report for BAD links\n",
-        " -v         Verbose mode\n",
-        " -x         Check non-local (external?) links only\n";
-    exit;
-}
-
-my $proxy;
-if($curlopts ne "") {
-    $proxy=" $curlopts";
-    #$proxy =" -x 194.237.142.41:80";
-}
-
-# link checker; the URL will be appended to the right of this command line.
-# This is the one using HEAD:
-my $linkcheck = "curl -s -m 20 -I$proxy";
-
-# As a second attempt, this will be used. This is not using HEAD but will
-# get the whole frigging document!
-my $linkcheckfull = "curl -s -m 20 -i$proxy";
-
-# html getter; the URL will be appended to the right of this command line
-my $htmlget = "curl -s$proxy";
-
-# Parse the input URL and split it into the relevant parts:
-
-my $getprotocol;
-my $getserver;
-my $getpath;
-my $getdocument;
-
-my %done;
-my %tagtype;
-my $allcount=0;
-my $badlinks=0;
-
-sub SplitURL {
-    my $inurl = $_[0];
-    if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
-        $getprotocol = $1;
-        $getserver = $2;
-        $getpath = $3;
-        $getdocument = $4;
-    }
-    elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
-        $getprotocol = $1;
-        $getserver = $2;
-        $getpath = $3;
-        $getdocument = "";
-
-        if($getpath !~ /\//) {
-            $getpath ="";
-            $getdocument = $3;
-        }
-    }
-    elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
-        $getprotocol = $1;
-        $getserver = $2;
-        $getpath = "";
-        $getdocument = "";
-    }
-    else {
-        print "Couldn't parse the specified URL, please retry!\n";
-        exit;
-    }
-}
-
-my @indoc;
-
-sub GetRootPage {
-    my $geturl = $_[0];
-    my $in="";
-    my $code=200;
-    my $type="text/plain";
-
-    my $pagemoved=0;
-    open(HEADGET, "$linkcheck $geturl|") ||
-        die "Couldn't get web page for some reason";
-
-    while(<HEADGET>) {
-        #print STDERR $_;
-        if($_ =~ /HTTP\/1\.[01] (\d\d\d) /) {
-            $code=$1;
-            if($code =~ /^3/) {
-                $pagemoved=1;
-            }
-        }
-        elsif($_ =~ /^Content-Type: ([\/a-zA-Z]+)/) {
-            $type=$1;
-        }
-        elsif($pagemoved &&
-              ($_ =~ /^Location: (.*)/)) {
-            $geturl = $1;
-
-            &SplitURL($geturl);
-
-            $pagemoved++;
-            last;
-        }
-    }
-    close(HEADGET);
-
-    if($pagemoved == 1) {
-        print "Page is moved but we don't know where. Did you forget the ",
-            "trailing slash?\n";
-        exit;
-    }
-
-    if($type ne "text/html") {
-        # there's no point in getting anything but HTML
-        $in="";
-    }
-    else {
-        open(WEBGET, "$htmlget $geturl|") ||
-            die "Couldn't get web page for some reason";
-        while(<WEBGET>) {
-            my $line = $_;
-            push @indoc, $line;
-            $line=~ s/\n/ /g;
-            $line=~ s/\r//g;
-            $in=$in.$line;
-        }
-        close(WEBGET);
-    }
-    return ($in, $code, $type);
-}
-
-sub LinkWorks {
-    my $check = $_[0];
-
-#    URL encode:
-#    $check =~s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg;
-
-    my @doc = `$linkcheck \"$check\"`;
-
-    my $head = 1;
-
-#    print "COMMAND: $linkcheck \"$check\"\n";
-#    print $doc[0]."\n";
-
-  boo:
-    if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
-        my $error = $1;
-
-        if($error < 400 ) {
-            return "GOOD";
-        }
-        else {
-            if($head && ($error >= 500)) {
-                # This server doesn't like HEAD!
-                @doc = `$linkcheckfull \"$check\"`;
-                $head = 0;
-                goto boo;
-            }
-            return "BAD";
-        }
-    }
-    return "BAD";
-}
-
-
-sub GetLinks {
-    my $in = $_[0];
-    my @result;
-
-    while($in =~ /[^<]*(<[^>]+>)/g ) {
-        # we have a tag in $1
-        my $tag = $1;
-
-        if($tag =~ /^<!--/) {
-            # this is a comment tag, ignore it
-        }
-        else {
-            if($tag =~ /(src|href|background|archive) *= *(\"[^\"]*\"|[^ \)>]*)/i) {
-                my $url=$2;
-                if($url =~ /^\"(.*)\"$/) {
-                    # this was a "string"; now $1 holds it without the quotes:
-                    $url=$1;
-                }
-
-                $url =~ s/([^\#]*)\#.*/$1/g;
-
-                if($url eq "") {
-                    # if the link was nothing but a #-link it may now have
-                    # been emptied completely, so then we skip the rest
-                    next;
-                }
-
-                if($done{$url}) {
-                    # if this URL is already done, do the next
-                    $done{$url}++;
-                    if($verbose) {
-                        print " FOUND $url but that is already checked\n";
-                    }
-                    next;
-                }
-
-                $done{$url} = 1; # this one is "done"
-
-                push @result, $url;
-                if($tag =~ /< *([^ ]+)/) {
-                    $tagtype{$url}=$1;
-                }
-            }
-        }
-    }
-    return @result;
-}
-
-
-while(1) {
-    $geturl=-1;
-    for(keys %rooturls) {
-        if($rooturls{$_} == 1) {
-            if($_ !~ /^$firsturl/) {
-                $rooturls{$_} += 1000; # don't do this, outside our scope
-                if($verbose) {
-                    print "SKIP: $_\n";
-                }
-                next;
-            }
-            $geturl=$_;
-            last;
-        }
-    }
-    if($geturl == -1) {
-        last;
-    }
-
-    #
-    # Split the URL into its different parts
-    #
-    &SplitURL($geturl);
-
-    #
-    # Get the full HTML of the root page
-    #
-    my ($in, $error, $ctype) = &GetRootPage($geturl);
-
-    $rooturls{$geturl}++; # increase to mark that we have already got it
-
-    if($ctype ne "text/html") {
-        # this is not HTML, we skip it
-        if($verbose == 2) {
-            print "Non-HTML link, skipping\n";
-        }
-        next;
-    }
-
-    if($error >= 400) {
-        print "ROOT page $geturl returned $error\n";
-        next;
-    }
-
-    print " ==== $geturl ====\n";
-
-    if($verbose == 2) {
-        printf("Error code $error, Content-Type: $ctype, got %d bytes\n",
-               length($in));
-    }
-
-    #print "protocol = $getprotocol\n";
-    #print "server = $getserver\n";
-    #print "path = $getpath\n";
-    #print "document = $getdocument\n";
-    #exit;
-
-    #
-    # Extract all links from the given HTML buffer
-    #
-    my @links = &GetLinks($in);
-
-    for(@links) {
-        my $url = $_;
-        my $link;
-
-        if($url =~ /^([^:]+):/) {
-            my $prot = $1;
-            if($prot !~ /http/i) {
-                # this is an unsupported protocol, we ignore this
-                next;
-            }
-            $link = $url;
-        }
-        else {
-            if($external) {
-                next;
-            }
-
-            # this is a link on the same server:
-            if($url =~ /^\//) {
-                # from the root
-                $link = "$getprotocol://$getserver$url";
-            }
-            else {
-                # from the scanned page's dir
-                my $nyurl=$url;
-
-                if(length($getpath) &&
-                   ($getpath !~ /\/$/) &&
-                   ($nyurl !~ /^\//)) {
-                    # lacks an ending slash, add one to the document part:
-                    $nyurl = "/".$nyurl;
-                }
-                $link = "$getprotocol://$getserver/$getpath$nyurl";
-            }
-        }
-
-        my $success = &LinkWorks($link);
-
-        my $count = $done{$url};
-
-        $allcount += $count;
-
-        print "$success $count <".$tagtype{$url}."> $link $url\n";
-
-        if("BAD" eq $success) {
-            $badlinks++;
-            if($linenumber) {
-                my $line =1;
-                for(@indoc) {
-                    if($_ =~ /$url/) {
-                        print " line $line\n";
-                    }
-                    $line++;
-                }
-            }
-        }
-        else {
-            # the link works; add it unless it is in the ignore list
-            my $ignore=0;
-            for(@ignorelist) {
-                if($link =~ /$_/) {
-                    $ignore=1;
-                }
-            }
-            if(!$ignore) {
-                # not ignored, add
-                $rooturls{$link}++; # check this if not checked already
-            }
-        }
-    }
-}
-
-if($verbose) {
-    print "$allcount links were checked";
-    if($badlinks > 0) {
-        print ", $badlinks were found bad";
-    }
-    print "\n";
-}
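A hypothetical crawl using only the documented flags and placeholder values;
-i keeps the crawler away from CGI links and -c forwards extra options (here
curl's -x proxy option, which the script's own comments mention) to every
curl invocation:

    ./crawlink.pl -v -i '\.cgi' -c "-x proxy.example.com:80" http://example.com/

Only URLs whose leftmost part matches the root URL are crawled further, which
is why the trailing slash on the root matters.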
diff --git a/perl/contrib/formfind b/perl/contrib/formfind
deleted file mode 100644
index 3ca8ea49f..000000000
--- a/perl/contrib/formfind
+++ /dev/null
@@ -1,193 +0,0 @@
-#!/usr/bin/env perl
-#
-# formfind.pl
-#
-# This script reads an HTML page on stdin and presents form information on
-# stdout.
-#
-# Author: Daniel Stenberg <daniel@haxx.se>
-# Version: 0.2 Nov 18, 2002
-#
-# HISTORY
-#
-# 0.1 - Nov 12 1998 - Created now!
-# 0.2 - Nov 18 2002 - Enhanced. Removed URL support, use only stdin.
-#
-
-$in="";
-
-if($ARGV[0] eq "-h") {
-    print "Usage: $0 < HTML\n";
-    exit;
-}
-
-sub namevalue {
-    my ($tag)=@_;
-    my $name=$tag;
-    if($name =~ /name *=/i) {
-        if($name =~ /name *= *([^\"\']([^ \">]*))/i) {
-            $name = $1;
-        }
-        elsif($name =~ /name *= *(\"|\')([^\"\']*)(\"|\')/i) {
-            $name=$2;
-        }
-        else {
-            # there is a name attribute but we didn't find the contents
-            $name="[weird]";
-        }
-    }
-    else {
-        # no name given
-        $name="";
-    }
-    # get the value attribute
-    my $value= $tag;
-    if($value =~ /[^\.a-zA-Z0-9]value *=/i) {
-        if($value =~ /[^\.a-zA-Z0-9]value *= *([^\"\']([^ \">]*))/i) {
-            $value = $1;
-        }
-        elsif($value =~ /[^\.a-zA-Z0-9]value *= *(\"|\')([^\"\']*)(\"|\')/i) {
-            $value=$2;
-        }
-        else {
-            # there is a value attribute but we didn't find the contents
-            $value="[weird]";
-        }
-    }
-    else {
-        $value="";
-    }
-    return ($name, $value);
-}
-
-
-while(<STDIN>) {
-    $line = $_;
-    push @indoc, $line;
-    $line=~ s/\n//g;
-    $line=~ s/\r//g;
-    $in=$in.$line;
-}
-
-while($in =~ /[^<]*(<[^>]+>)/g ) {
-    # we have a tag in $1
-    $tag = $1;
-
-    if($tag =~ /^<!--/) {
-        # this is a comment tag, ignore it
-    }
-    else {
-        if(!$form &&
-           ($tag =~ /^< *form/i )) {
-            $method= $tag;
-            if($method =~ /method *=/i) {
-                $method=~ s/.*method *= *(\"|)([^ \">]*).*/$2/gi;
-            }
-            else {
-                $method="get"; # default method
-            }
-            $action= $tag;
-            $action=~ s/.*action *= *(\'|\"|)([^ \"\'>]*).*/$2/gi;
-
-            $method=uc($method);
-
-            $enctype=$tag;
-            if ($enctype =~ /enctype *=/) {
-                $enctype=~ s/.*enctype *= *(\'|\"|)([^ \"\'>]*).*/$2/gi;
-
-                if($enctype eq "multipart/form-data") {
-                    $enctype="multipart form upload [use -F]"
-                }
-                $enctype = "\n--- type: $enctype";
-            }
-            else {
-                $enctype="";
-            }
-
-            print "--- FORM report. Uses $method to URL \"$action\"$enctype\n";
-            $form=1;
-        }
-        elsif($form &&
-              ($tag =~ /< *\/form/i )) {
-
-            print "--- end of FORM\n";
-            $form=0;
-            if( 0 ) {
-                print "*** Fill in all or any of these: (default assigns may be shown)\n";
-                for(@vars) {
-                    $var = $_;
-                    $def = $value{$var};
-                    print "$var=$def\n";
-                }
-                print "*** Pick one of these:\n";
-                for(@alts) {
-                    print "$_\n";
-                }
-            }
-            undef @vars;
-            undef @alts;
-        }
-        elsif($form &&
-              ($tag =~ /^< *(input|select)/i)) {
-            $mtag = $1;
-
-            ($name, $value)=namevalue($tag);
-
-            if($mtag =~ /select/i) {
-                print "Select: NAME=\"$name\"\n";
-                push @vars, "$name";
-                $select = 1;
-            }
-            else {
-                $type=$tag;
-                if($type =~ /type *=/i) {
-                    $type =~ s/.*type *= *(\'|\"|)([^ \"\'>]*).*/$2/gi;
-                }
-                else {
-                    $type="text"; # default type
-                }
-                $type=uc($type);
-                if(lc($type) eq "reset") {
-                    # reset types are for UI only, ignore.
-                }
-                elsif($name eq "") {
-                    # no name, so let's read the value parameter
-
-                    print "Button: \"$value\" ($type)\n";
-                    push @alts, "$value";
-                }
-                else {
-                    print "Input: NAME=\"$name\"";
-                    if($value ne "") {
-                        print " VALUE=\"$value\"";
-                    }
-                    print " ($type)\n";
-                    push @vars, "$name";
-                    # store the default value:
-                    $value{$name}=$value;
-                }
-            }
-        }
-        elsif($form &&
-              ($tag =~ /^< *textarea/i)) {
-            my ($name, $value)=namevalue($tag);
-
-            print "Textarea: NAME=\"$name\"\n";
-        }
-        elsif($select) {
-            if($tag =~ /^< *\/ *select/i) {
-                print "[end of select]\n";
-                $select = 0;
-            }
-            elsif($tag =~ /[^\/] *option/i ) {
-                my ($name, $value)=namevalue($tag);
-                my $s;
-                if($tag =~ /selected/i) {
-                    $s= " (SELECTED)";
-                }
-                print "  Option VALUE=\"$value\"$s\n";
-            }
-        }
-    }
-}
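Since formfind reads its HTML from stdin, it combines naturally with curl.
A hypothetical pipeline (the URL is a placeholder):

    curl -s http://example.com/login.html | ./formfind

The report lists each form's method, action and enctype followed by its
inputs, selects, textareas and buttons; that is the information needed to
replay the form with curl's -d or, for multipart forms, -F (which the
output itself hints at).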
diff --git a/perl/contrib/getlinks.pl.in b/perl/contrib/getlinks.pl.in
deleted file mode 100644
index 7253f1e7b..000000000
--- a/perl/contrib/getlinks.pl.in
+++ /dev/null
@@ -1,261 +0,0 @@
-#!@PERL@
-#
-# getlinks.pl
-#
-# This script extracts all links from an HTML page, compares them to a
-# pattern entered on the command line and then downloads the matching links
-# into the target dir (also specified on the command line).
-#
-# Written to use 'curl' for the URL fetching; it keeps the source file names
-# in the target directory.
-#
-# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
-# Version: 0.1 Oct 7, 1998
-#
-# HISTORY
-#
-# 0.1 - Created now!
-#
-
-$in="";
-
- argv:
-if($ARGV[0] eq "-v" ) {
-    $verbose = 1;
-    shift @ARGV;
-    goto argv;
-}
-if($ARGV[0] eq "-d" ) {
-    $display = 1;
-    shift @ARGV;
-    goto argv;
-}
-elsif($ARGV[0] eq "-h" ) {
-    $help = 1;
-    shift @ARGV;
-    goto argv;
-}
-
-$geturl = $ARGV[0];
-$getdir = $ARGV[1];
-$getregex = $ARGV[2];
-
-if(($geturl eq "") ||
-   (($getdir eq "") && !$display) ||
-   $help) {
-    print "Usage: $0 [-dhv] <full source URL> <target dir> [regex]\n",
-        " Use a trailing slash for directory URLs!\n",
-        " Use \"quotes\" around the regex!\n",
-        " -h  This help text\n",
-        " -d  Display matches only instead of downloading\n",
-        " -v  Verbose mode\n";
-    exit;
-}
-
-# change to the target directory:
-chdir $getdir or
-    die "couldn't cd into $getdir";
-
-# This was necessary from where I tried this:
-#$proxy =" -x 194.237.142.41:80";
-
-# link checker; the URL will be appended to the right of this command line.
-# This is the one using HEAD:
-$linkcheck = "curl -s -m 20 -I$proxy";
-
-# As a second attempt, this will be used. This is not using HEAD but will
-# get the whole frigging document!
-$linkcheckfull = "curl -s -m 20 -i$proxy";
-
-# html getter; the URL will be appended to the right of this command line
-$htmlget = "curl -s$proxy";
-
-# url getter; the URL will be appended to the right of this command line.
-# This stores the file with the remote file name in the current dir.
-$urlget = "curl -O -s$proxy";
-
-# Parse the input URL and split it into the relevant parts:
-
-sub SplitURL {
-    my $inurl = $_[0];
-
-    if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
-        $getprotocol = $1;
-        $getserver = $2;
-        $getpath = $3;
-        $getdocument = $4;
-    }
-    elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
-        $getprotocol = $1;
-        $getserver = $2;
-        $getpath = $3;
-        $getdocument = "";
-
-        if($getpath !~ /\//) {
-            $getpath ="";
-            $getdocument = $3;
-        }
-    }
-    elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
-        $getprotocol = $1;
-        $getserver = $2;
-        $getpath = "";
-        $getdocument = "";
-    }
-    else {
-        print "Couldn't parse the specified URL, please retry!\n";
-        exit;
-    }
-}
-
-&SplitURL($geturl);
-
-#print "protocol = $getprotocol\n";
-#print "server = $getserver\n";
-#print "path = $getpath\n";
-#print "document = $getdocument\n";
-#exit;
-
-if(!$usestdin) {
-    open(HEADGET, "$linkcheck $geturl|") ||
-        die "Couldn't get web page for some reason";
-  headget:
-    while(<HEADGET>) {
-#        print $_;
-        if($_ =~ /HTTP\/.*3\d\d /) {
-            $pagemoved=1;
-        }
-        elsif($pagemoved &&
-              ($_ =~ /^Location: (.*)/)) {
-            $geturl = $1;
-
-            &SplitURL($geturl);
-
-            $pagemoved++;
-            last headget;
-        }
-    }
-    close(HEADGET);
-
-    if($pagemoved == 1) {
-        print "Page is moved but we don't know where. Did you forget the ",
-            "trailing slash?\n";
-        exit;
-    }
-
-    open(WEBGET, "$htmlget $geturl|") ||
-        die "Couldn't get web page for some reason";
-
-    while(<WEBGET>) {
-        $line = $_;
-        push @indoc, $line;
-        $line=~ s/\n//g;
-        $line=~ s/\r//g;
-#        print $line."\n";
-        $in=$in.$line;
-    }
-
-    close(WEBGET);
-}
-else {
-    while(<STDIN>) {
-        $line = $_;
-        push @indoc, $line;
-        $line=~ s/\n//g;
-        $line=~ s/\r//g;
-        $in=$in.$line;
-    }
-}
-
-sub GetLinks {
-    my $in = $_[0];
-    my @result;
-
-  getlinkloop:
-    while($in =~ /[^<]*(<[^>]+>)/g ) {
-        # we have a tag in $1
-        $tag = $1;
-
-        if($tag =~ /^<!--/) {
-            # this is a comment tag, ignore it
-        }
-        else {
-            if($tag =~ /(src|href|background|archive) *= *(\"[^\"]*\"|[^ )>]*)/i) {
-                $url=$2;
-                if($url =~ /^\"(.*)\"$/) {
-                    # this was a "string"; now $1 holds it without the quotes:
-                    $url=$1;
-                }
-
-                $url =~ s/([^\#]*)\#.*/$1/g;
-
-                if($url eq "") {
-                    # if the link was nothing but a #-link it may now have
-                    # been emptied completely, so then we skip the rest
-                    next getlinkloop;
-                }
-
-                if($done{$url}) {
-                    # if this URL is already done, do the next
-                    $done{$url}++;
-                    next getlinkloop;
-                }
-
-                $done{$url} = 1; # this one is "done"
-
-                push @result, $url;
-                if($tag =~ /< *([^ ]+)/) {
-#                    print "TAG: $1\n";
-                    $tagtype{$url}=$1;
-                }
-            }
-        }
-    }
-    return @result;
-}
-
-@links = &GetLinks($in);
-
- linkloop:
-for(@links) {
-    $url = $_;
-
-    if($url =~ /^([^:]+):/) {
-        $link = $url;
-    }
-    else {
-        # this is a link on the same server:
-        if($url =~ /^\//) {
-            # from the root
-            $link = "$getprotocol://$getserver$url";
-        }
-        else {
-            # from the scanned page's dir
-            $nyurl=$url;
-
-            if(length($getpath) &&
-               ($getpath !~ /\/$/) &&
-               ($nyurl !~ /^\//)) {
-                # lacks an ending slash, add one to the document part:
-                $nyurl = "/".$nyurl;
-            }
-            $link = "$getprotocol://$getserver/$getpath$nyurl";
-        }
-    }
-
-    if($link =~ /$getregex/) {
-        if($display) {
-            print "$link\n";
-        }
-        else {
-            if($verbose) {
-                print "Gets $link\n";
-            }
-            print `$urlget $link`;
-        }
-    }
-
-}
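A hypothetical run of getlinks.pl that downloads every tarball linked from a
page into /tmp/archive (the URL and pattern are placeholders; note the quotes
around the regex, as the usage text requests):

    ./getlinks.pl http://example.com/download/ /tmp/archive '\.tar\.gz$'

With -d it only prints the matching links instead of fetching them, and the
target dir can then be omitted.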
diff --git a/perl/contrib/mirror.pl b/perl/contrib/mirror.pl
deleted file mode 100644
index 99e2709ac..000000000
--- a/perl/contrib/mirror.pl
+++ /dev/null
@@ -1,104 +0,0 @@
-#!/usr/bin/perl
-
-#
-# Author: Daniel Stenberg <daniel@haxx.se>
-# Version: 0.1
-# Date: October 10, 2000
-#
-# This is public domain. Feel free to do whatever you please with this script.
-# There are no warranties whatsoever! It might work, it might ruin your hard
-# disk. Use at your own risk.
-#
-# PURPOSE
-#
-# This script uses a local directory to maintain a "mirror" of the curl
-# packages listed in the remote curl web site's package list. Files present
-# in the local directory that aren't present in the remote list will be
-# removed. Files that are present in the remote list but not in the local
-# directory will be downloaded and put there. Files present in both places
-# will not be touched.
-#
-# WARNING: don't put other files in the mirror directory, they will be
-# removed when this script runs if they don't exist in the remote package
-# list!
-#
-
-# this is the directory to keep all the mirrored curl files in:
-$some_dir = $ARGV[0];
-
-if( ! -d $some_dir ) {
-    print "$some_dir is not a dir!\n";
-    exit;
-}
-
-# path to the curl binary
-$curl = "/home/danste/bin/curl";
-
-# this is the remote file list
-$filelist = "http://curl.haxx.se/download/curldist.txt";
-
-# prepend URL:
-$prepend = "http://curl.haxx.se/download";
-
-opendir(DIR, $some_dir) || die "can't opendir $some_dir: $!";
-@existing = grep { /^[^\.]/ } readdir(DIR);
-closedir DIR;
-
-$LOCAL_FILE = 1;
-$REMOTE_FILE = 2;
-
-# create a hash array
-for(@existing) {
-    $allfiles{$_} |= $LOCAL_FILE;
-}
-
-# get the remote file list
-print "Getting file list from $filelist\n";
-@remotefiles=`$curl -s $filelist`;
-
-# fill in the hash array
-for(@remotefiles) {
-    chomp;
-    $allfiles{$_} |= $REMOTE_FILE;
-    $remote++;
-}
-if($remote < 10) {
-    print "There's something wrong. The remote file list seems too small!\n";
-    exit;
-}
-
-@sfiles = sort { $a cmp $b } keys %allfiles;
-
-
-$leftalone = $downloaded = $removed = 0;
-for(@sfiles) {
-    $file = $_;
-    $info = $allfiles{$file};
-
-    if($info == ($REMOTE_FILE|$LOCAL_FILE)) {
-        print "$file is LOCAL and REMOTE, left alone\n";
-        $leftalone++;
-    }
-    elsif($info == $REMOTE_FILE) {
-        print "$file is only REMOTE, getting it...\n";
-        system("$curl $prepend/$file -o $some_dir/$file");
-        $downloaded++;
-    }
-    elsif($info == $LOCAL_FILE) {
-        print "$file is only LOCAL, removing it...\n";
-        system("rm $some_dir/$file");
-        $removed++;
-    }
-    else {
-        print "Problem, file $file was marked $info\n";
-    }
-    $loops++;
-}
-
-if(!$loops) {
-    print "No remote or local files were found!\n";
-    exit;
-}
-
-print "$leftalone files were already present\n",
-    "$downloaded files were added\n",
-    "$removed files were removed\n";
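mirror.pl marks every file name with a two-bit mask ($LOCAL_FILE |
$REMOTE_FILE), so a single pass over the sorted union of names decides
whether each file is kept, downloaded or removed. A hypothetical invocation,
assuming the mirror directory already exists and the hard-coded curl path in
the script has been adjusted:

    ./mirror.pl /var/mirror/curl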
diff --git a/perl/contrib/recursiveftpget.pl.in b/perl/contrib/recursiveftpget.pl.in
deleted file mode 100644
index abee22c1c..000000000
--- a/perl/contrib/recursiveftpget.pl.in
+++ /dev/null
@@ -1,67 +0,0 @@
-#!@PERL@
-#
-# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
-# Date: August 25 1998
-# Version: 0.1
-#
-# This is just meant as an example of why we wrote curl in the first place.
-# Quick n' easy scripting use.
-#
-
-$dir = $ARGV[0];
-
-$target = $ARGV[1];
-
-$maxdepth = $ARGV[2];
-
-if($dir eq "" || $target eq "") {
-    print "Usage: $0 <URL> <dir> [max depth level]\n";
-    print " End the URL with a slash if a directory is specified, please\n";
-    exit;
-}
-
-if(($maxdepth ne "") && ($maxdepth == 0)) {
-    # reached maximum depth, die
-    print "Reached maximum recursive depth level ($maxdepth), exiting...\n";
-    exit;
-}
-
-# get the dir listing
-@all = `curl -s $dir`;
-
-if($all[0] ne "") {
-    print "Got the main $dir dir\n";
-}
-
-line:
-for(@all) {
-    chop; # cut off the newline
-    @linep= split(" ", $_);
-
-    $name = $linep[$#linep];
-
-    $firstletter=substr($linep[0], 0, 1);
-
-    if($firstletter eq "d") {
-        # this is a subdir, recurse into it -
-        # if it isn't . or .. of course
-
-        if(($name eq ".") || ($name eq "..")) {
-            next line;
-        }
-        print "Recursing for dir $dir$name in target $target/$name\n";
-
-        $nextdepth=$maxdepth-1;
-        print `$0 $dir$name/ $target/$name $nextdepth`;
-    }
-    elsif($firstletter eq "-") {
-        # this is a file, get it
-        # oh, make sure the target dir exists first
-
-        if(! -r $target ) {
-            mkdir($target,0777);
-        }
-        print "Getting file $dir$name in target $target/$name\n";
-        print `curl -s $dir$name >$target/$name`;
-    }
-}
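A hypothetical invocation with a placeholder FTP host and a maximum recursion
depth of 3; the trailing slash on the URL is required for directories, and
the script re-invokes itself ($0) for each subdirectory with the depth
decreased by one:

    ./recursiveftpget.pl ftp://ftp.example.com/pub/curl/ /tmp/curl-pub 3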