#!@PERL@
#
# getlinks.pl
#
# This script extracts all links from a HTML page, compares them to a pattern
# entered on the command line and then downloads matching links into the
# target dir (also specified on the command line).
#
# Written to use 'curl' for URL fetching, uses the source file names in the
# target directory.
#
# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
# Version: 0.1 Oct 7, 1998
#
# HISTORY
#
# 0.1 - Created now!
#
use strict;
use warnings;

# Accumulates the whole fetched HTML document as one long line so the
# link-extraction regex can match tags that span source lines.
my $in = "";

my $verbose = 0;   # -v: report each URL before downloading it
my $display = 0;   # -d: print matching links instead of downloading them
my $help    = 0;   # -h: show usage and exit

# Consume leading option flags (any order, any repetition) from @ARGV.
while (@ARGV && defined $ARGV[0] && $ARGV[0] =~ /^-[vdh]$/) {
    if ($ARGV[0] eq "-v") {
        $verbose = 1;
    }
    elsif ($ARGV[0] eq "-d") {
        $display = 1;
    }
    elsif ($ARGV[0] eq "-h") {
        $help = 1;
    }
    shift @ARGV;
}

# Positional arguments; default missing ones to "" so the eq-checks below
# stay warning-free.
my $geturl   = defined $ARGV[0] ? $ARGV[0] : "";
my $getdir   = defined $ARGV[1] ? $ARGV[1] : "";
my $getregex = defined $ARGV[2] ? $ARGV[2] : "";

# A target dir is only required when we actually download (-d only displays).
if (($geturl eq "") ||
    (($getdir eq "") && !$display) ||
    $help) {
    print "Usage: $0 [-hdv] <full source URL> <target dir> [regex]\n",
          " Use a trailing slash for directory URLs!\n",
          " Use \"quotes\" around the regex!\n",
          " -h This help text\n",
          " -d Display matches only instead of downloading\n",
          " -v Verbose mode\n";
    exit;
}

# Change to the target directory, but only when downloading: in display
# mode no directory argument is expected, so a chdir would die spuriously.
if (!$display) {
    chdir $getdir or die "couldn't cd into $getdir";
}

# This is necessary from where I tried this:
#my $proxy = " -x 194.237.142.41:80";
my $proxy = "";

# linkchecker, URL will be appended to the right of this command line
# this is the one using HEAD:
my $linkcheck = "curl -s -m 20 -I$proxy";

# as a second attempt, this will be used. This is not using HEAD but will
# get the whole frigging document!  (currently unused fallback)
my $linkcheckfull = "curl -s -m 20 -i$proxy";

# htmlget, URL will be appended to the right of this command line
my $htmlget = "curl -s$proxy";

# urlget, URL will be appended to the right of this command line
# this stores the file with the remote file name in the current dir
my $urlget = "curl -O -s$proxy";

# URL components filled in by SplitURL() and used when resolving
# relative links found in the page.
my ($getprotocol, $getserver, $getpath, $getdocument);

# Parse the input URL and split it into protocol/server/path/document.
# Exits the program with a message when the URL cannot be parsed.
sub SplitURL {
    my $inurl = $_[0];

    if ($inurl =~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/) {
        # proto://server/path/document
        $getprotocol = $1;
        $getserver   = $2;
        $getpath     = $3;
        $getdocument = $4;
    }
    elsif ($inurl =~ /^([^:]+):\/\/([^\/]*)\/(.*)/) {
        # proto://server/something — "something" may be a path or a document
        $getprotocol = $1;
        $getserver   = $2;
        $getpath     = $3;
        $getdocument = "";

        if ($getpath !~ /\//) {
            # no slash: it was a bare document name, not a path
            $getpath     = "";
            $getdocument = $3;
        }
    }
    elsif ($inurl =~ /^([^:]+):\/\/(.*)/) {
        # proto://server (no trailing slash at all)
        $getprotocol = $1;
        $getserver   = $2;
        $getpath     = "";
        $getdocument = "";
    }
    else {
        print "Couldn't parse the specified URL, retry please!\n";
        exit;
    }
}

SplitURL($geturl);

#print "protocol = $getprotocol\n";
#print "server = $getserver\n";
#print "path = $getpath\n";
#print "document = $getdocument\n";
#exit;

# Set to 1 to read the page from STDIN instead of fetching it with curl.
my $usestdin = 0;

# Verbatim copy of the fetched document, line by line (currently unused).
my @indoc;

if (!$usestdin) {
    # First do a cheap HEAD request to detect 3xx redirects so we can
    # follow a Location: header before fetching the full page.
    # NOTE(review): $geturl is interpolated into a shell command line —
    # unsafe for untrusted input; kept as in the original script.
    my $pagemoved = 0;
    open(my $headfh, "$linkcheck $geturl|")
        or die "Couldn't get web page for some reason";

  HEADGET:
    while (<$headfh>) {
        if ($_ =~ /HTTP\/.*3\d\d /) {
            $pagemoved = 1;
        }
        elsif ($pagemoved && ($_ =~ /^Location: (.*)/)) {
            # redirected: chase the new URL instead
            $geturl = $1;
            SplitURL($geturl);
            $pagemoved++;
            last HEADGET;
        }
    }
    close($headfh);

    if ($pagemoved == 1) {
        # got a 3xx status but no Location: header to follow
        print "Page is moved but we don't know where. Did you forget the ",
              "trailing slash?\n";
        exit;
    }

    # Fetch the (possibly redirected) page and flatten it into $in.
    open(my $webfh, "$htmlget $geturl|")
        or die "Couldn't get web page for some reason";
    while (<$webfh>) {
        my $line = $_;
        push @indoc, $line;
        $line =~ s/\n//g;
        $line =~ s/\r//g;
        $in = $in . $line;
    }
    close($webfh);
}
else {
    # Same flattening, reading the document from STDIN instead.
    while (<STDIN>) {
        my $line = $_;
        push @indoc, $line;
        $line =~ s/\n//g;
        $line =~ s/\r//g;
        $in = $in . $line;
    }
}

# Bookkeeping shared with GetLinks(): how often each URL was seen, and
# which tag type (a, img, ...) each URL came from.
my %done;
my %tagtype;

# Extract all unique link URLs (src/href/background/archive attributes)
# from the flattened HTML document passed as the single argument.
# Returns the list of URLs in document order, first occurrence only.
sub GetLinks {
    my $in = $_[0];
    my @result;

  TAG:
    while ($in =~ /[^<]*(<[^>]+>)/g) {
        # we have a tag in $1
        my $tag = $1;

        if ($tag =~ /^<!--/) {
            # this is a comment tag, ignore it
            next TAG;
        }

        # Match both quoted and unquoted attribute values. The quoted
        # alternative uses [^\"]* (the original's [^\"] could only match
        # a one-character value).
        if ($tag =~ /(src|href|background|archive) *= *(\"[^\"]*\"|[^ )>]*)/i) {
            my $url = $2;

            if ($url =~ /^\"(.*)\"$/) {
                # this was a "string", now $1 has removed the quotes:
                $url = $1;
            }

            # strip any #fragment part
            $url =~ s/([^\#]*)\#.*/$1/g;

            if ($url eq "") {
                # if the link was nothing but a #-link it may now have
                # been emptied completely so then we skip the rest
                next TAG;
            }

            if ($done{$url}) {
                # if this url already is done, do next
                $done{$url}++;
                next TAG;
            }
            $done{$url} = 1;    # this is "done"

            push @result, $url;

            if ($tag =~ /< *([^ ]+)/) {
                # print "TAG: $1\n";
                $tagtype{$url} = $1;
            }
        }
    }
    return @result;
}

my @links = GetLinks($in);

for my $url (@links) {
    my $link;

    if ($url =~ /^([^:]+):/) {
        # already absolute (has a protocol part)
        $link = $url;
    }
    elsif ($url =~ /^\//) {
        # server-relative link: resolve from the document root
        $link = "$getprotocol://$getserver$url";
    }
    else {
        # relative to the scanned page's directory
        my $nyurl = $url;
        if (length($getpath) &&
            ($getpath !~ /\/$/) &&
            ($nyurl !~ /^\//)) {
            # lacks ending slash, add one to the document part:
            $nyurl = "/" . $nyurl;
        }
        $link = "$getprotocol://$getserver/$getpath$nyurl";
    }

    # An omitted regex means "match everything". This must be explicit:
    # a literal empty pattern // in Perl silently reuses the last
    # successful match instead of matching all.
    if (($getregex eq "") || ($link =~ /$getregex/)) {
        if ($display) {
            print "$link\n";
        }
        else {
            if ($verbose) {
                print "Gets $link\n";
            }
            # NOTE(review): $link goes through the shell via backticks —
            # unsafe for hostile page content; kept as in the original.
            print `$urlget $link`;
        }
    }
}