Website Link Crawler
January 10, 2005, updated January 29, 2016
This is a handy Perl script that crawls a website and reports broken links. It checks every link it finds, not just hrefs, so images and CSS files are covered too. I talk about how I use this script to find broken links in another post.
The script depends on Perl and curl, so you may need to install them first.
sudo apt-get install perl curl
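If you are not sure whether they are already installed, both tools support a standard version flag, so a quick check from the shell will tell you:

perl --version
curl --version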
On to the crawl.pl script.
#!/usr/bin/perl
#
# This script crawls across all links in a given URL and reports good and bad
# links. By default, it will only crawl local links that are a child of the root
# URL. To also test external links from local pages, use the -x option.
#
# Adapted from a script originally by Daniel Stenberg <daniel@haxx.se>
#
use strict;

my $in="";
my $verbose=0;
my $usestdin;
my $linenumber;
my $help;
my $external=0;

argv:
if($ARGV[0] eq "-v" ) {
    $verbose++;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-l" ) {
    $linenumber = 1;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-h" ) {
    $help = 1;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-x" ) {
    $external = 1;
    shift @ARGV;
    goto argv;
}

my $geturl = $ARGV[0];
my $firsturl= $geturl;

my %rooturls;
$rooturls{$ARGV[0]}=1;

if (($geturl eq "") || $help) {
    print "Usage: $0 [-hlvx] <full URL>\n",
        " Use a trailing slash for directory URLs!\n",
        " -h This help text\n",
        " -l Line number report for BAD links\n",
        " -v Verbose mode\n",
        " -x Check non-local links\n";
    exit;
}

# This is necessary from where I tried this:
my $proxy="";
#$proxy = "-x 194.237.142.41:80";

my $linkcheck = "curl -s -m 30 -I $proxy --retry 3 -A \"Mozilla/4.0\"";
my $linkcheckfull = "curl -s -m 30 -i $proxy --retry 3 -A \"Mozilla/4.0\"";
my $htmlget = "curl -s -m 30 $proxy --retry 3 -A \"Mozilla/4.0\"";

my $getprotocol;
my $getserver;
my $getpath;
my $getdocument;

my %done;
my %tagtype;
my $allcount = 0;
my $badlinks = 0;

sub split_url {
    my $inurl = $_[0];
    if ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = $3;
        $getdocument = $4;
    }
    elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = $3;
        $getdocument = "";
        if($getpath !~ /\//) {
            $getpath ="";
            $getdocument = $3;
        }
    }
    elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = "";
        $getdocument = "";
    }
    else {
        print "Couldn't parse the specified URL, retry please!\n";
        exit;
    }
}

my @indoc;

sub get_page {
    my $geturl = $_[0];
    my $in = "";
    my $code = 200;
    my $type = "text/plain";
    my $pagemoved=0;

    open(HEADGET, "$linkcheck $geturl|") ||
        die "Couldn't get web page for some reason";
    while (<HEADGET>) {
        if ($_ =~ /HTTP\/1\.[01] (\d\d\d) /) {
            $code=$1;
            if ($code =~ /^3/) {
                $pagemoved=1;
            }
        }
        elsif ($_ =~ /^Content-Type: ([\/a-zA-Z]+)/) {
            $type=$1;
        }
        elsif ($pagemoved && ($_ =~ /^Location: (.*)/)) {
            $geturl = $1;
            &split_url($geturl);
            $pagemoved++;
            last;
        }
    }
    close(HEADGET);

    if ($pagemoved == 1) {
        print "Page is moved but we don't know where. Did you forget the ",
            "trailing slash?\n";
        exit;
    }

    open(WEBGET, "$htmlget $geturl|") ||
        die "Couldn't get web page for some reason";
    while (<WEBGET>) {
        my $line = $_;
        push @indoc, $line;
        $line =~ s/\n/ /g;
        $line =~ s/\r//g;
        $in = $in.$line;
    }
    close(WEBGET);

    return ($in, $code, $type);
}

sub check_link {
    my $check = $_[0];
    my @doc = `$linkcheck \"$check\"`;
    my $head = 1;

  boo:
    if ( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
        my $error = $1;
        if ($error < 400 ) {
            return "GOOD";
        }
        else {
            if ($head && ($error >= 500)) {
                # This server doesn't like HEAD!
                if ($verbose) {
                    print "RETRY $check\n";
                }
                @doc = `$linkcheckfull \"$check\"`;
                $head = 0;
                goto boo;
            }
            if ($verbose) {
                print "ERROR $error $check\n";
            }
            return "BAD";
        }
    }
    return "BAD";
}

sub parse_links {
    my $in = $_[0];
    my @result;

    while ($in =~ /[^<]*(<[^>]+>)/g ) {
        # we have a tag in $1
        my $tag = $1;
        if ($tag =~ /^<!--/) {
            # this is a comment tag, ignore it
        }
        else {
            if ($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ \)>]*)/i) {
                my $url=$2;
                if ($url =~ /^\"(.*)\"$/) {
                    # this was a "string" now $1 has removed the quotes:
                    $url=$1;
                }
                $url =~ s/([^\#]*)\#.*/$1/g;
                if ($url eq "") {
                    # if the link was nothing than a #-link it may now have
                    # been emptied completely so then we skip the rest
                    next;
                }
                if ($done{$url}) {
                    # if this url already is done, do next
                    $done{$url}++;
                    next;
                }
                $done{$url} = 1; # this is "done"
                push @result, $url;
                if ($tag =~ /< *([^ ]+)/) {
                    $tagtype{$url}=$1;
                }
            }
        }
    }
    return @result;
}

while(1) {
    $geturl = -1;
    for (keys %rooturls) {
        if ($rooturls{$_} == 1) {
            if ($_ !~ /^$firsturl/) {
                $rooturls{$_}++;
                if ($verbose) {
                    print "SKIP: $_\n";
                }
                next;
            }
            $geturl = $_;
            last;
        }
    }
    if ($geturl == -1) {
        last;
    }

    if ($verbose) {
        print "ROOT: $geturl\n";
    }

    &split_url($geturl);

    my ($in, $error, $ctype) = &get_page($geturl);

    $rooturls{$geturl}++;

    if ($ctype ne "text/html") {
        if ($verbose == 2) {
            print "Non-HTML link, skipping\n";
            next;
        }
    }

    if ($error >= 400) {
        print "BAD $geturl returned $error\n";
        next;
    }

    if ($verbose == 2) {
        printf("Error code $error, Content-Type: $ctype, got %d bytes\n",
               length($in));
    }

    my @links = &parse_links($in);

    for(@links) {
        my $url = $_;
        my $link;

        if($url =~ /^([^:]+):/) {
            my $prot = $1;
            if($prot !~ /http/i) {
                # this is an unsupported protocol, we ignore this
                next;
            }
            $link = $url;
        }
        else {
            # this is a link on the same server:
            if ($url =~ /^\//) {
                # from root
                $link = "$getprotocol://$getserver$url";
            }
            else {
                # from the scanned page's dir
                my $nyurl = $url;
                if (length($getpath) &&
                    ($getpath !~ /\/$/) &&
                    ($nyurl !~ /^\//)) {
                    # lacks ending slash, add one to the document part:
                    $nyurl = "/".$nyurl;
                }
                $link = "$getprotocol://$getserver/$getpath$nyurl";
            }
        }

        if (!$external) {
            if ($link !~ /^$firsturl/) {
                next;
            }
        }

        my $success = &check_link($link);
        my $count = $done{$url};
        $allcount += $count;

        print "$success $count <".$tagtype{$url}."> $link $url\n";

        $rooturls{$link}++;

        if("BAD" eq $success) {
            $badlinks++;
            if ($linenumber) {
                my $line =1;
                for (@indoc) {
                    if ($_ =~ /$url/) {
                        print " line $line\n";
                    }
                    $line++;
                }
            }
        }
    }
}

if($verbose) {
    print "$allcount links were checked";
    if ($badlinks > 0) {
        print ", $badlinks were found bad";
    }
    print "\n";
}
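To run it, save the script as crawl.pl and point it at the root URL you want to crawl, with a trailing slash for directory URLs as the usage text reminds you. The URL below is just an illustration; substitute your own site.

perl crawl.pl -v -l http://example.com/

With -v and -l, each checked link prints as GOOD or BAD, followed by how many times it was seen, the tag it came from, and the resolved URL, and bad links also get a line-number report. A run looks roughly like this (output is illustrative, the actual lines depend on the site):

ROOT: http://example.com/
GOOD 1 <a> http://example.com/about/ /about/
BAD 1 <img> http://example.com/images/logo.png /images/logo.png
 line 42
14 links were checked, 1 were found bad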
If this doesn't do it for you, then this should get the job done.