#!/usr/local/bin/perl # # Written by Paul Schinder # Modified frequently. # # modified 10/26/96 by W. A. Kibbe # support for hard wired destination file and comment field in input text file # line~ = /(\S+)\s+(\S+)\s+(\S+)\s*(.*)/; #($1 has URL, $2 has output path info,$3 has filename, $4 has rest of line) # line~ = /(\S+)\s+(\S+)\s+(\S+)\s*(.*)/; # ($1 has baseURL, $2 rel. path URL, $3 has local root, $4 has comments) # ($line, $outpath, $outfilename, $comment) = ($1, $2, $3, $4); # read all the urls into an array and then delete duplicates? # now set up so that the 'geturls' can't climb higher in the directory heirarchy # require "GUSI.ph"; require "myutil.pl"; require "url.pl"; require "isunix.pl"; require "www.pl"; require "wwwurl.pl"; require "wwwerror.pl"; $DEBUG=1; use Net::SMTP; use Config; # ---- Variables ----- # ---- where to store the logs and how to name them ---- if ($Config{'osname'} =~ /^macos/i) { # do mac specific stuff here $wrkdir = ""; # throw the logs in the working directory $logfile = "geturls.log"; $errlogfile = "session.log"; &MacPerl'Quit(1); #quit if a stand alone app # $inmacintosh = 0; @fileAttributes = ("R*ch", "TEXT"); # the type of text editor to use. 'R*ch' is BBEdit $nativedirchar=":"; $incominghtmldir = "bWAK:Test:crawl:"; $incomingftpdir = "bWAK:Test:ftp:"; $incominggopherdir = "bWAK:Test:gopher:"; $urllistdir="bWAK:Test:urls:"; # directory to store list of "spidered" URLs } elsif ($Config{'osname'} =~ /^win/i) { $incominghtmldir = "f:\\pub\\geturls\\html\\"; $incomingftpdir = "f:\\pub\\geturls\\ftp\\"; $incominggopherdir = "f:\\pub\\geturls\\gopher\\"; $urllistdir="f:\\pub\\geturls\\urls\\"; # directory to store list of "spidered" URLs $nativedirchar="\\"; $logfile = $ENV{'HOME'}.$nativedirchar."geturls.log"; $errlogfile = $ENV{'HOME'}.$nativedirchar."session.log"; } else{ $incominghtmldir = "/geturls/html/"; $incomingftpdir = "/geturls/ftp/"; $incominggopherdir = "/geturls/gopher/"; $urllistdir="/geturls/urls/"; # directory to store list of "spidered" URLs $nativedirchar="/"; $logfile = $ENV{'HOME'}.$nativedirchar."geturls.log"; $errlogfile = $ENV{'HOME'}.$nativedirchar."session.log"; } # --- control variables for "crawling" html files $limittodomain = 1; #allow pages outside of $domainallowed to be downloaded and searched $domainallowed = "nwu.edu"; #name of the domain allowed if $limittodomain=1 $limittoservers = 0; #even more restrictive: limit to list of servers @allowedservers = ("basic.nwu.edu","www.nums.nwu.edu", "ghsl.nwu.edu"); #allowed servers, $limittoserver = 1; #most restrictive: can we switch host servers at all? $traverseurllevels = 4; # go this many levels deep from a url listed in $urllistdir $defaultname = "index.html"; # default name for root directory access using http # ------ Shouldn't need to change anything below this point ------- $currentnestlevel=1; $inmacintosh = 0; @urlsread = (); $urlsearchstring = ""; #if (!&myutil'ispppopen() ){ # &Die("No point in running this without a network connection"); #} open (LOG,">>".$logfile) || &Die("Unable to open $logfile"); $oldfh = select(LOG); $| = 1; select($oldfh); close(STDERR); open(STDERR,">".$errlogfile) || &Die("Unable to open $errlogfile"); select(STDERR); $| = 1; ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime; $mon++; if($min >= 10) { print LOG "\n$mon/$mday/$year $hour:$min\n"; } else { print LOG "\n$mon/$mday/$year $hour:0$min\n"; } if ($#ARGV < 0) { if (opendir ( URLDIR, $urllistdir)) { print LOG "directory opened!\n\t" if $DEBUG; @ARGV=readdir(URLDIR); closedir(URLDIR); local($cnt); for($cnt = 0;$cnt <= $#ARGV; $cnt++) { $ARGV[$cnt]=$urllistdir.$ARGV[$cnt]; print LOG "$ARGV[$cnt]," if $DEBUG; } print LOG "\n" if $DEBUG; } } while ($in = pop(@ARGV)) { &ProcessFile($in) } print LOG "Printing out all read URLS\n*************************\n"; $cnt=0; foreach $url (@urlsread) { $cnt++; print LOG "$cnt: $url\n"; } ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime; $mon++; if($min >= 10) { print LOG "Finished at $mon/$mday/$year $hour:$min\n"; } else { print LOG "Finished at $mon/$mday/$year $hour:0$min\n"; } close(LOG); &MacPerl::SetFileInfo(@file_attributes, $logfile); close(STDERR); &MacPerl::SetFileInfo(@file_attributes, $errlogfile); if($inmacintosh) { &MacPerl'DoAppleScript(<".$fname) || &Die("Unable to open temporary file $fname !!"); $oldfh = select(NGET); $| = 1; select($oldfh); local ($baseurl,$localroot, $relurl); local ($proto, $user, $password, $host, $path); local (@wrk); # make sure that $localwrkdir is global while($line = ) { # skip lines with comments! next if ($line =~ m/[ \t]*[!\\#]/ && length($line)<10); print LOG "line = $line" if $DEBUG; chop $line; $line =~ /(\S+)\s+(\S+)\s+(\S+)\s*(.*)/; # ($pfurl, $outpath, $outfilename) = ($1, $2, $3); #($1 has URL, $2 has output path info,$3 has filename, $4 has rest of line) ($baseurl, $relurl, $localroot) = ($1, $2, $3); #($1 has baseURL, $2 has relatice URL, $3 local root directory) if (length($localroot)>3) { if ($localroot !~ m/:$/) { $localroot .= ":"; } } else { $localroot = ''; } if (( $incominghtmldir =~ m/^$localroot/i ) || ($localroot =~ m/$incominghtmldir/i )) {} # incominghtmldir already prepended to outpath or outpath is contained in else { $localroot=$incominghtmldir.$localroot; } if ($baseurl =~ m/^#/) { print NGET "$baseurl\n"; next; } if ($infourl = &infomacscan($baseurl)) { $baseurl = $infourl; } ($proto, $user, $password, $host, $path) = &url::scan($baseurl.$relurl); if (!$proto) { print NGET "$baseurl\n"; print LOG "Skipping URL (no proto): $baseurl\n"; next; } if (&checkurlsearched($baseurl.$relurl,1)) { #we've done it already! print NGET "$baseurl.$relurl\n"; print LOG "Skipping URL(already read): $baseurl.$relurl\n"; next; } $localwrkdir=$localroot; @wrk = split(/\//,$relurl); while ($#wrk > 0 ) { $localwrkdir .= shift(@wrk) . ":"; } if ($path =~ m,/$,) { $localwrkdir .= shift(@wrk) . ":"; } if ($proto eq "ftp") { require "ftplib.pl"; &ftp'debug(1); # save the server responses into session.log if (!&ftp::open($host, $user, $password)) { print NGET "$baseurl.$relurl\n"; print LOG "Unable to open a connection with $host\n"; $err = &ftp'error(); print LOG "$err\n"; next; } else { print LOG "Successfully opened a connection to $host\n"; } # # If $ path ends in a /, the URL is a directory URL. Instead of retriving # a file, generate a listing and turn it into a proper HTML page. I # should really do a stat and see what kind of file it is, but this # will do for now. # $path = &wwwurl::unescape($path); if ($path =~ m,/$,) { &ftp'timeout(120); @wrk = split(/\//,$path); $file = $wrk[$#wrk].".html"; # # uniquename takes care of the following problem: given a folder and a # potential file name, make sure that the file name contains no :'s, # is <= 31 characters long, and does not already exist. Construct a unique # filename if $file already exists. $macpath is thus guaranteed to be a # proper Mac full file path and to not exist already. # $macpath = &myutil'uniquename($localwrkdir,$file); # # Do this the "proper" way, and chdir into the proper directory, one at # a time. # for($iii = 0;$iii <= $#wrk;$iii++) { &ftp'cwd($wrk[$iii]); } @list = &ftp'dir(); if(@list == ()) { print LOG "Unable to get listing for $path\n"; $err = &ftp'error(); print LOG "$err\n"; print NGET "$pfurl\n"; } else { open (FTPOUT,">".$macpath) || &Die("Unable to open $macpath"); $ftppage = &ftptohtml($host,$path,*list); print FTPOUT $ftppage; close FTPOUT; print LOG "Successfully got a listing for $path from $host\n"; $localwrkdir =~ m/^$localroot(.*)$/; push(@filesread, "$macpath\t$baseurl\t$relurl\t$localroot\t$1\t$file"); push(@urlsread, $baseurl.$relurl); } } else { &ftp'timeout(240); &ftp'binary(); @wrk = split(/\//,$path); if (length($outfilename)<4) { $file = $wrk[$#wrk]; } else { $file=$outfilename; } print LOG "later file set to '$file'"; if($file =~ m/\.htm/) { $macpath = &myutil'uniquename($localwrkdir,$file); } else { $macpath = &myutil'uniquename($localwrkdir,$file); $inmacintosh = 1; } # # Do this the "proper" way, and chdir into the proper directory, one at # a time. # for($iii = 0;$iii < $#wrk;$iii++) { &ftp'cwd($wrk[$iii]); } if (!&ftp'get($file,$macpath)) { print LOG "Unable to get $file\n"; $err = &ftp'error(); print LOG "$err\n"; print NGET "$baseurl.$relurl\n"; } else { print LOG "Successfully retrieved $path from $host\n"; $localwrkdir =~ m/^$localroot(.*)$/; push(@filesread, "$macpath\t$baseurl\t$relurl\t$localroot\t$1\t$file"); push(@urlsread, $pfurl); } } &ftp'close(); } elsif ($proto eq "http") { if(($response = &httpget($baseurl.$relurl)) == 200) { # # httpget returns the HTTP response code, 200 if successful # print LOG "Got $baseurl.$relurl\n"; $localwrkdir =~ m/^$localroot(.*)$/; push(@filesread, "$macpath\t$baseurl\t$relurl\t$localroot\t$1\t$outfilename"); push(@urlsread, $baseurl.$relurl); } else { print LOG "Failed to get $baseurl.$relurl\n"; print(LOG "HTTP/1.0 $response $wwwerror'RespMessage{$response}\n"); print NGET "$baseurl.$relurl\n"; } } elsif ($proto eq "gopher") { require("gopherlib.pl"); local($page,$stuff); if(&gopher'open($host)) { # # Strip off the leading character of the gopher URL path, which is just # an indication of the file type, and translate the URL escapes. # if ($path) { $firstchar = substr($path,0,1); $path = substr($path,1); $path = &wwwurl'unescape($path); } else { $firstchar = "1"; } unless ($firstchar eq "9" || $firstchar eq "I") { $stuff = &gopher'gettext($path); } else { $stuff = &gopher'getbinary($path); } # # At this point, the connection gets closed from the other end. Now we have # to do something with the information. # if ($firstchar eq "1") { # # If it's a gopher listing, rather than a plain file, html'ize it and # print the html file out in the usual incoming HTML folder # $page = &gopher'gopher2html($host,*stuff); $file = $host.".html"; $macpath = &myutil'uniquename($incominggopherdir,$file); open(GOUT,">".$macpath) || &Die("Unable to open $macpath"); print GOUT $page; close GOUT; } else { $page = $stuff; @wrk = split(/\//,$path); $file = $wrk[$#wrk]; $macpath = &myutil'uniquename($incominggopherdir,$file); open(GOUT,">".$macpath) || &Die("Unable to open $macpath"); print GOUT $page; close GOUT; } print LOG "Successfully retrieved $baseurl.$relurl\n"; } else { print LOG "Unable to gopher $baseurl.$relurl\n"; print NGET "$baseurl.$relurl\n"; } } else { print LOG "Cannot handle $baseurl.$relurl at this time\n"; print NGET "$baseurl.$relurl\n"; } # // feed ExtractURLs the complete file spec, the original url ($line), local dir and local filename } # while close(GETME); close(NGET); # &Trash($in); # i don't want to do this! # $mirrorin = &myutil'uniquename($in); # rename($fname,$mirrorin); print LOG "ProcessFile: call ExtractURLs $#filesread times. currentnestlevel= $currentnestlevel.\n"; while (@filesread) { local($line)=shift(@filesread); local($mpath,$burl,$rurl,$mroot,$mdir,$moutfile)=split(/\t/, $line); &ExtractURLs($mpath,$burl,$rurl,$mroot,$mdir,$moutfile); } } #end of ProcessFile # -------------------------------------------------------------------------- sub infomacscan { # local($preferred) = "ftp://grind.isca.uiowa.edu/mac/infomac/"; # local($preferred) = "ftp://ftp.uu.net/archive/systems/mac/info-mac/"; local($preferred) = "ftp://mirror.aol.com/pub/info-mac/"; local($path); if ($_[0] =~ m, /info-mac/,) { # # "/info-mac/" is 10 characters # $lft = index($_[0],"/info-mac/") + 10; $rgt = index($_[0],";"); $path = substr($_[0],$lft,$rgt - $lft); $path = $preferred.$path; # $path =~ s/\.hqx/\.bin/; #because grind stores MacBinaries return $path; } else { return undef; } } # -------------------------------------------------------------------------- sub httpget { # # This subroutine does an http GET. It is a heavily hacked version of # libwww 0.40's get script. It returns the http response code. # local($pname) = "GET"; # # Method = program name # local($method) = $pname; # # uppercase it # $method =~ tr/a-z/A-Z/; local($Version) = "$method/0.5"; # # Set up User-Agent: header # &www'set_def_header('http', 'User-Agent', $Version); # # Set up initial Base URL # local($Base) = ""; # # Time-out in seconds # local($Tout) = 30; # # If-Modified-Since header # local($Ims) = ''; # # No logging if Quiet # local($Quiet) = 0; local($proto, $user, $password, $host, $httppath) = &url::scan($_[0]); if ($proto ne "http") { print (LOG "httpget: The URL is not a proper http URL, bailing out. $_[0] passed\n"); return 0; } local($url) = &wwwurl'absolute($Base, $_[0]); local($hd, $response); local(%headers) = (); local($headers) = ''; local($content) = ''; if ($method eq 'GET') { if ($Ims) { $headers{'If-Modified-Since'} = $Ims; } } print(STDERR "$method $url HTTP/1.0\n") # Show user what it looks like unless $Quiet; # and then do the request # # Note that there's a subtle difference between lrequest and request: # lrequest takes a *pointer* to a URL, while request takes the URL itself. # lrequest munges the URL to point to the actual location, so it needs to # have access to $url itself rather than just a copy. # # $response = &www'request($method, $url, *headers, *content, $Tout); $response = &www'lrequest($method, *url, *headers, *content, $Tout); if (!$Quiet) { foreach $hd (keys(%headers)) # This is cheating, but it shows { # the default headers generated next if ($hd =~ m#^[a-z]#); # by the www.pl request library. print(STDERR "$hd: $headers{$hd}\n"); } print(STDERR "\n"); # And print out the result if ($headers) { local($lheaders) = $headers; $lheaders =~ s/\012/\015/g; print(STDERR "$lheaders"); } else { local($lresponse) = $response; local($lerror) = $wwwerror'RespMessage{$response}; $lresponse =~ s/\012/\015/g; $lerror =~ s/\012/\015/g; print(STDERR "HTTP/1.0 $lresponse $lerror\n"); foreach $hd (keys(%headers)) { next if ($hd =~ m#^[A-Z]#); print(STDERR "$hd: $headers{$hd}\n"); } } print(STDERR "\n"); } if ($content && $response == 200) { # # Construct a reasonable name for the incoming file # $httppath = &wwwurl::unescape($httppath); @wrk = split(/\//,$httppath); if ($httppath =~ m,/$,) { if ($#wrk>0) { $filename=$defaultname; } else { # # It was a directory URL. The name of the file will be as much of the # hostname as can be safely used (given the Mac file name limit) + ".html". # If this file already exists (for whatever reason) try to find a unique # name. # $filename = $host.".html"; } } else { $filename=$wrk[$#wrk]; } $macpath = &myutil'uniquename($localwrkdir,$filename); &mymakedir($localwrkdir); open (OUT,">".$macpath) || &Die("Unable to open $macpath"); # # Convert from Unix text to Mac text. Rely on the content-type header # to tell us what is text and what is not. If the file is html, # add in a /i) { if ($content !~ m//\012\012/i; } } else { $content = "\012\012$url\012\012".$content; } $content =~ s/\012/\015/g; } elsif($headers{'content-type'} =~ m,text/,) { $content =~ s/\012/\015/g; } print OUT $content; close(OUT); &MacPerl::SetFileInfo(@file_attributes, $macpath); local(@macnames) = split(/\:/,$macpath); $outfilename=$macnames[$#macnames]; } return $response; } # -------------------------------------------------------------------------- # call looks like: &ExtractURLs($mpath,$burl,$rurl,$mroot,$mdir,$moutfile); # mpath is the complete mac file description # burl is the base URL # rurl is relative to the base # mroot is the local root # mdir is the local additional directory spec # moutfile is the local file name sub ExtractURLs { if ( $currentnestlevel > $traverseurllevels ) { return 0; } # check if we are allowed to nest this deep. local($macfilespec,$baseURL,$relURL, $macRoot, $macDir, $macFilename)=@_; return 0 unless open( EXTRACT,"<".$macfilespec ); $macFilename="from".$macFilename; $macFilename =~ s,([^\.]+)\..+,$1,; local($fromproto, $user, $password, $fromhost, $frompath) = &url::scan($baseURL.$relURL); local($urllistfilepath)=&myutil::uniquename($urllistdir,$macFilename); local($line); local($urlfound); local($proto, $host, $path); local($isdir); # # precheck and prepare the path in case we do relative addressing # guaranteed that path does not start with a / # # now check to see if the path already has us in a directory # $frompath =~ s,^/+(.*),$1,i; #remove beginnng slashes (won't be any) local($isdir) = 0; if ( $frompath =~ m,/$, ) { $isdir=1; } $frompath =~ s,(.*)/+$,$1,i; #remove ending slashes if (( $frompath eq "" ) || ( $isdir )) { # everything will work - $frompath ends in a directory or is empty; } else { local(@urldirs) = split(/\//,$frompath); if ($isdir==0) { pop(@urldirs); #remove the filename from the array } while (($urldirs[0] eq "") && ( $#urldirs > 0)) { # pop off empty entries shift(@urldirs); } $frompath=$urldirs[0]; for ($i=1;$i<=$#urldirs;$i++) { if ( $urldirs[$i] ne "" ) { $frompath = "$frompath/".$urldirs[$i]; } } } ((print LOG "ExtractURLS: Error: from URL=$fromURL;\n\t\t fromProto=$fromproto,fromHost=$fromhost, basepath=$frompath\n") && (return 0)) if ( ($fromproto eq "") || ($fromhost eq "") ); local($basepath)=$frompath; if ( open( URLSOUT, ">".$urllistfilepath ) ) { while($line = ) { chop $line; while ($line =~ m/$urlsearchstring/ig) { $urlfound=$1; if($urlfound =~ m/(.+)\?/i) { next; # don't get query fields } if($urlfound =~ m/mailto:/i) { next; # don't get mailtos - this should be in proto, but?? } if($urlfound =~ m/\.gif/i) { next; # don't get gifs } if($urlfound =~ m/\.jpe?g/i) { next; # don't get gifs } $urlfound =~ s/([^#]+)#.*/$1/i; # remove internal page links $urlfound =~ s/(.+)\/index.htm.*/$1/i; $urlfound =~ s/(.+)\/default.htm.*/$1/i; ($proto, $user, $password, $host, $path) = &url::scan($urlfound); if ((length($proto)==0)) { # # do relative addressing! # $basepath=$frompath; $host=$fromhost; $proto=$fromproto; #better be http! $path=$urlfound; #whole thing should be path @urldirs = split(/\//,$basepath); while ( $path =~ s,/?\.\./(.*),$1,ig ) { # decrement back one directory print LOG "Removing one directory from $frompath for this entry only\n"; pop(@urldirs); } $basepath=$urldirs[0]; for ($i=1;$i<=$#urldirs;$i++) { if ( $urldirs[$i] ne "" ) { $basepath = "$basepath/".$urldirs[$i]; } } # remove terminating '/' $path =~ s,^/+(.*),$1,; #remove slashes at the beginning of $path if ( $basepath ne '') { if ( $path !~ m/^$basepath/) { $path=$basepath."/".$path; } } $urlfound=&url::const($proto, $user, $password, $host, $path); print LOG "Relative url: $urlfound \t $host \t $path \n"; } else { print LOG "Extracting url: $urlfound\n"; } if (( $limittodomain == 0 || ($host =~ m/$domainallowed/io) ) && ( ($limittoservers == 0) || ($host =~ m/@allowedservers/io))) { ( ($limittoserver == 0) || ($host eq $fromhost))) { @urldirs = split(/\//,$path); if ( (length($path)<5 ) ) { if (length($host)<20) { $path = $host."/index.html"; } else { $path = $host; } } elsif ( $path =~ m,/$, ) { if (length($path)<20) { $path .= "index.html"; } elsif (length($urldirs[$#urldirs])<20) { $path = $urldirs[$#urldirs]."/index.html"; } else { $path = $urldirs[$#urldirs]; } } else { # not the end of a directory, lets just make it the name of the file $path = $urldirs[$#urldirs]; } while($path =~ s,/,_,ig) {} if ($urlfound =~ m/^$baseURL(.*)/) { print URLSOUT "$baseURL\t$1\t$macRoot\tfrom $baseURL$relURL\n" if ($path !~ m,#,); } } # if the url is on an allowed server } # end of while a href on this line } # while something to EXTRACT close (URLSOUT); } # if able to open the file to extract from print LOG "Done Extracting URLS from '$filename'.\n"; close (EXTRACT); # now process the newly made file! $currentnestlevel++; &ProcessFile($urllistfilepath); $currentnestlevel--; } #end of ExtractURLs # -------------------------------------------------------------------------- sub ftptohtml { # # Subroutine to convert an ftp listing into a valid HTML page # local($line,$furl); local($host,$path,*list) = @_; # # Create an HTML header. # local($page) = "\n\nftp listing of ".$host.":/".$path."\n\n"; # # For each line, create a reasonable approximation to a valid link # $page .= "\n
\n";
    foreach $line (@list) {
#
#  If the total line appears, make it a link to the directory whose list was
#  just retrieved
#
       if($line =~ m/^total/) {
           $page .= "$line\n";
           next;
       }
       local(@parts) = split(/[ \t]+/,$line);
#
#  Assumes the file name is the last part of $line
#
       local($lp) = length($parts[$#parts]);
       local($ll) = length($line);
       local($dirl) = substr($line,0,$ll-$lp);
       local($fname) = $parts[$#parts];
       $dirl =~ s/[ ]+/\t/;
       if ($line =~ m/^d/) {
           $furl = "ftp://$host/$path".$fname."/";
       } else {
           $furl = "ftp://$host/$path".$fname;
       }
       $page .= "$dirl$fname\n";
    }
    $page .= "
\n\n\n"; return $page; } # -------------------------------------------------------------------------- sub checkurlsearched { local($newurl,$exact) = @_; local($found) = 0; foreach $readurl (@urlsread) { if ($exact) { if ($readurl eq $newurl ) { $found = 1; last; } } else { if ($readurl =~ /$newurl/i) { $found = 1; last; } } } $found; } # -------------------------------------------------------------------------- sub mymakedir { # # Make sure that $dir exists. $dir should be a proper Macintosh # directory path, ending with a :. # local($dir) = @_; if (-e $dir) {} else { mkdir($dir,0777); } }