This code is used by Lycos to limit its web searching.
These sites had older-style RobotsNotWanted.txt files as of June 9, 1994:

http://babbage.sissa.it
http://gams.cam.nist.gov
http://snodaq.phy.queensu.ca
http://ubu.hahnemann.edu
http://web.nexor.co.uk
http://www.stir.ac.uk
http://www.tandem.com
http://www.win.tue.nl
This code assumes that the executable scoutget is available. Scoutget takes a URL, retrieves the corresponding document, and prints it on standard output.
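The scoutget program itself is not reproduced here. The following stand-in is only a sketch: it assumes Perl's LWP::Simple module is available, takes a URL on the command line, and prints the retrieved document on standard output, mimicking the interface the checker expects. It is not the scoutget used by Lycos.

#!/usr/bin/perl
# Hypothetical stand-in for scoutget (assumes LWP::Simple is installed).
# Usage: scoutget URL  -- prints the retrieved document on stdout.
use LWP::Simple;

my $url = shift or die "usage: scoutget URL\n";
my $doc = get($url);            # returns undef if the fetch fails
defined $doc or exit 1;         # signal failure with a nonzero exit status
print $doc;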
This code also assumes that the directory .robots exists in the current directory; it is used to save copies of the robots.txt files retrieved. As written, the script does not create this directory on its own.
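If you would rather have the script create the cache directory itself, a small guard such as the following (an addition, not part of the original code) could be placed at the start of the main program:

# Hypothetical addition: create the robots.txt cache directory if it is missing.
-d '.robots' || mkdir ('.robots', 0755) || die "cannot create .robots: $!\n";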
#!/usr/bin/perl
# ------------------------------------------------------------------------
# check for server that doesn't want robot accesses

sub off_limits {
    local($url, $app) = @_;
    local($server, $port, $path, $robotfile, $atype, $ret);

    if ($url =~ m#^http://[^/]+:70[^0-9]#i) {
        # gopher masquerading as http, assume okay
        return 0;
    }

    # Parse URL
    ($server, $port, $path) = ($url =~ m#^([a-z]+://[^/]+(:[0-9]+)?)(.*)#i);
    $server =~ tr/A-Z/a-z/;

    if ($server =~ /^http/) {
        $atype = $robotaccess{$server};   # "", "ok", or "disallow paths"

        # If we haven't cached a Robot Exclusion file, check it
        if (length ($atype) < 1) {
            $robotfile = $server . "/robots.txt";
            $rname = '.robots/' . $server;
            $rname =~ s#://#:#;

            if (-e $rname) {
                print "; Already had $rname\n";
            }
            else {
                print "; Looking for $robotfile\n";
                system ("scoutget '$robotfile' > '$rname'");
                $ret = $?;
                # print "Return code from scoutget '$robotfile' is ", $ret, "\n";
                system ("echo '; Return code $ret' >> '$rname'");
            }

            # Load results in the .robots directory
            open (R, $rname);
            $atype = "ok";

          MAIN:
            while (<R>) {
                s/#.*$//;

                if (/^User-agent:.*\W$app\W/io ||
                    /^User-agent:\s*[*]/io) {
                    $atype = "disallow";

                    while (<R>) {
                        if (/Disallow:\s*(.*\S)/io) {
                            $atype .= " $1";
                        }
                        else {
                            last MAIN;
                        }
                    }
                    last MAIN;
                }
            }
            close (R);

            $robotaccess{$server} = $atype;
            print "; robot access: $atype\n";
        }

        # Loop through paths, checking for prefix
        foreach $prohibit (split (/\s+/, $atype)) {
            if (length ($prohibit) > 0 && $path =~ /^$prohibit/) {
                print "; access disallowed for robots\n";
                return (1);
            }
        }

        print "; access allowed for robots\n";
        return (0);
    }
    else {
        return (0);   # no server, can't check, assume okay
    }
}

# ----------------------------------------------------------------
# Main program

$url = $ARGV[0];
$app = $ARGV[1];
if ($app eq "") { $app = "Lycos"; }

if (&off_limits ($url, $app)) {
    print "Access disallowed for user-agent $app\n";
}
else {
    print "Access ok for user-agent $app\n";
}
# Robot Policy file as per Robot Exclusion standard 17-jun-94

User-agent: Lycos      # Match Lycos
Disallow: /spool

User-agent: *          # Match any robot
Disallow: /tmp /spool
fz% check-rnw.perl http://fuzine.vperson.com/tmp/cc00234 Lycos
; Looking for http://fuzine.vperson.com/robots.txt
; robot access: disallow /spool
; access allowed for robots
Access ok for user-agent Lycos

fz% check-rnw.perl http://fuzine.vperson.com/tmp/cc00234 MomSpider
; Already had .robots/http:fuzine.vperson.com
; robot access: disallow /tmp /spool
; access disallowed for robots
Access disallowed for user-agent MomSpider

fz% check-rnw.perl http://fuzine.vperson.com/mlm/lycos-home.html MomSpider
; Already had .robots/http:fuzine.vperson.com
; robot access: disallow /tmp /spool
; access allowed for robots
Access ok for user-agent MomSpider
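The "; Already had" lines in the last two runs refer to the cache file written by the first run. Assuming scoutget exited with status 0, the file .robots/http:fuzine.vperson.com would then hold a copy of the robots.txt shown above, followed by the return-code line the script appends:

# Robot Policy file as per Robot Exclusion standard 17-jun-94

User-agent: Lycos      # Match Lycos
Disallow: /spool

User-agent: *          # Match any robot
Disallow: /tmp /spool
; Return code 0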