C code for SCOUTGET (URL fetcher)

C Code to fetch URLs

This code is used by the Lycos project to fetch URLs. For an example, see the check-rnw.perl page that shows how to implement the standard for robot exclusion.

This code compiles with CERN's libwww version 2.14, and is known to require changes to compile with version 2.15.


/*****************************************************************
 * scoutget: Fetch a URL (link with libwww version 2.14)
 ****************************************************************/
#include <stdio.h>
#include <HTUtils.h>
#include <HTParse.h>
#include <HTAccess.h>
#include <HTAnchor.h>
#include <HTStream.h>

/*
 * Global configuration for the fetcher.
 *
 * PUBLIC/PRIVATE are linkage macros that presumably come from libwww's
 * HTUtils.h (PUBLIC = external linkage, PRIVATE = static) -- confirm
 * against the library headers.  The PUBLIC variables below are
 * presumably referenced by the libwww / LineMode sources this program
 * is linked with; their names and types should not be changed without
 * checking those sources.
 */

/* printf-style format for an anchor reference; %d is the reference number. */
#define REF_MARK "[%d]"
/* Text used as the end-of-document marker (see end_mark below). */
#define END_MARK "     [End]"
#define SCREEN_WIDTH 79			/* Default width of the screen */ 

/* Both are assigned in main() before any document is loaded. */
PUBLIC char * HTAppName;		/* Application name */
PUBLIC char * HTAppVersion; 		/* Application version */

PUBLIC  int  HTScreenWidth   = SCREEN_WIDTH; /* By default */
PUBLIC  int  HTScreenHeight  = -1;	/* Undefined */
PUBLIC  BOOL display_anchors = YES;	/* anchor will be shown in text? */
/* NOTE(review): `interactive` and `output_file_name` are file-private and
 * are not referenced anywhere in the visible code. */
PRIVATE  BOOL interactive     = YES;	/*  e.g. shows prompts etc */
PRIVATE  char * output_file_name = NULL; /* -o xxxx */
/* Destination stream for fetched data; main() points it at stdout. */
PUBLIC HTStream* HTOutputStream;

PUBLIC char * start_reference = NULL;   /* Format string for start anchor */
PUBLIC char * end_reference = REF_MARK; /* for end anchor */
PUBLIC char * reference_mark = "[%d] "; /* for reference lists */
/* NOTE(review): `refhead` is file-private and unused in the visible code. */
PRIVATE char * refhead = NULL;		/* Reference list heading */
PUBLIC char * end_mark = END_MARK;      /* Format string for [End] */


/*
 * main: fetch the single URL named on the command line and write it to
 * stdout through a libwww file-writer stream.
 *
 * Arguments:
 *   argv[1]  the URL to fetch (exactly one argument is required).
 *
 * Exit status:
 *   0  the URL was parsed, an anchor was found for it and HTLoadAnchor()
 *      reported success; a trailing newline is then written to stdout.
 *   1  wrong number of arguments (usage message on stderr), or any of
 *      the parse / anchor lookup / load steps failed.
 *
 * main is declared to return int, as a hosted C program requires, and
 * reports its status with `return`, which is equivalent to calling
 * exit() with the same value.
 */
int main(int argc, char **argv)
{
   char *ref;
   HTParentAnchor *anchor;

   if (argc != 2) {
      fprintf (stderr, "Usage: scoutget <url>\n");
      return 1;
   }

   /* `logfile` is not declared in this file; presumably it is libwww's
    * global log stream -- confirm against HTAccess.h.  The fopen()
    * result is deliberately not checked here: a failure simply stores
    * NULL in logfile. */
   logfile = fopen("/tmp/test.logfile", "a");

   /* Bound the total run time: SIGALRM is raised after 150 seconds.
    * No handler is installed in this file, so unless the library
    * installs one the default action terminates the process. */
   alarm (150);

   /* HTFormatInit(); -- left disabled, as in the original source */
   HTFileInit();

   HTAppName = "Lycos";	/* Application name */
   HTAppVersion = "0.9";	/* Application version */

   /* Request source output and direct it to stdout. */
   HTOutputFormat = WWW_SOURCE;
   HTOutputStream = (HTStream *) HTFWriter_new (stdout);

   /* Each step runs only if the previous one yielded a non-null /
    * non-zero result; the first failure falls through to status 1. */
   if ((ref = HTParse(argv[1], "", PARSE_ALL)) &&
       (anchor = (HTParentAnchor *) HTAnchor_findAddress(ref)) &&
       HTLoadAnchor((HTAnchor *)anchor)) {
      printf ("\n");
      return 0;
   }

   return 1;
}


Linking SCOUTGET and CERN libwww

Here is a csh script that compiles scoutget.c and links it against CERN libwww 2.14 (note that it includes code from the text-mode browser www).
#!/bin/csh
#
# Compile scoutget.c together with a few libwww / line-mode browser
# sources and link the result against libwww, producing ./scoutget.
#
# First build libwww 2.14, and set WDIR to point to it.
# scoutget.c is taken from the current directory; every library path is
# anchored at WDIR, so the script can be run from any directory.

set WDIR = /afs/cs.cmu.edu/project/scout/src/libwww

# NOTE(review): the "next" component of the -L path is presumably the
# per-platform build directory of libwww -- adjust for your platform.
cc -g -o scoutget -I${WDIR}/lib/WWW/Library/Implementation \
	${WDIR}/lib/WWW/LineMode/Implementation/DefaultStyles.c \
	${WDIR}/lib/WWW/LineMode/Implementation/GridText.c \
	${WDIR}/lib/WWW/Library/Implementation/HTTP.c \
	${WDIR}/lib/WWW/Library/Implementation/HTML.c scoutget.c \
	-L${WDIR}/lib/WWW/Library/next -lwww

Sample Run


fz% scoutget http://fuzine.vperson.com/
MIME-Version: 1.0
Server: CERN/3.0pre2
Date: Friday, 08-Jul-94 18:36:43 GMT
Content-Type: text/html
Content-Length: 1201
Last-Modified: Friday, 24-Jun-94 22:00:46 GMT

<html>
<title>Welcome to the Fuzine Web Server</title>

<h1><img src="turtle.gif" align=bottom>Welcome to the<br>
<img src="gray140.gif" align=top>Fuzine Web Server<img src="wolf2-r.gif" align=top></h1> <p>

<body>
<hr>

<h2>Local Resources </h2>
<p>
<ul>
<li> <a href="home.html">Michael Mauldin's Home page</a>
<li> <a href="lycos-home.html">Lycos Home page</a>
<li> <a href="http://fuzine.vperson.com/scout/home.html">Scout Home page</a>
<li> <a href="julia.html">Julia's Home page</a>
<li> <a href="http://fuzine.vperson.com/popular.html">Most popular URLs on this server</a>
</ul>

<p>
<h2>Other Servers at CMU</h2>
<p>
<ul>
<li> <a href="http://www.cmu.edu">Carnegie Mellon University</a>
<li> <a href="http://www.cs.cmu.edu:8001/Web/FrontDoor.html">School of Computer Science</a>
<li> <a href="http://www.mt.cs.cmu.edu/cmt/CMT-home.html">Center for Machine Translation</a>
<li> <a href="http://thule.mt.cs.cmu.edu:8001/">The Thule Server at the CMT</a>
</ul>


</body>
<p> <hr>
<address>Last updated 25-Jun-94 by <a href="home.html">fuzzy@cmu.edu</a></address>


Last updated 8-Jul-94