#This is the global configuration file for the Labrador Web Crawler

#if omitted, defaults to the folder above the binary
Base /users/students4/level4/macdonch/src/project/labrador/

#what port to run the dispatcher on, default 2680
DispatcherPort 2680

#minimum number of seconds between each request to a
#given hostname
PerHostDelay 60

#time until a robots.txt file expires
RobotsTxtExpiry 25

#where to put a robots.txt cache directory
#defaults to data/robots.txt/
#RobotsTxtCache data/robots.txt/

#required - the method used to obtain the initial URLs (the root set)
#TODO not implemented
ObtainURLs File data/rootset.txt

#URL allocation strategy - eg DFS, BFS etc
#implementation detail - BFS adds to the end of the main queue,
#DFS adds to the start
URLAlloc BFS

#takes URLs from the head of the main queue and allocates
#each one to a crawler host
#this only encapsulates data manipulation, not storage
#CrawlerAlloc PerHost
CrawlerAlloc AnyHost

#URLFilter occurs as URLs are passed to the dispatcher
#from the subcrawlers, ON THE DISPATCHER
#URLFilter ModuleHandler (params)

#we want DNSLookup to be first
URLFilter DNSLookup
#passes to another process across a pipe or a socket to perform async lookups; ignores the result

URLFilter File Blacklist data/blacklist.txt
#URLFilter File Whitelist data/whitelist.txt

#next two are fairly self-explanatory - basically crawler trap protection
URLFilter URLDepth 15
URLFilter Length 1024

URLFilter Regexp ^http://www\.dcs\.gla\.ac\.uk/
#returns BAD if $url does NOT match this regexp

#TODO figure out how to share the robot's data structures with other things

#how far into a crawl to go; should always come last
#not happy - chicken & egg syndrome
#URLFilter LinkDistance 3

DataPersistence Hash GDBM_File
DataPersistence Array Tie::File

#end the crawl after successfully downloading MaxCrawlURLs URLs
#defaults to 0, which means unlimited
MaxCrawlURLs 0
#MaxCrawlURLs 10

#FROM HERE DOWN IS FOR THE SUBCRAWLERS

SpiderProxy http://wwwcache.dcs.gla.ac.uk:8080
#SpiderProxyUsername
#SpiderProxyPassword

#required - the name field each spider should give in its HTTP headers
SpiderName Labrador
#required
SpiderVersion 0.01
#required - the email field each spider should give in its HTTP headers
SpiderEmail macdonch@dcs.gla.ac.uk

#other possible options are img
FollowLinks a area meta link

ExtensionsBlacklist jp(?:e?)g gif png zip xls swf dvi avi movie mpg* mp3 bmp tiff tar\.gz tar css
ContentTypeWhitelist text/html text/plain application/xhtml+xml application/postscript application/pdf text/xml application/vnd.ms-powerpoint
HTMLContentTypeWhitelist text/html application/xhtml+xml text/xml

#not sure how these relate to HTML/pdf/ps etc
#ReceivePage HandlerModule (params)
ReceivePage Terrier
ReceivePage LinkAnalysis

#map content types given in HTTP headers to content classes
#eg text/html -> HTML
ContentMap data/content_map.txt

#TODO design content filtering for topic-driven crawlers
#HandleContent ContentClass HandlerModule
HandleContent HTML HTML
HandleContent HTML LinkAnalysis
HandleContent PDF PDF
#HandleContent - your custom content handler goes here!

#SpiderSyncStats - time in seconds between each report of stats to the dispatcher
#default 3600 (1 hour)
#stats: number of requests done, bytes in, bytes out, CPU seconds used etc; memory footprint?
SpiderSyncStats 60

#TODO should these be %h %p rather than $HOSTNAME $PID etc?
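#for illustration only: using the placeholders documented further below,
#the access log path below would expand to something like
#/tmp/labrador/accesscrawler01-1234.log for a hypothetical crawler
#running as PID 1234 on a host named crawler01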
SpiderAccessLog /tmp/labrador/access%H-%P.log
SpiderMessageLog /tmp/labrador/message$HOSTNAME-$PID.log

# %a - Remote IP address
# %A - Remote hostname
# %d - sprintf standard date format
# %f - Filename
# %p - Remote port
# %T - Time taken to make the request
# %t - Current epoch time
# %m - Request method
# %q - Querystring
# %U - Entire URL requested
# %u - URI of the request
# %s - HTTP status code
# %S - Protocol scheme (eg http, https)
# %r - Referring URL
# %c - Size of content downloaded, uncompressed (excluding headers)
# %C - Size of content downloaded, compressed (excluding headers)
# %P - PID of the requesting crawler
# %H - Hostname of the requesting crawler

SpiderAccessLogFormat "%t %s %U"
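#for illustration only: given the "%t %s %U" format above, a single access
#log entry would look something like the following (hypothetical values):
#  1086015437 200 http://www.dcs.gla.ac.uk/index.html
#i.e. the epoch time of the request, the HTTP status code, and the full URL requested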