#!/usr/bin/env python

'''
racoon - web hoarder.

The script works like a web spider, but its output is a single html
file containing all the fetched pages. Its sole purpose is to create
single-html documents from multi-page guides and manuals, for
ease of use with simple readers, like palm/ppc/smartphones.

Composition of the resulting page is simple: wherever there's a link,
the linked content is inserted in its place. Crawling rules are defined
by the regular expressions below, plus a few hardcoded checks in the
parse loop.

The resulting html can then be converted to plain text by (for example)
the html2text.py script (by Aaron Swartz), so it can be used w/o dumb
and slow browsers. I use "Weasel Reader" (w/ preprocessing by the
makeztxt util) to read the results on a palm device.
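
Example invocation (address and file name here are hypothetical):
	racoon.py -c 50 http://example.com/guide/index.html guide.html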

Feel free to use this code as you see fit.
'''

from optparse import OptionParser
parser = OptionParser(usage='%prog [options] ADDRESS FILE', description='Fetch ADDRESS and all subpages into a single FILE.')
parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='output more operational data')
parser.add_option('-c', '--max-pages', action='store', type='int', dest='max_count', metavar='INT', default=0, help='max number of pages to fetch, 0 - no limit (default)')
optz, argz = parser.parse_args()

if len(argz) != 2: parser.error('Invalid arguments')
src, dst = argz

import logging as log
import re, urllib2, inspect

log.basicConfig(level=log.DEBUG if optz.verbose else log.INFO)

# These patterns will be (repeatedly) used later
a_ = re.compile( # opening hyperlink tag dissection pattern
	r'(\s*<(dt|h\d)[^>]*>[^<]*)?' # optional enclosing <dt>/<h*> opener w/ text before the link
	r'<a[^>]+?(href=([^> ]+))?[^>]*>', # the <a> tag itself; group 4 holds the quoted href value
	re.I
)
_a = re.compile( # closing hyperlink tag dissection pattern
	r'</a\s*>'
	r'([^<]*</(dt|h\d)\s*>)?', # optional matching </dt>/</h*> closer after the link
	re.I
)
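# Illustration (hypothetical markup): on '<h2><a href="ch1.html">Ch 1</a></h2>',
#  a_ consumes the '<h2><a ...>' opener and captures '"ch1.html"' as group 4,
#  while _a consumes the matching '</a></h2>' pair.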
page_body = re.compile(r'<body[^>]*>(.*)</body\s*>', re.I)
page_strip = ( # tags/patterns to strip, leaving their contents (group 1) intact
	re.compile(r'<\s*pre[^>]*>(.*?)<\s*/\s*pre\s*>', re.I),
	re.compile(r'<a\s+href="\.\./[^"]+"[^>]*>(.*?)</a\s*>', re.I),
	re.compile(r'<a[^>]*>(\d+\.\d+)\w?</a\s*>', re.I),
)
page_trash = ( # patterns to erase completely (like page header/footer)
	# Top nav
	re.compile(r'.*<!-- qtag -->'),
	re.compile(r'.*<!-- qbegin -->'),
	# Bottom nav / refs, if any
	re.compile(r'<p>References:.*'),
	re.compile(r'<!-- aend -->.*'),
	re.compile(r'<!-- lastfooter -->.*'),
	# Any leftover nav
	re.compile(r'<a\s+href="[^"]+"[^>]*><img[^>]+></a>'),
	# Footnote header/footer
	re.compile(r'.*alt="search">\s*</a>\s*<hr>'),
	re.compile(r'<hr>\s*<p>\s*<a href="[^"]+" rev=subdocument>.*'),
)
page_drop = ( # drop fetched pages w/ these patterns (like "error 404")
	re.compile(r'HTTP Error 404 - File or Directory Not Found\.'),
)
link_skip = ( # link patterns not to fetch/insert
	re.compile(r'.*/contact/.*'),
	re.compile(r'.*/support/.*'),
	re.compile(r'.*/services/.*'),
	re.compile(r'.*/cgi-bin/.*'),
	re.compile(r'.*mailto:.*'),
	re.compile(r'(about|eskimo|search|feedback|copyright)\.html'),
	re.compile(r'.*/games'),
)
link_abs = re.compile(r'^\w+://')
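# e.g. link_abs catches hypothetical urls like 'http://example.com/x' or
#  'ftp://host/y', but not relative references like 'ch1.html' or '../ch2.html'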

# Global set of already used links
links = set()

log.debug('Init completed...')

def parse(url, core=None):
	level = len(inspect.stack())-1 # recursion depth, measured via the interpreter stack (1 = top-level page)
	doc = [] # container for the pages being fetched in this and child frames

	# Construct page path (aka chroot)
	if not core: core = url
	if not core.endswith('/'): core = core.rsplit('/', 1)[0]+'/'
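	# e.g. a hypothetical 'http://example.com/guide/index.html' yields
	#  the core 'http://example.com/guide/', which relative links get joined to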

	# Fetch the page
	log.debug('Fetching page: %s'%url)
	try: src = urllib2.urlopen(url)
	except urllib2.URLError, err: # URLError also covers its HTTPError subclass
		log.debug('Fetch failed: %s'%err)
		return '' # not the best practice, but oh well

	# Since newlines in html are essentially irrelevant...
	line = src.read()
	line = line.replace('\n', ' ').replace('\r', ' ') # strip them, for pcre usage convenience (no need for DOTALL)

	# Check if page contains valid info, not some error message
	for pat in page_drop:
		if pat.search(line):
			log.debug('Drop pattern detected, dropping this page entirely...')
			return ''

	log.debug('Performing initial stripping on %s bytes...'%len(line))
	# Only the <body> part is considered relevant
	match = page_body.search(line)
	if match: line = match.group(1)
	log.debug('Got body: %s bytes'%len(line))

	# Patterns, considered irrelevant, which can break sequential order - like quicknav
	for pat in page_trash:
		while 1: # one match at a time - each removal can expose new matches, so findall/finditer won't do
			match = pat.search(line)
			if match: line = line[:match.start()] + line[match.end():]
			else: break
	log.debug('Stripped trash, %s bytes left...'%len(line))

	# Patterns to strip, preserving some relevant contents (like "<pre>-</pre>" or some links)
	if level > 1: # yep, that's a bit dirty
		for pat in page_strip:
			while 1: # one match at a time - each replacement can expose new matches, so findall/finditer won't do
				match = pat.search(line)
				if match: line = line[:match.start()] + match.group(1) + line[match.end():]
				else: break
		log.debug('Stripped defined tags, %s bytes left...'%len(line))

	log.debug('Stripping completed (%s bytes left), parsing hyperlinks...'%len(line))

	# Main loop. Line is used as a kinda random-access buffer
	while line:
		log.debug( 'Hyperloop iteration on depth %s: %s bytes left'%(level, len(line)) )
		# The tricky and the most important part here is
		#  finding and parsing hyperlinks. Nothing else matters ;)
		m_as = a_.search(line) # opening a-tag

		if m_as:
			try: url = m_as.group(4).strip('"\'') # href url w/o quotes
			except AttributeError: url = None # no href captured (group 4 is None)
			else:
				log.debug('Found link: %s'%url)
				url = url.split('#',1)[0] # strip the id-jump part; not strictly necessary, but helps the skip-pattern check below
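				# e.g. a hypothetical 'ch2.html#sec3' becomes just 'ch2.html' here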
				for pat in link_skip:
					if pat.match(url):
						log.debug('Link matched skip pattern')
						url = None # to skip the next part w/o indent, oh py :)
						break
			m_ae = _a.search(line) # closing a-tag, matched to preserve <a>-</a> contents
			if (
				url
				and not link_abs.search(url) # absolute urls are considered external
				and not url.startswith('/') # unrelated pages
				and not url.startswith('..') # cross-chapter links
			):
				url = core + url # since it's relative here, prepend core to construct an absolute url
				if url not in links: # prevent repeats, in case of inter-linking (or harmful quicknav)
					log.debug('Link validation success')
					links.add(url)
					if m_ae: # should always be true, but who knows
						doc.append(line[:m_ae.end()])
						line = line[m_ae.end():]
					log.info('Parsing link: %s // depth: %s'%(url, level))
					if not optz.max_count or len(links) < optz.max_count: doc.append(parse(url)) # recurse
				else: # probably a flashback link to previous chapters - just preserve the contents
					log.debug('Link validation failure: already fetched')
					if m_as and m_ae:
						doc.append(line[m_as.start():m_ae.end()])
						line = line[m_ae.end():]
			else: # id-jump, erroneous link, absolute or intentionally skipped url
				log.debug('Link validation failure: empty, absolute or skipped')
				cut = m_ae.end() if m_ae else m_as.end()
				doc.append(line[:cut])
				line = line[cut:]

		else: # no a-tag - just flush it
			log.debug('No links found, flushing buffer...')
			doc.append(line)
			break

	log.debug('Hyperloop finished')
	return ''.join(doc)


log.debug('Starting up...')

# Now, all that's left to do is to start the recursion...
with open(dst, 'w') as dst_file: dst_file.write(parse(src))
# Simple, huh? :)

log.debug('Finished!')
