#!/usr/bin/env python

'''
racoon - web hoarder.

The script works like a web spider, but its output is a single html
file containing all the fetched pages. Its sole purpose is to create
single-html documents from multi-page guides and manuals, for
ease of use with simple readers, like palm/ppc/smartphones.

Composition of the resulting page is simple: wherever there's a link,
the linked content is inserted in its place. Crawling rules are defined
by the regular expressions below, plus a few hardcoded checks in the
parse loop.

The resulting html can then be converted to plain text by (for example)
the html2text.py script (by Aaron Swartz), so it can be used w/o dumb
and slow browsers. I use "Weasel Reader" (w/ preprocessing by the
makeztxt util) to read the results on a palm device.
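
Example invocation (address and file name here are hypothetical):
	racoon.py -c 50 http://example.com/guide/index.html guide.html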

Feel free to use this code as you see fit.
'''

from optparse import OptionParser
parser = OptionParser(usage='%prog [options] ADDRESS FILE', description='Fetch ADDRESS and all subpages into a single FILE.')
parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='output more operational data')
parser.add_option('-c', '--max-pages', action='store', type='int', dest='max_count', metavar='INT', default=0, help='max number of pages to fetch, 0 - no limit (default)')
optz, argz = parser.parse_args()

if len(argz) != 2: parser.error('Invalid arguments')
src, dst = argz

import logging as log
import re, urllib2, inspect

log.basicConfig(level=log.DEBUG if optz.verbose else log.INFO)

# These patterns will be (repeatedly) used later
a_ = re.compile( # opening hyperlink tag dissection pattern
	r'(\s*<(dt|h\d)[^>]*>[^<]*)?' # optional enclosing <dt>/<h*> opener w/ text before the link
	r'<a[^>]+?(href=([^> ]+))?[^>]*>', # the <a> tag itself; group 4 holds the quoted href value
	re.I
)
_a = re.compile( # closing hyperlink tag dissection pattern
	r'</a\s*>'
	r'([^<]*</(dt|h\d)\s*>)?', # optional matching </dt>/</h*> closer after the link
	re.I
)
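# Illustration (hypothetical markup): on '<h2><a href="ch1.html">Ch 1</a></h2>',
#  a_ consumes the '<h2><a ...>' opener and captures '"ch1.html"' as group 4,
#  while _a consumes the matching '</a></h2>' pair.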
page_body = re.compile(r'<body[^>]*>(.*)</body\s*>', re.I)
page_strip = ( # tags/patterns to strip, leaving their contents (group 1) intact
	re.compile(r'<\s*pre[^>]*>(.*?)<\s*/\s*pre\s*>', re.I),
	re.compile(r'<a\s+href="\.\./[^"]+"[^>]*>(.*?)</a\s*>', re.I),
	re.compile(r'<a[^>]*>(\d+\.\d+)\w?</a\s*>', re.I),
)
page_trash = ( # patterns to erase completely (like page header/footer)
	# Top nav
	re.compile(r'.*<!-- qtag -->'),
	re.compile(r'.*<!-- qbegin -->'),
	# Bottom nav / refs, if any
	re.compile(r'<p>References:.*'),
	re.compile(r'<!-- aend -->.*'),
	re.compile(r'<!-- lastfooter -->.*'),
	# Any leftover nav
	re.compile(r'<a\s+href="[^"]+"[^>]*><img[^>]+></a>'),
	# Footnote header/footer
	re.compile(r'.*alt="search">\s*</a>\s*<hr>'),
	re.compile(r'<hr>\s*<p>\s*<a href="[^"]+" rev=subdocument>.*'),
)
page_drop = ( # drop fetched pages w/ these patterns (like "error 404")
	re.compile(r'HTTP Error 404 - File or Directory Not Found\.'),
)
link_skip = ( # link patterns not to fetch/insert
	re.compile(r'.*/contact/.*'),
	re.compile(r'.*/support/.*'),
	re.compile(r'.*/services/.*'),
	re.compile(r'.*/cgi-bin/.*'),
	re.compile(r'.*mailto:.*'),
	re.compile(r'(about|eskimo|search|feedback|copyright)\.html'),
	re.compile(r'.*/games'),
)
link_abs = re.compile(r'^\w+://')
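# e.g. link_abs catches hypothetical urls like 'http://example.com/x' or
#  'ftp://host/y', but not relative references like 'ch1.html' or '../ch2.html'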

# Global set of already used links
links = set()

log.debug('Init completed...')

def parse(url, core=None):
	level = len(inspect.stack())-1 # recursion depth, measured via the interpreter stack (1 = top-level page)
	doc = [] # container for the pages being fetched in this and child frames

	# Construct page path (aka chroot)
	if not core: core = url
	if not core.endswith('/'): core = core.rsplit('/', 1)[0]+'/'
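	# e.g. a hypothetical 'http://example.com/guide/index.html' yields
	#  the core 'http://example.com/guide/', which relative links get joined to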

	# Fetch the page
	log.debug('Fetching page: %s'%url)
	try: src = urllib2.urlopen(url)
	except urllib2.URLError, err: # URLError also covers its HTTPError subclass
		log.debug('Fetch failed: %s'%err)
		return '' # not the best practice, but oh well

	# Since newlines in html are essentially irrelevant...
	line = src.read()
	line = line.replace('\n', ' ').replace('\r', ' ') # strip them, for pcre usage convenience (no need for DOTALL)

	# Check if page contains valid info, not some error message
	for pat in page_drop:
		if pat.search(line):
			log.debug('Drop pattern detected, dropping this page entirely...')
			return ''

	log.debug('Performing initial stripping on %s bytes...'%len(line))
	# Only the <body> part is considered relevant
	match = page_body.search(line)
	if match: line = match.group(1)
	log.debug('Got body: %s bytes'%len(line))

	# Patterns, considered irrelevant, which can break sequential order - like quicknav
	for pat in page_trash:
		while 1: # one match at a time - each removal can expose new matches, so findall/finditer won't do
			match = pat.search(line)
			if match: line = line[:match.start()] + line[match.end():]
			else: break
	log.debug('Stripped trash, %s bytes left...'%len(line))

	# Patterns to strip, preserving some relevant contents (like "<pre>-</pre>" or some links)
	if level > 1: # yep, that's a bit dirty
		for pat in page_strip:
			while 1: # one match at a time - each replacement can expose new matches, so findall/finditer won't do
				match = pat.search(line)
				if match: line = line[:match.start()] + match.group(1) + line[match.end():]
				else: break
		log.debug('Stripped defined tags, %s bytes left...'%len(line))

	log.debug('Stripping completed (%s bytes left), parsing hyperlinks...'%len(line))

	# Main loop. Line is used as a kinda random-access buffer
	while line:
		log.debug( 'Hyperloop iteration on depth %s: %s bytes left'%(level, len(line)) )
		# The tricky and the most important part here is
		#  finding and parsing hyperlinks. Nothing else matters ;)
		m_as = a_.search(line) # opening a-tag

		if m_as:
			try: url = m_as.group(4).strip('"\'') # href url w/o quotes
			except AttributeError: url = None # no href captured (group 4 is None)
			else:
				log.debug('Found link: %s'%url)
				url = url.split('#',1)[0] # strip the id-jump part; not strictly necessary, but helps the skip-pattern check below
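				# e.g. a hypothetical 'ch2.html#sec3' becomes just 'ch2.html' here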
				for pat in link_skip:
					if pat.match(url):
						log.debug('Link matched skip pattern')
						url = None # to skip the next part w/o indent, oh py :)
						break
			m_ae = _a.search(line) # closing a-tag, matched to preserve <a>-</a> contents
			if (
				url
				and not link_abs.search(url) # absolute urls are considered external
				and not url.startswith('/') # unrelated pages
				and not url.startswith('..') # cross-chapter links
			):
				url = core + url # since it's relative here, prepend core to construct an absolute url
				if url not in links: # prevent repeats, in case of inter-linking (or harmful quicknav)
					log.debug('Link validation success')
					links.add(url)
					if m_ae: # should always be true, but who knows
						doc.append(line[:m_ae.end()])
						line = line[m_ae.end():]
					log.info('Parsing link: %s // depth: %s'%(url, level))
					if not optz.max_count or len(links) < optz.max_count: doc.append(parse(url)) # recurse
				else: # probably a flashback link to previous chapters - just preserve the contents
					log.debug('Link validation failure: already fetched')
					if m_as and m_ae:
						doc.append(line[m_as.start():m_ae.end()])
						line = line[m_ae.end():]
			else: # id-jump, erroneous link, absolute or intentionally skipped url
				log.debug('Link validation failure: empty, absolute or skipped')
				cut = m_ae.end() if m_ae else m_as.end()
				doc.append(line[:cut])
				line = line[cut:]

		else: # no a-tag - just flush it
			log.debug('No links found, flushing buffer...')
			doc.append(line)
			break

	log.debug('Hyperloop finished')
	return ''.join(doc)


log.debug('Starting up...')

# Now, all that's left to do is to start the recursion...
with open(dst, 'w') as dst_file: dst_file.write(parse(src))
# Simple, huh? :)

log.debug('Finished!')
