#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

# Charset assumed when an HTTP response or local file declares none.
enc_fallback = 'utf-8'


from optparse import OptionParser
# Command line: exactly one positional argument — the URL or path to leech.
parser = OptionParser(usage='%prog [options] { URL | PATH }',
	description='Leech stuff, process it, and recurse!')
optz, argz = parser.parse_args()
if len(argz) != 1: parser.error('Need exactly one argument.')
src = argz[0]


import itertools as it, operator as op, functools as ft
from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen
from urlparse import urljoin
import re

# Load the source document as unicode text: fetch over HTTP(S) when src
# looks like a URL, otherwise read it as a local file path.
if re.search('^https?://', src):
	req = urlopen(src)
	# Read the charset from the Content-Type parameter explicitly.
	# The old "req.headers.plist[0]" hack grabbed whichever MIME parameter
	# happened to come first and only handled the empty-list case.
	enc = req.headers.getparam('charset') or enc_fallback
	try: page = req.read().decode(enc)
	finally: req.close()  # don't leak the connection
else:
	# Close the file handle deterministically (was left open before).
	with open(src) as src_file: page = src_file.read().decode(enc_fallback)


# Narrow the parsed page down to the wiki content container:
# <td class="pagecontent"> then <div class="wiki-content"> inside it.
doc = BeautifulSoup(page)
content_cell = doc.find('td', {'class': 'pagecontent'})
soup = content_cell.find('div', {'class': 'wiki-content'})

# The first <ul> in the content is the (possibly nested) table of contents;
# collect the first <a> of every <li> under it.
links_top = soup.find('ul')
links = [item.find('a') for item in links_top.findAll('li')]

# Output document: for each link, a heading followed by the linked page's
# wiki content, with image references rebased to absolute URLs.
result = BeautifulSoup()

for link in links:  # dropped unused enumerate() index
	href = urljoin(src, link['href'])
	req = urlopen(href)
	# Read the charset from the Content-Type parameter explicitly; the old
	# "plist[0]" hack grabbed whichever MIME parameter came first and only
	# handled the empty-list case.
	enc = req.headers.getparam('charset') or enc_fallback
	try: page = req.read().decode(enc)
	finally: req.close()  # don't leak the connection

	# Heading level = nesting depth of the link in the <ul> tree: count the
	# non-<ul> ancestors (the <li> levels) between the link and the top list.
	link_up, nesting = link, 0
	while True:
		link_up = link_up.findParent()
		if not link_up or link_up == links_top: break
		if link_up.name != 'ul': nesting += 1

	# e.g. nesting=2 -> "<h2>*** title ***</h2>" (markings shrink with depth)
	result.append( '<h{level}>{markings} {text} {markings}</h{level}>'\
		.format(text=link.text, markings='*'*(5-nesting), level=nesting) )

	# Extract the same content container from the linked page and make every
	# <img> src absolute so it survives relocation of the assembled document.
	sub = BeautifulSoup(page)\
		.find('td', {'class': 'pagecontent'})\
		.find('div', {'class': 'wiki-content'})
	for img in sub.findAll('img'):
		img['src'] = urljoin(href, img['src'])
	result.append(sub)

print(result)
