#!/usr/bin/env python
# -*- encoding: utf-8 -*-

"""Convert many unicode characters to ascii characters that are like them.

I want to collate names so that a last name starting with O-umlaut is sorted
in with the last names starting with O.  Horrors!

So I want many Latin-1 characters to have their umlauts, etc., stripped.
Some of this can be done automatically, but some needs to be done by hand, as
far as I can tell.
"""
__version__='1.0.1'
__author__='Jim Hefferon: ftpmaint at tug.ctan.org'
__date__='2008-July-15'
__notes__="""As sources, used effbot's web site, and
	http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/251871
and
	man uni2ascii
"""

import os, os.path, sys, re
import unicodedata

# These are characters that are not handled automatically by NFKD and
# have a name starting with "LATIN".  Some of these I found on the interwebs,
# but some I did by eye.  Corrections or additions appreciated.
# Hand-made replacements for characters whose Unicode name starts with "LATIN".
# Convention: small letters map to lower-case ASCII; capital letters and
# small capitals map to upper-case ASCII.  Entries that are commented out
# have no sensible ASCII look-alike and are kept only as a reminder.
EXTRA_LATIN_NAMES = {
	# First are ones I got off the interweb
	u"\N{LATIN CAPITAL LETTER O WITH STROKE}": u"O",
	u"\N{LATIN SMALL LETTER A WITH GRAVE}": u"a",
	u"\N{LATIN SMALL LETTER A WITH ACUTE}": u"a",
	u"\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}": u"a",
	u"\N{LATIN SMALL LETTER A WITH TILDE}": u"a",
	u"\N{LATIN SMALL LETTER A WITH DIAERESIS}": u"a",
	u"\N{LATIN SMALL LETTER A WITH RING ABOVE}": u"a",
	u"\N{LATIN SMALL LETTER C WITH CEDILLA}": u"c",
	u"\N{LATIN SMALL LETTER E WITH GRAVE}": u"e",
	u"\N{LATIN SMALL LETTER E WITH ACUTE}": u"e",
	u"\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}": u"e",
	u"\N{LATIN SMALL LETTER E WITH DIAERESIS}": u"e",
	u"\N{LATIN SMALL LETTER I WITH GRAVE}": u"i",
	u"\N{LATIN SMALL LETTER I WITH ACUTE}": u"i",
	u"\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}": u"i",
	u"\N{LATIN SMALL LETTER I WITH DIAERESIS}": u"i",
	u"\N{LATIN SMALL LETTER N WITH TILDE}": u"n",
	u"\N{LATIN SMALL LETTER O WITH GRAVE}": u"o",
	u"\N{LATIN SMALL LETTER O WITH ACUTE}": u"o",
	u"\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}": u"o",
	u"\N{LATIN SMALL LETTER O WITH TILDE}": u"o",
	u"\N{LATIN SMALL LETTER O WITH DIAERESIS}": u"o",
	u"\N{LATIN SMALL LETTER O WITH STROKE}": u"o",
	u"\N{LATIN SMALL LETTER U WITH GRAVE}": u"u",
	u"\N{LATIN SMALL LETTER U WITH ACUTE}": u"u",
	u"\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}": u"u",
	u"\N{LATIN SMALL LETTER U WITH DIAERESIS}": u"u",
	u"\N{LATIN SMALL LETTER Y WITH ACUTE}": u"y",
	u"\N{LATIN SMALL LETTER Y WITH DIAERESIS}": u"y",
	u"\N{LATIN SMALL LETTER A WITH MACRON}": u"a",
	u"\N{LATIN SMALL LETTER A WITH BREVE}": u"a",
	u"\N{LATIN SMALL LETTER C WITH ACUTE}": u"c",
	u"\N{LATIN SMALL LETTER C WITH CIRCUMFLEX}": u"c",
	u"\N{LATIN SMALL LETTER E WITH MACRON}": u"e",
	u"\N{LATIN SMALL LETTER E WITH BREVE}": u"e",
	u"\N{LATIN SMALL LETTER G WITH CIRCUMFLEX}": u"g",
	u"\N{LATIN SMALL LETTER G WITH BREVE}": u"g",
	u"\N{LATIN SMALL LETTER G WITH CEDILLA}": u"g",
	u"\N{LATIN SMALL LETTER H WITH CIRCUMFLEX}": u"h",
	u"\N{LATIN SMALL LETTER I WITH TILDE}": u"i",
	u"\N{LATIN SMALL LETTER I WITH MACRON}": u"i",
	u"\N{LATIN SMALL LETTER I WITH BREVE}": u"i",
	u"\N{LATIN SMALL LETTER J WITH CIRCUMFLEX}": u"j",
	u"\N{LATIN SMALL LETTER K WITH CEDILLA}": u"k",
	u"\N{LATIN SMALL LETTER L WITH ACUTE}": u"l",
	u"\N{LATIN SMALL LETTER L WITH CEDILLA}": u"l",
	u"\N{LATIN CAPITAL LETTER L WITH STROKE}": u"L",
	u"\N{LATIN SMALL LETTER L WITH STROKE}": u"l",
	u"\N{LATIN SMALL LETTER N WITH ACUTE}": u"n",
	u"\N{LATIN SMALL LETTER N WITH CEDILLA}": u"n",
	u"\N{LATIN SMALL LETTER O WITH MACRON}": u"o",
	u"\N{LATIN SMALL LETTER O WITH BREVE}": u"o",
	u"\N{LATIN SMALL LETTER R WITH ACUTE}": u"r",
	u"\N{LATIN SMALL LETTER R WITH CEDILLA}": u"r",
	u"\N{LATIN SMALL LETTER S WITH ACUTE}": u"s",
	u"\N{LATIN SMALL LETTER S WITH CIRCUMFLEX}": u"s",
	u"\N{LATIN SMALL LETTER S WITH CEDILLA}": u"s",
	u"\N{LATIN SMALL LETTER T WITH CEDILLA}": u"t",
	u"\N{LATIN SMALL LETTER U WITH TILDE}": u"u",
	u"\N{LATIN SMALL LETTER U WITH MACRON}": u"u",
	u"\N{LATIN SMALL LETTER U WITH BREVE}": u"u",
	u"\N{LATIN SMALL LETTER U WITH RING ABOVE}": u"u",
	u"\N{LATIN SMALL LETTER W WITH CIRCUMFLEX}": u"w",
	u"\N{LATIN SMALL LETTER Y WITH CIRCUMFLEX}": u"y",
	u"\N{LATIN SMALL LETTER Z WITH ACUTE}": u"z",
	u"\N{LATIN SMALL LETTER W WITH GRAVE}": u"w",
	u"\N{LATIN SMALL LETTER W WITH ACUTE}": u"w",
	u"\N{LATIN SMALL LETTER W WITH DIAERESIS}": u"w",
	u"\N{LATIN SMALL LETTER Y WITH GRAVE}": u"y",
	# Below are the ones that failed automated conversion
	# (O WITH STROKE and L WITH STROKE are already listed above)
	u'\N{LATIN CAPITAL LETTER AE}': u'AE',
	u'\N{LATIN CAPITAL LETTER ETH}': u'D',
	u'\N{LATIN CAPITAL LETTER THORN}': u'TH',
	u'\N{LATIN SMALL LETTER SHARP S}': u'ss',
	u'\N{LATIN SMALL LETTER AE}': u'ae',
	u'\N{LATIN SMALL LETTER ETH}': u'd',
	u'\N{LATIN SMALL LETTER THORN}': u'th',
	u'\N{LATIN CAPITAL LETTER D WITH STROKE}': u'D',
	u'\N{LATIN SMALL LETTER D WITH STROKE}': u'd',
	u'\N{LATIN CAPITAL LETTER H WITH STROKE}': u'H',
	u'\N{LATIN SMALL LETTER H WITH STROKE}': u'h',
	u'\N{LATIN SMALL LETTER DOTLESS I}': u'i',
	u'\N{LATIN SMALL LETTER KRA}': u'q',
	u'\N{LATIN CAPITAL LETTER ENG}': u'N',
	u'\N{LATIN SMALL LETTER ENG}': u'n',
	u'\N{LATIN CAPITAL LIGATURE OE}': u'OE',
	u'\N{LATIN SMALL LIGATURE OE}': u'oe',
	u'\N{LATIN CAPITAL LETTER T WITH STROKE}': u'T',
	u'\N{LATIN SMALL LETTER T WITH STROKE}': u't',
	u'\N{LATIN SMALL LETTER B WITH STROKE}': u'b',
	u'\N{LATIN CAPITAL LETTER B WITH HOOK}': u'B',
	u'\N{LATIN CAPITAL LETTER B WITH TOPBAR}': u'B',
	u'\N{LATIN SMALL LETTER B WITH TOPBAR}': u'b',
	# u'\N{LATIN CAPITAL LETTER TONE SIX}': u'',  # ?B
	# u'\N{LATIN SMALL LETTER TONE SIX}': u'',  # ?b
	u'\N{LATIN CAPITAL LETTER OPEN O}': u'O',
	u'\N{LATIN CAPITAL LETTER C WITH HOOK}': u'C',
	u'\N{LATIN SMALL LETTER C WITH HOOK}': u'c',
	u'\N{LATIN CAPITAL LETTER AFRICAN D}': u'D',
	u'\N{LATIN CAPITAL LETTER D WITH HOOK}': u'D',
	u'\N{LATIN CAPITAL LETTER D WITH TOPBAR}': u'D',
	u'\N{LATIN SMALL LETTER D WITH TOPBAR}': u'd',
	# u'\N{LATIN SMALL LETTER TURNED DELTA}': u'',
	u'\N{LATIN CAPITAL LETTER REVERSED E}': u'E',
	# u'\N{LATIN CAPITAL LETTER SCHWA}': u'',
	u'\N{LATIN CAPITAL LETTER OPEN E}': u'E',
	u'\N{LATIN CAPITAL LETTER F WITH HOOK}': u'F',
	u'\N{LATIN SMALL LETTER F WITH HOOK}': u'f',
	u'\N{LATIN CAPITAL LETTER G WITH HOOK}': u'G',
	# u'\N{LATIN CAPITAL LETTER GAMMA}': u'',
	u'\N{LATIN SMALL LETTER HV}': u'hv',
	u'\N{LATIN CAPITAL LETTER IOTA}': u'I',
	u'\N{LATIN CAPITAL LETTER I WITH STROKE}': u'I',
	u'\N{LATIN CAPITAL LETTER K WITH HOOK}': u'K',
	u'\N{LATIN SMALL LETTER K WITH HOOK}': u'k',
	u'\N{LATIN SMALL LETTER L WITH BAR}': u'l',
	# u'\N{LATIN SMALL LETTER LAMBDA WITH STROKE}': u'',
	# u'\N{LATIN CAPITAL LETTER TURNED M}': u'',
	u'\N{LATIN CAPITAL LETTER N WITH LEFT HOOK}': u'N',
	u'\N{LATIN SMALL LETTER N WITH LONG RIGHT LEG}': u'n',
	u'\N{LATIN CAPITAL LETTER O WITH MIDDLE TILDE}': u'O',
	u'\N{LATIN CAPITAL LETTER OI}': u'OI',
	u'\N{LATIN SMALL LETTER OI}': u'oi',
	u'\N{LATIN CAPITAL LETTER P WITH HOOK}': u'P',
	u'\N{LATIN SMALL LETTER P WITH HOOK}': u'p',
	# u'\N{LATIN LETTER YR}': u'',
	# u'\N{LATIN CAPITAL LETTER TONE TWO}': u'',
	# u'\N{LATIN SMALL LETTER TONE TWO}': u'',
	u'\N{LATIN CAPITAL LETTER ESH}': u'SH',
	# u'\N{LATIN LETTER REVERSED ESH LOOP}': u'',
	u'\N{LATIN SMALL LETTER T WITH PALATAL HOOK}': u't',
	u'\N{LATIN CAPITAL LETTER T WITH HOOK}': u'T',
	u'\N{LATIN SMALL LETTER T WITH HOOK}': u't',
	u'\N{LATIN CAPITAL LETTER T WITH RETROFLEX HOOK}': u'T',
	# u'\N{LATIN CAPITAL LETTER UPSILON}': u'',
	u'\N{LATIN CAPITAL LETTER V WITH HOOK}': u'V',
	u'\N{LATIN CAPITAL LETTER Y WITH HOOK}': u'Y',
	u'\N{LATIN SMALL LETTER Y WITH HOOK}': u'y',
	u'\N{LATIN CAPITAL LETTER Z WITH STROKE}': u'Z',
	u'\N{LATIN SMALL LETTER Z WITH STROKE}': u'z',
	u'\N{LATIN CAPITAL LETTER EZH}': u'S',
	# u'\N{LATIN CAPITAL LETTER EZH REVERSED}': u'',
	# u'\N{LATIN SMALL LETTER EZH REVERSED}': u'',
	u'\N{LATIN SMALL LETTER EZH WITH TAIL}': u's',
	# u'\N{LATIN LETTER TWO WITH STROKE}': u'',
	# u'\N{LATIN CAPITAL LETTER TONE FIVE}': u'',
	# u'\N{LATIN SMALL LETTER TONE FIVE}': u'',
	# u'\N{LATIN LETTER INVERTED GLOTTAL STOP WITH STROKE}': u'',
	u'\N{LATIN LETTER WYNN}': u'w',
	# u'\N{LATIN LETTER DENTAL CLICK}': u'',
	# u'\N{LATIN LETTER LATERAL CLICK}': u'',
	# u'\N{LATIN LETTER ALVEOLAR CLICK}': u'',
	# u'\N{LATIN LETTER RETROFLEX CLICK}': u'',
	# u'\N{LATIN SMALL LETTER TURNED E}': u'',
	u'\N{LATIN CAPITAL LETTER AE WITH MACRON}': u'AE',
	u'\N{LATIN SMALL LETTER AE WITH MACRON}': u'ae',
	u'\N{LATIN CAPITAL LETTER G WITH STROKE}': u'G',
	u'\N{LATIN SMALL LETTER G WITH STROKE}': u'g',
	u'\N{LATIN CAPITAL LETTER EZH WITH CARON}': u'S',
	u'\N{LATIN SMALL LETTER EZH WITH CARON}': u's',
	u'\N{LATIN CAPITAL LETTER HWAIR}': u'HW',
	u'\N{LATIN CAPITAL LETTER WYNN}': u'W',
	u'\N{LATIN CAPITAL LETTER AE WITH ACUTE}': u'AE',
	u'\N{LATIN SMALL LETTER AE WITH ACUTE}': u'ae',
	u'\N{LATIN CAPITAL LETTER O WITH STROKE AND ACUTE}': u'O',
	u'\N{LATIN SMALL LETTER O WITH STROKE AND ACUTE}': u'o',
	u'\N{LATIN CAPITAL LETTER YOGH}': u'J',
	u'\N{LATIN SMALL LETTER YOGH}': u'j',
	u'\N{LATIN CAPITAL LETTER N WITH LONG RIGHT LEG}': u'N',
	u'\N{LATIN SMALL LETTER D WITH CURL}': u'd',
	u'\N{LATIN CAPITAL LETTER OU}': u'OU',
	u'\N{LATIN SMALL LETTER OU}': u'ou',
	u'\N{LATIN CAPITAL LETTER Z WITH HOOK}': u'Z',
	u'\N{LATIN SMALL LETTER Z WITH HOOK}': u'z',
	u'\N{LATIN SMALL LETTER L WITH CURL}': u'l',
	u'\N{LATIN SMALL LETTER N WITH CURL}': u'n',
	u'\N{LATIN SMALL LETTER T WITH CURL}': u't',
	u'\N{LATIN SMALL LETTER DOTLESS J}': u'j',
	u'\N{LATIN SMALL LETTER DB DIGRAPH}': u'db',
	u'\N{LATIN SMALL LETTER QP DIGRAPH}': u'qp',
	u'\N{LATIN CAPITAL LETTER A WITH STROKE}': u'A',
	u'\N{LATIN CAPITAL LETTER C WITH STROKE}': u'C',
	u'\N{LATIN SMALL LETTER C WITH STROKE}': u'c',
	u'\N{LATIN CAPITAL LETTER L WITH BAR}': u'L',
	u'\N{LATIN CAPITAL LETTER T WITH DIAGONAL STROKE}': u'T',
	u'\N{LATIN SMALL LETTER S WITH SWASH TAIL}': u's',
	u'\N{LATIN SMALL LETTER Z WITH SWASH TAIL}': u'z',
	# u'\N{LATIN CAPITAL LETTER GLOTTAL STOP}': u'',
	# u'\N{LATIN SMALL LETTER TURNED A}': u'',
	# u'\N{LATIN SMALL LETTER ALPHA}': u'',
	# u'\N{LATIN SMALL LETTER TURNED ALPHA}': u'',
	u'\N{LATIN SMALL LETTER B WITH HOOK}': u'b',
	u'\N{LATIN SMALL LETTER OPEN O}': u'o',
	u'\N{LATIN SMALL LETTER C WITH CURL}': u'c',
	u'\N{LATIN SMALL LETTER D WITH TAIL}': u'd',
	u'\N{LATIN SMALL LETTER D WITH HOOK}': u'd',
	# u'\N{LATIN SMALL LETTER REVERSED E}': u'',
	# u'\N{LATIN SMALL LETTER SCHWA}': u'',
	# u'\N{LATIN SMALL LETTER SCHWA WITH HOOK}': u'',
	u'\N{LATIN SMALL LETTER OPEN E}': u'e',
	# u'\N{LATIN SMALL LETTER REVERSED OPEN E}': u'',
	# u'\N{LATIN SMALL LETTER REVERSED OPEN E WITH HOOK}': u'',
	# u'\N{LATIN SMALL LETTER CLOSED REVERSED OPEN E}': u'',
	u'\N{LATIN SMALL LETTER DOTLESS J WITH STROKE}': u'j',
	u'\N{LATIN SMALL LETTER G WITH HOOK}': u'g',
	u'\N{LATIN SMALL LETTER SCRIPT G}': u'g',
	u'\N{LATIN LETTER SMALL CAPITAL G}': u'G',
	# u'\N{LATIN SMALL LETTER GAMMA}': u'',
	# u'\N{LATIN SMALL LETTER RAMS HORN}': u'',
	# u'\N{LATIN SMALL LETTER TURNED H}': u'',
	u'\N{LATIN SMALL LETTER H WITH HOOK}': u'h',
	u'\N{LATIN SMALL LETTER HENG WITH HOOK}': u'h',
	u'\N{LATIN SMALL LETTER I WITH STROKE}': u'i',
	# u'\N{LATIN SMALL LETTER IOTA}': u'',
	u'\N{LATIN LETTER SMALL CAPITAL I}': u'I',
	u'\N{LATIN SMALL LETTER L WITH MIDDLE TILDE}': u'l',
	u'\N{LATIN SMALL LETTER L WITH BELT}': u'l',
	u'\N{LATIN SMALL LETTER L WITH RETROFLEX HOOK}': u'l',
	# u'\N{LATIN SMALL LETTER LEZH}': u'',
	# u'\N{LATIN SMALL LETTER TURNED M}': u'',
	# u'\N{LATIN SMALL LETTER TURNED M WITH LONG LEG}': u'',
	u'\N{LATIN SMALL LETTER M WITH HOOK}': u'm',
	u'\N{LATIN SMALL LETTER N WITH LEFT HOOK}': u'n',
	u'\N{LATIN SMALL LETTER N WITH RETROFLEX HOOK}': u'n',
	u'\N{LATIN LETTER SMALL CAPITAL N}': u'N',
	u'\N{LATIN SMALL LETTER BARRED O}': u'o',
	u'\N{LATIN LETTER SMALL CAPITAL OE}': u'OE',
	# u'\N{LATIN SMALL LETTER CLOSED OMEGA}': u'',
	# u'\N{LATIN SMALL LETTER PHI}': u'',
	# u'\N{LATIN SMALL LETTER TURNED R}': u'',
	# u'\N{LATIN SMALL LETTER TURNED R WITH LONG LEG}': u'',
	# u'\N{LATIN SMALL LETTER TURNED R WITH HOOK}': u'',
	u'\N{LATIN SMALL LETTER R WITH LONG LEG}': u'r',
	u'\N{LATIN SMALL LETTER R WITH TAIL}': u'r',
	u'\N{LATIN SMALL LETTER R WITH FISHHOOK}': u'r',
	# u'\N{LATIN SMALL LETTER REVERSED R WITH FISHHOOK}': u'',
	u'\N{LATIN LETTER SMALL CAPITAL R}': u'R',
	# u'\N{LATIN LETTER SMALL CAPITAL INVERTED R}': u'',
	u'\N{LATIN SMALL LETTER S WITH HOOK}': u's',
	u'\N{LATIN SMALL LETTER ESH}': u'sh',
	u'\N{LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK}': u'j',
	# u'\N{LATIN SMALL LETTER SQUAT REVERSED ESH}': u'',
	u'\N{LATIN SMALL LETTER ESH WITH CURL}': u'sh',
	# u'\N{LATIN SMALL LETTER TURNED T}': u'',
	u'\N{LATIN SMALL LETTER T WITH RETROFLEX HOOK}': u't',
	u'\N{LATIN SMALL LETTER U BAR}': u'u',
	# u'\N{LATIN SMALL LETTER UPSILON}': u'',
	u'\N{LATIN SMALL LETTER V WITH HOOK}': u'v',
	# u'\N{LATIN SMALL LETTER TURNED V}': u'',
	# u'\N{LATIN SMALL LETTER TURNED W}': u'',
	# u'\N{LATIN SMALL LETTER TURNED Y}': u'',
	u'\N{LATIN LETTER SMALL CAPITAL Y}': u'Y',
	u'\N{LATIN SMALL LETTER Z WITH RETROFLEX HOOK}': u'z',
	u'\N{LATIN SMALL LETTER Z WITH CURL}': u'z',
	u'\N{LATIN SMALL LETTER EZH}': u's',
	u'\N{LATIN SMALL LETTER EZH WITH CURL}': u's',
	# u'\N{LATIN LETTER GLOTTAL STOP}': u'',
	# u'\N{LATIN LETTER PHARYNGEAL VOICED FRICATIVE}': u'',
	# u'\N{LATIN LETTER INVERTED GLOTTAL STOP}': u'',
	u'\N{LATIN LETTER STRETCHED C}': u'c',
	# u'\N{LATIN LETTER BILABIAL CLICK}': u'',
	u'\N{LATIN LETTER SMALL CAPITAL B}': u'B',
	u'\N{LATIN SMALL LETTER CLOSED OPEN E}': u'e',
	u'\N{LATIN LETTER SMALL CAPITAL G WITH HOOK}': u'G',
	u'\N{LATIN LETTER SMALL CAPITAL H}': u'H',
	u'\N{LATIN SMALL LETTER J WITH CROSSED-TAIL}': u'j',
	# u'\N{LATIN SMALL LETTER TURNED K}': u'',
	u'\N{LATIN LETTER SMALL CAPITAL L}': u'L',
	u'\N{LATIN SMALL LETTER Q WITH HOOK}': u'q',
	# u'\N{LATIN LETTER GLOTTAL STOP WITH STROKE}': u'',
	# u'\N{LATIN LETTER REVERSED GLOTTAL STOP WITH STROKE}': u'',
	# u'\N{LATIN SMALL LETTER DZ DIGRAPH}': u'',
	# u'\N{LATIN SMALL LETTER DEZH DIGRAPH}': u'',
	# u'\N{LATIN SMALL LETTER DZ DIGRAPH WITH CURL}': u'',
	# u'\N{LATIN SMALL LETTER TS DIGRAPH}': u'',
	# u'\N{LATIN SMALL LETTER TESH DIGRAPH}': u'',
	# u'\N{LATIN SMALL LETTER TC DIGRAPH WITH CURL}': u'',
	# u'\N{LATIN SMALL LETTER FENG DIGRAPH}': u'',
	# u'\N{LATIN SMALL LETTER LS DIGRAPH}': u'',
	# u'\N{LATIN SMALL LETTER LZ DIGRAPH}': u'',
	# u'\N{LATIN LETTER BILABIAL PERCUSSIVE}': u'',
	# u'\N{LATIN LETTER BIDENTAL PERCUSSIVE}': u'',
	# u'\N{LATIN SMALL LETTER TURNED H WITH FISHHOOK}': u'',
	# u'\N{LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL}': u'',
	u'\N{LATIN LETTER SMALL CAPITAL A}': u'A',
	u'\N{LATIN LETTER SMALL CAPITAL AE}': u'AE',
	# u'\N{LATIN SMALL LETTER TURNED AE}': u'',
	u'\N{LATIN LETTER SMALL CAPITAL BARRED B}': u'B',
	u'\N{LATIN LETTER SMALL CAPITAL C}': u'C',
	u'\N{LATIN LETTER SMALL CAPITAL D}': u'D',
	u'\N{LATIN LETTER SMALL CAPITAL ETH}': u'D',
	u'\N{LATIN LETTER SMALL CAPITAL E}': u'E',
	# u'\N{LATIN SMALL LETTER TURNED OPEN E}': u'',
	# u'\N{LATIN SMALL LETTER TURNED I}': u'',
	u'\N{LATIN LETTER SMALL CAPITAL J}': u'J',
	u'\N{LATIN LETTER SMALL CAPITAL K}': u'K',
	u'\N{LATIN LETTER SMALL CAPITAL L WITH STROKE}': u'L',
	u'\N{LATIN LETTER SMALL CAPITAL M}': u'M',
	# u'\N{LATIN LETTER SMALL CAPITAL REVERSED N}': u'',
	u'\N{LATIN LETTER SMALL CAPITAL O}': u'O',
	u'\N{LATIN LETTER SMALL CAPITAL OPEN O}': u'O',
	# u'\N{LATIN SMALL LETTER SIDEWAYS O}': u'',
	# u'\N{LATIN SMALL LETTER SIDEWAYS OPEN O}': u'',
	# u'\N{LATIN SMALL LETTER SIDEWAYS O WITH STROKE}': u'',
	# u'\N{LATIN SMALL LETTER TURNED OE}': u'',
	u'\N{LATIN LETTER SMALL CAPITAL OU}': u'OU',
	# u'\N{LATIN SMALL LETTER TOP HALF O}': u'',
	# u'\N{LATIN SMALL LETTER BOTTOM HALF O}': u'',
	u'\N{LATIN LETTER SMALL CAPITAL P}': u'P',
	# u'\N{LATIN LETTER SMALL CAPITAL REVERSED R}': u'',
	# u'\N{LATIN LETTER SMALL CAPITAL TURNED R}': u'',
	u'\N{LATIN LETTER SMALL CAPITAL T}': u'T',
	u'\N{LATIN LETTER SMALL CAPITAL U}': u'U',
	# u'\N{LATIN SMALL LETTER SIDEWAYS U}': u'',
	# u'\N{LATIN SMALL LETTER SIDEWAYS DIAERESIZED U}': u'',
	# u'\N{LATIN SMALL LETTER SIDEWAYS TURNED M}': u'',
	u'\N{LATIN LETTER SMALL CAPITAL V}': u'V',
	u'\N{LATIN LETTER SMALL CAPITAL W}': u'W',
	u'\N{LATIN LETTER SMALL CAPITAL Z}': u'Z',
	u'\N{LATIN LETTER SMALL CAPITAL EZH}': u'S',
	# u'\N{LATIN LETTER VOICED LARYNGEAL SPIRANT}': u'',
	# u'\N{LATIN LETTER AIN}': u'',
	u'\N{LATIN SMALL LETTER UE}': u'ue',
	u'\N{LATIN SMALL LETTER B WITH MIDDLE TILDE}': u'b',
	u'\N{LATIN SMALL LETTER D WITH MIDDLE TILDE}': u'd',
	u'\N{LATIN SMALL LETTER F WITH MIDDLE TILDE}': u'f',
	u'\N{LATIN SMALL LETTER M WITH MIDDLE TILDE}': u'm',
	u'\N{LATIN SMALL LETTER N WITH MIDDLE TILDE}': u'n',
	u'\N{LATIN SMALL LETTER P WITH MIDDLE TILDE}': u'p',
	u'\N{LATIN SMALL LETTER R WITH MIDDLE TILDE}': u'r',
	u'\N{LATIN SMALL LETTER R WITH FISHHOOK AND MIDDLE TILDE}': u'r',
	u'\N{LATIN SMALL LETTER S WITH MIDDLE TILDE}': u's',
	u'\N{LATIN SMALL LETTER T WITH MIDDLE TILDE}': u't',
	u'\N{LATIN SMALL LETTER Z WITH MIDDLE TILDE}': u'z',
	# u'\N{LATIN SMALL LETTER TURNED G}': u'',
	# u'\N{LATIN SMALL LETTER INSULAR G}': u'',
	u'\N{LATIN SMALL LETTER TH WITH STRIKETHROUGH}': u'th',
	u'\N{LATIN SMALL CAPITAL LETTER I WITH STROKE}': u'I',
	# u'\N{LATIN SMALL LETTER IOTA WITH STROKE}': u'',
	u'\N{LATIN SMALL LETTER P WITH STROKE}': u'p',
	u'\N{LATIN SMALL CAPITAL LETTER U WITH STROKE}': u'U',
	# u'\N{LATIN SMALL LETTER UPSILON WITH STROKE}': u'',
	u'\N{LATIN SMALL LETTER B WITH PALATAL HOOK}': u'b',
	u'\N{LATIN SMALL LETTER D WITH PALATAL HOOK}': u'd',
	u'\N{LATIN SMALL LETTER F WITH PALATAL HOOK}': u'f',
	u'\N{LATIN SMALL LETTER G WITH PALATAL HOOK}': u'g',
	u'\N{LATIN SMALL LETTER K WITH PALATAL HOOK}': u'k',
	u'\N{LATIN SMALL LETTER L WITH PALATAL HOOK}': u'l',
	u'\N{LATIN SMALL LETTER M WITH PALATAL HOOK}': u'm',
	u'\N{LATIN SMALL LETTER N WITH PALATAL HOOK}': u'n',
	u'\N{LATIN SMALL LETTER P WITH PALATAL HOOK}': u'p',
	u'\N{LATIN SMALL LETTER R WITH PALATAL HOOK}': u'r',
	u'\N{LATIN SMALL LETTER S WITH PALATAL HOOK}': u's',
	u'\N{LATIN SMALL LETTER ESH WITH PALATAL HOOK}': u'sh',
	u'\N{LATIN SMALL LETTER V WITH PALATAL HOOK}': u'v',
	u'\N{LATIN SMALL LETTER X WITH PALATAL HOOK}': u'x',
	u'\N{LATIN SMALL LETTER Z WITH PALATAL HOOK}': u'z',
	u'\N{LATIN SMALL LETTER A WITH RETROFLEX HOOK}': u'a',
	# u'\N{LATIN SMALL LETTER ALPHA WITH RETROFLEX HOOK}': u'',
	u'\N{LATIN SMALL LETTER D WITH HOOK AND TAIL}': u'd',
	u'\N{LATIN SMALL LETTER E WITH RETROFLEX HOOK}': u'e',
	u'\N{LATIN SMALL LETTER OPEN E WITH RETROFLEX HOOK}': u'e',
	u'\N{LATIN SMALL LETTER REVERSED OPEN E WITH RETROFLEX HOOK}': u'e',
	# u'\N{LATIN SMALL LETTER SCHWA WITH RETROFLEX HOOK}': u'',
	u'\N{LATIN SMALL LETTER I WITH RETROFLEX HOOK}': u'i',
	u'\N{LATIN SMALL LETTER OPEN O WITH RETROFLEX HOOK}': u'o',
	u'\N{LATIN SMALL LETTER ESH WITH RETROFLEX HOOK}': u'sh',
	u'\N{LATIN SMALL LETTER U WITH RETROFLEX HOOK}': u'u',
	u'\N{LATIN SMALL LETTER EZH WITH RETROFLEX HOOK}': u's',
	# u'\N{LATIN SUBSCRIPT SMALL LETTER SCHWA}': u'',
	# u'\N{LATIN CROSS}': u''
	}

# Additional ones; see "man uni2ascii"
# Punctuation, spacing and symbol replacements taken from "man uni2ascii".
# One-off mappings are spelled out first; families of characters that share
# a single ASCII stand-in are added in bulk below.
UNI2ASCII_CONVERSIONS = {
	u'\N{SOFT HYPHEN}': u'',  # Controversial: see http://www.cs.tut.fi/~jkorpela/shy.html
	u'\N{SMALL AMPERSAND}': u'&',
	u'\N{SMALL PLUS SIGN}': u'+',
	# Currency and legal signs
	u'\N{CENT SIGN}': u'cent',
	u'\N{POUND SIGN}': u'pound',
	u'\N{YEN SIGN}': u'yen',
	u'\N{COPYRIGHT SIGN}': u'(c)',
	u'\N{REGISTERED SIGN}': u'(R)',
	# Fractions
	u'\N{VULGAR FRACTION ONE QUARTER}': u'1/4',
	u'\N{VULGAR FRACTION ONE HALF}': u'1/2',
	u'\N{VULGAR FRACTION THREE QUARTERS}': u'3/4',
	# Ligatures and digraphs
	# u'\N{CAPITAL LETTER ASH}': u'AE',
	u'\N{LATIN SMALL LETTER SHARP S}': u'ss',
	# u'\N{SMALL LETTER ASH}': u'ae',
	u'\N{LATIN CAPITAL LIGATURE IJ}': u'IJ',
	u'\N{LATIN SMALL LIGATURE IJ}': u'ij',
	u'\N{LATIN CAPITAL LIGATURE OE}': u'OE',
	u'\N{LATIN SMALL LIGATURE OE}': u'oe',
	u'\N{LATIN CAPITAL LETTER DZ}': u'DZ',
	u'\N{LATIN CAPITAL LETTER DZ WITH CARON}': u'DZ',
	u'\N{LATIN CAPITAL LETTER D WITH SMALL LETTER Z}': u'Dz',
	u'\N{LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON}': u'Dz',
	u'\N{LATIN SMALL LETTER DZ}': u'dz',
	u'\N{LATIN SMALL LETTER TS DIGRAPH}': u'ts',
	# Ellipses and arrows
	u'\N{HORIZONTAL ELLIPSIS}': u'...',
	u'\N{MIDLINE HORIZONTAL ELLIPSIS}': u'...',
	u'\N{LEFTWARDS ARROW}': u'<-',
	u'\N{RIGHTWARDS ARROW}': u'->',
	u'\N{LEFTWARDS DOUBLE ARROW}': u'<=',
	u'\N{RIGHTWARDS DOUBLE ARROW}': u'=>',
	}

# Every kind of space becomes a plain ASCII space.
UNI2ASCII_CONVERSIONS.update(dict.fromkeys([
	u'\N{NO-BREAK SPACE}',
	u'\N{ETHIOPIC WORDSPACE}',
	u'\N{OGHAM SPACE MARK}',
	u'\N{EN QUAD}',
	u'\N{EM QUAD}',
	u'\N{EN SPACE}',
	u'\N{EM SPACE}',
	u'\N{THREE-PER-EM SPACE}',
	u'\N{FOUR-PER-EM SPACE}',
	u'\N{SIX-PER-EM SPACE}',
	u'\N{FIGURE SPACE}',
	u'\N{PUNCTUATION SPACE}',
	u'\N{THIN SPACE}',
	u'\N{HAIR SPACE}',
	u'\N{ZERO WIDTH SPACE}',
	u'\N{ZERO WIDTH NO-BREAK SPACE}',
	u'\N{IDEOGRAPHIC SPACE}',
	], u' '))

# Hyphens, dashes, minus and horizontal rules become a hyphen-minus.
UNI2ASCII_CONVERSIONS.update(dict.fromkeys([
	u'\N{HYPHEN}',
	u'\N{NON-BREAKING HYPHEN}',
	u'\N{FIGURE DASH}',
	u'\N{EN DASH}',
	u'\N{EM DASH}',
	u'\N{MINUS SIGN}',
	u'\N{BOX DRAWINGS LIGHT HORIZONTAL}',
	u'\N{BOX DRAWINGS HEAVY HORIZONTAL}',
	], u'-'))

# Double quotation marks of every flavour become the ASCII double quote.
UNI2ASCII_CONVERSIONS.update(dict.fromkeys([
	u'\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}',
	u'\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}',
	u'\N{LEFT DOUBLE QUOTATION MARK}',
	u'\N{RIGHT DOUBLE QUOTATION MARK}',
	u'\N{DOUBLE LOW-9 QUOTATION MARK}',
	u'\N{DOUBLE HIGH-REVERSED-9 QUOTATION MARK}',
	u'\N{HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT}',
	u'\N{HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT}',
	], u'"'))

# Opening single quotes become a backtick ...
UNI2ASCII_CONVERSIONS.update(dict.fromkeys([
	u'\N{LEFT SINGLE QUOTATION MARK}',
	u'\N{SINGLE LOW-9 QUOTATION MARK}',
	u'\N{SINGLE HIGH-REVERSED-9 QUOTATION MARK}',
	u'\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}',
	], u'`'))

# ... and closing single quotes become an apostrophe.
UNI2ASCII_CONVERSIONS.update(dict.fromkeys([
	u'\N{RIGHT SINGLE QUOTATION MARK}',
	u'\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}',
	], u"'"))

# Asterisk look-alikes.
UNI2ASCII_CONVERSIONS.update(dict.fromkeys([
	u'\N{LOW ASTERISK}',
	u'\N{ASTERISK OPERATOR}',
	u'\N{HEAVY ASTERISK}',
	u'\N{SMALL ASTERISK}',
	], u'*'))

# Vertical rules.
UNI2ASCII_CONVERSIONS.update(dict.fromkeys([
	u'\N{BOX DRAWINGS LIGHT VERTICAL}',
	u'\N{BOX DRAWINGS HEAVY VERTICAL}',
	], u'|'))

# More from "man uni2ascii", in a different category.
# Latin-1 symbols and punctuation, also from "man uni2ascii".  Written as a
# list of (character, replacement) pairs, grouped by the kind of replacement.
EXTRA_CHARACTERS = dict([
	# Replaced by a single ASCII punctuation character
	(u'\N{ACUTE ACCENT}', u"'"),
	(u'\N{BROKEN BAR}', u'|'),
	(u'\N{DIVISION SIGN}', u'/'),
	(u'\N{INVERTED EXCLAMATION MARK}', u'!'),
	(u'\N{INVERTED QUESTION MARK}', u'?'),
	(u'\N{MACRON}', u'_'),
	(u'\N{MIDDLE DOT}', u'*'),
	(u'\N{MULTIPLICATION SIGN}', u'*'),
	(u'\N{SOFT HYPHEN}', u''),
	# Spelled out as a word (some padded with spaces)
	(u'\N{CENT SIGN}', u' cents '),
	(u'\N{CURRENCY SIGN}', u' currency '),
	(u'\N{DEGREE SIGN}', u' degrees '),
	(u'\N{MICRO SIGN}', u'micro'),
	(u'\N{NOT SIGN}', u'not'),
	(u'\N{PILCROW SIGN}', u'paragraph'),
	(u'\N{POUND SIGN}', u'pound'),
	(u'\N{SECTION SIGN}', u'section'),
	(u'\N{YEN SIGN}', u'yen'),
	# Short ASCII renderings
	(u'\N{COPYRIGHT SIGN}', u'(C)'),
	(u'\N{REGISTERED SIGN}', u'(R)'),
	(u'\N{PLUS-MINUS SIGN}', u'+/-'),
	(u'\N{SUPERSCRIPT ONE}', u'^1'),
	(u'\N{SUPERSCRIPT TWO}', u'^2'),
	(u'\N{SUPERSCRIPT THREE}', u'^3'),
	(u'\N{VULGAR FRACTION ONE QUARTER}', u'1/4'),
	(u'\N{VULGAR FRACTION ONE HALF}', u'1/2'),
	(u'\N{VULGAR FRACTION THREE QUARTERS}', u'3/4'),
	# Deliberately left out:
	# u'\N{CEDILLA}': u'{cedilla}',
	# u'\N{DIAERESIS}': u'{umlaut}',
	# u'\N{FEMININE ORDINAL INDICATOR}': u'{^a}',
	# u'\N{MASCULINE ORDINAL INDICATOR}': u'{^o}',
	# wrong? u'\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}': u'<<',
	# wrong? u'\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}': u'>>',
])
# A couple of extra one-off replacements.
FG_HACKS = dict([
	(u'\x82', u''),  # U+0082, the "break permitted here" control character
	(u'\N{BULLET}', u'*'),  # U+2022
])


def build_dictionary():
	"""Return the translation dictionary.

	The result maps integer code points to unicode replacement strings and is
	meant to be passed to ``unicode.translate``.

	It is built in two passes:

	1. Every Basic Multilingual Plane character whose Unicode name starts
	   with "LATIN " is NFKD-normalized and stripped of anything that is not
	   ASCII; if something is left over, that becomes its replacement.
	2. The hand-made tables are layered on top, in the order
	   EXTRA_LATIN_NAMES, EXTRA_CHARACTERS, UNI2ASCII_CONVERSIONS, FG_HACKS.
	   Later tables override earlier ones (and the automatic pass), so the
	   order matters.
	"""
	d = {}
	# First do what can be done automatically
	for i in range(0x10000):
		u = unichr(i)
		# Unnamed code points (controls, surrogates, unassigned) get the
		# default '' here instead of raising ValueError.
		if not unicodedata.name(u, '').startswith('LATIN '):
			continue
		k = unicodedata.normalize('NFKD', u).encode('ASCII', 'ignore')
		if k:
			d[i] = unicode(k)  # i == ord(u)
	# Next, add some by-hand ones (overlap possible, so order matters)
	for m in [EXTRA_LATIN_NAMES, EXTRA_CHARACTERS, UNI2ASCII_CONVERSIONS, FG_HACKS]:
		for key in m:
			try:
				code = ord(key)
			except TypeError:
				# ord() only accepts a single character; a key that is not
				# one cannot be used by translate(), so it is skipped.
				continue
			d[code] = unicode(m[key])
	return d



# Code point -> replacement table, built once when the module is loaded.
udict = build_dictionary()


def convert(s):
	"""Return *s* translated through ``udict`` (via ``s.translate``)."""
	return s.translate(udict)

def coroutine(func):
	"""Decorator that primes a generator-based coroutine.

	Calling the decorated function creates the generator and immediately
	advances it to its first ``yield``, so the caller can use ``send()``
	right away.  The value produced by that first ``yield`` is discarded.

	The wrapper keeps the name and docstring of *func*.
	"""
	from functools import wraps  # local import keeps this block self-contained

	@wraps(func)
	def start(*argz, **kwz):
		cr = func(*argz, **kwz)
		# next() works on both Python 2.6+ and 3.x; the generator method
		# cr.next() exists only on Python 2.
		next(cr)
		return cr
	return start

@coroutine
def co_filter(drain, in_enc='utf-8', out_enc='ascii'):
	"""Coroutine that converts each chunk sent to it and hands it to *drain*.

	drain   -- callable taking one encoded byte string (e.g. ``file.write``).
	in_enc  -- encoding of the byte chunks that are sent in.
	out_enc -- encoding of the byte strings passed to *drain*.

	Each ``send(chunk)`` returns whatever *drain* returned for that chunk.

	Chunks are decoded incrementally, so a multi-byte character split across
	two chunks is decoded correctly once its remaining bytes arrive.  Invalid
	input raises UnicodeDecodeError.  Characters that survive ``convert()``
	but cannot be represented in *out_enc* are replaced (``'?'`` for ascii)
	rather than aborting the stream; pass ``out_enc='utf-8'`` to keep them.
	Chunks that are already unicode are converted without decoding.
	"""
	import codecs  # local import keeps this block self-contained
	decode = codecs.getincrementaldecoder(in_enc)().decode
	bs = None
	while True:
		chunk = (yield bs)
		if isinstance(chunk, unicode):
			text = chunk
		else:
			text = decode(chunk)
		bs = drain(convert(text).encode(out_enc, 'replace'))

def filter(sin, sout, bs=8192, in_enc='utf-8', out_enc='ascii'):
	"""Pump *sin* into *sout* through co_filter, *bs* units per read.

	sin     -- object with a ``read(size)`` method.
	sout    -- object with a ``write(data)`` method.
	bs      -- size passed to every ``sin.read()`` call.
	in_enc, out_enc -- handed on to ``co_filter`` unchanged.

	Reading stops at the first empty (falsy) result from ``sin.read()``.
	"""
	sink = co_filter(sout.write, in_enc, out_enc)
	chunk = sin.read(bs)
	while chunk:
		sink.send(chunk)
		chunk = sin.read(bs)


if __name__ == '__main__':
	# Command-line entry point: filter stdin to stdout.  No positional
	# arguments are accepted; encodings and chunk size come from options.
	from optparse import OptionParser
	parser = OptionParser(usage='%prog [options]',
		description='utf8 stdin -> ascii stdout')
	parser.add_option('-s', '--src-enc',
		action='store', type='str', dest='src_enc', metavar='ENC', default='utf-8',
		help='source encoding (utf-8)')
	parser.add_option('-d', '--dst-enc',
		action='store', type='str', dest='dst_enc', metavar='ENC', default='ascii',
		help='destination encoding (ascii)')
	parser.add_option('-c', '--chunk',
		action='store', type='int', dest='bs', metavar='BYTES', default=8192,
		help='read/write in chunks of a given size (8192)')
	optz, argz = parser.parse_args()
	# parser.error() prints the message and exits with a usage error.
	if argz: parser.error('Only stdin -> stdout conversion supported')

	filter(sys.stdin, sys.stdout, bs=optz.bs, in_enc=optz.src_enc, out_enc=optz.dst_enc)
