import re

# Python 2 spells the code-point-to-character builtin ``unichr``; Python 3
# merged it into ``chr``.  Bind a single name so the module runs on both.
try:
    unichr
except NameError:
    unichr = chr


def hangulize(text, toEncoding):
    """Convert romanized Korean text to hangul and return it encoded.

    The input is scanned one character at a time by a small state machine
    (states: START, INITIAL, VOWEL, FINAL, SINGLE):

    * Characters listed in ``letters`` are collected into a syllable made of
      an initial consonant, a vowel and an optional final consonant; each
      finished syllable is rendered by ``hangulize_syllable``.
    * A ``.`` that is directly followed by a letter only marks a syllable
      boundary and is not copied to the output.
    * ``<name>`` produces one stand-alone jamo, where ``name`` is a key of
      ``singlesDict``.  If the text after ``<`` turns out not to be a jamo
      name, it is copied to the output literally.
    * Every other character ends the current syllable and is copied through.

    Args:
        text: Romanized text.  A byte string is decoded with ``toEncoding``
            first, so the byte-string output of ``romanize`` can be passed in.
        toEncoding: Name of the codec used to encode the result.

    Returns:
        The converted text, encoded with ``toEncoding``.
    """
    if isinstance(text, bytes):
        text = text.decode(toEncoding)

    # Reversed copy used as a stack: pop() yields the next character and
    # append() pushes one back so it can be re-read from the START state.
    pending = list(text)
    pending.reverse()

    state = 'START'
    syllable = {}
    pieces = []

    while pending:
        char = pending.pop()

        if state == 'SINGLE':
            candidate = syllable.get('single', '') + char
            if candidate in _SINGLE_PREFIXES:
                # Still a prefix of some jamo name: keep collecting.
                syllable['single'] = candidate
            elif char == '>':
                pieces.append(hangulize_syllable(syllable))
                syllable = {}
                state = 'START'
            else:
                # Not a jamo name after all: keep what was typed so nothing
                # is lost, then re-read this character as ordinary input.
                pieces.append('<' + syllable.get('single', ''))
                syllable = {}
                pending.append(char)
                state = 'START'
            continue

        if char not in letters:
            # Any non-letter closes the syllable that is being built.
            if syllable:
                pieces.append(hangulize_syllable(syllable))
                syllable = {}
            if char == '<':
                state = 'SINGLE'
            else:
                is_separator = (
                    char == '.' and pending and pending[-1] in letters
                )
                if not is_separator:
                    pieces.append(char)
                state = 'START'
            continue

        if state == 'START':
            if char in consonantLetters:
                syllable = {'initial': char}
                state = 'INITIAL'
            elif char in vowelLetters:
                syllable = {'vowel': char}
                state = 'VOWEL'
            else:
                pieces.append(char)

        elif state == 'INITIAL':
            if char in vowelLetters:
                # todo: but what if this char is 'w' and the next one is a
                # consonant?
                syllable['vowel'] = char
                state = 'VOWEL'
            else:
                cluster = syllable['initial'] + char
                if cluster in nonpachim or cluster in consonantLetters:
                    syllable['initial'] = cluster
                else:
                    # The consonant cannot extend the initial: emit what we
                    # have and re-read the consonant as a new syllable start
                    # instead of dropping it.
                    pieces.append(hangulize_syllable(syllable))
                    syllable = {}
                    pending.append(char)
                    state = 'START'

        elif state == 'VOWEL':
            if char in vowelLetters:
                combined = syllable['vowel'] + char
                if combined in moum:
                    syllable['vowel'] = combined
                else:
                    pieces.append(hangulize_syllable(syllable))
                    syllable = {'initial': '', 'vowel': char}
            else:
                syllable['final'] = char
                state = 'FINAL'

        elif state == 'FINAL':
            combined = syllable['final'] + char
            if combined in pachim:
                syllable['final'] = combined
            else:
                pieces.append(hangulize_syllable(syllable))
                if char in vowelLetters:
                    syllable = {'initial': '', 'vowel': char}
                    state = 'VOWEL'
                else:
                    syllable = {'initial': char}
                    state = 'INITIAL'

    # Flush whatever is still pending exactly once at end of input.
    if state == 'SINGLE':
        pieces.append('<' + syllable.get('single', ''))
    elif syllable:
        pieces.append(hangulize_syllable(syllable))

    return ''.join(pieces).encode(toEncoding)


def hangulize_syllable(syl):
    """Render one parsed syllable as a single hangul character.

    Args:
        syl: Dict with any of the keys ``'single'``, ``'initial'``,
            ``'vowel'`` and ``'final'`` holding romanized spellings.

    Returns:
        * A stand-alone jamo (code point ``12593 + singlesDict[name]``) when
          ``'single'`` is given, or when only an initial is present.
        * A composed syllable (code point
          ``44032 + initial * 588 + vowel * 28 + final``) when the initial
          and vowel are known spellings.  A final that is not a key of
          ``pachimDict`` is ignored.
        * The romanized text itself when a spelling cannot be looked up.
    """
    single = syl.get('single')
    initial = syl.get('initial', '')
    vowel = syl.get('vowel')
    final = syl.get('final')

    if single is None:
        if vowel is None and final is None:
            single = initial
        elif initial is None and final is None:
            single = vowel

    if single is not None:
        if single in singlesDict:
            return unichr(singlesDict[single] + 12593)
        return single

    if initial not in nonpachimDict or vowel not in moumDict:
        # Unknown spelling: hand the text back unchanged.  Missing parts are
        # None and must be skipped rather than concatenated.
        return ''.join(part for part in (initial, vowel, final) if part)

    code = 44032 + nonpachimDict[initial] * 588 + moumDict[vowel] * 28
    if final in pachimDict:
        code += pachimDict[final]
    return unichr(code)


def romanize(raw, fromEnc='utf8', toEnc='utf8'):
    """Romanize Korean text.

    Hangul syllables (code points 44032-55203) are split arithmetically into
    initial, vowel and final and spelled with the ``nonpachim``, ``moum`` and
    ``pachim`` tables; a ``.`` is inserted before a syllable where the
    spelling would otherwise run into the preceding output.  Stand-alone
    jamo (code points 12593-12686) are written as ``<name>`` using
    ``singles``.  Every other character is upper-cased and copied through.

    Args:
        raw: Text to romanize.  A byte string is decoded with ``fromEnc``
            unless ``fromEnc`` is None; already-decoded text is used as is.
        fromEnc: Codec of a byte-string ``raw`` (default ``'utf8'``).
        toEnc: Codec used to encode the result (default ``'utf8'``).

    Returns:
        The romanized text, encoded with ``toEnc``.
    """
    if fromEnc is not None and isinstance(raw, bytes):
        raw = raw.decode(fromEnc)

    consonants = pachim + nonpachim
    newString = ''

    for char in raw:
        index = gti(char)

        if 12593 <= index < 12687:
            # Stand-alone (non-syllabic) hangul letter.
            name = singles[index - 12593]
            # NOTE(review): "> 1" skips the separator after a one-character
            # prefix; kept as found -- confirm whether "> 0" was intended.
            if name and len(newString) > 1 and newString[-1] != ' ':
                newString += '.'
            newString += '<' + name + '>'

        elif 44032 <= index < 55204:
            # Precomposed syllable: 19 initials x 21 vowels x 28 finals.
            offset = index - 44032
            lead = nonpachim[offset // 588]
            vowel = moum[(offset % 588) // 28]
            tail = pachim[offset % 28]

            if newString:
                last = newString[-1]
                last_two = newString[-2:]
                if lead == 'g' and last == 'n':
                    newString += '.'
                elif lead == '' and last_two == 'ng':
                    newString += '.'
                elif (last in moum or last_two in moum) and lead in consonants:
                    newString += '.'
                elif lead == '' and last in consonants:
                    newString += '.'
                elif lead == 'h' and last in ['t', 'k', 'p', 'c', 'n', 'l']:
                    newString += '.'
                elif last + lead in consonants or (
                        len(lead) > 1 and last + lead[0] in consonants):
                    newString += '.'

            newString += lead + vowel + tail

        else:
            newString += char.upper()

    return newString.encode(toEnc)


def gti(char):
    """Return the code point of ``char`` (a single character)."""
    return ord(char)


# Character lists

# Names of the stand-alone jamo, indexed by (code point - 12593).
singles = [
    'k', 'kk', 'ks', 'n', 'nc', 'nh', 't', 'tt', 'l', 'lk', 'lm', 'lp',
    'ls', 'lth', 'lph', 'lh', 'm', 'p', 'pp', 'ps', 's', 'ss', 'ng', 'c',
    'cc', 'ch', 'kh', 'th', 'ph', 'h', 'a', 'ay', 'ya', 'yay', 'e', 'ey',
    'ye', 'yey', 'o', 'wa', 'way', 'oy', 'yo', 'wu', 'we', 'wey', 'wi',
    'yu', 'u', 'uy', 'i', 'NONE', 'NN', 'NT', 'NS', 'NZ', 'LKS', 'LT',
    'LPS', 'LZ', 'LH', 'MP', 'MS', 'MZ', 'MNG', 'PK', 'PT', 'PSK', 'PST',
    'PC', 'PTH', 'PNG', 'PPNG', 'SK', 'SL', 'ST', 'SP', 'SC', 'Z', 'NGNG',
    'NG', 'NGS', 'NGZ', 'PHNG', 'HH', 'H', 'YOYA', 'YOYAY', 'YOI', 'YUE',
    'YUEY', 'YUI', 'A', 'E',
]

# Vowel spellings, indexed by the vowel number inside a composed syllable.
moum = [
    'a', 'ay', 'ya', 'yay', 'e', 'ey', 'ye', 'yey', 'o', 'wa', 'way', 'oy',
    'yo', 'wu', 'we', 'wey', 'wi', 'yu', 'u', 'uy', 'i',
]

# Final-consonant spellings, indexed by the final number ('' = no final).
pachim = [
    '', 'k', 'kk', 'ks', 'n', 'nc', 'nh', 't', 'l', 'lk', 'lm', 'lp', 'ls',
    'lth', 'lph', 'lh', 'm', 'p', 'ps', 's', 'ss', 'ng', 'c', 'ch', 'kh',
    'th', 'ph', 'h',
]

# Initial-consonant spellings, indexed by the initial number.
nonpachim = [
    'k', 'kk', 'n', 't', 'tt', 'l', 'm', 'p', 'pp', 's', 'ss', '', 'c',
    'cc', 'ch', 'kh', 'th', 'ph', 'h',
]

letters = [
    'k', 'n', 't', 'l', 'm', 'p', 's', 'c', 'k', 'h', 'g', 'a', 'y', 'e',
    'o', 'w', 'u', 'i',
]

vowelLetters = ['a', 'e', 'i', 'o', 'u', 'w', 'y']

consonantLetters = [
    'c', 'g', 'h', 'k', 'l', 'm', 'n', 'p', 's', 't', 'lt', 'lp',
]

singleLetters = [
    'k', 'kk', 'ks', 'ns', 'n', 'nc', 'nh', 't', 'tt', 'l', 'lk', 'lm',
    'lp', 'ls', 'lth', 'lt', 'lph', 'lt', 'lh', 'm', 'p', 'pp', 'ps', 's',
    'ss', 'ng', 'c', 'cc', 'ch', 'kh', 'th', 'ph', 'h', 'a', 'ay', 'ya',
    'yay', 'e', 'ey', 'ye', 'yey', 'o', 'wa', 'way', 'oy', 'yo', 'wu', 'we',
    'wey', 'wi', 'yu', 'u', 'uy', 'i', 'NONE', 'N', 'NO', 'NON', 'NN', 'NT',
    'NS', 'NZ', 'LKS', 'LK', 'L', 'LT', 'LPS', 'LP', 'LZ', 'LH', 'MP', 'M',
    'MS', 'MZ', 'MNG', 'MN', 'PK', 'P', 'PT', 'PSK', 'PS', 'PST', 'PC',
    'PTH', 'PNG', 'PPNG', 'PP', 'PPN', 'SK', 'S', 'SL', 'ST', 'SP', 'SC',
    'Z', 'NGNG', 'NGN', 'NG', 'NGS', 'NGZ', 'PHNG', 'PH', 'PHN', 'HH', 'H',
    'YOYA', 'Y', 'YO', 'O', 'YOY', 'YOYAY', 'YOI', 'YUE', 'YU', 'YUEY',
    'YUI', 'A', 'E',
]

# Character dictionaries (spelling -> index; inverse of the lists above)

singlesDict = {
    'yey': 37, 'YUE': 89, 'YOYAY': 87, 'tt': 7, 'lm': 10, 'lk': 9, 'lh': 15,
    'ls': 12, 'lp': 11, 'wey': 45, 'YUEY': 90, 'yo': 42, 'ya': 32, 'H': 85,
    'LPS': 58, 'yu': 47, 'YUI': 91, 'h': 29, 'l': 8, 'p': 17, 't': 6,
    'HH': 84, 'ey': 35, 'NGS': 81, 'n': 3, 'LKS': 56, 'NGZ': 82, 'NONE': 51,
    'PT': 66, 'PTH': 70, 'PC': 69, 'PK': 65, 'MNG': 64, 'we': 44, 'wa': 39,
    'PPNG': 72, 'wi': 46, 'wu': 43, 'PSK': 67, 'c': 23, 'k': 0, 'o': 38,
    'PHNG': 83, 's': 20, 'MP': 61, 'MS': 62, 'YOYA': 86, 'lth': 13,
    'PST': 68, 'MZ': 63, 'ch': 25, 'cc': 24, 'ps': 19, 'pp': 18, 'yay': 33,
    'NN': 52, 'NG': 80, 'NZ': 55, 'way': 40, 'ph': 28, 'NS': 54, 'NT': 53,
    'th': 27, 'Z': 78, 'uy': 49, 'SP': 76, 'ST': 75, 'SK': 73, 'ss': 21,
    'SL': 74, 'SC': 77, 'ay': 31, 'NGNG': 79, 'nh': 5, 'nc': 4, 'LH': 60,
    'ng': 22, 'LT': 57, 'ks': 2, 'LZ': 59, 'A': 92, 'E': 93, 'oy': 41,
    'YOI': 88, 'ye': 36, 'kk': 1, 'a': 30, 'e': 34, 'i': 50, 'kh': 26,
    'm': 16, 'u': 48, 'lph': 14, 'PNG': 71,
}

moumDict = {
    'a': 0, 'we': 14, 'uy': 19, 'yay': 3, 'oy': 11, 'wa': 9, 'ya': 2,
    'yo': 12, 'ye': 6, 'o': 8, 'yey': 7, 'i': 20, 'wu': 13, 'ey': 5,
    'wi': 16, 'way': 10, 'ay': 1, 'e': 4, 'wey': 15, 'yu': 17, 'u': 18,
}

pachimDict = {
    '': 0, 'nc': 5, 'ch': 23, 'nh': 6, 'ps': 18, 'p': 17, 'lm': 10, 'lk': 9,
    'lh': 15, 'ng': 21, 'ls': 12, 'lp': 11, 'ph': 26, 'th': 25, 'c': 22,
    'ss': 20, 'h': 27, 'k': 1, 'kh': 24, 'm': 16, 'l': 8, 'n': 4, 'ks': 3,
    'kk': 2, 's': 19, 't': 7, 'lph': 14, 'lth': 13,
}

nonpachimDict = {
    '': 11, 'pp': 8, 'ch': 14, 'ss': 10, 'kk': 1, 'c': 12, 'k': 0, 'kh': 15,
    'm': 6, 'l': 5, 'n': 2, 'p': 7, 's': 9, 't': 3, 'th': 16, 'h': 18,
    'ph': 17, 'tt': 4, 'cc': 13,
}

# Everything that may legally appear between '<' and '>' while a jamo name is
# still being typed: the hand-maintained ``singleLetters`` entries plus every
# prefix of every ``singlesDict`` key (the hand-written list lacks some, e.g.
# 'y', 'w' and 'PN', which made names such as 'ya' or 'PNG' unreadable).
_SINGLE_PREFIXES = frozenset(singleLetters) | frozenset(
    name[:end] for name in singlesDict for end in range(1, len(name) + 1)
)