Automatically turn Romaji into Hiragana in an Anki deck
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

70 lines
2.7 KiB

# -*- coding:utf-8 -*-
# Modified by Macoy Madson. Original copied from
# Inclusive code point bounds of the Unicode Hiragana block (U+3040..U+309F).
hiraganaRange = {"from": 0x3040, "to": 0x309F}
# Inclusive code point bounds of the Unicode Katakana block (U+30A0..U+30FF).
katakanaRange = {"from": 0x30A0, "to": 0x30FF}
# Inclusive code point ranges for CJK characters other than kana. Hiragana and
# Katakana are deliberately left out so that this table can be used to pick
# out Kanji only.
cjkNonKanaRanges = [
    {"from": ord(u"\u3300"), "to": ord(u"\u33ff")},  # CJK Compatibility
    {"from": ord(u"\ufe30"), "to": ord(u"\ufe4f")},  # CJK Compatibility Forms
    {"from": ord(u"\uf900"), "to": ord(u"\ufaff")},  # CJK Compatibility Ideographs
    {"from": ord(u"\U0002F800"), "to": ord(u"\U0002fa1f")},  # CJK Compatibility Ideographs Supplement
    {"from": ord(u"\u2e80"), "to": ord(u"\u2eff")},  # CJK Radicals Supplement
    {"from": ord(u"\u4e00"), "to": ord(u"\u9fff")},  # CJK Unified Ideographs
    {"from": ord(u"\u3400"), "to": ord(u"\u4dbf")},  # CJK Unified Ideographs Extension A
    {"from": ord(u"\U00020000"), "to": ord(u"\U0002a6df")},  # CJK Unified Ideographs Extension B
    {"from": ord(u"\U0002a700"), "to": ord(u"\U0002b73f")},  # CJK Unified Ideographs Extension C
    {"from": ord(u"\U0002b740"), "to": ord(u"\U0002b81f")},  # CJK Unified Ideographs Extension D
    {"from": ord(u"\U0002b820"), "to": ord(u"\U0002ceaf")},  # CJK Unified Ideographs Extension E (Unicode 8.0)
]
# Every CJK range: a new list holding the non-kana ranges first, followed by
# the Hiragana range and then the Katakana range.
cjkRanges = cjkNonKanaRanges + [hiraganaRange, katakanaRange]
# "The Alphabet"
# Inclusive code point ranges of the basic (unaccented) Latin letters.
latinRanges = [
    # Starts at U+0041 ('A'); U+0042 is 'B' and would leave 'A' unmatched.
    {"from": ord(u"\u0041"), "to": ord(u"\u005a")},  # Uppercase A-Z
    {"from": ord(u"\u0061"), "to": ord(u"\u007a")},  # Lowercase a-z
]
def is_cjk(char):
    """Return True if the single character *char* lies in any range of cjkRanges."""
    code_point = ord(char)
    for bounds in cjkRanges:
        if bounds["from"] <= code_point <= bounds["to"]:
            return True
    return False
def is_katakana(char):
    """Return True if the single character *char* lies within katakanaRange."""
    lowest = katakanaRange["from"]
    highest = katakanaRange["to"]
    return lowest <= ord(char) <= highest
def is_hiragana(char):
    """Return True if the single character *char* lies within hiraganaRange."""
    code_point = ord(char)
    if code_point < hiraganaRange["from"]:
        return False
    return code_point <= hiraganaRange["to"]
# Matches any character covered by cjkNonKanaRanges, i.e. the CJK ranges with
# Hiragana and Katakana left out. The check is by code point only, so it does
# not distinguish which language a character is being used for.
def is_kanji(char):
    """Return True if the single character *char* lies in any range of cjkNonKanaRanges."""
    code_point = ord(char)
    matches = (
        bounds["from"] <= code_point <= bounds["to"]
        for bounds in cjkNonKanaRanges
    )
    return any(matches)
def is_latin(char):
    """Return True if the single character *char* lies in any range of latinRanges."""
    code_point = ord(char)
    hits = [
        bounds
        for bounds in latinRanges
        if bounds["from"] <= code_point <= bounds["to"]
    ]
    return len(hits) > 0
def cjk_substrings(string):
    """Yield each maximal run of consecutive CJK characters found in *string*.

    Runs are yielded in order of appearance; characters for which is_cjk()
    returns False act as separators and are never yielded. An empty string
    yields nothing.
    """
    length = len(string)
    i = 0
    while i < length:
        if is_cjk(string[i]):
            start = i
            # Bounds-check before indexing: without it, a run that reaches the
            # end of the string walks past the last index and raises IndexError.
            while i < length and is_cjk(string[i]):
                i += 1
            yield string[start:i]
        else:
            i += 1
if __name__ == '__main__':
    # Demo 1: wrap every CJK run of the sample text in parentheses.
    # NOTE(review): the wrapped text is built but never printed or used.
    sample = "sdf344asfasf天地方益3権sdfsdf"
    wrapped = sample
    for run in cjk_substrings(sample):
        wrapped = wrapped.replace(run, "(" + run + ")")

    # Demo 2: report the katakana check for each character of a katakana word.
    for ch in 'ボーリング':
        print("{} is katakana: {}".format(ch, is_katakana(ch)))

    # Demo 3: report the Latin check for each character of a mixed string.
    for ch in 'this is ボーリング':
        print("{} is latin: {}".format(ch, is_latin(ch)))