
3 changed files with 233 additions and 0 deletions
@ -0,0 +1,148 @@ |
|||
# -*- coding:utf-8 -*- |
|||
import argparse |
|||
import json |
|||
import romkan |
|||
import sys |
|||
import urllib.request |
|||
import UnicodeHelpers |
|||
|
|||
|
|||
# Command-line interface definition. The description below is shown by --help
# (and by the script itself when it is started without any arguments).
argParser = argparse.ArgumentParser(
    description="""Automatically turn Romaji into Hiragana in an Anki deck.

This is for if you get bothered by Romaji's inaccuracy, or in my opinion, its somewhat misleading format.

This tool takes the name of a deck and the name of the field with Romaji, and converts it into Hiragana.

It preserves most anything which isn't romaji, so if you have e.g. "setsumei(suru)" it will convert it to "せつめい(する)".
"-" and "’" will be removed, however. This is because the romkan converter can get confused by these.
It doesn't hurt to run the script again on a deck which has already been wholly or partially converted.
Katakana will be output if a "written field" is provided to hint the script that it should use katakana.""")

# Required positionals: which deck to process and which field holds the romaji
argParser.add_argument(
    'DeckName',
    type=str,
    help='The name of the deck to modify. Add "quotes" if the name has spaces, e.g. "My Deck"')
argParser.add_argument(
    'RomajiFieldName',
    type=str,
    help='The name of the field which has Romaji you want to convert, e.g. "Front"')

# Optional: a second field used only as a hint for choosing katakana over hiragana
argParser.add_argument(
    '--written-field-name',
    dest='WrittenFieldName',
    type=str,
    help='The name of the field which has what would actually be written in '
         'realistic text (for example, kanji). This can be provided in order to '
         'hint the converter that it should output katakana instead of hiragana, '
         'if the word would normally be written as katakana')

# Optional: preview mode; stored as args.debugSoftEdit (False unless the flag is given)
argParser.add_argument(
    '--soft-edit',
    dest='debugSoftEdit',
    action='store_true',
    help='Do not make changes to the deck. Output all changes that would be made. '
         'I recommend running the script with this option first, then look over the results '
         'and confirm whether they are satisfactory.')
|||
|
|||
def formatAnkiConnectRequest(action, **params):
    """Build the dictionary sent to AnkiConnect for a single request.

    Args:
        action: Name of the AnkiConnect action to invoke.
        **params: Keyword arguments forwarded as the action's parameters.

    Returns:
        A dict with the keys 'action', 'params' and 'version' (always 6).
    """
    request = dict(action=action, params=params, version=6)
    return request
|||
|
|||
def invokeAnkiConnect(action, **params):
    """Send one request to the local AnkiConnect server and return its result.

    The request is POSTed as UTF-8 encoded JSON to http://localhost:8765 and
    the JSON reply is validated before the result is handed back.

    Args:
        action: Name of the AnkiConnect action to invoke.
        **params: Keyword arguments forwarded as the action's parameters.

    Returns:
        The value of the reply's 'result' field.

    Raises:
        Exception: If the reply does not consist of exactly the 'error' and
            'result' fields, or if its 'error' field is not None (the error
            value is used as the exception message).
    """
    requestJson = json.dumps(formatAnkiConnectRequest(action, **params)).encode('utf-8')
    request = urllib.request.Request('http://localhost:8765', requestJson)

    # Use the response as a context manager so the connection is closed
    # deterministically, even if the body turns out not to be valid JSON
    with urllib.request.urlopen(request) as httpResponse:
        response = json.load(httpResponse)

    if len(response) != 2:
        raise Exception('response has an unexpected number of fields')
    if 'error' not in response:
        raise Exception('response is missing required error field')
    if 'result' not in response:
        raise Exception('response is missing required result field')
    if response['error'] is not None:
        raise Exception(response['error'])
    return response['result']
|||
|
|||
def getNotes(deckName):
    """Look up the notes belonging to every card in a deck via AnkiConnect.

    Prints how many cards were found (or that none were).

    Args:
        deckName: Name of the deck to search.

    Returns:
        The result of AnkiConnect's 'cardsToNotes' action for the deck's
        cards, or an empty list when the deck has no cards.
    """
    # The whole "deck:<name>" term is quoted so that names with spaces work
    deckQuery = '"deck:{}"'.format(deckName)
    cardIds = invokeAnkiConnect('findCards', query=deckQuery)

    if cardIds:
        print("{} cards in deck '{}'".format(len(cardIds), deckName))
        return invokeAnkiConnect('cardsToNotes', cards=cardIds)

    print("No cards in deck '{}'".format(deckName))
    return []
|||
|
|||
def sanitizeTextForConversion(fieldValue):
    """Return fieldValue with every '-' and '’' replaced by a space.

    These characters confuse romkan, and aren't usually a part of the
    language anyhow.
    """
    cleaned = fieldValue
    for confusingChar in ('-', '’'):
        cleaned = cleaned.replace(confusingChar, ' ')
    return cleaned
|||
|
|||
def convertNotes(deckName, fieldToConvert, conversionHintField=None,
                 shouldEdit=True):
    """Convert the romaji in one field of every note in a deck to kana and print each change.

    For each note the field's text is sanitized and converted with romkan.
    Hiragana is the default output. When a hint field is given and non-empty:
      * a hint consisting only of katakana is used verbatim as the result;
      * a hint without any CJK characters switches the conversion to katakana.

    Args:
        deckName: Name of the Anki deck to process.
        fieldToConvert: Name of the note field containing romaji.
        conversionHintField: Optional name of a field holding the word as it
            would normally be written; used only to pick katakana vs. hiragana.
        shouldEdit: Whether the deck should be modified. NOTE: writing the
            result back to Anki is not implemented in this function yet, so
            this flag currently has no effect.

    Raises:
        KeyError: If a note lacks fieldToConvert or conversionHintField.
        Exception: Propagated from invokeAnkiConnect on a bad reply.
    """
    notes = getNotes(deckName)
    if not notes:
        # Nothing to convert; avoid a pointless 'notesInfo' round trip
        return

    notesInfo = invokeAnkiConnect('notesInfo', notes=notes)
    for currentNote in notesInfo:
        originalText = currentNote['fields'][fieldToConvert]['value']
        textToConvert = sanitizeTextForConversion(originalText)

        if not textToConvert:
            print("\nWarning: Empty '{}' found in the following note, which may be malformed:".format(fieldToConvert))
            print(currentNote)
            continue

        hint = (sanitizeTextForConversion(currentNote['fields'][conversionHintField]['value'])
                if conversionHintField else None)

        convertedText = None

        # Only inspect the hint when there is one: hint is None when no hint
        # field was requested, and iterating None would raise TypeError
        if hint:
            isAllKatakana = all(UnicodeHelpers.is_katakana(char) for char in hint)
            # Chinese, Japanese, Korean
            foundCJK = any(UnicodeHelpers.is_cjk(char) for char in hint)

            if isAllKatakana:
                # The hint is already all katakana; just use it. This fixes problems where romkan won't
                # convert continuations properly: from input "booringu" the converter outputs ボオリング instead of ボーリング
                convertedText = hint
            elif not foundCJK:
                # There are no Japanese characters; it's probably an initialism or acronym, e.g. 'WWW'
                # Convert to katakana
                convertedText = romkan.to_katakana(textToConvert)

        # It's not katakana, or we don't have a hint. All hiragana
        if not convertedText:
            convertedText = romkan.to_hiragana(textToConvert)

        # No conversion
        if not convertedText:
            print("ERROR: No conversion for text '{}'".format(textToConvert))
            continue

        # Leftover Latin letters mean part of the input was not understood as romaji
        if any(UnicodeHelpers.is_latin(char) for char in convertedText):
            print("Warning: conversion did not result in purely Japanese output. There may be "
                  "a typo in the romaji, or the romaji format is not understood.")

        # Already converted
        if textToConvert == convertedText:
            continue

        if hint:
            print("'{}' -> '{}' (hint '{}')".format(originalText, convertedText, hint))
        else:
            print("'{}' -> '{}'".format(originalText, convertedText))

        if shouldEdit:
            # NOTE(review): writing convertedText back to the note is not implemented here
            pass
|||
|
|||
if __name__ == '__main__':
    # Script entry point: parse arguments, confirm a backup exists, then convert.
    print('Anki Romaji Remover: Convert Romaji into Hiragana')

    # Started without any arguments: show the full help text and stop
    if len(sys.argv) == 1:
        argParser.print_help()
        # sys.exit() rather than the site-provided exit(), which is meant for
        # interactive sessions and is not guaranteed to exist in every run mode
        sys.exit()
    args = argParser.parse_args()

    # In soft-edit (preview) mode no confirmation is needed because nothing is written
    shouldEdit = not args.debugSoftEdit
    if shouldEdit:
        answer = input("\nWARNING: This script will modify your Anki deck.\n"
                       "This script's creator is not liable for loss of data!\n"
                       "If you want to preview changes, run with --soft-edit.\n"
                       "\nHave you created a backup of your decks? (yes or no) ")
        # Tolerate surrounding whitespace and any letter case in the answer
        shouldEdit = answer.strip().lower() in ('yes', 'y')

    if not shouldEdit and not args.debugSoftEdit:
        # The user asked for real edits but did not confirm a backup: do nothing
        print("Please back up your data via Anki->File->Export->Anki Collection Package")
    else:
        # shouldEdit is already False whenever --soft-edit was given
        convertNotes(args.DeckName, args.RomajiFieldName,
                     conversionHintField=args.WrittenFieldName,
                     shouldEdit=shouldEdit)
@ -0,0 +1,26 @@ |
|||
#+TITLE: Anki Romaji Remover |
|||
Automatically turn Romaji into Hiragana in an Anki deck. |
|||
|
|||
This is for you if you are bothered by Romaji's inaccuracy or, in my opinion, its somewhat misleading format.
|||
|
|||
This tool takes the name of a deck and the name of the field with Romaji, and converts it into Hiragana. |
|||
|
|||
Some notes: |
|||
- It preserves most anything which isn't romaji, so if you have e.g. "setsumei(suru)" it will convert it to "せつめい(する)" |
|||
- "-" and "’" will be replaced with spaces, however. This is because the romkan converter can get confused by these
|||
- It doesn't hurt to run the script again on a deck which has already been wholly or partially converted |
|||
|
|||
Issues: |
|||
- [ ] English initialisms, e.g. URL, should output to Katakana, but do not |
|||
- [ ] For my deck, katakana output can differ with regard to continuations: from input "booringu" the converter outputs ボオリング instead of ボーリング
|||
* Setup |
|||
- [[https://foosoft.net/projects/anki-connect/index.html#installation][Install AnkiConnect]] |
|||
- Install [[https://github.com/soimort/python-romkan][romkan]]: ~pip3 install romkan~ |
|||
- Run Anki |
|||
- Run script: ~python3 AnkiRomajiRemover.py~ |
|||
* Use |
|||
For my use case, the command looked like this: |
|||
#+BEGIN_SRC sh |
|||
python3 AnkiRomajiRemover.py "A Frequency of Japanese Words" "Romanization" --written-field-name "Lemma" |
|||
#+END_SRC |
|||
Here, ~Lemma~ was the field with the normal written form, and ~Romanization~ was the field whose romaji I wanted to replace with kana.
@ -0,0 +1,59 @@ |
|||
# -*- coding:utf-8 -*- |
|||
|
|||
# From |
|||
# https://stackoverflow.com/questions/30069846/how-to-find-out-chinese-or-japanese-character-in-a-string-in-python |
|||
|
|||
# Inclusive code point ranges that is_cjk() treats as CJK text.
# Each entry is a dict with a "from" and a "to" code point.
ranges = [
    {"from": 0x3300, "to": 0x33FF},    # compatibility ideographs
    {"from": 0xFE30, "to": 0xFE4F},    # compatibility ideographs
    {"from": 0xF900, "to": 0xFAFF},    # compatibility ideographs
    {"from": 0x2F800, "to": 0x2FA1F},  # compatibility ideographs
    {"from": 0x3040, "to": 0x309F},    # Japanese Hiragana
    {"from": 0x30A0, "to": 0x30FF},    # Japanese Katakana
    {"from": 0x2E80, "to": 0x2EFF},    # cjk radicals supplement
    {"from": 0x4E00, "to": 0x9FFF},    # CJK unified ideographs
    {"from": 0x3400, "to": 0x4DBF},    # CJK unified ideographs extension A
    {"from": 0x20000, "to": 0x2A6DF},  # CJK unified ideographs extension B
    {"from": 0x2A700, "to": 0x2B73F},  # CJK unified ideographs extension C
    {"from": 0x2B740, "to": 0x2B81F},  # CJK unified ideographs extension D
    {"from": 0x2B820, "to": 0x2CEAF},  # included as of Unicode 8.0
]

# Inclusive code point range of the Japanese Katakana block
katakanaRange = {"from": 0x30A0, "to": 0x30FF}
|||
|
|||
# "The Alphabet" |
|||
# Inclusive code point ranges of the basic Latin letters, used by is_latin().
# The uppercase range must start at 'A' (U+0041); starting at U+0042 ('B')
# would make 'A' look non-Latin.
latinRanges = [
    {"from": ord("A"), "to": ord("Z")},  # Uppercase A-Z
    {"from": ord("a"), "to": ord("z")},  # Lowercase a-z
]
|||
|
|||
|
|||
def is_cjk(char):
    """Return True if the single character char lies in any entry of ranges."""
    codePoint = ord(char)
    for block in ranges:
        if block["from"] <= codePoint <= block["to"]:
            return True
    return False
|||
|
|||
def is_katakana(char):
    """Return True if the single character char lies within katakanaRange (inclusive)."""
    codePoint = ord(char)
    lowest = katakanaRange["from"]
    highest = katakanaRange["to"]
    return codePoint >= lowest and codePoint <= highest
|||
|
|||
def is_latin(char):
    """Return True if the single character char lies in any entry of latinRanges."""
    codePoint = ord(char)
    for block in latinRanges:
        if block["from"] <= codePoint <= block["to"]:
            return True
    return False
|||
|
|||
def cjk_substrings(string):
    """Yield each maximal run of consecutive CJK characters in string, in order.

    Args:
        string: The text to scan.

    Yields:
        Substrings of string made up solely of characters for which is_cjk()
        is true; neighbouring CJK characters are grouped into one substring.
    """
    i = 0
    length = len(string)
    while i < length:
        if is_cjk(string[i]):
            start = i
            # The bounds check is needed for strings that END in a CJK run;
            # without it string[i] raises IndexError once i reaches the end
            while i < length and is_cjk(string[i]):
                i += 1
            yield string[start:i]
        else:
            i += 1
|||
|
|||
if __name__ == '__main__':
    # Small manual demo of the helpers; runs only when this file is executed directly.

    # Wrap every CJK run of the sample text in parentheses
    sample = "sdf344asfasf天地方益3権sdfsdf"
    marked = sample
    for cjkRun in cjk_substrings(sample):
        marked = marked.replace(cjkRun, "(" + cjkRun + ")")
    print(marked)

    # Report the katakana check for each character of an all-katakana word
    for kana in 'ボーリング':
        print("{} is katakana: {}".format(kana, is_katakana(kana)))

    # Report the Latin check for each character of a mixed string
    for letter in 'this is ボーリング':
        print("{} is latin: {}".format(letter, is_latin(letter)))
Loading…
Reference in new issue