Browse Source

Added dumb dictionary, Calibre Wallabag rule

* The dictionary doesn't really work very well. I abandoned this once
I found that Rikai-kun has deconjugators (etc.) that I don't want to
re-implement, so I've moved to relying on their software instead
* I have been adding articles in Japanese to Wallabag for reading
practice. The Calibre rule filters out English articles so I can just
upload Japanese articles to my e-Reader (Onyx Boox Nova 2 running my
modified version of Typhon for dictionary)
master
Macoy Madson 5 months ago
parent
commit
e4f4aab68d
3 changed files with 249 additions and 32 deletions
  1. +1
    -0
      Jamrules
  2. +68
    -0
      src/Calibre_Wallabag_NewsSource.py
  3. +180
    -32
      src/TextProcessor.cpp

+ 1
- 0
Jamrules View File

@@ -30,6 +30,7 @@ HDRS = src
Dependencies/curl/include
Dependencies/rapidjson/include
Dependencies/mecab/build/local/include
Dependencies/parallel-hashmap/parallel_hashmap
;

# TODO: Make base hold all this weirdness?


+ 68
- 0
src/Calibre_Wallabag_NewsSource.py View File

@@ -0,0 +1,68 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1592324084(BasicNewsRecipe):
    """Fetch unread Wallabag articles, keeping only Japanese-titled ones."""
    title = 'Wallabag Japanese'
    oldest_article = 365
    max_articles_per_feed = 100
    auto_cleanup = True

    feeds = [
        ('Wallabag', 'https://app.wallabag.it/feed/Macoy/hU7xNZIj2hRlyKk/unread'),
    ]

    def parse_feeds(self):
        # Let the base class download and parse the feed as usual
        feeds = BasicNewsRecipe.parse_feeds(self)
        # Then drop every article whose title has no CJK characters at all.
        # (Inspecting the title is simpler than relying on Wallabag's
        # <category> tags.)
        for feed in feeds:
            for article in feed.articles[:]:
                if not any(is_cjk(character) for character in article.title):
                    feed.articles.remove(article)
        return feeds

# Modified by Macoy Madson. Original copied from
# https://stackoverflow.com/questions/30069846/how-to-find-out-chinese-or-japanese-character-in-a-string-in-python

hiraganaRange = {'from': ord(u'\u3040'), 'to': ord(u'\u309f')}  # Japanese Hiragana
katakanaRange = {"from": ord(u"\u30a0"), "to": ord(u"\u30ff")}  # Japanese Katakana

# Specifically ignore Hiragana and Katakana in order to get all Kanji
cjkNonKanaRanges = [
    {"from": ord(u"\u3300"), "to": ord(u"\u33ff")},          # compatibility ideographs
    {"from": ord(u"\ufe30"), "to": ord(u"\ufe4f")},          # compatibility ideographs
    {"from": ord(u"\uf900"), "to": ord(u"\ufaff")},          # compatibility ideographs
    {"from": ord(u"\U0002F800"), "to": ord(u"\U0002fa1f")},  # compatibility ideographs
    {"from": ord(u"\u2e80"), "to": ord(u"\u2eff")},          # cjk radicals supplement
    {"from": ord(u"\u4e00"), "to": ord(u"\u9fff")},
    {"from": ord(u"\u3400"), "to": ord(u"\u4dbf")},
    {"from": ord(u"\U00020000"), "to": ord(u"\U0002a6df")},
    {"from": ord(u"\U0002a700"), "to": ord(u"\U0002b73f")},
    {"from": ord(u"\U0002b740"), "to": ord(u"\U0002b81f")},
    {"from": ord(u"\U0002b820"), "to": ord(u"\U0002ceaf")}   # included as of Unicode 8.0
]

# All CJK ranges: the ideograph ranges above plus the two kana ranges
cjkRanges = cjkNonKanaRanges + [hiraganaRange, katakanaRange]

# "The Alphabet"
latinRanges = [
    # Fixed: this previously started at ord(u"\u0042") ('B'), which
    # silently excluded uppercase 'A' from the range
    {"from": ord(u"\u0041"), "to": ord(u"\u005a")},  # Uppercase A-Z
    {"from": ord(u"\u0061"), "to": ord(u"\u007a")}   # Lowercase a-z
]


def is_cjk(char):
    """Return True if char is a Chinese, Japanese, or Korean unicode character."""
    # 'unicodeRange' rather than 'range' to avoid shadowing the builtin
    return any(unicodeRange["from"] <= ord(char) <= unicodeRange["to"]
               for unicodeRange in cjkRanges)

+ 180
- 32
src/TextProcessor.cpp View File

@@ -1,9 +1,11 @@
#include <mecab.h>
#include <cstring>
#include <fstream>
#include <iostream>
#include <vector>

#include <mecab.h>
#include <phmap.h>

#define CHECK(eval) \
if (!eval) \
{ \
@@ -13,10 +15,42 @@
return -1; \
}

// Note that frustratingly, phmap::flat_hash_map does not support const char* as key
// Maps a Japanese writing/reading to a pointer at the first byte of its full
// dictionary line inside rawDictionary.
typedef phmap::flat_hash_map<std::string, const char*> DictionaryHashMap;

static DictionaryHashMap dictionary;
// The whole dictionary file loaded into one buffer. The values stored in
// 'dictionary' point into this memory, so it must outlive the map.
static char* rawDictionary = nullptr;
static size_t rawDictionarySize = 0;

// Register one word (a writing or a reading) in the dictionary.
// word points at 'wordLength' bytes that are NOT null-terminated; entry points
// into rawDictionary at the start of the word's full dictionary line.
// If the same key is seen twice, the last entry wins.
void finishAddWordToDictionary(const char* word, size_t wordLength, const char* entry)
{
	// std::string copies the bytes itself, so no manual allocation is needed.
	// (The previous version new[]'d a scratch buffer here and never freed it,
	// leaking one allocation per word.)
	std::string key(word, wordLength);

	DictionaryHashMap::iterator checkDupeIt = dictionary.find(key);
	if (checkDupeIt != dictionary.end())
	{
		// TODO: Handle duplicates properly (currently the last one wins)
		// std::cout << "Warning: duplicate key '" << key << "' found\n";
	}

	dictionary[key] = entry;
}

void loadDictionary()
{
// About how many entries there are (lower bound!)
dictionary.reserve(190000);
std::cout << "Loading dictionary..." << std::flush;
std::ifstream inputFile;
// std::ios::ate so tellg returns the size
inputFile.open("data/utf8Edict2", std::ios::in | std::ios::binary | std::ios::ate);
@@ -25,36 +59,150 @@ void loadDictionary()
inputFile.seekg(0, std::ios::beg);
inputFile.read(rawDictionary, rawDictionarySize);
inputFile.close();

// State machine for one EDICT2-style line. From the transitions below, a
// line appears to have the shape:
//   WRITING[;WRITING...] [READING[;READING...]] /gloss/.../EntL<id>/
// TODO confirm against the official EDICT2 format description
enum class EDict2ReadState
{
// The file's first line is a header/version line in a different format
VersionNumber = 0,
JapaneseWord,
Reading,
EnglishDefinition,
EntryId
};

EDict2ReadState readState = EDict2ReadState::VersionNumber;
// Multiple ways to say the same "word"
// NOTE(review): wordsThisEntry is never used in this function
std::vector<const char*> wordsThisEntry;
// Scratch buffer accumulating the current writing/reading (spaces stripped).
// NOTE(review): writes through bufferWriteHead are not bounds-checked against
// the 1024-byte capacity; an oversized token would overflow this buffer
char buffer[1024];
char* bufferWriteHead = buffer;
// First byte of the current line inside rawDictionary; used as the value for
// every writing/reading key found on that line
const char* beginningOfLine = nullptr;

// If the scratch buffer is non-empty, register its contents as a key mapping
// to the current line, then reset the buffer for the next token
#define FINISH_ADD_WORD() \
if (bufferWriteHead != buffer) \
{ \
finishAddWordToDictionary(buffer, bufferWriteHead - buffer, beginningOfLine); \
bufferWriteHead = buffer; \
}

for (size_t i = 0; i < rawDictionarySize; ++i)
{
if (rawDictionary[i] == '\n')
{
// If this is hit, an entry has a format this state machine doesn't understand
assert(readState == EDict2ReadState::VersionNumber ||
readState == EDict2ReadState::EntryId);
// Reset for the start of next word (words are separated by line)
beginningOfLine = nullptr;
readState = EDict2ReadState::JapaneseWord;
continue;
}

switch (readState)
{
case EDict2ReadState::VersionNumber:
// Ignore the whole first line, because it is a different format to report version
// info
break;
case EDict2ReadState::JapaneseWord:
// Remember where this line starts; every key on the line maps to it
if (!beginningOfLine)
beginningOfLine = &rawDictionary[i];

// '/' begins the English definitions; '[' begins the reading section
if (rawDictionary[i] == '/')
{
FINISH_ADD_WORD();
readState = EDict2ReadState::EnglishDefinition;
}
else if (rawDictionary[i] == '[')
{
FINISH_ADD_WORD();
readState = EDict2ReadState::Reading;
}
else if (rawDictionary[i] == ';')
{
// Separate writing of the same word
FINISH_ADD_WORD();
}
else if (rawDictionary[i] == ' ')
{
// Ignore all spaces
}
else
{
*bufferWriteHead = rawDictionary[i];
++bufferWriteHead;
}
break;
case EDict2ReadState::Reading:
// Same accumulation as JapaneseWord; readings become lookup keys too
if (rawDictionary[i] == ']')
{
FINISH_ADD_WORD();
readState = EDict2ReadState::JapaneseWord;
}
else if (rawDictionary[i] == ';')
{
// Separate reading
FINISH_ADD_WORD();
}
else if (rawDictionary[i] == ' ')
{
// Ignore all spaces
}
else
{
*bufferWriteHead = rawDictionary[i];
++bufferWriteHead;
}
break;
case EDict2ReadState::EnglishDefinition:
// Gross. Absorb '/' unless it's clearly the entry ID slash
// NOTE(review): reads up to rawDictionary[i + 4]; this can read past the
// end of the buffer if a line ends in '/' within 4 bytes of EOF
if (rawDictionary[i] == '/' && rawDictionary[i + 1] == 'E' &&
rawDictionary[i + 2] == 'n' && rawDictionary[i + 3] == 't' &&
rawDictionary[i + 4] == 'L')
readState = EDict2ReadState::EntryId;
break;
case EDict2ReadState::EntryId:
// Nothing to do; the '\n' handler above resets state for the next line
break;
default:
break;
}
}

#undef FINISH_ADD_WORD

std::cout << "done.\n" << std::flush;
}

// enum class EDict2ReadState
// bool getDictionaryResults(const char* query, char* outBuffer, size_t outBufferSize)
// {
// None = 0,
// JapaneseWord,

// char* result = strstr(rawDictionary, query);
// if (result)
// {
// // TODO Range check raw Dictionary pointer
// for (size_t i = 0; i < outBufferSize; i++)
// {
// outBuffer[i] = result[i];
// if (result[i] == '\n')
// break;
// }
// return true;
// }
// return false;
// }

// Look up 'query' in the dictionary. On a hit, copies the entry's full line
// (terminated by '\n' in the raw data) into outBuffer and returns true;
// returns false when the word is not found. outBuffer is always
// null-terminated on success, even if the entry was truncated to fit.
bool getDictionaryResults(const char* query, char* outBuffer, size_t outBufferSize)
{
	if (!query || !outBuffer || outBufferSize == 0)
		return false;

	DictionaryHashMap::iterator findIt = dictionary.find(query);
	if (findIt == dictionary.end())
		return false;

	// The value points into rawDictionary; the entry runs until its newline.
	// (The previous version also strstr'd the whole raw buffer on every
	// lookup just to print a debug message -- an O(file size) scan removed.)
	const char* entry = findIt->second;
	size_t numCharsWritten = 0;
	for (; numCharsWritten < outBufferSize - 1; ++numCharsWritten)
	{
		outBuffer[numCharsWritten] = entry[numCharsWritten];
		if (entry[numCharsWritten] == '\n')
		{
			++numCharsWritten;
			break;
		}
	}
	// Terminate explicitly; the previous version relied on the caller
	// zero-filling the buffer and wrote no terminator when the entry
	// filled it completely
	outBuffer[numCharsWritten] = '\0';
	return true;
}

void freeDictionary()
@@ -103,11 +251,11 @@ int main(int argc, char** argv)
// for (int i = 0; i < node->length; ++i)
// feature[i] = *(node->surface + i);

char dictionaryResult[512] = {0};
if (getDictionaryResults(feature, dictionaryResult, sizeof(dictionaryResult)))
std::cout << feature << " " << dictionaryResult << "\n";
else
std::cout << feature << "\n";
// char dictionaryResult[512] = {0};
// if (getDictionaryResults(feature, dictionaryResult, sizeof(dictionaryResult)))
// std::cout << feature << " " << dictionaryResult << "\n";
// else
std::cout << feature << "_\n";
}

// std::cout << node->id << ' ';
@@ -118,11 +266,11 @@ int main(int argc, char** argv)
// else
// std::cout.write(node->surface, node->length);

// std::cout << ' ' << node->feature << ' ' << (int)(node->surface - input) << ' '
// << (int)(node->surface - input + node->length) << ' ' << node->rcAttr << ' '
// << node->lcAttr << ' ' << node->posid << ' ' << (int)node->char_type << ' '
// << (int)node->stat << ' ' << (int)node->isbest << ' ' << node->alpha << ' '
// << node->beta << ' ' << node->prob << ' ' << node->cost << std::endl;
std::cout << ' ' << node->feature << ' ' << (int)(node->surface - input) << ' '
<< (int)(node->surface - input + node->length) << ' ' << node->rcAttr << ' '
<< node->lcAttr << ' ' << node->posid << ' ' << (int)node->char_type << ' '
<< (int)node->stat << ' ' << (int)node->isbest << ' ' << node->alpha << ' '
<< node->beta << ' ' << node->prob << ' ' << node->cost << std::endl;
}

delete tagger;


Loading…
Cancel
Save