|
|
@ -0,0 +1,135 @@ |
|
|
|
#!/usr/bin/env python2 |
|
|
|
# vim:fileencoding=utf-8 |
|
|
|
from __future__ import unicode_literals, division, absolute_import, print_function |
|
|
|
from calibre.web.feeds.news import BasicNewsRecipe |
|
|
|
|
|
|
|
import time |
|
|
|
|
|
|
|
# For Wallabag workaround |
|
|
|
from calibre.web.feeds import templates |
|
|
|
from calibre.ptempfile import PersistentTemporaryFile |
|
|
|
from calibre import (iswindows) |
|
|
|
from lxml import html, etree |
|
|
|
from lxml.html.builder import HTML, HEAD, TITLE, STYLE, DIV, BODY, \ |
|
|
|
STRONG, BR, SPAN, A, HR, UL, LI, H2, H3, IMG, P as PT, \ |
|
|
|
TABLE, TD, TR |
|
|
|
from calibre import strftime, isbytestring |
|
|
|
|
|
|
|
class AdvancedUserRecipe1592324084(BasicNewsRecipe): |
|
|
|
title = 'Wallabag Japanese {}'.format(time.strftime("%Y-%m-%d %H:%M")) |
|
|
|
oldest_article = 365 |
|
|
|
max_articles_per_feed = 100 |
|
|
|
auto_cleanup = True |
|
|
|
|
|
|
|
feeds = [ |
|
|
|
# Example: ('Wallabag', 'https://app.wallabag.it/feed/Macoy/MySecretToken/unread'), |
|
|
|
('Wallabag', 'https://app.wallabag.it/feed/You/YourToken/unread'), |
|
|
|
] |
|
|
|
|
|
|
|
def parse_feeds(self): |
|
|
|
# Call parent |
|
|
|
feeds = BasicNewsRecipe.parse_feeds(self) |
|
|
|
|
|
|
|
# Further filtering |
|
|
|
for feed in feeds: |
|
|
|
for article in feed.articles[:]: |
|
|
|
# if not "えんぴつの家出" in article.title: |
|
|
|
# feed.articles.remove(article) |
|
|
|
# continue |
|
|
|
foundJapanese = False |
|
|
|
for char in article.title: |
|
|
|
# While I could use <category> from Wallabag, I can't be assed |
|
|
|
if is_cjk(char): |
|
|
|
foundJapanese = True |
|
|
|
break |
|
|
|
if not foundJapanese: |
|
|
|
feed.articles.remove(article) |
|
|
|
return feeds |
|
|
|
|
|
|
|
# Workaround for Wallabag Content not providing necessary HTML tags |
|
|
|
def fetch_embedded_article(self, article, dir, f, a, num_of_feeds): |
|
|
|
# Override templates.EmbeddedContent |
|
|
|
templ = WallabagEmbeddedContent() |
|
|
|
raw = templ.generate(article).render('html') |
|
|
|
with PersistentTemporaryFile('_feeds2disk.html') as pt: |
|
|
|
pt.write(raw) |
|
|
|
url = ('file:'+pt.name) if iswindows else ('file://'+pt.name) |
|
|
|
return self._fetch_article(url, dir, f, a, num_of_feeds) |
|
|
|
|
|
|
|
""" |
|
|
|
Wallabag fixup content HTML |
|
|
|
I'm not sure why this works. All I did was comment the isinstance() line... |
|
|
|
""" |
|
|
|
class WallabagEmbeddedContent(templates.Template): |
|
|
|
|
|
|
|
def _generate(self, article, style=None, extra_css=None): |
|
|
|
print("\n\nUsed Wallabag Embedded Content!\n\n") |
|
|
|
content = article.content if article.content else '' |
|
|
|
summary = article.summary if article.summary else '' |
|
|
|
text = content if len(content) > len(summary) else summary |
|
|
|
head = HEAD(TITLE(article.title)) |
|
|
|
if style: |
|
|
|
head.append(STYLE(style, type='text/css')) |
|
|
|
if extra_css: |
|
|
|
head.append(STYLE(extra_css, type='text/css')) |
|
|
|
|
|
|
|
if isbytestring(text): |
|
|
|
text = text.decode('utf-8', 'replace') |
|
|
|
elements = html.fragments_fromstring(text) |
|
|
|
print(elements) |
|
|
|
self.root = HTML(head, |
|
|
|
BODY(H2(article.title), DIV())) |
|
|
|
div = self.root.find('body').find('div') |
|
|
|
# Commented only because I don't know how to import unicode_type without error |
|
|
|
print(elements[0]) |
|
|
|
if elements: #and isinstance(elements[0], unicode_type): |
|
|
|
div.text = elements[0] |
|
|
|
elements = list(elements)[1:] |
|
|
|
for elem in elements: |
|
|
|
if hasattr(elem, 'getparent'): |
|
|
|
elem.getparent().remove(elem) |
|
|
|
else: |
|
|
|
elem = SPAN(elem) |
|
|
|
div.append(elem) |
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
|
Unicode helpers |
|
|
|
""" |
|
|
|
|
|
|
|
# Modified by Macoy Madson. Original copied from |
|
|
|
# https://stackoverflow.com/questions/30069846/how-to-find-out-chinese-or-japanese-character-in-a-string-in-python |
|
|
|
|
|
|
|
hiraganaRange = {'from': ord(u'\u3040'), 'to': ord(u'\u309f')} # Japanese Hiragana |
|
|
|
katakanaRange = {"from": ord(u"\u30a0"), "to": ord(u"\u30ff")} # Japanese Katakana |
|
|
|
|
|
|
|
# Specifically ignore Hiragana and Katakana in order to get all Kanji |
|
|
|
cjkNonKanaRanges = [ |
|
|
|
{"from": ord(u"\u3300"), "to": ord(u"\u33ff")}, # compatibility ideographs |
|
|
|
{"from": ord(u"\ufe30"), "to": ord(u"\ufe4f")}, # compatibility ideographs |
|
|
|
{"from": ord(u"\uf900"), "to": ord(u"\ufaff")}, # compatibility ideographs |
|
|
|
{"from": ord(u"\U0002F800"), "to": ord(u"\U0002fa1f")}, # compatibility ideographs |
|
|
|
{"from": ord(u"\u2e80"), "to": ord(u"\u2eff")}, # cjk radicals supplement |
|
|
|
{"from": ord(u"\u4e00"), "to": ord(u"\u9fff")}, |
|
|
|
{"from": ord(u"\u3400"), "to": ord(u"\u4dbf")}, |
|
|
|
{"from": ord(u"\U00020000"), "to": ord(u"\U0002a6df")}, |
|
|
|
{"from": ord(u"\U0002a700"), "to": ord(u"\U0002b73f")}, |
|
|
|
{"from": ord(u"\U0002b740"), "to": ord(u"\U0002b81f")}, |
|
|
|
{"from": ord(u"\U0002b820"), "to": ord(u"\U0002ceaf")} # included as of Unicode 8.0 |
|
|
|
] |
|
|
|
|
|
|
|
cjkRanges = [] |
|
|
|
for nonKanaRange in cjkNonKanaRanges: |
|
|
|
cjkRanges.append(nonKanaRange) |
|
|
|
cjkRanges.append(hiraganaRange) # Japanese Hiragana |
|
|
|
cjkRanges.append(katakanaRange) # Japanese Katakana |
|
|
|
|
|
|
|
# "The Alphabet" |
|
|
|
latinRanges = [ |
|
|
|
{"from": ord(u"\u0042"), "to": ord(u"\u005a")}, # Uppercase A-Z |
|
|
|
{"from": ord(u"\u0061"), "to": ord(u"\u007a")} # Lowercase a-z |
|
|
|
] |
|
|
|
|
|
|
|
# Is Chinese, Japanese, or Korean unicode character |
|
|
|
def is_cjk(char): |
|
|
|
return any([range["from"] <= ord(char) <= range["to"] for range in cjkRanges]) |