Browse Source

Fixed Calibre download recipe

* I was encountering an issue where some Wallabag articles didn't have
  all of their content. I overrode the fetch_embedded_article() code,
  but I didn't really change anything and it worked. I commented the
  "isinstance" line, and Wallabag articles do start with text with no
  HTML tags ("Reading time: N minutes"), so maybe that was the fix?
* Added command-line tools to create an epub from the articles. See
  https://github.com/makuto/typhon for a good app for reading these articles
* I included a secret key in the last commit, which is now revoked (so
  don't get any ideas :D )
master
Macoy Madson 5 months ago
parent
commit
af35bb75ec
4 changed files with 139 additions and 68 deletions
  1. +2
    -0
      DownloadCalibreArticles.sh
  2. +2
    -0
      DownloadCalibreArticles_Debug.sh
  3. +0
    -68
      src/Calibre_Wallabag_NewsSource.py
  4. +135
    -0
      src/Calibre_Wallabag_To_EPUB.recipe

+ 2
- 0
DownloadCalibreArticles.sh View File

@@ -0,0 +1,2 @@
#!/bin/sh
# Build an EPUB of unread Wallabag articles via Calibre's ebook-convert,
# using the recipe in src/. Output lands in the current directory.
ebook-convert src/Calibre_Wallabag_To_EPUB.recipe .epub

+ 2
- 0
DownloadCalibreArticles_Debug.sh View File

@@ -0,0 +1,2 @@
#!/bin/sh
# Same as DownloadCalibreArticles.sh but verbose (-vv) and with the
# conversion pipeline stages dumped into ./debug for inspection.
ebook-convert src/Calibre_Wallabag_To_EPUB.recipe .epub -vv --debug-pipeline debug

+ 0
- 68
src/Calibre_Wallabag_NewsSource.py View File

@@ -1,68 +0,0 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1592324084(BasicNewsRecipe):
    """Calibre recipe: fetch unread Wallabag articles and keep only the
    ones whose titles contain CJK characters (a Japanese reading list)."""
    title = 'Wallabag Japanese'
    oldest_article = 365
    max_articles_per_feed = 100
    auto_cleanup = True

    feeds = [
        ('Wallabag', 'https://app.wallabag.it/feed/Macoy/hU7xNZIj2hRlyKk/unread'),
    ]

    def parse_feeds(self):
        """Delegate to the parent parser, then drop non-CJK-titled articles."""
        parsed = BasicNewsRecipe.parse_feeds(self)
        for feed in parsed:
            # Loop over a copy so removing entries mid-iteration is safe.
            for article in feed.articles[:]:
                # Wallabag's <category> tag could be used instead, but
                # scanning the title is simpler.
                has_japanese = any(is_cjk(ch) for ch in article.title)
                if not has_japanese:
                    feed.articles.remove(article)
        return parsed

# Modified by Macoy Madson. Original copied from
# https://stackoverflow.com/questions/30069846/how-to-find-out-chinese-or-japanese-character-in-a-string-in-python

# Modified by Macoy Madson. Original copied from
# https://stackoverflow.com/questions/30069846/how-to-find-out-chinese-or-japanese-character-in-a-string-in-python

hiraganaRange = {'from': ord(u'\u3040'), 'to': ord(u'\u309f')}  # Japanese Hiragana
katakanaRange = {"from": ord(u"\u30a0"), "to": ord(u"\u30ff")}  # Japanese Katakana

# Specifically ignore Hiragana and Katakana in order to get all Kanji
cjkNonKanaRanges = [
    {"from": ord(u"\u3300"), "to": ord(u"\u33ff")},  # compatibility ideographs
    {"from": ord(u"\ufe30"), "to": ord(u"\ufe4f")},  # compatibility ideographs
    {"from": ord(u"\uf900"), "to": ord(u"\ufaff")},  # compatibility ideographs
    {"from": ord(u"\U0002F800"), "to": ord(u"\U0002fa1f")},  # compatibility ideographs
    {"from": ord(u"\u2e80"), "to": ord(u"\u2eff")},  # cjk radicals supplement
    {"from": ord(u"\u4e00"), "to": ord(u"\u9fff")},  # CJK Unified Ideographs
    {"from": ord(u"\u3400"), "to": ord(u"\u4dbf")},  # CJK Unified Ideographs Extension A
    {"from": ord(u"\U00020000"), "to": ord(u"\U0002a6df")},  # Extension B
    {"from": ord(u"\U0002a700"), "to": ord(u"\U0002b73f")},  # Extension C
    {"from": ord(u"\U0002b740"), "to": ord(u"\U0002b81f")},  # Extension D
    {"from": ord(u"\U0002b820"), "to": ord(u"\U0002ceaf")},  # included as of Unicode 8.0
]

# Everything is_cjk() accepts: the kanji ranges plus both kana blocks.
cjkRanges = list(cjkNonKanaRanges)
cjkRanges.append(hiraganaRange)  # Japanese Hiragana
cjkRanges.append(katakanaRange)  # Japanese Katakana

# "The Alphabet" (unused by is_cjk; kept for other callers)
latinRanges = [
    # BUGFIX: the lower bound was \u0042 ('B'), silently excluding 'A'.
    {"from": ord(u"\u0041"), "to": ord(u"\u005a")},  # Uppercase A-Z
    {"from": ord(u"\u0061"), "to": ord(u"\u007a")},  # Lowercase a-z
]

# Is Chinese, Japanese, or Korean unicode character
def is_cjk(char):
    """Return True if `char` is in any CJK range (kanji or kana)."""
    code = ord(char)
    # Generator instead of list-in-any(); `r` avoids shadowing builtin `range`.
    return any(r["from"] <= code <= r["to"] for r in cjkRanges)

+ 135
- 0
src/Calibre_Wallabag_To_EPUB.recipe View File

@@ -0,0 +1,135 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe

import time

# For Wallabag workaround
from calibre.web.feeds import templates
from calibre.ptempfile import PersistentTemporaryFile
from calibre import (iswindows)
from lxml import html, etree
from lxml.html.builder import HTML, HEAD, TITLE, STYLE, DIV, BODY, \
STRONG, BR, SPAN, A, HR, UL, LI, H2, H3, IMG, P as PT, \
TABLE, TD, TR
from calibre import strftime, isbytestring

class AdvancedUserRecipe1592324084(BasicNewsRecipe):
    """Download unread Wallabag articles whose titles contain CJK
    characters and bundle them into a timestamped EPUB."""

    # Timestamp the title so repeated runs produce distinct books.
    title = 'Wallabag Japanese {}'.format(time.strftime("%Y-%m-%d %H:%M"))
    oldest_article = 365
    max_articles_per_feed = 100
    auto_cleanup = True

    feeds = [
        # Example: ('Wallabag', 'https://app.wallabag.it/feed/Macoy/MySecretToken/unread'),
        ('Wallabag', 'https://app.wallabag.it/feed/You/YourToken/unread'),
    ]

    def parse_feeds(self):
        """Delegate to the parent parser, then drop non-CJK-titled articles."""
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            # Iterate over a copy so removal during the loop is safe.
            for article in feed.articles[:]:
                # Wallabag's <category> tag could be used instead, but
                # scanning the title is simpler.
                if not any(is_cjk(ch) for ch in article.title):
                    feed.articles.remove(article)
        return feeds

    # Workaround for Wallabag Content not providing necessary HTML tags
    def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
        """Render the article through WallabagEmbeddedContent (instead of
        calibre's stock EmbeddedContent template), write it to a temp HTML
        file, and hand that file to the normal article fetcher."""
        templ = WallabagEmbeddedContent()
        raw = templ.generate(article).render('html')
        with PersistentTemporaryFile('_feeds2disk.html') as pt:
            pt.write(raw)
            url = ('file:' + pt.name) if iswindows else ('file://' + pt.name)
        return self._fetch_article(url, dir, f, a, num_of_feeds)

"""
Wallabag fixup content HTML
I'm not sure why this works. All I did was comment the isinstance() line...
"""
class WallabagEmbeddedContent(templates.Template):
    """Replacement for calibre's templates.EmbeddedContent.

    Wallabag article bodies can start with bare text rather than a
    wrapping HTML tag (e.g. "Reading time: N minutes"). This template
    assigns such a leading text fragment to the container div's .text and
    appends the remaining fragments as child elements.
    """

    def _generate(self, article, style=None, extra_css=None):
        content = article.content if article.content else ''
        summary = article.summary if article.summary else ''
        # Prefer whichever field carries more of the article body.
        text = content if len(content) > len(summary) else summary
        head = HEAD(TITLE(article.title))
        if style:
            head.append(STYLE(style, type='text/css'))
        if extra_css:
            head.append(STYLE(extra_css, type='text/css'))

        if isbytestring(text):
            text = text.decode('utf-8', 'replace')
        elements = html.fragments_fromstring(text)
        self.root = HTML(head,
                         BODY(H2(article.title), DIV()))
        div = self.root.find('body').find('div')
        # BUGFIX: the old unconditional print(elements[0]) raised
        # IndexError for empty article bodies; debug prints removed.
        # type(u'') is the unicode string type on both Python 2 and 3 —
        # used because importing unicode_type failed in this environment.
        # Only a *string* first fragment goes into div.text; an element
        # first fragment is appended below like the rest.
        if elements and isinstance(elements[0], type(u'')):
            div.text = elements[0]
            elements = list(elements)[1:]
        for elem in elements:
            if hasattr(elem, 'getparent'):
                # Detach from the parse fragment tree before re-parenting.
                elem.getparent().remove(elem)
            else:
                # Bare trailing text: wrap it so it can be appended.
                elem = SPAN(elem)
            div.append(elem)


"""
Unicode helpers
"""

# Modified by Macoy Madson. Original copied from
# https://stackoverflow.com/questions/30069846/how-to-find-out-chinese-or-japanese-character-in-a-string-in-python

hiraganaRange = {'from': ord(u'\u3040'), 'to': ord(u'\u309f')}  # Japanese Hiragana
katakanaRange = {"from": ord(u"\u30a0"), "to": ord(u"\u30ff")}  # Japanese Katakana

# Specifically ignore Hiragana and Katakana in order to get all Kanji
cjkNonKanaRanges = [
    {"from": ord(u"\u3300"), "to": ord(u"\u33ff")},  # compatibility ideographs
    {"from": ord(u"\ufe30"), "to": ord(u"\ufe4f")},  # compatibility ideographs
    {"from": ord(u"\uf900"), "to": ord(u"\ufaff")},  # compatibility ideographs
    {"from": ord(u"\U0002F800"), "to": ord(u"\U0002fa1f")},  # compatibility ideographs
    {"from": ord(u"\u2e80"), "to": ord(u"\u2eff")},  # cjk radicals supplement
    {"from": ord(u"\u4e00"), "to": ord(u"\u9fff")},  # CJK Unified Ideographs
    {"from": ord(u"\u3400"), "to": ord(u"\u4dbf")},  # CJK Unified Ideographs Extension A
    {"from": ord(u"\U00020000"), "to": ord(u"\U0002a6df")},  # Extension B
    {"from": ord(u"\U0002a700"), "to": ord(u"\U0002b73f")},  # Extension C
    {"from": ord(u"\U0002b740"), "to": ord(u"\U0002b81f")},  # Extension D
    {"from": ord(u"\U0002b820"), "to": ord(u"\U0002ceaf")},  # included as of Unicode 8.0
]

# Everything is_cjk() accepts: the kanji ranges plus both kana blocks.
cjkRanges = list(cjkNonKanaRanges)
cjkRanges.append(hiraganaRange)  # Japanese Hiragana
cjkRanges.append(katakanaRange)  # Japanese Katakana

# "The Alphabet" (unused by is_cjk; kept for other callers)
latinRanges = [
    # BUGFIX: the lower bound was \u0042 ('B'), silently excluding 'A'.
    {"from": ord(u"\u0041"), "to": ord(u"\u005a")},  # Uppercase A-Z
    {"from": ord(u"\u0061"), "to": ord(u"\u007a")},  # Lowercase a-z
]

# Is Chinese, Japanese, or Korean unicode character
def is_cjk(char):
    """Return True if `char` is in any CJK range (kanji or kana)."""
    code = ord(char)
    # Generator instead of list-in-any(); `r` avoids shadowing builtin `range`.
    return any(r["from"] <= code <= r["to"] for r in cjkRanges)

Loading…
Cancel
Save