Browse Source

Added .org file output

* Files will now be output with notes in org format
* Fixed bug where calling toOrgString() would modify the note and
  then fail on subsequent calls
* Added dates and sorting of notes by date
pull/2/head
Macoy Madson 2 years ago
parent
commit
28cdc3a9b7
2 changed files with 106 additions and 40 deletions
  1. +102
    -38
      KeepToOrg.py
  2. +4
    -2
      README.md

+ 102
- 38
KeepToOrg.py View File

@@ -1,20 +1,22 @@
import os
import html
import sys
import datetime

"""
KeepToOrg.py

Usage:
python KeepToOrg.py /path/to/google/Keep
python KeepToOrg.py /path/to/google/Keep output/dir

Given a Takeout of your Google Keep Notes in .html format, output .org files with logical groupings
based on tags. This will also format lists and try to be smart.
"""

# TODO: Format links:
# Links have the syntax [[https://blah][Example link]] (things can be internal links too!)
# See https://orgmode.org/manual/External-links.html
# TODO:
# Format links:
# Links have the syntax [[https://blah][Example link]] (things can be internal links too!)
# See https://orgmode.org/manual/External-links.html

# Convert an array of tags to an Emacs Org tag string
# Tags have the syntax :tag: or :tag1:tag2:
@@ -30,78 +32,95 @@ def tagsToOrgString(tags):

class Note:
def __init__(self):
self.title = ""
self.body = ""
self.title = ''
self.body = ''
self.tags = []
self.archived = False
self.date = ''

def toOrgString(self):
status = '(ARCHIVED) ' if self.archived else ''
# status = '(archived) ' if self.archived else ''
# Create a copy so we can mangle it
body = self.body
title = self.title

# Convert lists to org lists. This is a total hack but works
self.body = self.body.replace('<li class="listitem"><span class="bullet">&#9744;</span>\n', '- [ ] ')
self.body = self.body.replace('<li class="listitem checked"><span class="bullet">&#9745;</span>', '- [X] ')
body = body.replace('<li class="listitem"><span class="bullet">&#9744;</span>\n', '- [ ] ')
body = body.replace('<li class="listitem checked"><span class="bullet">&#9745;</span>', '- [X] ')
# Flat out remove these
for htmlTagToErase in ['<span class="text">', '</span>', '</li>', '<ul class="list">', '</ul>']:
self.body = self.body.replace(htmlTagToErase, '')
body = body.replace(htmlTagToErase, '')
# This is very weird, but fix the edge case where the list entry has a new line before the content
for listTypeToFixNewLines in ['- [ ] \n','- [X] \n']:
self.body = self.body.replace(listTypeToFixNewLines, listTypeToFixNewLines[:-1])
body = body.replace(listTypeToFixNewLines, listTypeToFixNewLines[:-1])

# Unescape all (e.g. remove &quot and replace with ")
self.title = html.unescape(self.title)
self.body = html.unescape(self.body)
title = html.unescape(title)
body = html.unescape(body)
for i, tag in enumerate(self.tags):
self.tags[i] = html.unescape(tag)

# Strip tags
for tag in self.tags:
self.body = self.body.replace('#{}'.format(tag), '')
body = body.replace('#{}'.format(tag), '')

# Remove any leading/trailing whitespace (possibly leftover from tags stripping)
self.body = self.body.strip()
body = body.strip()

# Make a title if necessary
orgTitle = self.title
orgTitle = title
if not orgTitle:
toNewline = self.body.find('\n')
toNewline = body.find('\n')
# If there's a line break; use the first line as a title
if toNewline >= 0:
orgTitle = self.body[:toNewline]
self.body = self.body[len(orgTitle) + 1:]
orgTitle = body[:toNewline]
body = body[len(orgTitle) + 1:]
# The note has no breaks; make the body the title
else:
orgTitle = self.body
orgTitle = body
# If the title is the whole body, clear the body
self.body = ''

if self.body or len(self.tags):
if self.body and not len(self.tags):
return '* {}{}\n{}'.format(status, orgTitle, self.body)
if not self.body and len(self.tags):
return '* {}{}\n{}'.format(status, orgTitle, tagsToOrgString(self.tags))
body = ''

nesting = '*' if self.archived else ''
# Various levels of information require different formats
if body or len(self.tags):
if body and not len(self.tags):
return '*{} {}\n{}'.format(nesting, orgTitle, body)
if not body and len(self.tags):
return '*{} {}\n{}'.format(nesting, orgTitle, tagsToOrgString(self.tags))
else:
return "* {}{}\n{}\n{}".format(status, orgTitle, self.body, tagsToOrgString(self.tags))
return "*{} {}\n{}\n{}".format(nesting, orgTitle, body, tagsToOrgString(self.tags))
# If no body nor tags, note should be a single line
else:
return "* {}{}".format(status, orgTitle)
return '*{} {}'.format(nesting, orgTitle)

def getAllNoteHtmlFiles(htmlDir):
print('Looking for notes in {}'.format(htmlDir))
noteHtmlFiles = []
for root, dirs, files in os.walk(htmlDir):
for file in files:
if file.endswith(".html"):
if file.endswith('.html'):
noteHtmlFiles.append(os.path.join(root, file))

print ('Found {} notes'.format(len(noteHtmlFiles)))
return noteHtmlFiles

def getHtmlValueIfMatches(line, tag, endTag):
if tag.lower() in line.lower() and endTag.lower() in line.lower():
return line[line.find(tag) + len(tag):-(len(endTag) + 1)], True
return '', False

def main(keepHtmlDir):
def makeSafeFilename(strToPurify):
strToPurify = strToPurify.replace('/', '')
strToPurify = strToPurify.replace('.', '')
return strToPurify

def main(keepHtmlDir, outputDir):
noteFiles = getAllNoteHtmlFiles(keepHtmlDir)

noteGroups = {}
for noteFilePath in noteFiles:
# Read in the file
@@ -109,7 +128,7 @@ def main(keepHtmlDir):
noteLines = noteFile.readlines()
noteFile.close()

print('\nParsing {}'.format(noteFilePath))
# print('Parsing {}'.format(noteFilePath))

note = Note()
@@ -138,7 +157,13 @@ def main(keepHtmlDir):
# This isn't great; for same-line bodies, strip opening div
line = line.replace('<div class="content">', '')

# Parse tags
# Parse the date
if ' AM</div>' in line or ' PM</div>' in line:
dateString = line.replace('</div>', '').strip()
# Example: "Apr 27, 2018, 6:32:15 PM"
note.date = datetime.datetime.strptime(dateString, '%b %d, %Y, %I:%M:%S %p')

# Parse tags, if any
potentialTag, isMatch = getHtmlValueIfMatches(line, '<span class="label-name">', '</span>')
if isMatch:
note.tags.append(potentialTag)
@@ -155,15 +180,54 @@ def main(keepHtmlDir):

note.body += line.replace('<br>', '\n')

print(note.toOrgString())
# Add to groups based on tags
for tag in note.tags:
if tag in noteGroups:
noteGroups[tag].append(note)
else:
noteGroups[tag] = [note]
if not note.tags:
if 'Untagged' in noteGroups:
noteGroups['Untagged'].append(note)
else:
noteGroups['Untagged'] = [note]

# We've parsed all the notes; write out the groups to separate .org files
numNotesWritten = 0
for tag, group in noteGroups.items():
# TODO: Make sure tags are filename-safe
outFileName = '{}/{}.org'.format(outputDir, makeSafeFilename(tag))

notesSortedByDate = sorted(group, key=lambda note: note.date)
# If capture etc. appends, we should probably follow that same logic (don't reverse)
# notesSortedByDate.reverse()

# Concatenate all notes into lines
lines = []
archivedLines = []
for note in notesSortedByDate:
if note.archived:
archivedLines.append(note.toOrgString() + '\n')
else:
lines.append(note.toOrgString() + '\n')
if len(archivedLines):
lines = ['* *Archived*\n'] + archivedLines + lines
outFile = open(outFileName, 'w')
outFile.writelines(lines)
outFile.close()
print('Wrote {} notes to {}'.format(len(group), outFileName))
numNotesWritten += len(group)

# TODO: Add to org list depending on tags
print('Wrote {} notes total'.format(numNotesWritten))

if __name__ == '__main__':
if len(sys.argv) != 2:
print('Wrong number of arguments!\nUsage:\n\tpython KeepToOrg.py /path/to/google/Keep')
if len(sys.argv) != 3:
print('Wrong number of arguments!\nUsage:\n\tpython KeepToOrg.py /path/to/google/Keep output/dir')

else:
keepHtmlDir = sys.argv[1]
main(keepHtmlDir)
outputDir = sys.argv[2]
main(keepHtmlDir, outputDir)


+ 4
- 2
README.md View File

@@ -2,9 +2,11 @@
Convert Google Keep notes to Emacs .org files

Usage:
`python KeepToOrg.py /path/to/google/Keep`
`python KeepToOrg.py /path/to/google/Keep output/dir`

Given a Takeout of your Google Keep Notes in .html format, output .org files with logical groupings
based on tags. This will also format lists and try to be smart.

Go to [Google Takeout](https://takeout.google.com/settings/takeout) to download your Keep data. Extract the data and run this script on it.
Go to [Google Takeout](https://takeout.google.com/settings/takeout) to download your Keep data. Extract the data and run this script on it.

Note that the dates used to sort the notes can be out of order. After investigating, the dates in the note .html files themselves appear to be wrong. I think this is Google's fault.

Loading…
Cancel
Save