@ -1,20 +1,22 @@
import os
import html
import sys
import datetime
"""
KeepToOrg . py
Usage :
python KeepToOrg . py / path / to / google / Keep
python KeepToOrg . py / path / to / google / Keep output / dir
Given a Takeout of your Google Keep Notes in . html format , output . org files with logical groupings
based on tags . This will also format lists and try to be smart .
"""
# TODO: Format links:
# Links have the syntax [[https://blah][Example link]] (things can be internal links too!)
# See https://orgmode.org/manual/External-links.html
# TODO:
# Format links:
# Links have the syntax [[https://blah][Example link]] (things can be internal links too!)
# See https://orgmode.org/manual/External-links.html
# Convert an array of tags to an Emacs Org tag string
# Tags have the syntax :tag: or :tag1:tag2:
@ -30,78 +32,95 @@ def tagsToOrgString(tags):
class Note :
def __init__ ( self ) :
self . title = " "
self . body = " "
self . title = ' '
self . body = ' '
self . tags = [ ]
self . archived = False
self . date = ' '
def toOrgString ( self ) :
status = ' (ARCHIVED) ' if self . archived else ' '
# status = '(archived) ' if self.archived else ''
# Create a copy so we can mangle it
body = self . body
title = self . title
# Convert lists to org lists. This is a total hack but works
self . body = self . body . replace ( ' <li class= " listitem " ><span class= " bullet " >☐</span> \n ' , ' - [ ] ' )
self . body = self . body . replace ( ' <li class= " listitem checked " ><span class= " bullet " >☑</span> ' , ' - [X] ' )
body = body . replace ( ' <li class= " listitem " ><span class= " bullet " >☐</span> \n ' , ' - [ ] ' )
body = body . replace ( ' <li class= " listitem checked " ><span class= " bullet " >☑</span> ' , ' - [X] ' )
# Flat out remove these
for htmlTagToErase in [ ' <span class= " text " > ' , ' </span> ' , ' </li> ' , ' <ul class= " list " > ' , ' </ul> ' ] :
self . body = self . body . replace ( htmlTagToErase , ' ' )
body = body . replace ( htmlTagToErase , ' ' )
# This is very weird, but fix the edge case where the list entry has a new line before the content
for listTypeToFixNewLines in [ ' - [ ] \n ' , ' - [X] \n ' ] :
self . body = self . body . replace ( listTypeToFixNewLines , listTypeToFixNewLines [ : - 1 ] )
body = body . replace ( listTypeToFixNewLines , listTypeToFixNewLines [ : - 1 ] )
# Unescape all (e.g. remove " and replace with ")
self . title = html . unescape ( self . title )
self . body = html . unescape ( self . body )
title = html . unescape ( title )
body = html . unescape ( body )
for i , tag in enumerate ( self . tags ) :
self . tags [ i ] = html . unescape ( tag )
# Strip tags
for tag in self . tags :
self . body = self . body . replace ( ' # {} ' . format ( tag ) , ' ' )
body = body . replace ( ' # {} ' . format ( tag ) , ' ' )
# Remove any leading/trailing whitespace (possibly leftover from tags stripping)
self . body = self . body . strip ( )
body = body . strip ( )
# Make a title if necessary
orgTitle = self . title
orgTitle = title
if not orgTitle :
toNewline = self . body . find ( ' \n ' )
toNewline = body . find ( ' \n ' )
# If there's a line break; use the first line as a title
if toNewline > = 0 :
orgTitle = self . body [ : toNewline ]
self . body = self . body [ len ( orgTitle ) + 1 : ]
orgTitle = body [ : toNewline ]
body = body [ len ( orgTitle ) + 1 : ]
# The note has no breaks; make the body the title
else :
orgTitle = self . body
orgTitle = body
# If the title is the whole body, clear the body
self . body = ' '
if self . body or len ( self . tags ) :
if self . body and not len ( self . tags ) :
return ' * {} {} \n {} ' . format ( status , orgTitle , self . body )
if not self . body and len ( self . tags ) :
return ' * {} {} \n {} ' . format ( status , orgTitle , tagsToOrgString ( self . tags ) )
body = ' '
nesting = ' * ' if self . archived else ' '
# Various levels of information require different formats
if body or len ( self . tags ) :
if body and not len ( self . tags ) :
return ' * {} {} \n {} ' . format ( nesting , orgTitle , body )
if not body and len ( self . tags ) :
return ' * {} {} \n {} ' . format ( nesting , orgTitle , tagsToOrgString ( self . tags ) )
else :
return " * {} {} \n {} \n {} " . format ( status , orgTitle , self . body , tagsToOrgString ( self . tags ) )
return " * {} {} \n {} \n {} " . format ( nesting , orgTitle , body , tagsToOrgString ( self . tags ) )
# If no body nor tags, note should be a single line
else :
return " * {} {} " . format ( status , orgTitle )
return ' * {} {} ' . format ( nesting , orgTitle )
def getAllNoteHtmlFiles ( htmlDir ) :
print ( ' Looking for notes in {} ' . format ( htmlDir ) )
noteHtmlFiles = [ ]
for root , dirs , files in os . walk ( htmlDir ) :
for file in files :
if file . endswith ( " .html " ) :
if file . endswith ( ' .html ' ) :
noteHtmlFiles . append ( os . path . join ( root , file ) )
print ( ' Found {} notes ' . format ( len ( noteHtmlFiles ) ) )
return noteHtmlFiles
def getHtmlValueIfMatches ( line , tag , endTag ) :
if tag . lower ( ) in line . lower ( ) and endTag . lower ( ) in line . lower ( ) :
return line [ line . find ( tag ) + len ( tag ) : - ( len ( endTag ) + 1 ) ] , True
return ' ' , False
def main ( keepHtmlDir ) :
def makeSafeFilename ( strToPurify ) :
strToPurify = strToPurify . replace ( ' / ' , ' ' )
strToPurify = strToPurify . replace ( ' . ' , ' ' )
return strToPurify
def main ( keepHtmlDir , outputDir ) :
noteFiles = getAllNoteHtmlFiles ( keepHtmlDir )
noteGroups = { }
for noteFilePath in noteFiles :
# Read in the file
@ -109,7 +128,7 @@ def main(keepHtmlDir):
noteLines = noteFile . readlines ( )
noteFile . close ( )
print ( ' \n Parsing {} ' . format ( noteFilePath ) )
# print('Parsing {}'.format(noteFilePath) )
note = Note ( )
@ -138,7 +157,13 @@ def main(keepHtmlDir):
# This isn't great; for same-line bodies, strip opening div
line = line . replace ( ' <div class= " content " > ' , ' ' )
# Parse tags
# Parse the date
if ' AM</div> ' in line or ' PM</div> ' in line :
dateString = line . replace ( ' </div> ' , ' ' ) . strip ( )
# Example: "Apr 27, 2018, 6:32:15 PM"
note . date = datetime . datetime . strptime ( dateString , ' % b %d , % Y, % I: % M: % S % p ' )
# Parse tags, if any
potentialTag , isMatch = getHtmlValueIfMatches ( line , ' <span class= " label-name " > ' , ' </span> ' )
if isMatch :
note . tags . append ( potentialTag )
@ -155,15 +180,54 @@ def main(keepHtmlDir):
note . body + = line . replace ( ' <br> ' , ' \n ' )
print ( note . toOrgString ( ) )
# Add to groups based on tags
for tag in note . tags :
if tag in noteGroups :
noteGroups [ tag ] . append ( note )
else :
noteGroups [ tag ] = [ note ]
if not note . tags :
if ' Untagged ' in noteGroups :
noteGroups [ ' Untagged ' ] . append ( note )
else :
noteGroups [ ' Untagged ' ] = [ note ]
# We've parsed all the notes; write out the groups to separate .org files
numNotesWritten = 0
for tag , group in noteGroups . items ( ) :
# TODO: Make sure tags are filename-safe
outFileName = ' {} / {} .org ' . format ( outputDir , makeSafeFilename ( tag ) )
notesSortedByDate = sorted ( group , key = lambda note : note . date )
# If capture etc. appends, we should probably follow that same logic (don't reverse)
# notesSortedByDate.reverse()
# Concatenate all notes into lines
lines = [ ]
archivedLines = [ ]
for note in notesSortedByDate :
if note . archived :
archivedLines . append ( note . toOrgString ( ) + ' \n ' )
else :
lines . append ( note . toOrgString ( ) + ' \n ' )
if len ( archivedLines ) :
lines = [ ' * *Archived* \n ' ] + archivedLines + lines
outFile = open ( outFileName , ' w ' )
outFile . writelines ( lines )
outFile . close ( )
print ( ' Wrote {} notes to {} ' . format ( len ( group ) , outFileName ) )
numNotesWritten + = len ( group )
# TODO: Add to org list depending on tags
print ( ' Wrote {} notes total ' . format ( numNotesWritten ) )
if __name__ == ' __main__ ' :
if len ( sys . argv ) != 2 :
print ( ' Wrong number of arguments! \n Usage: \n \t python KeepToOrg.py /path/to/google/Keep ' )
if len ( sys . argv ) != 3 :
print ( ' Wrong number of arguments! \n Usage: \n \t python KeepToOrg.py /path/to/google/Keep output/dir ' )
else :
keepHtmlDir = sys . argv [ 1 ]
main ( keepHtmlDir )
outputDir = sys . argv [ 2 ]
main ( keepHtmlDir , outputDir )